Clean-up example.

This commit is contained in:
Raymond Hettinger 2011-05-23 12:45:34 -07:00
parent b43dd4b8ca
commit 4b244ef255
1 changed file with 41 additions and 31 deletions

View File

@ -1298,24 +1298,27 @@ The text categories are specified with regular expressions. The technique is
to combine those into a single master regular expression and to loop over to combine those into a single master regular expression and to loop over
successive matches:: successive matches::
Token = collections.namedtuple('Token', 'typ value line column') import collections
import re
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
def tokenize(s): def tokenize(s):
keywords = {'IF', 'THEN', 'FOR', 'NEXT', 'GOSUB', 'RETURN'} keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
tok_spec = [ token_specification = [
('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number ('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number
('ASSIGN', r':='), # Assignment operator ('ASSIGN', r':='), # Assignment operator
('END', ';'), # Statement terminator ('END', r';'), # Statement terminator
('ID', r'[A-Za-z]+'), # Identifiers ('ID', r'[A-Za-z]+'), # Identifiers
('OP', r'[+*\/\-]'), # Arithmetic operators ('OP', r'[+*\/\-]'), # Arithmetic operators
('NEWLINE', r'\n'), # Line endings ('NEWLINE', r'\n'), # Line endings
('SKIP', r'[ \t]'), # Skip over spaces and tabs ('SKIP', r'[ \t]'), # Skip over spaces and tabs
] ]
tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec) tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
gettok = re.compile(tok_re).match get_token = re.compile(tok_regex).match
line = 1 line = 1
pos = line_start = 0 pos = line_start = 0
mo = gettok(s) mo = get_token(s)
while mo is not None: while mo is not None:
typ = mo.lastgroup typ = mo.lastgroup
if typ == 'NEWLINE': if typ == 'NEWLINE':
@ -1327,13 +1330,15 @@ successive matches::
typ = val typ = val
yield Token(typ, val, line, mo.start()-line_start) yield Token(typ, val, line, mo.start()-line_start)
pos = mo.end() pos = mo.end()
mo = gettok(s, pos) mo = get_token(s, pos)
if pos != len(s): if pos != len(s):
raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line)) raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line))
statements = '''\ statements = '''
IF quantity THEN
total := total + price * quantity; total := total + price * quantity;
tax := price * 0.05; tax := price * 0.05;
ENDIF;
''' '''
for token in tokenize(statements): for token in tokenize(statements):
@ -1341,17 +1346,22 @@ successive matches::
The tokenizer produces the following output:: The tokenizer produces the following output::
Token(typ='ID', value='total', line=1, column=8) Token(typ='IF', value='IF', line=2, column=5)
Token(typ='ASSIGN', value=':=', line=1, column=14) Token(typ='ID', value='quantity', line=2, column=8)
Token(typ='ID', value='total', line=1, column=17) Token(typ='THEN', value='THEN', line=2, column=17)
Token(typ='OP', value='+', line=1, column=23) Token(typ='ID', value='total', line=3, column=9)
Token(typ='ID', value='price', line=1, column=25) Token(typ='ASSIGN', value=':=', line=3, column=15)
Token(typ='OP', value='*', line=1, column=31) Token(typ='ID', value='total', line=3, column=18)
Token(typ='ID', value='quantity', line=1, column=33) Token(typ='OP', value='+', line=3, column=24)
Token(typ='END', value=';', line=1, column=41) Token(typ='ID', value='price', line=3, column=26)
Token(typ='ID', value='tax', line=2, column=9) Token(typ='OP', value='*', line=3, column=32)
Token(typ='ASSIGN', value=':=', line=2, column=13) Token(typ='ID', value='quantity', line=3, column=34)
Token(typ='ID', value='price', line=2, column=16) Token(typ='END', value=';', line=3, column=42)
Token(typ='OP', value='*', line=2, column=22) Token(typ='ID', value='tax', line=4, column=9)
Token(typ='NUMBER', value='0.05', line=2, column=24) Token(typ='ASSIGN', value=':=', line=4, column=13)
Token(typ='END', value=';', line=2, column=28) Token(typ='ID', value='price', line=4, column=16)
Token(typ='OP', value='*', line=4, column=22)
Token(typ='NUMBER', value='0.05', line=4, column=24)
Token(typ='END', value=';', line=4, column=28)
Token(typ='ENDIF', value='ENDIF', line=5, column=5)
Token(typ='END', value=';', line=5, column=10)