handle names starting with non-ascii characters correctly #9712

This commit is contained in:
Benjamin Peterson 2010-08-30 14:41:20 +00:00
parent e01de8f2f3
commit 33856de84d
3 changed files with 25 additions and 5 deletions

View File

@ -531,6 +531,7 @@ pass the '-ucompiler' option to process the full directory.
True True
Evil tabs Evil tabs
>>> dump_tokens("def f():\\n\\tif x\\n \\tpass") >>> dump_tokens("def f():\\n\\tif x\\n \\tpass")
ENCODING 'utf-8' (0, 0) (0, 0) ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'def' (1, 0) (1, 3) NAME 'def' (1, 0) (1, 3)
@ -547,6 +548,18 @@ Evil tabs
NAME 'pass' (3, 9) (3, 13) NAME 'pass' (3, 9) (3, 13)
DEDENT '' (4, 0) (4, 0) DEDENT '' (4, 0) (4, 0)
DEDENT '' (4, 0) (4, 0) DEDENT '' (4, 0) (4, 0)
Non-ascii identifiers
>>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'Örter' (1, 0) (1, 5)
OP '=' (1, 6) (1, 7)
STRING "'places'" (1, 8) (1, 16)
NEWLINE '\\n' (1, 16) (1, 17)
NAME 'grün' (2, 0) (2, 4)
OP '=' (2, 5) (2, 6)
STRING "'green'" (2, 7) (2, 14)
""" """
from test import support from test import support

View File

@ -92,7 +92,7 @@ def maybe(*choices): return group(*choices) + '?'
Whitespace = r'[ \f\t]*' Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*' Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*' Name = r'\w+'
Hexnumber = r'0[xX][0-9a-fA-F]+' Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+' Binnumber = r'0[bB][01]+'
@ -142,9 +142,12 @@ ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
PseudoExtras = group(r'\\\r?\n', Comment, Triple) PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
def _compile(expr):
return re.compile(expr, re.UNICODE)
tokenprog, pseudoprog, single3prog, double3prog = map( tokenprog, pseudoprog, single3prog, double3prog = map(
re.compile, (Token, PseudoToken, Single3, Double3)) _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double), endprogs = {"'": _compile(Single), '"': _compile(Double),
"'''": single3prog, '"""': double3prog, "'''": single3prog, '"""': double3prog,
"r'''": single3prog, 'r"""': double3prog, "r'''": single3prog, 'r"""': double3prog,
"b'''": single3prog, 'b"""': double3prog, "b'''": single3prog, 'b"""': double3prog,
@ -171,6 +174,8 @@ for t in ("'", '"',
"bR'", 'bR"', "BR'", 'BR"' ): "bR'", 'bR"', "BR'", 'BR"' ):
single_quoted[t] = t single_quoted[t] = t
del _compile
tabsize = 8 tabsize = 8
class TokenError(Exception): pass class TokenError(Exception): pass
@ -393,7 +398,7 @@ def tokenize(readline):
def _tokenize(readline, encoding): def _tokenize(readline, encoding):
lnum = parenlev = continued = 0 lnum = parenlev = continued = 0
namechars, numchars = string.ascii_letters + '_', '0123456789' numchars = '0123456789'
contstr, needcont = '', 0 contstr, needcont = '', 0
contline = None contline = None
indents = [0] indents = [0]
@ -520,7 +525,7 @@ def _tokenize(readline, encoding):
break break
else: # ordinary string else: # ordinary string
yield TokenInfo(STRING, token, spos, epos, line) yield TokenInfo(STRING, token, spos, epos, line)
elif initial in namechars: # ordinary name elif initial.isidentifier(): # ordinary name
yield TokenInfo(NAME, token, spos, epos, line) yield TokenInfo(NAME, token, spos, epos, line)
elif initial == '\\': # continued stmt elif initial == '\\': # continued stmt
continued = 1 continued = 1

View File

@ -12,6 +12,8 @@ What's New in Python 3.2 Alpha 2?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #9712: Fix tokenize on identifiers that start with non-ASCII characters.
- Issue #9688: __basicsize__ and __itemsize__ must be accessed as Py_ssize_t. - Issue #9688: __basicsize__ and __itemsize__ must be accessed as Py_ssize_t.
- Issue #9684: Added a definition for SIZEOF_WCHAR_T to PC/pyconfig.h, - Issue #9684: Added a definition for SIZEOF_WCHAR_T to PC/pyconfig.h,