Issue 25311: Add support for f-strings to tokenize.py. Also added some comments to explain what's happening, since it's not so obvious.

Eric V. Smith 2015-10-26 04:37:55 -04:00
parent f1c47e4751
commit 1c8222c80a
2 changed files with 83 additions and 50 deletions
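As a quick orientation (not part of the commit): after this change, tokenize treats a prefixed f-string literal as a single STRING token, prefix included, which is exactly what the new tests below assert. A minimal sketch:

    import io
    from tokenize import tokenize, STRING

    # Tokenize an f-string literal; the source must be supplied as bytes.
    tokens = list(tokenize(io.BytesIO(b'f"abc"').readline))

    # The whole literal, including the 'f' prefix, arrives as one STRING token.
    assert any(t.type == STRING and t.string == 'f"abc"' for t in tokens)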

Lib/test/test_tokenize.py

@@ -332,6 +332,23 @@ b"""', """\
 b\
 c"""', """\
     STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
     """)
+        self.check_tokenize('f"abc"', """\
+    STRING 'f"abc"' (1, 0) (1, 6)
+    """)
+        self.check_tokenize('fR"a{b}c"', """\
+    STRING 'fR"a{b}c"' (1, 0) (1, 9)
+    """)
+        self.check_tokenize('f"""abc"""', """\
+    STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
+    """)
+        self.check_tokenize(r'f"abc\
+def"', """\
+    STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4)
+    """)
+        self.check_tokenize(r'Rf"abc\
+def"', """\
+    STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
+    """)
 
     def test_function(self):

Lib/tokenize.py

@@ -29,6 +29,7 @@ from codecs import lookup, BOM_UTF8
 import collections
 from io import TextIOWrapper
 from itertools import chain
+import itertools as _itertools
 import re
 import sys
 from token import *
@@ -131,7 +132,28 @@ Floatnumber = group(Pointfloat, Expfloat)
 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
-StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
+# Return the empty string, plus all of the valid string prefixes.
+def _all_string_prefixes():
+    # The valid string prefixes. Only contains the lower case versions,
+    #  and doesn't contain any redundant permutations (includes 'fr',
+    #  but not 'rf'). The various permutations will be generated.
+    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
+    # if we add binary f-strings, add: ['fb', 'fbr']
+    result = set([''])
+    for prefix in _valid_string_prefixes:
+        for t in _itertools.permutations(prefix):
+            # create a list with upper and lower versions of each
+            #  character
+            for u in _itertools.product(*[(c, c.upper()) for c in t]):
+                result.add(''.join(u))
+    return result
+
+def _compile(expr):
+    return re.compile(expr, re.UNICODE)
+
+# Note that since _all_string_prefixes includes the empty string,
+#  StringPrefix can be the empty string (making it optional).
+StringPrefix = group(*_all_string_prefixes())
 
 # Tail end of ' string.
 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
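To make the nested loops above concrete, here is a small self-contained illustration (not part of the commit) of what they generate for the single seed prefix 'fr': every ordering of the characters, in every casing.

    import itertools

    # Mirror of the permutation/casing product in _all_string_prefixes(),
    # specialized to one seed prefix.
    seed = 'fr'
    variants = {''.join(u)
                for t in itertools.permutations(seed)
                for u in itertools.product(*[(c, c.upper()) for c in t])}
    assert variants == {'fr', 'fR', 'Fr', 'FR', 'rf', 'rF', 'Rf', 'RF'}

So the seed list only needs one spelling per prefix; 'rf', 'Rb', 'BR', and so on all fall out of the product, and the initial set(['']) keeps unprefixed strings matching.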
@@ -169,50 +191,25 @@ ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
 PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 
-def _compile(expr):
-    return re.compile(expr, re.UNICODE)
-
-endpats = {"'": Single, '"': Double,
-           "'''": Single3, '"""': Double3,
-           "r'''": Single3, 'r"""': Double3,
-           "b'''": Single3, 'b"""': Double3,
-           "R'''": Single3, 'R"""': Double3,
-           "B'''": Single3, 'B"""': Double3,
-           "br'''": Single3, 'br"""': Double3,
-           "bR'''": Single3, 'bR"""': Double3,
-           "Br'''": Single3, 'Br"""': Double3,
-           "BR'''": Single3, 'BR"""': Double3,
-           "rb'''": Single3, 'rb"""': Double3,
-           "Rb'''": Single3, 'Rb"""': Double3,
-           "rB'''": Single3, 'rB"""': Double3,
-           "RB'''": Single3, 'RB"""': Double3,
-           "u'''": Single3, 'u"""': Double3,
-           "U'''": Single3, 'U"""': Double3,
-           'r': None, 'R': None, 'b': None, 'B': None,
-           'u': None, 'U': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
-          "r'''", 'r"""', "R'''", 'R"""',
-          "b'''", 'b"""', "B'''", 'B"""',
-          "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""',
-          "rb'''", 'rb"""', "rB'''", 'rB"""',
-          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          ):
-    triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
-          "r'", 'r"', "R'", 'R"',
-          "b'", 'b"', "B'", 'B"',
-          "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"',
-          "rb'", 'rb"', "rB'", 'rB"',
-          "Rb'", 'Rb"', "RB'", 'RB"',
-          "u'", 'u"', "U'", 'U"',
-          ):
-    single_quoted[t] = t
+# For a given string prefix plus quotes, endpats maps it to a regex
+#  to match the remainder of that string. _prefix can be empty, for
+#  a normal single or triple quoted string (with no prefix).
+endpats = {}
+for _prefix in _all_string_prefixes():
+    endpats[_prefix + "'"] = Single
+    endpats[_prefix + '"'] = Double
+    endpats[_prefix + "'''"] = Single3
+    endpats[_prefix + '"""'] = Double3
+
+# A set of all of the single and triple quoted string prefixes,
+#  including the opening quotes.
+single_quoted = set()
+triple_quoted = set()
+for t in _all_string_prefixes():
+    for u in (t + '"', t + "'"):
+        single_quoted.add(u)
+    for u in (t + '"""', t + "'''"):
+        triple_quoted.add(u)
 
 tabsize = 8
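The effect of the rebuilt tables can be checked directly. Note that endpats, single_quoted, triple_quoted, and Single3 are module-private implementation details of tokenize as of this commit, so the probe below is illustrative only:

    from tokenize import endpats, single_quoted, triple_quoted, Single3

    # Every generated casing/ordering of a prefix maps to the right
    # end pattern...
    assert endpats["fR'''"] is Single3
    # ...and the quote-opening sets cover all generated prefixes too.
    assert 'Rb"' in single_quoted
    assert "f'''" in triple_quoted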
@@ -626,6 +623,7 @@ def _tokenize(readline, encoding):
                     yield stashed
                     stashed = None
                 yield TokenInfo(COMMENT, token, spos, epos, line)
+
             elif token in triple_quoted:
                 endprog = _compile(endpats[token])
                 endmatch = endprog.match(line, pos)
@@ -638,19 +636,37 @@ def _tokenize(readline, encoding):
                     contstr = line[start:]
                     contline = line
                     break
-            elif initial in single_quoted or \
-                    token[:2] in single_quoted or \
-                    token[:3] in single_quoted:
+
+            # Check up to the first 3 chars of the token to see if
+            #  they're in the single_quoted set. If so, they start
+            #  a string.
+            # We're using the first 3, because we're looking for
+            #  "rb'" (for example) at the start of the token. If
+            #  we switch to longer prefixes, this needs to be
+            #  adjusted.
+            # Note that initial == token[:1].
+            # Also note that single quote checking must come after
+            #  triple quote checking (above).
+            elif (initial in single_quoted or
+                  token[:2] in single_quoted or
+                  token[:3] in single_quoted):
                 if token[-1] == '\n':                  # continued string
                     strstart = (lnum, start)
-                    endprog = _compile(endpats[initial] or
-                                       endpats[token[1]] or
-                                       endpats[token[2]])
+                    # Again, using the first 3 chars of the
+                    #  token. This is looking for the matching end
+                    #  regex for the correct type of quote
+                    #  character. So it's really looking for
+                    #  endpats["'"] or endpats['"'], by trying to
+                    #  skip string prefix characters, if any.
+                    endprog = _compile(endpats.get(initial) or
+                                       endpats.get(token[1]) or
+                                       endpats.get(token[2]))
                     contstr, needcont = line[start:], 1
                     contline = line
                     break
                 else:                                  # ordinary string
                     yield TokenInfo(STRING, token, spos, epos, line)
+
             elif initial.isidentifier():               # ordinary name
                 if token in ('async', 'await'):
                     if async_def:
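To see why this code probes up to three characters, and why the lookups switched from endpats[...] to endpats.get(...), consider a hypothetical continued string with a two-character prefix (a sketch, not part of the commit):

    from tokenize import endpats, single_quoted

    token = 'rb"abc\\\n'    # start of a continued raw-bytes string
    initial = token[:1]      # 'r' -- a prefix character, not a quote

    # Neither 'r' nor 'rb' opens a string, but 'rb"' does:
    assert initial not in single_quoted
    assert token[:2] not in single_quoted
    assert token[:3] in single_quoted

    # The old dict mapped bare prefix chars like 'r' to None; the new
    # one simply omits them, so .get() is needed to skip past the
    # prefix and land on the end pattern for the quote itself.
    assert endpats.get(initial) is None
    assert endpats.get(token[1]) is None      # 'b'
    assert endpats.get(token[2]) is not None  # '"' -> the Double regex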