Issue 25311: Add support for f-strings to tokenize.py. Also added some comments to explain what's happening, since it's not so obvious.

Eric V. Smith 2015-10-26 04:37:55 -04:00
parent f1c47e4751
commit 1c8222c80a
2 changed files with 83 additions and 50 deletions
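As a quick orientation (not part of the commit): after this change, tokenize treats a prefixed f-string literal as a single STRING token, prefix included, which is exactly what the new tests below assert. A minimal sketch:

    import io
    from tokenize import tokenize, STRING

    # Tokenize an f-string literal; the source must be supplied as bytes.
    tokens = list(tokenize(io.BytesIO(b'f"abc"').readline))

    # The whole literal, including the 'f' prefix, arrives as one STRING token.
    assert any(t.type == STRING and t.string == 'f"abc"' for t in tokens)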

Lib/test/test_tokenize.py

@@ -332,6 +332,23 @@ b"""', """\
 b\
 c"""', """\
     STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
     """)
+        self.check_tokenize('f"abc"', """\
+    STRING 'f"abc"' (1, 0) (1, 6)
+    """)
+        self.check_tokenize('fR"a{b}c"', """\
+    STRING 'fR"a{b}c"' (1, 0) (1, 9)
+    """)
+        self.check_tokenize('f"""abc"""', """\
+    STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
+    """)
+        self.check_tokenize(r'f"abc\
+def"', """\
+    STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4)
+    """)
+        self.check_tokenize(r'Rf"abc\
+def"', """\
+    STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
+    """)
 
     def test_function(self):

Lib/tokenize.py

@@ -29,6 +29,7 @@ from codecs import lookup, BOM_UTF8
 import collections
 from io import TextIOWrapper
 from itertools import chain
+import itertools as _itertools
 import re
 import sys
 from token import *
@@ -131,7 +132,28 @@ Floatnumber = group(Pointfloat, Expfloat)
 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
-StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
+# Return the empty string, plus all of the valid string prefixes.
+def _all_string_prefixes():
+    # The valid string prefixes. Only contains the lower case versions,
+    #  and doesn't contain any redundant permutations (includes 'fr',
+    #  but not 'rf'). The various permutations will be generated.
+    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
+    # if we add binary f-strings, add: ['fb', 'fbr']
+    result = set([''])
+    for prefix in _valid_string_prefixes:
+        for t in _itertools.permutations(prefix):
+            # create a list with upper and lower versions of each
+            #  character
+            for u in _itertools.product(*[(c, c.upper()) for c in t]):
+                result.add(''.join(u))
+    return result
+
+def _compile(expr):
+    return re.compile(expr, re.UNICODE)
+
+# Note that since _all_string_prefixes includes the empty string,
+#  StringPrefix can be the empty string (making it optional).
+StringPrefix = group(*_all_string_prefixes())
 
 # Tail end of ' string.
 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
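To make the nested loops above concrete, here is a small self-contained illustration (not part of the commit) of what they generate for the single seed prefix 'fr': every ordering of the characters, in every casing.

    import itertools

    # Mirror of the permutation/casing product in _all_string_prefixes(),
    # specialized to one seed prefix.
    seed = 'fr'
    variants = {''.join(u)
                for t in itertools.permutations(seed)
                for u in itertools.product(*[(c, c.upper()) for c in t])}
    assert variants == {'fr', 'fR', 'Fr', 'FR', 'rf', 'rF', 'Rf', 'RF'}

So the seed list only needs one spelling per prefix; 'rf', 'Rb', 'BR', and so on all fall out of the product, and the initial set(['']) keeps unprefixed strings matching.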
@@ -169,50 +191,25 @@ ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
 PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 
-def _compile(expr):
-    return re.compile(expr, re.UNICODE)
-
-endpats = {"'": Single, '"': Double,
-           "'''": Single3, '"""': Double3,
-           "r'''": Single3, 'r"""': Double3,
-           "b'''": Single3, 'b"""': Double3,
-           "R'''": Single3, 'R"""': Double3,
-           "B'''": Single3, 'B"""': Double3,
-           "br'''": Single3, 'br"""': Double3,
-           "bR'''": Single3, 'bR"""': Double3,
-           "Br'''": Single3, 'Br"""': Double3,
-           "BR'''": Single3, 'BR"""': Double3,
-           "rb'''": Single3, 'rb"""': Double3,
-           "Rb'''": Single3, 'Rb"""': Double3,
-           "rB'''": Single3, 'rB"""': Double3,
-           "RB'''": Single3, 'RB"""': Double3,
-           "u'''": Single3, 'u"""': Double3,
-           "U'''": Single3, 'U"""': Double3,
-           'r': None, 'R': None, 'b': None, 'B': None,
-           'u': None, 'U': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
-          "r'''", 'r"""', "R'''", 'R"""',
-          "b'''", 'b"""', "B'''", 'B"""',
-          "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""',
-          "rb'''", 'rb"""', "rB'''", 'rB"""',
-          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          ):
-    triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
-          "r'", 'r"', "R'", 'R"',
-          "b'", 'b"', "B'", 'B"',
-          "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"',
-          "rb'", 'rb"', "rB'", 'rB"',
-          "Rb'", 'Rb"', "RB'", 'RB"',
-          "u'", 'u"', "U'", 'U"',
-          ):
-    single_quoted[t] = t
+# For a given string prefix plus quotes, endpats maps it to a regex
+#  to match the remainder of that string. _prefix can be empty, for
+#  a normal single or triple quoted string (with no prefix).
+endpats = {}
+for _prefix in _all_string_prefixes():
+    endpats[_prefix + "'"] = Single
+    endpats[_prefix + '"'] = Double
+    endpats[_prefix + "'''"] = Single3
+    endpats[_prefix + '"""'] = Double3
+
+# A set of all of the single and triple quoted string prefixes,
+#  including the opening quotes.
+single_quoted = set()
+triple_quoted = set()
+for t in _all_string_prefixes():
+    for u in (t + '"', t + "'"):
+        single_quoted.add(u)
+    for u in (t + '"""', t + "'''"):
+        triple_quoted.add(u)
 
 tabsize = 8
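The effect of the rebuilt tables can be checked directly. Note that endpats, single_quoted, triple_quoted, and Single3 are module-private implementation details of tokenize as of this commit, so the probe below is illustrative only:

    from tokenize import endpats, single_quoted, triple_quoted, Single3

    # Every generated casing/ordering of a prefix maps to the right
    # end pattern...
    assert endpats["fR'''"] is Single3
    # ...and the quote-opening sets cover all generated prefixes too.
    assert 'Rb"' in single_quoted
    assert "f'''" in triple_quoted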
@@ -626,6 +623,7 @@ def _tokenize(readline, encoding):
                     yield stashed
                     stashed = None
                 yield TokenInfo(COMMENT, token, spos, epos, line)
+
             elif token in triple_quoted:
                 endprog = _compile(endpats[token])
                 endmatch = endprog.match(line, pos)
@@ -638,19 +636,37 @@ def _tokenize(readline, encoding):
                     contstr = line[start:]
                     contline = line
                     break
-            elif initial in single_quoted or \
-                    token[:2] in single_quoted or \
-                    token[:3] in single_quoted:
+
+            # Check up to the first 3 chars of the token to see if
+            #  they're in the single_quoted set. If so, they start
+            #  a string.
+            # We're using the first 3, because we're looking for
+            #  "rb'" (for example) at the start of the token. If
+            #  we switch to longer prefixes, this needs to be
+            #  adjusted.
+            # Note that initial == token[:1].
+            # Also note that single quote checking must come after
+            #  triple quote checking (above).
+            elif (initial in single_quoted or
+                  token[:2] in single_quoted or
+                  token[:3] in single_quoted):
                 if token[-1] == '\n':                  # continued string
                     strstart = (lnum, start)
-                    endprog = _compile(endpats[initial] or
-                                       endpats[token[1]] or
-                                       endpats[token[2]])
+                    # Again, using the first 3 chars of the
+                    #  token. This is looking for the matching end
+                    #  regex for the correct type of quote
+                    #  character. So it's really looking for
+                    #  endpats["'"] or endpats['"'], by trying to
+                    #  skip string prefix characters, if any.
+                    endprog = _compile(endpats.get(initial) or
+                                       endpats.get(token[1]) or
+                                       endpats.get(token[2]))
                     contstr, needcont = line[start:], 1
                     contline = line
                     break
                 else:                                  # ordinary string
                     yield TokenInfo(STRING, token, spos, epos, line)
+
             elif initial.isidentifier():               # ordinary name
                 if token in ('async', 'await'):
                     if async_def:
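To see why this code probes up to three characters, and why the lookups switched from endpats[...] to endpats.get(...), consider a hypothetical continued string with a two-character prefix (a sketch, not part of the commit):

    from tokenize import endpats, single_quoted

    token = 'rb"abc\\\n'    # start of a continued raw-bytes string
    initial = token[:1]      # 'r' -- a prefix character, not a quote

    # Neither 'r' nor 'rb' opens a string, but 'rb"' does:
    assert initial not in single_quoted
    assert token[:2] not in single_quoted
    assert token[:3] in single_quoted

    # The old dict mapped bare prefix chars like 'r' to None; the new
    # one simply omits them, so .get() is needed to skip past the
    # prefix and land on the end pattern for the quote itself.
    assert endpats.get(initial) is None
    assert endpats.get(token[1]) is None      # 'b'
    assert endpats.get(token[2]) is not None  # '"' -> the Double regex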