bpo-12486: Document tokenize.generate_tokens() as public API (#6957)
* Document tokenize.generate_tokens()
* Add news file
* Add test for generate_tokens
* Document behaviour around ENCODING token
* Add generate_tokens to __all__
parent c2745d2d05
commit c56b17bd8c
Doc/library/tokenize.rst

@@ -57,6 +57,16 @@ The primary entry point is a :term:`generator`:
    :func:`.tokenize` determines the source encoding of the file by looking for a
    UTF-8 BOM or encoding cookie, according to :pep:`263`.
 
+.. function:: generate_tokens(readline)
+
+   Tokenize a source reading unicode strings instead of bytes.
+
+   Like :func:`.tokenize`, the *readline* argument is a callable returning
+   a single line of input. However, :func:`generate_tokens` expects *readline*
+   to return a str object rather than bytes.
+
+   The result is an iterator yielding named tuples, exactly like
+   :func:`.tokenize`. It does not yield an :data:`~token.ENCODING` token.
+
 All constants from the :mod:`token` module are also exported from
 :mod:`tokenize`.
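The documentation added above describes the readline contract for the newly public function. As a minimal usage sketch (illustrative only, not part of the commit), a str-returning readline such as io.StringIO(...).readline can be passed directly:

    import io
    import tokenize

    source = "x = 1 + 2\n"
    # generate_tokens() takes a readline callable returning str and yields
    # TokenInfo named tuples; no leading ENCODING token is produced.
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)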
@@ -79,7 +89,8 @@ write back the modified script.
    positions) may change.
 
    It returns bytes, encoded using the :data:`~token.ENCODING` token, which
-   is the first token sequence output by :func:`.tokenize`.
+   is the first token sequence output by :func:`.tokenize`. If there is no
+   encoding token in the input, it returns a str instead.
 
 :func:`.tokenize` needs to detect the encoding of source files it tokenizes. The
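The untokenize() behaviour documented here can be seen with a small round-trip sketch (illustrative only, not part of the commit):

    import io
    import tokenize

    # Tokens from tokenize() start with an ENCODING token, so untokenize()
    # returns bytes encoded with that encoding.
    byte_toks = list(tokenize.tokenize(io.BytesIO(b"x = 1\n").readline))
    assert isinstance(tokenize.untokenize(byte_toks), bytes)

    # Tokens from generate_tokens() carry no ENCODING token, so untokenize()
    # returns a str instead.
    str_toks = list(tokenize.generate_tokens(io.StringIO("x = 1\n").readline))
    assert isinstance(tokenize.untokenize(str_toks), str)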
Lib/test/test_tokenize.py

@@ -1,8 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer)
-from io import BytesIO
+                     open as tokenize_open, Untokenizer, generate_tokens)
+from io import BytesIO, StringIO
 import unittest
 from unittest import TestCase, mock
 from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
@@ -919,6 +919,19 @@ async def f():
     DEDENT     ''            (7, 0) (7, 0)
     """)
 
+class GenerateTokensTest(TokenizeTest):
+    def check_tokenize(self, s, expected):
+        # Format the tokens in s in a table format.
+        # The ENDMARKER is omitted.
+        result = []
+        f = StringIO(s)
+        for type, token, start, end, line in generate_tokens(f.readline):
+            if type == ENDMARKER:
+                break
+            type = tok_name[type]
+            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        self.assertEqual(result, expected.rstrip().splitlines())
+
 
 def decistmt(s):
     result = []
Lib/tokenize.py

@@ -37,7 +37,7 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
-__all__ = token.__all__ + ["tokenize", "detect_encoding",
+__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                            "untokenize", "TokenInfo"]
 del token
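A quick way to observe the effect of the __all__ change (a sketch, not part of the commit):

    import tokenize

    # generate_tokens is now listed among the module's public names, so it is
    # also picked up by "from tokenize import *".
    assert "generate_tokens" in tokenize.__all__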
@@ -653,9 +653,12 @@ def _tokenize(readline, encoding):
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 
 
-# An undocumented, backwards compatible, API for all the places in the standard
-# library that expect to be able to use tokenize with strings
 def generate_tokens(readline):
+    """Tokenize a source reading Python code as unicode strings.
+
+    This has the same API as tokenize(), except that it expects the *readline*
+    callable to return str objects instead of bytes.
+    """
     return _tokenize(readline, None)
 
 def main():
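To contrast the two entry points described in the new docstring, a small sketch (illustrative only, not part of the commit):

    import io
    import tokenize

    src = "pi = 3.14\n"

    # tokenize() reads bytes and yields an ENCODING token first.
    from_bytes = list(tokenize.tokenize(io.BytesIO(src.encode("utf-8")).readline))
    assert from_bytes[0].type == tokenize.ENCODING

    # generate_tokens() reads str and starts directly with the source tokens.
    from_str = list(tokenize.generate_tokens(io.StringIO(src).readline))
    assert from_str[0].type == tokenize.NAME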
News entry (new file):

@@ -0,0 +1,2 @@
+:func:`tokenize.generate_tokens` is now documented as a public API to
+tokenize unicode strings. It was previously present but undocumented.