diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 7008d0e46fd..7a472b21f37 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -578,9 +578,15 @@ pass the '-ucpu' option to process the full directory. >>> tempdir = os.path.dirname(f) or os.curdir >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py")) -tokenize is broken on test_pep3131.py because regular expressions are broken on -the obscure unicode identifiers in it. *sigh* +Tokenize is broken on test_pep3131.py because regular expressions are +broken on the obscure unicode identifiers in it. *sigh* +With roundtrip extended to test the 5-tuple mode of untokenize, +7 more testfiles fail. Remove them also until the failure is diagnosed. + >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py")) + >>> for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'): + ... testfiles.remove(os.path.join(tempdir, "test_%s.py") % f) + ... >>> if not support.is_resource_enabled("cpu"): ... testfiles = random.sample(testfiles, 10) ... @@ -659,21 +665,39 @@ def dump_tokens(s): def roundtrip(f): """ Test roundtrip for `untokenize`. `f` is an open file or a string. - The source code in f is tokenized, converted back to source code via - tokenize.untokenize(), and tokenized again from the latter. The test - fails if the second tokenization doesn't match the first. + The source code in f is tokenized to both 5- and 2-tuples. + Both sequences are converted back to source code via + tokenize.untokenize(), and the latter tokenized again to 2-tuples. + The test fails if the 3 pair tokenizations do not match. + + When untokenize bugs are fixed, untokenize with 5-tuples should + reproduce code that does not contain a backslash continuation + following spaces. A proper test should test this. + + This function would be more useful for correcting bugs if it reported + the first point of failure, like assertEqual, rather than just + returning False -- or if it were only used in unittests and not + doctest and actually used assertEqual. """ + # Get source code and original tokenizations if isinstance(f, str): - f = BytesIO(f.encode('utf-8')) - try: - token_list = list(tokenize(f.readline)) - finally: + code = f.encode('utf-8') + else: + code = f.read() f.close() - tokens1 = [tok[:2] for tok in token_list] - new_bytes = untokenize(tokens1) - readline = (line for line in new_bytes.splitlines(keepends=True)).__next__ - tokens2 = [tok[:2] for tok in tokenize(readline)] - return tokens1 == tokens2 + readline = iter(code.splitlines(keepends=True)).__next__ + tokens5 = list(tokenize(readline)) + tokens2 = [tok[:2] for tok in tokens5] + # Reproduce tokens2 from pairs + bytes_from2 = untokenize(tokens2) + readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__ + tokens2_from2 = [tok[:2] for tok in tokenize(readline2)] + # Reproduce tokens2 from 5-tuples + bytes_from5 = untokenize(tokens5) + readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__ + tokens2_from5 = [tok[:2] for tok in tokenize(readline5)] + # Compare 3 versions + return tokens2 == tokens2_from2 == tokens2_from5 # This is an example from the docs, set up as a doctest. def decistmt(s):