Issue #18960: Fix bugs with Python source code encoding in the second line.
* The first line of a Python script could be executed twice when the source encoding (not equal to 'utf-8') was specified on the second line. * Now the source encoding declaration on the second line isn't effective if the first line contains anything except a comment. * As a consequence, 'python -x' now works again with files that have a source encoding declaration on the second line, and can be used again to make Python batch files on Windows. * The tokenize module now ignores the source encoding declaration on the second line if the first line contains anything except a comment. * IDLE now ignores the source encoding declaration on the second line if the first line contains anything except a comment. * 2to3 and the findnocoding.py script now ignore the source encoding declaration on the second line if the first line contains anything except a comment.
This commit is contained in:
commit
7282ff6d5b
|
@ -64,6 +64,7 @@ encoding = locale_encoding ### KBK 07Sep07 This is used all over IDLE, check!
|
|||
### 'encoding' is used below in encode(), check!
|
||||
|
||||
coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
|
||||
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
|
||||
|
||||
def coding_spec(data):
|
||||
"""Return the encoding declaration according to PEP 263.
|
||||
|
@ -93,6 +94,8 @@ def coding_spec(data):
|
|||
match = coding_re.match(line)
|
||||
if match is not None:
|
||||
break
|
||||
if not blank_re.match(line):
|
||||
return None
|
||||
else:
|
||||
return None
|
||||
name = match.group(1)
|
||||
|
|
|
@ -237,6 +237,7 @@ class Untokenizer:
|
|||
toks_append(tokval)
|
||||
|
||||
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
|
||||
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
|
||||
|
||||
def _get_normal_name(orig_enc):
|
||||
"""Imitates get_normal_name in tokenizer.c."""
|
||||
|
@ -309,6 +310,8 @@ def detect_encoding(readline):
|
|||
encoding = find_cookie(first)
|
||||
if encoding:
|
||||
return encoding, [first]
|
||||
if not blank_re.match(first):
|
||||
return default, [first]
|
||||
|
||||
second = read_or_stop()
|
||||
if not second:
|
||||
|
|
|
@ -885,6 +885,39 @@ class TestDetectEncoding(TestCase):
|
|||
readline = self.get_readline(lines)
|
||||
self.assertRaises(SyntaxError, detect_encoding, readline)
|
||||
|
||||
def test_cookie_second_line_noncommented_first_line(self):
|
||||
lines = (
|
||||
b"print('\xc2\xa3')\n",
|
||||
b'# vim: set fileencoding=iso8859-15 :\n',
|
||||
b"print('\xe2\x82\xac')\n"
|
||||
)
|
||||
encoding, consumed_lines = detect_encoding(self.get_readline(lines))
|
||||
self.assertEqual(encoding, 'utf-8')
|
||||
expected = [b"print('\xc2\xa3')\n"]
|
||||
self.assertEqual(consumed_lines, expected)
|
||||
|
||||
def test_cookie_second_line_commented_first_line(self):
|
||||
lines = (
|
||||
b"#print('\xc2\xa3')\n",
|
||||
b'# vim: set fileencoding=iso8859-15 :\n',
|
||||
b"print('\xe2\x82\xac')\n"
|
||||
)
|
||||
encoding, consumed_lines = detect_encoding(self.get_readline(lines))
|
||||
self.assertEqual(encoding, 'iso8859-15')
|
||||
expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
|
||||
self.assertEqual(consumed_lines, expected)
|
||||
|
||||
def test_cookie_second_line_empty_first_line(self):
|
||||
lines = (
|
||||
b'\n',
|
||||
b'# vim: set fileencoding=iso8859-15 :\n',
|
||||
b"print('\xe2\x82\xac')\n"
|
||||
)
|
||||
encoding, consumed_lines = detect_encoding(self.get_readline(lines))
|
||||
self.assertEqual(encoding, 'iso8859-15')
|
||||
expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
|
||||
self.assertEqual(consumed_lines, expected)
|
||||
|
||||
def test_latin1_normalization(self):
|
||||
# See get_normal_name() in tokenizer.c.
|
||||
encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
|
||||
|
|
|
@ -32,6 +32,7 @@ from codecs import lookup, BOM_UTF8
|
|||
import collections
|
||||
from io import TextIOWrapper
|
||||
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
|
||||
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
|
||||
|
||||
import token
|
||||
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
|
||||
|
@ -409,6 +410,8 @@ def detect_encoding(readline):
|
|||
encoding = find_cookie(first)
|
||||
if encoding:
|
||||
return encoding, [first]
|
||||
if not blank_re.match(first):
|
||||
return default, [first]
|
||||
|
||||
second = read_or_stop()
|
||||
if not second:
|
||||
|
|
20
Misc/NEWS
20
Misc/NEWS
|
@ -10,6 +10,13 @@ Release date: 2014-01-19
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #18960: The first line of a Python script could be executed twice when
|
||||
the source encoding was specified on the second line. Now the source encoding
|
||||
declaration on the second line isn't effective if the first line contains
|
||||
anything except a comment. 'python -x' now works again with files with the
|
||||
source encoding declarations, and can be used to make Python batch files
|
||||
on Windows.
|
||||
|
||||
- Issue #19081: When a zipimport .zip file in sys.path being imported from
|
||||
is modified during the lifetime of the Python process after zipimport has
|
||||
already cached the zip's table of contents we detect this and recover
|
||||
|
@ -18,6 +25,9 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #18960: The tokenize module now ignores the source encoding declaration
|
||||
on the second line if the first line contains anything except a comment.
|
||||
|
||||
- Issue #20078: Reading malformed zipfiles no longer hangs with 100% CPU
|
||||
consumption.
|
||||
|
||||
|
@ -33,9 +43,19 @@ Library
|
|||
|
||||
- Issue #20072: Fixed multiple errors in tkinter when wantobjects is False.
|
||||
|
||||
IDLE
|
||||
----
|
||||
|
||||
- Issue #18960: IDLE now ignores the source encoding declaration on the second
|
||||
line if the first line contains anything except a comment.
|
||||
|
||||
Tools/Demos
|
||||
-----------
|
||||
|
||||
- Issue #18960: 2to3 and the findnocoding.py script now ignore the source
|
||||
encoding declaration on the second line if the first line contains anything
|
||||
except a comment.
|
||||
|
||||
- Issue #19723: The marker comments Argument Clinic uses have been changed
|
||||
to improve readability.
|
||||
|
||||
|
|
|
@ -283,13 +283,27 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
|
|||
char *cs;
|
||||
int r = 1;
|
||||
|
||||
if (tok->cont_line)
|
||||
if (tok->cont_line) {
|
||||
/* It's a continuation line, so it can't be a coding spec. */
|
||||
tok->read_coding_spec = 1;
|
||||
return 1;
|
||||
}
|
||||
if (!get_coding_spec(line, &cs, size, tok))
|
||||
return 0;
|
||||
if (!cs)
|
||||
if (!cs) {
|
||||
Py_ssize_t i;
|
||||
for (i = 0; i < size; i++) {
|
||||
if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
|
||||
break;
|
||||
if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
|
||||
/* Stop checking coding spec after a line containing
|
||||
* anything except a comment. */
|
||||
tok->read_coding_spec = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
tok->read_coding_spec = 1;
|
||||
if (tok->encoding == NULL) {
|
||||
assert(tok->decoding_state == STATE_RAW);
|
||||
|
@ -476,13 +490,17 @@ fp_setreadl(struct tok_state *tok, const char* enc)
|
|||
_Py_IDENTIFIER(open);
|
||||
_Py_IDENTIFIER(readline);
|
||||
int fd;
|
||||
long pos;
|
||||
|
||||
io = PyImport_ImportModuleNoBlock("io");
|
||||
if (io == NULL)
|
||||
goto cleanup;
|
||||
|
||||
fd = fileno(tok->fp);
|
||||
if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
|
||||
/* Due to buffering the file offset for fd can be different from the file
|
||||
* position of tok->fp. */
|
||||
pos = ftell(tok->fp);
|
||||
if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) {
|
||||
PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
|
||||
goto cleanup;
|
||||
}
|
||||
|
@ -752,7 +770,7 @@ decode_str(const char *input, int single, struct tok_state *tok)
|
|||
if (newl[0]) {
|
||||
if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
|
||||
return error_ret(tok);
|
||||
if (tok->enc == NULL && newl[1]) {
|
||||
if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
|
||||
if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
|
||||
tok, buf_setreadl))
|
||||
return error_ret(tok);
|
||||
|
|
|
@ -33,6 +33,7 @@ except ImportError:
|
|||
|
||||
|
||||
decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
|
||||
blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)')
|
||||
|
||||
def get_declaration(line):
|
||||
match = decl_re.match(line)
|
||||
|
@ -58,7 +59,8 @@ def needs_declaration(fullpath):
|
|||
line1 = infile.readline()
|
||||
line2 = infile.readline()
|
||||
|
||||
if get_declaration(line1) or get_declaration(line2):
|
||||
if (get_declaration(line1) or
|
||||
blank_re.match(line1) and get_declaration(line2)):
|
||||
# the file does have an encoding declaration, so trust it
|
||||
return False
|
||||
|
||||
|
|
Loading…
Reference in New Issue