Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.

This commit is contained in:
Victor Stinner 2011-04-05 01:48:03 +02:00
parent 7f2fee3640
commit fe7c5b5bdf
6 changed files with 43 additions and 23 deletions

View File

@@ -58,6 +58,12 @@ class ImportTests(unittest.TestCase):
with imp.find_module('module_' + mod, self.test_path)[0] as fd:
self.assertEqual(fd.encoding, encoding)
path = [os.path.dirname(__file__)]
self.assertRaisesRegex(SyntaxError,
r"Non-UTF-8 code starting with '\\xf6'"
r" in file .*badsyntax_pep3120.py",
imp.find_module, 'badsyntax_pep3120', path)
def test_issue1267(self):
for mod, encoding, _ in self.test_strings:
fp, filename, info = imp.find_module('module_' + mod,

View File

@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
- Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.
- Issue #10785: Store the filename as Unicode in the Python parser.
- Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes

View File

@@ -1690,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
return result;
}
/* Get -*- encoding -*- from a Python file.
/* Get the encoding of a Python file. Check for the coding cookie and check if
the file starts with a BOM.
PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
the first or second line of the file (in which case the encoding
should be assumed to be PyUnicode_GetDefaultEncoding()).
PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
encoding in the first or second line of the file (in which case the encoding
should be assumed to be UTF-8).
The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
by the caller. */
The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
by the caller.
*/
char *
PyTokenizer_FindEncoding(int fd)
PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
struct tok_state *tok;
FILE *fp;
@@ -1720,9 +1721,18 @@ PyTokenizer_FindEncoding(int fd)
return NULL;
}
#ifndef PGEN
tok->filename = PyUnicode_FromString("<string>");
if (tok->filename == NULL)
goto error;
if (filename != NULL) {
Py_INCREF(filename);
tok->filename = filename;
}
else {
tok->filename = PyUnicode_FromString("<string>");
if (tok->filename == NULL) {
fclose(fp);
PyTokenizer_Free(tok);
return encoding;
}
}
#endif
while (tok->lineno < 2 && tok->done == E_OK) {
PyTokenizer_Get(tok, &p_start, &p_end);
@@ -1733,13 +1743,16 @@ PyTokenizer_FindEncoding(int fd)
if (encoding)
strcpy(encoding, tok->encoding);
}
#ifndef PGEN
error:
#endif
PyTokenizer_Free(tok);
return encoding;
}
/* Backward-compatible wrapper kept for existing callers: detect the source
   encoding of the Python file open on descriptor 'fd' without supplying a
   filename, so errors are reported against "<string>" (the NULL-filename
   fallback in PyTokenizer_FindEncodingFilename). */
char *
PyTokenizer_FindEncoding(int fd)
{
return PyTokenizer_FindEncodingFilename(fd, NULL);
}
#ifdef Py_DEBUG
void

View File

@@ -75,7 +75,6 @@ extern void PyTokenizer_Free(struct tok_state *);
extern int PyTokenizer_Get(struct tok_state *, char **, char **);
extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
int len, int *offset);
extern char * PyTokenizer_FindEncoding(int);
#ifdef __cplusplus
}

View File

@@ -124,12 +124,12 @@ static const Py_UNICODE PYC_TAG_UNICODE[] = {
/* See _PyImport_FixupExtensionObject() below */
static PyObject *extensions = NULL;
/* Function from Parser/tokenizer.c */
extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
/* This table is defined in config.c: */
extern struct _inittab _PyImport_Inittab[];
/* Method from Parser/tokenizer.c */
extern char * PyTokenizer_FindEncoding(int);
struct _inittab *PyImport_Inittab = _PyImport_Inittab;
/* these tables define the module suffixes that Python recognizes */
@@ -3540,9 +3540,9 @@ call_find_module(PyObject *name, PyObject *path_list)
}
if (fd != -1) {
if (strchr(fdp->mode, 'b') == NULL) {
/* PyTokenizer_FindEncoding() returns PyMem_MALLOC'ed
/* PyTokenizer_FindEncodingFilename() returns PyMem_MALLOC'ed
memory. */
found_encoding = PyTokenizer_FindEncoding(fd);
found_encoding = PyTokenizer_FindEncodingFilename(fd, pathobj);
lseek(fd, 0, 0); /* Reset position */
if (found_encoding == NULL && PyErr_Occurred()) {
Py_XDECREF(pathobj);

View File

@@ -18,8 +18,8 @@
#define MAX_FRAME_DEPTH 100
#define MAX_NTHREADS 100
/* Method from Parser/tokenizer.c */
extern char * PyTokenizer_FindEncoding(int);
/* Function from Parser/tokenizer.c */
extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
static PyObject *
tb_dir(PyTracebackObject *self)
@@ -251,7 +251,7 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent)
/* use the right encoding to decode the file as unicode */
fd = PyObject_AsFileDescriptor(binary);
found_encoding = PyTokenizer_FindEncoding(fd);
found_encoding = PyTokenizer_FindEncodingFilename(fd, filename);
encoding = (found_encoding != NULL) ? found_encoding : "utf-8";
lseek(fd, 0, 0); /* Reset position */
fob = PyObject_CallMethod(io, "TextIOWrapper", "Os", binary, encoding);