From 7f2fee36401f7b987a368fe043637b3ae7116600 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 5 Apr 2011 00:39:01 +0200 Subject: [PATCH] Issue #10785: Store the filename as Unicode in the Python parser. --- Include/parsetok.h | 9 +++++++-- Makefile.pre.in | 7 ++++--- Misc/NEWS | 2 ++ Modules/parsermodule.c | 1 + Parser/parsetok.c | 32 +++++++++++++++++++++++++------- Parser/parsetok_pgen.c | 2 ++ Parser/tokenizer.c | 35 ++++++++++++++++++++++------------- Parser/tokenizer.h | 8 +++++++- Python/pythonrun.c | 40 ++++++++++++++++++++++++---------------- 9 files changed, 94 insertions(+), 42 deletions(-) create mode 100644 Parser/parsetok_pgen.c diff --git a/Include/parsetok.h b/Include/parsetok.h index 4b7694f90f1..911dfc10e40 100644 --- a/Include/parsetok.h +++ b/Include/parsetok.h @@ -9,7 +9,10 @@ extern "C" { typedef struct { int error; - const char *filename; /* decoded from the filesystem encoding */ +#ifndef PGEN + /* The filename is useless for pgen, see comment in tok_state structure */ + PyObject *filename; +#endif int lineno; int offset; char *text; /* UTF-8-encoded string */ @@ -66,8 +69,10 @@ PyAPI_FUNC(node *) PyParser_ParseStringFlagsFilenameEx( perrdetail *err_ret, int *flags); -/* Note that he following function is defined in pythonrun.c not parsetok.c. */ +/* Note that the following functions are defined in pythonrun.c, + not in parsetok.c */ PyAPI_FUNC(void) PyParser_SetError(perrdetail *); +PyAPI_FUNC(void) PyParser_ClearError(perrdetail *); #ifdef __cplusplus } diff --git a/Makefile.pre.in b/Makefile.pre.in index c3004ceaf85..6a206c2afdf 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -238,14 +238,13 @@ POBJS= \ Parser/listnode.o \ Parser/node.o \ Parser/parser.o \ - Parser/parsetok.o \ Parser/bitset.o \ Parser/metagrammar.o \ Parser/firstsets.o \ Parser/grammar.o \ Parser/pgen.o -PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/tokenizer.o +PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o PGOBJS= \ Objects/obmalloc.o \ @@ -254,10 +253,12 @@ PGOBJS= \ Python/pyctype.o \ Parser/tokenizer_pgen.o \ Parser/printgrammar.o \ + Parser/parsetok_pgen.o \ Parser/pgenmain.o PARSER_HEADERS= \ Parser/parser.h \ + Include/parsetok.h \ Parser/tokenizer.h PGENOBJS= $(PGENMAIN) $(POBJS) $(PGOBJS) @@ -593,6 +594,7 @@ Parser/grammar.o: $(srcdir)/Parser/grammar.c \ Parser/metagrammar.o: $(srcdir)/Parser/metagrammar.c Parser/tokenizer_pgen.o: $(srcdir)/Parser/tokenizer.c +Parser/parsetok_pgen.o: $(srcdir)/Parser/parsetok.c Parser/pgenmain.o: $(srcdir)/Include/parsetok.h @@ -700,7 +702,6 @@ PYTHON_HEADERS= \ Include/objimpl.h \ Include/opcode.h \ Include/osdefs.h \ - Include/parsetok.h \ Include/patchlevel.h \ Include/pgen.h \ Include/pgenheaders.h \ diff --git a/Misc/NEWS b/Misc/NEWS index d4297d2055e..30d7c50e728 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1? Core and Builtins ----------------- +- Issue #10785: Store the filename as Unicode in the Python parser. + - Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes on Windows. diff --git a/Modules/parsermodule.c b/Modules/parsermodule.c index 3cdf1359a68..e5b4e559431 100644 --- a/Modules/parsermodule.c +++ b/Modules/parsermodule.c @@ -584,6 +584,7 @@ parser_do_parse(PyObject *args, PyObject *kw, char *argspec, int type) else PyParser_SetError(&err); } + PyParser_ClearError(&err); return (res); } diff --git a/Parser/parsetok.c b/Parser/parsetok.c index 2251cacf74f..eef650ac544 100644 --- a/Parser/parsetok.c +++ b/Parser/parsetok.c @@ -13,7 +13,7 @@ /* Forward */ static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int *); -static void initerr(perrdetail *err_ret, const char* filename); +static int initerr(perrdetail *err_ret, const char* filename); /* Parse input coming from a string. Return error code, print some errors. */ node * @@ -48,7 +48,8 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename, struct tok_state *tok; int exec_input = start == file_input; - initerr(err_ret, filename); + if (initerr(err_ret, filename) < 0) + return NULL; if (*flags & PyPARSE_IGNORE_COOKIE) tok = PyTokenizer_FromUTF8(s, exec_input); @@ -59,7 +60,10 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename, return NULL; } - tok->filename = filename ? filename : ""; +#ifndef PGEN + Py_INCREF(err_ret->filename); + tok->filename = err_ret->filename; +#endif return parsetok(tok, g, start, err_ret, flags); } @@ -90,13 +94,17 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename, { struct tok_state *tok; - initerr(err_ret, filename); + if (initerr(err_ret, filename) < 0) + return NULL; if ((tok = PyTokenizer_FromFile(fp, (char *)enc, ps1, ps2)) == NULL) { err_ret->error = E_NOMEM; return NULL; } - tok->filename = filename; +#ifndef PGEN + Py_INCREF(err_ret->filename); + tok->filename = err_ret->filename; +#endif return parsetok(tok, g, start, err_ret, flags); } @@ -267,14 +275,24 @@ done: return n; } -static void +static int initerr(perrdetail *err_ret, const char *filename) { err_ret->error = E_OK; - err_ret->filename = filename; err_ret->lineno = 0; err_ret->offset = 0; err_ret->text = NULL; err_ret->token = -1; err_ret->expected = -1; +#ifndef PGEN + if (filename) + err_ret->filename = PyUnicode_DecodeFSDefault(filename); + else + err_ret->filename = PyUnicode_FromString(""); + if (err_ret->filename == NULL) { + err_ret->error = E_ERROR; + return -1; + } +#endif + return 0; } diff --git a/Parser/parsetok_pgen.c b/Parser/parsetok_pgen.c new file mode 100644 index 00000000000..97b92883f3f --- /dev/null +++ b/Parser/parsetok_pgen.c @@ -0,0 +1,2 @@ +#define PGEN +#include "parsetok.c" diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 3f6be2f640e..5edd9589be4 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -128,7 +128,6 @@ tok_new(void) tok->prompt = tok->nextprompt = NULL; tok->lineno = 0; tok->level = 0; - tok->filename = NULL; tok->altwarning = 1; tok->alterror = 1; tok->alttabsize = 1; @@ -140,6 +139,7 @@ tok_new(void) tok->encoding = NULL; tok->cont_line = 0; #ifndef PGEN + tok->filename = NULL; tok->decoding_readline = NULL; tok->decoding_buffer = NULL; #endif @@ -545,7 +545,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok) { char *line = NULL; int badchar = 0; - PyObject *filename; for (;;) { if (tok->decoding_state == STATE_NORMAL) { /* We already have a codec associated with @@ -586,16 +585,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok) if (badchar) { /* Need to add 1 to the line number, since this line has not been counted, yet. */ - filename = PyUnicode_DecodeFSDefault(tok->filename); - if (filename != NULL) { - PyErr_Format(PyExc_SyntaxError, - "Non-UTF-8 code starting with '\\x%.2x' " - "in file %U on line %i, " - "but no encoding declared; " - "see http://python.org/dev/peps/pep-0263/ for details", - badchar, filename, tok->lineno + 1); - Py_DECREF(filename); - } + PyErr_Format(PyExc_SyntaxError, + "Non-UTF-8 code starting with '\\x%.2x' " + "in file %U on line %i, " + "but no encoding declared; " + "see http://python.org/dev/peps/pep-0263/ for details", + badchar, tok->filename, tok->lineno + 1); return error_ret(tok); } #endif @@ -853,6 +848,7 @@ PyTokenizer_Free(struct tok_state *tok) #ifndef PGEN Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); + Py_XDECREF(tok->filename); #endif if (tok->fp != NULL && tok->buf != NULL) PyMem_FREE(tok->buf); @@ -1247,8 +1243,13 @@ indenterror(struct tok_state *tok) return 1; } if (tok->altwarning) { - PySys_WriteStderr("%s: inconsistent use of tabs and spaces " +#ifdef PGEN + PySys_WriteStderr("inconsistent use of tabs and spaces " + "in indentation\n"); +#else + PySys_FormatStderr("%U: inconsistent use of tabs and spaces " "in indentation\n", tok->filename); +#endif tok->altwarning = 0; } return 0; @@ -1718,6 +1719,11 @@ PyTokenizer_FindEncoding(int fd) fclose(fp); return NULL; } +#ifndef PGEN + tok->filename = PyUnicode_FromString(""); + if (tok->filename == NULL) + goto error; +#endif while (tok->lineno < 2 && tok->done == E_OK) { PyTokenizer_Get(tok, &p_start, &p_end); } @@ -1727,6 +1733,9 @@ PyTokenizer_FindEncoding(int fd) if (encoding) strcpy(encoding, tok->encoding); } +#ifndef PGEN +error: +#endif PyTokenizer_Free(tok); return encoding; } diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 2be3bf2e315..3a0d3cb08e8 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -40,7 +40,13 @@ struct tok_state { int level; /* () [] {} Parentheses nesting level */ /* Used to allow free continuations inside them */ /* Stuff for checking on different tab sizes */ - const char *filename; /* encoded to the filesystem encoding */ +#ifndef PGEN + /* pgen doesn't have access to Python codecs, it cannot decode the input + filename. The bytes filename might be kept, but it is only used by + indenterror() and it is not really needed: pgen only compiles one file + (Grammar/Grammar). */ + PyObject *filename; +#endif int altwarning; /* Issue warning if alternate tabs don't match */ int alterror; /* Issue error if alternate tabs don't match */ int alttabsize; /* Alternate tab spacing */ diff --git a/Python/pythonrun.c b/Python/pythonrun.c index 1c36e63ac46..a6787c4fc71 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -62,6 +62,7 @@ static PyObject *run_mod(mod_ty, const char *, PyObject *, PyObject *, static PyObject *run_pyc_file(FILE *, const char *, PyObject *, PyObject *, PyCompilerFlags *); static void err_input(perrdetail *); +static void err_free(perrdetail *); static void initsigs(void); static void call_py_exitfuncs(void); static void wait_for_thread_shutdown(void); @@ -1887,12 +1888,13 @@ PyParser_ASTFromString(const char *s, const char *filename, int start, flags->cf_flags |= iflags & PyCF_MASK; mod = PyAST_FromNode(n, flags, filename, arena); PyNode_Free(n); - return mod; } else { err_input(&err); - return NULL; + mod = NULL; } + err_free(&err); + return mod; } mod_ty @@ -1917,14 +1919,15 @@ PyParser_ASTFromFile(FILE *fp, const char *filename, const char* enc, flags->cf_flags |= iflags & PyCF_MASK; mod = PyAST_FromNode(n, flags, filename, arena); PyNode_Free(n); - return mod; } else { err_input(&err); if (errcode) *errcode = err.error; - return NULL; + mod = NULL; } + err_free(&err); + return mod; } /* Simplified interface to parsefile -- return node or set exception */ @@ -1938,6 +1941,7 @@ PyParser_SimpleParseFileFlags(FILE *fp, const char *filename, int start, int fla start, NULL, NULL, &err, flags); if (n == NULL) err_input(&err); + err_free(&err); return n; } @@ -1952,6 +1956,7 @@ PyParser_SimpleParseStringFlags(const char *str, int start, int flags) start, &err, flags); if (n == NULL) err_input(&err); + err_free(&err); return n; } @@ -1964,6 +1969,7 @@ PyParser_SimpleParseStringFlagsFilename(const char *str, const char *filename, &_PyParser_Grammar, start, &err, flags); if (n == NULL) err_input(&err); + err_free(&err); return n; } @@ -1976,12 +1982,24 @@ PyParser_SimpleParseStringFilename(const char *str, const char *filename, int st /* May want to move a more generalized form of this to parsetok.c or even parser modules. */ +void +PyParser_ClearError(perrdetail *err) +{ + err_free(err); +} + void PyParser_SetError(perrdetail *err) { err_input(err); } +static void +err_free(perrdetail *err) +{ + Py_CLEAR(err->filename); +} + /* Set the error appropriate to the given input error code (see errcode.h) */ static void @@ -1989,7 +2007,6 @@ err_input(perrdetail *err) { PyObject *v, *w, *errtype, *errtext; PyObject *msg_obj = NULL; - PyObject *filename; char *msg = NULL; errtype = PyExc_SyntaxError; @@ -2075,17 +2092,8 @@ err_input(perrdetail *err) errtext = PyUnicode_DecodeUTF8(err->text, strlen(err->text), "replace"); } - if (err->filename != NULL) - filename = PyUnicode_DecodeFSDefault(err->filename); - else { - Py_INCREF(Py_None); - filename = Py_None; - } - if (filename != NULL) - v = Py_BuildValue("(NiiN)", filename, - err->lineno, err->offset, errtext); - else - v = NULL; + v = Py_BuildValue("(OiiN)", err->filename, + err->lineno, err->offset, errtext); if (v != NULL) { if (msg_obj) w = Py_BuildValue("(OO)", msg_obj, v);