Issue #10785: Store the filename as Unicode in the Python parser.

2011-04-05 00:39:01 +02:00 · 2011-04-05 00:39:01 +02:00 · 7f2fee3640
parent 9bdb43e43f
commit 7f2fee3640
9 changed files with 94 additions and 42 deletions
--- a/Include/parsetok.h
+++ b/Include/parsetok.h
@ -9,7 +9,10 @@ extern "C" {
 typedef struct {
    int error;
-    const char *filename;       /* decoded from the filesystem encoding */
+#ifndef PGEN
    /* The filename is useless for pgen, see comment in tok_state structure */
    PyObject *filename;
 #endif
    int lineno;
    int offset;
    char *text;                 /* UTF-8-encoded string */
@ -66,8 +69,10 @@ PyAPI_FUNC(node *) PyParser_ParseStringFlagsFilenameEx(
    perrdetail *err_ret,
    int *flags);
-/* Note that he following function is defined in pythonrun.c not parsetok.c. */
+/* Note that the following functions are defined in pythonrun.c,
   not in parsetok.c */
 PyAPI_FUNC(void) PyParser_SetError(perrdetail *);
 PyAPI_FUNC(void) PyParser_ClearError(perrdetail *);
 #ifdef __cplusplus
 }
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@ -238,14 +238,13 @@ POBJS=		\
 		Parser/listnode.o \
 		Parser/node.o \
 		Parser/parser.o \
-		Parser/parsetok.o \
 		Parser/bitset.o \
 		Parser/metagrammar.o \
 		Parser/firstsets.o \
 		Parser/grammar.o \
 		Parser/pgen.o
-PARSER_OBJS=	$(POBJS) Parser/myreadline.o Parser/tokenizer.o
+PARSER_OBJS=	$(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
 PGOBJS=		\
 		Objects/obmalloc.o \
@ -254,10 +253,12 @@ PGOBJS=		\
 		Python/pyctype.o \
 		Parser/tokenizer_pgen.o \
 		Parser/printgrammar.o \
 		Parser/parsetok_pgen.o \
 		Parser/pgenmain.o
 PARSER_HEADERS= \
 		Parser/parser.h \
 		Include/parsetok.h \
 		Parser/tokenizer.h
 PGENOBJS=	$(PGENMAIN) $(POBJS) $(PGOBJS)
@ -593,6 +594,7 @@ Parser/grammar.o:	$(srcdir)/Parser/grammar.c \
 Parser/metagrammar.o:	$(srcdir)/Parser/metagrammar.c
 Parser/tokenizer_pgen.o:	$(srcdir)/Parser/tokenizer.c
 Parser/parsetok_pgen.o:	$(srcdir)/Parser/parsetok.c
 Parser/pgenmain.o:	$(srcdir)/Include/parsetok.h
@ -700,7 +702,6 @@ PYTHON_HEADERS= \
 		Include/objimpl.h \
 		Include/opcode.h \
 		Include/osdefs.h \
 		Include/parsetok.h \
 		Include/patchlevel.h \
 		Include/pgen.h \
 		Include/pgenheaders.h \
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------
 - Issue #10785: Store the filename as Unicode in the Python parser.
 - Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes
  on Windows.
--- a/Modules/parsermodule.c
+++ b/Modules/parsermodule.c
@ -584,6 +584,7 @@ parser_do_parse(PyObject *args, PyObject *kw, char *argspec, int type)
        else
            PyParser_SetError(&err);
    }
    PyParser_ClearError(&err);
    return (res);
 }
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@ -13,7 +13,7 @@
 /* Forward */
 static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int *);
-static void initerr(perrdetail *err_ret, const char* filename);
+static int initerr(perrdetail *err_ret, const char* filename);
 /* Parse input coming from a string.  Return error code, print some errors. */
 node *
@ -48,7 +48,8 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename,
    struct tok_state *tok;
    int exec_input = start == file_input;
-    initerr(err_ret, filename);
+    if (initerr(err_ret, filename) < 0)
        return NULL;
    if (*flags & PyPARSE_IGNORE_COOKIE)
        tok = PyTokenizer_FromUTF8(s, exec_input);
@ -59,7 +60,10 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename,
        return NULL;
    }
-    tok->filename = filename ? filename : "<string>";
+#ifndef PGEN
    Py_INCREF(err_ret->filename);
    tok->filename = err_ret->filename;
 #endif
    return parsetok(tok, g, start, err_ret, flags);
 }
@ -90,13 +94,17 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
 {
    struct tok_state *tok;
-    initerr(err_ret, filename);
+    if (initerr(err_ret, filename) < 0)
        return NULL;
    if ((tok = PyTokenizer_FromFile(fp, (char *)enc, ps1, ps2)) == NULL) {
        err_ret->error = E_NOMEM;
        return NULL;
    }
-    tok->filename = filename;
+#ifndef PGEN
    Py_INCREF(err_ret->filename);
    tok->filename = err_ret->filename;
 #endif
    return parsetok(tok, g, start, err_ret, flags);
 }
@ -267,14 +275,24 @@ done:
    return n;
 }
-static void
+static int
 initerr(perrdetail *err_ret, const char *filename)
 {
    err_ret->error = E_OK;
    err_ret->filename = filename;
    err_ret->lineno = 0;
    err_ret->offset = 0;
    err_ret->text = NULL;
    err_ret->token = -1;
    err_ret->expected = -1;
 #ifndef PGEN
    if (filename)
        err_ret->filename = PyUnicode_DecodeFSDefault(filename);
    else
        err_ret->filename = PyUnicode_FromString("<string>");
    if (err_ret->filename == NULL) {
        err_ret->error = E_ERROR;
        return -1;
    }
 #endif
    return 0;
 }
--- a/Parser/parsetok_pgen.c
+++ b/Parser/parsetok_pgen.c
@ -0,0 +1,2 @@
 #define PGEN
 #include "parsetok.c"
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -128,7 +128,6 @@ tok_new(void)
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
-    tok->filename = NULL;
    tok->altwarning = 1;
    tok->alterror = 1;
    tok->alttabsize = 1;
@ -140,6 +139,7 @@ tok_new(void)
    tok->encoding = NULL;
    tok->cont_line = 0;
 #ifndef PGEN
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
 #endif
@ -545,7 +545,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
 {
    char *line = NULL;
    int badchar = 0;
-    PyObject *filename;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
@ -586,16 +585,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
-        filename = PyUnicode_DecodeFSDefault(tok->filename);
+        PyErr_Format(PyExc_SyntaxError,
-        if (filename != NULL) {
+                "Non-UTF-8 code starting with '\\x%.2x' "
-            PyErr_Format(PyExc_SyntaxError,
+                "in file %U on line %i, "
-                    "Non-UTF-8 code starting with '\\x%.2x' "
+                "but no encoding declared; "
-                    "in file %U on line %i, "
+                "see http://python.org/dev/peps/pep-0263/ for details",
-                    "but no encoding declared; "
+                badchar, tok->filename, tok->lineno + 1);
-                    "see http://python.org/dev/peps/pep-0263/ for details",
-                    badchar, filename, tok->lineno + 1);
-            Py_DECREF(filename);
-        }
        return error_ret(tok);
    }
 #endif
@ -853,6 +848,7 @@ PyTokenizer_Free(struct tok_state *tok)
 #ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->filename);
 #endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
@ -1247,8 +1243,13 @@ indenterror(struct tok_state *tok)
        return 1;
    }
    if (tok->altwarning) {
-        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
+#ifdef PGEN
        PySys_WriteStderr("inconsistent use of tabs and spaces "
                          "in indentation\n");
 #else
        PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
 #endif
        tok->altwarning = 0;
    }
    return 0;
@ -1718,6 +1719,11 @@ PyTokenizer_FindEncoding(int fd)
        fclose(fp);
        return NULL;
    }
 #ifndef PGEN
    tok->filename = PyUnicode_FromString("<string>");
    if (tok->filename == NULL)
        goto error;
 #endif
    while (tok->lineno < 2 && tok->done == E_OK) {
        PyTokenizer_Get(tok, &p_start, &p_end);
    }
@ -1727,6 +1733,9 @@ PyTokenizer_FindEncoding(int fd)
        if (encoding)
        strcpy(encoding, tok->encoding);
    }
 #ifndef PGEN
 error:
 #endif
    PyTokenizer_Free(tok);
    return encoding;
 }
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@ -40,7 +40,13 @@ struct tok_state {
    int level;          /* () [] {} Parentheses nesting level */
            /* Used to allow free continuations inside them */
    /* Stuff for checking on different tab sizes */
-    const char *filename;   /* encoded to the filesystem encoding */
+#ifndef PGEN
    /* pgen doesn't have access to Python codecs, it cannot decode the input
       filename. The bytes filename might be kept, but it is only used by
       indenterror() and it is not really needed: pgen only compiles one file
       (Grammar/Grammar). */
    PyObject *filename;
 #endif
    int altwarning;     /* Issue warning if alternate tabs don't match */
    int alterror;       /* Issue error if alternate tabs don't match */
    int alttabsize;     /* Alternate tab spacing */
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@ -62,6 +62,7 @@ static PyObject *run_mod(mod_ty, const char *, PyObject *, PyObject *,
 static PyObject *run_pyc_file(FILE *, const char *, PyObject *, PyObject *,
                              PyCompilerFlags *);
 static void err_input(perrdetail *);
 static void err_free(perrdetail *);
 static void initsigs(void);
 static void call_py_exitfuncs(void);
 static void wait_for_thread_shutdown(void);
@ -1887,12 +1888,13 @@ PyParser_ASTFromString(const char *s, const char *filename, int start,
        flags->cf_flags |= iflags & PyCF_MASK;
        mod = PyAST_FromNode(n, flags, filename, arena);
        PyNode_Free(n);
-        return mod;
    }
    else {
        err_input(&err);
-        return NULL;
+        mod = NULL;
    }
    err_free(&err);
    return mod;
 }
 mod_ty
@ -1917,14 +1919,15 @@ PyParser_ASTFromFile(FILE *fp, const char *filename, const char* enc,
        flags->cf_flags |= iflags & PyCF_MASK;
        mod = PyAST_FromNode(n, flags, filename, arena);
        PyNode_Free(n);
-        return mod;
    }
    else {
        err_input(&err);
        if (errcode)
            *errcode = err.error;
-        return NULL;
+        mod = NULL;
    }
    err_free(&err);
    return mod;
 }
 /* Simplified interface to parsefile -- return node or set exception */
@ -1938,6 +1941,7 @@ PyParser_SimpleParseFileFlags(FILE *fp, const char *filename, int start, int fla
                                      start, NULL, NULL, &err, flags);
    if (n == NULL)
        err_input(&err);
    err_free(&err);
    return n;
 }
@ -1952,6 +1956,7 @@ PyParser_SimpleParseStringFlags(const char *str, int start, int flags)
                                        start, &err, flags);
    if (n == NULL)
        err_input(&err);
    err_free(&err);
    return n;
 }
@ -1964,6 +1969,7 @@ PyParser_SimpleParseStringFlagsFilename(const char *str, const char *filename,
                            &_PyParser_Grammar, start, &err, flags);
    if (n == NULL)
        err_input(&err);
    err_free(&err);
    return n;
 }
@ -1976,12 +1982,24 @@ PyParser_SimpleParseStringFilename(const char *str, const char *filename, int st
 /* May want to move a more generalized form of this to parsetok.c or
   even parser modules. */
 /* Public wrapper around err_free(): releases the filename reference held
    by *err.  Does not free the perrdetail structure itself. */
 void
 PyParser_ClearError(perrdetail *err)
 {
    err_free(err);
 }
 /* Public wrapper around err_input(): sets the Python exception that
    corresponds to the parser error recorded in *err. */
 void
 PyParser_SetError(perrdetail *err)
 {
    err_input(err);
 }
 /* Release the reference to err->filename.  Py_CLEAR also resets the field
    to NULL, so a second call on the same structure is harmless. */
 static void
 err_free(perrdetail *err)
 {
    Py_CLEAR(err->filename);
 }
 /* Set the error appropriate to the given input error code (see errcode.h) */
 static void
@ -1989,7 +2007,6 @@ err_input(perrdetail *err)
 {
    PyObject *v, *w, *errtype, *errtext;
    PyObject *msg_obj = NULL;
-    PyObject *filename;
    char *msg = NULL;
    errtype = PyExc_SyntaxError;
@ -2075,17 +2092,8 @@ err_input(perrdetail *err)
        errtext = PyUnicode_DecodeUTF8(err->text, strlen(err->text),
                                       "replace");
    }
-    if (err->filename != NULL)
+    v = Py_BuildValue("(OiiN)", err->filename,
-        filename = PyUnicode_DecodeFSDefault(err->filename);
+                      err->lineno, err->offset, errtext);
-    else {
-        Py_INCREF(Py_None);
-        filename = Py_None;
-    }
-    if (filename != NULL)
-        v = Py_BuildValue("(NiiN)", filename,
-                          err->lineno, err->offset, errtext);
-    else
-        v = NULL;
    if (v != NULL) {
        if (msg_obj)
            w = Py_BuildValue("(OO)", msg_obj, v);