From 259314622750c72de2ef377e77a0b70b8d8b2fb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Mon, 17 Mar 2008 20:43:42 +0000 Subject: [PATCH] Bug #2301: Don't try decoding the source code into the original encoding for syntax errors. --- Lib/test/test_pep263.py | 7 +++++ Misc/NEWS | 6 ++++ Parser/parsetok.c | 15 ++++------ Parser/tokenizer.c | 64 ----------------------------------------- 4 files changed, 18 insertions(+), 74 deletions(-) diff --git a/Lib/test/test_pep263.py b/Lib/test/test_pep263.py index cc126ba687c..92065c9fe15 100644 --- a/Lib/test/test_pep263.py +++ b/Lib/test/test_pep263.py @@ -23,6 +23,13 @@ class PEP263Test(unittest.TestCase): exec(c, d) self.assertEqual(d['u'], '\xf3') + def test_issue2301(self): + try: + compile(b"# coding: cp932\nprint '\x94\x4e'", "dummy", "exec") + except SyntaxError as v: + self.assertEquals(v.text, "print '\u5e74'") + else: + self.fail() def test_main(): test_support.run_unittest(PEP263Test) diff --git a/Misc/NEWS b/Misc/NEWS index 16652569c56..6c38150fee8 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -9,6 +9,12 @@ What's New in Python 3.0a4? *Release date: XX-XXX-2008* +Core and Builtins +----------------- + +- Bug #2301: Don't try decoding the source code into the original + encoding for syntax errors. + Extension Modules ----------------- diff --git a/Parser/parsetok.c b/Parser/parsetok.c index 0b3314ec92a..708c26df22f 100644 --- a/Parser/parsetok.c +++ b/Parser/parsetok.c @@ -213,21 +213,16 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, err_ret->error = E_EOF; err_ret->lineno = tok->lineno; if (tok->buf != NULL) { - char *text = NULL; size_t len; assert(tok->cur - tok->buf < INT_MAX); err_ret->offset = (int)(tok->cur - tok->buf); len = tok->inp - tok->buf; - text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset); - if (text == NULL) { - text = (char *) PyObject_MALLOC(len + 1); - if (text != NULL) { - if (len > 0) - strncpy(text, tok->buf, len); - text[len] = '\0'; - } + err_ret->text = (char *) PyObject_MALLOC(len + 1); + if (err_ret->text != NULL) { + if (len > 0) + strncpy(err_ret->text, tok->buf, len); + err_ret->text[len] = '\0'; } - err_ret->text = text; } } else if (tok->encoding != NULL) { node* r = PyNode_New(encoding_decl); diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 2833e532f7f..0b8341a0a27 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1579,70 +1579,6 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) return result; } -/* This function is only called from parsetok. However, it cannot live - there, as it must be empty for PGEN, and we can check for PGEN only - in this file. */ - -#ifdef PGEN -char* -PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset) -{ - return NULL; -} -#else -static PyObject * -dec_utf8(const char *enc, const char *text, size_t len) { - PyObject *ret = NULL; - PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace"); - if (unicode_text) { - ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace"); - Py_DECREF(unicode_text); - } - if (!ret) { - PyErr_Clear(); - } - else { - assert(PyString_Check(ret)); - } - return ret; -} - -char * -PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset) -{ - char *text = NULL; - if (tok->encoding) { - /* convert source to original encondig */ - PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len); - if (lineobj != NULL) { - int linelen = PyString_GET_SIZE(lineobj); - const char *line = PyString_AS_STRING(lineobj); - text = PyObject_MALLOC(linelen + 1); - if (text != NULL && line != NULL) { - if (linelen) - strncpy(text, line, linelen); - text[linelen] = '\0'; - } - Py_DECREF(lineobj); - - /* adjust error offset */ - if (*offset > 1) { - PyObject *offsetobj = dec_utf8(tok->encoding, - tok->buf, - *offset-1); - if (offsetobj) { - *offset = 1 + Py_SIZE(offsetobj); - Py_DECREF(offsetobj); - } - } - - } - } - return text; - -} -#endif - /* Get -*- encoding -*- from a Python file. PyTokenizer_FindEncoding returns NULL when it can't find the encoding in