Patch #1031213: Decode source line in SyntaxErrors back to its original

source encoding.
This commit is contained in:
Martin v. Löwis 2007-09-04 14:20:25 +00:00
parent cf754ba39f
commit fc787d515d
5 changed files with 106 additions and 5 deletions

View File

@ -155,6 +155,32 @@ class CompilerTest(unittest.TestCase):
self.assertEquals(dct.get('result'), 1)
def _testErrEnc(self, src, text, offset):
try:
compile(src, "", "exec")
except SyntaxError, e:
self.assertEquals(e.offset, offset)
self.assertEquals(e.text, text)
def testSourceCodeEncodingsError(self):
    # SyntaxError reporting with and without an encoding declaration.
    # Every statement below ends in an unterminated string literal.
    coding_line = "#! -*- coding: ShiftJIS -*-\n"
    ascii_src = "print '12345678', '\n"
    sjis_src = "print '\x83\x70\x83\x43\x83\x5c\x83\x93', '\n"
    cases = [
        # (source handed to compile(), line expected in the error)
        (ascii_src, ascii_src),                # ascii, no declaration
        (coding_line + ascii_src, ascii_src),  # ascii, with declaration
        (coding_line + sjis_src, sjis_src),    # ShiftJIS, with declaration
        (sjis_src, sjis_src),                  # ShiftJIS, no declaration
    ]
    for source, expected_text in cases:
        self._testErrEnc(source, expected_text, 19)
NOLINENO = (compiler.ast.Module, compiler.ast.Stmt, compiler.ast.Discard)
###############################################################################

View File

@ -12,6 +12,9 @@ What's New in Python 2.5.2c1?
Core and builtins
-----------------
- Patch #1031213: Decode source line in SyntaxErrors back to its original source
encoding.
- Patch #1673759: add a missing overflow check when formatting floats
with %G.

View File

@ -216,16 +216,24 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
err_ret->error = E_EOF;
err_ret->lineno = tok->lineno;
if (tok->buf != NULL) {
char *text = NULL;
size_t len;
assert(tok->cur - tok->buf < INT_MAX);
err_ret->offset = (int)(tok->cur - tok->buf);
len = tok->inp - tok->buf;
err_ret->text = (char *) PyObject_MALLOC(len + 1);
if (err_ret->text != NULL) {
if (len > 0)
strncpy(err_ret->text, tok->buf, len);
err_ret->text[len] = '\0';
#ifdef Py_USING_UNICODE
text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
#endif
if (text == NULL) {
text = (char *) PyObject_MALLOC(len + 1);
if (text != NULL) {
if (len > 0)
strncpy(text, tok->buf, len);
text[len] = '\0';
}
}
err_ret->text = text;
}
} else if (tok->encoding != NULL) {
node* r = PyNode_New(encoding_decl);

View File

@ -1522,6 +1522,68 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
return result;
}
/* This function is only called from parsetok. However, it cannot live
there, as it must be empty for PGEN, and we can check for PGEN only
in this file. */
#ifdef PGEN
/* PGEN variant: performs no conversion and always reports "nothing
   restored" by returning NULL; none of the arguments are examined. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
#else
/* Re-encode the first LEN bytes of the UTF-8 buffer TEXT into the
   encoding named ENC.

   Both the UTF-8 decoding step and the encoding step use the "replace"
   error handler.  Returns the encoded string object (the caller must
   Py_DECREF it), or NULL if either step failed.

   Failure is not an error for the callers: they fall back to the
   unconverted text.  The pending exception is therefore discarded
   silently instead of being printed to stderr. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        /* Best-effort helper: drop the exception, do not report it. */
        PyErr_Clear();
    }
    return ret;
}
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
char *text = NULL;
if (tok->encoding) {
/* convert source to original encondig */
PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
if (lineobj != NULL) {
int linelen = PyString_Size(lineobj);
const char *line = PyString_AsString(lineobj);
text = PyObject_MALLOC(linelen + 1);
if (text != NULL && line != NULL) {
if (linelen)
strncpy(text, line, linelen);
text[linelen] = '\0';
}
Py_DECREF(lineobj);
/* adjust error offset */
if (*offset > 1) {
PyObject *offsetobj = dec_utf8(tok->encoding,
tok->buf, *offset-1);
if (offsetobj) {
*offset = PyString_Size(offsetobj) + 1;
Py_DECREF(offsetobj);
}
}
}
}
return text;
}
#endif
#ifdef Py_DEBUG
void

View File

@ -58,6 +58,8 @@ extern struct tok_state *PyTokenizer_FromString(const char *);
extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
extern void PyTokenizer_Free(struct tok_state *);
extern int PyTokenizer_Get(struct tok_state *, char **, char **);
/* Re-encode the first `len` bytes of the tokenizer buffer from UTF-8 into
   the encoding recorded on `tok`.  Returns a NUL-terminated buffer
   allocated with PyObject_MALLOC, or NULL when no encoding is recorded or
   the conversion fails (always NULL in PGEN builds).  `*offset` may be
   rewritten to the corresponding position in the re-encoded text. */
extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
int len, int *offset);
#ifdef __cplusplus
}