Implement PEP 3131. Add isidentifier to str.
This commit is contained in:
parent
32c4ac0143
commit
47383403a0
|
@ -653,6 +653,11 @@ is at least one character, false otherwise.
|
|||
For 8-bit strings, this method is locale-dependent.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}[str]{isidentifier}{}
|
||||
Return True if S is a valid identifier according
|
||||
to the language definition.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}[str]{islower}{}
|
||||
Return true if all cased characters in the string are lowercase and
|
||||
there is at least one cased character, false otherwise.
|
||||
|
|
|
@ -29,6 +29,7 @@ extern "C" {
|
|||
#define E_EOFS 23 /* EOF in triple-quoted string */
|
||||
#define E_EOLS 24 /* EOL in single-quoted string */
|
||||
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
|
||||
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -182,6 +182,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
|
||||
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
|
||||
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
|
||||
# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
|
||||
# define PyUnicode_Join PyUnicodeUCS2_Join
|
||||
# define PyUnicode_Partition PyUnicodeUCS2_Partition
|
||||
# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
|
||||
|
@ -268,6 +269,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
|
||||
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
|
||||
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
|
||||
# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
|
||||
# define PyUnicode_Join PyUnicodeUCS4_Join
|
||||
# define PyUnicode_Partition PyUnicodeUCS4_Partition
|
||||
# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
|
||||
|
@ -1250,6 +1252,10 @@ PyAPI_FUNC(int) PyUnicode_Contains(
|
|||
PyObject *element /* Element string */
|
||||
);
|
||||
|
||||
/* Checks whether argument is a valid identifier. */
|
||||
|
||||
PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
|
||||
|
||||
/* Externally visible for str.strip(unicode) */
|
||||
PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
|
||||
PyUnicodeObject *self,
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
€ = 2
|
|
@ -0,0 +1,29 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import unittest
|
||||
from test import test_support
|
||||
|
||||
class PEP3131Test(unittest.TestCase):
|
||||
|
||||
def test_valid(self):
|
||||
class T:
|
||||
ä = 1
|
||||
µ = 2 # this is a compatibility character
|
||||
蟒 = 3
|
||||
self.assertEquals(getattr(T, "\xe4"), 1)
|
||||
self.assertEquals(getattr(T, "\u03bc"), 2)
|
||||
self.assertEquals(getattr(T, '\u87d2'), 3)
|
||||
|
||||
def test_invalid(self):
|
||||
try:
|
||||
from test import badsyntax_3131
|
||||
except SyntaxError as s:
|
||||
self.assertEquals(str(s),
|
||||
"invalid character in identifier (badsyntax_3131.py, line 2)")
|
||||
else:
|
||||
self.fail("expected exception didn't occur")
|
||||
|
||||
def test_main():
|
||||
test_support.run_unittest(PEP3131Test)
|
||||
|
||||
if __name__=="__main__":
|
||||
test_main()
|
|
@ -313,6 +313,19 @@ class UnicodeTest(
|
|||
|
||||
self.assertRaises(TypeError, "abc".isnumeric, 42)
|
||||
|
||||
def test_isidentifier(self):
|
||||
self.assertTrue("a".isidentifier())
|
||||
self.assertTrue("Z".isidentifier())
|
||||
self.assertTrue("_".isidentifier())
|
||||
self.assertTrue("b0".isidentifier())
|
||||
self.assertTrue("bc".isidentifier())
|
||||
self.assertTrue("b_".isidentifier())
|
||||
self.assertTrue("µ".isidentifier())
|
||||
|
||||
self.assertFalse(" ".isidentifier())
|
||||
self.assertFalse("[".isidentifier())
|
||||
self.assertFalse("©".isidentifier())
|
||||
|
||||
def test_contains(self):
|
||||
# Testing Unicode contains method
|
||||
self.assert_('a' in 'abdb')
|
||||
|
|
|
@ -26,6 +26,8 @@ TO DO
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- PEP 3131: Support non-ASCII identifiers.
|
||||
|
||||
- PEP 3120: Change default encoding to UTF-8.
|
||||
|
||||
- PEP 3123: Use proper C inheritance for PyObject.
|
||||
|
|
|
@ -227,7 +227,8 @@ int unicode_resize(register PyUnicodeObject *unicode,
|
|||
}
|
||||
|
||||
/* We allocate one more byte to make sure the string is
|
||||
Ux0000 terminated -- XXX is this needed ?
|
||||
Ux0000 terminated; some code (e.g. new_identifier)
|
||||
relies on that.
|
||||
|
||||
XXX This allocator could further be enhanced by assuring that the
|
||||
free list never reduces its size below 1.
|
||||
|
@ -6679,6 +6680,47 @@ unicode_isnumeric(PyUnicodeObject *self)
|
|||
return PyBool_FromLong(1);
|
||||
}
|
||||
|
||||
int
|
||||
PyUnicode_IsIdentifier(PyObject *self)
|
||||
{
|
||||
register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
|
||||
register const Py_UNICODE *e;
|
||||
|
||||
/* Special case for empty strings */
|
||||
if (PyUnicode_GET_SIZE(self) == 0)
|
||||
return 0;
|
||||
|
||||
/* PEP 3131 says that the first character must be in
|
||||
XID_Start and subsequent characters in XID_Continue,
|
||||
and for the ASCII range, the 2.x rules apply (i.e
|
||||
start with letters and underscore, continue with
|
||||
letters, digits, underscore). However, given the current
|
||||
definition of XID_Start and XID_Continue, it is sufficient
|
||||
to check just for these, except that _ must be allowed
|
||||
as starting an identifier. */
|
||||
if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
|
||||
return 0;
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (p++; p < e; p++) {
|
||||
if (!_PyUnicode_IsXidContinue(*p))
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(isidentifier__doc__,
|
||||
"S.isidentifier() -> bool\n\
|
||||
\n\
|
||||
Return True if S is a valid identifier according\n\
|
||||
to the language definition.");
|
||||
|
||||
static PyObject*
unicode_isidentifier(PyObject *self)
{
    /* Method wrapper: turn the int result of PyUnicode_IsIdentifier()
       into a Python bool object. */
    int valid = PyUnicode_IsIdentifier(self);

    return PyBool_FromLong(valid);
}
|
||||
|
||||
PyDoc_STRVAR(join__doc__,
|
||||
"S.join(sequence) -> unicode\n\
|
||||
\n\
|
||||
|
@ -7714,6 +7756,7 @@ static PyMethodDef unicode_methods[] = {
|
|||
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
|
||||
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
|
||||
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
|
||||
{"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
|
||||
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
|
||||
#if 0
|
||||
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
|
||||
|
|
|
@ -21,13 +21,15 @@
|
|||
#define is_potential_identifier_start(c) (\
|
||||
(c >= 'a' && c <= 'z')\
|
||||
|| (c >= 'A' && c <= 'Z')\
|
||||
|| c == '_')
|
||||
|| c == '_'\
|
||||
|| (c >= 128))
|
||||
|
||||
#define is_potential_identifier_char(c) (\
|
||||
(c >= 'a' && c <= 'z')\
|
||||
|| (c >= 'A' && c <= 'Z')\
|
||||
|| (c >= '0' && c <= '9')\
|
||||
|| c == '_')
|
||||
|| c == '_'\
|
||||
|| (c >= 128))
|
||||
|
||||
extern char *PyOS_Readline(FILE *, FILE *, char *);
|
||||
/* Return malloc'ed string including trailing \n;
|
||||
|
@ -1070,6 +1072,19 @@ indenterror(struct tok_state *tok)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef PGEN
|
||||
#define verify_identifier(s,e) 1
|
||||
#else
|
||||
/* Verify that the identifier follows PEP 3131. */
|
||||
static int
|
||||
verify_identifier(char *start, char *end)
|
||||
{
|
||||
PyObject *s = PyUnicode_DecodeUTF8(start, end-start, NULL);
|
||||
int result = PyUnicode_IsIdentifier(s);
|
||||
Py_DECREF(s);
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Get next token, after space stripping etc. */
|
||||
|
||||
|
@ -1077,7 +1092,7 @@ static int
|
|||
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
|
||||
{
|
||||
register int c;
|
||||
int blankline;
|
||||
int blankline, nonascii;
|
||||
|
||||
*p_start = *p_end = NULL;
|
||||
nextline:
|
||||
|
@ -1195,6 +1210,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
|
|||
}
|
||||
|
||||
/* Identifier (most frequent token!) */
|
||||
nonascii = 0;
|
||||
if (is_potential_identifier_start(c)) {
|
||||
/* Process r"", u"" and ur"" */
|
||||
switch (c) {
|
||||
|
@ -1214,9 +1230,16 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
|
|||
break;
|
||||
}
|
||||
while (is_potential_identifier_char(c)) {
|
||||
if (c >= 128)
|
||||
nonascii = 1;
|
||||
c = tok_nextc(tok);
|
||||
}
|
||||
tok_backup(tok, c);
|
||||
if (nonascii &&
|
||||
!verify_identifier(tok->start, tok->cur)) {
|
||||
tok->done = E_IDENTIFIER;
|
||||
return ERRORTOKEN;
|
||||
}
|
||||
*p_start = tok->start;
|
||||
*p_end = tok->cur;
|
||||
return NAME;
|
||||
|
|
21
Python/ast.c
21
Python/ast.c
|
@ -47,8 +47,27 @@ static PyObject *parsestrplus(struct compiling *, const node *n,
|
|||
#define COMP_SETCOMP 2
|
||||
|
||||
static identifier
|
||||
new_identifier(const char* n, PyArena *arena) {
|
||||
new_identifier(const char* n, PyArena *arena)
|
||||
{
|
||||
PyObject* id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
|
||||
Py_UNICODE *u = PyUnicode_AS_UNICODE(id);
|
||||
/* Check whether there are non-ASCII characters in the
|
||||
identifier; if so, normalize to NFKC. */
|
||||
for (; *u; u++) {
|
||||
if (*u >= 128) {
|
||||
PyObject *m = PyImport_ImportModule("unicodedata");
|
||||
PyObject *id2;
|
||||
if (!m)
|
||||
return NULL;
|
||||
id2 = PyObject_CallMethod(m, "normalize", "sO", "NFKC", id);
|
||||
Py_DECREF(m);
|
||||
if (!id2)
|
||||
return NULL;
|
||||
Py_DECREF(id);
|
||||
id = id2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
PyUnicode_InternInPlace(&id);
|
||||
PyArena_AddPyObject(arena, id);
|
||||
return id;
|
||||
|
|
|
@ -1530,6 +1530,10 @@ err_input(perrdetail *err)
|
|||
case E_LINECONT:
|
||||
msg = "unexpected character after line continuation character";
|
||||
break;
|
||||
|
||||
case E_IDENTIFIER:
|
||||
msg = "invalid character in identifier";
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "error=%d\n", err->error);
|
||||
msg = "unknown parsing error";
|
||||
|
|
Loading…
Reference in New Issue