#2630: Implement PEP 3138.
The repr() of a string now contains printable Unicode characters unescaped. The new ascii() builtin can be used to get a repr() with only ASCII characters in it. PEP and patch were written by Atsuo Ishimoto.
This commit is contained in:
parent
ea6d58d9d3
commit
559e5d7f4d
|
@ -116,8 +116,18 @@ Object Protocol
|
||||||
|
|
||||||
Compute a string representation of object *o*. Returns the string
|
Compute a string representation of object *o*. Returns the string
|
||||||
representation on success, *NULL* on failure. This is the equivalent of the
|
representation on success, *NULL* on failure. This is the equivalent of the
|
||||||
Python expression ``repr(o)``. Called by the :func:`repr` built-in function and
|
Python expression ``repr(o)``. Called by the :func:`repr` built-in function.
|
||||||
by reverse quotes.
|
|
||||||
|
|
||||||
|
.. cfunction:: PyObject* PyObject_ASCII(PyObject *o)
|
||||||
|
|
||||||
|
.. index:: builtin: ascii
|
||||||
|
|
||||||
|
As :cfunc:`PyObject_Repr`, compute a string representation of object *o*, but
|
||||||
|
escape the non-ASCII characters in the string returned by
|
||||||
|
:cfunc:`PyObject_Repr` with ``\x``, ``\u`` or ``\U`` escapes. This generates
|
||||||
|
a string similar to that returned by :cfunc:`PyObject_Repr` in Python 2.
|
||||||
|
Called by the :func:`ascii` built-in function.
|
||||||
|
|
||||||
|
|
||||||
.. cfunction:: PyObject* PyObject_Str(PyObject *o)
|
.. cfunction:: PyObject* PyObject_Str(PyObject *o)
|
||||||
|
|
|
@ -144,6 +144,18 @@ the Python configuration.
|
||||||
|
|
||||||
Return 1 or 0 depending on whether *ch* is an alphanumeric character.
|
Return 1 or 0 depending on whether *ch* is an alphanumeric character.
|
||||||
|
|
||||||
|
|
||||||
|
.. cfunction:: int Py_UNICODE_ISPRINTABLE(Py_UNICODE ch)
|
||||||
|
|
||||||
|
Return 1 or 0 depending on whether *ch* is a printable character.
|
||||||
|
Nonprintable characters are those characters defined in the Unicode character
|
||||||
|
database as "Other" or "Separator", excepting the ASCII space (0x20) which is
|
||||||
|
considered printable. (Note that printable characters in this context are
|
||||||
|
those which should not be escaped when :func:`repr` is invoked on a string.
|
||||||
|
It has no bearing on the handling of strings written to :data:`sys.stdout` or
|
||||||
|
:data:`sys.stderr`.)
|
||||||
|
|
||||||
|
|
||||||
These APIs can be used for fast direct character conversions:
|
These APIs can be used for fast direct character conversions:
|
||||||
|
|
||||||
|
|
||||||
|
@ -266,6 +278,9 @@ APIs:
|
||||||
| | | of what the platform's |
|
| | | of what the platform's |
|
||||||
| | | ``printf`` yields. |
|
| | | ``printf`` yields. |
|
||||||
+-------------------+---------------------+--------------------------------+
|
+-------------------+---------------------+--------------------------------+
|
||||||
|
| :attr:`%A` | PyObject\* | The result of calling |
|
||||||
|
| | | :func:`ascii`. |
|
||||||
|
+-------------------+---------------------+--------------------------------+
|
||||||
| :attr:`%U` | PyObject\* | A unicode object. |
|
| :attr:`%U` | PyObject\* | A unicode object. |
|
||||||
+-------------------+---------------------+--------------------------------+
|
+-------------------+---------------------+--------------------------------+
|
||||||
| :attr:`%V` | PyObject\*, char \* | A unicode object (which may be |
|
| :attr:`%V` | PyObject\*, char \* | A unicode object (which may be |
|
||||||
|
|
|
@ -91,6 +91,14 @@ are always available. They are listed here in alphabetical order.
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
.. function:: ascii(object)
|
||||||
|
|
||||||
|
As :func:`repr`, return a string containing a printable representation of an
|
||||||
|
object, but escape the non-ASCII characters in the string returned by
|
||||||
|
:func:`repr` using ``\x``, ``\u`` or ``\U`` escapes. This generates a string
|
||||||
|
similar to that returned by :func:`repr` in Python 2.
|
||||||
|
|
||||||
|
|
||||||
.. function:: bin(x)
|
.. function:: bin(x)
|
||||||
|
|
||||||
Convert an integer number to a binary string. The result is a valid Python
|
Convert an integer number to a binary string. The result is a valid Python
|
||||||
|
|
|
@ -774,6 +774,17 @@ functions based on regular expressions.
|
||||||
least one cased character, false otherwise.
|
least one cased character, false otherwise.
|
||||||
|
|
||||||
|
|
||||||
|
.. method:: str.isprintable()
|
||||||
|
|
||||||
|
Return true if all characters in the string are printable or the string is
|
||||||
|
empty, false otherwise. Nonprintable characters are those characters defined
|
||||||
|
in the Unicode character database as "Other" or "Separator", excepting the
|
||||||
|
ASCII space (0x20) which is considered printable. (Note that printable
|
||||||
|
characters in this context are those which should not be escaped when
|
||||||
|
:func:`repr` is invoked on a string. It has no bearing on the handling of
|
||||||
|
strings written to :data:`sys.stdout` or :data:`sys.stderr`.)
|
||||||
|
|
||||||
|
|
||||||
.. method:: str.isspace()
|
.. method:: str.isspace()
|
||||||
|
|
||||||
Return true if there are only whitespace characters in the string and there is
|
Return true if there are only whitespace characters in the string and there is
|
||||||
|
|
|
@ -229,8 +229,9 @@ as a string, overriding its own definition of formatting. By converting the
|
||||||
value to a string before calling :meth:`__format__`, the normal formatting logic
|
value to a string before calling :meth:`__format__`, the normal formatting logic
|
||||||
is bypassed.
|
is bypassed.
|
||||||
|
|
||||||
Two conversion flags are currently supported: ``'!s'`` which calls :func:`str`
|
Three conversion flags are currently supported: ``'!s'`` which calls :func:`str`
|
||||||
on the value, and ``'!r'`` which calls :func:`repr`.
|
on the value, ``'!r'`` which calls :func:`repr` and ``'!a'`` which calls
|
||||||
|
:func:`ascii`.
|
||||||
|
|
||||||
Some examples::
|
Some examples::
|
||||||
|
|
||||||
|
|
|
@ -425,6 +425,9 @@ These environment variables influence Python's behavior.
|
||||||
``encodingname:errorhandler``. The ``:errorhandler`` part is optional and
|
``encodingname:errorhandler``. The ``:errorhandler`` part is optional and
|
||||||
has the same meaning as in :func:`str.encode`.
|
has the same meaning as in :func:`str.encode`.
|
||||||
|
|
||||||
|
For stderr, the ``:errorhandler`` part is ignored; the handler will always be
|
||||||
|
``'backslashreplace'``.
|
||||||
|
|
||||||
|
|
||||||
.. envvar:: PYTHONNOUSERSITE
|
.. envvar:: PYTHONNOUSERSITE
|
||||||
|
|
||||||
|
|
|
@ -425,6 +425,7 @@ PyAPI_FUNC(void) _Py_BreakPoint(void);
|
||||||
PyAPI_FUNC(void) _PyObject_Dump(PyObject *);
|
PyAPI_FUNC(void) _PyObject_Dump(PyObject *);
|
||||||
PyAPI_FUNC(PyObject *) PyObject_Repr(PyObject *);
|
PyAPI_FUNC(PyObject *) PyObject_Repr(PyObject *);
|
||||||
PyAPI_FUNC(PyObject *) PyObject_Str(PyObject *);
|
PyAPI_FUNC(PyObject *) PyObject_Str(PyObject *);
|
||||||
|
PyAPI_FUNC(PyObject *) PyObject_ASCII(PyObject *);
|
||||||
PyAPI_FUNC(int) PyObject_Compare(PyObject *, PyObject *);
|
PyAPI_FUNC(int) PyObject_Compare(PyObject *, PyObject *);
|
||||||
PyAPI_FUNC(PyObject *) PyObject_RichCompare(PyObject *, PyObject *, int);
|
PyAPI_FUNC(PyObject *) PyObject_RichCompare(PyObject *, PyObject *, int);
|
||||||
PyAPI_FUNC(int) PyObject_RichCompareBool(PyObject *, PyObject *, int);
|
PyAPI_FUNC(int) PyObject_RichCompareBool(PyObject *, PyObject *, int);
|
||||||
|
|
|
@ -220,6 +220,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
|
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
|
||||||
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
|
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
|
||||||
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
|
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
|
||||||
|
# define _PyUnicode_IsPrintable _PyUnicodeUCS2_IsPrintable
|
||||||
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
|
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
|
||||||
# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
|
# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
|
||||||
# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
|
# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
|
||||||
|
@ -317,6 +318,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
|
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
|
||||||
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
|
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
|
||||||
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
|
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
|
||||||
|
# define _PyUnicode_IsPrintable _PyUnicodeUCS4_IsPrintable
|
||||||
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
|
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
|
||||||
# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
|
# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
|
||||||
# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
|
# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
|
||||||
|
@ -357,6 +359,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
|
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
|
||||||
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
|
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
|
||||||
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
|
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
|
||||||
|
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
|
||||||
|
|
||||||
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
|
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
|
||||||
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
|
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
|
||||||
|
@ -387,6 +390,7 @@ extern const unsigned char _Py_ascii_whitespace[];
|
||||||
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
|
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
|
||||||
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
|
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
|
||||||
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
|
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
|
||||||
|
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
|
||||||
|
|
||||||
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
|
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
|
||||||
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
|
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
|
||||||
|
@ -1533,6 +1537,10 @@ PyAPI_FUNC(int) _PyUnicode_IsNumeric(
|
||||||
Py_UNICODE ch /* Unicode character */
|
Py_UNICODE ch /* Unicode character */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_IsPrintable(
|
||||||
|
Py_UNICODE ch /* Unicode character */
|
||||||
|
);
|
||||||
|
|
||||||
PyAPI_FUNC(int) _PyUnicode_IsAlpha(
|
PyAPI_FUNC(int) _PyUnicode_IsAlpha(
|
||||||
Py_UNICODE ch /* Unicode character */
|
Py_UNICODE ch /* Unicode character */
|
||||||
);
|
);
|
||||||
|
|
|
@ -1440,6 +1440,12 @@ class OutputChecker:
|
||||||
and returns true if they match; and `output_difference`, which
|
and returns true if they match; and `output_difference`, which
|
||||||
returns a string describing the differences between two outputs.
|
returns a string describing the differences between two outputs.
|
||||||
"""
|
"""
|
||||||
|
def _toAscii(self, s):
|
||||||
|
"""
|
||||||
|
Convert string to hex-escaped ASCII string.
|
||||||
|
"""
|
||||||
|
return str(s.encode('ASCII', 'backslashreplace'), "ASCII")
|
||||||
|
|
||||||
def check_output(self, want, got, optionflags):
|
def check_output(self, want, got, optionflags):
|
||||||
"""
|
"""
|
||||||
Return True iff the actual output from an example (`got`)
|
Return True iff the actual output from an example (`got`)
|
||||||
|
@ -1450,6 +1456,15 @@ class OutputChecker:
|
||||||
documentation for `TestRunner` for more information about
|
documentation for `TestRunner` for more information about
|
||||||
option flags.
|
option flags.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# If `want` contains a hex-escaped character such as "\u1234",
|
||||||
|
# then `want` is a string of six characters (e.g. [\,u,1,2,3,4]).
|
||||||
|
# On the other hand, `got` could be another sequence of
|
||||||
|
# characters such as [\u1234], so `want` and `got` should
|
||||||
|
# be folded to hex-escaped ASCII strings before comparing.
|
||||||
|
got = self._toAscii(got)
|
||||||
|
want = self._toAscii(want)
|
||||||
|
|
||||||
# Handle the common case first, for efficiency:
|
# Handle the common case first, for efficiency:
|
||||||
# if they're string-identical, always return true.
|
# if they're string-identical, always return true.
|
||||||
if got == want:
|
if got == want:
|
||||||
|
|
|
@ -768,7 +768,7 @@ class UnicodeTest(StringTest):
|
||||||
a = array.array('u', s)
|
a = array.array('u', s)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
repr(a),
|
repr(a),
|
||||||
"array('u', '\\x00=\"\\'a\\\\b\\x80\\xff\\x00\\x01\\u1234')")
|
"array('u', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
|
||||||
|
|
||||||
self.assertRaises(TypeError, a.fromunicode)
|
self.assertRaises(TypeError, a.fromunicode)
|
||||||
|
|
||||||
|
|
|
@ -159,6 +159,20 @@ class BuiltinTest(unittest.TestCase):
|
||||||
S = [10, 20, 30]
|
S = [10, 20, 30]
|
||||||
self.assertEqual(any(x > 42 for x in S), False)
|
self.assertEqual(any(x > 42 for x in S), False)
|
||||||
|
|
||||||
|
def test_ascii(self):
|
||||||
|
self.assertEqual(ascii(''), '\'\'')
|
||||||
|
self.assertEqual(ascii(0), '0')
|
||||||
|
self.assertEqual(ascii(0), '0')
|
||||||
|
self.assertEqual(ascii(()), '()')
|
||||||
|
self.assertEqual(ascii([]), '[]')
|
||||||
|
self.assertEqual(ascii({}), '{}')
|
||||||
|
a = []
|
||||||
|
a.append(a)
|
||||||
|
self.assertEqual(ascii(a), '[[...]]')
|
||||||
|
a = {}
|
||||||
|
a[0] = a
|
||||||
|
self.assertEqual(ascii(a), '{0: {...}}')
|
||||||
|
|
||||||
def test_neg(self):
|
def test_neg(self):
|
||||||
x = -sys.maxsize-1
|
x = -sys.maxsize-1
|
||||||
self.assert_(isinstance(x, int))
|
self.assert_(isinstance(x, int))
|
||||||
|
|
|
@ -216,6 +216,8 @@ class FormatTest(unittest.TestCase):
|
||||||
testformat("%o", 0o42, "42")
|
testformat("%o", 0o42, "42")
|
||||||
testformat("%o", -0o42, "-42")
|
testformat("%o", -0o42, "-42")
|
||||||
testformat("%o", float(0o42), "42")
|
testformat("%o", float(0o42), "42")
|
||||||
|
testformat("%r", "\u0370", "'\u0370'")
|
||||||
|
testformat("%a", "\u0370", "'\\u0370'")
|
||||||
# Test exception for unknown format characters
|
# Test exception for unknown format characters
|
||||||
if verbose:
|
if verbose:
|
||||||
print('Testing exceptions')
|
print('Testing exceptions')
|
||||||
|
@ -235,8 +237,8 @@ class FormatTest(unittest.TestCase):
|
||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
raise TestFailed('did not get expected exception: %s' % excmsg)
|
raise TestFailed('did not get expected exception: %s' % excmsg)
|
||||||
test_exc('abc %a', 1, ValueError,
|
test_exc('abc %b', 1, ValueError,
|
||||||
"unsupported format character 'a' (0x61) at index 5")
|
"unsupported format character 'b' (0x62) at index 5")
|
||||||
#test_exc(unicode('abc %\u3000','raw-unicode-escape'), 1, ValueError,
|
#test_exc(unicode('abc %\u3000','raw-unicode-escape'), 1, ValueError,
|
||||||
# "unsupported format character '?' (0x3000) at index 5")
|
# "unsupported format character '?' (0x3000) at index 5")
|
||||||
test_exc('%d', '1', TypeError, "%d format: a number is required, not str")
|
test_exc('%d', '1', TypeError, "%d format: a number is required, not str")
|
||||||
|
|
|
@ -131,7 +131,7 @@ class ParseTest(unittest.TestCase):
|
||||||
self.assertEquals(op[1], "Comment: ' comment data '")
|
self.assertEquals(op[1], "Comment: ' comment data '")
|
||||||
self.assertEquals(op[2], "Notation declared: ('notation', None, 'notation.jpeg', None)")
|
self.assertEquals(op[2], "Notation declared: ('notation', None, 'notation.jpeg', None)")
|
||||||
self.assertEquals(op[3], "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')")
|
self.assertEquals(op[3], "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')")
|
||||||
self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\u1f40'}")
|
self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\u1f40'}")
|
||||||
self.assertEquals(op[5], "NS decl: 'myns' 'http://www.python.org/namespace'")
|
self.assertEquals(op[5], "NS decl: 'myns' 'http://www.python.org/namespace'")
|
||||||
self.assertEquals(op[6], "Start element: 'http://www.python.org/namespace!subelement' {}")
|
self.assertEquals(op[6], "Start element: 'http://www.python.org/namespace!subelement' {}")
|
||||||
self.assertEquals(op[7], "Character data: 'Contents of subelements'")
|
self.assertEquals(op[7], "Character data: 'Contents of subelements'")
|
||||||
|
|
|
@ -71,6 +71,48 @@ class UnicodeTest(
|
||||||
# raw strings should not have unicode escapes
|
# raw strings should not have unicode escapes
|
||||||
self.assertNotEquals(r"\u0020", " ")
|
self.assertNotEquals(r"\u0020", " ")
|
||||||
|
|
||||||
|
def test_ascii(self):
|
||||||
|
if not sys.platform.startswith('java'):
|
||||||
|
# Test basic sanity of ascii()
|
||||||
|
self.assertEqual(ascii('abc'), "'abc'")
|
||||||
|
self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
|
||||||
|
self.assertEqual(ascii('ab\\'), "'ab\\\\'")
|
||||||
|
self.assertEqual(ascii('\\c'), "'\\\\c'")
|
||||||
|
self.assertEqual(ascii('\\'), "'\\\\'")
|
||||||
|
self.assertEqual(ascii('\n'), "'\\n'")
|
||||||
|
self.assertEqual(ascii('\r'), "'\\r'")
|
||||||
|
self.assertEqual(ascii('\t'), "'\\t'")
|
||||||
|
self.assertEqual(ascii('\b'), "'\\x08'")
|
||||||
|
self.assertEqual(ascii("'\""), """'\\'"'""")
|
||||||
|
self.assertEqual(ascii("'\""), """'\\'"'""")
|
||||||
|
self.assertEqual(ascii("'"), '''"'"''')
|
||||||
|
self.assertEqual(ascii('"'), """'"'""")
|
||||||
|
latin1repr = (
|
||||||
|
"'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
|
||||||
|
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
|
||||||
|
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
|
||||||
|
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
|
||||||
|
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
|
||||||
|
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
|
||||||
|
"\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
|
||||||
|
"\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
|
||||||
|
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
|
||||||
|
"\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
|
||||||
|
"\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
|
||||||
|
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
|
||||||
|
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
|
||||||
|
"\\xfe\\xff'")
|
||||||
|
testrepr = ascii(''.join(map(chr, range(256))))
|
||||||
|
self.assertEqual(testrepr, latin1repr)
|
||||||
|
# Test ascii works on wide unicode escapes without overflow.
|
||||||
|
self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
|
||||||
|
ascii("\U00010000" * 39 + "\uffff" * 4096))
|
||||||
|
|
||||||
|
class WrongRepr:
|
||||||
|
def __repr__(self):
|
||||||
|
return b'byte-repr'
|
||||||
|
self.assertRaises(TypeError, ascii, WrongRepr())
|
||||||
|
|
||||||
def test_repr(self):
|
def test_repr(self):
|
||||||
if not sys.platform.startswith('java'):
|
if not sys.platform.startswith('java'):
|
||||||
# Test basic sanity of repr()
|
# Test basic sanity of repr()
|
||||||
|
@ -94,20 +136,25 @@ class UnicodeTest(
|
||||||
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
|
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
|
||||||
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
|
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
|
||||||
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
|
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
|
||||||
"\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
|
"\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
|
||||||
"\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
|
"\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
|
||||||
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
|
"\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
|
||||||
"\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
|
"\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
|
||||||
"\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
|
"\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
|
||||||
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
|
"\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
|
||||||
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
|
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
|
||||||
"\\xfe\\xff'")
|
"\xfe\xff'")
|
||||||
testrepr = repr(''.join(map(chr, range(256))))
|
testrepr = repr(''.join(map(chr, range(256))))
|
||||||
self.assertEqual(testrepr, latin1repr)
|
self.assertEqual(testrepr, latin1repr)
|
||||||
# Test repr works on wide unicode escapes without overflow.
|
# Test repr works on wide unicode escapes without overflow.
|
||||||
self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
|
self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
|
||||||
repr("\U00010000" * 39 + "\uffff" * 4096))
|
repr("\U00010000" * 39 + "\uffff" * 4096))
|
||||||
|
|
||||||
|
class WrongRepr:
|
||||||
|
def __repr__(self):
|
||||||
|
return b'byte-repr'
|
||||||
|
self.assertRaises(TypeError, repr, WrongRepr())
|
||||||
|
|
||||||
def test_iterators(self):
|
def test_iterators(self):
|
||||||
# Make sure unicode objects have an __iter__ method
|
# Make sure unicode objects have an __iter__ method
|
||||||
it = "\u1111\u2222\u3333".__iter__()
|
it = "\u1111\u2222\u3333".__iter__()
|
||||||
|
@ -374,6 +421,13 @@ class UnicodeTest(
|
||||||
self.assertFalse("[".isidentifier())
|
self.assertFalse("[".isidentifier())
|
||||||
self.assertFalse("©".isidentifier())
|
self.assertFalse("©".isidentifier())
|
||||||
|
|
||||||
|
def test_isprintable(self):
|
||||||
|
self.assertTrue("".isprintable())
|
||||||
|
self.assertTrue("abcdefg".isprintable())
|
||||||
|
self.assertFalse("abcdefg\n".isprintable())
|
||||||
|
self.assertTrue("\u0370".isprintable())
|
||||||
|
self.assertFalse("\ud800".isprintable())
|
||||||
|
|
||||||
def test_contains(self):
|
def test_contains(self):
|
||||||
# Testing Unicode contains method
|
# Testing Unicode contains method
|
||||||
self.assert_('a' in 'abdb')
|
self.assert_('a' in 'abdb')
|
||||||
|
@ -544,7 +598,7 @@ class UnicodeTest(
|
||||||
# format specifiers for user defined type
|
# format specifiers for user defined type
|
||||||
self.assertEqual('{0:abc}'.format(C()), 'abc')
|
self.assertEqual('{0:abc}'.format(C()), 'abc')
|
||||||
|
|
||||||
# !r and !s coersions
|
# !r, !s and !a coercions
|
||||||
self.assertEqual('{0!s}'.format('Hello'), 'Hello')
|
self.assertEqual('{0!s}'.format('Hello'), 'Hello')
|
||||||
self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
|
self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
|
||||||
self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
|
self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
|
||||||
|
@ -552,6 +606,11 @@ class UnicodeTest(
|
||||||
self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
|
self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
|
||||||
self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
|
self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
|
||||||
self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
|
self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
|
||||||
|
self.assertEqual('{0!r}'.format(F('\u0370')), 'F(\u0370)')
|
||||||
|
self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
|
||||||
|
self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
|
||||||
|
self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
|
||||||
|
self.assertEqual('{0!a}'.format(F('\u0370')), 'F(\\u0370)')
|
||||||
|
|
||||||
# test fallback to object.__format__
|
# test fallback to object.__format__
|
||||||
self.assertEqual('{0}'.format({}), '{}')
|
self.assertEqual('{0}'.format({}), '{}')
|
||||||
|
@ -643,6 +702,8 @@ class UnicodeTest(
|
||||||
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
|
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
|
||||||
if not sys.platform.startswith('java'):
|
if not sys.platform.startswith('java'):
|
||||||
self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
|
self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
|
||||||
|
self.assertEqual("%r" % ("\u1234",), "'\u1234'")
|
||||||
|
self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
|
||||||
self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
|
self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
|
||||||
self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
|
self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,10 @@ What's new in Python 3.0b1?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #2630: implement PEP 3138. repr() now returns printable
|
||||||
|
Unicode characters unescaped, to get an ASCII-only representation
|
||||||
|
of an object use ascii().
|
||||||
|
|
||||||
- Issue #1342: On windows, Python could not start when installed in a
|
- Issue #1342: On windows, Python could not start when installed in a
|
||||||
directory with non-ascii characters.
|
directory with non-ascii characters.
|
||||||
|
|
||||||
|
|
|
@ -425,6 +425,33 @@ PyObject_Str(PyObject *v)
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyObject *
|
||||||
|
PyObject_ASCII(PyObject *v)
|
||||||
|
{
|
||||||
|
PyObject *repr, *ascii, *res;
|
||||||
|
|
||||||
|
repr = PyObject_Repr(v);
|
||||||
|
if (repr == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* repr is guaranteed to be a PyUnicode object by PyObject_Repr */
|
||||||
|
ascii = PyUnicode_EncodeASCII(
|
||||||
|
PyUnicode_AS_UNICODE(repr),
|
||||||
|
PyUnicode_GET_SIZE(repr),
|
||||||
|
"backslashreplace");
|
||||||
|
|
||||||
|
Py_DECREF(repr);
|
||||||
|
if (ascii == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
res = PyUnicode_DecodeASCII(
|
||||||
|
PyBytes_AS_STRING(ascii),
|
||||||
|
PyBytes_GET_SIZE(ascii),
|
||||||
|
NULL);
|
||||||
|
|
||||||
|
Py_DECREF(ascii);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
/* The new comparison philosophy is: we completely separate three-way
|
/* The new comparison philosophy is: we completely separate three-way
|
||||||
comparison from rich comparison. That is, PyObject_Compare() and
|
comparison from rich comparison. That is, PyObject_Compare() and
|
||||||
|
|
|
@ -766,6 +766,10 @@ do_conversion(PyObject *obj, STRINGLIB_CHAR conversion)
|
||||||
return PyObject_Repr(obj);
|
return PyObject_Repr(obj);
|
||||||
case 's':
|
case 's':
|
||||||
return STRINGLIB_TOSTR(obj);
|
return STRINGLIB_TOSTR(obj);
|
||||||
|
#if PY_VERSION_HEX >= 0x03000000
|
||||||
|
case 'a':
|
||||||
|
return STRINGLIB_TOASCII(obj);
|
||||||
|
#endif
|
||||||
default:
|
default:
|
||||||
if (conversion > 32 && conversion < 127) {
|
if (conversion > 32 && conversion < 127) {
|
||||||
/* It's the ASCII subrange; casting to char is safe
|
/* It's the ASCII subrange; casting to char is safe
|
||||||
|
|
|
@ -24,5 +24,5 @@
|
||||||
#define STRINGLIB_CMP memcmp
|
#define STRINGLIB_CMP memcmp
|
||||||
#define STRINGLIB_TOSTR PyObject_Str
|
#define STRINGLIB_TOSTR PyObject_Str
|
||||||
#define STRINGLIB_GROUPING _PyBytes_InsertThousandsGrouping
|
#define STRINGLIB_GROUPING _PyBytes_InsertThousandsGrouping
|
||||||
|
#define STRINGLIB_TOASCII PyObject_Repr
|
||||||
#endif /* !STRINGLIB_STRINGDEFS_H */
|
#endif /* !STRINGLIB_STRINGDEFS_H */
|
||||||
|
|
|
@ -25,8 +25,10 @@
|
||||||
|
|
||||||
#if PY_VERSION_HEX < 0x03000000
|
#if PY_VERSION_HEX < 0x03000000
|
||||||
#define STRINGLIB_TOSTR PyObject_Unicode
|
#define STRINGLIB_TOSTR PyObject_Unicode
|
||||||
|
#define STRINGLIB_TOASCII PyObject_Repr
|
||||||
#else
|
#else
|
||||||
#define STRINGLIB_TOSTR PyObject_Str
|
#define STRINGLIB_TOSTR PyObject_Str
|
||||||
|
#define STRINGLIB_TOASCII PyObject_ASCII
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STRINGLIB_WANT_CONTAINS_OBJ 1
|
#define STRINGLIB_WANT_CONTAINS_OBJ 1
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
#define UPPER_MASK 0x80
|
#define UPPER_MASK 0x80
|
||||||
#define XID_START_MASK 0x100
|
#define XID_START_MASK 0x100
|
||||||
#define XID_CONTINUE_MASK 0x200
|
#define XID_CONTINUE_MASK 0x200
|
||||||
|
#define NONPRINTABLE_MASK 0x400
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const Py_UNICODE upper;
|
const Py_UNICODE upper;
|
||||||
|
@ -675,6 +676,26 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
|
||||||
return _PyUnicode_ToNumeric(ch) != -1.0;
|
return _PyUnicode_ToNumeric(ch) != -1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Returns 1 for printable Unicode characters (those NOT hex-escaped when repr()ed),
|
||||||
|
0 otherwise.
|
||||||
|
All characters except those characters defined in the Unicode character
|
||||||
|
database as belonging to the following categories are considered printable.
|
||||||
|
* Cc (Other, Control)
|
||||||
|
* Cf (Other, Format)
|
||||||
|
* Cs (Other, Surrogate)
|
||||||
|
* Co (Other, Private Use)
|
||||||
|
* Cn (Other, Not Assigned)
|
||||||
|
* Zl Separator, Line ('\u2028', LINE SEPARATOR)
|
||||||
|
* Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
|
||||||
|
* Zs (Separator, Space) other than ASCII space ('\x20').
|
||||||
|
*/
|
||||||
|
int _PyUnicode_IsPrintable(Py_UNICODE ch)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
|
return (ctype->flags & NONPRINTABLE_MASK) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef WANT_WCTYPE_FUNCTIONS
|
#ifndef WANT_WCTYPE_FUNCTIONS
|
||||||
|
|
||||||
/* Returns 1 for Unicode characters having the bidirectional type
|
/* Returns 1 for Unicode characters having the bidirectional type
|
||||||
|
|
|
@ -645,11 +645,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
|
||||||
count = vargs;
|
count = vargs;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
/* step 1: count the number of %S/%R format specifications
|
/* step 1: count the number of %S/%R/%A format specifications
|
||||||
* (we call PyObject_Str()/PyObject_Repr() for these objects
|
* (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
|
||||||
* once during step 3 and put the result in an array) */
|
* these objects once during step 3 and put the result in
|
||||||
|
an array) */
|
||||||
for (f = format; *f; f++) {
|
for (f = format; *f; f++) {
|
||||||
if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
|
if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
|
||||||
++callcount;
|
++callcount;
|
||||||
}
|
}
|
||||||
/* step 2: allocate memory for the results of
|
/* step 2: allocate memory for the results of
|
||||||
|
@ -778,6 +779,19 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
|
||||||
*callresult++ = repr;
|
*callresult++ = repr;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case 'A':
|
||||||
|
{
|
||||||
|
PyObject *obj = va_arg(count, PyObject *);
|
||||||
|
PyObject *ascii;
|
||||||
|
assert(obj);
|
||||||
|
ascii = PyObject_ASCII(obj);
|
||||||
|
if (!ascii)
|
||||||
|
goto fail;
|
||||||
|
n += PyUnicode_GET_SIZE(ascii);
|
||||||
|
/* Remember the ASCII repr and switch to the next slot */
|
||||||
|
*callresult++ = ascii;
|
||||||
|
break;
|
||||||
|
}
|
||||||
case 'p':
|
case 'p':
|
||||||
(void) va_arg(count, int);
|
(void) va_arg(count, int);
|
||||||
/* maximum 64-bit pointer representation:
|
/* maximum 64-bit pointer representation:
|
||||||
|
@ -7231,6 +7245,32 @@ unicode_isidentifier(PyObject *self)
|
||||||
return PyBool_FromLong(PyUnicode_IsIdentifier(self));
|
return PyBool_FromLong(PyUnicode_IsIdentifier(self));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyDoc_STRVAR(isprintable__doc__,
|
||||||
|
"S.isprintable() -> bool\n\
|
||||||
|
\n\
|
||||||
|
Return True if all characters in S are considered\n\
|
||||||
|
printable in repr() or S is empty, False otherwise.");
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
unicode_isprintable(PyObject *self)
|
||||||
|
{
|
||||||
|
register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
||||||
|
register const Py_UNICODE *e;
|
||||||
|
|
||||||
|
/* Shortcut for single character strings */
|
||||||
|
if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
|
||||||
|
Py_RETURN_TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
e = p + PyUnicode_GET_SIZE(self);
|
||||||
|
for (; p < e; p++) {
|
||||||
|
if (!Py_UNICODE_ISPRINTABLE(*p)) {
|
||||||
|
Py_RETURN_FALSE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Py_RETURN_TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(join__doc__,
|
PyDoc_STRVAR(join__doc__,
|
||||||
"S.join(sequence) -> str\n\
|
"S.join(sequence) -> str\n\
|
||||||
\n\
|
\n\
|
||||||
|
@ -7608,61 +7648,8 @@ PyObject *unicode_repr(PyObject *unicode)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef Py_UNICODE_WIDE
|
/* Map special whitespace to '\t', '\n', '\r' */
|
||||||
/* Map 21-bit characters to '\U00xxxxxx' */
|
if (ch == '\t') {
|
||||||
else if (ch >= 0x10000) {
|
|
||||||
*p++ = '\\';
|
|
||||||
*p++ = 'U';
|
|
||||||
*p++ = hexdigits[(ch >> 28) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ch >> 24) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ch >> 20) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ch >> 16) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ch >> 12) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ch >> 8) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ch >> 4) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[ch & 0x0000000F];
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
|
|
||||||
else if (ch >= 0xD800 && ch < 0xDC00) {
|
|
||||||
Py_UNICODE ch2;
|
|
||||||
Py_UCS4 ucs;
|
|
||||||
|
|
||||||
ch2 = *s++;
|
|
||||||
size--;
|
|
||||||
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
|
|
||||||
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
|
|
||||||
*p++ = '\\';
|
|
||||||
*p++ = 'U';
|
|
||||||
*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
|
|
||||||
*p++ = hexdigits[ucs & 0x0000000F];
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* Fall through: isolated surrogates are copied as-is */
|
|
||||||
s--;
|
|
||||||
size++;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Map 16-bit characters to '\uxxxx' */
|
|
||||||
if (ch >= 256) {
|
|
||||||
*p++ = '\\';
|
|
||||||
*p++ = 'u';
|
|
||||||
*p++ = hexdigits[(ch >> 12) & 0x000F];
|
|
||||||
*p++ = hexdigits[(ch >> 8) & 0x000F];
|
|
||||||
*p++ = hexdigits[(ch >> 4) & 0x000F];
|
|
||||||
*p++ = hexdigits[ch & 0x000F];
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Map special whitespace to '\t', '\n', '\r' */
|
|
||||||
else if (ch == '\t') {
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 't';
|
*p++ = 't';
|
||||||
}
|
}
|
||||||
|
@ -7676,16 +7663,79 @@ PyObject *unicode_repr(PyObject *unicode)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Map non-printable US ASCII to '\xhh' */
|
/* Map non-printable US ASCII to '\xhh' */
|
||||||
else if (ch < ' ' || ch >= 0x7F) {
|
else if (ch < ' ' || ch == 0x7F) {
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'x';
|
*p++ = 'x';
|
||||||
*p++ = hexdigits[(ch >> 4) & 0x000F];
|
*p++ = hexdigits[(ch >> 4) & 0x000F];
|
||||||
*p++ = hexdigits[ch & 0x000F];
|
*p++ = hexdigits[ch & 0x000F];
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Copy everything else as-is */
|
/* Copy ASCII characters as-is */
|
||||||
else
|
else if (ch < 0x7F) {
|
||||||
*p++ = (char) ch;
|
*p++ = ch;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Non-ASCII characters */
|
||||||
|
else {
|
||||||
|
Py_UCS4 ucs = ch;
|
||||||
|
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
Py_UNICODE ch2 = 0;
|
||||||
|
/* Get code point from surrogate pair */
|
||||||
|
if (size > 0) {
|
||||||
|
ch2 = *s;
|
||||||
|
if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
|
||||||
|
&& ch2 <= 0xDFFF) {
|
||||||
|
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
|
||||||
|
+ 0x00010000;
|
||||||
|
s++;
|
||||||
|
size--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
/* Map Unicode whitespace and control characters
|
||||||
|
(categories Z* and C* except ASCII space)
|
||||||
|
*/
|
||||||
|
if (!Py_UNICODE_ISPRINTABLE(ucs)) {
|
||||||
|
/* Map 8-bit characters to '\xhh' */
|
||||||
|
if (ucs <= 0xff) {
|
||||||
|
*p++ = '\\';
|
||||||
|
*p++ = 'x';
|
||||||
|
*p++ = hexdigits[(ch >> 4) & 0x000F];
|
||||||
|
*p++ = hexdigits[ch & 0x000F];
|
||||||
|
}
|
||||||
|
/* Map 21-bit characters to '\U00xxxxxx' */
|
||||||
|
else if (ucs >= 0x10000) {
|
||||||
|
*p++ = '\\';
|
||||||
|
*p++ = 'U';
|
||||||
|
*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
|
||||||
|
*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
|
||||||
|
*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
|
||||||
|
*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
|
||||||
|
*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
|
||||||
|
*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
|
||||||
|
*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
|
||||||
|
*p++ = hexdigits[ucs & 0x0000000F];
|
||||||
|
}
|
||||||
|
/* Map 16-bit characters to '\uxxxx' */
|
||||||
|
else {
|
||||||
|
*p++ = '\\';
|
||||||
|
*p++ = 'u';
|
||||||
|
*p++ = hexdigits[(ucs >> 12) & 0x000F];
|
||||||
|
*p++ = hexdigits[(ucs >> 8) & 0x000F];
|
||||||
|
*p++ = hexdigits[(ucs >> 4) & 0x000F];
|
||||||
|
*p++ = hexdigits[ucs & 0x000F];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Copy characters as-is */
|
||||||
|
else {
|
||||||
|
*p++ = ch;
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
if (ucs >= 0x10000)
|
||||||
|
*p++ = ch2;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/* Add quote */
|
/* Add quote */
|
||||||
*p++ = PyUnicode_AS_UNICODE(repr)[0];
|
*p++ = PyUnicode_AS_UNICODE(repr)[0];
|
||||||
|
@ -8372,6 +8422,7 @@ static PyMethodDef unicode_methods[] = {
|
||||||
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
|
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
|
||||||
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
|
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
|
||||||
{"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
|
{"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
|
||||||
|
{"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
|
||||||
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
|
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
|
||||||
{"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
|
{"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
|
||||||
{"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
|
{"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
|
||||||
|
@ -8958,6 +9009,7 @@ PyObject *PyUnicode_Format(PyObject *format,
|
||||||
|
|
||||||
case 's':
|
case 's':
|
||||||
case 'r':
|
case 'r':
|
||||||
|
case 'a':
|
||||||
if (PyUnicode_Check(v) && c == 's') {
|
if (PyUnicode_Check(v) && c == 's') {
|
||||||
temp = v;
|
temp = v;
|
||||||
Py_INCREF(temp);
|
Py_INCREF(temp);
|
||||||
|
@ -8965,8 +9017,10 @@ PyObject *PyUnicode_Format(PyObject *format,
|
||||||
else {
|
else {
|
||||||
if (c == 's')
|
if (c == 's')
|
||||||
temp = PyObject_Str(v);
|
temp = PyObject_Str(v);
|
||||||
else
|
else if (c == 'r')
|
||||||
temp = PyObject_Repr(v);
|
temp = PyObject_Repr(v);
|
||||||
|
else
|
||||||
|
temp = PyObject_ASCII(v);
|
||||||
if (temp == NULL)
|
if (temp == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
if (PyUnicode_Check(temp))
|
if (PyUnicode_Check(temp))
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -265,6 +265,20 @@ PyDoc_STRVAR(any_doc,
|
||||||
\n\
|
\n\
|
||||||
Return True if bool(x) is True for any x in the iterable.");
|
Return True if bool(x) is True for any x in the iterable.");
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
builtin_ascii(PyObject *self, PyObject *v)
|
||||||
|
{
|
||||||
|
return PyObject_ASCII(v);
|
||||||
|
}
|
||||||
|
|
||||||
|
PyDoc_STRVAR(ascii_doc,
|
||||||
|
"ascii(object) -> string\n\
|
||||||
|
\n\
|
||||||
|
As repr(), return a string containing a printable representation of an\n\
|
||||||
|
object, but escape the non-ASCII characters in the string returned by\n\
|
||||||
|
repr() using \\x, \\u or \\U escapes. This generates a string similar\n\
|
||||||
|
to that returned by repr() in Python 2.");
|
||||||
|
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
builtin_bin(PyObject *self, PyObject *v)
|
builtin_bin(PyObject *self, PyObject *v)
|
||||||
|
@ -2188,6 +2202,7 @@ static PyMethodDef builtin_methods[] = {
|
||||||
{"abs", builtin_abs, METH_O, abs_doc},
|
{"abs", builtin_abs, METH_O, abs_doc},
|
||||||
{"all", builtin_all, METH_O, all_doc},
|
{"all", builtin_all, METH_O, all_doc},
|
||||||
{"any", builtin_any, METH_O, any_doc},
|
{"any", builtin_any, METH_O, any_doc},
|
||||||
|
{"ascii", builtin_ascii, METH_O, ascii_doc},
|
||||||
{"bin", builtin_bin, METH_O, bin_doc},
|
{"bin", builtin_bin, METH_O, bin_doc},
|
||||||
{"chr", builtin_chr, METH_VARARGS, chr_doc},
|
{"chr", builtin_chr, METH_VARARGS, chr_doc},
|
||||||
{"cmp", builtin_cmp, METH_VARARGS, cmp_doc},
|
{"cmp", builtin_cmp, METH_VARARGS, cmp_doc},
|
||||||
|
|
|
@ -793,7 +793,7 @@ initstdio(void)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (!(std = PyFile_FromFd(fd, "<stderr>", "w", -1, encoding,
|
if (!(std = PyFile_FromFd(fd, "<stderr>", "w", -1, encoding,
|
||||||
errors, "\n", 0))) {
|
"backslashreplace", "\n", 0))) {
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
} /* if (fd < 0) */
|
} /* if (fd < 0) */
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
# 2002-11-25 mvl add UNIDATA_VERSION
|
# 2002-11-25 mvl add UNIDATA_VERSION
|
||||||
# 2004-05-29 perky add east asian width information
|
# 2004-05-29 perky add east asian width information
|
||||||
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
|
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
|
||||||
|
# 2008-06-11 gb add NONPRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
|
||||||
#
|
#
|
||||||
# written by Fredrik Lundh (fredrik@pythonware.com)
|
# written by Fredrik Lundh (fredrik@pythonware.com)
|
||||||
#
|
#
|
||||||
|
@ -60,6 +61,7 @@ TITLE_MASK = 0x40
|
||||||
UPPER_MASK = 0x80
|
UPPER_MASK = 0x80
|
||||||
XID_START_MASK = 0x100
|
XID_START_MASK = 0x100
|
||||||
XID_CONTINUE_MASK = 0x200
|
XID_CONTINUE_MASK = 0x200
|
||||||
|
NONPRINTABLE_MASK = 0x400
|
||||||
|
|
||||||
def maketables(trace=0):
|
def maketables(trace=0):
|
||||||
|
|
||||||
|
@ -71,7 +73,7 @@ def maketables(trace=0):
|
||||||
EASTASIAN_WIDTH % version,
|
EASTASIAN_WIDTH % version,
|
||||||
DERIVED_CORE_PROPERTIES % version)
|
DERIVED_CORE_PROPERTIES % version)
|
||||||
|
|
||||||
print(len(filter(None, unicode.table)), "characters")
|
print(len(list(filter(None, unicode.table))), "characters")
|
||||||
|
|
||||||
for version in old_versions:
|
for version in old_versions:
|
||||||
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
|
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
|
||||||
|
@ -79,7 +81,7 @@ def maketables(trace=0):
|
||||||
COMPOSITION_EXCLUSIONS % ("-"+version),
|
COMPOSITION_EXCLUSIONS % ("-"+version),
|
||||||
EASTASIAN_WIDTH % ("-"+version),
|
EASTASIAN_WIDTH % ("-"+version),
|
||||||
DERIVED_CORE_PROPERTIES % ("-"+version))
|
DERIVED_CORE_PROPERTIES % ("-"+version))
|
||||||
print(len(filter(None, old_unicode.table)), "characters")
|
print(len(list(filter(None, old_unicode.table))), "characters")
|
||||||
merge_old_version(version, unicode, old_unicode)
|
merge_old_version(version, unicode, old_unicode)
|
||||||
|
|
||||||
makeunicodename(unicode, trace)
|
makeunicodename(unicode, trace)
|
||||||
|
@ -371,6 +373,10 @@ def makeunicodetype(unicode, trace):
|
||||||
flags |= TITLE_MASK
|
flags |= TITLE_MASK
|
||||||
if category == "Lu":
|
if category == "Lu":
|
||||||
flags |= UPPER_MASK
|
flags |= UPPER_MASK
|
||||||
|
if category[0] == "C":
|
||||||
|
flags |= NONPRINTABLE_MASK
|
||||||
|
if category[0] == "Z" and char != " ":
|
||||||
|
flags |= NONPRINTABLE_MASK
|
||||||
if "XID_Start" in properties:
|
if "XID_Start" in properties:
|
||||||
flags |= XID_START_MASK
|
flags |= XID_START_MASK
|
||||||
if "XID_Continue" in properties:
|
if "XID_Continue" in properties:
|
||||||
|
@ -465,7 +471,7 @@ def makeunicodename(unicode, trace):
|
||||||
if name and name[0] != "<":
|
if name and name[0] != "<":
|
||||||
names[char] = name + chr(0)
|
names[char] = name + chr(0)
|
||||||
|
|
||||||
print(len(n for n in names if n is not None), "distinct names")
|
print(len(list(n for n in names if n is not None)), "distinct names")
|
||||||
|
|
||||||
# collect unique words from names (note that we differ between
|
# collect unique words from names (note that we differ between
|
||||||
# words inside a sentence, and words ending a sentence. the
|
# words inside a sentence, and words ending a sentence. the
|
||||||
|
|
Loading…
Reference in New Issue