merge heads

Giampaolo Rodola' 2012-05-16 16:03:07 +02:00
commit 26fd8feb5e
18 changed files with 498 additions and 251 deletions

View File

@ -32,7 +32,6 @@ Modules/Setup.local
Modules/config.c
Modules/ld_so_aix$
Parser/pgen$
PCbuild/amd64/
^core
^python-gdb.py
^python.exe-gdb.py
@ -56,6 +55,12 @@ PC/python_nt*.h
PC/pythonnt_rc*.h
PC/*.obj
PC/*.exe
PC/*/*.user
PC/*/*.ncb
PC/*/*.suo
PC/*/Win32-temp-*
PC/*/x64-temp-*
PC/*/amd64
PCbuild/*.exe
PCbuild/*.dll
PCbuild/*.pdb
@ -69,6 +74,8 @@ PCbuild/*.suo
PCbuild/*.*sdf
PCbuild/Win32-temp-*
PCbuild/x64-temp-*
PCbuild/amd64
BuildLog.htm
__pycache__
Modules/_testembed
.coverage

View File

@ -17,10 +17,10 @@ yourself. However the bundled generator knows how to generate most email in a
standards-compliant way, should handle MIME and non-MIME email messages just
fine, and is designed so that the transformation from flat text, to a message
structure via the :class:`~email.parser.Parser` class, and back to flat text,
is idempotent (the input is identical to the output). On the other hand, using
the Generator on a :class:`~email.message.Message` constructed by program may
result in changes to the :class:`~email.message.Message` object as defaults are
filled in.
is idempotent (the input is identical to the output) [#]_. On the other hand,
using the Generator on a :class:`~email.message.Message` constructed by program
may result in changes to the :class:`~email.message.Message` object as defaults
are filled in.
:class:`bytes` output can be generated using the :class:`BytesGenerator` class.
If the message object structure contains non-ASCII bytes, this generator's
@ -223,3 +223,12 @@ representing the part.
The default value for *fmt* is ``None``, meaning ::
[Non-text (%(type)s) part of message omitted, filename %(filename)s]
.. rubric:: Footnotes
.. [#] This statement assumes that you use the appropriate setting for the
``unixfrom`` argument, and that you set maxheaderlen=0 (which will
preserve whatever the input line lengths were). It is also not strictly
true, since in many cases runs of whitespace in headers are collapsed
into single blanks. The latter is a bug that will eventually be fixed.
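A minimal sketch of the round trip described above, under the footnote's assumptions (no unixfrom line in the input, maxheaderlen=0 so header lines are left alone); the message text here is made up:

    from email.parser import Parser
    from email.generator import Generator
    from io import StringIO

    msg_text = (
        "From: author@example.com\n"
        "To: reader@example.com\n"
        "Subject: round trip\n"
        "\n"
        "Body text.\n"
    )

    msg = Parser().parsestr(msg_text)            # flat text -> Message structure
    out = StringIO()
    Generator(out, maxheaderlen=0).flatten(msg)  # Message structure -> flat text
    assert out.getvalue() == msg_text            # idempotent for this simple input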

View File

@ -339,6 +339,15 @@ and also the following constants for integer status codes:
| :const:`UPGRADE_REQUIRED` | ``426`` | HTTP Upgrade to TLS, |
| | | :rfc:`2817`, Section 6 |
+------------------------------------------+---------+-----------------------------------------------------------------------+
| :const:`PRECONDITION_REQUIRED` | ``428`` | Additional HTTP Status Codes, |
| | | :rfc:`6585`, Section 3 |
+------------------------------------------+---------+-----------------------------------------------------------------------+
| :const:`TOO_MANY_REQUESTS` | ``429`` | Additional HTTP Status Codes, |
| | | :rfc:`6585`, Section 4 |
+------------------------------------------+---------+-----------------------------------------------------------------------+
| :const:`REQUEST_HEADER_FIELDS_TOO_LARGE` | ``431`` | Additional HTTP Status Codes, |
| | | :rfc:`6585`, Section 5 |
+------------------------------------------+---------+-----------------------------------------------------------------------+
| :const:`INTERNAL_SERVER_ERROR` | ``500`` | HTTP/1.1, `RFC 2616, Section |
| | | 10.5.1 |
| | | <http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.5.1>`_ |
@ -369,6 +378,12 @@ and also the following constants for integer status codes:
| :const:`NOT_EXTENDED` | ``510`` | An HTTP Extension Framework, |
| | | :rfc:`2774`, Section 7 |
+------------------------------------------+---------+-----------------------------------------------------------------------+
| :const:`NETWORK_AUTHENTICATION_REQUIRED` | ``511`` | Additional HTTP Status Codes, |
| | | :rfc:`6585`, Section 6 |
+------------------------------------------+---------+-----------------------------------------------------------------------+
.. versionchanged:: 3.3
Added codes ``428``, ``429``, ``431`` and ``511`` from :rfc:`6585`.
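For reference, the new constants are plain integers and pair up with the names in the responses mapping documented just below; a quick check, with the values taken from the table above:

    import http.client

    assert http.client.PRECONDITION_REQUIRED == 428
    assert http.client.TOO_MANY_REQUESTS == 429
    assert http.client.REQUEST_HEADER_FIELDS_TOO_LARGE == 431
    assert http.client.NETWORK_AUTHENTICATION_REQUIRED == 511

    # and the official names are looked up through http.client.responses
    assert http.client.responses[429] == 'Too Many Requests'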
.. data:: responses

View File

@ -141,6 +141,9 @@ UNPROCESSABLE_ENTITY = 422
LOCKED = 423
FAILED_DEPENDENCY = 424
UPGRADE_REQUIRED = 426
PRECONDITION_REQUIRED = 428
TOO_MANY_REQUESTS = 429
REQUEST_HEADER_FIELDS_TOO_LARGE = 431
# server error
INTERNAL_SERVER_ERROR = 500
@ -151,6 +154,7 @@ GATEWAY_TIMEOUT = 504
HTTP_VERSION_NOT_SUPPORTED = 505
INSUFFICIENT_STORAGE = 507
NOT_EXTENDED = 510
NETWORK_AUTHENTICATION_REQUIRED = 511
# Mapping status codes to official W3C names
responses = {
@ -192,6 +196,9 @@ responses = {
415: 'Unsupported Media Type',
416: 'Requested Range Not Satisfiable',
417: 'Expectation Failed',
428: 'Precondition Required',
429: 'Too Many Requests',
431: 'Request Header Fields Too Large',
500: 'Internal Server Error',
501: 'Not Implemented',
@ -199,6 +206,7 @@ responses = {
503: 'Service Unavailable',
504: 'Gateway Timeout',
505: 'HTTP Version Not Supported',
511: 'Network Authentication Required',
}
# maximal amount of data to read at one time in _safe_read
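On the client side the new names are mostly useful for comparing against a response status. A rough sketch, assuming network access and a hypothetical rate-limiting host:

    import http.client

    conn = http.client.HTTPConnection('api.example.com')   # hypothetical host
    conn.request('GET', '/resource')
    resp = conn.getresponse()
    if resp.status == http.client.TOO_MANY_REQUESTS:
        # RFC 6585 rate limiting; Retry-After (if present) says when to retry
        retry_after = resp.getheader('Retry-After')
    conn.close()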

View File

@ -573,7 +573,7 @@ class BaseHTTPRequestHandler(socketserver.StreamRequestHandler):
# Table mapping response codes to messages; entries have the
# form {code: (shortmessage, longmessage)}.
# See RFC 2616.
# See RFC 2616 and 6585.
responses = {
100: ('Continue', 'Request received, please continue'),
101: ('Switching Protocols',
@ -628,6 +628,12 @@ class BaseHTTPRequestHandler(socketserver.StreamRequestHandler):
'Cannot satisfy request range.'),
417: ('Expectation Failed',
'Expect condition could not be satisfied.'),
428: ('Precondition Required',
'The origin server requires the request to be conditional.'),
429: ('Too Many Requests', 'The user has sent too many requests '
'in a given amount of time ("rate limiting").'),
431: ('Request Header Fields Too Large', 'The server is unwilling to '
'process the request because its header fields are too large.'),
500: ('Internal Server Error', 'Server got itself in trouble'),
501: ('Not Implemented',
@ -638,6 +644,8 @@ class BaseHTTPRequestHandler(socketserver.StreamRequestHandler):
504: ('Gateway Timeout',
'The gateway server did not receive a timely response'),
505: ('HTTP Version Not Supported', 'Cannot fulfill request.'),
511: ('Network Authentication Required',
'The client needs to authenticate to gain network access.'),
}
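These (shortmessage, longmessage) pairs are what send_error() uses for the status line and the HTML error body, so a handler can now answer with the RFC 6585 codes by number alone. A minimal sketch; the throttling handler itself is illustrative, not part of this commit:

    from http.server import BaseHTTPRequestHandler, HTTPServer

    class ThrottlingHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            # Looks up 429 in the responses table extended above.
            self.send_error(429)

    # HTTPServer(('localhost', 8000), ThrottlingHandler).serve_forever()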

View File

@ -23,6 +23,28 @@ del sys.modules['bisect']
import bisect as c_bisect
class Range(object):
"""A trivial range()-like object without any integer width limitations."""
def __init__(self, start, stop):
self.start = start
self.stop = stop
self.last_insert = None
def __len__(self):
return self.stop - self.start
def __getitem__(self, idx):
n = self.stop - self.start
if idx < 0:
idx += n
if idx >= n:
raise IndexError(idx)
return self.start + idx
def insert(self, idx, item):
self.last_insert = idx, item
class TestBisect(unittest.TestCase):
module = None
@ -125,9 +147,28 @@ class TestBisect(unittest.TestCase):
def test_large_range(self):
# Issue 13496
mod = self.module
data = range(sys.maxsize-1)
self.assertEqual(mod.bisect_left(data, sys.maxsize-3), sys.maxsize-3)
self.assertEqual(mod.bisect_right(data, sys.maxsize-3), sys.maxsize-2)
n = sys.maxsize
data = range(n-1)
self.assertEqual(mod.bisect_left(data, n-3), n-3)
self.assertEqual(mod.bisect_right(data, n-3), n-2)
self.assertEqual(mod.bisect_left(data, n-3, n-10, n), n-3)
self.assertEqual(mod.bisect_right(data, n-3, n-10, n), n-2)
def test_large_pyrange(self):
# Same as above, but without C-imposed limits on range() parameters
mod = self.module
n = sys.maxsize
data = Range(0, n-1)
self.assertEqual(mod.bisect_left(data, n-3), n-3)
self.assertEqual(mod.bisect_right(data, n-3), n-2)
self.assertEqual(mod.bisect_left(data, n-3, n-10, n), n-3)
self.assertEqual(mod.bisect_right(data, n-3, n-10, n), n-2)
x = n - 100
mod.insort_left(data, x, x - 50, x + 50)
self.assertEqual(data.last_insert, (x, x))
x = n - 200
mod.insort_right(data, x, x - 50, x + 50)
self.assertEqual(data.last_insert, (x + 1, x))
def test_random(self, n=25):
from random import randrange
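The Range helper works because bisect only needs __len__, __getitem__ and insert(); what the new tests pin down is that those indices travel through the C module as Py_ssize_t instead of being truncated to a C int (the "nO" format change in _bisectmodule.c below). A small standalone version of the same checks:

    import sys
    from bisect import bisect_left, insort_right

    data = range(sys.maxsize - 1)            # huge virtual sequence, no list needed
    i = bisect_left(data, sys.maxsize - 3)   # index fits in Py_ssize_t, not a C int
    assert i == sys.maxsize - 3

    grades = [60, 70, 80, 90]
    insort_right(grades, 85)                 # insert() receives a Py_ssize_t index
    assert grades == [60, 70, 80, 85, 90]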

View File

@ -137,8 +137,57 @@ class PkgutilPEP302Tests(unittest.TestCase):
self.assertEqual(foo.loads, 1)
del sys.modules['foo']
class ExtendPathTests(unittest.TestCase):
def create_init(self, pkgname):
dirname = tempfile.mkdtemp()
self.addCleanup(shutil.rmtree, dirname)
sys.path.insert(0, dirname)
pkgdir = os.path.join(dirname, pkgname)
os.mkdir(pkgdir)
with open(os.path.join(pkgdir, '__init__.py'), 'w') as fl:
fl.write('from pkgutil import extend_path\n__path__ = extend_path(__path__, __name__)\n')
return dirname
def create_submodule(self, dirname, pkgname, submodule_name, value):
module_name = os.path.join(dirname, pkgname, submodule_name + '.py')
with open(module_name, 'w') as fl:
print('value={}'.format(value), file=fl)
def setUp(self):
# Create 2 directories on sys.path
self.pkgname = 'foo'
self.dirname_0 = self.create_init(self.pkgname)
self.dirname_1 = self.create_init(self.pkgname)
def tearDown(self):
del sys.path[0]
del sys.path[0]
del sys.modules['foo']
del sys.modules['foo.bar']
del sys.modules['foo.baz']
def test_simple(self):
self.create_submodule(self.dirname_0, self.pkgname, 'bar', 0)
self.create_submodule(self.dirname_1, self.pkgname, 'baz', 1)
import foo.bar
import foo.baz
# Ensure we read the expected values
self.assertEqual(foo.bar.value, 0)
self.assertEqual(foo.baz.value, 1)
# Ensure the path is set up correctly
self.assertEqual(sorted(foo.__path__),
sorted([os.path.join(self.dirname_0, self.pkgname),
os.path.join(self.dirname_1, self.pkgname)]))
# XXX: test .pkg files
def test_main():
run_unittest(PkgutilTests, PkgutilPEP302Tests)
run_unittest(PkgutilTests, PkgutilPEP302Tests, ExtendPathTests)
# this is necessary if test is run repeated (like when finding leaks)
import zipimport
zipimport._zip_directory_cache.clear()
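The new tests exercise the usual pkgutil.extend_path recipe: every directory on sys.path may contribute its own foo/ subdirectory, and each __init__.py merges them into one package __path__. A sketch of that layout (the site-a/site-b directory names are made up):

    # site-a/foo/__init__.py and site-b/foo/__init__.py both contain:
    from pkgutil import extend_path
    __path__ = extend_path(__path__, __name__)

    # site-a/foo/bar.py defines value = 0, site-b/foo/baz.py defines value = 1.
    # With both site-a and site-b on sys.path:
    #     import foo.bar, foo.baz
    #     foo.__path__  ->  ['.../site-a/foo', '.../site-b/foo']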

View File

@ -540,12 +540,19 @@ class Misc:
The type keyword specifies the form in which the data is
to be returned and should be an atom name such as STRING
or FILE_NAME. Type defaults to STRING.
or FILE_NAME. Type defaults to STRING, except on X11, where the default
is to try UTF8_STRING and fall back to STRING.
This command is equivalent to:
selection_get(CLIPBOARD)
"""
if 'type' not in kw and self._windowingsystem == 'x11':
try:
kw['type'] = 'UTF8_STRING'
return self.tk.call(('clipboard', 'get') + self._options(kw))
except TclError:
del kw['type']
return self.tk.call(('clipboard', 'get') + self._options(kw))
def clipboard_clear(self, **kw):
@ -627,8 +634,16 @@ class Misc:
A keyword parameter selection specifies the name of
the selection and defaults to PRIMARY. A keyword
parameter displayof specifies a widget on the display
to use."""
to use. A keyword parameter type specifies the form of data to be
fetched, defaulting to STRING except on X11, where UTF8_STRING is tried
before STRING."""
if 'displayof' not in kw: kw['displayof'] = self._w
if 'type' not in kw and self._windowingsystem == 'x11':
try:
kw['type'] = 'UTF8_STRING'
return self.tk.call(('selection', 'get') + self._options(kw))
except TclError:
del kw['type']
return self.tk.call(('selection', 'get') + self._options(kw))
def selection_handle(self, command, **kw):
"""Specify a function COMMAND to call if the X
@ -1043,6 +1058,15 @@ class Misc:
if displayof is None:
return ('-displayof', self._w)
return ()
@property
def _windowingsystem(self):
"""Internal function."""
try:
return self._root()._windowingsystem_cached
except AttributeError:
ws = self._root()._windowingsystem_cached = \
self.tk.call('tk', 'windowingsystem')
return ws
def _options(self, cnf, kw = None):
"""Internal function."""
if kw:
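Nothing changes at the call site; the UTF8_STRING attempt and the STRING fallback happen inside clipboard_get()/selection_get() themselves. A sketch of the user-visible behaviour (needs a running display; the fallback only applies under X11):

    import tkinter

    root = tkinter.Tk()
    root.withdraw()                     # no window needed, just the Tk interpreter

    root.clipboard_clear()
    root.clipboard_append('héllo')      # non-ASCII text
    text = root.clipboard_get()         # X11: tries UTF8_STRING, falls back to STRING
    assert text == 'héllo'

    raw = root.clipboard_get(type='STRING')   # an explicit type skips the fallback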

View File

@ -919,6 +919,7 @@ Ralf Schmitt
Michael Schneider
Peter Schneider-Kamp
Arvin Schnell
Robin Schreiber
Chad J. Schroeder
Sam Schulenburg
Stefan Schwarzer
@ -1129,6 +1130,7 @@ Florent Xicluna
Hirokazu Yamamoto
Ka-Ping Yee
Jason Yeo
EungJun Yi
Bob Yodlowski
Danny Yoo
George Yoshida

View File

@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
Core and Builtins
-----------------
- Issue #14624: UTF-16 decoding is now 3x to 4x faster on various inputs.
Patch by Serhiy Storchaka.
- asdl_seq and asdl_int_seq are now Py_ssize_t sized.
- Issue #14133 (PEP 415): Implement suppression of __context__ display with an
@ -31,6 +34,21 @@ Core and Builtins
Library
-------
- Issue #14829: Fix bisect and range() indexing with large indices
(>= 2 ** 32) under 64-bit Windows.
- Issue #14732: The _csv module now uses PEP 3121 module initialization.
Patch by Robin Schreiber.
- Issue #14809: Add HTTP status codes introduced by RFC 6585 to http.server
and http.client. Patch by EungJun Yi.
- Issue #14777: tkinter may return undecoded UTF-8 bytes as a string when
accessing the Tk clipboard. Modify clipboard_get() to first request type
UTF8_STRING when no specific type is requested in an X11 windowing
environment, falling back to the current default type STRING if that fails.
Original patch by Thomas Kluyver.
- Issue #14773: Fix os.fwalk() failing on dangling symlinks.
- Issue #12541: Be lenient with quotes around Realm field of HTTP Basic

View File

@ -3,6 +3,7 @@
Converted to C by Dmitry Vasiliev (dima at hlabs.spb.ru).
*/
#define PY_SSIZE_T_CLEAN
#include "Python.h"
static Py_ssize_t
@ -195,8 +196,7 @@ insort_left(PyObject *self, PyObject *args, PyObject *kw)
return NULL;
} else {
_Py_IDENTIFIER(insert);
result = _PyObject_CallMethodId(list, &PyId_insert, "iO", index, item);
result = _PyObject_CallMethodId(list, &PyId_insert, "nO", index, item);
if (result == NULL)
return NULL;
Py_DECREF(result);

View File

@ -16,9 +16,39 @@ module instead.
#define IS_BASESTRING(o) \
PyUnicode_Check(o)
static PyObject *error_obj; /* CSV exception */
static PyObject *dialects; /* Dialect registry */
static long field_limit = 128 * 1024; /* max parsed field size */
typedef struct {
PyObject *error_obj; /* CSV exception */
PyObject *dialects; /* Dialect registry */
long field_limit; /* max parsed field size */
} _csvstate;
#define _csvstate(o) ((_csvstate *)PyModule_GetState(o))
static int
_csv_clear(PyObject *m)
{
Py_CLEAR(_csvstate(m)->error_obj);
Py_CLEAR(_csvstate(m)->dialects);
return 0;
}
static int
_csv_traverse(PyObject *m, visitproc visit, void *arg)
{
Py_VISIT(_csvstate(m)->error_obj);
Py_VISIT(_csvstate(m)->dialects);
return 0;
}
static void
_csv_free(void *m)
{
_csv_clear((PyObject *)m);
}
static struct PyModuleDef _csvmodule;
#define _csvstate_global ((_csvstate *)PyModule_GetState(PyState_FindModule(&_csvmodule)))
typedef enum {
START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
@ -103,10 +133,10 @@ get_dialect_from_registry(PyObject * name_obj)
{
PyObject *dialect_obj;
dialect_obj = PyDict_GetItem(dialects, name_obj);
dialect_obj = PyDict_GetItem(_csvstate_global->dialects, name_obj);
if (dialect_obj == NULL) {
if (!PyErr_Occurred())
PyErr_Format(error_obj, "unknown dialect");
PyErr_Format(_csvstate_global->error_obj, "unknown dialect");
}
else
Py_INCREF(dialect_obj);
@ -544,9 +574,9 @@ parse_grow_buff(ReaderObj *self)
static int
parse_add_char(ReaderObj *self, Py_UCS4 c)
{
if (self->field_len >= field_limit) {
PyErr_Format(error_obj, "field larger than field limit (%ld)",
field_limit);
if (self->field_len >= _csvstate_global->field_limit) {
PyErr_Format(_csvstate_global->error_obj, "field larger than field limit (%ld)",
_csvstate_global->field_limit);
return -1;
}
if (self->field_len == self->field_size && !parse_grow_buff(self))
@ -703,7 +733,7 @@ parse_process_char(ReaderObj *self, Py_UCS4 c)
}
else {
/* illegal */
PyErr_Format(error_obj, "'%c' expected after '%c'",
PyErr_Format(_csvstate_global->error_obj, "'%c' expected after '%c'",
dialect->delimiter,
dialect->quotechar);
return -1;
@ -716,7 +746,7 @@ parse_process_char(ReaderObj *self, Py_UCS4 c)
else if (c == '\0')
self->state = START_RECORD;
else {
PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
PyErr_Format(_csvstate_global->error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
return -1;
}
break;
@ -755,12 +785,12 @@ Reader_iternext(ReaderObj *self)
if (lineobj == NULL) {
/* End of input OR exception */
if (!PyErr_Occurred() && self->field_len != 0)
PyErr_Format(error_obj,
PyErr_Format(_csvstate_global->error_obj,
"newline inside string");
return NULL;
}
if (!PyUnicode_Check(lineobj)) {
PyErr_Format(error_obj,
PyErr_Format(_csvstate_global->error_obj,
"iterator should return strings, "
"not %.200s "
"(did you open the file in text mode?)",
@ -778,7 +808,7 @@ Reader_iternext(ReaderObj *self)
c = PyUnicode_READ(kind, data, pos);
if (c == '\0') {
Py_DECREF(lineobj);
PyErr_Format(error_obj,
PyErr_Format(_csvstate_global->error_obj,
"line contains NULL byte");
goto err;
}
@ -994,7 +1024,7 @@ join_append_data(WriterObj *self, unsigned int field_kind, void *field_data,
}
if (want_escape) {
if (!dialect->escapechar) {
PyErr_Format(error_obj,
PyErr_Format(_csvstate_global->error_obj,
"need to escape, but no escapechar set");
return -1;
}
@ -1010,7 +1040,7 @@ join_append_data(WriterObj *self, unsigned int field_kind, void *field_data,
*/
if (i == 0 && quote_empty) {
if (dialect->quoting == QUOTE_NONE) {
PyErr_Format(error_obj,
PyErr_Format(_csvstate_global->error_obj,
"single empty field record must be quoted");
return -1;
}
@ -1127,7 +1157,7 @@ csv_writerow(WriterObj *self, PyObject *seq)
PyObject *line, *result;
if (!PySequence_Check(seq))
return PyErr_Format(error_obj, "sequence expected");
return PyErr_Format(_csvstate_global->error_obj, "sequence expected");
len = PySequence_Length(seq);
if (len < 0)
@ -1353,7 +1383,7 @@ csv_writer(PyObject *module, PyObject *args, PyObject *keyword_args)
static PyObject *
csv_list_dialects(PyObject *module, PyObject *args)
{
return PyDict_Keys(dialects);
return PyDict_Keys(_csvstate_global->dialects);
}
static PyObject *
@ -1372,7 +1402,7 @@ csv_register_dialect(PyObject *module, PyObject *args, PyObject *kwargs)
dialect = _call_dialect(dialect_obj, kwargs);
if (dialect == NULL)
return NULL;
if (PyDict_SetItem(dialects, name_obj, dialect) < 0) {
if (PyDict_SetItem(_csvstate_global->dialects, name_obj, dialect) < 0) {
Py_DECREF(dialect);
return NULL;
}
@ -1384,8 +1414,8 @@ csv_register_dialect(PyObject *module, PyObject *args, PyObject *kwargs)
static PyObject *
csv_unregister_dialect(PyObject *module, PyObject *name_obj)
{
if (PyDict_DelItem(dialects, name_obj) < 0)
return PyErr_Format(error_obj, "unknown dialect");
if (PyDict_DelItem(_csvstate_global->dialects, name_obj) < 0)
return PyErr_Format(_csvstate_global->error_obj, "unknown dialect");
Py_INCREF(Py_None);
return Py_None;
}
@ -1400,7 +1430,7 @@ static PyObject *
csv_field_size_limit(PyObject *module, PyObject *args)
{
PyObject *new_limit = NULL;
long old_limit = field_limit;
long old_limit = _csvstate_global->field_limit;
if (!PyArg_UnpackTuple(args, "field_size_limit", 0, 1, &new_limit))
return NULL;
@ -1410,9 +1440,9 @@ csv_field_size_limit(PyObject *module, PyObject *args)
"limit must be an integer");
return NULL;
}
field_limit = PyLong_AsLong(new_limit);
if (field_limit == -1 && PyErr_Occurred()) {
field_limit = old_limit;
_csvstate_global->field_limit = PyLong_AsLong(new_limit);
if (_csvstate_global->field_limit == -1 && PyErr_Occurred()) {
_csvstate_global->field_limit = old_limit;
return NULL;
}
}
@ -1551,17 +1581,16 @@ static struct PyMethodDef csv_methods[] = {
{ NULL, NULL }
};
static struct PyModuleDef _csvmodule = {
PyModuleDef_HEAD_INIT,
"_csv",
csv_module_doc,
-1,
sizeof(_csvstate),
csv_methods,
NULL,
NULL,
NULL,
NULL
_csv_traverse,
_csv_clear,
_csv_free
};
PyMODINIT_FUNC
@ -1589,11 +1618,16 @@ PyInit__csv(void)
MODULE_VERSION) == -1)
return NULL;
/* Set the field limit */
_csvstate(module)->field_limit = 128 * 1024;
/* Do I still need to add this var to the Module Dict? */
/* Add _dialects dictionary */
dialects = PyDict_New();
if (dialects == NULL)
_csvstate(module)->dialects = PyDict_New();
if (_csvstate(module)->dialects == NULL)
return NULL;
if (PyModule_AddObject(module, "_dialects", dialects))
Py_INCREF(_csvstate(module)->dialects);
if (PyModule_AddObject(module, "_dialects", _csvstate(module)->dialects))
return NULL;
/* Add quote styles into dictionary */
@ -1609,9 +1643,10 @@ PyInit__csv(void)
return NULL;
/* Add the CSV exception object to the module. */
error_obj = PyErr_NewException("_csv.Error", NULL, NULL);
if (error_obj == NULL)
_csvstate(module)->error_obj = PyErr_NewException("_csv.Error", NULL, NULL);
if (_csvstate(module)->error_obj == NULL)
return NULL;
PyModule_AddObject(module, "Error", error_obj);
Py_INCREF(_csvstate(module)->error_obj);
PyModule_AddObject(module, "Error", _csvstate(module)->error_obj);
return module;
}
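From Python the refactoring is invisible: the error class, the dialect registry and the field limit behave as before, they just live in the per-module state instead of C globals. A quick tour of the state the struct now holds:

    import csv

    old = csv.field_size_limit()              # default is 128 * 1024
    csv.field_size_limit(1024 * 1024)         # stored in _csvstate.field_limit
    assert csv.field_size_limit() == 1024 * 1024
    csv.field_size_limit(old)                 # restore

    csv.register_dialect('pipes', delimiter='|')   # goes into the dialects dict
    assert 'pipes' in csv.list_dialects()
    csv.unregister_dialect('pipes')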

View File

@ -349,7 +349,8 @@ PyException_SetContext(PyObject *self, PyObject *context) {
static struct PyMemberDef BaseException_members[] = {
{"__suppress_context__", T_BOOL,
offsetof(PyBaseExceptionObject, suppress_context)}
offsetof(PyBaseExceptionObject, suppress_context)},
{NULL}
};
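The member being terminated here is __suppress_context__, the PEP 415 flag mentioned in the NEWS section above; the added {NULL} entry is the sentinel the member-table machinery expects. From Python it is the attribute that raise ... from None sets:

    try:
        try:
            1 / 0
        except ZeroDivisionError:
            raise ValueError('no chained traceback shown') from None
    except ValueError as exc:
        assert exc.__suppress_context__ is True   # set by "from None"
        assert exc.__context__ is not None        # context is kept, only its display is suppressed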

View File

@ -308,7 +308,7 @@ compute_range_item(rangeobject *r, PyObject *arg)
static PyObject *
range_item(rangeobject *r, Py_ssize_t i)
{
PyObject *res, *arg = PyLong_FromLong(i);
PyObject *res, *arg = PyLong_FromSsize_t(i);
if (!arg) {
return NULL;
}
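This matters when C code fetches range items through the sequence protocol (PySequence_GetItem), which is how the _bisect module above probes candidates; building the index with PyLong_FromSsize_t keeps values above 2**31 intact on platforms where C long is 32 bits. The bounded search added to the bisect tests is the observable effect:

    import sys
    from bisect import bisect_right

    n = sys.maxsize
    data = range(n - 1)
    # lo/hi bounds near the top of the range survive the trip through C
    assert bisect_right(data, n - 3, n - 10, n - 1) == n - 2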

View File

@ -215,7 +215,6 @@ InvalidContinuation:
goto Return;
}
#undef LONG_PTR_MASK
#undef ASCII_CHAR_MASK
@ -415,4 +414,152 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#undef MAX_SHORT_UNICHARS
}
/* The pattern for constructing UCS2-repeated masks. */
#if SIZEOF_LONG == 8
# define UCS2_REPEAT_MASK 0x0001000100010001ul
#elif SIZEOF_LONG == 4
# define UCS2_REPEAT_MASK 0x00010001ul
#else
# error C 'long' size should be either 4 or 8!
#endif
/* The mask for fast checking. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains a
non-ASCII or non-Latin1 UTF16-encoded characters. */
# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
UTF16-encoded surrogate characters. This is an efficient heuristic,
assuming that non-surrogate characters with a code point >= 0x8000 are
rare in most input.
*/
# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping. */
#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
/* Swap bytes. */
#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
(((value) & STRIPPED_MASK) << 8))
Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
int native_ordering)
{
Py_UCS4 ch;
const unsigned char *aligned_end =
(const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
const unsigned char *q = *inptr;
STRINGLIB_CHAR *p = dest + *outpos;
/* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int ihi = !!native_ordering, ilo = !native_ordering;
#else
int ihi = !native_ordering, ilo = !!native_ordering;
#endif
--e;
while (q < e) {
Py_UCS4 ch2;
/* First check for possible aligned read of a C 'long'. Unaligned
reads are more expensive, better to defer to another iteration. */
if (!((size_t) q & LONG_PTR_MASK)) {
/* Fast path for runs of in-range non-surrogate chars. */
register const unsigned char *_q = q;
while (_q < aligned_end) {
unsigned long block = * (unsigned long *) _q;
if (native_ordering) {
/* Can use buffer directly */
if (block & FAST_CHAR_MASK)
break;
}
else {
/* Need to byte-swap */
if (block & SWAB(FAST_CHAR_MASK))
break;
#if STRINGLIB_SIZEOF_CHAR == 1
block >>= 8;
#else
block = SWAB(block);
#endif
}
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
# if SIZEOF_LONG == 4
p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
p[1] = (STRINGLIB_CHAR)(block >> 16);
# elif SIZEOF_LONG == 8
p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
p[3] = (STRINGLIB_CHAR)(block >> 48);
# endif
#else
# if SIZEOF_LONG == 4
p[0] = (STRINGLIB_CHAR)(block >> 16);
p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# elif SIZEOF_LONG == 8
p[0] = (STRINGLIB_CHAR)(block >> 48);
p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# endif
#endif
_q += SIZEOF_LONG;
p += SIZEOF_LONG / 2;
}
q = _q;
if (q >= e)
break;
}
ch = (q[ihi] << 8) | q[ilo];
q += 2;
if (!Py_UNICODE_IS_SURROGATE(ch)) {
#if STRINGLIB_SIZEOF_CHAR < 2
if (ch > STRINGLIB_MAX_CHAR)
/* Out-of-range */
goto Return;
#endif
*p++ = (STRINGLIB_CHAR)ch;
continue;
}
/* UTF-16 code pair: */
if (q >= e)
goto UnexpectedEnd;
if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
goto IllegalEncoding;
ch2 = (q[ihi] << 8) | q[ilo];
q += 2;
if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
goto IllegalSurrogate;
ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
#if STRINGLIB_SIZEOF_CHAR < 4
/* Out-of-range */
goto Return;
#else
*p++ = (STRINGLIB_CHAR)ch;
#endif
}
ch = 0;
Return:
*inptr = q;
*outpos = p - dest;
return ch;
UnexpectedEnd:
ch = 1;
goto Return;
IllegalEncoding:
ch = 2;
goto Return;
IllegalSurrogate:
ch = 3;
goto Return;
}
#undef UCS2_REPEAT_MASK
#undef FAST_CHAR_MASK
#undef STRIPPED_MASK
#undef SWAB
#undef LONG_PTR_MASK
#endif /* STRINGLIB_IS_UNICODE */
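This helper is what bytes.decode('utf-16') and friends ultimately run: the word-at-a-time fast path handles runs of plain BMP characters, while surrogate pairs and errors fall out to the caller via the 0/1/2/3 return codes handled in unicodeobject.c below. Behaviour is unchanged; a few sanity checks of the cases involved:

    # BOM detection picks the byte order, then the fast path copies the bulk.
    assert b'\xff\xfeh\x00i\x00'.decode('utf-16') == 'hi'     # little-endian BOM
    assert b'\x00h\x00i'.decode('utf-16-be') == 'hi'

    # Surrogate pairs are decoded one pair at a time outside the fast path.
    assert '\U0001f600'.encode('utf-16-le').decode('utf-16-le') == '\U0001f600'

    # Truncated input (odd byte count) still raises.
    try:
        b'h\x00i'.decode('utf-16-le')
    except UnicodeDecodeError:
        pass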

View File

@ -5195,25 +5195,6 @@ PyUnicode_DecodeUTF16(const char *s,
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
}
/* Two masks for fast checking of whether a C 'long' may contain
UTF16-encoded surrogate characters. This is an efficient heuristic,
assuming that non-surrogate characters with a code point >= 0x8000 are
rare in most input.
FAST_CHAR_MASK is used when the input is in native byte ordering,
SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK 0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
# define STRIPPED_MASK 0x00FF00FF00FF00FFL
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK 0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
# define STRIPPED_MASK 0x00FF00FFL
#else
# error C 'long' size should be either 4 or 8!
#endif
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t size,
@ -5226,30 +5207,15 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t endinpos;
Py_ssize_t outpos;
PyObject *unicode;
const unsigned char *q, *e, *aligned_end;
const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */
int native_ordering = 0;
int native_ordering;
const char *errmsg = "";
/* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int ihi = 1, ilo = 0;
#else
int ihi = 0, ilo = 1;
#endif
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
/* Note: size will always be longer than the resulting Unicode
character count */
unicode = PyUnicode_New(size, 127);
if (!unicode)
return NULL;
if (size == 0)
return unicode;
outpos = 0;
q = (unsigned char *)s;
e = q + size - 1;
e = q + size;
if (byteorder)
bo = *byteorder;
@ -5258,10 +5224,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
if (size >= 2) {
const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bo == 0 && size >= 2) {
const Py_UCS4 bom = (q[1] << 8) | q[0];
if (bom == 0xFEFF) {
q += 2;
bo = -1;
@ -5270,143 +5234,88 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
q += 2;
bo = 1;
}
#else
if (bom == 0xFEFF) {
q += 2;
bo = 1;
}
else if (bom == 0xFFFE) {
q += 2;
bo = -1;
}
#endif
}
if (byteorder)
*byteorder = bo;
}
if (bo == -1) {
/* force LE */
ihi = 1;
ilo = 0;
}
else if (bo == 1) {
/* force BE */
ihi = 0;
ilo = 1;
if (q == e) {
if (consumed)
*consumed = size;
Py_INCREF(unicode_empty);
return unicode_empty;
}
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
native_ordering = ilo < ihi;
native_ordering = bo <= 0;
#else
native_ordering = ilo > ihi;
native_ordering = bo >= 0;
#endif
aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
while (q < e) {
Py_UCS4 ch;
/* First check for possible aligned read of a C 'long'. Unaligned
reads are more expensive, better to defer to another iteration. */
if (!((size_t) q & LONG_PTR_MASK)) {
/* Fast path for runs of non-surrogate chars. */
register const unsigned char *_q = q;
/* Note: size will always be longer than the resulting Unicode
character count */
unicode = PyUnicode_New((e - q + 1) / 2, 127);
if (!unicode)
return NULL;
outpos = 0;
while (1) {
Py_UCS4 ch = 0;
if (e - q >= 2) {
int kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
while (_q < aligned_end) {
unsigned long block = * (unsigned long *) _q;
Py_UCS4 maxch;
if (native_ordering) {
/* Can use buffer directly */
if (block & FAST_CHAR_MASK)
break;
if (kind == PyUnicode_1BYTE_KIND) {
if (PyUnicode_IS_ASCII(unicode))
ch = asciilib_utf16_decode(&q, e,
PyUnicode_1BYTE_DATA(unicode), &outpos,
native_ordering);
else
ch = ucs1lib_utf16_decode(&q, e,
PyUnicode_1BYTE_DATA(unicode), &outpos,
native_ordering);
} else if (kind == PyUnicode_2BYTE_KIND) {
ch = ucs2lib_utf16_decode(&q, e,
PyUnicode_2BYTE_DATA(unicode), &outpos,
native_ordering);
} else {
assert(kind == PyUnicode_4BYTE_KIND);
ch = ucs4lib_utf16_decode(&q, e,
PyUnicode_4BYTE_DATA(unicode), &outpos,
native_ordering);
}
else {
/* Need to byte-swap */
if (block & SWAPPED_FAST_CHAR_MASK)
break;
block = ((block >> 8) & STRIPPED_MASK) |
((block & STRIPPED_MASK) << 8);
}
maxch = (Py_UCS2)(block & 0xFFFF);
#if SIZEOF_LONG == 8
ch = (Py_UCS2)((block >> 16) & 0xFFFF);
maxch = MAX_MAXCHAR(maxch, ch);
ch = (Py_UCS2)((block >> 32) & 0xFFFF);
maxch = MAX_MAXCHAR(maxch, ch);
ch = (Py_UCS2)(block >> 48);
maxch = MAX_MAXCHAR(maxch, ch);
#else
ch = (Py_UCS2)(block >> 16);
maxch = MAX_MAXCHAR(maxch, ch);
#endif
if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
if (unicode_widen(&unicode, outpos, maxch) < 0)
goto onError;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
}
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
#if SIZEOF_LONG == 8
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
#else
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
#endif
#else
#if SIZEOF_LONG == 8
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
#else
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
#endif
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
#endif
_q += SIZEOF_LONG;
}
q = _q;
if (q >= e)
break;
}
ch = (q[ihi] << 8) | q[ilo];
q += 2;
if (!Py_UNICODE_IS_SURROGATE(ch)) {
switch (ch)
{
case 0:
/* remaining byte at the end? (size should be even) */
if (q == e || consumed)
goto End;
errmsg = "truncated data";
startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) - starts;
break;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
case 1:
errmsg = "unexpected end of data";
startinpos = ((const char *)q) - 2 - starts;
endinpos = ((const char *)e) - starts;
break;
case 2:
errmsg = "illegal encoding";
startinpos = ((const char *)q) - 2 - starts;
endinpos = startinpos + 2;
break;
case 3:
errmsg = "illegal UTF-16 surrogate";
startinpos = ((const char *)q) - 4 - starts;
endinpos = startinpos + 2;
break;
default:
if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError;
continue;
}
/* UTF-16 code pair: */
if (q > e) {
errmsg = "unexpected end of data";
startinpos = (((const char *)q) - 2) - starts;
endinpos = ((const char *)e) + 1 - starts;
goto utf16Error;
}
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
q += 2;
if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
if (unicode_putchar(&unicode, &outpos,
Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
goto onError;
continue;
}
else {
errmsg = "illegal UTF-16 surrogate";
startinpos = (((const char *)q)-4)-starts;
endinpos = startinpos+2;
goto utf16Error;
}
}
errmsg = "illegal encoding";
startinpos = (((const char *)q)-2)-starts;
endinpos = startinpos+2;
/* Fall through to report the error */
utf16Error:
if (unicode_decode_call_errorhandler(
errors,
&errorHandler,
@ -5421,33 +5330,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
&outpos))
goto onError;
}
/* remaining byte at the end? (size should be even) */
if (e == q) {
if (!consumed) {
errmsg = "truncated data";
startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) + 1 - starts;
if (unicode_decode_call_errorhandler(
errors,
&errorHandler,
"utf16", errmsg,
&starts,
(const char **)&e,
&startinpos,
&endinpos,
&exc,
(const char **)&q,
&unicode,
&outpos))
goto onError;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
}
if (byteorder)
*byteorder = bo;
End:
if (consumed)
*consumed = (const char *)q-starts;
@ -5466,9 +5350,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
return NULL;
}
#undef FAST_CHAR_MASK
#undef SWAPPED_FAST_CHAR_MASK
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
const char *errors,
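The switch on the helper's return code is where the error handler is invoked; the start/end offsets computed there are what end up on UnicodeDecodeError. A hedged sketch of that side (the exact reason strings are implementation details):

    # A lone high surrogate at the end of input is an error case.
    try:
        b'\x00\xd8'.decode('utf-16-le')
    except UnicodeDecodeError as exc:
        print(exc.start, exc.end, exc.reason)   # byte offsets into the input

    # Error handlers still apply: 'replace' substitutes U+FFFD and continues.
    print(b'\x00\xd8A\x00'.decode('utf-16-le', 'replace'))   # typically '\ufffdA'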

View File

@ -25,6 +25,8 @@ def main(input_path, output_path):
with open(output_path, 'w', encoding='utf-8') as output_file:
output_file.write('\n'.join(lines))
output_file.write('/* Mercurial binary marker: \x00 */')
# Avoid a compiler warning for lack of EOL
output_file.write('\n')
if __name__ == '__main__':