From 190d79e5c648174b550de2bef75d1b4addf0d625 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Wed, 30 Jan 2008 11:58:22 +0000 Subject: [PATCH] Merged revisions 60408-60440 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r60425 | raymond.hettinger | 2008-01-29 20:52:09 +0100 (Tue, 29 Jan 2008) | 1 line CallMethod is faster with a NULL third-argument than with an empty format string. ........ r60431 | raymond.hettinger | 2008-01-30 01:01:07 +0100 (Wed, 30 Jan 2008) | 1 line Add isdisjoint() to the Set/MutableSet ABCs. ........ r60432 | raymond.hettinger | 2008-01-30 01:08:31 +0100 (Wed, 30 Jan 2008) | 1 line MutableSets support a remove() method. ........ r60433 | raymond.hettinger | 2008-01-30 01:51:58 +0100 (Wed, 30 Jan 2008) | 1 line Demonstrate new except/as syntax. ........ r60440 | christian.heimes | 2008-01-30 12:32:37 +0100 (Wed, 30 Jan 2008) | 1 line Patch #1970 by Antoine Pitrou: Speedup unicode whitespace and linebreak detection. The speedup is about 25% for split() (571 / 457 usec) and 35% (175 / 127 usec )for splitlines() ........ --- Doc/tutorial/errors.rst | 4 +- Include/unicodeobject.h | 9 +++- Lib/_abcoll.py | 12 +++++ Objects/unicodeobject.c | 99 +++++++++++++++++++++++++++++++++-------- 4 files changed, 103 insertions(+), 21 deletions(-) diff --git a/Doc/tutorial/errors.rst b/Doc/tutorial/errors.rst index fa2d42220fb..5c652a98714 100644 --- a/Doc/tutorial/errors.rst +++ b/Doc/tutorial/errors.rst @@ -131,8 +131,8 @@ the exception (allowing a caller to handle the exception as well):: f = open('myfile.txt') s = f.readline() i = int(s.strip()) - except IOError as e: - print("I/O error(%s): %s" % (e.errno, e.strerror)) + except IOError as (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) except ValueError: print("Could not convert data to an integer.") except: diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index d7808a4c1d9..39542893d7a 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -358,7 +358,14 @@ typedef PY_UNICODE_TYPE Py_UNICODE; #else -#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) +/* Since splitting on whitespace is an important use case, and whitespace + in most situations is solely ASCII whitespace, we optimize for the common + case by using a quick look-up table with an inlined check. + */ +extern const unsigned char _Py_ascii_whitespace[]; + +#define Py_UNICODE_ISSPACE(ch) \ + ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) diff --git a/Lib/_abcoll.py b/Lib/_abcoll.py index 36d39cf728e..4ce3df4696e 100644 --- a/Lib/_abcoll.py +++ b/Lib/_abcoll.py @@ -211,6 +211,12 @@ class Set(metaclass=ABCMeta): return NotImplemented return self._from_iterable(value for value in other if value in self) + def isdisjoint(self, other): + for value in other: + if value in self: + return False + return True + def __or__(self, other): if not isinstance(other, Iterable): return NotImplemented @@ -278,6 +284,12 @@ class MutableSet(Set): """Return True if it was deleted, False if not there.""" raise NotImplementedError + def remove(self, value): + """Remove an element. If not a member, raise a KeyError.""" + if value not in self: + raise KeyError(value) + self.discard(value) + def pop(self): """Return the popped value. Raise KeyError if empty.""" it = iter(self) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 694f3b0a261..1b35d4e273b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -125,6 +125,64 @@ static PyUnicodeObject *unicode_latin1[256]; */ static const char unicode_default_encoding[] = "utf-8"; +/* Fast detection of the most frequent whitespace characters */ +const unsigned char _Py_ascii_whitespace[] = { + 0, 0, 0, 0, 0, 0, 0, 0, +// case 0x0009: /* HORIZONTAL TABULATION */ +// case 0x000A: /* LINE FEED */ +// case 0x000B: /* VERTICAL TABULATION */ +// case 0x000C: /* FORM FEED */ +// case 0x000D: /* CARRIAGE RETURN */ + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +// case 0x001C: /* FILE SEPARATOR */ +// case 0x001D: /* GROUP SEPARATOR */ +// case 0x001E: /* RECORD SEPARATOR */ +// case 0x001F: /* UNIT SEPARATOR */ + 0, 0, 0, 0, 1, 1, 1, 1, +// case 0x0020: /* SPACE */ + 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Same for linebreaks */ +static unsigned char ascii_linebreak[] = { + 0, 0, 0, 0, 0, 0, 0, 0, +// 0x000A, /* LINE FEED */ +// 0x000D, /* CARRIAGE RETURN */ + 0, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +// 0x001C, /* FILE SEPARATOR */ +// 0x001D, /* GROUP SEPARATOR */ +// 0x001E, /* RECORD SEPARATOR */ + 0, 0, 0, 0, 1, 1, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 +}; + + Py_UNICODE PyUnicode_GetMax(void) { @@ -151,8 +209,9 @@ static BLOOM_MASK bloom_linebreak; #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) -#define BLOOM_LINEBREAK(ch)\ - (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch))) +#define BLOOM_LINEBREAK(ch) \ + ((ch) < 128U ? ascii_linebreak[(ch)] : \ + (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) { @@ -5602,25 +5661,26 @@ PyObject *split_whitespace(PyUnicodeObject *self, register Py_ssize_t j; Py_ssize_t len = self->length; PyObject *str; + register const Py_UNICODE *buf = self->str; for (i = j = 0; i < len; ) { /* find a token */ - while (i < len && Py_UNICODE_ISSPACE(self->str[i])) + while (i < len && Py_UNICODE_ISSPACE(buf[i])) i++; j = i; - while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) + while (i < len && !Py_UNICODE_ISSPACE(buf[i])) i++; if (j < i) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, j, i); - while (i < len && Py_UNICODE_ISSPACE(self->str[i])) + SPLIT_APPEND(buf, j, i); + while (i < len && Py_UNICODE_ISSPACE(buf[i])) i++; j = i; } } if (j < len) { - SPLIT_APPEND(self->str, j, len); + SPLIT_APPEND(buf, j, len); } return list; @@ -5693,18 +5753,19 @@ PyObject *split_char(PyUnicodeObject *self, register Py_ssize_t j; Py_ssize_t len = self->length; PyObject *str; + register const Py_UNICODE *buf = self->str; for (i = j = 0; i < len; ) { - if (self->str[i] == ch) { + if (buf[i] == ch) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, j, i); + SPLIT_APPEND(buf, j, i); i = j = i + 1; } else i++; } if (j <= len) { - SPLIT_APPEND(self->str, j, len); + SPLIT_APPEND(buf, j, len); } return list; @@ -5753,25 +5814,26 @@ PyObject *rsplit_whitespace(PyUnicodeObject *self, register Py_ssize_t j; Py_ssize_t len = self->length; PyObject *str; + register const Py_UNICODE *buf = self->str; for (i = j = len - 1; i >= 0; ) { /* find a token */ - while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) + while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) i--; j = i; - while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i])) + while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) i--; if (j > i) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, i + 1, j + 1); - while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) + SPLIT_APPEND(buf, i + 1, j + 1); + while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) i--; j = i; } } if (j >= 0) { - SPLIT_APPEND(self->str, 0, j + 1); + SPLIT_APPEND(buf, 0, j + 1); } if (PyList_Reverse(list) < 0) goto onError; @@ -5792,18 +5854,19 @@ PyObject *rsplit_char(PyUnicodeObject *self, register Py_ssize_t j; Py_ssize_t len = self->length; PyObject *str; + register const Py_UNICODE *buf = self->str; for (i = j = len - 1; i >= 0; ) { - if (self->str[i] == ch) { + if (buf[i] == ch) { if (maxcount-- <= 0) break; - SPLIT_APPEND(self->str, i + 1, j + 1); + SPLIT_APPEND(buf, i + 1, j + 1); j = i = i - 1; } else i--; } if (j >= -1) { - SPLIT_APPEND(self->str, 0, j + 1); + SPLIT_APPEND(buf, 0, j + 1); } if (PyList_Reverse(list) < 0) goto onError;