Merged revisions 60408-60440 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk ........ r60425 | raymond.hettinger | 2008-01-29 20:52:09 +0100 (Tue, 29 Jan 2008) | 1 line CallMethod is faster with a NULL third-argument than with an empty format string. ........ r60431 | raymond.hettinger | 2008-01-30 01:01:07 +0100 (Wed, 30 Jan 2008) | 1 line Add isdisjoint() to the Set/MutableSet ABCs. ........ r60432 | raymond.hettinger | 2008-01-30 01:08:31 +0100 (Wed, 30 Jan 2008) | 1 line MutableSets support a remove() method. ........ r60433 | raymond.hettinger | 2008-01-30 01:51:58 +0100 (Wed, 30 Jan 2008) | 1 line Demonstrate new except/as syntax. ........ r60440 | christian.heimes | 2008-01-30 12:32:37 +0100 (Wed, 30 Jan 2008) | 1 line Patch #1970 by Antoine Pitrou: Speedup unicode whitespace and linebreak detection. The speedup is about 25% for split() (571 / 457 usec) and 35% (175 / 127 usec) for splitlines() ........
This commit is contained in:
parent
510711d598
commit
190d79e5c6
|
@ -131,8 +131,8 @@ the exception (allowing a caller to handle the exception as well)::
|
|||
f = open('myfile.txt')
|
||||
s = f.readline()
|
||||
i = int(s.strip())
|
||||
except IOError as e:
|
||||
print("I/O error(%s): %s" % (e.errno, e.strerror))
|
||||
except IOError as (errno, strerror):
|
||||
print "I/O error(%s): %s" % (errno, strerror)
|
||||
except ValueError:
|
||||
print("Could not convert data to an integer.")
|
||||
except:
|
||||
|
|
|
@ -358,7 +358,14 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
|
||||
#else
|
||||
|
||||
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
|
||||
/* Since splitting on whitespace is an important use case, and whitespace
|
||||
in most situations is solely ASCII whitespace, we optimize for the common
|
||||
case by using a quick look-up table with an inlined check.
|
||||
*/
|
||||
extern const unsigned char _Py_ascii_whitespace[];
|
||||
|
||||
#define Py_UNICODE_ISSPACE(ch) \
|
||||
((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
|
||||
|
||||
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
|
||||
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
|
||||
|
|
|
@ -211,6 +211,12 @@ class Set(metaclass=ABCMeta):
|
|||
return NotImplemented
|
||||
return self._from_iterable(value for value in other if value in self)
|
||||
|
||||
def isdisjoint(self, other):
    """Return True if no element of ``other`` is contained in this set.

    ``other`` may be any iterable; iteration stops at the first element
    that is found in ``self``.
    """
    return not any(value in self for value in other)
|
||||
|
||||
def __or__(self, other):
|
||||
if not isinstance(other, Iterable):
|
||||
return NotImplemented
|
||||
|
@ -278,6 +284,12 @@ class MutableSet(Set):
|
|||
"""Return True if it was deleted, False if not there."""
|
||||
raise NotImplementedError
|
||||
|
||||
def remove(self, value):
    """Delete ``value`` from the set; raise KeyError(value) if it is absent."""
    if value in self:
        # Membership confirmed, so delegate the actual deletion to discard().
        self.discard(value)
        return
    raise KeyError(value)
|
||||
|
||||
def pop(self):
|
||||
"""Return the popped value. Raise KeyError if empty."""
|
||||
it = iter(self)
|
||||
|
|
|
@ -125,6 +125,64 @@ static PyUnicodeObject *unicode_latin1[256];
|
|||
*/
|
||||
static const char unicode_default_encoding[] = "utf-8";
|
||||
|
||||
/* Fast detection of the most frequent whitespace characters.

   Lookup table covering the ASCII range: entry N is 1 if code point N is
   treated as whitespace and 0 otherwise.  The table has exactly 128
   entries (16 rows of 8), so it must only be indexed with values below
   128; Py_UNICODE_ISSPACE() performs that range check before using it. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x0009: HORIZONTAL TABULATION */
    /* 0x000A: LINE FEED */
    /* 0x000B: VERTICAL TABULATION */
    /* 0x000C: FORM FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    /* 0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
|
||||
|
||||
/* Fast detection of the most frequent line break characters.

   Lookup table covering the ASCII range: entry N is 1 if code point N is
   treated as a line break and 0 otherwise.  The table has exactly 128
   entries (16 rows of 8), so it must only be indexed with values below
   128; BLOOM_LINEBREAK() performs that range check before using it.
   The table is never written to, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x000A: LINE FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
|
||||
|
||||
|
||||
Py_UNICODE
|
||||
PyUnicode_GetMax(void)
|
||||
{
|
||||
|
@ -151,8 +209,9 @@ static BLOOM_MASK bloom_linebreak;
|
|||
|
||||
#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
|
||||
|
||||
#define BLOOM_LINEBREAK(ch)\
|
||||
(BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
|
||||
#define BLOOM_LINEBREAK(ch) \
|
||||
((ch) < 128U ? ascii_linebreak[(ch)] : \
|
||||
(BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
|
||||
|
||||
Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
|
||||
{
|
||||
|
@ -5602,25 +5661,26 @@ PyObject *split_whitespace(PyUnicodeObject *self,
|
|||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
PyObject *str;
|
||||
register const Py_UNICODE *buf = self->str;
|
||||
|
||||
for (i = j = 0; i < len; ) {
|
||||
/* find a token */
|
||||
while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
|
||||
while (i < len && Py_UNICODE_ISSPACE(buf[i]))
|
||||
i++;
|
||||
j = i;
|
||||
while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
|
||||
while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
|
||||
i++;
|
||||
if (j < i) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(self->str, j, i);
|
||||
while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
|
||||
SPLIT_APPEND(buf, j, i);
|
||||
while (i < len && Py_UNICODE_ISSPACE(buf[i]))
|
||||
i++;
|
||||
j = i;
|
||||
}
|
||||
}
|
||||
if (j < len) {
|
||||
SPLIT_APPEND(self->str, j, len);
|
||||
SPLIT_APPEND(buf, j, len);
|
||||
}
|
||||
return list;
|
||||
|
||||
|
@ -5693,18 +5753,19 @@ PyObject *split_char(PyUnicodeObject *self,
|
|||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
PyObject *str;
|
||||
register const Py_UNICODE *buf = self->str;
|
||||
|
||||
for (i = j = 0; i < len; ) {
|
||||
if (self->str[i] == ch) {
|
||||
if (buf[i] == ch) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(self->str, j, i);
|
||||
SPLIT_APPEND(buf, j, i);
|
||||
i = j = i + 1;
|
||||
} else
|
||||
i++;
|
||||
}
|
||||
if (j <= len) {
|
||||
SPLIT_APPEND(self->str, j, len);
|
||||
SPLIT_APPEND(buf, j, len);
|
||||
}
|
||||
return list;
|
||||
|
||||
|
@ -5753,25 +5814,26 @@ PyObject *rsplit_whitespace(PyUnicodeObject *self,
|
|||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
PyObject *str;
|
||||
register const Py_UNICODE *buf = self->str;
|
||||
|
||||
for (i = j = len - 1; i >= 0; ) {
|
||||
/* find a token */
|
||||
while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
|
||||
while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
|
||||
i--;
|
||||
j = i;
|
||||
while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
|
||||
while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
|
||||
i--;
|
||||
if (j > i) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(self->str, i + 1, j + 1);
|
||||
while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
|
||||
SPLIT_APPEND(buf, i + 1, j + 1);
|
||||
while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
|
||||
i--;
|
||||
j = i;
|
||||
}
|
||||
}
|
||||
if (j >= 0) {
|
||||
SPLIT_APPEND(self->str, 0, j + 1);
|
||||
SPLIT_APPEND(buf, 0, j + 1);
|
||||
}
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
|
@ -5792,18 +5854,19 @@ PyObject *rsplit_char(PyUnicodeObject *self,
|
|||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
PyObject *str;
|
||||
register const Py_UNICODE *buf = self->str;
|
||||
|
||||
for (i = j = len - 1; i >= 0; ) {
|
||||
if (self->str[i] == ch) {
|
||||
if (buf[i] == ch) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(self->str, i + 1, j + 1);
|
||||
SPLIT_APPEND(buf, i + 1, j + 1);
|
||||
j = i = i - 1;
|
||||
} else
|
||||
i--;
|
||||
}
|
||||
if (j >= -1) {
|
||||
SPLIT_APPEND(self->str, 0, j + 1);
|
||||
SPLIT_APPEND(buf, 0, j + 1);
|
||||
}
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
|
|
Loading…
Reference in New Issue