Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.
This commit is contained in:
parent
53d93adc46
commit
78e2f06cc6
|
@ -16,7 +16,10 @@ from sre_constants import *
|
|||
|
||||
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
|
||||
|
||||
MAXCODE = 65535
|
||||
if _sre.CODESIZE == 2:
|
||||
MAXCODE = 65535
|
||||
else:
|
||||
MAXCODE = 0xFFFFFFFFL
|
||||
|
||||
def _compile(code, pattern, flags):
|
||||
# internal: compile a (sub)pattern
|
||||
|
@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
|
|||
# XXX: could append to charmap tail
|
||||
return charset # cannot compress
|
||||
except IndexError:
|
||||
if sys.maxunicode != 65535:
|
||||
# XXX: big charsets don't work in UCS-4 builds
|
||||
return charset
|
||||
# character set contains unicode characters
|
||||
return _optimize_unicode(charset, fixup)
|
||||
# compress character map
|
||||
|
@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
|
|||
|
||||
def _mk_bitmap(bits):
|
||||
data = []
|
||||
m = 1; v = 0
|
||||
if _sre.CODESIZE == 2:
|
||||
start = (1, 0)
|
||||
else:
|
||||
start = (1L, 0L)
|
||||
m, v = start
|
||||
for c in bits:
|
||||
if c:
|
||||
v = v + m
|
||||
m = m << 1
|
||||
if m > MAXCODE:
|
||||
data.append(v)
|
||||
m = 1; v = 0
|
||||
m, v = start
|
||||
return data
|
||||
|
||||
# To represent a big charset, first a bitmap of all characters in the
|
||||
|
@ -258,21 +262,38 @@ def _mk_bitmap(bits):
|
|||
# less significant byte is a bit index in the chunk (just like the
|
||||
# CHARSET matching).
|
||||
|
||||
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
|
||||
# of the basic multilingual plane; an efficient representation
|
||||
# for the full Unicode code space (all planes beyond the BMP) has not yet been developed. This means,
|
||||
# in particular, that negated charsets cannot be represented as
|
||||
# bigcharsets.
|
||||
|
||||
def _optimize_unicode(charset, fixup):
|
||||
try:
|
||||
import array
|
||||
except ImportError:
|
||||
return charset
|
||||
charmap = [0]*65536
|
||||
negate = 0
|
||||
for op, av in charset:
|
||||
if op is NEGATE:
|
||||
negate = 1
|
||||
elif op is LITERAL:
|
||||
charmap[fixup(av)] = 1
|
||||
elif op is RANGE:
|
||||
for i in range(fixup(av[0]), fixup(av[1])+1):
|
||||
charmap[i] = 1
|
||||
elif op is CATEGORY:
|
||||
# XXX: could expand category
|
||||
return charset # cannot compress
|
||||
try:
|
||||
for op, av in charset:
|
||||
if op is NEGATE:
|
||||
negate = 1
|
||||
elif op is LITERAL:
|
||||
charmap[fixup(av)] = 1
|
||||
elif op is RANGE:
|
||||
for i in range(fixup(av[0]), fixup(av[1])+1):
|
||||
charmap[i] = 1
|
||||
elif op is CATEGORY:
|
||||
# XXX: could expand category
|
||||
return charset # cannot compress
|
||||
except IndexError:
|
||||
# non-BMP characters
|
||||
return charset
|
||||
if negate:
|
||||
if sys.maxunicode != 65535:
|
||||
# XXX: negation does not work with big charsets
|
||||
return charset
|
||||
for i in range(65536):
|
||||
charmap[i] = not charmap[i]
|
||||
comps = {}
|
||||
|
@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup):
|
|||
block = block + 1
|
||||
data = data + _mk_bitmap(chunk)
|
||||
header = [block]
|
||||
assert MAXCODE == 65535
|
||||
for i in range(128):
|
||||
if sys.byteorder == 'big':
|
||||
header.append(256*mapping[2*i]+mapping[2*i+1])
|
||||
else:
|
||||
header.append(mapping[2*i]+256*mapping[2*i+1])
|
||||
if MAXCODE == 65535:
|
||||
code = 'H'
|
||||
else:
|
||||
code = 'L'
|
||||
# Convert block indices to byte array of 256 bytes
|
||||
mapping = array.array('b', mapping).tostring()
|
||||
# Convert byte array to word array
|
||||
header = header + array.array(code, mapping).tolist()
|
||||
data[0:0] = header
|
||||
return [(BIGCHARSET, data)]
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
# update when constants are added or removed
|
||||
|
||||
MAGIC = 20010701
|
||||
MAGIC = 20030419
|
||||
|
||||
# max code word in this release
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
* 2001-10-24 fl added finditer primitive (for 2.2 only)
|
||||
* 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
|
||||
* 2002-11-09 fl fixed empty sub/subn return type
|
||||
* 2003-04-18 mvl fully support 4-byte codes
|
||||
*
|
||||
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
||||
*
|
||||
|
@ -510,10 +511,18 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
|
|||
break;
|
||||
|
||||
case SRE_OP_CHARSET:
|
||||
/* <CHARSET> <bitmap> (16 bits per code word) */
|
||||
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
|
||||
return ok;
|
||||
set += 16;
|
||||
if (sizeof(SRE_CODE) == 2) {
|
||||
/* <CHARSET> <bitmap> (16 bits per code word) */
|
||||
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
|
||||
return ok;
|
||||
set += 16;
|
||||
}
|
||||
else {
|
||||
/* <CHARSET> <bitmap> (32 bits per code word) */
|
||||
if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
|
||||
return ok;
|
||||
set += 8;
|
||||
}
|
||||
break;
|
||||
|
||||
case SRE_OP_BIGCHARSET:
|
||||
|
@ -521,11 +530,25 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
|
|||
{
|
||||
int count, block;
|
||||
count = *(set++);
|
||||
block = ((unsigned char*)set)[ch >> 8];
|
||||
set += 128;
|
||||
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
|
||||
return ok;
|
||||
set += count*16;
|
||||
|
||||
if (sizeof(SRE_CODE) == 2) {
|
||||
block = ((unsigned char*)set)[ch >> 8];
|
||||
set += 128;
|
||||
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
|
||||
return ok;
|
||||
set += count*16;
|
||||
}
|
||||
else {
|
||||
if (ch < 65536)
|
||||
block = ((unsigned char*)set)[ch >> 8];
|
||||
else
|
||||
block = -1;
|
||||
set += 64;
|
||||
if (block >=0 &&
|
||||
(set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
|
||||
return ok;
|
||||
set += count*8;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1371,7 +1394,10 @@ _compile(PyObject* self_, PyObject* args)
|
|||
|
||||
for (i = 0; i < n; i++) {
|
||||
PyObject *o = PyList_GET_ITEM(code, i);
|
||||
self->code[i] = (SRE_CODE) PyInt_AsLong(o);
|
||||
if (PyInt_Check(o))
|
||||
self->code[i] = (SRE_CODE) PyInt_AsLong(o);
|
||||
else
|
||||
self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
|
||||
}
|
||||
|
||||
if (PyErr_Occurred()) {
|
||||
|
@ -3045,6 +3071,12 @@ PyMODINIT_FUNC init_sre(void)
|
|||
Py_DECREF(x);
|
||||
}
|
||||
|
||||
x = PyInt_FromLong(sizeof(SRE_CODE));
|
||||
if (x) {
|
||||
PyDict_SetItemString(d, "CODESIZE", x);
|
||||
Py_DECREF(x);
|
||||
}
|
||||
|
||||
x = PyString_FromString(copyright);
|
||||
if (x) {
|
||||
PyDict_SetItemString(d, "copyright", x);
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
* See the _sre.c file for information on usage and redistribution.
|
||||
*/
|
||||
|
||||
#define SRE_MAGIC 20010701
|
||||
#define SRE_MAGIC 20030419
|
||||
#define SRE_OP_FAILURE 0
|
||||
#define SRE_OP_SUCCESS 1
|
||||
#define SRE_OP_ANY 2
|
||||
|
|
Loading…
Reference in New Issue