Refactored the unicodeobject/ucnhash interface to hide the
implementation details inside the ucnhash module. Also cleaned up the Unicode copyright blurb a little; Secret Labs' internal revision history isn't that interesting...
This commit is contained in:
parent
a2bf2709b3
commit
0fdb90cafe
|
@ -1,20 +1,29 @@
|
|||
/* Unicode name database interface */
|
||||
|
||||
#include "Python.h"
|
||||
#include <stdlib.h>
|
||||
#ifndef Py_UCNHASH_H
|
||||
#define Py_UCNHASH_H
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* --- C API ----------------------------------------------------*/
|
||||
/* C API for usage by other Python modules */
|
||||
/* Table of data and function pointers through which other modules
   reach the Unicode character-name hash. */
typedef struct _Py_UCNHashAPI {

    /* Key count; callers treat a hash result above this as a miss. */
    unsigned long cKeys;

    /* Maximum length of a character name; callers use it to bound
       the scan for the closing brace. */
    unsigned long cchMax;

    /* Map the first cch characters of key to a table index. */
    unsigned long (*hash)(const char *key, unsigned int cch);

    /* Return an untyped pointer to table entry iKey; callers cast it
       to _Py_UnicodeCharacterName *. */
    const void *(*getValue)(unsigned long iKey);

} _Py_UCNHashAPI;
|
||||
/* revised ucnhash CAPI interface (exported through a PyCObject) */
|
||||
|
||||
typedef struct
|
||||
{
|
||||
const char *pszUCN;
|
||||
Py_UCS4 value;
|
||||
} _Py_UnicodeCharacterName;
|
||||
typedef struct {
|
||||
|
||||
/* Size of this struct */
|
||||
int size;
|
||||
|
||||
/* Get name for a given character code. Returns non-zero if
|
||||
success, zero if not. Does not set Python exceptions. */
|
||||
int (*getname)(Py_UCS4 code, char* buffer, int buflen);
|
||||
|
||||
/* Get character code for a given name. Same error handling
|
||||
as for getname. */
|
||||
int (*getcode)(const char* name, int namelen, Py_UCS4* code);
|
||||
|
||||
} _PyUnicode_Name_CAPI;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif /* !Py_UCNHASH_H */
|
||||
|
|
|
@ -50,16 +50,6 @@ print "done."
|
|||
|
||||
# strict error testing:
|
||||
print "Testing unicode character name expansion strict error handling....",
|
||||
k_cchMaxUnicodeName = 83
|
||||
|
||||
s = "\N{" + "1" * (k_cchMaxUnicodeName + 2) + "}"
|
||||
try:
|
||||
unicode(s, 'unicode-escape', 'strict')
|
||||
except UnicodeError:
|
||||
pass
|
||||
else:
|
||||
raise AssertionError, "failed to raise an exception when presented " \
|
||||
"with a UCN > k_cchMaxUnicodeName"
|
||||
try:
|
||||
unicode("\N{blah}", 'unicode-escape', 'strict')
|
||||
except UnicodeError:
|
||||
|
@ -67,6 +57,14 @@ except UnicodeError:
|
|||
else:
|
||||
raise AssertionError, "failed to raise an exception when given a bogus character name"
|
||||
|
||||
try:
|
||||
unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict')
|
||||
except UnicodeError:
|
||||
pass
|
||||
else:
|
||||
raise AssertionError, "failed to raise an exception when given a very " \
|
||||
"long bogus character name"
|
||||
|
||||
try:
|
||||
unicode("\N{SPACE", 'unicode-escape', 'strict')
|
||||
except UnicodeError:
|
||||
|
|
|
@ -1,5 +1,13 @@
|
|||
#include "Python.h"
|
||||
#include "ucnhash.h"
|
||||
|
||||
/* Modified for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
|
||||
|
||||
typedef struct {
|
||||
const char* pszUCN;
|
||||
Py_UCS4 value;
|
||||
}_Py_UnicodeCharacterName;
|
||||
|
||||
/*
|
||||
* The hash is produced using the algorithm described in
|
||||
* "Optimal algorithms for minimal perfect hashing",
|
||||
|
@ -14,11 +22,11 @@
|
|||
* Generated on: Fri Jul 14 08:00:58 2000
|
||||
*/
|
||||
|
||||
#define cKeys 10538
|
||||
#define k_cHashElements 18836
|
||||
#define k_cchMaxKey 83
|
||||
#define k_cKeys 10538
|
||||
|
||||
|
||||
staticforward const unsigned short G[k_cHashElements];
|
||||
staticforward const _Py_UnicodeCharacterName aucn[k_cKeys];
|
||||
|
||||
|
@ -34,8 +42,7 @@ static long f1(const char *key, unsigned int cch)
|
|||
while (--len >= 0)
|
||||
{
|
||||
/* (1000003 * x) ^ toupper(*(p++))
|
||||
* translated to handle > 32 bit longs
|
||||
*/
|
||||
* translated to handle > 32 bit longs */
|
||||
x = (0xf4243 * x);
|
||||
x = x & 0xFFFFFFFF;
|
||||
x = x ^ toupper(*(p++));
|
||||
|
@ -98,110 +105,96 @@ static long f2(const char *key, unsigned int cch)
|
|||
}
|
||||
|
||||
|
||||
static unsigned long hash(const char *key, unsigned int cch)
|
||||
static unsigned long
|
||||
hash(const char *key, unsigned int cch)
|
||||
{
|
||||
return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) % k_cHashElements;
|
||||
}
|
||||
|
||||
const void *getValue(unsigned long iKey)
|
||||
const _Py_UnicodeCharacterName *
|
||||
getValue(unsigned long iKey)
|
||||
{
|
||||
return &aucn[iKey];
|
||||
return (_Py_UnicodeCharacterName *) &aucn[iKey];
|
||||
}
|
||||
|
||||
/* Helper for adding objects to dictionaries. Check for errors with
|
||||
PyErr_Occurred() */
|
||||
static
|
||||
void insobj(PyObject *dict,
|
||||
char *name,
|
||||
PyObject *v)
|
||||
static int
|
||||
mystrnicmp(const char *s1, const char *s2, size_t count)
|
||||
{
|
||||
PyDict_SetItemString(dict, name, v);
|
||||
Py_XDECREF(v);
|
||||
char c1, c2;
|
||||
|
||||
if (count) {
|
||||
do {
|
||||
c1 = tolower(*(s1++));
|
||||
c2 = tolower(*(s2++));
|
||||
} while (--count && c1 == c2);
|
||||
return c1 - c2;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const _Py_UCNHashAPI hashAPI =
|
||||
/* bindings for the new API */
|
||||
|
||||
static int
|
||||
ucnhash_getname(Py_UCS4 code, char* buffer, int buflen)
|
||||
{
|
||||
k_cKeys,
|
||||
k_cchMaxKey,
|
||||
&hash,
|
||||
&getValue,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
ucnhash_getcode(const char* name, int namelen, Py_UCS4* code)
|
||||
{
|
||||
unsigned long j;
|
||||
|
||||
j = hash(name, namelen);
|
||||
|
||||
if (j > cKeys || mystrnicmp(name, getValue(j)->pszUCN, namelen) != 0)
|
||||
return 0;
|
||||
|
||||
*code = getValue(j)->value;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static const _PyUnicode_Name_CAPI hashAPI =
|
||||
{
|
||||
sizeof(_PyUnicode_Name_CAPI),
|
||||
ucnhash_getname,
|
||||
ucnhash_getcode
|
||||
};
|
||||
|
||||
static
|
||||
PyMethodDef Module_methods[] =
|
||||
PyMethodDef ucnhash_methods[] =
|
||||
{
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
static char *Module_docstring = "ucnhash hash function module";
|
||||
|
||||
/* Error reporting for module init functions */
|
||||
|
||||
#define Py_ReportModuleInitError(modname) { \
|
||||
PyObject *exc_type, *exc_value, *exc_tb; \
|
||||
PyObject *str_type, *str_value; \
|
||||
\
|
||||
/* Fetch error objects and convert them to strings */ \
|
||||
PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \
|
||||
if (exc_type && exc_value) { \
|
||||
str_type = PyObject_Str(exc_type); \
|
||||
str_value = PyObject_Str(exc_value); \
|
||||
} \
|
||||
else { \
|
||||
str_type = NULL; \
|
||||
str_value = NULL; \
|
||||
} \
|
||||
/* Try to format a more informative error message using the \
|
||||
original error */ \
|
||||
if (str_type && str_value && \
|
||||
PyString_Check(str_type) && PyString_Check(str_value)) \
|
||||
PyErr_Format( \
|
||||
PyExc_ImportError, \
|
||||
"initialization of module "modname" failed " \
|
||||
"(%s:%s)", \
|
||||
PyString_AS_STRING(str_type), \
|
||||
PyString_AS_STRING(str_value)); \
|
||||
else \
|
||||
PyErr_SetString( \
|
||||
PyExc_ImportError, \
|
||||
"initialization of module "modname" failed"); \
|
||||
Py_XDECREF(str_type); \
|
||||
Py_XDECREF(str_value); \
|
||||
Py_XDECREF(exc_type); \
|
||||
Py_XDECREF(exc_value); \
|
||||
Py_XDECREF(exc_tb); \
|
||||
}
|
||||
static char *ucnhash_docstring = "ucnhash hash function module";
|
||||
|
||||
|
||||
/* Create PyMethodObjects and register them in the module's dict */
|
||||
DL_EXPORT(void)
|
||||
initucnhash(void)
|
||||
{
|
||||
PyObject *module, *moddict;
|
||||
/* Create module */
|
||||
module = Py_InitModule4("ucnhash", /* Module name */
|
||||
Module_methods, /* Method list */
|
||||
Module_docstring, /* Module doc-string */
|
||||
(PyObject *)NULL, /* always pass this as *self */
|
||||
PYTHON_API_VERSION); /* API Version */
|
||||
if (module == NULL)
|
||||
goto onError;
|
||||
/* Add some constants to the module's dict */
|
||||
moddict = PyModule_GetDict(module);
|
||||
if (moddict == NULL)
|
||||
goto onError;
|
||||
PyObject *m, *d, *v;
|
||||
|
||||
m = Py_InitModule4(
|
||||
"ucnhash", /* Module name */
|
||||
ucnhash_methods, /* Method list */
|
||||
ucnhash_docstring, /* Module doc-string */
|
||||
(PyObject *)NULL, /* always pass this as *self */
|
||||
PYTHON_API_VERSION); /* API Version */
|
||||
if (!m)
|
||||
return;
|
||||
|
||||
d = PyModule_GetDict(m);
|
||||
if (!d)
|
||||
return;
|
||||
|
||||
/* Export C API */
|
||||
insobj(
|
||||
moddict,
|
||||
"ucnhashAPI",
|
||||
PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
|
||||
|
||||
onError:
|
||||
/* Check for errors and report them */
|
||||
if (PyErr_Occurred())
|
||||
Py_ReportModuleInitError("ucnhash");
|
||||
return;
|
||||
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
|
||||
PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
|
||||
Py_XDECREF(v);
|
||||
}
|
||||
|
||||
static const unsigned short G[] =
|
||||
|
|
|
@ -6,61 +6,35 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
|
|||
|
||||
Copyright (c) Corporation for National Research Initiatives.
|
||||
|
||||
--------------------------------------------------------------------
|
||||
The original string type implementation is:
|
||||
|
||||
Original header:
|
||||
--------------------------------------------------------------------
|
||||
Copyright (c) 1999 by Secret Labs AB
|
||||
Copyright (c) 1999 by Fredrik Lundh
|
||||
|
||||
* Yet another Unicode string type for Python. This type supports the
|
||||
* 16-bit Basic Multilingual Plane (BMP) only.
|
||||
*
|
||||
* Note that this string class supports embedded NULL characters. End
|
||||
* of string is given by the length attribute. However, the internal
|
||||
* representation always stores a trailing NULL to make it easier to
|
||||
* use unicode strings with standard APIs.
|
||||
*
|
||||
* History:
|
||||
* 1999-01-23 fl Created
|
||||
* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
|
||||
* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
|
||||
* 1999-03-06 fl Moved declarations to separate file, etc.
|
||||
* 1999-06-13 fl Changed join method semantics according to Tim's proposal
|
||||
* 1999-08-10 fl Some minor tweaks
|
||||
*
|
||||
* Written by Fredrik Lundh, January 1999.
|
||||
*
|
||||
* Copyright (c) 1999 by Secret Labs AB.
|
||||
* Copyright (c) 1999 by Fredrik Lundh.
|
||||
*
|
||||
* fredrik@pythonware.com
|
||||
* http://www.pythonware.com
|
||||
*
|
||||
* --------------------------------------------------------------------
|
||||
* This Unicode String Type is
|
||||
*
|
||||
* Copyright (c) 1999 by Secret Labs AB
|
||||
* Copyright (c) 1999 by Fredrik Lundh
|
||||
*
|
||||
* By obtaining, using, and/or copying this software and/or its
|
||||
* associated documentation, you agree that you have read, understood,
|
||||
* and will comply with the following terms and conditions:
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software and its
|
||||
* associated documentation for any purpose and without fee is hereby
|
||||
* granted, provided that the above copyright notice appears in all
|
||||
* copies, and that both that copyright notice and this permission notice
|
||||
* appear in supporting documentation, and that the name of Secret Labs
|
||||
* AB or the author not be used in advertising or publicity pertaining to
|
||||
* distribution of the software without specific, written prior
|
||||
* permission.
|
||||
*
|
||||
* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
|
||||
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
||||
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
* -------------------------------------------------------------------- */
|
||||
By obtaining, using, and/or copying this software and/or its
|
||||
associated documentation, you agree that you have read, understood,
|
||||
and will comply with the following terms and conditions:
|
||||
|
||||
Permission to use, copy, modify, and distribute this software and its
|
||||
associated documentation for any purpose and without fee is hereby
|
||||
granted, provided that the above copyright notice appears in all
|
||||
copies, and that both that copyright notice and this permission notice
|
||||
appear in supporting documentation, and that the name of Secret Labs
|
||||
AB or the author not be used in advertising or publicity pertaining to
|
||||
distribution of the software without specific, written prior
|
||||
permission.
|
||||
|
||||
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
|
||||
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
||||
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
--------------------------------------------------------------------
|
||||
|
||||
*/
|
||||
|
||||
#include "Python.h"
|
||||
|
||||
|
@ -1129,27 +1103,7 @@ int unicodeescape_decoding_error(const char **source,
|
|||
}
|
||||
}
|
||||
|
||||
static _Py_UCNHashAPI *pucnHash = NULL;
|
||||
|
||||
/* Case-insensitive comparison of at most `count` characters.

   Returns 0 when the first `count` characters of s1 and s2 are equal
   after lower-casing (always 0 when count is 0).  Otherwise returns the
   difference between the first pair of lower-cased characters that
   differ; callers should rely only on zero versus non-zero.

   The comparison does not stop at a NUL byte, so both strings must
   provide `count` readable characters or differ before either ends. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1 = 0, c2 = 0;

    while (count-- > 0) {
        /* The <ctype.h> functions are only defined for values
           representable as unsigned char (or EOF); a plain char may be
           negative, so convert before calling tolower(). */
        c1 = tolower((unsigned char) *s1++);
        c2 = tolower((unsigned char) *s2++);
        if (c1 != c2)
            break;
    }

    return c1 - c2;
}
|
||||
static _PyUnicode_Name_CAPI *unicode_names = NULL;
|
||||
|
||||
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||
int size,
|
||||
|
@ -1282,55 +1236,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
|||
/* Ok, we need to deal with Unicode Character Names now,
|
||||
* make sure we've imported the hash table data...
|
||||
*/
|
||||
if (pucnHash == NULL) {
|
||||
if (unicode_names == NULL) {
|
||||
PyObject *mod = 0, *v = 0;
|
||||
mod = PyImport_ImportModule("ucnhash");
|
||||
if (mod == NULL)
|
||||
goto onError;
|
||||
v = PyObject_GetAttrString(mod,"ucnhashAPI");
|
||||
v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
|
||||
Py_DECREF(mod);
|
||||
if (v == NULL)
|
||||
goto onError;
|
||||
pucnHash = PyCObject_AsVoidPtr(v);
|
||||
unicode_names = PyCObject_AsVoidPtr(v);
|
||||
Py_DECREF(v);
|
||||
if (pucnHash == NULL)
|
||||
if (unicode_names == NULL)
|
||||
goto onError;
|
||||
}
|
||||
|
||||
if (*s == '{') {
|
||||
const char *start = s + 1;
|
||||
const char *endBrace = start;
|
||||
unsigned long j;
|
||||
|
||||
/* look for either the closing brace, or we
|
||||
* exceed the maximum length of the unicode character names
|
||||
*/
|
||||
while (*endBrace != '}' &&
|
||||
(unsigned int)(endBrace - start) <=
|
||||
pucnHash->cchMax &&
|
||||
endBrace < end)
|
||||
{
|
||||
/* look for the closing brace */
|
||||
while (*endBrace != '}' && endBrace < end)
|
||||
endBrace++;
|
||||
}
|
||||
if (endBrace != end && *endBrace == '}') {
|
||||
j = pucnHash->hash(start, endBrace - start);
|
||||
if (j > pucnHash->cKeys ||
|
||||
mystrnicmp(
|
||||
start,
|
||||
((_Py_UnicodeCharacterName *)
|
||||
(pucnHash->getValue(j)))->pszUCN,
|
||||
(int)(endBrace - start)) != 0)
|
||||
{
|
||||
if (!unicode_names->getcode(start, endBrace-start, &chr)) {
|
||||
if (unicodeescape_decoding_error(
|
||||
&s, &x, errors,
|
||||
"Invalid Unicode Character Name"))
|
||||
{
|
||||
"Invalid Unicode Character Name")
|
||||
)
|
||||
goto onError;
|
||||
}
|
||||
goto ucnFallthrough;
|
||||
}
|
||||
chr = ((_Py_UnicodeCharacterName *)
|
||||
(pucnHash->getValue(j)))->value;
|
||||
s = endBrace + 1;
|
||||
goto store;
|
||||
} else {
|
||||
|
|
Loading…
Reference in New Issue