refactored the unicodeobject/ucnhash interface, to hide the

implementation details inside the ucnhash module.

also cleaned up the unicode copyright blurb a little; Secret Labs'
internal revision history isn't that interesting...
This commit is contained in:
Fredrik Lundh 2001-01-19 09:45:02 +00:00
parent a2bf2709b3
commit 0fdb90cafe
4 changed files with 144 additions and 208 deletions

View File

@ -1,20 +1,29 @@
/* Unicode name database interface */
#include "Python.h"
#include <stdlib.h>
#ifndef Py_UCNHASH_H
#define Py_UCNHASH_H
#ifdef __cplusplus
extern "C" {
#endif
/* --- C API ----------------------------------------------------*/
/* C API for usage by other Python modules */
/* Table of hash-lookup primitives handed to client code.  The client
   performs the lookup itself: it hashes the name, checks the index
   against cKeys, fetches the entry with getValue and compares the
   name string. */
typedef struct _Py_UCNHashAPI
{
/* Number of keys (character names) in the table. */
unsigned long cKeys;
/* Length of the longest key; clients use it as an upper bound when
   scanning for the end of a name. */
unsigned long cchMax;
/* Hash the first cch bytes of key to a table index. */
unsigned long (*hash)(const char *key, unsigned int cch);
/* Return a pointer to the table entry for index iKey.  The result is
   untyped; clients cast it to _Py_UnicodeCharacterName *. */
const void *(*getValue)(unsigned long iKey);
} _Py_UCNHashAPI;
/* revised ucnhash CAPI interface (exported through a PyCObject) */
/* One entry of the character name table: a name and the character
   value stored for it. */
typedef struct
{
/* Character name string. */
const char *pszUCN;
/* Character code associated with pszUCN. */
Py_UCS4 value;
} _Py_UnicodeCharacterName;
/* Function table published by the ucnhash module (wrapped in a
   PyCObject) so that other code can translate between character
   names and character codes without knowing how the name table is
   stored. */
typedef struct {
/* Size of this struct in bytes; the provider fills it in with
   sizeof(_PyUnicode_Name_CAPI). */
int size;
/* Get name for a given character code.  Returns non-zero on
   success, zero on failure.  Does not set Python exceptions.
   NOTE(review): buffer/buflen presumably describe the output
   buffer for the name -- confirm against the implementation. */
int (*getname)(Py_UCS4 code, char* buffer, int buflen);
/* Get character code for a given name.  name is given with an
   explicit length (namelen); callers pass a slice of a larger
   string, so it is not NUL-terminated at namelen.  On success the
   code is stored through *code.  Same error handling as for
   getname: non-zero on success, zero on failure, no Python
   exception set. */
int (*getcode)(const char* name, int namelen, Py_UCS4* code);
} _PyUnicode_Name_CAPI;
#ifdef __cplusplus
}
#endif
#endif /* !Py_UCNHASH_H */

View File

@ -50,16 +50,6 @@ print "done."
# strict error testing:
print "Testing unicode character name expansion strict error handling....",
k_cchMaxUnicodeName = 83
s = "\N{" + "1" * (k_cchMaxUnicodeName + 2) + "}"
try:
unicode(s, 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception when presented " \
"with a UCN > k_cchMaxUnicodeName"
try:
unicode("\N{blah}", 'unicode-escape', 'strict')
except UnicodeError:
@ -67,6 +57,14 @@ except UnicodeError:
else:
raise AssertionError, "failed to raise an exception when given a bogus character name"
# A bogus name far longer than any real character name must still be
# rejected with UnicodeError: the decoder scans up to the closing brace
# and then asks the name database, which has to fail cleanly.
try:
unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception when given a very " \
"long bogus character name"
try:
unicode("\N{SPACE", 'unicode-escape', 'strict')
except UnicodeError:

View File

@ -1,5 +1,13 @@
#include "Python.h"
#include "ucnhash.h"
/* Modified for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
/* One entry of the character name table (aucn[]): a name and the
   character value stored for it. */
typedef struct {
/* Character name string. */
const char* pszUCN;
/* Character code associated with pszUCN. */
Py_UCS4 value;
}_Py_UnicodeCharacterName;
/*
* The hash is produced using the algorithm described in
* "Optimal algorithms for minimal perfect hashing",
@ -14,11 +22,11 @@
* Generated on: Fri Jul 14 08:00:58 2000
*/
#define cKeys 10538
#define k_cHashElements 18836
#define k_cchMaxKey 83
#define k_cKeys 10538
staticforward const unsigned short G[k_cHashElements];
staticforward const _Py_UnicodeCharacterName aucn[k_cKeys];
@ -34,8 +42,7 @@ static long f1(const char *key, unsigned int cch)
while (--len >= 0)
{
/* (1000003 * x) ^ toupper(*(p++))
* translated to handle > 32 bit longs
*/
* translated to handle > 32 bit longs */
x = (0xf4243 * x);
x = x & 0xFFFFFFFF;
x = x ^ toupper(*(p++));
@ -98,110 +105,96 @@ static long f2(const char *key, unsigned int cch)
}
static unsigned long hash(const char *key, unsigned int cch)
static unsigned long
hash(const char *key, unsigned int cch)
{
return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) % k_cHashElements;
}
const void *getValue(unsigned long iKey)
const _Py_UnicodeCharacterName *
getValue(unsigned long iKey)
{
return &aucn[iKey];
return (_Py_UnicodeCharacterName *) &aucn[iKey];
}
/* Helper for adding objects to dictionaries. Check for errors with
PyErr_Occurred() */
static
void insobj(PyObject *dict,
char *name,
PyObject *v)
static int
mystrnicmp(const char *s1, const char *s2, size_t count)
{
PyDict_SetItemString(dict, name, v);
Py_XDECREF(v);
char c1, c2;
if (count) {
do {
c1 = tolower(*(s1++));
c2 = tolower(*(s2++));
} while (--count && c1 == c2);
return c1 - c2;
}
return 0;
}
static const _Py_UCNHashAPI hashAPI =
/* bindings for the new API */
static int
ucnhash_getname(Py_UCS4 code, char* buffer, int buflen)
{
k_cKeys,
k_cchMaxKey,
&hash,
&getValue,
return 0;
}
/* Look up the character code for a Unicode character name.

   name     -- character name; only the first namelen bytes are read,
               so it need not be NUL-terminated
   namelen  -- number of bytes in name
   code     -- receives the character code on success; left untouched
               on failure

   Returns 1 if the name is in the table, 0 otherwise.  The comparison
   is case-insensitive.  Does not set a Python exception. */
static int
ucnhash_getcode(const char* name, int namelen, Py_UCS4* code)
{
    const _Py_UnicodeCharacterName *entry;
    unsigned long j;

    /* An empty name never names a character, and a negative length
       would be converted to a huge unsigned count further down. */
    if (namelen <= 0)
        return 0;

    j = hash(name, namelen);

    /* The hash lands anywhere in 0..k_cHashElements-1, but the name
       table only has cKeys entries: valid indices are 0..cKeys-1. */
    if (j >= cKeys)
        return 0;

    entry = getValue(j);

    /* Require the lengths to agree before comparing.  Otherwise a
       prefix of a real name that happens to hash to the same slot
       would be accepted, and a name containing an embedded NUL could
       make the comparison run past the end of the table string. */
    if (strlen(entry->pszUCN) != (size_t) namelen ||
        mystrnicmp(name, entry->pszUCN, namelen) != 0)
        return 0;

    *code = entry->value;
    return 1;
}
/* The function table exported by this module.  Initializers are
   positional and must stay in the order of the fields of
   _PyUnicode_Name_CAPI: size, getname, getcode. */
static const _PyUnicode_Name_CAPI hashAPI =
{
sizeof(_PyUnicode_Name_CAPI),
ucnhash_getname,
ucnhash_getcode
};
static
PyMethodDef Module_methods[] =
PyMethodDef ucnhash_methods[] =
{
{NULL, NULL},
};
static char *Module_docstring = "ucnhash hash function module";
/* Error reporting for module init functions */
#define Py_ReportModuleInitError(modname) { \
PyObject *exc_type, *exc_value, *exc_tb; \
PyObject *str_type, *str_value; \
\
/* Fetch error objects and convert them to strings */ \
PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \
if (exc_type && exc_value) { \
str_type = PyObject_Str(exc_type); \
str_value = PyObject_Str(exc_value); \
} \
else { \
str_type = NULL; \
str_value = NULL; \
} \
/* Try to format a more informative error message using the \
original error */ \
if (str_type && str_value && \
PyString_Check(str_type) && PyString_Check(str_value)) \
PyErr_Format( \
PyExc_ImportError, \
"initialization of module "modname" failed " \
"(%s:%s)", \
PyString_AS_STRING(str_type), \
PyString_AS_STRING(str_value)); \
else \
PyErr_SetString( \
PyExc_ImportError, \
"initialization of module "modname" failed"); \
Py_XDECREF(str_type); \
Py_XDECREF(str_value); \
Py_XDECREF(exc_type); \
Py_XDECREF(exc_value); \
Py_XDECREF(exc_tb); \
}
static char *ucnhash_docstring = "ucnhash hash function module";
/* Create PyMethodObjects and register them in the module's dict */
DL_EXPORT(void)
initucnhash(void)
{
PyObject *module, *moddict;
/* Create module */
module = Py_InitModule4("ucnhash", /* Module name */
Module_methods, /* Method list */
Module_docstring, /* Module doc-string */
(PyObject *)NULL, /* always pass this as *self */
PYTHON_API_VERSION); /* API Version */
if (module == NULL)
goto onError;
/* Add some constants to the module's dict */
moddict = PyModule_GetDict(module);
if (moddict == NULL)
goto onError;
PyObject *m, *d, *v;
m = Py_InitModule4(
"ucnhash", /* Module name */
ucnhash_methods, /* Method list */
ucnhash_docstring, /* Module doc-string */
(PyObject *)NULL, /* always pass this as *self */
PYTHON_API_VERSION); /* API Version */
if (!m)
return;
d = PyModule_GetDict(m);
if (!d)
return;
/* Export C API */
insobj(
moddict,
"ucnhashAPI",
PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
onError:
/* Check for errors and report them */
if (PyErr_Occurred())
Py_ReportModuleInitError("ucnhash");
return;
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
Py_XDECREF(v);
}
static const unsigned short G[] =

View File

@ -6,61 +6,35 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
Copyright (c) Corporation for National Research Initiatives.
--------------------------------------------------------------------
The original string type implementation is:
Original header:
--------------------------------------------------------------------
Copyright (c) 1999 by Secret Labs AB
Copyright (c) 1999 by Fredrik Lundh
* Yet another Unicode string type for Python. This type supports the
* 16-bit Basic Multilingual Plane (BMP) only.
*
* Note that this string class supports embedded NULL characters. End
* of string is given by the length attribute. However, the internal
* representation always stores a trailing NULL to make it easier to
* use unicode strings with standard APIs.
*
* History:
* 1999-01-23 fl Created
* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
* 1999-03-06 fl Moved declarations to separate file, etc.
* 1999-06-13 fl Changed join method semantics according to Tim's proposal
* 1999-08-10 fl Some minor tweaks
*
* Written by Fredrik Lundh, January 1999.
*
* Copyright (c) 1999 by Secret Labs AB.
* Copyright (c) 1999 by Fredrik Lundh.
*
* fredrik@pythonware.com
* http://www.pythonware.com
*
* --------------------------------------------------------------------
* This Unicode String Type is
*
* Copyright (c) 1999 by Secret Labs AB
* Copyright (c) 1999 by Fredrik Lundh
*
* By obtaining, using, and/or copying this software and/or its
* associated documentation, you agree that you have read, understood,
* and will comply with the following terms and conditions:
*
* Permission to use, copy, modify, and distribute this software and its
* associated documentation for any purpose and without fee is hereby
* granted, provided that the above copyright notice appears in all
* copies, and that both that copyright notice and this permission notice
* appear in supporting documentation, and that the name of Secret Labs
* AB or the author not be used in advertising or publicity pertaining to
* distribution of the software without specific, written prior
* permission.
*
* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
* -------------------------------------------------------------------- */
By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:
Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------
*/
#include "Python.h"
@ -1129,27 +1103,7 @@ int unicodeescape_decoding_error(const char **source,
}
}
static _Py_UCNHashAPI *pucnHash = NULL;
/* Case-insensitively compare the first count bytes of s1 and s2.

   Returns 0 if the two ranges are equal ignoring case (or if count is
   0); otherwise returns the difference between the first pair of
   lower-cased bytes that differ.

   Note that the comparison does NOT stop at a NUL byte: exactly count
   bytes are examined unless a mismatch is found first, so both
   arguments must be readable for count bytes. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    while (count-- > 0) {
        /* tolower() is only defined for EOF and values representable
           as unsigned char; a plain char may be negative. */
        c1 = tolower((unsigned char) *s1++);
        c2 = tolower((unsigned char) *s2++);
        if (c1 != c2)
            return c1 - c2;
    }
    return 0;
}
static _PyUnicode_Name_CAPI *unicode_names = NULL;
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
int size,
@ -1282,55 +1236,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
/* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data...
*/
if (pucnHash == NULL) {
if (unicode_names == NULL) {
PyObject *mod = 0, *v = 0;
mod = PyImport_ImportModule("ucnhash");
if (mod == NULL)
goto onError;
v = PyObject_GetAttrString(mod,"ucnhashAPI");
v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
Py_DECREF(mod);
if (v == NULL)
goto onError;
pucnHash = PyCObject_AsVoidPtr(v);
unicode_names = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
if (pucnHash == NULL)
if (unicode_names == NULL)
goto onError;
}
if (*s == '{') {
const char *start = s + 1;
const char *endBrace = start;
unsigned long j;
/* look for either the closing brace, or we
* exceed the maximum length of the unicode character names
*/
while (*endBrace != '}' &&
(unsigned int)(endBrace - start) <=
pucnHash->cchMax &&
endBrace < end)
{
/* look for the closing brace */
while (*endBrace != '}' && endBrace < end)
endBrace++;
}
if (endBrace != end && *endBrace == '}') {
j = pucnHash->hash(start, endBrace - start);
if (j > pucnHash->cKeys ||
mystrnicmp(
start,
((_Py_UnicodeCharacterName *)
(pucnHash->getValue(j)))->pszUCN,
(int)(endBrace - start)) != 0)
{
if (!unicode_names->getcode(start, endBrace-start, &chr)) {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Invalid Unicode Character Name"))
{
"Invalid Unicode Character Name")
)
goto onError;
}
goto ucnFallthrough;
}
chr = ((_Py_UnicodeCharacterName *)
(pucnHash->getValue(j)))->value;
s = endBrace + 1;
goto store;
} else {