cpython/Modules/unicodedata.c

/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 data base.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"

/* character properties */

/* One entry of the generated character property table
   (_PyUnicode_Database_Records).  Every field is a single byte holding
   either the property value itself or an index into a table of names. */
typedef struct {
    const unsigned char category;	/* index into
					   _PyUnicode_CategoryNames */
    const unsigned char	combining; 	/* combining class value 0 - 255 */
    const unsigned char	bidirectional; 	/* index into
					   _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;	/* true if mirrored in bidir mode */
} _PyUnicode_DatabaseRecord;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

/* Return the property record for a code point.

   The record number is found through the two-level index1/index2
   tables: the high bits of the code point select a block, the low
   SHIFT bits select the slot inside it.  Code points at or above
   0x110000 are mapped to record 0. */
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index = 0;

    if (code < 0x110000) {
        int block = index1[(code>>SHIFT)];
        index = index2[(block<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

/* Return the property record for the first character of the unicode
   object v.  Callers check beforehand that v has exactly one character. */
static const _PyUnicode_DatabaseRecord*
_getrecord(PyUnicodeObject* v)
{
    Py_UNICODE first = *PyUnicode_AS_UNICODE(v);

    return _getrecord_ex(first);
}

/* --- Module API --------------------------------------------------------- */

/* decimal(unichr[, default])

   Return the decimal value of a one-character unicode string as a
   Python int.  If the character has none, return the default when one
   was given, otherwise raise ValueError.  Raises TypeError when the
   argument is not exactly one character long. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
    if (rc >= 0)
        return PyInt_FromLong(rc);

    /* A negative result means "no decimal value". */
    if (defobj != NULL) {
        Py_INCREF(defobj);
        return defobj;
    }
    PyErr_SetString(PyExc_ValueError,
                    "not a decimal");
    return NULL;
}

/* digit(unichr[, default])

   Return the digit value of a one-character unicode string as a Python
   int.  If the character has none, return the default when one was
   given, otherwise raise ValueError.  Raises TypeError when the
   argument is not exactly one character long. */
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
    if (rc >= 0)
        return PyInt_FromLong(rc);

    /* A negative result means "no digit value". */
    if (defobj != NULL) {
        Py_INCREF(defobj);
        return defobj;
    }
    PyErr_SetString(PyExc_ValueError, "not a digit");
    return NULL;
}

/* numeric(unichr[, default])

   Return the numeric value of a one-character unicode string as a
   Python float.  If the lookup yields a negative value, return the
   default when one was given, otherwise raise ValueError.  Raises
   TypeError when the argument is not exactly one character long. */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    double rc;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        /* Negative result: fall back to the default or fail. */
        if (defobj != NULL) {
            Py_INCREF(defobj);
            return defobj;
        }
        PyErr_SetString(PyExc_ValueError, "not a numeric character");
        return NULL;
    }
    return PyFloat_FromDouble(rc);
}

/* category(unichr)

   Return the general category name of a one-character unicode string
   as a Python string.  Raises TypeError when the argument is not
   exactly one character long. */
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    const _PyUnicode_DatabaseRecord *record;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    /* The record stores an index into the table of category names. */
    record = _getrecord(v);
    return PyString_FromString(_PyUnicode_CategoryNames[(int) record->category]);
}

/* bidirectional(unichr)

   Return the bidirectional category name of a one-character unicode
   string as a Python string.  Raises TypeError when the argument is
   not exactly one character long. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    const _PyUnicode_DatabaseRecord *record;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    /* The record stores an index into the table of bidirectional names. */
    record = _getrecord(v);
    return PyString_FromString(
        _PyUnicode_BidirectionalNames[(int) record->bidirectional]);
}

/* combining(unichr)

   Return the combining class of a one-character unicode string as a
   Python int.  Raises TypeError when the argument is not exactly one
   character long. */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    const _PyUnicode_DatabaseRecord *record;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    record = _getrecord(v);
    return PyInt_FromLong((int) record->combining);
}

/* mirrored(unichr)

   Return the "mirrored" flag of a one-character unicode string as a
   Python int.  Raises TypeError when the argument is not exactly one
   character long. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    const _PyUnicode_DatabaseRecord *record;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    record = _getrecord(v);
    return PyInt_FromLong((int) record->mirrored);
}

/* decomposition(unichr)

   Return the decomposition mapping of a one-character unicode string
   as a Python string: the prefix text (if any) followed by the mapped
   code points as space-separated, at least four-digit hex numbers.
   The string is empty when the record holds no code points and an
   empty prefix.  Raises TypeError when the argument is not exactly one
   character long. */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    const char *prefix;
    int code, index, count, i;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    code = (int) *PyUnicode_AS_UNICODE(v);

    /* Two-level table lookup; out-of-range values use record 0. */
    index = 0;
    if (code >= 0 && code < 0x110000) {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* The first word of a record packs the number of code points that
       follow (high byte) and an index into decomp_prefix (low byte). */
    count = decomp_data[index] >> 8;
    prefix = decomp_prefix[decomp_data[index] & 255];

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Start the output with the prefix text. */
    i = strlen(prefix);
    memcpy(decomp, prefix, i);

    /* Append each code point, separated by a single space. */
    for (; count > 0; count--) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}

/* Locate the decomposition record of a code point.

   On return *count is the number of code points in the decomposition
   (high byte of the record's first word), *prefix is the prefix code
   (low byte of that word) and *index is the position in decomp_data of
   the first decomposed code point.  Code points at or above 0x110000
   use record 0. */
void
get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
{
    int slot = 0;

    if (code < 0x110000) {
        slot = decomp_index1[(code>>DECOMP_SHIFT)];
        slot = decomp_index2[(slot<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    *count = decomp_data[slot] >> 8;
    *prefix = decomp_data[slot] & 255;

    /* Step past the header word to the first code point. */
    *index = slot + 1;
}

/* Constants for the arithmetic Hangul syllable (de)composition used by
   the normalization and name code below.  SBase is the first
   precomposed syllable; LBase, VBase and TBase are the bases added to
   the leading-consonant, vowel and trailing-consonant indices (a
   trailing index of 0, i.e. TBase itself, means "no trailing
   consonant").  LCount, VCount and TCount are the sizes of those index
   ranges; NCount and SCount are derived from them. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

/* Return a new unicode object holding the decomposed form of input.

   With k == 0 only decompositions without a prefix code are applied
   (NFD); with k != 0 prefixed (compatibility) decompositions are
   applied as well (NFKD).  Hangul syllables are decomposed
   arithmetically.  After decomposition, characters with non-zero
   combining classes are sorted into canonical order.

   input must be a non-empty unicode object (the caller special-cases
   the empty string).  Returns NULL with an exception set on failure. */
static PyObject*
nfd_nfkd(PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    int space, stackptr, isize;
    int index, prefix, count;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                /* result is a unicode object, so its length must be
                   read with the unicode accessor. */
                int newsize = PyUnicode_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1) {
                    /* A failed resize leaves result alive; release it
                       so that it does not leak. */
                    Py_DECREF(result);
                    return NULL;
                }
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space --;
                }
                continue;
            }
            /* Other decompositions. */
            get_decomp_record(code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            /* Bubble the character at o[1] towards the front until it
               meets a starter or a combining class not above its own. */
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}

/* Map a code point to its index in one of the composition tables.

   nfc is an array of ranges terminated by an entry whose start is 0
   and, judging by the early exit below, ordered by ascending start.
   Each entry maps the code points start .. start+count onto
   consecutive indices beginning at its index field.  Returns -1 when
   code falls in no range. */
static int
find_nfc_index(struct reindex* nfc, Py_UNICODE code)
{
    struct reindex *entry;

    for (entry = nfc; entry->start; entry++) {
        int start = entry->start;

        /* Every later range starts higher, so nothing can match. */
        if (code < start)
            break;
        if (code <= start + entry->count) {
            int delta = code - start;
            return entry->index + delta;
        }
    }
    return -1;
}

/* Return a new unicode object holding the composed form of input.

   The input is first decomposed with nfd_nfkd(input, k) (k == 0 gives
   NFC, k != 0 gives NFKC) and the result is then recomposed in place:
   Hangul jamo sequences arithmetically, everything else through the
   nfc_first / nfc_last / comp_data tables.

   input must be a non-empty unicode object.  Returns NULL with an
   exception set on failure. */
static PyObject*
nfc_nfkc(PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    /* Positions already merged into an earlier character; they must
       not be copied to the output when the read pointer reaches them. */
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
      for (index = 0; index < cskipped; index++) {
          if (skipped[index] == i) {
              /* *i character is skipped.
                 Remove from list. */
              skipped[index] = skipped[cskipped-1];
              cskipped--;
              i++;
              goto again; /* continue while */
          }
      }
      /* Hangul Composition. We don't need to check for <LV,T>
         pairs, since we always have decomposed data.
         The vowel range is VBase .. VBase+VCount-1, so the upper
         bound is exclusive. */
      if (LBase <= *i && *i < (LBase+LCount) &&
          i + 1 < end &&
          VBase <= i[1] && i[1] < (VBase+VCount)) {
          int LIndex, VIndex;
          LIndex = i[0] - LBase;
          VIndex = i[1] - VBase;
          code = SBase + (LIndex*VCount+VIndex)*TCount;
          i+=2;
          /* Trailing consonants are TBase+1 .. TBase+TCount-1; TBase
             itself stands for "no trailing consonant" and must not be
             consumed here. */
          if (i < end &&
              TBase < *i && *i < (TBase+TCount)) {
              code += *i-TBase;
              i++;
          }
          *o++ = code;
          continue;
      }

      f = find_nfc_index(nfc_first, *i);
      if (f == -1) {
          *o++ = *i++;
          continue;
      }
      /* Find next unblocked character. */
      i1 = i+1;
      comb = 0;
      while (i1 < end) {
          int comb1 = _getrecord_ex(*i1)->combining;
          if (comb1 && comb == comb1) {
              /* Character is blocked. */
              i1++;
              continue;
          }
          l = find_nfc_index(nfc_last, *i1);
          /* *i1 cannot be combined with *i. If *i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
          if (l == -1) {
            not_combinable:
              if (comb1 == 0)
                  break;
              comb = comb1;
              i1++;
              continue;
          }
          index = f*TOTAL_LAST + l;
          index1 = comp_index[index >> COMP_SHIFT];
          code = comp_data[(index1<<COMP_SHIFT)+
                           (index&((1<<COMP_SHIFT)-1))];
          if (code == 0)
              goto not_combinable;

          /* Replace the original character. */
          *i = code;
          /* Mark the second character unused.  The list has a fixed
             capacity; make an overflow visible in debug builds. */
          assert(cskipped < (int)(sizeof(skipped) / sizeof(skipped[0])));
          skipped[cskipped++] = i1;
          i1++;
          f = find_nfc_index(nfc_first, *i);
          if (f == -1)
              break;
      }
      *o++ = *i++;
    }
    if (o != end) {
        /* Trim the tail freed by composition.  A failed resize leaves
           result alive with an exception set, so release it and
           report the error instead of returning it. */
        if (PyUnicode_Resize(&result,
                             o - PyUnicode_AS_UNICODE(result)) == -1) {
            Py_DECREF(result);
            return NULL;
        }
    }
    return result;
}
		
/* normalize(form, unistr)

   Return the normal form `form` of the unicode string unistr.  form
   must be one of "NFC", "NFKC", "NFD" or "NFKD"; any other value
   raises ValueError.  An empty input string is returned unchanged. */
static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    /* The name after ':' appears in argument-error messages, so it
       must match the Python-level function name. */
    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them  later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and folded in as h = h * scale + byte.
   Whenever bits 24..31 of the running hash become set, that byte is
   XOR-ed into the low bits and the hash is reduced to 24 bits, so the
   result always fits in 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for unsigned char values and EOF,
           so convert before the call rather than after it. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

/* Letter sequences used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the leading-consonant index, column 1 by the
   vowel index and column 2 by the trailing-consonant index.  An empty
   string means that position contributes no letters to the name.  The
   0 entries fill the rows beyond LCount (column 0) and VCount
   (column 1), which callers never index. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

/* Return 1 if code lies in one of the CJK unified ideograph ranges
   known to this module, 0 otherwise. */
static int
is_unified_ideograph(Py_UCS4 code)
{
    /* CJK Ideograph Extension A */
    if (0x3400 <= code && code <= 0x4DB5)
        return 1;
    /* CJK Ideograph */
    if (0x4E00 <= code && code <= 0x9FA5)
        return 1;
    /* CJK Ideograph Extension B */
    if (0x20000 <= code && code <= 0x2A6D6)
        return 1;
    return 0;
}

/* Write the character name of code into buffer as a NUL-terminated
   string.

   Hangul syllable and CJK unified ideograph names are generated
   algorithmically; all other names are assembled word by word from
   the compressed phrasebook/lexicon tables.

   buffer has room for buflen bytes.  Returns 1 on success, and 0 if
   the code point has no name or the name does not fit in the buffer. */
static int
_getucname(Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        /* Bounded formatting; the cast makes the argument match %X
           however Py_UCS4 is defined on this platform. */
        PyOS_snprintf(buffer, buflen, "CJK UNIFIED IDEOGRAPH-%X",
                      (unsigned int) code);
        return 1;
    }

    if (code >= 0x110000)
        return 0;

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index: values at or above phrasebook_short start a
           two-byte index, smaller values are a one-byte index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            /* buffer[buflen] is already out of bounds, so the check
               must reject i == buflen as well. */
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        /* For the final 0x80 byte this stores the terminating NUL. */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of name */
    }

    return 1;
}

/* Return 1 if the name of code point `code` equals the first namelen
   bytes of `name`, compared case-insensitively; 0 otherwise (also when
   the code point has no name). */
static int
_cmpname(int code, const char* name, int namelen)
{
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        /* Stop at the end of the generated name: without this, a name
           containing NUL bytes or longer than the buffer would make
           the loop read past the terminator. */
        if (buffer[i] == '\0')
            return 0;
        /* toupper() is only defined for unsigned char values and EOF. */
        if (toupper((unsigned char) name[i]) != buffer[i])
            return 0;
    }
    /* All namelen characters matched; the names are equal only if the
       generated name ends here as well. */
    return buffer[namelen] == '\0';
}

/* Find the longest entry in the given column of hangul_syllables that
   is a prefix of str, looking at rows 0 .. count-1.

   On return *len is the length of that entry and *pos its row.  Among
   entries of equal length the first one wins.  If no entry matches,
   *len is 0 and *pos is -1. */
static void 
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int row;
    int best_len = -1;
    int best_row = -1;

    for (row = 0; row < count; row++) {
        char *candidate = hangul_syllables[row][column];
        int cand_len = strlen(candidate);

        /* Only a strictly longer match can replace the current best. */
        if (cand_len > best_len &&
            strncmp(str, candidate, cand_len) == 0) {
            best_len = cand_len;
            best_row = row;
        }
    }

    if (best_len == -1) {
        *len = 0;
        *pos = -1;
    }
    else {
        *len = best_len;
        *pos = best_row;
    }
}

/* Look up the code point whose character name is the first namelen
   bytes of name.

   Hangul syllable and CJK unified ideograph names are parsed
   algorithmically; all other names are looked up in the code_hash
   table (comparison is case-insensitive).  On success stores the code
   point in *code and returns 1; returns 0 if the name is unknown. */
static int
_getcode(const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int L, V, T, len;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        /* All three parts must be recognized and together they must
           consume the whole name. */
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(v, name, namelen)) {
        *code = v;
        return 1;
    }
    /* Collision: keep probing until a match or an empty slot. */
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

/* Name-lookup function table that the module init function exports as
   the "ucnhash_CAPI" attribute: the size of the structure followed by
   the code-to-name and name-to-code functions. */
static const _PyUnicode_Name_CAPI hashAPI = 
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

/* name(unichr[, default])

   Return the character name of a one-character unicode string as a
   Python string.  If the character has no name, return the default
   when one was given, otherwise raise ValueError.  Raises TypeError
   when the argument is not exactly one character long. */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    PyUnicodeObject* v;
    PyObject* defobj = NULL;

    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
                   name, sizeof(name)))
        return Py_BuildValue("s", name);

    /* No name available: fall back to the default or fail. */
    if (defobj != NULL) {
        Py_INCREF(defobj);
        return defobj;
    }
    PyErr_SetString(PyExc_ValueError, "no such name");
    return NULL;
}

/* lookup(name)

   Return the character with the given name as a one-character unicode
   string.  Raises KeyError if no character has that name. */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[1];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(name, namelen, &code)) {
        /* Let the interpreter build the message; this avoids a
           hand-sized temporary buffer whose allocation could fail
           unnoticed. */
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}

/* XXX Add doc strings. */

/* Method table of the module, handed to Py_InitModule3() by the init
   function.  Each entry maps a Python-level function name to its C
   implementation; all of them receive their arguments as a tuple
   (METH_VARARGS).  No per-function doc strings are supplied. */
static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS},
    {"digit", unicodedata_digit, METH_VARARGS},
    {"numeric", unicodedata_numeric, METH_VARARGS},
    {"category", unicodedata_category, METH_VARARGS},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
    {"combining", unicodedata_combining, METH_VARARGS},
    {"mirrored", unicodedata_mirrored, METH_VARARGS},
    {"decomposition",unicodedata_decomposition, METH_VARARGS},
    {"name", unicodedata_name, METH_VARARGS},
    {"lookup", unicodedata_lookup, METH_VARARGS},
    {"normalize", unicodedata_normalize, METH_VARARGS},
    {NULL, NULL}		/* sentinel */
};

PyDoc_STRVAR(unicodedata_docstring, "unicode character database");

/* Module initialization: create the "unicodedata" module, add the
   "unidata_version" string constant and export the name-lookup C API
   as the "ucnhash_CAPI" attribute. */
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    /* PyModule_AddObject() takes over the reference only when it
       succeeds; on failure the object has to be released here. */
    if (v != NULL && PyModule_AddObject(m, "ucnhash_CAPI", v) < 0)
        Py_DECREF(v);
}

/* 
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								/* ------------------------------------------------------------------------
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+								   unicodedata -- Provides access to the Unicode 3.2 data base.
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+								   Data was extracted from the Unicode 3.2 UnicodeData.txt file.
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
-												unicode database compression, step 2:

- fixed attributions
- moved decomposition data to a separate table, in preparation
  for step 3 (which won't happen before 2.0 final, promise!)
- use relative paths in the generator script

I have a lot more stuff in the works for 2.1, but let's leave
that for another day...

											
										
										
											2000-09-25 05:07:06 -03:00
+								   Written by Marc-Andre Lemburg (mal@lemburg.com).
 								   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+								   Modified by Martin v. L<EFBFBD>wis (martin@v.loewis.de)
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
-												unicode database compression, step 2:

- fixed attributions
- moved decomposition data to a separate table, in preparation
  for step 3 (which won't happen before 2.0 final, promise!)
- use relative paths in the generator script

I have a lot more stuff in the works for 2.1, but let's leave
that for another day...

											
										
										
											2000-09-25 05:07:06 -03:00
+								   Copyright (c) Corporation for National Research Initiatives.
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
 								   ------------------------------------------------------------------------ */
 								#include "Python.h"
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								#include "ucnhash.h"
 								/* character properties */
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
-												compress unicode decomposition tables (this saves another 55k)

											
										
										
											2001-01-21 18:41:08 -04:00
+								typedef struct {
 								    const unsigned char category;	/* index into
 													   _PyUnicode_CategoryNames */
 								    const unsigned char	combining; 	/* combining class value 0 - 255 */
 								    const unsigned char	bidirectional; 	/* index into
 													   _PyUnicode_BidirectionalNames */
 								    const unsigned char mirrored;	/* true if mirrored in bidir mode */
 								} _PyUnicode_DatabaseRecord;
 								/* data file generated by Tools/unicode/makeunicodedata.py */
 								#include "unicodedata_db.h"
 								static const _PyUnicode_DatabaseRecord*
-												Patch #626485: Support Unicode normalization.

											
										
										
											2002-11-23 18:08:15 -04:00
+								_getrecord_ex(Py_UCS4 code)
-												compress unicode decomposition tables (this saves another 55k)

											
										
										
											2001-01-21 18:41:08 -04:00
+								{
 								    int index;
-												Fix SF bug #694816, remove comparison of unsigned value < 0

											
										
										
											2003-02-27 23:14:37 -04:00
+								    if (code >= 0x110000)
-												compress unicode decomposition tables (this saves another 55k)

											
										
										
											2001-01-21 18:41:08 -04:00
+								        index = 0;
 								    else {
 								        index = index1[(code>>SHIFT)];
 								        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
 								    }
 								    return &_PyUnicode_Database_Records[index];
 								}
-												Patch #626485: Support Unicode normalization.

											
										
										
											2002-11-23 18:08:15 -04:00
+								static const _PyUnicode_DatabaseRecord*
 								_getrecord(PyUnicodeObject* v)
 								{
 								    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
 								}
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								/* --- Module API --------------------------------------------------------- */
 								static PyObject *
-												compress unicode decomposition tables (this saves another 55k)

											
										
										
											2001-01-21 18:41:08 -04:00
+								unicodedata_decimal(PyObject *self, PyObject *args)
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								{
 								    PyUnicodeObject *v;
 								    PyObject *defobj = NULL;
 								    long rc;
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
-												compress unicode decomposition tables (this saves another 55k)

											
										
										
											2001-01-21 18:41:08 -04:00
+								        return NULL;
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								    if (PyUnicode_GET_SIZE(v) != 1) {
 									PyErr_SetString(PyExc_TypeError,
 											"need a single Unicode character as parameter");
-												compress unicode decomposition tables (this saves another 55k)

											
										
										
											2001-01-21 18:41:08 -04:00
+								        return NULL;
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								    }
 								    rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
 								    if (rc < 0) {
 									if (defobj == NULL) {
 									    PyErr_SetString(PyExc_ValueError,
 											    "not a decimal");
-												compress unicode decomposition tables (this saves another 55k)

											
										
										
											2001-01-21 18:41:08 -04:00
+								            return NULL;
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+									}
 									else {
 									    Py_INCREF(defobj);
 									    return defobj;
 									}
 								    }
 								    return PyInt_FromLong(rc);
 								}
 								/* unicodedata.digit(unichr[, default])
 								
 								   Takes a one-character unicode object and an optional default.
 								   The value is obtained from Py_UNICODE_TODIGIT(); when that is
 								   negative, the default (new reference) is returned if one was
 								   passed, otherwise ValueError is raised.  A unicode object whose
 								   length is not 1 raises TypeError. */
 								static PyObject *
 								unicodedata_digit(PyObject *self, PyObject *args)
 								{
 								    PyUnicodeObject *ch;
 								    PyObject *fallback = NULL;
 								    long value;
 								
 								    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &ch, &fallback))
 								        return NULL;
 								
 								    if (PyUnicode_GET_SIZE(ch) != 1) {
 								        PyErr_SetString(PyExc_TypeError,
 								                        "need a single Unicode character as parameter");
 								        return NULL;
 								    }
 								
 								    value = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(ch));
 								    if (value >= 0)
 								        return PyInt_FromLong(value);
 								
 								    /* No digit value: hand back the caller's default, if any. */
 								    if (fallback != NULL) {
 								        Py_INCREF(fallback);
 								        return fallback;
 								    }
 								    PyErr_SetString(PyExc_ValueError, "not a digit");
 								    return NULL;
 								}
 								/* unicodedata.numeric(unichr[, default])
 								
 								   Takes a one-character unicode object and an optional default.
 								   The value is obtained from Py_UNICODE_TONUMERIC(); when that is
 								   negative, the default (new reference) is returned if one was
 								   passed, otherwise ValueError is raised.  A unicode object whose
 								   length is not 1 raises TypeError. */
 								static PyObject *
 								unicodedata_numeric(PyObject *self, PyObject *args)
 								{
 								    PyUnicodeObject *ch;
 								    PyObject *fallback = NULL;
 								    double value;
 								
 								    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &ch, &fallback))
 								        return NULL;
 								
 								    if (PyUnicode_GET_SIZE(ch) != 1) {
 								        PyErr_SetString(PyExc_TypeError,
 								                        "need a single Unicode character as parameter");
 								        return NULL;
 								    }
 								
 								    value = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(ch));
 								    /* Written as !(value < 0) so that every value the "< 0" test
 								       does not reject is returned as a float. */
 								    if (!(value < 0))
 								        return PyFloat_FromDouble(value);
 								
 								    /* No numeric value: hand back the caller's default, if any. */
 								    if (fallback != NULL) {
 								        Py_INCREF(fallback);
 								        return fallback;
 								    }
 								    PyErr_SetString(PyExc_ValueError, "not a numeric character");
 								    return NULL;
 								}
 								/* unicodedata.category(unichr)
 								
 								   Looks up the database record of a one-character unicode object
 								   and returns the entry of _PyUnicode_CategoryNames selected by the
 								   record's category field, as a Python string.  A unicode object
 								   whose length is not 1 raises TypeError. */
 								static PyObject *
 								unicodedata_category(PyObject *self, PyObject *args)
 								{
 								    PyUnicodeObject *ch;
 								    const _PyUnicode_DatabaseRecord *record;
 								
 								    if (!PyArg_ParseTuple(args, "O!:category",
 								                          &PyUnicode_Type, &ch))
 								        return NULL;
 								
 								    if (PyUnicode_GET_SIZE(ch) != 1) {
 								        PyErr_SetString(PyExc_TypeError,
 								                        "need a single Unicode character as parameter");
 								        return NULL;
 								    }
 								
 								    record = _getrecord(ch);
 								    return PyString_FromString(_PyUnicode_CategoryNames[(int) record->category]);
 								}
 								/* unicodedata.bidirectional(unichr)
 								
 								   Looks up the database record of a one-character unicode object
 								   and returns the entry of _PyUnicode_BidirectionalNames selected
 								   by the record's bidirectional field, as a Python string.  A
 								   unicode object whose length is not 1 raises TypeError. */
 								static PyObject *
 								unicodedata_bidirectional(PyObject *self, PyObject *args)
 								{
 								    PyUnicodeObject *ch;
 								    const _PyUnicode_DatabaseRecord *record;
 								
 								    if (!PyArg_ParseTuple(args, "O!:bidirectional",
 								                          &PyUnicode_Type, &ch))
 								        return NULL;
 								
 								    if (PyUnicode_GET_SIZE(ch) != 1) {
 								        PyErr_SetString(PyExc_TypeError,
 								                        "need a single Unicode character as parameter");
 								        return NULL;
 								    }
 								
 								    record = _getrecord(ch);
 								    return PyString_FromString(
 								        _PyUnicode_BidirectionalNames[(int) record->bidirectional]);
 								}
 								/* unicodedata.combining(unichr)
 								
 								   Returns the combining field of the database record of a
 								   one-character unicode object as a Python int.  A unicode object
 								   whose length is not 1 raises TypeError. */
 								static PyObject *
 								unicodedata_combining(PyObject *self, PyObject *args)
 								{
 								    PyUnicodeObject *ch;
 								    int combining_class;
 								
 								    if (!PyArg_ParseTuple(args, "O!:combining",
 								                          &PyUnicode_Type, &ch))
 								        return NULL;
 								
 								    if (PyUnicode_GET_SIZE(ch) != 1) {
 								        PyErr_SetString(PyExc_TypeError,
 								                        "need a single Unicode character as parameter");
 								        return NULL;
 								    }
 								
 								    combining_class = (int) _getrecord(ch)->combining;
 								    return PyInt_FromLong(combining_class);
 								}
 								/* unicodedata.mirrored(unichr)
 								
 								   Returns the mirrored field of the database record of a
 								   one-character unicode object as a Python int.  A unicode object
 								   whose length is not 1 raises TypeError. */
 								static PyObject *
 								unicodedata_mirrored(PyObject *self, PyObject *args)
 								{
 								    PyUnicodeObject *ch;
 								    int mirrored_flag;
 								
 								    if (!PyArg_ParseTuple(args, "O!:mirrored",
 								                          &PyUnicode_Type, &ch))
 								        return NULL;
 								
 								    if (PyUnicode_GET_SIZE(ch) != 1) {
 								        PyErr_SetString(PyExc_TypeError,
 								                        "need a single Unicode character as parameter");
 								        return NULL;
 								    }
 								
 								    mirrored_flag = (int) _getrecord(ch)->mirrored;
 								    return PyInt_FromLong(mirrored_flag);
 								}
 								/* unicodedata.decomposition(unichr)
 								
 								   Returns the decomposition mapping of a one-character unicode
 								   object as a Python string: the prefix text from decomp_prefix,
 								   followed by each mapped code point as at least four upper-case
 								   hex digits, separated by single spaces.  A unicode object whose
 								   length is not 1 raises TypeError. */
 								static PyObject *
 								unicodedata_decomposition(PyObject *self, PyObject *args)
 								{
 								    PyUnicodeObject *ch;
 								    char text[256];
 								    const char *prefix;
 								    int code, rec, remaining, pos;
 								
 								    if (!PyArg_ParseTuple(args, "O!:decomposition",
 								                          &PyUnicode_Type, &ch))
 								        return NULL;
 								
 								    if (PyUnicode_GET_SIZE(ch) != 1) {
 								        PyErr_SetString(PyExc_TypeError,
 								                        "need a single Unicode character as parameter");
 								        return NULL;
 								    }
 								
 								    code = (int) *PyUnicode_AS_UNICODE(ch);
 								
 								    /* Two-level table lookup; out-of-range code points use record 0. */
 								    rec = 0;
 								    if (!(code < 0 || code >= 0x110000)) {
 								        rec = decomp_index1[(code>>DECOMP_SHIFT)];
 								        rec = decomp_index2[(rec<<DECOMP_SHIFT)+
 								                            (code&((1<<DECOMP_SHIFT)-1))];
 								    }
 								
 								    /* high byte is number of hex bytes (usually one or two), low byte
 								       is prefix code (index into decomp_prefix) */
 								    remaining = decomp_data[rec] >> 8;
 								    prefix = decomp_prefix[decomp_data[rec] & 255];
 								
 								    /* XXX: could allocate the PyString up front instead
 								       (strlen(prefix) + 5 * count + 1 bytes) */
 								
 								    /* copy prefix */
 								    pos = strlen(prefix);
 								    memcpy(text, prefix, pos);
 								
 								    for (; remaining > 0; remaining--) {
 								        /* Separate entries with a space, except at the very start. */
 								        if (pos)
 								            text[pos++] = ' ';
 								        assert((size_t)pos < sizeof(text));
 								        rec++;
 								        PyOS_snprintf(text + pos, sizeof(text) - pos, "%04X",
 								                      decomp_data[rec]);
 								        pos += strlen(text + pos);
 								    }
 								
 								    text[pos] = '\0';
 								    return PyString_FromString(text);
 								}
-												Patch #626485: Support Unicode normalization.

											
										
										
											2002-11-23 18:08:15 -04:00
+								void
 								get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
 								{
-												Fix SF bug #694816, remove comparison of unsigned value < 0

											
										
										
											2003-02-27 23:14:37 -04:00
+								    if (code >= 0x110000) {
-												Patch #626485: Support Unicode normalization.

											
										
										
											2002-11-23 18:08:15 -04:00
+								        *index = 0;
 								    }
 								    else {
 								        *index = decomp_index1[(code>>DECOMP_SHIFT)];
 								        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
 								                               (code&((1<<DECOMP_SHIFT)-1))];
 								    }
 								    /* high byte is number of hex bytes (usually one or two), low byte
 								       is prefix code (from*/
 								    *count = decomp_data[*index] >> 8;
 								    *prefix = decomp_data[*index] & 255;
 								    (*index)++;
 								}
 								/* Constants for the arithmetic Hangul syllable handling in
 								   nfd_nfkd() and nfc_nfkc().  A code point in
 								   [SBase, SBase+SCount) is split into L, V and T parts that are
 								   offsets from LBase, VBase and TBase; a T part equal to TBase
 								   means "no third character". */
 								#define SBase   0xAC00
 								#define LBase   0x1100
 								#define VBase   0x1161
 								#define TBase   0x11A7
 								/* Number of distinct L, V and T indices. */
 								#define LCount  19
 								#define VCount  21
 								#define TCount  28
 								/* NCount: syllables per L value; SCount: size of the whole range. */
 								#define NCount  (VCount*TCount)
 								#define SCount  (LCount*NCount)
 								/* Return a new unicode object holding the decomposition of INPUT:
 								   canonical only when k == 0 (NFD), canonical and compatibility
 								   when k != 0 (NFKD).  Combining characters are then reordered so
 								   that adjacent non-zero combining classes are non-decreasing.
 								   Returns NULL if creating or growing the result fails. */
 								static PyObject*
 								nfd_nfkd(PyObject *input, int k)
 								{
 								    PyObject *result;
 								    Py_UNICODE *i, *end, *o;
 								    /* Longest decomposition in Unicode 3.2: U+FDFA */
 								    Py_UNICODE stack[20];
 								    int space, stackptr, isize;
 								    int index, prefix, count;
 								    unsigned char prev, cur;
 								
 								    stackptr = 0;
 								    isize = PyUnicode_GET_SIZE(input);
 								    /* Overallocate at most 10 characters. */
 								    space = (isize > 10 ? 10 : isize) + isize;
 								    result = PyUnicode_FromUnicode(NULL, space);
 								    if (!result)
 								        return NULL;
 								    i = PyUnicode_AS_UNICODE(input);
 								    end = i + isize;
 								    o = PyUnicode_AS_UNICODE(result);
 								
 								    while (i < end) {
 								        stack[stackptr++] = *i++;
 								        while(stackptr) {
 								            Py_UNICODE code = stack[--stackptr];
 								            /* Hangul Decomposition adds three characters in
 								               a single step, so we need at least that much room. */
 								            if (space < 3) {
 								                /* result is a unicode object, so read its length
 								                   with the unicode accessor (not PyString_GET_SIZE). */
 								                int newsize = PyUnicode_GET_SIZE(result) + 10;
 								                space += 10;
 								                if (PyUnicode_Resize(&result, newsize) == -1)
 								                    return NULL;
 								                /* The buffer may have moved: recompute the write
 								                   position from the new start. */
 								                o = PyUnicode_AS_UNICODE(result) + newsize - space;
 								            }
 								            /* Hangul Decomposition. */
 								            if (SBase <= code && code < (SBase+SCount)) {
 								                int SIndex = code - SBase;
 								                int L = LBase + SIndex / NCount;
 								                int V = VBase + (SIndex % NCount) / TCount;
 								                int T = TBase + SIndex % TCount;
 								                *o++ = L;
 								                *o++ = V;
 								                space -= 2;
 								                if (T != TBase) {
 								                    *o++ = T;
 								                    space --;
 								                }
 								                continue;
 								            }
 								            /* Other decompositions. */
 								            get_decomp_record(code, &index, &prefix, &count);
 								
 								            /* Copy character if it is not decomposable, or has a
 								               compatibility decomposition, but we do NFD. */
 								            if (!count || (prefix && !k)) {
 								                *o++ = code;
 								                space--;
 								                continue;
 								            }
 								            /* Copy decomposition onto the stack, in reverse
 								               order.  */
 								            while(count) {
 								                code = decomp_data[index + (--count)];
 								                stack[stackptr++] = code;
 								            }
 								        }
 								    }
 								
 								    /* Drop overallocation. Cannot fail. */
 								    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
 								
 								    /* Sort canonically. */
 								    i = PyUnicode_AS_UNICODE(result);
 								    prev = _getrecord_ex(*i)->combining;
 								    end = i + PyUnicode_GET_SIZE(result);
 								    for (i++; i < end; i++) {
 								        cur = _getrecord_ex(*i)->combining;
 								        if (prev == 0 || cur == 0 || prev <= cur) {
 								            prev = cur;
 								            continue;
 								        }
 								        /* Non-canonical order. Need to switch *i with previous. */
 								        o = i - 1;
 								        while (1) {
 								            /* Swap o[0] and o[1], then keep moving the character
 								               towards the front while it is still out of order. */
 								            Py_UNICODE tmp = o[1];
 								            o[1] = o[0];
 								            o[0] = tmp;
 								            o--;
 								            if (o < PyUnicode_AS_UNICODE(result))
 								                break;
 								            prev = _getrecord_ex(*o)->combining;
 								            if (prev == 0 || prev <= cur)
 								                break;
 								        }
 								        prev = _getrecord_ex(*i)->combining;
 								    }
 								    return result;
 								}
 								/* Scan the table NFC, which is terminated by an entry whose start
 								   is 0, for the range containing CODE.  Returns the entry's index
 								   plus CODE's offset from the range start, or -1 if CODE lies
 								   before the next range start or no range contains it. */
 								static int
 								find_nfc_index(struct reindex* nfc, Py_UNICODE code)
 								{
 								    struct reindex *entry;
 								
 								    for (entry = nfc; entry->start; entry++) {
 								        int first = entry->start;
 								        /* CODE is below this range, so no range matches: stop. */
 								        if (code < first)
 								            return -1;
 								        if (code <= first + entry->count) {
 								            int offset = code - first;
 								            return entry->index + offset;
 								        }
 								    }
 								    return -1;
 								}
 								/* Return a new unicode object holding the composed form of INPUT:
 								   NFC when k == 0, NFKC when k != 0.  The input is first decomposed
 								   with nfd_nfkd(input, k) and the result is then composed in place.
 								   Returns NULL if the decomposition step fails. */
 								static PyObject*
 								nfc_nfkc(PyObject *input, int k)
 								{
 								    PyObject *result;
 								    Py_UNICODE *i, *i1, *o, *end;
 								    int f,l,index,index1,comb;
 								    Py_UNICODE code;
 								    /* Positions already merged into an earlier character; they are
 								       dropped when the read pointer reaches them. */
 								    Py_UNICODE *skipped[20];
 								    int cskipped = 0;
 								
 								    result = nfd_nfkd(input, k);
 								    if (!result)
 								        return NULL;
 								
 								    /* We are going to modify result in-place.
 								       If nfd_nfkd is changed to sometimes return the input,
 								       this code needs to be reviewed. */
 								    assert(result != input);
 								
 								    i = PyUnicode_AS_UNICODE(result);
 								    end = i + PyUnicode_GET_SIZE(result);
 								    o = PyUnicode_AS_UNICODE(result);
 								
 								  again:
 								    while (i < end) {
 								        for (index = 0; index < cskipped; index++) {
 								            if (skipped[index] == i) {
 								                /* *i character is skipped.
 								                   Remove from list. */
 								                skipped[index] = skipped[cskipped-1];
 								                cskipped--;
 								                i++;
 								                goto again; /* continue while */
 								            }
 								        }
 								        /* Hangul Composition. We don't need to check for <LV,T>
 								           pairs, since we always have decomposed data.
 								           The V index must lie in [0, VCount): VBase+VCount itself
 								           is outside the range the formula below is defined for. */
 								        if (LBase <= *i && *i < (LBase+LCount) &&
 								            i + 1 < end &&
 								            VBase <= i[1] && i[1] < (VBase+VCount)) {
 								            int LIndex, VIndex;
 								            LIndex = i[0] - LBase;
 								            VIndex = i[1] - VBase;
 								            code = SBase + (LIndex*VCount+VIndex)*TCount;
 								            i+=2;
 								            /* The T index must lie in (0, TCount): TBase itself means
 								               "no trailing part" and must not be consumed, and
 								               TBase+TCount would spill into the next syllable block. */
 								            if (i < end &&
 								                TBase < *i && *i < (TBase+TCount)) {
 								                code += *i-TBase;
 								                i++;
 								            }
 								            *o++ = code;
 								            continue;
 								        }
 								
 								        f = find_nfc_index(nfc_first, *i);
 								        if (f == -1) {
 								            *o++ = *i++;
 								            continue;
 								        }
 								        /* Find next unblocked character. */
 								        i1 = i+1;
 								        comb = 0;
 								        while (i1 < end) {
 								            int comb1 = _getrecord_ex(*i1)->combining;
 								            if (comb1 && comb == comb1) {
 								                /* Character is blocked. */
 								                i1++;
 								                continue;
 								            }
 								            l = find_nfc_index(nfc_last, *i1);
 								            /* *i1 cannot be combined with *i. If *i1
 								               is a starter, we don't need to look further.
 								               Otherwise, record the combining class. */
 								            if (l == -1) {
 								              not_combinable:
 								                if (comb1 == 0)
 								                    break;
 								                comb = comb1;
 								                i1++;
 								                continue;
 								            }
 								            index = f*TOTAL_LAST + l;
 								            index1 = comp_index[index >> COMP_SHIFT];
 								            code = comp_data[(index1<<COMP_SHIFT)+
 								                             (index&((1<<COMP_SHIFT)-1))];
 								            if (code == 0)
 								                goto not_combinable;
 								
 								            /* Replace the original character. */
 								            *i = code;
 								            /* Mark the second character unused. */
 								            assert((size_t)cskipped < sizeof(skipped)/sizeof(skipped[0]));
 								            skipped[cskipped++] = i1;
 								            i1++;
 								            f = find_nfc_index(nfc_first, *i);
 								            if (f == -1)
 								                break;
 								        }
 								        *o++ = *i++;
 								    }
 								    /* Composition can only shorten the text; trim to what was written. */
 								    if (o != end)
 								        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
 								    return result;
 								}
 								/* unicodedata.normalize(form, unistr)
 								
 								   FORM is one of "NFC", "NFKC", "NFD" or "NFKD"; any other value
 								   raises ValueError.  An empty input string is returned unchanged
 								   (new reference) before FORM is examined. */
 								static PyObject*
 								unicodedata_normalize(PyObject *self, PyObject *args)
 								{
 								    char *form;
 								    PyObject *input;
 								
 								    /* The name after ':' is what argument errors report, so it must
 								       be the function's real name. */
 								    if(!PyArg_ParseTuple(args, "sO!:normalize",
 								                         &form, &PyUnicode_Type, &input))
 								        return NULL;
 								
 								    if (PyUnicode_GetSize(input) == 0) {
 								        /* Special case empty input strings, since resizing
 								           them later would cause internal errors. */
 								        Py_INCREF(input);
 								        return input;
 								    }
 								
 								    if (strcmp(form, "NFC") == 0)
 								        return nfc_nfkc(input, 0);
 								    if (strcmp(form, "NFKC") == 0)
 								        return nfc_nfkc(input, 1);
 								    if (strcmp(form, "NFD") == 0)
 								        return nfd_nfkd(input, 0);
 								    if (strcmp(form, "NFKD") == 0)
 								        return nfd_nfkd(input, 1);
 								    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 								    return NULL;
 								}
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								/* -------------------------------------------------------------------- */
 								/* unicode character name tables */
 								/* data file generated by Tools/unicode/makeunicodedata.py */
 								#include "unicodename_db.h"
 								/* -------------------------------------------------------------------- */
 								/* database code (cut and pasted from the unidb package) */
 								/* Case-insensitive hash of the first LEN bytes of S.
 								
 								   Each byte is upper-cased and folded in as h = h * SCALE + byte;
 								   whenever bits 24..31 become non-zero they are XOR-ed into the low
 								   byte and the value is cut back to 24 bits, so the result is
 								   always below 0x1000000. */
 								static unsigned long
 								_gethash(const char *s, int len, int scale)
 								{
 								    int i;
 								    unsigned long h = 0;
 								    unsigned long ix;
 								    for (i = 0; i < len; i++) {
 								        /* toupper() is only defined for EOF and unsigned char
 								           values, so convert before the call: plain char may be
 								           signed and negative for bytes >= 0x80. */
 								        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
 								        ix = h & 0xff000000;
 								        if (ix)
 								            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 								    }
 								    return h;
 								}
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+								static char *hangul_syllables[][3] = {
 								    /* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
 								       Each row holds three strings, selected by column:
 								         column 0 is indexed by L, column 1 by V, column 2 by T
 								       (see _getucname and _getcode).  The columns have different
 								       lengths; a 0 entry marks a slot past the end of that column and
 								       must not be dereferenced.  An empty string "" is a real entry
 								       that contributes nothing to the name. */
 								    { "G",  "A",   ""   },
 								    { "GG", "AE",  "G"  },
 								    { "N",  "YA",  "GG" },
 								    { "D",  "YAE", "GS" },
 								    { "DD", "EO",  "N", },
 								    { "R",  "E",   "NJ" },
 								    { "M",  "YEO", "NH" },
 								    { "B",  "YE",  "D"  },
 								    { "BB", "O",   "L"  },
 								    { "S",  "WA",  "LG" },
 								    { "SS", "WAE", "LM" },
 								    { "",   "OE",  "LB" },
 								    { "J",  "YO",  "LS" },
 								    { "JJ", "U",   "LT" },
 								    { "C",  "WEO", "LP" },
 								    { "K",  "WE",  "LH" },
 								    { "T",  "WI",  "M"  },
 								    { "P",  "YU",  "B"  },
 								    { "H",  "EU",  "BS" },
 								    { 0,    "YI",  "S"  },
 								    { 0,    "I",   "SS" },
 								    { 0,    0,     "NG" },
 								    { 0,    0,     "J"  },
 								    { 0,    0,     "C"  },
 								    { 0,    0,     "K"  },
 								    { 0,    0,     "T"  },
 								    { 0,    0,     "P"  },
 								    { 0,    0,     "H"  }
 								};
-												Verify that the code in CJK UNIFIED IDEOGRAPH- actually denotes an ideograph.

											
										
										
											2002-11-23 18:10:29 -04:00
+								static int
 								is_unified_ideograph(Py_UCS4 code)
 								{
 								    return (
 								        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
 								        (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
 								        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
 								}
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								static int
-												_Py prefix is verboten for static entry points

											
										
										
											2002-06-13 08:55:14 -03:00
+								_getucname(Py_UCS4 code, char* buffer, int buflen)
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								{
 								    int offset;
 								    int i;
 								    int word;
 								    unsigned char* w;
-												Fix off-by-one error.

											
										
										
											2002-11-23 13:11:06 -04:00
+								    if (SBase <= code && code < SBase+SCount) {
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+									/* Hangul syllable. */
 									int SIndex = code - SBase;
 									int L = SIndex / NCount;
 									int V = (SIndex % NCount) / TCount;
 									int T = SIndex % TCount;
 									if (buflen < 27)
 									    /* Worst case: HANGUL SYLLABLE <10chars>. */
 									    return 0;
 									strcpy(buffer, "HANGUL SYLLABLE ");
 									buffer += 16;
 									strcpy(buffer, hangul_syllables[L][0]);
 									buffer += strlen(hangul_syllables[L][0]);
 									strcpy(buffer, hangul_syllables[V][1]);
 									buffer += strlen(hangul_syllables[V][1]);
 									strcpy(buffer, hangul_syllables[T][2]);
 									buffer += strlen(hangul_syllables[T][2]);
 									*buffer = '\0';
 									return 1;
 								    }
-												Verify that the code in CJK UNIFIED IDEOGRAPH- actually denotes an ideograph.

											
										
										
											2002-11-23 18:10:29 -04:00
+								    if (is_unified_ideograph(code)) {
-												Implement names for CJK unified ideographs. Add name to KeyError output.
Verify that the lookup for an existing name succeeds.

											
										
										
											2002-11-23 14:01:32 -04:00
+								        if (buflen < 28)
 								            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
 								            return 0;
 								        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
 								        return 1;
 								    }
-												Update to Unicode 3.2 database.

											
										
										
											2002-10-18 13:11:54 -03:00
+								    if (code >= 0x110000)
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								        return 0;
 								    /* get offset into phrasebook */
 								    offset = phrasebook_offset1[(code>>phrasebook_shift)];
 								    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 								                               (code&((1<<phrasebook_shift)-1))];
 								    if (!offset)
 								        return 0;
 								    i = 0;
 								    for (;;) {
 								        /* get word index */
 								        word = phrasebook[offset] - phrasebook_short;
 								        if (word >= 0) {
 								            word = (word << 8) + phrasebook[offset+1];
 								            offset += 2;
 								        } else
 								            word = phrasebook[offset++];
 								        if (i) {
 								            if (i > buflen)
 								                return 0; /* buffer overflow */
 								            buffer[i++] = ' ';
 								        }
 								        /* copy word string from lexicon.  the last character in the
 								           word has bit 7 set.  the last word in a string ends with
 x80 */
 								        w = lexicon + lexicon_offset[word];
 								        while (*w < 128) {
 								            if (i >= buflen)
 								                return 0; /* buffer overflow */
 								            buffer[i++] = *w++;
 								        }
 								        if (i >= buflen)
 								            return 0; /* buffer overflow */
 								        buffer[i++] = *w & 127;
 								        if (*w == 128)
 								            break; /* end of word */
 								    }
 								    return 1;
 								}
 								static int
-												renamed internal functions to avoid name clashes under OpenVMS
(fixes bug #132815)

											
										
										
											2001-02-18 18:06:17 -04:00
+								_cmpname(int code, const char* name, int namelen)
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								{
 								    /* check if code corresponds to the given name */
 								    int i;
 								    char buffer[NAME_MAXLEN];
-												_Py prefix is verboten for static entry points

											
										
										
											2002-06-13 08:55:14 -03:00
+								    if (!_getucname(code, buffer, sizeof(buffer)))
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								        return 0;
 								    for (i = 0; i < namelen; i++) {
 								        if (toupper(name[i]) != buffer[i])
 								            return 0;
 								    }
 								    return buffer[namelen] == '\0';
 								}
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+								static void
 								find_syllable(const char *str, int *len, int *pos, int count, int column)
 								{
 								    int i, len1;
 								    *len = -1;
 								    for (i = 0; i < count; i++) {
 									char *s = hangul_syllables[i][column];
 									len1 = strlen(s);
 									if (len1 <= *len)
 									    continue;
 									if (strncmp(str, s, len1) == 0) {
 									    *len = len1;
 									    *pos = i;
 									}
 								    }
 								    if (*len == -1) {
 									*len = 0;
 									*pos = -1;
 								    }
 								}
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								static int
-												renamed internal functions to avoid name clashes under OpenVMS
(fixes bug #132815)

											
										
										
											2001-02-18 18:06:17 -04:00
+								_getcode(const char* name, int namelen, Py_UCS4* code)
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								{
 								    unsigned int h, v;
 								    unsigned int mask = code_size-1;
 								    unsigned int i, incr;
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+								    /* Check for hangul syllables. */
 								    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
 									int L, V, T, len;
 									const char *pos = name + 16;
 									find_syllable(pos, &len, &L, LCount, 0);
 									pos += len;
 									find_syllable(pos, &len, &V, VCount, 1);
 									pos += len;
 									find_syllable(pos, &len, &T, TCount, 2);
 									pos += len;
 									if (V != -1 && V != -1 && T != -1 && pos-name == namelen) {
 									    *code = SBase + (L*VCount+V)*TCount + T;
 									    return 1;
 									}
-												Implement names for CJK unified ideographs. Add name to KeyError output.
Verify that the lookup for an existing name succeeds.

											
										
										
											2002-11-23 14:01:32 -04:00
+								        /* Otherwise, it's an illegal syllable name. */
 								        return 0;
 								    }
 								    /* Check for unified ideographs. */
 								    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
 								        /* Four or five hexdigits must follow. */
 								        v = 0;
 								        name += 22;
 								        namelen -= 22;
 								        if (namelen != 4 && namelen != 5)
 								            return 0;
 								        while (namelen--) {
 								            v *= 16;
 								            if (*name >= '0' && *name <= '9')
 								                v += *name - '0';
 								            else if (*name >= 'A' && *name <= 'F')
 								                v += *name - 'A' + 10;
 								            else
 								                return 0;
 								            name++;
 								        }
-												Verify that the code in CJK UNIFIED IDEOGRAPH- actually denotes an ideograph.

											
										
										
											2002-11-23 18:10:29 -04:00
+								        if (!is_unified_ideograph(v))
 								            return 0;
-												Implement names for CJK unified ideographs. Add name to KeyError output.
Verify that the lookup for an existing name succeeds.

											
										
										
											2002-11-23 14:01:32 -04:00
+								        *code = v;
 								        return 1;
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+								    }
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								    /* the following is the same as python's dictionary lookup, with
 								       only minor changes.  see the makeunicodedata script for more
 								       details */
-												renamed internal functions to avoid name clashes under OpenVMS
(fixes bug #132815)

											
										
										
											2001-02-18 18:06:17 -04:00
+								    h = (unsigned int) _gethash(name, namelen, code_magic);
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								    i = (~h) & mask;
 								    v = code_hash[i];
 								    if (!v)
 								        return 0;
-												renamed internal functions to avoid name clashes under OpenVMS
(fixes bug #132815)

											
										
										
											2001-02-18 18:06:17 -04:00
+								    if (_cmpname(v, name, namelen)) {
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								        *code = v;
 								        return 1;
 								    }
 								    incr = (h ^ (h >> 3)) & mask;
 								    if (!incr)
 								        incr = mask;
 								    for (;;) {
 								        i = (i + incr) & mask;
 								        v = code_hash[i];
 								        if (!v)
-												stupid typo (for some reason, this only caused problems on OpenVMS).

											
										
										
											2001-02-18 07:41:49 -04:00
+								            return 0;
-												renamed internal functions to avoid name clashes under OpenVMS
(fixes bug #132815)

											
										
										
											2001-02-18 18:06:17 -04:00
+								        if (_cmpname(v, name, namelen)) {
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								            *code = v;
 								            return 1;
 								        }
 								        incr = incr << 1;
 								        if (incr > mask)
 								            incr = incr ^ code_poly;
 								    }
 								}
 								static const _PyUnicode_Name_CAPI hashAPI =
 								{
 								    sizeof(_PyUnicode_Name_CAPI),
-												_Py prefix is verboten for static entry points

											
										
										
											2002-06-13 08:55:14 -03:00
+								    _getucname,
-												renamed internal functions to avoid name clashes under OpenVMS
(fixes bug #132815)

											
										
										
											2001-02-18 18:06:17 -04:00
+								    _getcode
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								};
 								/* -------------------------------------------------------------------- */
 								/* Python bindings */
 								/* unicodedata.name(unichr[, default])
 								
 								   Returns the name of the single-character unicode string `unichr` as
 								   a string.  If the character has no name, returns `default` when it
 								   was supplied and raises ValueError otherwise.  Raises TypeError when
 								   the argument is not exactly one character long. */
 								static PyObject *
 								unicodedata_name(PyObject* self, PyObject* args)
 								{
 								    char name[NAME_MAXLEN];
 								    PyUnicodeObject* v;
 								    PyObject* defobj = NULL;
 								    int found;
 								
 								    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
 								        return NULL;
 								
 								    if (PyUnicode_GET_SIZE(v) != 1) {
 								        PyErr_SetString(PyExc_TypeError,
 								                        "need a single Unicode character as parameter");
 								        return NULL;
 								    }
 								
 								    found = _getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
 								                       name, sizeof(name));
 								    if (found)
 								        return Py_BuildValue("s", name);
 								
 								    /* No name: fall back to the caller's default, if any. */
 								    if (defobj != NULL) {
 								        Py_INCREF(defobj);
 								        return defobj;
 								    }
 								
 								    PyErr_SetString(PyExc_ValueError, "no such name");
 								    return NULL;
 								}
 								/* unicodedata.lookup(name)
 								
 								   Returns the one-character unicode string whose character name is
 								   `name`.  Raises KeyError with the message
 								   "undefined character name '<name>'" if the name is not known. */
 								static PyObject *
 								unicodedata_lookup(PyObject* self, PyObject* args)
 								{
 								    Py_UCS4 code;
 								    Py_UNICODE str[1];
 								    char* name;
 								    int namelen;
 								
 								    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
 								        return NULL;
 								
 								    if (!_getcode(name, namelen, &code)) {
 								        /* PyErr_Format builds the message itself, so there is no
 								           temporary buffer to allocate, check for NULL, or free. */
 								        PyErr_Format(PyExc_KeyError,
 								                     "undefined character name '%s'", name);
 								        return NULL;
 								    }
 								
 								    /* NOTE(review): this cast presumably truncates code points above
 								       0xFFFF when Py_UNICODE is 16 bits wide -- confirm for narrow
 								       builds. */
 								    str[0] = (Py_UNICODE) code;
 								    return PyUnicode_FromUnicode(str, 1);
 								}
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								/* XXX Add doc strings. */
 								static PyMethodDef unicodedata_functions[] = {
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								    {"decimal", unicodedata_decimal, METH_VARARGS},
 								    {"digit", unicodedata_digit, METH_VARARGS},
 								    {"numeric", unicodedata_numeric, METH_VARARGS},
 								    {"category", unicodedata_category, METH_VARARGS},
 								    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
 								    {"combining", unicodedata_combining, METH_VARARGS},
 								    {"mirrored", unicodedata_mirrored, METH_VARARGS},
 								    {"decomposition",unicodedata_decomposition, METH_VARARGS},
 								    {"name", unicodedata_name, METH_VARARGS},
 								    {"lookup", unicodedata_lookup, METH_VARARGS},
-												Patch #626485: Support Unicode normalization.

											
										
										
											2002-11-23 18:08:15 -04:00
+								    {"normalize", unicodedata_normalize, METH_VARARGS},
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								    {NULL, NULL}		/* sentinel */
 								};
-												Patch #568124: Add doc string macros.

											
										
										
											2002-06-13 17:33:02 -03:00
+								PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
-												Replace DL_IMPORT with PyMODINIT_FUNC and remove "/export:init..." link
command line for Windows builds.  This should allow MSVC to import and
build the Python MSVC6 project files without error.

											
										
										
											2002-07-23 03:31:15 -03:00
+								PyMODINIT_FUNC
-												Bunch of minor ANSIfications: 'void initfunc()' -> 'void initfunc(void)',
and a couple of functions that were missed in the previous batches. Not
terribly tested, but very carefully scrutinized, three times.

All these were found by the little findkrc.py that I posted to python-dev,
which means there might be more lurking. Cases such as this:

long
func(a, b)
	long a;
	long b; /* flagword */
{

and other cases where the last ; in the argument list isn't followed by a
newline and an opening curly bracket. Regexps to catch all are welcome, of
course ;)

											
										
										
											2000-07-21 03:00:07 -03:00
+								initunicodedata(void)
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								{
-												Remove direct manipulation of the module dict.

											
										
										
											2002-04-03 17:39:26 -04:00
+								    PyObject *m, *v;
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
-												Be a bit more strict in setting up the export of the C API for this
module; do not attempt to insert the API object into the module dict
if there was an error creating it.

											
										
										
											2001-03-03 15:41:55 -04:00
+								    m = Py_InitModule3(
 								        "unicodedata", unicodedata_functions, unicodedata_docstring);
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								    if (!m)
 								        return;
-												Add unidata_version. Bump generator version number.

											
										
										
											2002-11-25 05:13:37 -04:00
+								    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
-												Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.

											
										
										
											2001-01-24 03:59:11 -04:00
+								    /* Export C API */
 								    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
-												Remove direct manipulation of the module dict.

											
										
										
											2002-04-03 17:39:26 -04:00
+								    if (v != NULL)
 								        PyModule_AddObject(m, "ucnhash_CAPI", v);
-												Module unicodedata -- Provides access to the Unicode 3.0 data base.
Written by Marc-Andre Lemburg.

											
										
										
											2000-03-10 19:10:21 -04:00
+								}
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
 								/*
 								Local variables:
 								c-basic-offset: 4
-												Patch #626485: Support Unicode normalization.

											
										
										
											2002-11-23 18:08:15 -04:00
+								indent-tabs-mode: nil
-												Patch #626548: Support Hangul syllable names.

											
										
										
											2002-11-23 08:22:32 -04:00
+								End:
 								*/