/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.1 database.

   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;

typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
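
/* How the two-level lookup above works: the code point is split by SHIFT
   (defined in the generated unicodedata_db.h) into a block number and an
   offset within that block.  index1[] maps the block number to a block of
   index2[] entries, and index2[] maps the offset to a slot in
   _PyUnicode_Database_Records, so identical blocks of records need to be
   stored only once.  A rough sketch of the same idea (the table names are
   the real ones above, the intermediate variables are just illustrative):

       block  = index1[code >> SHIFT];              /* which block          */
       record = index2[(block << SHIFT) +
                       (code & ((1 << SHIFT) - 1))];/* offset in that block */
       props  = &_PyUnicode_Database_Records[record];
*/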

/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;

static PyObject*
new_previous_version(const char *name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}

static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
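
/* Worked example of the narrow-build branch above: U+10400 is stored as the
   surrogate pair D801 DC00, and the expression reverses the UTF-16 encoding:

       ((0xD801 & 0x3FF) << 10) | (0xDC00 & 0x3FF)  ==  (0x001 << 10) | 0x000
                                                    ==  0x400
       0x400 + 0x10000                              ==  0x10400
*/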

/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
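
/* Python-level behaviour of the three lookups above, for illustration
   (the values come from the Unicode character database):

       >>> import unicodedata
       >>> unicodedata.decimal(u'7')       # DIGIT SEVEN
       7
       >>> unicodedata.digit(u'\xb2')      # SUPERSCRIPT TWO: digit, not decimal
       2
       >>> unicodedata.numeric(u'\xbd')    # VULGAR FRACTION ONE HALF
       0.5
*/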

PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}

PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}

PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            index = old->mirrored_changed;
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the East Asian width assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex values (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}

static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    }
    else if (self && get_old_record(self, code)->category_changed == 0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex values (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}
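
/* Layout of a decomp_data record as read above: the header word at *index
   packs (count << 8) | prefix, and the `count` words that follow are the
   code points of the decomposition.  For example, U+00C5 LATIN CAPITAL
   LETTER A WITH RING ABOVE decomposes canonically to <0041 030A>, so its
   record is a header with count == 2 and the empty prefix (canonical
   decompositions carry no <tag>), followed by 0x0041 and 0x030A; the exact
   prefix numbering comes from the generated decomp_prefix table. */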

#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
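
/* Hangul syllables are decomposed and composed arithmetically rather than
   through the tables.  A syllable S in [SBase, SBase+SCount) splits into a
   leading consonant L, a vowel V and an optional trailing consonant T:

       SIndex = S - SBase
       L = LBase + SIndex / NCount             (NCount = VCount*TCount = 588)
       V = VBase + (SIndex % NCount) / TCount
       T = TBase + SIndex % TCount             (T == TBase means no trailing)

   Worked example: U+AC01 HANGUL SYLLABLE GAG has SIndex = 1, giving
   L = U+1100, V = U+1161 and T = U+11A7 + 1 = U+11A8; the same L/V/T
   indices select the "G", "A", "G" strings from hangul_syllables[] when
   the name is built in _getucname(). */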

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while (stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space--;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while (count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
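
/* The final loop above applies the Canonical Ordering Algorithm: runs of
   combining marks (non-zero combining class) are sorted by combining class
   with insertion-style swaps, while starters (class 0) act as barriers.
   For instance, the sequence
       U+0065 (e, class 0), U+0301 (COMBINING ACUTE ACCENT, class 230),
       U+0323 (COMBINING DOT BELOW, class 220)
   is reordered so that the class-220 mark precedes the class-230 mark,
   which is the order required for a canonical normal form. */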

static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f, l, index, index1, comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] < (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i += 2;
            if (i < end &&
                TBase < *i && *i < (TBase+TCount)) {
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb1 && comb == comb1) {
                /* Character is blocked. */
                i1++;
                continue;
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}

PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if (!PyArg_ParseTuple(args, "sO!:normalize",
                          &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
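
/* Python-level behaviour of normalize(), for illustration: composing an 'e'
   followed by U+0301 COMBINING ACUTE ACCENT yields the precomposed letter,
   and the compatibility forms additionally expand compatibility mappings:

       >>> unicodedata.normalize('NFC', u'e\u0301') == u'\xe9'
       True
       >>> unicodedata.normalize('NFKD', u'\u2460')    # CIRCLED DIGIT ONE
       u'1'
*/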

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) ||  /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) ||  /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6)); /* CJK Ideograph Extension B */
}

static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hex digits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}

PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");

static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
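
/* lookup() reverses name() for named characters, for illustration:

       >>> unicodedata.name(u'/')
       'SOLIDUS'
       >>> unicodedata.lookup('SOLIDUS')
       u'/'
       >>> unicodedata.lookup('HANGUL SYLLABLE GAG')   # computed, not stored
       u'\uac01'
*/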

/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};

static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.1.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.1.0 (see\n\
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");

PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}
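
/* The ucd_3_2_0 object created above exposes the same methods as the module
   itself, but answers against the Unicode 3.2 tables (via the generated
   get_change_3_2_0 and normalization_3_2_0 delta tables), which is what the
   stringprep/IDNA codecs in the standard library rely on.  Illustrative use
   from Python:

       >>> unicodedata.unidata_version
       '5.1.0'
       >>> unicodedata.ucd_3_2_0.unidata_version
       '3.2.0'
*/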

/*
  Local variables:
  c-basic-offset: 4
  indent-tabs-mode: nil
  End:
*/