symbian-qemu-0.9.1-12/python-2.6.1/Modules/unicodedata.c
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 /* ------------------------------------------------------------------------
       
     2 
       
     3    unicodedata -- Provides access to the Unicode 5.1 data base.
       
     4 
       
     5    Data was extracted from the Unicode 5.1 UnicodeData.txt file.
       
     6 
       
     7    Written by Marc-Andre Lemburg (mal@lemburg.com).
       
     8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
       
     9    Modified by Martin v. Löwis (martin@v.loewis.de)
       
    10 
       
    11    Copyright (c) Corporation for National Research Initiatives.
       
    12 
       
    13    ------------------------------------------------------------------------ */
       
    14 
       
    15 #include "Python.h"
       
    16 #include "ucnhash.h"
       
    17 #include "structmember.h"
       
    18 
       
    19 /* character properties */
       
    20 
       
    /* Per-character property record.  Instances live in the generated
       table _PyUnicode_Database_Records and are located via
       _getrecord_ex(). */
    typedef struct {
        /* index into _PyUnicode_CategoryNames */
        const unsigned char category;
        /* combining class value 0 - 255 */
        const unsigned char combining;
        /* index into _PyUnicode_BidirectionalNames */
        const unsigned char bidirectional;
        /* true if mirrored in bidir mode */
        const unsigned char mirrored;
        /* index into _PyUnicode_EastAsianWidthNames */
        const unsigned char east_asian_width;
    } _PyUnicode_DatabaseRecord;
       
    31 
       
    /* Differences of one code point between the current database and a
       previous database version.  In the byte-sized fields the value 0xFF
       is treated by the accessors in this file as "no change recorded";
       a category_changed value of 0 is treated as "unassigned in the old
       version".

       NOTE: the sequence of fields should be the same as in
       merge_old_version. */
    typedef struct change_record {
        const unsigned char bidir_changed;
        const unsigned char category_changed;
        const unsigned char decimal_changed;
        const unsigned char mirrored_changed;
        const int numeric_changed;
    } change_record;
       
    40 
       
    41 /* data file generated by Tools/unicode/makeunicodedata.py */
       
    42 #include "unicodedata_db.h"
       
    43 
       
    44 static const _PyUnicode_DatabaseRecord*
       
    45 _getrecord_ex(Py_UCS4 code)
       
    46 {
       
    47     int index;
       
    48     if (code >= 0x110000)
       
    49         index = 0;
       
    50     else {
       
    51         index = index1[(code>>SHIFT)];
       
    52         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
       
    53     }
       
    54 
       
    55     return &_PyUnicode_Database_Records[index];
       
    56 }
       
    57 
       
    58 /* ------------- Previous-version API ------------------------------------- */
       
    59 typedef struct previous_version {
       
    60     PyObject_HEAD
       
    61     const char *name;
       
    62     const change_record* (*getrecord)(Py_UCS4);
       
    63     Py_UCS4 (*normalization)(Py_UCS4);
       
    64 } PreviousDBVersion;
       
    65 
       
    66 #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
       
    67 
       
    68 static PyMemberDef DB_members[] = {
       
    69 	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
       
    70         {NULL}
       
    71 };
       
    72 
       
    73 /* forward declaration */
       
    74 static PyTypeObject UCD_Type;
       
    75 
       
    76 static PyObject*
       
    77 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
       
    78                      Py_UCS4 (*normalization)(Py_UCS4))
       
    79 {
       
    80 	PreviousDBVersion *self;
       
    81 	self = PyObject_New(PreviousDBVersion, &UCD_Type);
       
    82 	if (self == NULL)
       
    83 		return NULL;
       
    84 	self->name = name;
       
    85 	self->getrecord = getrecord;
       
    86         self->normalization = normalization;
       
    87 	return (PyObject*)self;
       
    88 }
       
    89 
       
    90 
       
    91 static Py_UCS4 getuchar(PyUnicodeObject *obj)
       
    92 {
       
    93     Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
       
    94 
       
    95     if (PyUnicode_GET_SIZE(obj) == 1)
       
    96 	return *v;
       
    97 #ifndef Py_UNICODE_WIDE
       
    98     else if ((PyUnicode_GET_SIZE(obj) == 2) &&
       
    99              (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
       
   100              (0xDC00 <= v[1] && v[1] <= 0xDFFF))
       
   101 	return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
       
   102 #endif
       
   103     PyErr_SetString(PyExc_TypeError,
       
   104                     "need a single Unicode character as parameter");
       
   105     return (Py_UCS4)-1;
       
   106 }
       
   107 
       
   108 /* --- Module API --------------------------------------------------------- */
       
   109 
       
   110 PyDoc_STRVAR(unicodedata_decimal__doc__,
       
   111 "decimal(unichr[, default])\n\
       
   112 \n\
       
   113 Returns the decimal value assigned to the Unicode character unichr\n\
       
   114 as integer. If no such value is defined, default is returned, or, if\n\
       
   115 not given, ValueError is raised.");
       
   116 
       
   117 static PyObject *
       
   118 unicodedata_decimal(PyObject *self, PyObject *args)
       
   119 {
       
   120     PyUnicodeObject *v;
       
   121     PyObject *defobj = NULL;
       
   122     int have_old = 0;
       
   123     long rc;
       
   124     Py_UCS4 c;
       
   125 
       
   126     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
       
   127         return NULL;
       
   128     c = getuchar(v);
       
   129     if (c == (Py_UCS4)-1)
       
   130         return NULL;
       
   131 
       
   132     if (self) {
       
   133         const change_record *old = get_old_record(self, c);
       
   134         if (old->category_changed == 0) {
       
   135             /* unassigned */
       
   136             have_old = 1;
       
   137             rc = -1;
       
   138         } 
       
   139         else if (old->decimal_changed != 0xFF) {
       
   140             have_old = 1;
       
   141             rc = old->decimal_changed;
       
   142         }
       
   143     }
       
   144 
       
   145     if (!have_old)
       
   146         rc = Py_UNICODE_TODECIMAL(c);
       
   147     if (rc < 0) {
       
   148 	if (defobj == NULL) {
       
   149 	    PyErr_SetString(PyExc_ValueError,
       
   150 			    "not a decimal");
       
   151             return NULL;
       
   152 	}
       
   153 	else {
       
   154 	    Py_INCREF(defobj);
       
   155 	    return defobj;
       
   156 	}
       
   157     }
       
   158     return PyInt_FromLong(rc);
       
   159 }
       
   160 
       
   161 PyDoc_STRVAR(unicodedata_digit__doc__,
       
   162 "digit(unichr[, default])\n\
       
   163 \n\
       
   164 Returns the digit value assigned to the Unicode character unichr as\n\
       
   165 integer. If no such value is defined, default is returned, or, if\n\
       
   166 not given, ValueError is raised.");
       
   167 
       
   168 static PyObject *
       
   169 unicodedata_digit(PyObject *self, PyObject *args)
       
   170 {
       
   171     PyUnicodeObject *v;
       
   172     PyObject *defobj = NULL;
       
   173     long rc;
       
   174     Py_UCS4 c;
       
   175 
       
   176     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
       
   177         return NULL;
       
   178     c = getuchar(v);
       
   179     if (c == (Py_UCS4)-1)
       
   180         return NULL;
       
   181     rc = Py_UNICODE_TODIGIT(c);
       
   182     if (rc < 0) {
       
   183 	if (defobj == NULL) {
       
   184 	    PyErr_SetString(PyExc_ValueError, "not a digit");
       
   185             return NULL;
       
   186 	}
       
   187 	else {
       
   188 	    Py_INCREF(defobj);
       
   189 	    return defobj;
       
   190 	}
       
   191     }
       
   192     return PyInt_FromLong(rc);
       
   193 }
       
   194 
       
   195 PyDoc_STRVAR(unicodedata_numeric__doc__,
       
   196 "numeric(unichr[, default])\n\
       
   197 \n\
       
   198 Returns the numeric value assigned to the Unicode character unichr\n\
       
   199 as float. If no such value is defined, default is returned, or, if\n\
       
   200 not given, ValueError is raised.");
       
   201 
       
   202 static PyObject *
       
   203 unicodedata_numeric(PyObject *self, PyObject *args)
       
   204 {
       
   205     PyUnicodeObject *v;
       
   206     PyObject *defobj = NULL;
       
   207     int have_old = 0;
       
   208     double rc;
       
   209     Py_UCS4 c;
       
   210 
       
   211     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
       
   212         return NULL;
       
   213     c = getuchar(v);
       
   214     if (c == (Py_UCS4)-1)
       
   215         return NULL;
       
   216 
       
   217     if (self) {
       
   218         const change_record *old = get_old_record(self, c);
       
   219         if (old->category_changed == 0) {
       
   220             /* unassigned */
       
   221             have_old = 1;
       
   222             rc = -1.0;
       
   223         } 
       
   224         else if (old->decimal_changed != 0xFF) {
       
   225             have_old = 1;
       
   226             rc = old->decimal_changed;
       
   227         }
       
   228     }
       
   229 
       
   230     if (!have_old)
       
   231         rc = Py_UNICODE_TONUMERIC(c);
       
   232     if (rc == -1.0) {
       
   233 	if (defobj == NULL) {
       
   234 	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
       
   235 	    return NULL;
       
   236 	}
       
   237 	else {
       
   238 	    Py_INCREF(defobj);
       
   239 	    return defobj;
       
   240 	}
       
   241     }
       
   242     return PyFloat_FromDouble(rc);
       
   243 }
       
   244 
       
   245 PyDoc_STRVAR(unicodedata_category__doc__,
       
   246 "category(unichr)\n\
       
   247 \n\
       
   248 Returns the general category assigned to the Unicode character\n\
       
   249 unichr as string.");
       
   250 
       
   251 static PyObject *
       
   252 unicodedata_category(PyObject *self, PyObject *args)
       
   253 {
       
   254     PyUnicodeObject *v;
       
   255     int index;
       
   256     Py_UCS4 c;
       
   257 
       
   258     if (!PyArg_ParseTuple(args, "O!:category",
       
   259 			  &PyUnicode_Type, &v))
       
   260 	return NULL;
       
   261     c = getuchar(v);
       
   262     if (c == (Py_UCS4)-1)
       
   263         return NULL;
       
   264     index = (int) _getrecord_ex(c)->category;
       
   265     if (self) {
       
   266         const change_record *old = get_old_record(self, c);
       
   267         if (old->category_changed != 0xFF)
       
   268             index = old->category_changed;
       
   269     }
       
   270     return PyString_FromString(_PyUnicode_CategoryNames[index]);
       
   271 }
       
   272 
       
   273 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
       
   274 "bidirectional(unichr)\n\
       
   275 \n\
       
   276 Returns the bidirectional category assigned to the Unicode character\n\
       
   277 unichr as string. If no such value is defined, an empty string is\n\
       
   278 returned.");
       
   279 
       
   280 static PyObject *
       
   281 unicodedata_bidirectional(PyObject *self, PyObject *args)
       
   282 {
       
   283     PyUnicodeObject *v;
       
   284     int index;
       
   285     Py_UCS4 c;
       
   286 
       
   287     if (!PyArg_ParseTuple(args, "O!:bidirectional",
       
   288 			  &PyUnicode_Type, &v))
       
   289 	return NULL;
       
   290     c = getuchar(v);
       
   291     if (c == (Py_UCS4)-1)
       
   292         return NULL;
       
   293     index = (int) _getrecord_ex(c)->bidirectional;
       
   294     if (self) {
       
   295         const change_record *old = get_old_record(self, c);
       
   296         if (old->category_changed == 0)
       
   297             index = 0; /* unassigned */
       
   298         else if (old->bidir_changed != 0xFF)
       
   299             index = old->bidir_changed;
       
   300     }
       
   301     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
       
   302 }
       
   303 
       
   304 PyDoc_STRVAR(unicodedata_combining__doc__,
       
   305 "combining(unichr)\n\
       
   306 \n\
       
   307 Returns the canonical combining class assigned to the Unicode\n\
       
   308 character unichr as integer. Returns 0 if no combining class is\n\
       
   309 defined.");
       
   310 
       
   311 static PyObject *
       
   312 unicodedata_combining(PyObject *self, PyObject *args)
       
   313 {
       
   314     PyUnicodeObject *v;
       
   315     int index;
       
   316     Py_UCS4 c;
       
   317 
       
   318     if (!PyArg_ParseTuple(args, "O!:combining",
       
   319 			  &PyUnicode_Type, &v))
       
   320 	return NULL;
       
   321     c = getuchar(v);
       
   322     if (c == (Py_UCS4)-1)
       
   323         return NULL;
       
   324     index = (int) _getrecord_ex(c)->combining;
       
   325     if (self) {
       
   326         const change_record *old = get_old_record(self, c);
       
   327         if (old->category_changed == 0)
       
   328             index = 0; /* unassigned */
       
   329     }
       
   330     return PyInt_FromLong(index);
       
   331 }
       
   332 
       
   333 PyDoc_STRVAR(unicodedata_mirrored__doc__,
       
   334 "mirrored(unichr)\n\
       
   335 \n\
       
   336 Returns the mirrored property assigned to the Unicode character\n\
       
   337 unichr as integer. Returns 1 if the character has been identified as\n\
       
   338 a \"mirrored\" character in bidirectional text, 0 otherwise.");
       
   339 
       
   340 static PyObject *
       
   341 unicodedata_mirrored(PyObject *self, PyObject *args)
       
   342 {
       
   343     PyUnicodeObject *v;
       
   344     int index;
       
   345     Py_UCS4 c;
       
   346 
       
   347     if (!PyArg_ParseTuple(args, "O!:mirrored",
       
   348 			  &PyUnicode_Type, &v))
       
   349 	return NULL;
       
   350     c = getuchar(v);
       
   351     if (c == (Py_UCS4)-1)
       
   352         return NULL;
       
   353     index = (int) _getrecord_ex(c)->mirrored;
       
   354     if (self) {
       
   355         const change_record *old = get_old_record(self, c);
       
   356         if (old->category_changed == 0)
       
   357             index = 0; /* unassigned */
       
   358         else if (old->mirrored_changed != 0xFF)
       
   359             index = old->mirrored_changed;
       
   360     }
       
   361     return PyInt_FromLong(index);
       
   362 }
       
   363 
       
   364 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
       
   365 "east_asian_width(unichr)\n\
       
   366 \n\
       
   367 Returns the east asian width assigned to the Unicode character\n\
       
   368 unichr as string.");
       
   369 
       
   370 static PyObject *
       
   371 unicodedata_east_asian_width(PyObject *self, PyObject *args)
       
   372 {
       
   373     PyUnicodeObject *v;
       
   374     int index;
       
   375     Py_UCS4 c;
       
   376 
       
   377     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
       
   378 			  &PyUnicode_Type, &v))
       
   379 	return NULL;
       
   380     c = getuchar(v);
       
   381     if (c == (Py_UCS4)-1)
       
   382         return NULL;
       
   383     index = (int) _getrecord_ex(c)->east_asian_width;
       
   384     if (self) {
       
   385         const change_record *old = get_old_record(self, c);
       
   386         if (old->category_changed == 0)
       
   387             index = 0; /* unassigned */
       
   388     }
       
   389     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
       
   390 }
       
   391 
       
   392 PyDoc_STRVAR(unicodedata_decomposition__doc__,
       
   393 "decomposition(unichr)\n\
       
   394 \n\
       
   395 Returns the character decomposition mapping assigned to the Unicode\n\
       
   396 character unichr as string. An empty string is returned in case no\n\
       
   397 such mapping is defined.");
       
   398 
       
   399 static PyObject *
       
   400 unicodedata_decomposition(PyObject *self, PyObject *args)
       
   401 {
       
   402     PyUnicodeObject *v;
       
   403     char decomp[256];
       
   404     int code, index, count, i;
       
   405     unsigned int prefix_index;
       
   406     Py_UCS4 c;
       
   407 
       
   408     if (!PyArg_ParseTuple(args, "O!:decomposition",
       
   409 			  &PyUnicode_Type, &v))
       
   410 	return NULL;
       
   411     c = getuchar(v);
       
   412     if (c == (Py_UCS4)-1)
       
   413         return NULL;
       
   414 
       
   415     code = (int)c;
       
   416 
       
   417     if (self) {
       
   418         const change_record *old = get_old_record(self, c);
       
   419         if (old->category_changed == 0)
       
   420             return PyString_FromString(""); /* unassigned */
       
   421     }
       
   422 
       
   423     if (code < 0 || code >= 0x110000)
       
   424         index = 0;
       
   425     else {
       
   426         index = decomp_index1[(code>>DECOMP_SHIFT)];
       
   427         index = decomp_index2[(index<<DECOMP_SHIFT)+
       
   428                              (code&((1<<DECOMP_SHIFT)-1))];
       
   429     }
       
   430 
       
   431     /* high byte is number of hex bytes (usually one or two), low byte
       
   432        is prefix code (from*/
       
   433     count = decomp_data[index] >> 8;
       
   434 
       
   435     /* XXX: could allocate the PyString up front instead
       
   436        (strlen(prefix) + 5 * count + 1 bytes) */
       
   437 
       
   438     /* Based on how index is calculated above and decomp_data is generated
       
   439        from Tools/unicode/makeunicodedata.py, it should not be possible
       
   440        to overflow decomp_prefix. */
       
   441     prefix_index = decomp_data[index] & 255;
       
   442     assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
       
   443 
       
   444     /* copy prefix */
       
   445     i = strlen(decomp_prefix[prefix_index]);
       
   446     memcpy(decomp, decomp_prefix[prefix_index], i);
       
   447 
       
   448     while (count-- > 0) {
       
   449         if (i)
       
   450             decomp[i++] = ' ';
       
   451         assert((size_t)i < sizeof(decomp));
       
   452         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
       
   453                       decomp_data[++index]);
       
   454         i += strlen(decomp + i);
       
   455     }
       
   456     
       
   457     decomp[i] = '\0';
       
   458 
       
   459     return PyString_FromString(decomp);
       
   460 }
       
   461 
       
   462 static void
       
   463 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
       
   464 {
       
   465     if (code >= 0x110000) {
       
   466         *index = 0;
       
   467     } else if (self && get_old_record(self, code)->category_changed==0) {
       
   468         /* unassigned in old version */
       
   469         *index = 0;
       
   470     }
       
   471     else {
       
   472         *index = decomp_index1[(code>>DECOMP_SHIFT)];
       
   473         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
       
   474                                (code&((1<<DECOMP_SHIFT)-1))];
       
   475     }
       
   476 	
       
   477     /* high byte is number of hex bytes (usually one or two), low byte
       
   478        is prefix code (from*/
       
   479     *count = decomp_data[*index] >> 8;
       
   480     *prefix = decomp_data[*index] & 255;
       
   481 
       
   482     (*index)++;
       
   483 }
       
   484 
       
   /* Constants for the arithmetic Hangul syllable decomposition and
      composition performed by the normalization code in this file. */
   #define SBase   0xAC00              /* start of the syllable range */
   #define LBase   0x1100              /* start of the L range */
   #define VBase   0x1161              /* start of the V range */
   #define TBase   0x11A7              /* base value for the T offset */
   #define LCount  19
   #define VCount  21
   #define TCount  28
   #define NCount  (VCount*TCount)     /* syllables per L value */
   #define SCount  (LCount*NCount)     /* size of the syllable range */
       
   494 
       
   495 static PyObject*
       
   496 nfd_nfkd(PyObject *self, PyObject *input, int k)
       
   497 {
       
   498     PyObject *result;
       
   499     Py_UNICODE *i, *end, *o;
       
   500     /* Longest decomposition in Unicode 3.2: U+FDFA */
       
   501     Py_UNICODE stack[20]; 
       
   502     Py_ssize_t space, isize;
       
   503     int index, prefix, count, stackptr;
       
   504     unsigned char prev, cur;
       
   505 	
       
   506     stackptr = 0;
       
   507     isize = PyUnicode_GET_SIZE(input);
       
   508     /* Overallocate atmost 10 characters. */
       
   509     space = (isize > 10 ? 10 : isize) + isize;
       
   510     result = PyUnicode_FromUnicode(NULL, space);
       
   511     if (!result)
       
   512         return NULL;
       
   513     i = PyUnicode_AS_UNICODE(input);
       
   514     end = i + isize;
       
   515     o = PyUnicode_AS_UNICODE(result);
       
   516 
       
   517     while (i < end) {
       
   518         stack[stackptr++] = *i++;
       
   519         while(stackptr) {
       
   520             Py_UNICODE code = stack[--stackptr];
       
   521             /* Hangul Decomposition adds three characters in
       
   522                a single step, so we need atleast that much room. */
       
   523             if (space < 3) {
       
   524                 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
       
   525                 space += 10;
       
   526                 if (PyUnicode_Resize(&result, newsize) == -1)
       
   527                     return NULL;
       
   528                 o = PyUnicode_AS_UNICODE(result) + newsize - space;
       
   529             }
       
   530             /* Hangul Decomposition. */
       
   531             if (SBase <= code && code < (SBase+SCount)) {
       
   532                 int SIndex = code - SBase;
       
   533                 int L = LBase + SIndex / NCount;
       
   534                 int V = VBase + (SIndex % NCount) / TCount;
       
   535                 int T = TBase + SIndex % TCount;
       
   536                 *o++ = L;
       
   537                 *o++ = V;
       
   538                 space -= 2;
       
   539                 if (T != TBase) {
       
   540                     *o++ = T;
       
   541                     space --;
       
   542                 }
       
   543                 continue;
       
   544             }
       
   545             /* normalization changes */
       
   546             if (self) {
       
   547                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
       
   548                 if (value != 0) {
       
   549                     stack[stackptr++] = value;
       
   550                     continue;
       
   551                 }
       
   552             }
       
   553 
       
   554             /* Other decompositions. */
       
   555             get_decomp_record(self, code, &index, &prefix, &count);
       
   556 
       
   557             /* Copy character if it is not decomposable, or has a
       
   558                compatibility decomposition, but we do NFD. */
       
   559             if (!count || (prefix && !k)) {
       
   560                 *o++ = code;
       
   561                 space--;
       
   562                 continue;
       
   563             }
       
   564             /* Copy decomposition onto the stack, in reverse
       
   565                order.  */
       
   566             while(count) {
       
   567                 code = decomp_data[index + (--count)];
       
   568                 stack[stackptr++] = code;
       
   569             }
       
   570         }
       
   571     }
       
   572 
       
   573     /* Drop overallocation. Cannot fail. */
       
   574     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
       
   575 
       
   576     /* Sort canonically. */
       
   577     i = PyUnicode_AS_UNICODE(result);
       
   578     prev = _getrecord_ex(*i)->combining;
       
   579     end = i + PyUnicode_GET_SIZE(result);
       
   580     for (i++; i < end; i++) {
       
   581         cur = _getrecord_ex(*i)->combining;
       
   582         if (prev == 0 || cur == 0 || prev <= cur) {
       
   583             prev = cur;
       
   584             continue;
       
   585         }
       
   586         /* Non-canonical order. Need to switch *i with previous. */
       
   587         o = i - 1;
       
   588         while (1) {
       
   589             Py_UNICODE tmp = o[1];
       
   590             o[1] = o[0];
       
   591             o[0] = tmp;
       
   592             o--;
       
   593             if (o < PyUnicode_AS_UNICODE(result))
       
   594                 break;
       
   595             prev = _getrecord_ex(*o)->combining;
       
   596             if (prev == 0 || prev <= cur)
       
   597                 break;
       
   598         }
       
   599         prev = _getrecord_ex(*i)->combining;
       
   600     }
       
   601     return result;
       
   602 }
       
   603 
       
   604 static int
       
   605 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
       
   606 {
       
   607     int index;
       
   608     for (index = 0; nfc[index].start; index++) {
       
   609         int start = nfc[index].start;
       
   610         if (code < start)
       
   611             return -1;
       
   612         if (code <= start + nfc[index].count) {
       
   613             int delta = code - start;
       
   614             return nfc[index].index + delta;
       
   615         }
       
   616     }
       
   617     return -1;
       
   618 }
       
   619 
       
   620 static PyObject*
       
   621 nfc_nfkc(PyObject *self, PyObject *input, int k)
       
   622 {
       
   623     PyObject *result;
       
   624     Py_UNICODE *i, *i1, *o, *end;
       
   625     int f,l,index,index1,comb;
       
   626     Py_UNICODE code;
       
   627     Py_UNICODE *skipped[20];
       
   628     int cskipped = 0;
       
   629 
       
   630     result = nfd_nfkd(self, input, k);
       
   631     if (!result)
       
   632         return NULL;
       
   633 
       
   634     /* We are going to modify result in-place.
       
   635        If nfd_nfkd is changed to sometimes return the input,
       
   636        this code needs to be reviewed. */
       
   637     assert(result != input);
       
   638 
       
   639     i = PyUnicode_AS_UNICODE(result);
       
   640     end = i + PyUnicode_GET_SIZE(result);
       
   641     o = PyUnicode_AS_UNICODE(result);
       
   642 	
       
   643   again:
       
   644     while (i < end) {
       
   645       for (index = 0; index < cskipped; index++) {
       
   646           if (skipped[index] == i) {
       
   647               /* *i character is skipped. 
       
   648                  Remove from list. */
       
   649               skipped[index] = skipped[cskipped-1];
       
   650               cskipped--;
       
   651               i++;
       
   652               goto again; /* continue while */
       
   653           }
       
   654       }
       
   655       /* Hangul Composition. We don't need to check for <LV,T>
       
   656          pairs, since we always have decomposed data. */
       
   657       if (LBase <= *i && *i < (LBase+LCount) &&
       
   658           i + 1 < end && 
       
   659           VBase <= i[1] && i[1] <= (VBase+VCount)) {
       
   660           int LIndex, VIndex;
       
   661           LIndex = i[0] - LBase;
       
   662           VIndex = i[1] - VBase;
       
   663           code = SBase + (LIndex*VCount+VIndex)*TCount;
       
   664           i+=2;
       
   665           if (i < end &&
       
   666               TBase <= *i && *i <= (TBase+TCount)) {
       
   667               code += *i-TBase;
       
   668               i++;
       
   669           }
       
   670           *o++ = code;
       
   671           continue;
       
   672       }
       
   673 
       
   674       f = find_nfc_index(self, nfc_first, *i);
       
   675       if (f == -1) {
       
   676           *o++ = *i++;
       
   677           continue;
       
   678       }
       
   679       /* Find next unblocked character. */
       
   680       i1 = i+1;
       
   681       comb = 0;
       
   682       while (i1 < end) {
       
   683           int comb1 = _getrecord_ex(*i1)->combining;
       
   684           if (comb1 && comb == comb1) {
       
   685               /* Character is blocked. */
       
   686               i1++;
       
   687               continue;
       
   688           }
       
   689           l = find_nfc_index(self, nfc_last, *i1);
       
   690           /* *i1 cannot be combined with *i. If *i1
       
   691              is a starter, we don't need to look further.
       
   692              Otherwise, record the combining class. */
       
   693           if (l == -1) {
       
   694             not_combinable:
       
   695               if (comb1 == 0)
       
   696                   break;
       
   697               comb = comb1;
       
   698               i1++;
       
   699               continue;
       
   700           }
       
   701           index = f*TOTAL_LAST + l;
       
   702           index1 = comp_index[index >> COMP_SHIFT];
       
   703           code = comp_data[(index1<<COMP_SHIFT)+
       
   704                            (index&((1<<COMP_SHIFT)-1))];
       
   705           if (code == 0)
       
   706               goto not_combinable;
       
   707 			
       
   708           /* Replace the original character. */
       
   709           *i = code;
       
   710           /* Mark the second character unused. */
       
   711           skipped[cskipped++] = i1;
       
   712           i1++;
       
   713           f = find_nfc_index(self, nfc_first, *i);
       
   714           if (f == -1)
       
   715               break;
       
   716       }
       
   717       *o++ = *i++;
       
   718     }
       
   719     if (o != end)
       
   720         PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
       
   721     return result;
       
   722 }
       
   723 		
       
   724 PyDoc_STRVAR(unicodedata_normalize__doc__,
       
   725 "normalize(form, unistr)\n\
       
   726 \n\
       
   727 Return the normal form 'form' for the Unicode string unistr.  Valid\n\
       
   728 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
       
   729 
       
   730 static PyObject*
       
   731 unicodedata_normalize(PyObject *self, PyObject *args)
       
   732 {
       
   733     char *form;
       
   734     PyObject *input;
       
   735 
       
   736     if(!PyArg_ParseTuple(args, "sO!:normalize",
       
   737                          &form, &PyUnicode_Type, &input))
       
   738         return NULL;
       
   739 
       
   740     if (PyUnicode_GetSize(input) == 0) {
       
   741         /* Special case empty input strings, since resizing
       
   742            them  later would cause internal errors. */
       
   743         Py_INCREF(input);
       
   744         return input;
       
   745     }
       
   746 
       
   747     if (strcmp(form, "NFC") == 0)
       
   748         return nfc_nfkc(self, input, 0);
       
   749     if (strcmp(form, "NFKC") == 0)
       
   750         return nfc_nfkc(self, input, 1);
       
   751     if (strcmp(form, "NFD") == 0)
       
   752         return nfd_nfkd(self, input, 0);
       
   753     if (strcmp(form, "NFKD") == 0)
       
   754         return nfd_nfkd(self, input, 1);
       
   755     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
       
   756     return NULL;
       
   757 }
       
   758 
       
   759 /* -------------------------------------------------------------------- */
       
   760 /* unicode character name tables */
       
   761 
       
   762 /* data file generated by Tools/unicode/makeunicodedata.py */
       
   763 #include "unicodename_db.h"
       
   764 
       
   765 /* -------------------------------------------------------------------- */
       
   766 /* database code (cut and pasted from the unidb package) */
       
   767 
       
   /* Case-insensitive hash over the first `len` bytes of `s`, using `scale`
      as the multiplier.  Whenever bits 24..31 of the running value become
      non-zero they are folded back into the low byte and the value is masked
      to 24 bits. */
   static unsigned long
   _gethash(const char *s, int len, int scale)
   {
       unsigned long hash = 0;
       int pos = 0;

       while (pos < len) {
           unsigned char upper = (unsigned char) toupper(Py_CHARMASK(s[pos]));
           unsigned long overflow;

           hash = (hash * scale) + upper;
           overflow = hash & 0xff000000;
           if (overflow != 0) {
               hash = (hash ^ ((overflow >> 24) & 0xff)) & 0x00ffffff;
           }
           pos++;
       }
       return hash;
   }
       
   782 
       
   /* Name fragments used to spell Hangul syllable names.  Each row holds, for
      the same index, the fragment of column 0, column 1 and column 2; the
      columns have different lengths, so the shorter ones are padded with 0
      (a null pointer) once they run out.  The empty string is a real entry,
      not padding. */
   static char *hangul_syllables[][3] = {
       /*  0 */ { "G",  "A",   ""   },
       /*  1 */ { "GG", "AE",  "G"  },
       /*  2 */ { "N",  "YA",  "GG" },
       /*  3 */ { "D",  "YAE", "GS" },
       /*  4 */ { "DD", "EO",  "N"  },
       /*  5 */ { "R",  "E",   "NJ" },
       /*  6 */ { "M",  "YEO", "NH" },
       /*  7 */ { "B",  "YE",  "D"  },
       /*  8 */ { "BB", "O",   "L"  },
       /*  9 */ { "S",  "WA",  "LG" },
       /* 10 */ { "SS", "WAE", "LM" },
       /* 11 */ { "",   "OE",  "LB" },
       /* 12 */ { "J",  "YO",  "LS" },
       /* 13 */ { "JJ", "U",   "LT" },
       /* 14 */ { "C",  "WEO", "LP" },
       /* 15 */ { "K",  "WE",  "LH" },
       /* 16 */ { "T",  "WI",  "M"  },
       /* 17 */ { "P",  "YU",  "B"  },
       /* 18 */ { "H",  "EU",  "BS" },
       /* 19 */ { 0,    "YI",  "S"  },
       /* 20 */ { 0,    "I",   "SS" },
       /* 21 */ { 0,    0,     "NG" },
       /* 22 */ { 0,    0,     "J"  },
       /* 23 */ { 0,    0,     "C"  },
       /* 24 */ { 0,    0,     "K"  },
       /* 25 */ { 0,    0,     "T"  },
       /* 26 */ { 0,    0,     "P"  },
       /* 27 */ { 0,    0,     "H"  }
   };
       
   813 
       
   814 static int
       
   815 is_unified_ideograph(Py_UCS4 code)
       
   816 {
       
   817     return (
       
   818         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
       
   819         (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
       
   820         (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
       
   821 }
       
   822 
       
   823 static int
       
   824 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
       
   825 {
       
   826     int offset;
       
   827     int i;
       
   828     int word;
       
   829     unsigned char* w;
       
   830 
       
   831     if (code >= 0x110000)
       
   832         return 0;
       
   833 
       
   834     if (self) {
       
   835         const change_record *old = get_old_record(self, code);
       
   836         if (old->category_changed == 0) {
       
   837             /* unassigned */
       
   838             return 0;
       
   839         } 
       
   840     }
       
   841 
       
   842     if (SBase <= code && code < SBase+SCount) {
       
   843 	/* Hangul syllable. */
       
   844 	int SIndex = code - SBase;
       
   845 	int L = SIndex / NCount;
       
   846 	int V = (SIndex % NCount) / TCount;
       
   847 	int T = SIndex % TCount;
       
   848 
       
   849 	if (buflen < 27)
       
   850 	    /* Worst case: HANGUL SYLLABLE <10chars>. */
       
   851 	    return 0;
       
   852 	strcpy(buffer, "HANGUL SYLLABLE ");
       
   853 	buffer += 16;
       
   854 	strcpy(buffer, hangul_syllables[L][0]);
       
   855 	buffer += strlen(hangul_syllables[L][0]);
       
   856 	strcpy(buffer, hangul_syllables[V][1]);
       
   857 	buffer += strlen(hangul_syllables[V][1]);
       
   858 	strcpy(buffer, hangul_syllables[T][2]);
       
   859 	buffer += strlen(hangul_syllables[T][2]);
       
   860 	*buffer = '\0';
       
   861 	return 1;
       
   862     }
       
   863 
       
   864     if (is_unified_ideograph(code)) {
       
   865         if (buflen < 28)
       
   866             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
       
   867             return 0;
       
   868         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
       
   869         return 1;
       
   870     }
       
   871 
       
   872     /* get offset into phrasebook */
       
   873     offset = phrasebook_offset1[(code>>phrasebook_shift)];
       
   874     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
       
   875                                (code&((1<<phrasebook_shift)-1))];
       
   876     if (!offset)
       
   877         return 0;
       
   878 
       
   879     i = 0;
       
   880 
       
   881     for (;;) {
       
   882         /* get word index */
       
   883         word = phrasebook[offset] - phrasebook_short;
       
   884         if (word >= 0) {
       
   885             word = (word << 8) + phrasebook[offset+1];
       
   886             offset += 2;
       
   887         } else
       
   888             word = phrasebook[offset++];
       
   889         if (i) {
       
   890             if (i > buflen)
       
   891                 return 0; /* buffer overflow */
       
   892             buffer[i++] = ' ';
       
   893         }
       
   894         /* copy word string from lexicon.  the last character in the
       
   895            word has bit 7 set.  the last word in a string ends with
       
   896            0x80 */
       
   897         w = lexicon + lexicon_offset[word];
       
   898         while (*w < 128) {
       
   899             if (i >= buflen)
       
   900                 return 0; /* buffer overflow */
       
   901             buffer[i++] = *w++;
       
   902         }
       
   903         if (i >= buflen)
       
   904             return 0; /* buffer overflow */
       
   905         buffer[i++] = *w & 127;
       
   906         if (*w == 128)
       
   907             break; /* end of word */
       
   908     }
       
   909 
       
   910     return 1;
       
   911 }
       
   912 
       
   913 static int
       
   914 _cmpname(PyObject *self, int code, const char* name, int namelen)
       
   915 {
       
   916     /* check if code corresponds to the given name */
       
   917     int i;
       
   918     char buffer[NAME_MAXLEN];
       
   919     if (!_getucname(self, code, buffer, sizeof(buffer)))
       
   920         return 0;
       
   921     for (i = 0; i < namelen; i++) {
       
   922         if (toupper(Py_CHARMASK(name[i])) != buffer[i])
       
   923             return 0;
       
   924     }
       
   925     return buffer[namelen] == '\0';
       
   926 }
       
   927 
       
   928 static void 
       
   929 find_syllable(const char *str, int *len, int *pos, int count, int column)
       
   930 {
       
   931     int i, len1;
       
   932     *len = -1;
       
   933     for (i = 0; i < count; i++) {
       
   934 	char *s = hangul_syllables[i][column];
       
   935 	len1 = strlen(s);
       
   936 	if (len1 <= *len)
       
   937 	    continue;
       
   938 	if (strncmp(str, s, len1) == 0) {
       
   939 	    *len = len1;
       
   940 	    *pos = i;
       
   941 	}
       
   942     }
       
   943     if (*len == -1) {
       
   944 	*len = 0;
       
   945     }
       
   946 }
       
   947 
       
   948 static int
       
   949 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
       
   950 {
       
   951     unsigned int h, v;
       
   952     unsigned int mask = code_size-1;
       
   953     unsigned int i, incr;
       
   954 
       
   955     /* Check for hangul syllables. */
       
   956     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
       
   957 	int len, L = -1, V = -1, T = -1;
       
   958 	const char *pos = name + 16;
       
   959 	find_syllable(pos, &len, &L, LCount, 0);
       
   960 	pos += len;
       
   961 	find_syllable(pos, &len, &V, VCount, 1);
       
   962 	pos += len;
       
   963 	find_syllable(pos, &len, &T, TCount, 2);
       
   964 	pos += len;
       
   965 	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
       
   966 	    *code = SBase + (L*VCount+V)*TCount + T;
       
   967 	    return 1;
       
   968 	}
       
   969         /* Otherwise, it's an illegal syllable name. */
       
   970         return 0;
       
   971     }
       
   972 
       
   973     /* Check for unified ideographs. */
       
   974     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
       
   975         /* Four or five hexdigits must follow. */
       
   976         v = 0;
       
   977         name += 22;
       
   978         namelen -= 22;
       
   979         if (namelen != 4 && namelen != 5)
       
   980             return 0;
       
   981         while (namelen--) {
       
   982             v *= 16;
       
   983             if (*name >= '0' && *name <= '9')
       
   984                 v += *name - '0';
       
   985             else if (*name >= 'A' && *name <= 'F')
       
   986                 v += *name - 'A' + 10;
       
   987             else
       
   988                 return 0;
       
   989             name++;
       
   990         }
       
   991         if (!is_unified_ideograph(v))
       
   992             return 0;
       
   993         *code = v;
       
   994         return 1;
       
   995     }
       
   996 
       
   997     /* the following is the same as python's dictionary lookup, with
       
   998        only minor changes.  see the makeunicodedata script for more
       
   999        details */
       
  1000 
       
  1001     h = (unsigned int) _gethash(name, namelen, code_magic);
       
  1002     i = (~h) & mask;
       
  1003     v = code_hash[i];
       
  1004     if (!v)
       
  1005         return 0;
       
  1006     if (_cmpname(self, v, name, namelen)) {
       
  1007         *code = v;
       
  1008         return 1;
       
  1009     }
       
  1010     incr = (h ^ (h >> 3)) & mask;
       
  1011     if (!incr)
       
  1012         incr = mask;
       
  1013     for (;;) {
       
  1014         i = (i + incr) & mask;
       
  1015         v = code_hash[i];
       
  1016         if (!v)
       
  1017             return 0;
       
  1018         if (_cmpname(self, v, name, namelen)) {
       
  1019             *code = v;
       
  1020             return 1;
       
  1021         }
       
  1022         incr = incr << 1;
       
  1023         if (incr > mask)
       
  1024             incr = incr ^ code_poly;
       
  1025     }
       
  1026 }
       
  1027 
       
  1028 static const _PyUnicode_Name_CAPI hashAPI = 
       
  1029 {
       
  1030     sizeof(_PyUnicode_Name_CAPI),
       
  1031     _getucname,
       
  1032     _getcode
       
  1033 };
       
  1034 
       
  1035 /* -------------------------------------------------------------------- */
       
  1036 /* Python bindings */
       
  1037 
       
  1038 PyDoc_STRVAR(unicodedata_name__doc__,
       
  1039 "name(unichr[, default])\n\
       
  1040 Returns the name assigned to the Unicode character unichr as a\n\
       
  1041 string. If no name is defined, default is returned, or, if not\n\
       
  1042 given, ValueError is raised.");
       
  1043 
       
  1044 static PyObject *
       
  1045 unicodedata_name(PyObject* self, PyObject* args)
       
  1046 {
       
  1047     char name[NAME_MAXLEN];
       
  1048     Py_UCS4 c;
       
  1049 
       
  1050     PyUnicodeObject* v;
       
  1051     PyObject* defobj = NULL;
       
  1052     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
       
  1053         return NULL;
       
  1054 
       
  1055     c = getuchar(v);
       
  1056     if (c == (Py_UCS4)-1)
       
  1057         return NULL;
       
  1058 
       
  1059     if (!_getucname(self, c, name, sizeof(name))) {
       
  1060 	if (defobj == NULL) {
       
  1061 	    PyErr_SetString(PyExc_ValueError, "no such name");
       
  1062             return NULL;
       
  1063 	}
       
  1064 	else {
       
  1065 	    Py_INCREF(defobj);
       
  1066 	    return defobj;
       
  1067 	}
       
  1068     }
       
  1069 
       
  1070     return Py_BuildValue("s", name);
       
  1071 }
       
  1072 
       
  1073 PyDoc_STRVAR(unicodedata_lookup__doc__,
       
  1074 "lookup(name)\n\
       
  1075 \n\
       
  1076 Look up character by name.  If a character with the\n\
       
  1077 given name is found, return the corresponding Unicode\n\
       
  1078 character.  If not found, KeyError is raised.");
       
  1079 
       
  1080 static PyObject *
       
  1081 unicodedata_lookup(PyObject* self, PyObject* args)
       
  1082 {
       
  1083     Py_UCS4 code;
       
  1084     Py_UNICODE str[2];
       
  1085 
       
  1086     char* name;
       
  1087     int namelen;
       
  1088     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
       
  1089         return NULL;
       
  1090 
       
  1091     if (!_getcode(self, name, namelen, &code)) {
       
  1092         PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
       
  1093                      name);
       
  1094         return NULL;
       
  1095     }
       
  1096 
       
  1097 #ifndef Py_UNICODE_WIDE
       
  1098     if (code >= 0x10000) {
       
  1099         str[0] = 0xd800 + ((code - 0x10000) >> 10);
       
  1100         str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
       
  1101         return PyUnicode_FromUnicode(str, 2);
       
  1102     }
       
  1103 #endif
       
  1104     str[0] = (Py_UNICODE) code;
       
  1105     return PyUnicode_FromUnicode(str, 1);    
       
  1106 }
       
  1107 
       
  1108 /* XXX Add doc strings. */
       
  1109 
       
  1110 static PyMethodDef unicodedata_functions[] = {
       
  1111     {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
       
  1112     {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
       
  1113     {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
       
  1114     {"category", unicodedata_category, METH_VARARGS,
       
  1115                  unicodedata_category__doc__},
       
  1116     {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
       
  1117                       unicodedata_bidirectional__doc__},
       
  1118     {"combining", unicodedata_combining, METH_VARARGS,
       
  1119                   unicodedata_combining__doc__},
       
  1120     {"mirrored", unicodedata_mirrored, METH_VARARGS,
       
  1121                  unicodedata_mirrored__doc__},
       
  1122     {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
       
  1123                          unicodedata_east_asian_width__doc__},
       
  1124     {"decomposition", unicodedata_decomposition, METH_VARARGS,
       
  1125                       unicodedata_decomposition__doc__},
       
  1126     {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
       
  1127     {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
       
  1128     {"normalize", unicodedata_normalize, METH_VARARGS,
       
  1129                   unicodedata_normalize__doc__},
       
  1130     {NULL, NULL}		/* sentinel */
       
  1131 };
       
  1132 
       
  1133 static PyTypeObject UCD_Type = {
       
  1134 	/* The ob_type field must be initialized in the module init function
       
  1135 	 * to be portable to Windows without using C++. */
       
  1136 	PyVarObject_HEAD_INIT(NULL, 0)
       
  1137 	"unicodedata.UCD",		/*tp_name*/
       
  1138 	sizeof(PreviousDBVersion),	/*tp_basicsize*/
       
  1139 	0,			/*tp_itemsize*/
       
  1140 	/* methods */
       
  1141 	(destructor)PyObject_Del, /*tp_dealloc*/
       
  1142 	0,			/*tp_print*/
       
  1143 	0,                      /*tp_getattr*/
       
  1144 	0,			/*tp_setattr*/
       
  1145 	0,			/*tp_compare*/
       
  1146 	0,			/*tp_repr*/
       
  1147 	0,			/*tp_as_number*/
       
  1148 	0,			/*tp_as_sequence*/
       
  1149 	0,			/*tp_as_mapping*/
       
  1150 	0,			/*tp_hash*/
       
  1151         0,                      /*tp_call*/
       
  1152         0,                      /*tp_str*/
       
  1153         PyObject_GenericGetAttr,/*tp_getattro*/
       
  1154         0,                      /*tp_setattro*/
       
  1155         0,                      /*tp_as_buffer*/
       
  1156         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
       
  1157         0,                      /*tp_doc*/
       
  1158         0,                      /*tp_traverse*/
       
  1159         0,                      /*tp_clear*/
       
  1160         0,                      /*tp_richcompare*/
       
  1161         0,                      /*tp_weaklistoffset*/
       
  1162         0,                      /*tp_iter*/
       
  1163         0,                      /*tp_iternext*/
       
  1164         unicodedata_functions,  /*tp_methods*/
       
  1165         DB_members,             /*tp_members*/
       
  1166         0,                      /*tp_getset*/
       
  1167         0,                      /*tp_base*/
       
  1168         0,                      /*tp_dict*/
       
  1169         0,                      /*tp_descr_get*/
       
  1170         0,                      /*tp_descr_set*/
       
  1171         0,                      /*tp_dictoffset*/
       
  1172         0,                      /*tp_init*/
       
  1173         0,                      /*tp_alloc*/
       
  1174         0,                      /*tp_new*/
       
  1175         0,                      /*tp_free*/
       
  1176         0,                      /*tp_is_gc*/
       
  1177 };
       
  1178 
       
  1179 PyDoc_STRVAR(unicodedata_docstring,
       
  1180 "This module provides access to the Unicode Character Database which\n\
       
  1181 defines character properties for all Unicode characters. The data in\n\
       
  1182 this database is based on the UnicodeData.txt file version\n\
       
  1183 5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
       
  1184 \n\
       
  1185 The module uses the same names and symbols as defined by the\n\
       
  1186 UnicodeData File Format 5.1.0 (see\n\
       
  1187 http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
       
  1188 
       
  1189 PyMODINIT_FUNC
       
  1190 initunicodedata(void)
       
  1191 {
       
  1192     PyObject *m, *v;
       
  1193 
       
  1194     Py_TYPE(&UCD_Type) = &PyType_Type;
       
  1195 
       
  1196     m = Py_InitModule3(
       
  1197         "unicodedata", unicodedata_functions, unicodedata_docstring);
       
  1198     if (!m)
       
  1199         return;
       
  1200 
       
  1201     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
       
  1202     Py_INCREF(&UCD_Type);
       
  1203     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
       
  1204 
       
  1205     /* Previous versions */
       
  1206     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
       
  1207     if (v != NULL)
       
  1208         PyModule_AddObject(m, "ucd_3_2_0", v);
       
  1209 
       
  1210     /* Export C API */
       
  1211     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
       
  1212     if (v != NULL)
       
  1213         PyModule_AddObject(m, "ucnhash_CAPI", v);
       
  1214 }
       
  1215 
       
  1216 /* 
       
  1217 Local variables:
       
  1218 c-basic-offset: 4
       
  1219 indent-tabs-mode: nil
       
  1220 End:
       
  1221 */