/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.1 database.

   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;

typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
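
/* How the two-level lookup above works: the code point is split by SHIFT
   (defined in the generated unicodedata_db.h) into a block number and an
   offset within that block.  index1[] maps the block number to a block of
   index2[] entries, and index2[] maps the offset to a slot in
   _PyUnicode_Database_Records, so identical blocks of records need to be
   stored only once.  A rough sketch of the same idea (the table names are
   the real ones above, the intermediate variables are just illustrative):

       block  = index1[code >> SHIFT];              /* which block          */
       record = index2[(block << SHIFT) +
                       (code & ((1 << SHIFT) - 1))];/* offset in that block */
       props  = &_PyUnicode_Database_Records[record];
*/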

/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;

static PyObject*
new_previous_version(const char *name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}

static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
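
/* Worked example of the narrow-build branch above: U+10400 is stored as the
   surrogate pair D801 DC00, and the expression reverses the UTF-16 encoding:

       ((0xD801 & 0x3FF) << 10) | (0xDC00 & 0x3FF)  ==  (0x001 << 10) | 0x000
                                                    ==  0x400
       0x400 + 0x10000                              ==  0x10400
*/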

/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
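
/* Python-level behaviour of the three lookups above, for illustration
   (the values come from the Unicode character database):

       >>> import unicodedata
       >>> unicodedata.decimal(u'7')       # DIGIT SEVEN
       7
       >>> unicodedata.digit(u'\xb2')      # SUPERSCRIPT TWO: digit, not decimal
       2
       >>> unicodedata.numeric(u'\xbd')    # VULGAR FRACTION ONE HALF
       0.5
*/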

PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}

PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}

PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            index = old->mirrored_changed;
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the East Asian width assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex values (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}

static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    }
    else if (self && get_old_record(self, code)->category_changed == 0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex values (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}
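
/* Layout of a decomp_data record as read above: the header word at *index
   packs (count << 8) | prefix, and the `count` words that follow are the
   code points of the decomposition.  For example, U+00C5 LATIN CAPITAL
   LETTER A WITH RING ABOVE decomposes canonically to <0041 030A>, so its
   record is a header with count == 2 and the empty prefix (canonical
   decompositions carry no <tag>), followed by 0x0041 and 0x030A; the exact
   prefix numbering comes from the generated decomp_prefix table. */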

#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
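
/* Hangul syllables are decomposed and composed arithmetically rather than
   through the tables.  A syllable S in [SBase, SBase+SCount) splits into a
   leading consonant L, a vowel V and an optional trailing consonant T:

       SIndex = S - SBase
       L = LBase + SIndex / NCount             (NCount = VCount*TCount = 588)
       V = VBase + (SIndex % NCount) / TCount
       T = TBase + SIndex % TCount             (T == TBase means no trailing)

   Worked example: U+AC01 HANGUL SYLLABLE GAG has SIndex = 1, giving
   L = U+1100, V = U+1161 and T = U+11A7 + 1 = U+11A8; the same L/V/T
   indices select the "G", "A", "G" strings from hangul_syllables[] when
   the name is built in _getucname(). */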

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while (stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space--;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while (count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
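
/* The final loop above applies the Canonical Ordering Algorithm: runs of
   combining marks (non-zero combining class) are sorted by combining class
   with insertion-style swaps, while starters (class 0) act as barriers.
   For instance, the sequence
       U+0065 (e, class 0), U+0301 (COMBINING ACUTE ACCENT, class 230),
       U+0323 (COMBINING DOT BELOW, class 220)
   is reordered so that the class-220 mark precedes the class-230 mark,
   which is the order required for a canonical normal form. */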

static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f, l, index, index1, comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] < (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i += 2;
            if (i < end &&
                TBase < *i && *i < (TBase+TCount)) {
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb1 && comb == comb1) {
                /* Character is blocked. */
                i1++;
                continue;
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}

PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if (!PyArg_ParseTuple(args, "sO!:normalize",
                          &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
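
/* Python-level behaviour of normalize(), for illustration: composing an 'e'
   followed by U+0301 COMBINING ACUTE ACCENT yields the precomposed letter,
   and the compatibility forms additionally expand compatibility mappings:

       >>> unicodedata.normalize('NFC', u'e\u0301') == u'\xe9'
       True
       >>> unicodedata.normalize('NFKD', u'\u2460')    # CIRCLED DIGIT ONE
       u'1'
*/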

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) ||  /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) ||  /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6)); /* CJK Ideograph Extension B */
}

static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hex digits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}

PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");

static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
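
/* lookup() reverses name() for named characters, for illustration:

       >>> unicodedata.name(u'/')
       'SOLIDUS'
       >>> unicodedata.lookup('SOLIDUS')
       u'/'
       >>> unicodedata.lookup('HANGUL SYLLABLE GAG')   # computed, not stored
       u'\uac01'
*/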

/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};

static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.1.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.1.0 (see\n\
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");

PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}
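
/* The ucd_3_2_0 object created above exposes the same methods as the module
   itself, but answers against the Unicode 3.2 tables (via the generated
   get_change_3_2_0 and normalization_3_2_0 delta tables), which is what the
   stringprep/IDNA codecs in the standard library rely on.  Illustrative use
   from Python:

       >>> unicodedata.unidata_version
       '5.1.0'
       >>> unicodedata.ucd_3_2_0.unidata_version
       '3.2.0'
*/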

/*
  Local variables:
  c-basic-offset: 4
  indent-tabs-mode: nil
  End:
*/