#
# (re)generate unicode property and type databases
#
# this script converts Unicode database files (see UNIDATA_VERSION
# below, plus the old versions listed in old_versions) to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import re
import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
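# example: 'A' (U+0041, category "Lu", bidirectional "L") gets
# ALPHA_MASK | UPPER_MASK == 0x81; its lowercase mapping 'a' is stored
# as a 16-bit delta (+32) by makeunicodetype() below.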

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data
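    # each decomposition is stored as a header word (prefix index in the
    # low byte, length in the second byte) followed by the decomposed
    # code points; e.g. U+00C0 ("0041 0300", no compatibility prefix)
    # becomes [0x0200, 0x0041, 0x0300]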

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + \
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l
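    # comp_data is a total_first x total_last matrix, flattened row-major:
    # the NFC composition of a (first, last) pair lives at
    # comp_data[comp_first[first]*total_last + comp_last[last]]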

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, " {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp," {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp," {0,0,0}"
    print >>fp,"};\n"
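    # the C lookup code walks these (start, count, index) ranges to map a
    # code point to its row/column in the comp_data matrix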

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
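    # the generated C code looks a record up with
    #   index2[(index1[ch >> SHIFT] << SHIFT) + (ch & ((1 << SHIFT) - 1))]
    # (see the splitbins() docstring below for the invariant)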

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
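            # e.g. for 'a' (U+0061) the uppercase mapping U+0041 is kept
            # as the 16-bit delta -32 & 0xffff == 0xffe0; if any mapping
            # does not fit in 16 bits, absolute values are stored instead
            # and NODELTA_MASK is set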
            if record[12]:
                upper = int(record[12], 16) - char
                if -32768 <= upper <= 32767 and delta:
                    upper = upper & 0xffff
                else:
                    upper += char
                    delta = False
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                if -32768 <= lower <= 32767 and delta:
                    lower = lower & 0xffff
                else:
                    lower += char
                    delta = False
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                if -32768 <= title <= 32767 and delta:
                    title = title & 0xffff
                else:
                    title += char
                    delta = False
            else:
                title = 0
            if not delta:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)
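    # e.g. "LETTER" is stored as "LETTE" + chr(ord("R")+128); the high bit
    # terminates the word, so no separate length table is needed, and a
    # word whose encoding already appears in the lexicon (say, as the tail
    # of a longer word) costs no extra bytes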

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)
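    # word indexes below `short` fit in a single phrasebook byte; larger
    # indexes are split as (i >> 8) + short followed by i & 255, so a
    # decoder can tell the two cases apart by comparing a byte with `short`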

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value had better not be 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment; ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))
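    # NB: the tuple order (bidir, category, decimal, mirrored, numeric) is
    # positional; the change_record struct consumed by Modules/unicodedata.c
    # must list its fields in the same order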


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
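        # UnicodeData.txt encodes large blocks as a pair of records, e.g.
        # "4E00;<CJK Ideograph, First>;..." and a matching "...Last>" line;
        # copy the First record (minus the name) to every code point in
        # between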
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                    elif field:
                        f2 = field[:]
                        f2[0] = "%X" % i
                        table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.
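# collisions are resolved by open addressing: the probe increment is
# derived from the hash and doubled on each miss, folding in a
# polynomial from SIZES when it overflows the table (an LFSR-style step).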

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = " "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = " " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift). Split a table to save space.

    t is a sequence of ints. This function can be useful to save space if
    many of the ints are the same. t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr. The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
              "bytes"
    n = len(t)-1 # last valid index
    maxshift = 0 # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint # smallest total size so far
    t = tuple(t) # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)