symbian-qemu-0.9.1-12/python-2.6.1/Tools/unicode/makeunicodedata.py
#
# (re)generate unicode property and type databases
#
# this script converts unicode database files to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import re
import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100

def maketables(trace=0):
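    # trace is forwarded to splitbins() and Array.dump(), which report
    # intermediate table sizes on sys.stderr when it is non-zero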
       
    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
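                # the first entry packs the prefix index (low byte) and
                # the decomposition length (high byte) into a single value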
       
                decomp = [prefix + (len(decomp)<<8)] + \
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

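    # comp_data is a flattened total_first x total_last matrix; the entry
    # at [f*total_last + l] is the code point composed from the pair (f, l)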
       
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

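    # splitbins() compresses an index array into a two-level lookup:
    # t[i] == t2[(t1[i >> shift] << shift) + (i & ((1 << shift) - 1))]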
       
    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
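            # (deltas are stored as 16-bit two's complement; records that
            # need absolute code points instead are marked NODELTA_MASK)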
       
            if record[12]:
                upper = int(record[12], 16) - char
                if -32768 <= upper <= 32767 and delta:
                    upper = upper & 0xffff
                else:
                    upper += char
                    delta = False
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                if -32768 <= lower <= 32767 and delta:
                    lower = lower & 0xffff
                else:
                    lower += char
                    delta = False
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                if -32768 <= title <= 32767 and delta:
                    title = title & 0xffff
                else:
                    title += char
                    delta = False
            else:
                title = 0
            if not delta:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a name and words ending a name; the latter include
    # the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    # a word's frequency is tracked as the length of this list
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
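                    # (an escape byte in [short, 255] selects a 256-word
                    # page of the wordlist; the next byte is the index
                    # within that page)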
       
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value had better not be 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment; ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
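        # (UnicodeData.txt represents large blocks as <..., First> /
        # <..., Last> record pairs; the First record, minus its name,
        # is copied to every unassigned code point in between)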
       
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # all code points, U+0000..U+10FFFF

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.

def myhash(s, magic):
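    # polynomial rolling hash over the uppercased key; bits carried
    # above position 24 are folded back in to keep h bounded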
       
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

# candidate (table size, probe polynomial) pairs, one per power of two
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
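            # slot taken: open addressing; the probe increment doubles
            # each step and is folded with the table polynomial whenever
            # it overflows the mask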
       
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)