import re, unicodedata, sys

if sys.maxunicode == 65535:
    raise RuntimeError, "need UCS-4 Python"

def gen_category(cats):
    for i in range(0, 0x110000):
        if unicodedata.category(unichr(i)) in cats:
            yield(i)

def gen_bidirectional(cats):
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(unichr(i)) in cats:
            yield(i)

def compact_set(l):
    single = []
    tuple = []
    prev = None
    span = 0
    for e in l:
        if prev is None:
            prev = e
            span = 0
            continue
        if prev+span+1 != e:
            if span > 2:
                tuple.append((prev,prev+span+1))
            else:
                for i in range(prev, prev+span+1):
                    single.append(i)
            prev = e
            span = 0
        else:
            span += 1
    if span:
        tuple.append((prev,prev+span+1))
    else:
        single.append(prev)
    tuple = " + ".join(["range(%d,%d)" % t for t in tuple])
    if not single:
        return "set(%s)" % tuple
    if not tuple:
        return "set(%s)" % repr(single)
    return "set(%s + %s)" % (repr(single),tuple)
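
# Illustrative note (not in the original script): compact_set() renders a sorted
# list of code points as Python source, collapsing runs of four or more
# consecutive values into range() expressions, e.g.
#     compact_set([1, 2, 3, 4, 10])  ->  "set([10] + range(1,5))"
# Shorter runs and isolated values stay in the explicit list literal.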

############## Read the tables in the RFC #######################

data = open("rfc3454.txt").readlines()

tables = []
curname = None
for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith("Hoffman & Blanchet") or\
       l.startswith("RFC 3454"):
        continue
    # Find start/end lines
    m = re.match("----- (Start|End) Table ([A-Z](\.[0-9])+) -----", l)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError, "Double Start: %r, %r" % (curname, l)
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
            continue
        else:
            if not curname:
                raise RuntimeError, "End without start: %r" % l
            curname = None
            continue
    if not curname:
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # Single code point or a range of code points
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError, "Unpacking problem: %r" % l
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # Mapping entry: code point and its replacement sequence
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value
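
# For orientation (illustrative, not from the script): between the
# "----- Start Table X -----" / "----- End Table X -----" markers, RFC 3454
# lists either a single code point ("0221"), a range ("0234-024F"), or a
# mapping such as "0041; 0061; Case map".  The loop above stores the first two
# forms as identity entries and the mapping form as a list of replacement
# code points (None for the empty mappings of table B.1).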

########### Generate compact Python versions of the tables #############

print """# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

import unicodedata
"""

print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version)

# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables[0]
del tables[0]
assert name == "A.1"
table = set(table.keys())
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= set(range(0xFDD0, 0xFDF0))
# not a character
Cn -= set(range(0xFFFE, 0x110000, 0x10000))
Cn -= set(range(0xFFFF, 0x110000, 0x10000))

# assert table == Cn

print """
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
"""

# B.1 cannot easily be derived
name, table = tables[0]
del tables[0]
assert name == "B.1"
table = table.keys()
table.sort()
print """
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
"""

# B.2 and B.3 are case folding tables.
# They take CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

name, table_b2 = tables[0]
del tables[0]
assert name == "B.2"

name, table_b3 = tables[0]
del tables[0]
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.

b3_exceptions = {}

for k,v in table_b2.items():
    if map(ord, unichr(k).lower()) != v:
        b3_exceptions[k] = u"".join(map(unichr,v))
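
# Concrete example (added): RFC 3454 maps U+00DF LATIN SMALL LETTER SHARP S to
# "ss", but u"\xdf".lower() is still u"\xdf", so 0xdf lands in b3_exceptions
# with the value u"ss".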

b3 = b3_exceptions.items()
b3.sort()

print """
b3_exceptions = {"""
for i,(k,v) in enumerate(b3):
    print "0x%x:%s," % (k, repr(v)),
    if i % 4 == 3:
        print
print "}"

print """
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
"""

def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()

# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
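
# Sketch of the rationale (added note): a single fold-then-NFKC pass is not
# always a fixed point, because compatibility decompositions can expose
# characters that fold further.  map_table_b2 therefore folds and normalizes
# twice, keeping the second result only when the extra pass actually changes
# the string; otherwise the plain B.3 folding is returned.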

specials = {}
for k,v in table_b2.items():
    if map(ord, map_table_b2(unichr(k))) != v:
        specials[k] = v

# Deriving B.2 from B.3 via NFKC should not need any additional special cases
assert specials == {}

print """
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
"""

# C.1.1 is a table with a single character
name, table = tables[0]
del tables[0]
assert name == "C.1.1"
assert table == {0x20:0x20}

print """
def in_table_c11(code):
    return code == u" "
"""

# C.1.2 is all other space characters
name, table = tables[0]
del tables[0]
assert name == "C.1.2"

# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - set([0x20])
# assert Zs == table

print """
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != u" "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
"""

# C.2.1 ASCII control characters
name, table_c21 = tables[0]
del tables[0]
assert name == "C.2.1"

Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc & set(range(128))
table_c21 = set(table_c21.keys())
assert Cc_ascii == table_c21

print """
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
"""

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables[0]
del tables[0]
assert name == "C.2.2"

Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22.keys())
assert len(Cc_nonascii - table_c22) == 0

specials = list(table_c22 - Cc_nonascii)
specials.sort()

print """c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
"""

# C.3 Private use
name, table = tables[0]
del tables[0]
assert name == "C.3"

Co = set(gen_category(["Co"]))
assert set(table.keys()) == Co

print """
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
"""

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables[0]
del tables[0]
assert name == "C.4"

nonchar = set(range(0xFDD0,0xFDF0) +
              range(0xFFFE,0x110000,0x10000) +
              range(0xFFFF,0x110000,0x10000))
table = set(table.keys())
assert table == nonchar

print """
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
"""

# C.5 Surrogate codes
name, table = tables[0]
del tables[0]
assert name == "C.5"

Cs = set(gen_category(["Cs"]))
assert set(table.keys()) == Cs

print """
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
"""

# C.6 Inappropriate for plain text
name, table = tables[0]
del tables[0]
assert name == "C.6"

table = table.keys()
table.sort()

print """
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
"""

# C.7 Inappropriate for canonical representation
name, table = tables[0]
del tables[0]
assert name == "C.7"

table = table.keys()
table.sort()

print """
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
"""

# C.8 Change display properties or are deprecated
name, table = tables[0]
del tables[0]
assert name == "C.8"

table = table.keys()
table.sort()

print """
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
"""

# C.9 Tagging characters
name, table = tables[0]
del tables[0]
assert name == "C.9"

table = table.keys()
table.sort()

print """
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
"""

# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables[0]
del tables[0]
assert name == "D.1"

RandAL = set(gen_bidirectional(["R","AL"]))
assert set(table.keys()) == RandAL

print """
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
"""

# D.2 Characters with bidirectional property "L"
name, table = tables[0]
del tables[0]
assert name == "D.2"

L = set(gen_bidirectional(["L"]))
assert set(table.keys()) == L

print """
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
"""