|
1 import test.test_support, unittest |
|
2 import sys, codecs, htmlentitydefs, unicodedata |
|
3 |
|
class PosReturn:
    """Configurable error callback that always substitutes ``u"<?>"``.

    The position handed back to the codec is whatever ``pos`` currently
    holds, so a test can set ``pos`` before each encode/decode call to
    probe how the codec treats that restart position.
    """

    def __init__(self):
        # Restart position reported to the codec by the next handle() call.
        self.pos = 0

    def handle(self, exc):
        """Return ``(u"<?>", pos)`` for the error described by *exc*.

        *exc* must expose ``object`` (the input being processed) and
        ``start`` (index where the error begins).
        """
        requested = self.pos
        # A negative position counts from the end of the input.
        if requested < 0:
            absolute = requested + len(exc.object)
        else:
            absolute = requested
        # If this answer does not move past the error, make the *next*
        # answer point at the end of the input; otherwise the codec would
        # call us forever with the same error.
        if exc.start >= absolute:
            self.pos = len(exc.object)
        return (u"<?>", requested)
|
20 |
|
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    """A UnicodeEncodeError whose ``start`` attribute is a list."""

    def __init__(self):
        # Initialise a well-formed exception first, then overwrite the
        # attribute with a value of the wrong type.
        valid_args = ("ascii", u"", 0, 1, "bad")
        UnicodeEncodeError.__init__(self, *valid_args)
        self.start = []
|
26 |
|
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    """A UnicodeEncodeError whose ``object`` attribute is a list."""

    def __init__(self):
        # Initialise a well-formed exception first, then overwrite the
        # attribute with a value of the wrong type.
        valid_args = ("ascii", u"", 0, 1, "bad")
        UnicodeEncodeError.__init__(self, *valid_args)
        self.object = []
|
32 |
|
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    """A UnicodeDecodeError whose ``end`` attribute has been deleted."""

    def __init__(self):
        # Initialise a well-formed exception first, then remove the
        # attribute.
        valid_args = ("ascii", "", 0, 1, "bad")
        UnicodeDecodeError.__init__(self, *valid_args)
        del self.end
|
38 |
|
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    """A UnicodeDecodeError whose ``object`` attribute is a list."""

    def __init__(self):
        # Initialise a well-formed exception first, then overwrite the
        # attribute with a value of the wrong type.
        valid_args = ("ascii", "", 0, 1, "bad")
        UnicodeDecodeError.__init__(self, *valid_args)
        self.object = []
|
44 |
|
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError whose ``start`` attribute has been deleted."""

    def __init__(self):
        # Initialise a well-formed exception first, then remove the
        # attribute.
        valid_args = (u"", 0, 1, "bad")
        UnicodeTranslateError.__init__(self, *valid_args)
        del self.start
|
50 |
|
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError whose ``end`` attribute has been deleted."""

    def __init__(self):
        # Initialise a well-formed exception first, then remove the
        # attribute.
        valid_args = (u"", 0, 1, "bad")
        UnicodeTranslateError.__init__(self, *valid_args)
        del self.end
|
56 |
|
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    """A UnicodeTranslateError whose ``object`` attribute has been deleted."""

    def __init__(self):
        # Initialise a well-formed exception first, then remove the
        # attribute.
        valid_args = (u"", 0, 1, "bad")
        UnicodeTranslateError.__init__(self, *valid_args)
        del self.object
|
62 |
|
class CodecCallbackTest(unittest.TestCase):
    """Tests for codec error-handling callbacks.

    Exercises the handlers exposed by ``codecs`` (strict, ignore, replace,
    xmlcharrefreplace, backslashreplace), handlers registered through
    ``codecs.register_error()``, and the Unicode*Error exception objects
    that are passed to those handlers.

    NOTE: expected values containing ``&#NNNN;`` / ``&name;`` must stay as
    literal ASCII entity text; they are what the handlers emit.
    """

    def test_xmlcharrefreplace(self):
        # Replace unencodable characters with numeric character entities.
        # For ascii, latin-1 and charmaps this is completely implemented
        # in C and should be reasonably fast.
        s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
        self.assertEqual(
            s.encode("ascii", "xmlcharrefreplace"),
            "&#12473;&#12497;&#12514; &#228;nd eggs"
        )
        self.assertEqual(
            s.encode("latin-1", "xmlcharrefreplace"),
            "&#12473;&#12497;&#12514; \xe4nd eggs"
        )

    def test_xmlcharnamereplace(self):
        # This time use a named character entity for unencodable
        # characters, if one is available.

        def xmlcharnamereplace(exc):
            if not isinstance(exc, UnicodeEncodeError):
                raise TypeError("don't know how to handle %r" % exc)
            parts = []
            for c in exc.object[exc.start:exc.end]:
                try:
                    parts.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)])
                except KeyError:
                    # No named entity: fall back to a numeric reference.
                    parts.append(u"&#%d;" % ord(c))
            return (u"".join(parts), exc.end)

        codecs.register_error(
            "test.xmlcharnamereplace", xmlcharnamereplace)

        sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
        sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
        sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
        sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)

    def test_uninamereplace(self):
        # We're using the names from the unicode database this time,
        # and we're doing "syntax highlighting" here, i.e. we include
        # the replaced text in ANSI escape sequences. For this it is
        # useful that the error handler is not called for every single
        # unencodable character, but for a complete sequence of
        # unencodable characters, otherwise we would output many
        # unnecessary escape sequences.

        def uninamereplace(exc):
            if not isinstance(exc, UnicodeEncodeError):
                raise TypeError("don't know how to handle %r" % exc)
            names = []
            for c in exc.object[exc.start:exc.end]:
                names.append(unicodedata.name(c, u"0x%x" % ord(c)))
            return (u"\033[1m%s\033[0m" % u", ".join(names), exc.end)

        codecs.register_error(
            "test.uninamereplace", uninamereplace)

        sin = u"\xac\u1234\u20ac\u8000"
        sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)

        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)

        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)

    def test_backslashescape(self):
        # Does the same as the "unicode-escape" encoding, but with different
        # base encodings.
        sin = u"a\xac\u1234\u20ac\u8000"
        if sys.maxunicode > 0xffff:
            sin += unichr(sys.maxunicode)
        sout = "a\\xac\\u1234\\u20ac\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)

        sout = "a\xac\\u1234\\u20ac\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)

        sout = "a\xac\\u1234\xa4\\u8000"
        if sys.maxunicode > 0xffff:
            sout += "\\U%08x" % sys.maxunicode
        self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)

    def test_decoderelaxedutf8(self):
        # This is the test for a decoding callback handler,
        # that relaxes the UTF-8 minimal encoding restriction.
        # A null byte that is encoded as "\xc0\x80" will be
        # decoded as a null byte. All other illegal sequences
        # will be handled strictly.
        def relaxedutf8(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
                return (u"\x00", exc.start+2) # retry after two bytes
            else:
                raise exc

        codecs.register_error(
            "test.relaxedutf8", relaxedutf8)

        sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
        sout = u"a\x00b\x00c\xfc\x00\x00"
        self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
        sin = "\xc0\x80\xc0\x81"
        self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8")

    def test_charmapencode(self):
        # For charmap encodings the replacement string will be
        # mapped through the encoding again. This means, that
        # to be able to use e.g. the "replace" handler, the
        # charmap has to have a mapping for "?".
        charmap = dict([(ord(c), 2*c.upper()) for c in "abcdefgh"])
        sin = u"abc"
        sout = "AABBCC"
        self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)

        sin = u"abcA"
        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)

        charmap[ord("?")] = "XYZ"
        sin = u"abcDEF"
        sout = "AABBCCXYZXYZXYZ"
        self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)

        # A unicode (rather than str) mapping for "?" must be rejected.
        charmap[ord("?")] = u"XYZ"
        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)

    def test_decodeunicodeinternal(self):
        self.assertRaises(
            UnicodeDecodeError,
            "\x00\x00\x00\x00\x00".decode,
            "unicode-internal",
        )
        # The expected values below assume four bytes per character, so
        # they are only checked on wide-unicode builds.
        if sys.maxunicode > 0xffff:
            def handler_unicodeinternal(exc):
                if not isinstance(exc, UnicodeDecodeError):
                    raise TypeError("don't know how to handle %r" % exc)
                return (u"\x01", 1)

            self.assertEqual(
                "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
                u"\u0000"
            )

            self.assertEqual(
                "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
                u"\u0000\ufffd"
            )

            codecs.register_error("test.hui", handler_unicodeinternal)

            self.assertEqual(
                "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
                u"\u0000\u0001\u0000"
            )

    def test_callbacks(self):
        def handler1(exc):
            if not isinstance(exc, (UnicodeEncodeError, UnicodeDecodeError)):
                raise TypeError("don't know how to handle %r" % exc)
            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
            return (u"[%s]" % u"".join(l), exc.end)

        codecs.register_error("test.handler1", handler1)

        def handler2(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
            return (u"[%s]" % u"".join(l), exc.end+1) # skip one character

        codecs.register_error("test.handler2", handler2)

        s = "\x00\x81\x7f\x80\xff"

        self.assertEqual(
            s.decode("ascii", "test.handler1"),
            u"\x00[<129>]\x7f[<128>][<255>]"
        )
        self.assertEqual(
            s.decode("ascii", "test.handler2"),
            u"\x00[<129>][<128>]"
        )

        # The truncated escape is spelled with an explicit double backslash;
        # in a byte string this is the same data as a bare "\u3xxx".
        self.assertEqual(
            "\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
            u"\u3042[<92><117><51><120>]xx"
        )

        self.assertEqual(
            "\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
            u"\u3042[<92><117><51><120><120>]"
        )

        self.assertEqual(
            codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
            u"z[<98>][<99>]"
        )

        self.assertEqual(
            u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
            u"g[<252><223>]rk"
        )

        self.assertEqual(
            u"g\xfc\xdf".encode("ascii", "test.handler1"),
            u"g[<252><223>]"
        )

    def test_longstrings(self):
        # test long strings to check for memory overflow problems
        errors = ["strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
        # register the handlers under different names,
        # to prevent the codec from recognizing the name
        for err in errors:
            codecs.register_error("test." + err, codecs.lookup_error(err))
        repeat = 1000
        errors += ["test." + err for err in errors]
        for uni in [s*repeat for s in (u"x", u"\u3042", u"a\xe4")]:
            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
                for err in errors:
                    try:
                        uni.encode(enc, err)
                    except UnicodeError:
                        # Only crashes/overflows matter here; encoding
                        # failures (e.g. with "strict") are expected.
                        pass

    def check_exceptionobjectargs(self, exctype, args, msg):
        # Test UnicodeError subclasses: construction, attribute assignment
        # and __str__ conversion.
        #   exctype -- the exception class under test
        #   args    -- list of valid constructor arguments
        #   msg     -- expected str() of exctype(*args)

        # check with one missing argument
        self.assertRaises(TypeError, exctype, *args[:-1])
        # check with one argument too much
        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
        # check with one argument of the wrong type
        wrongargs = ["spam", u"eggs", 42, 1.0, None]
        for i in xrange(len(args)):
            for wrongarg in wrongargs:
                if type(wrongarg) is type(args[i]):
                    continue
                # build argument array: all valid arguments, with only
                # position i replaced by the wrongly typed value
                callargs = list(args)
                callargs[i] = wrongarg
                self.assertRaises(TypeError, exctype, *callargs)

        # check with the correct number and type of arguments
        exc = exctype(*args)
        self.assertEqual(str(exc), msg)

    def test_unicodeencodeerror(self):
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"g\xfcrk", 1, 2, "ouch"],
            "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"g\xfcrk", 1, 4, "ouch"],
            "'ascii' codec can't encode characters in position 1-3: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\xfcx", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\u0100x", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", u"\uffffx", 0, 1, "ouch"],
            "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
        )
        if sys.maxunicode > 0xffff:
            self.check_exceptionobjectargs(
                UnicodeEncodeError,
                ["ascii", u"\U00010000x", 0, 1, "ouch"],
                "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
            )

    def test_unicodedecodeerror(self):
        self.check_exceptionobjectargs(
            UnicodeDecodeError,
            ["ascii", "g\xfcrk", 1, 2, "ouch"],
            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeDecodeError,
            ["ascii", "g\xfcrk", 1, 3, "ouch"],
            "'ascii' codec can't decode bytes in position 1-2: ouch"
        )

    def test_unicodetranslateerror(self):
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\xfcrk", 1, 2, "ouch"],
            "can't translate character u'\\xfc' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\u0100rk", 1, 2, "ouch"],
            "can't translate character u'\\u0100' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\uffffrk", 1, 2, "ouch"],
            "can't translate character u'\\uffff' in position 1: ouch"
        )
        if sys.maxunicode > 0xffff:
            self.check_exceptionobjectargs(
                UnicodeTranslateError,
                [u"g\U00010000rk", 1, 2, "ouch"],
                "can't translate character u'\\U00010000' in position 1: ouch"
            )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            [u"g\xfcrk", 1, 3, "ouch"],
            "can't translate characters in position 1-2: ouch"
        )

    def test_badandgoodstrictexceptions(self):
        # "strict" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.strict_errors,
            42
        )
        # "strict" complains about the wrong exception type
        self.assertRaises(
            Exception,
            codecs.strict_errors,
            Exception("ouch")
        )

        # If the correct exception is passed in, "strict" raises it
        self.assertRaises(
            UnicodeEncodeError,
            codecs.strict_errors,
            UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
        )

    def test_badandgoodignoreexceptions(self):
        # "ignore" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.ignore_errors,
            42
        )
        # "ignore" complains about the wrong exception type
        self.assertRaises(
            TypeError,
            codecs.ignore_errors,
            UnicodeError("ouch")
        )
        # If the correct exception is passed in, "ignore" returns an empty replacement
        self.assertEqual(
            codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"", 1)
        )
        self.assertEqual(
            codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
            (u"", 1)
        )
        self.assertEqual(
            codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
            (u"", 1)
        )

    def test_badandgoodreplaceexceptions(self):
        # "replace" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            42
        )
        # "replace" complains about the wrong exception type
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            UnicodeError("ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            BadObjectUnicodeEncodeError()
        )
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            BadObjectUnicodeDecodeError()
        )
        # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
        self.assertEqual(
            codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"?", 1)
        )
        self.assertEqual(
            codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
            (u"\ufffd", 1)
        )
        self.assertEqual(
            codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
            (u"\ufffd", 1)
        )

    def test_badandgoodxmlcharrefreplaceexceptions(self):
        # "xmlcharrefreplace" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            42
        )
        # "xmlcharrefreplace" complains about the wrong exception types
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeError("ouch")
        )
        # "xmlcharrefreplace" can only be used for encoding
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
        )
        # Use the correct exception
        cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 0x3042)
        s = u"".join(unichr(c) for c in cs)
        self.assertEqual(
            codecs.xmlcharrefreplace_errors(
                UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
            ),
            (u"".join(u"&#%d;" % ord(c) for c in s), len(s))
        )

    def test_badandgoodbackslashreplaceexceptions(self):
        # "backslashreplace" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.backslashreplace_errors,
            42
        )
        # "backslashreplace" complains about the wrong exception types
        self.assertRaises(
            TypeError,
            codecs.backslashreplace_errors,
            UnicodeError("ouch")
        )
        # "backslashreplace" can only be used for encoding
        self.assertRaises(
            TypeError,
            codecs.backslashreplace_errors,
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.backslashreplace_errors,
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
        )
        # Use the correct exception
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
            (u"\\u3042", 1)
        )
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")),
            (u"\\x00", 1)
        )
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")),
            (u"\\xff", 1)
        )
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")),
            (u"\\u0100", 1)
        )
        self.assertEqual(
            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")),
            (u"\\uffff", 1)
        )
        if sys.maxunicode > 0xffff:
            self.assertEqual(
                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")),
                (u"\\U00010000", 1)
            )
            self.assertEqual(
                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")),
                (u"\\U0010ffff", 1)
            )

    def test_badhandlerresults(self):
        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")

        for res in results:
            # The handler must accept the exception argument; otherwise the
            # TypeError would come from calling it, not from the codec
            # rejecting the malformed return value.
            codecs.register_error("test.badhandler", lambda exc, res=res: res)
            for enc in encs:
                self.assertRaises(
                    TypeError,
                    u"\u3042".encode,
                    enc,
                    "test.badhandler"
                )
            for (enc, data) in (
                ("ascii", "\xff"),
                ("utf-8", "\xff"),
                ("utf-7", "+x-"),
                ("unicode-internal", "\x00"),
            ):
                self.assertRaises(
                    TypeError,
                    data.decode,
                    enc,
                    "test.badhandler"
                )

    def test_lookup(self):
        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
        self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
        self.assertEqual(codecs.replace_errors, codecs.lookup_error("replace"))
        self.assertEqual(
            codecs.xmlcharrefreplace_errors,
            codecs.lookup_error("xmlcharrefreplace")
        )
        self.assertEqual(
            codecs.backslashreplace_errors,
            codecs.lookup_error("backslashreplace")
        )

    def test_unencodablereplacement(self):
        # A replacement that is itself unencodable must raise.
        def unencrepl(exc):
            if isinstance(exc, UnicodeEncodeError):
                return (u"\u4242", exc.end)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.unencreplhandler", unencrepl)
        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
            self.assertRaises(
                UnicodeEncodeError,
                u"\u4242".encode,
                enc,
                "test.unencreplhandler"
            )

    def test_badregistercall(self):
        # enhance coverage of:
        # Modules/_codecsmodule.c::register_error()
        # Python/codecs.c::PyCodec_RegisterError()
        self.assertRaises(TypeError, codecs.register_error, 42)
        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)

    def test_badlookupcall(self):
        # enhance coverage of:
        # Modules/_codecsmodule.c::lookup_error()
        self.assertRaises(TypeError, codecs.lookup_error)

    def test_unknownhandler(self):
        # enhance coverage of:
        # Modules/_codecsmodule.c::lookup_error()
        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")

    def test_xmlcharrefvalues(self):
        # enhance coverage of:
        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
        # and inline implementations
        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000)
        if sys.maxunicode >= 100000:
            v += (100000, 500000, 1000000)
        s = u"".join([unichr(x) for x in v])
        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
        for enc in ("ascii", "iso-8859-15"):
            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
                s.encode(enc, err)

    def test_decodehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
        # and callers
        self.assertRaises(LookupError, "\xff".decode, "ascii", "test.unknown")

        def baddecodereturn1(exc):
            return 42
        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
        self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
        self.assertRaises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")

        def baddecodereturn2(exc):
            return (u"?", None)
        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
        self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2")

        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        # Valid negative position
        handler.pos = -1
        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0")

        # Valid negative position
        handler.pos = -2
        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?><?>")

        # Negative position out of bounds
        handler.pos = -3
        self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")

        # Valid positive position
        handler.pos = 1
        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>0")

        # Largest valid positive position (one beyond end of input)
        handler.pos = 2
        self.assertEqual("\xff0".decode("ascii", "test.posreturn"), u"<?>")

        # Invalid positive position
        handler.pos = 3
        self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")

        # Restart at the "0"
        handler.pos = 6
        self.assertEqual("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0")

        class D(dict):
            # Mapping whose every lookup fails with ValueError.
            def __getitem__(self, key):
                raise ValueError
        self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None})
        self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D())
        self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: sys.maxunicode+1})

    def test_encodehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
        # and callers
        self.assertRaises(LookupError, u"\xff".encode, "ascii", "test.unknown")

        def badencodereturn1(exc):
            return 42
        codecs.register_error("test.badencodereturn1", badencodereturn1)
        self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn1")

        def badencodereturn2(exc):
            return (u"?", None)
        codecs.register_error("test.badencodereturn2", badencodereturn2)
        self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2")

        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        # Valid negative position
        handler.pos = -1
        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")

        # Valid negative position
        handler.pos = -2
        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")

        # Negative position out of bounds
        handler.pos = -3
        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")

        # Valid positive position
        handler.pos = 1
        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")

        # Largest valid positive position (one beyond end of input)
        handler.pos = 2
        self.assertEqual(u"\xff0".encode("ascii", "test.posreturn"), "<?>")

        # Invalid positive position
        handler.pos = 3
        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")

        handler.pos = 0

        class D(dict):
            # Mapping whose every lookup fails with ValueError.
            def __getitem__(self, key):
                raise ValueError
        for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
            self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
            self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
            self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})

    def test_translatehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
        # and callers
        # (Unfortunately the errors argument is not directly accessible
        # from Python, so we can't test that much)
        class D(dict):
            # Mapping whose every lookup fails with ValueError.
            def __getitem__(self, key):
                raise ValueError
        self.assertRaises(ValueError, u"\xff".translate, D())
        self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1})
        self.assertRaises(TypeError, u"\xff".translate, {0xff: ()})

    def test_bug828737(self):
        # Translate with multi-character replacements over inputs of
        # increasing length; the call only has to complete without error.
        charmap = {
            ord("&"): u"&amp;",
            ord("<"): u"&lt;",
            ord(">"): u"&gt;",
            ord('"'): u"&quot;",
        }

        for n in (1, 10, 100, 1000):
            text = u'abc<def>ghi'*n
            text.translate(charmap)
|
795 |
|
def test_main():
    """Run CodecCallbackTest via ``test.test_support.run_unittest``."""
    run_suite = test.test_support.run_unittest
    run_suite(CodecCallbackTest)


if __name__ == "__main__":
    # Executed as a script rather than imported: run the suite directly.
    test_main()