|
1 """ codecs -- Python Codec Registry, API and helpers. |
|
2 |
|
3 |
|
4 Written by Marc-Andre Lemburg (mal@lemburg.com). |
|
5 |
|
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
|
7 |
|
8 """#" |
|
9 |
|
10 import __builtin__, sys |
|
11 |
|
12 ### Registry and builtin stateless codec functions |
|
13 |
|
14 try: |
|
15 from _codecs import * |
|
16 except ImportError, why: |
|
17 raise SystemError('Failed to load the builtin codecs: %s' % why) |
|
18 |
|
# Names exported by ``from codecs import *``.
#
# Besides the registry functions and the BOM constants this lists the
# codec base classes, the lookup helpers and all five predefined error
# handlers that this module defines, so that a star-import exposes the
# same public API as attribute access on the module.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
|
26 |
|
27 ### Constants |
|
28 |
|
#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) written out
# as byte strings for UTF-8, UTF-16 and UTF-32, in both byte orders,
# plus aliases for the byte order of the running machine.
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian and big endian
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness: pick the UTF-16 and UTF-32 marks that match
# sys.byteorder
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
BOM = BOM_UTF16

# Old broken names (don't use in new code); note that the "32" names
# hold the UTF-16 marks and the "64" names hold the UTF-32 marks
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
|
71 |
|
72 |
|
73 ### Codec base classes (defining the API) |
|
74 |
|
class CodecInfo(tuple):

    """ Record describing one codec.

        The object is a 4-tuple (encode, decode, streamreader,
        streamwriter) and additionally exposes those four entries, the
        incremental encoder/decoder factories and the codec name as
        attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only these four entries are part of the tuple value itself
        legacy_entries = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, legacy_entries)
        info.encode, info.decode = encode, decode
        info.streamreader, info.streamwriter = streamreader, streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.name = name
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details
|
91 |
|
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument selecting
        the error handling scheme.  The following string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations must not keep state on the Codec instance;
            codecs that need state to work efficiently should be
            written as StreamWriter/StreamReader classes instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations must not keep state on the Codec instance;
            codecs that need state to work efficiently should be
            written as StreamWriter/StreamReader classes instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()
|
155 |
|
class IncrementalEncoder(object):
    """
    Base class for encoders that are fed their input in several pieces.

    Each piece is handed to encode(); the encoder object carries whatever
    state is needed from one call to the next.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalEncoder.

        errors names the error handling scheme to use; the predefined
        values are listed in the Codec class docstring.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; the base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation does nothing.
        """
|
183 |
|
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always consume
    all of their input at once: whatever _buffer_encode() leaves
    unconsumed is kept in self.buffer and prepended to the next input.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet consumed by _buffer_encode(), carried over to
        # the next encode() call
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError()

    def encode(self, input, final=False):
        # Prepend what was left over from the previous call
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # Remember the unconsumed tail for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""
|
210 |
|
class IncrementalDecoder(object):
    """
    Base class for decoders that are fed their input in several pieces.

    Each piece is handed to decode(); the decoder object carries whatever
    state is needed from one call to the next.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalDecoder.

        errors names the error handling scheme to use; the predefined
        values are listed in the Codec class docstring.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; the base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation does nothing.
        """
|
237 |
|
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() leaves
    unconsumed is kept in self.buffer and prepended to the next input.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet consumed by _buffer_decode(), carried over to
        # the next decode() call
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError()

    def decode(self, input, final=False):
        # Prepend what was left over from the previous call
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # Remember the unconsumed tail for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""
|
264 |
|
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
|
271 |
|
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme.  These values
            are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and pass the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the output should be in a clean state, so
            that fresh data can be appended without rescanning the
            whole stream to recover state.  The base implementation
            does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not defined here on the
            underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream
        self.stream.close()
|
337 |
|
338 ### |
|
339 |
|
340 class StreamReader(Codec): |
|
341 |
|
342 def __init__(self, stream, errors='strict'): |
|
343 |
|
344 """ Creates a StreamReader instance. |
|
345 |
|
346 stream must be a file-like object open for reading |
|
347 (binary) data. |
|
348 |
|
349 The StreamReader may use different error handling |
|
350 schemes by providing the errors keyword argument. These |
|
351 parameters are predefined: |
|
352 |
|
353 'strict' - raise a ValueError (or a subclass) |
|
354 'ignore' - ignore the character and continue with the next |
|
355 'replace'- replace with a suitable replacement character; |
|
356 |
|
357 The set of allowed parameter values can be extended via |
|
358 register_error. |
|
359 """ |
|
360 self.stream = stream |
|
361 self.errors = errors |
|
362 self.bytebuffer = "" |
|
363 # For str->str decoding this will stay a str |
|
364 # For str->unicode decoding the first read will promote it to unicode |
|
365 self.charbuffer = "" |
|
366 self.linebuffer = None |
|
367 |
|
368 def decode(self, input, errors='strict'): |
|
369 raise NotImplementedError |
|
370 |
|
371 def read(self, size=-1, chars=-1, firstline=False): |
|
372 |
|
373 """ Decodes data from the stream self.stream and returns the |
|
374 resulting object. |
|
375 |
|
376 chars indicates the number of characters to read from the |
|
377 stream. read() will never return more than chars |
|
378 characters, but it might return less, if there are not enough |
|
379 characters available. |
|
380 |
|
381 size indicates the approximate maximum number of bytes to |
|
382 read from the stream for decoding purposes. The decoder |
|
383 can modify this setting as appropriate. The default value |
|
384 -1 indicates to read and decode as much as possible. size |
|
385 is intended to prevent having to decode huge files in one |
|
386 step. |
|
387 |
|
388 If firstline is true, and a UnicodeDecodeError happens |
|
389 after the first line terminator in the input only the first line |
|
390 will be returned, the rest of the input will be kept until the |
|
391 next call to read(). |
|
392 |
|
393 The method should use a greedy read strategy meaning that |
|
394 it should read as much data as is allowed within the |
|
395 definition of the encoding and the given size, e.g. if |
|
396 optional encoding endings or state markers are available |
|
397 on the stream, these should be read too. |
|
398 """ |
|
399 # If we have lines cached, first merge them back into characters |
|
400 if self.linebuffer: |
|
401 self.charbuffer = "".join(self.linebuffer) |
|
402 self.linebuffer = None |
|
403 |
|
404 # read until we get the required number of characters (if available) |
|
405 while True: |
|
406 # can the request can be satisfied from the character buffer? |
|
407 if chars < 0: |
|
408 if size < 0: |
|
409 if self.charbuffer: |
|
410 break |
|
411 elif len(self.charbuffer) >= size: |
|
412 break |
|
413 else: |
|
414 if len(self.charbuffer) >= chars: |
|
415 break |
|
416 # we need more data |
|
417 if size < 0: |
|
418 newdata = self.stream.read() |
|
419 else: |
|
420 newdata = self.stream.read(size) |
|
421 # decode bytes (those remaining from the last call included) |
|
422 data = self.bytebuffer + newdata |
|
423 try: |
|
424 newchars, decodedbytes = self.decode(data, self.errors) |
|
425 except UnicodeDecodeError, exc: |
|
426 if firstline: |
|
427 newchars, decodedbytes = self.decode(data[:exc.start], self.errors) |
|
428 lines = newchars.splitlines(True) |
|
429 if len(lines)<=1: |
|
430 raise |
|
431 else: |
|
432 raise |
|
433 # keep undecoded bytes until the next call |
|
434 self.bytebuffer = data[decodedbytes:] |
|
435 # put new characters in the character buffer |
|
436 self.charbuffer += newchars |
|
437 # there was no data available |
|
438 if not newdata: |
|
439 break |
|
440 if chars < 0: |
|
441 # Return everything we've got |
|
442 result = self.charbuffer |
|
443 self.charbuffer = "" |
|
444 else: |
|
445 # Return the first chars characters |
|
446 result = self.charbuffer[:chars] |
|
447 self.charbuffer = self.charbuffer[chars:] |
|
448 return result |
|
449 |
|
450 def readline(self, size=None, keepends=True): |
|
451 |
|
452 """ Read one line from the input stream and return the |
|
453 decoded data. |
|
454 |
|
455 size, if given, is passed as size argument to the |
|
456 read() method. |
|
457 |
|
458 """ |
|
459 # If we have lines cached from an earlier read, return |
|
460 # them unconditionally |
|
461 if self.linebuffer: |
|
462 line = self.linebuffer[0] |
|
463 del self.linebuffer[0] |
|
464 if len(self.linebuffer) == 1: |
|
465 # revert to charbuffer mode; we might need more data |
|
466 # next time |
|
467 self.charbuffer = self.linebuffer[0] |
|
468 self.linebuffer = None |
|
469 if not keepends: |
|
470 line = line.splitlines(False)[0] |
|
471 return line |
|
472 |
|
473 readsize = size or 72 |
|
474 line = "" |
|
475 # If size is given, we call read() only once |
|
476 while True: |
|
477 data = self.read(readsize, firstline=True) |
|
478 if data: |
|
479 # If we're at a "\r" read one extra character (which might |
|
480 # be a "\n") to get a proper line ending. If the stream is |
|
481 # temporarily exhausted we return the wrong line ending. |
|
482 if data.endswith("\r"): |
|
483 data += self.read(size=1, chars=1) |
|
484 |
|
485 line += data |
|
486 lines = line.splitlines(True) |
|
487 if lines: |
|
488 if len(lines) > 1: |
|
489 # More than one line result; the first line is a full line |
|
490 # to return |
|
491 line = lines[0] |
|
492 del lines[0] |
|
493 if len(lines) > 1: |
|
494 # cache the remaining lines |
|
495 lines[-1] += self.charbuffer |
|
496 self.linebuffer = lines |
|
497 self.charbuffer = None |
|
498 else: |
|
499 # only one remaining line, put it back into charbuffer |
|
500 self.charbuffer = lines[0] + self.charbuffer |
|
501 if not keepends: |
|
502 line = line.splitlines(False)[0] |
|
503 break |
|
504 line0withend = lines[0] |
|
505 line0withoutend = lines[0].splitlines(False)[0] |
|
506 if line0withend != line0withoutend: # We really have a line end |
|
507 # Put the rest back together and keep it until the next call |
|
508 self.charbuffer = "".join(lines[1:]) + self.charbuffer |
|
509 if keepends: |
|
510 line = line0withend |
|
511 else: |
|
512 line = line0withoutend |
|
513 break |
|
514 # we didn't get anything or this was our only try |
|
515 if not data or size is not None: |
|
516 if line and not keepends: |
|
517 line = line.splitlines(False)[0] |
|
518 break |
|
519 if readsize<8000: |
|
520 readsize *= 2 |
|
521 return line |
|
522 |
|
523 def readlines(self, sizehint=None, keepends=True): |
|
524 |
|
525 """ Read all lines available on the input stream |
|
526 and return them as list of lines. |
|
527 |
|
528 Line breaks are implemented using the codec's decoder |
|
529 method and are included in the list entries. |
|
530 |
|
531 sizehint, if given, is ignored since there is no efficient |
|
532 way to finding the true end-of-line. |
|
533 |
|
534 """ |
|
535 data = self.read() |
|
536 return data.splitlines(keepends) |
|
537 |
|
538 def reset(self): |
|
539 |
|
540 """ Resets the codec buffers used for keeping state. |
|
541 |
|
542 Note that no stream repositioning should take place. |
|
543 This method is primarily intended to be able to recover |
|
544 from decoding errors. |
|
545 |
|
546 """ |
|
547 self.bytebuffer = "" |
|
548 self.charbuffer = u"" |
|
549 self.linebuffer = None |
|
550 |
|
551 def seek(self, offset, whence=0): |
|
552 """ Set the input stream's current position. |
|
553 |
|
554 Resets the codec buffers used for keeping state. |
|
555 """ |
|
556 self.reset() |
|
557 self.stream.seek(offset, whence) |
|
558 |
|
559 def next(self): |
|
560 |
|
561 """ Return the next decoded line from the input stream.""" |
|
562 line = self.readline() |
|
563 if line: |
|
564 return line |
|
565 raise StopIteration |
|
566 |
|
567 def __iter__(self): |
|
568 return self |
|
569 |
|
570 def __getattr__(self, name, |
|
571 getattr=getattr): |
|
572 |
|
573 """ Inherit all other methods from the underlying stream. |
|
574 """ |
|
575 return getattr(self.stream, name) |
|
576 |
|
577 def __enter__(self): |
|
578 return self |
|
579 |
|
580 def __exit__(self, type, value, tb): |
|
581 self.stream.close() |
|
582 |
|
583 ### |
|
584 |
|
585 class StreamReaderWriter: |
|
586 |
|
587 """ StreamReaderWriter instances allow wrapping streams which |
|
588 work in both read and write modes. |
|
589 |
|
590 The design is such that one can use the factory functions |
|
591 returned by the codec.lookup() function to construct the |
|
592 instance. |
|
593 |
|
594 """ |
|
595 # Optional attributes set by the file wrappers below |
|
596 encoding = 'unknown' |
|
597 |
|
598 def __init__(self, stream, Reader, Writer, errors='strict'): |
|
599 |
|
600 """ Creates a StreamReaderWriter instance. |
|
601 |
|
602 stream must be a Stream-like object. |
|
603 |
|
604 Reader, Writer must be factory functions or classes |
|
605 providing the StreamReader, StreamWriter interface resp. |
|
606 |
|
607 Error handling is done in the same way as defined for the |
|
608 StreamWriter/Readers. |
|
609 |
|
610 """ |
|
611 self.stream = stream |
|
612 self.reader = Reader(stream, errors) |
|
613 self.writer = Writer(stream, errors) |
|
614 self.errors = errors |
|
615 |
|
616 def read(self, size=-1): |
|
617 |
|
618 return self.reader.read(size) |
|
619 |
|
620 def readline(self, size=None): |
|
621 |
|
622 return self.reader.readline(size) |
|
623 |
|
624 def readlines(self, sizehint=None): |
|
625 |
|
626 return self.reader.readlines(sizehint) |
|
627 |
|
628 def next(self): |
|
629 |
|
630 """ Return the next decoded line from the input stream.""" |
|
631 return self.reader.next() |
|
632 |
|
633 def __iter__(self): |
|
634 return self |
|
635 |
|
636 def write(self, data): |
|
637 |
|
638 return self.writer.write(data) |
|
639 |
|
640 def writelines(self, list): |
|
641 |
|
642 return self.writer.writelines(list) |
|
643 |
|
644 def reset(self): |
|
645 |
|
646 self.reader.reset() |
|
647 self.writer.reset() |
|
648 |
|
649 def __getattr__(self, name, |
|
650 getattr=getattr): |
|
651 |
|
652 """ Inherit all other methods from the underlying stream. |
|
653 """ |
|
654 return getattr(self.stream, name) |
|
655 |
|
656 # these are needed to make "with codecs.open(...)" work properly |
|
657 |
|
658 def __enter__(self): |
|
659 return self |
|
660 |
|
661 def __exit__(self, type, value, tb): |
|
662 self.stream.close() |
|
663 |
|
664 ### |
|
665 |
|
666 class StreamRecoder: |
|
667 |
|
668 """ StreamRecoder instances provide a frontend - backend |
|
669 view of encoding data. |
|
670 |
|
671 They use the complete set of APIs returned by the |
|
672 codecs.lookup() function to implement their task. |
|
673 |
|
674 Data written to the stream is first decoded into an |
|
675 intermediate format (which is dependent on the given codec |
|
676 combination) and then written to the stream using an instance |
|
677 of the provided Writer class. |
|
678 |
|
679 In the other direction, data is read from the stream using a |
|
680 Reader instance and then return encoded data to the caller. |
|
681 |
|
682 """ |
|
683 # Optional attributes set by the file wrappers below |
|
684 data_encoding = 'unknown' |
|
685 file_encoding = 'unknown' |
|
686 |
|
687 def __init__(self, stream, encode, decode, Reader, Writer, |
|
688 errors='strict'): |
|
689 |
|
690 """ Creates a StreamRecoder instance which implements a two-way |
|
691 conversion: encode and decode work on the frontend (the |
|
692 input to .read() and output of .write()) while |
|
693 Reader and Writer work on the backend (reading and |
|
694 writing to the stream). |
|
695 |
|
696 You can use these objects to do transparent direct |
|
697 recodings from e.g. latin-1 to utf-8 and back. |
|
698 |
|
699 stream must be a file-like object. |
|
700 |
|
701 encode, decode must adhere to the Codec interface, Reader, |
|
702 Writer must be factory functions or classes providing the |
|
703 StreamReader, StreamWriter interface resp. |
|
704 |
|
705 encode and decode are needed for the frontend translation, |
|
706 Reader and Writer for the backend translation. Unicode is |
|
707 used as intermediate encoding. |
|
708 |
|
709 Error handling is done in the same way as defined for the |
|
710 StreamWriter/Readers. |
|
711 |
|
712 """ |
|
713 self.stream = stream |
|
714 self.encode = encode |
|
715 self.decode = decode |
|
716 self.reader = Reader(stream, errors) |
|
717 self.writer = Writer(stream, errors) |
|
718 self.errors = errors |
|
719 |
|
720 def read(self, size=-1): |
|
721 |
|
722 data = self.reader.read(size) |
|
723 data, bytesencoded = self.encode(data, self.errors) |
|
724 return data |
|
725 |
|
726 def readline(self, size=None): |
|
727 |
|
728 if size is None: |
|
729 data = self.reader.readline() |
|
730 else: |
|
731 data = self.reader.readline(size) |
|
732 data, bytesencoded = self.encode(data, self.errors) |
|
733 return data |
|
734 |
|
735 def readlines(self, sizehint=None): |
|
736 |
|
737 data = self.reader.read() |
|
738 data, bytesencoded = self.encode(data, self.errors) |
|
739 return data.splitlines(1) |
|
740 |
|
741 def next(self): |
|
742 |
|
743 """ Return the next decoded line from the input stream.""" |
|
744 data = self.reader.next() |
|
745 data, bytesencoded = self.encode(data, self.errors) |
|
746 return data |
|
747 |
|
748 def __iter__(self): |
|
749 return self |
|
750 |
|
751 def write(self, data): |
|
752 |
|
753 data, bytesdecoded = self.decode(data, self.errors) |
|
754 return self.writer.write(data) |
|
755 |
|
756 def writelines(self, list): |
|
757 |
|
758 data = ''.join(list) |
|
759 data, bytesdecoded = self.decode(data, self.errors) |
|
760 return self.writer.write(data) |
|
761 |
|
762 def reset(self): |
|
763 |
|
764 self.reader.reset() |
|
765 self.writer.reset() |
|
766 |
|
767 def __getattr__(self, name, |
|
768 getattr=getattr): |
|
769 |
|
770 """ Inherit all other methods from the underlying stream. |
|
771 """ |
|
772 return getattr(self.stream, name) |
|
773 |
|
774 def __enter__(self): |
|
775 return self |
|
776 |
|
777 def __exit__(self, type, value, tb): |
|
778 self.stream.close() |
|
779 |
|
780 ### Shortcuts |
|
781 |
|
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is passed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file is closed again before the exception propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
    except:
        # Whatever went wrong, the caller never gets hold of the file
        # object, so it has to be closed here; the error is re-raised
        # unchanged.
        file.close()
        raise
    return srw
|
825 |
|
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings transparently.

        Strings written to the wrapper are interpreted according to
        data_encoding and stored in the original file using
        file_encoding.  The intermediate encoding will usually be
        Unicode but depends on the specified codecs.

        Strings read from the wrapper are read from the file using
        file_encoding and handed to the caller in data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned object carries the attributes .data_encoding and
        .file_encoding, holding the encodings that are in effect, for
        introspection by Python programs.

    """
    effective_file_encoding = file_encoding
    if effective_file_encoding is None:
        effective_file_encoding = data_encoding
    # frontend = what the caller sees, backend = what is in the file
    frontend = lookup(data_encoding)
    backend = lookup(effective_file_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = effective_file_encoding
    return recoder
|
861 |
|
862 ### Helpers for codec lookup |
|
863 |
|
def getencoder(encoding):

    """ Look up the codec registered for encoding and return its
        encoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
|
873 |
|
def getdecoder(encoding):

    """ Look up the codec registered for encoding and return its
        decoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
|
883 |
|
def getincrementalencoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalEncoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
|
897 |
|
def getincrementaldecoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalDecoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
|
911 |
|
def getreader(encoding):

    """ Look up the codec registered for encoding and return its
        StreamReader class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
|
921 |
|
def getwriter(encoding):

    """ Look up the codec registered for encoding and return its
        StreamWriter class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
|
931 |
|
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by iterator to an IncrementalEncoder for
    encoding and yields each non-empty result.  After the input is
    exhausted the encoder is called one last time with an empty string
    and final=True, so that anything it still holds back is produced.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    factory = getincrementalencoder(encoding)
    encoder = factory(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    tail = encoder.encode("", True)
    if tail:
        yield tail
|
949 |
|
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string produced by iterator to an IncrementalDecoder for
    encoding and yields each non-empty result.  After the input is
    exhausted the decoder is called one last time with an empty string
    and final=True, so that anything it still holds back is produced.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    factory = getincrementaldecoder(encoding)
    decoder = factory(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    tail = decoder.decode("", True)
    if tail:
        yield tail
|
967 |
|
968 ### Helpers for charmap-based codecs |
|
969 |
|
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary that maps every element of the rng
        sequence to itself.

    """
    return dict((item, item) for item in rng)
|
982 |
|
def make_encoding_map(decoding_map):

    """ Build an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map has
        no unique source; it is mapped to None (undefined mapping),
        which makes the charmap codec raise an exception when it
        meets that target during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
|
1003 |
|
1004 ### error handlers |
|
1005 |
|
# Module-level shortcuts for the five predefined error handlers.  They
# are fetched in one go: either all five lookups succeed, or all five
# names end up as None.
try:
    (strict_errors, ignore_errors, replace_errors,
     xmlcharrefreplace_errors, backslashreplace_errors) = map(
        lookup_error,
        ("strict", "ignore", "replace",
         "xmlcharrefreplace", "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
|
1019 |
|
# Tell modulefinder that using codecs probably needs the encodings
# package.
#
# The import below is never executed, because _false is 0; it is only
# there so that the statement is present in this module.
# NOTE(review): presumably the condition is a variable rather than a
# literal so that the statement is not dropped at compile time --
# confirm before simplifying it.
_false = 0
if _false:
    import encodings
|
1025 |
|
1026 ### Tests |
|
1027 |
|
if __name__ == '__main__':

    # Small demonstration, only run when the module is executed as a
    # script: both standard streams are replaced by recoding wrappers.

    # Data written to stdout is taken to be Latin-1 and comes out as UTF-8
    sys.stdout = EncodedFile(sys.stdout, data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data arriving on stdin is taken to be Latin-1 and is handed to the
    # program as UTF-8
    sys.stdin = EncodedFile(sys.stdin, data_encoding='utf-8',
                            file_encoding='latin-1')