python-2.5.2/win32/Lib/codecs.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 """ codecs -- Python Codec Registry, API and helpers.
       
     2 
       
     3 
       
     4 Written by Marc-Andre Lemburg (mal@lemburg.com).
       
     5 
       
     6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
       
     7 
       
     8 """#"
       
     9 
       
    10 import __builtin__, sys
       
    11 
       
    12 ### Registry and builtin stateless codec functions
       
    13 
       
    14 try:
       
    15     from _codecs import *
       
    16 except ImportError, why:
       
    17     raise SystemError('Failed to load the builtin codecs: %s' % why)
       
    18 
       
    19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
       
    20            "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
       
    21            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
       
    22            "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
       
    23            "strict_errors", "ignore_errors", "replace_errors",
       
    24            "xmlcharrefreplace_errors",
       
    25            "register_error", "lookup_error"]
       
    26 
       
    27 ### Constants
       
    28 
       
    29 #
       
    30 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
       
    31 # and its possible byte string values
       
    32 # for UTF8/UTF16/UTF32 output and little/big endian machines
       
    33 #
       
    34 
       
    35 # UTF-8
       
    36 BOM_UTF8 = '\xef\xbb\xbf'
       
    37 
       
    38 # UTF-16, little endian
       
    39 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
       
    40 
       
    41 # UTF-16, big endian
       
    42 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
       
    43 
       
    44 # UTF-32, little endian
       
    45 BOM_UTF32_LE = '\xff\xfe\x00\x00'
       
    46 
       
    47 # UTF-32, big endian
       
    48 BOM_UTF32_BE = '\x00\x00\xfe\xff'
       
    49 
       
    50 if sys.byteorder == 'little':
       
    51 
       
    52     # UTF-16, native endianness
       
    53     BOM = BOM_UTF16 = BOM_UTF16_LE
       
    54 
       
    55     # UTF-32, native endianness
       
    56     BOM_UTF32 = BOM_UTF32_LE
       
    57 
       
    58 else:
       
    59 
       
    60     # UTF-16, native endianness
       
    61     BOM = BOM_UTF16 = BOM_UTF16_BE
       
    62 
       
    63     # UTF-32, native endianness
       
    64     BOM_UTF32 = BOM_UTF32_BE
       
    65 
       
    66 # Old broken names (don't use in new code)
       
    67 BOM32_LE = BOM_UTF16_LE
       
    68 BOM32_BE = BOM_UTF16_BE
       
    69 BOM64_LE = BOM_UTF32_LE
       
    70 BOM64_BE = BOM_UTF32_BE
       
    71 
       
    72 
       
    73 ### Codec base classes (defining the API)
       
    74 
       
    75 class CodecInfo(tuple):
       
    76 
       
    77     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
       
    78         incrementalencoder=None, incrementaldecoder=None, name=None):
       
    79         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
       
    80         self.name = name
       
    81         self.encode = encode
       
    82         self.decode = decode
       
    83         self.incrementalencoder = incrementalencoder
       
    84         self.incrementaldecoder = incrementaldecoder
       
    85         self.streamwriter = streamwriter
       
    86         self.streamreader = streamreader
       
    87         return self
       
    88 
       
    89     def __repr__(self):
       
    90         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
       
    91 
       
    92 class Codec:
       
    93 
       
    94     """ Defines the interface for stateless encoders/decoders.
       
    95 
       
    96         The .encode()/.decode() methods may use different error
       
    97         handling schemes by providing the errors argument. These
       
    98         string values are predefined:
       
    99 
       
    100          'strict' - raise a ValueError (or a subclass)
       
   101          'ignore' - ignore the character and continue with the next
       
   102          'replace' - replace with a suitable replacement character;
       
   103                     Python will use the official U+FFFD REPLACEMENT
       
   104                     CHARACTER for the builtin Unicode codecs on
       
   105                     decoding and '?' on encoding.
       
   106          'xmlcharrefreplace' - Replace with the appropriate XML
       
   107                                character reference (only for encoding).
       
   108          'backslashreplace'  - Replace with backslashed escape sequences
       
   109                                (only for encoding).
       
   110 
       
   111         The set of allowed values can be extended via register_error.
       
   112 
       
   113     """
       
   114     def encode(self, input, errors='strict'):
       
   115 
       
   116         """ Encodes the object input and returns a tuple (output
       
   117             object, length consumed).
       
   118 
       
   119             errors defines the error handling to apply. It defaults to
       
   120             'strict' handling.
       
   121 
       
   122             The method may not store state in the Codec instance. Use
       
    123             StreamWriter for codecs which have to keep state in order to
       
   124             make encoding/decoding efficient.
       
   125 
       
   126             The encoder must be able to handle zero length input and
       
   127             return an empty object of the output object type in this
       
   128             situation.
       
   129 
       
   130         """
       
   131         raise NotImplementedError
       
   132 
       
   133     def decode(self, input, errors='strict'):
       
   134 
       
   135         """ Decodes the object input and returns a tuple (output
       
   136             object, length consumed).
       
   137 
       
   138             input must be an object which provides the bf_getreadbuf
       
   139             buffer slot. Python strings, buffer objects and memory
       
   140             mapped files are examples of objects providing this slot.
       
   141 
       
   142             errors defines the error handling to apply. It defaults to
       
   143             'strict' handling.
       
   144 
       
   145             The method may not store state in the Codec instance. Use
       
    146             StreamReader for codecs which have to keep state in order to
       
   147             make encoding/decoding efficient.
       
   148 
       
   149             The decoder must be able to handle zero length input and
       
   150             return an empty object of the output object type in this
       
   151             situation.
       
   152 
       
   153         """
       
   154         raise NotImplementedError
       
   155 
       
   156 class IncrementalEncoder(object):
       
   157     """
       
   158     An IncrementalEncoder encodes an input in multiple steps. The input can be
       
   159     passed piece by piece to the encode() method. The IncrementalEncoder remembers
       
    160     the state of the encoding process between calls to encode().
       
   161     """
       
   162     def __init__(self, errors='strict'):
       
   163         """
       
   164         Creates an IncrementalEncoder instance.
       
   165 
       
   166         The IncrementalEncoder may use different error handling schemes by
       
   167         providing the errors keyword argument. See the module docstring
       
   168         for a list of possible values.
       
   169         """
       
   170         self.errors = errors
       
   171         self.buffer = ""
       
   172 
       
   173     def encode(self, input, final=False):
       
   174         """
       
   175         Encodes input and returns the resulting object.
       
   176         """
       
   177         raise NotImplementedError
       
   178 
       
   179     def reset(self):
       
   180         """
       
   181         Resets the encoder to the initial state.
       
   182         """
       
   183 
       
   184 class BufferedIncrementalEncoder(IncrementalEncoder):
       
   185     """
       
   186     This subclass of IncrementalEncoder can be used as the baseclass for an
       
   187     incremental encoder if the encoder must keep some of the output in a
       
   188     buffer between calls to encode().
       
   189     """
       
   190     def __init__(self, errors='strict'):
       
   191         IncrementalEncoder.__init__(self, errors)
       
   192         self.buffer = "" # unencoded input that is kept between calls to encode()
       
   193 
       
   194     def _buffer_encode(self, input, errors, final):
       
   195         # Overwrite this method in subclasses: It must encode input
       
   196         # and return an (output, length consumed) tuple
       
   197         raise NotImplementedError
       
   198 
       
   199     def encode(self, input, final=False):
       
   200         # encode input (taking the buffer into account)
       
   201         data = self.buffer + input
       
   202         (result, consumed) = self._buffer_encode(data, self.errors, final)
       
   203         # keep unencoded input until the next call
       
   204         self.buffer = data[consumed:]
       
   205         return result
       
   206 
       
   207     def reset(self):
       
   208         IncrementalEncoder.reset(self)
       
   209         self.buffer = ""
       
   210 
       
   211 class IncrementalDecoder(object):
       
   212     """
       
   213     An IncrementalDecoder decodes an input in multiple steps. The input can be
       
   214     passed piece by piece to the decode() method. The IncrementalDecoder
       
   215     remembers the state of the decoding process between calls to decode().
       
   216     """
       
   217     def __init__(self, errors='strict'):
       
   218         """
       
    219         Creates an IncrementalDecoder instance.
       
   220 
       
   221         The IncrementalDecoder may use different error handling schemes by
       
   222         providing the errors keyword argument. See the module docstring
       
   223         for a list of possible values.
       
   224         """
       
   225         self.errors = errors
       
   226 
       
   227     def decode(self, input, final=False):
       
   228         """
       
   229         Decodes input and returns the resulting object.
       
   230         """
       
   231         raise NotImplementedError
       
   232 
       
   233     def reset(self):
       
   234         """
       
   235         Resets the decoder to the initial state.
       
   236         """
       
   237 
       
   238 class BufferedIncrementalDecoder(IncrementalDecoder):
       
   239     """
       
   240     This subclass of IncrementalDecoder can be used as the baseclass for an
       
   241     incremental decoder if the decoder must be able to handle incomplete byte
       
   242     sequences.
       
   243     """
       
   244     def __init__(self, errors='strict'):
       
   245         IncrementalDecoder.__init__(self, errors)
       
   246         self.buffer = "" # undecoded input that is kept between calls to decode()
       
   247 
       
   248     def _buffer_decode(self, input, errors, final):
       
   249         # Overwrite this method in subclasses: It must decode input
       
   250         # and return an (output, length consumed) tuple
       
   251         raise NotImplementedError
       
   252 
       
   253     def decode(self, input, final=False):
       
   254         # decode input (taking the buffer into account)
       
   255         data = self.buffer + input
       
   256         (result, consumed) = self._buffer_decode(data, self.errors, final)
       
   257         # keep undecoded input until the next call
       
   258         self.buffer = data[consumed:]
       
   259         return result
       
   260 
       
   261     def reset(self):
       
   262         IncrementalDecoder.reset(self)
       
   263         self.buffer = ""
       
   264 
       
   265 #
       
    266 # The StreamWriter and StreamReader classes provide generic working
       
   267 # interfaces which can be used to implement new encoding submodules
       
   268 # very easily. See encodings/utf_8.py for an example on how this is
       
   269 # done.
       
   270 #
       
   271 
       
   272 class StreamWriter(Codec):
       
   273 
       
   274     def __init__(self, stream, errors='strict'):
       
   275 
       
   276         """ Creates a StreamWriter instance.
       
   277 
       
   278             stream must be a file-like object open for writing
       
   279             (binary) data.
       
   280 
       
   281             The StreamWriter may use different error handling
       
   282             schemes by providing the errors keyword argument. These
       
   283             parameters are predefined:
       
   284 
       
   285              'strict' - raise a ValueError (or a subclass)
       
   286              'ignore' - ignore the character and continue with the next
       
   287              'replace'- replace with a suitable replacement character
       
   288              'xmlcharrefreplace' - Replace with the appropriate XML
       
   289                                    character reference.
       
   290              'backslashreplace'  - Replace with backslashed escape
       
   291                                    sequences (only for encoding).
       
   292 
       
   293             The set of allowed parameter values can be extended via
       
   294             register_error.
       
   295         """
       
   296         self.stream = stream
       
   297         self.errors = errors
       
   298 
       
   299     def write(self, object):
       
   300 
       
   301         """ Writes the object's contents encoded to self.stream.
       
   302         """
       
   303         data, consumed = self.encode(object, self.errors)
       
   304         self.stream.write(data)
       
   305 
       
   306     def writelines(self, list):
       
   307 
       
   308         """ Writes the concatenated list of strings to the stream
       
   309             using .write().
       
   310         """
       
   311         self.write(''.join(list))
       
   312 
       
   313     def reset(self):
       
   314 
       
   315         """ Flushes and resets the codec buffers used for keeping state.
       
   316 
       
   317             Calling this method should ensure that the data on the
       
   318             output is put into a clean state, that allows appending
       
   319             of new fresh data without having to rescan the whole
       
   320             stream to recover state.
       
   321 
       
   322         """
       
   323         pass
       
   324 
       
   325     def __getattr__(self, name,
       
   326                     getattr=getattr):
       
   327 
       
   328         """ Inherit all other methods from the underlying stream.
       
   329         """
       
   330         return getattr(self.stream, name)
       
   331 
       
   332     def __enter__(self):
       
   333         return self
       
   334 
       
   335     def __exit__(self, type, value, tb):
       
   336         self.stream.close()
       
   337 
       
   338 ###
       
   339 
       
   340 class StreamReader(Codec):
       
   341 
       
   342     def __init__(self, stream, errors='strict'):
       
   343 
       
   344         """ Creates a StreamReader instance.
       
   345 
       
   346             stream must be a file-like object open for reading
       
   347             (binary) data.
       
   348 
       
   349             The StreamReader may use different error handling
       
   350             schemes by providing the errors keyword argument. These
       
   351             parameters are predefined:
       
   352 
       
   353              'strict' - raise a ValueError (or a subclass)
       
   354              'ignore' - ignore the character and continue with the next
       
   355              'replace'- replace with a suitable replacement character;
       
   356 
       
   357             The set of allowed parameter values can be extended via
       
   358             register_error.
       
   359         """
       
   360         self.stream = stream
       
   361         self.errors = errors
       
   362         self.bytebuffer = ""
       
   363         # For str->str decoding this will stay a str
       
   364         # For str->unicode decoding the first read will promote it to unicode
       
   365         self.charbuffer = ""
       
   366         self.linebuffer = None
       
   367 
       
   368     def decode(self, input, errors='strict'):
       
   369         raise NotImplementedError
       
   370 
       
   371     def read(self, size=-1, chars=-1, firstline=False):
       
   372 
       
   373         """ Decodes data from the stream self.stream and returns the
       
   374             resulting object.
       
   375 
       
   376             chars indicates the number of characters to read from the
       
   377             stream. read() will never return more than chars
       
   378             characters, but it might return less, if there are not enough
       
   379             characters available.
       
   380 
       
   381             size indicates the approximate maximum number of bytes to
       
   382             read from the stream for decoding purposes. The decoder
       
   383             can modify this setting as appropriate. The default value
       
   384             -1 indicates to read and decode as much as possible.  size
       
   385             is intended to prevent having to decode huge files in one
       
   386             step.
       
   387 
       
   388             If firstline is true, and a UnicodeDecodeError happens
       
   389             after the first line terminator in the input only the first line
       
   390             will be returned, the rest of the input will be kept until the
       
   391             next call to read().
       
   392 
       
   393             The method should use a greedy read strategy meaning that
       
   394             it should read as much data as is allowed within the
       
   395             definition of the encoding and the given size, e.g.  if
       
   396             optional encoding endings or state markers are available
       
   397             on the stream, these should be read too.
       
   398         """
       
   399         # If we have lines cached, first merge them back into characters
       
   400         if self.linebuffer:
       
   401             self.charbuffer = "".join(self.linebuffer)
       
   402             self.linebuffer = None
       
   403 
       
   404         # read until we get the required number of characters (if available)
       
   405         while True:
       
    406             # can the request be satisfied from the character buffer?
       
   407             if chars < 0:
       
   408                 if size < 0:
       
   409                     if self.charbuffer:
       
   410                         break
       
   411                 elif len(self.charbuffer) >= size:
       
   412                     break
       
   413             else:
       
   414                 if len(self.charbuffer) >= chars:
       
   415                     break
       
   416             # we need more data
       
   417             if size < 0:
       
   418                 newdata = self.stream.read()
       
   419             else:
       
   420                 newdata = self.stream.read(size)
       
   421             # decode bytes (those remaining from the last call included)
       
   422             data = self.bytebuffer + newdata
       
   423             try:
       
   424                 newchars, decodedbytes = self.decode(data, self.errors)
       
   425             except UnicodeDecodeError, exc:
       
   426                 if firstline:
       
   427                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
       
   428                     lines = newchars.splitlines(True)
       
   429                     if len(lines)<=1:
       
   430                         raise
       
   431                 else:
       
   432                     raise
       
   433             # keep undecoded bytes until the next call
       
   434             self.bytebuffer = data[decodedbytes:]
       
   435             # put new characters in the character buffer
       
   436             self.charbuffer += newchars
       
   437             # there was no data available
       
   438             if not newdata:
       
   439                 break
       
   440         if chars < 0:
       
   441             # Return everything we've got
       
   442             result = self.charbuffer
       
   443             self.charbuffer = ""
       
   444         else:
       
   445             # Return the first chars characters
       
   446             result = self.charbuffer[:chars]
       
   447             self.charbuffer = self.charbuffer[chars:]
       
   448         return result
       
   449 
       
   450     def readline(self, size=None, keepends=True):
       
   451 
       
   452         """ Read one line from the input stream and return the
       
   453             decoded data.
       
   454 
       
   455             size, if given, is passed as size argument to the
       
   456             read() method.
       
   457 
       
   458         """
       
   459         # If we have lines cached from an earlier read, return
       
   460         # them unconditionally
       
   461         if self.linebuffer:
       
   462             line = self.linebuffer[0]
       
   463             del self.linebuffer[0]
       
   464             if len(self.linebuffer) == 1:
       
   465                 # revert to charbuffer mode; we might need more data
       
   466                 # next time
       
   467                 self.charbuffer = self.linebuffer[0]
       
   468                 self.linebuffer = None
       
   469             if not keepends:
       
   470                 line = line.splitlines(False)[0]
       
   471             return line
       
   472 
       
   473         readsize = size or 72
       
   474         line = ""
       
   475         # If size is given, we call read() only once
       
   476         while True:
       
   477             data = self.read(readsize, firstline=True)
       
   478             if data:
       
   479                 # If we're at a "\r" read one extra character (which might
       
   480                 # be a "\n") to get a proper line ending. If the stream is
       
   481                 # temporarily exhausted we return the wrong line ending.
       
   482                 if data.endswith("\r"):
       
   483                     data += self.read(size=1, chars=1)
       
   484 
       
   485             line += data
       
   486             lines = line.splitlines(True)
       
   487             if lines:
       
   488                 if len(lines) > 1:
       
   489                     # More than one line result; the first line is a full line
       
   490                     # to return
       
   491                     line = lines[0]
       
   492                     del lines[0]
       
   493                     if len(lines) > 1:
       
   494                         # cache the remaining lines
       
   495                         lines[-1] += self.charbuffer
       
   496                         self.linebuffer = lines
       
   497                         self.charbuffer = None
       
   498                     else:
       
   499                         # only one remaining line, put it back into charbuffer
       
   500                         self.charbuffer = lines[0] + self.charbuffer
       
   501                     if not keepends:
       
   502                         line = line.splitlines(False)[0]
       
   503                     break
       
   504                 line0withend = lines[0]
       
   505                 line0withoutend = lines[0].splitlines(False)[0]
       
   506                 if line0withend != line0withoutend: # We really have a line end
       
   507                     # Put the rest back together and keep it until the next call
       
   508                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
       
   509                     if keepends:
       
   510                         line = line0withend
       
   511                     else:
       
   512                         line = line0withoutend
       
   513                     break
       
   514             # we didn't get anything or this was our only try
       
   515             if not data or size is not None:
       
   516                 if line and not keepends:
       
   517                     line = line.splitlines(False)[0]
       
   518                 break
       
   519             if readsize<8000:
       
   520                 readsize *= 2
       
   521         return line
       
   522 
       
   523     def readlines(self, sizehint=None, keepends=True):
       
   524 
       
   525         """ Read all lines available on the input stream
       
   526             and return them as list of lines.
       
   527 
       
   528             Line breaks are implemented using the codec's decoder
       
   529             method and are included in the list entries.
       
   530 
       
   531             sizehint, if given, is ignored since there is no efficient
       
    532             way to find the true end-of-line.
       
   533 
       
   534         """
       
   535         data = self.read()
       
   536         return data.splitlines(keepends)
       
   537 
       
   538     def reset(self):
       
   539 
       
   540         """ Resets the codec buffers used for keeping state.
       
   541 
       
   542             Note that no stream repositioning should take place.
       
   543             This method is primarily intended to be able to recover
       
   544             from decoding errors.
       
   545 
       
   546         """
       
   547         self.bytebuffer = ""
       
   548         self.charbuffer = u""
       
   549         self.linebuffer = None
       
   550 
       
   551     def seek(self, offset, whence=0):
       
   552         """ Set the input stream's current position.
       
   553 
       
   554             Resets the codec buffers used for keeping state.
       
   555         """
       
   556         self.reset()
       
   557         self.stream.seek(offset, whence)
       
   558 
       
   559     def next(self):
       
   560 
       
   561         """ Return the next decoded line from the input stream."""
       
   562         line = self.readline()
       
   563         if line:
       
   564             return line
       
   565         raise StopIteration
       
   566 
       
   567     def __iter__(self):
       
   568         return self
       
   569 
       
   570     def __getattr__(self, name,
       
   571                     getattr=getattr):
       
   572 
       
   573         """ Inherit all other methods from the underlying stream.
       
   574         """
       
   575         return getattr(self.stream, name)
       
   576 
       
   577     def __enter__(self):
       
   578         return self
       
   579 
       
   580     def __exit__(self, type, value, tb):
       
   581         self.stream.close()
       
   582 
       
   583 ###
       
   584 
       
   585 class StreamReaderWriter:
       
   586 
       
   587     """ StreamReaderWriter instances allow wrapping streams which
       
   588         work in both read and write modes.
       
   589 
       
   590         The design is such that one can use the factory functions
       
   591         returned by the codec.lookup() function to construct the
       
   592         instance.
       
   593 
       
   594     """
       
   595     # Optional attributes set by the file wrappers below
       
   596     encoding = 'unknown'
       
   597 
       
   598     def __init__(self, stream, Reader, Writer, errors='strict'):
       
   599 
       
   600         """ Creates a StreamReaderWriter instance.
       
   601 
       
   602             stream must be a Stream-like object.
       
   603 
       
   604             Reader, Writer must be factory functions or classes
       
   605             providing the StreamReader, StreamWriter interface resp.
       
   606 
       
   607             Error handling is done in the same way as defined for the
       
   608             StreamWriter/Readers.
       
   609 
       
   610         """
       
   611         self.stream = stream
       
   612         self.reader = Reader(stream, errors)
       
   613         self.writer = Writer(stream, errors)
       
   614         self.errors = errors
       
   615 
       
   616     def read(self, size=-1):
       
   617 
       
   618         return self.reader.read(size)
       
   619 
       
   620     def readline(self, size=None):
       
   621 
       
   622         return self.reader.readline(size)
       
   623 
       
   624     def readlines(self, sizehint=None):
       
   625 
       
   626         return self.reader.readlines(sizehint)
       
   627 
       
   628     def next(self):
       
   629 
       
   630         """ Return the next decoded line from the input stream."""
       
   631         return self.reader.next()
       
   632 
       
   633     def __iter__(self):
       
   634         return self
       
   635 
       
   636     def write(self, data):
       
   637 
       
   638         return self.writer.write(data)
       
   639 
       
   640     def writelines(self, list):
       
   641 
       
   642         return self.writer.writelines(list)
       
   643 
       
   644     def reset(self):
       
   645 
       
   646         self.reader.reset()
       
   647         self.writer.reset()
       
   648 
       
   649     def __getattr__(self, name,
       
   650                     getattr=getattr):
       
   651 
       
   652         """ Inherit all other methods from the underlying stream.
       
   653         """
       
   654         return getattr(self.stream, name)
       
   655 
       
   656     # these are needed to make "with codecs.open(...)" work properly
       
   657 
       
   658     def __enter__(self):
       
   659         return self
       
   660 
       
   661     def __exit__(self, type, value, tb):
       
   662         self.stream.close()
       
   663 
       
   664 ###
       
   665 
       
   666 class StreamRecoder:
       
   667 
       
   668     """ StreamRecoder instances provide a frontend - backend
       
   669         view of encoding data.
       
   670 
       
   671         They use the complete set of APIs returned by the
       
   672         codecs.lookup() function to implement their task.
       
   673 
       
   674         Data written to the stream is first decoded into an
       
   675         intermediate format (which is dependent on the given codec
       
   676         combination) and then written to the stream using an instance
       
   677         of the provided Writer class.
       
   678 
       
   679         In the other direction, data is read from the stream using a
       
   680         Reader instance and then return encoded data to the caller.
       
   681 
       
   682     """
       
   683     # Optional attributes set by the file wrappers below
       
   684     data_encoding = 'unknown'
       
   685     file_encoding = 'unknown'
       
   686 
       
   687     def __init__(self, stream, encode, decode, Reader, Writer,
       
   688                  errors='strict'):
       
   689 
       
   690         """ Creates a StreamRecoder instance which implements a two-way
       
   691             conversion: encode and decode work on the frontend (the
       
   692             input to .read() and output of .write()) while
       
   693             Reader and Writer work on the backend (reading and
       
   694             writing to the stream).
       
   695 
       
   696             You can use these objects to do transparent direct
       
   697             recodings from e.g. latin-1 to utf-8 and back.
       
   698 
       
   699             stream must be a file-like object.
       
   700 
       
   701             encode, decode must adhere to the Codec interface, Reader,
       
   702             Writer must be factory functions or classes providing the
       
   703             StreamReader, StreamWriter interface resp.
       
   704 
       
   705             encode and decode are needed for the frontend translation,
       
   706             Reader and Writer for the backend translation. Unicode is
       
   707             used as intermediate encoding.
       
   708 
       
   709             Error handling is done in the same way as defined for the
       
   710             StreamWriter/Readers.
       
   711 
       
   712         """
       
   713         self.stream = stream
       
   714         self.encode = encode
       
   715         self.decode = decode
       
   716         self.reader = Reader(stream, errors)
       
   717         self.writer = Writer(stream, errors)
       
   718         self.errors = errors
       
   719 
       
   720     def read(self, size=-1):
       
   721 
       
   722         data = self.reader.read(size)
       
   723         data, bytesencoded = self.encode(data, self.errors)
       
   724         return data
       
   725 
       
   726     def readline(self, size=None):
       
   727 
       
   728         if size is None:
       
   729             data = self.reader.readline()
       
   730         else:
       
   731             data = self.reader.readline(size)
       
   732         data, bytesencoded = self.encode(data, self.errors)
       
   733         return data
       
   734 
       
   735     def readlines(self, sizehint=None):
       
   736 
       
   737         data = self.reader.read()
       
   738         data, bytesencoded = self.encode(data, self.errors)
       
   739         return data.splitlines(1)
       
   740 
       
   741     def next(self):
       
   742 
       
   743         """ Return the next decoded line from the input stream."""
       
   744         data = self.reader.next()
       
   745         data, bytesencoded = self.encode(data, self.errors)
       
   746         return data
       
   747 
       
   748     def __iter__(self):
       
   749         return self
       
   750 
       
   751     def write(self, data):
       
   752 
       
   753         data, bytesdecoded = self.decode(data, self.errors)
       
   754         return self.writer.write(data)
       
   755 
       
   756     def writelines(self, list):
       
   757 
       
   758         data = ''.join(list)
       
   759         data, bytesdecoded = self.decode(data, self.errors)
       
   760         return self.writer.write(data)
       
   761 
       
   762     def reset(self):
       
   763 
       
   764         self.reader.reset()
       
   765         self.writer.reset()
       
   766 
       
   767     def __getattr__(self, name,
       
   768                     getattr=getattr):
       
   769 
       
   770         """ Inherit all other methods from the underlying stream.
       
   771         """
       
   772         return getattr(self.stream, name)
       
   773 
       
   774     def __enter__(self):
       
   775         return self
       
   776 
       
   777     def __exit__(self, type, value, tb):
       
   778         self.stream.close()
       
   779 
       
   780 ### Shortcuts
       
   781 
       
   782 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
       
   783 
       
   784     """ Open an encoded file using the given mode and return
       
   785         a wrapped version providing transparent encoding/decoding.
       
   786 
       
   787         Note: The wrapped version will only accept the object format
       
   788         defined by the codecs, i.e. Unicode objects for most builtin
       
   789         codecs. Output is also codec dependent and will usually be
       
   790         Unicode as well.
       
   791 
       
   792         Files are always opened in binary mode, even if no binary mode
       
   793         was specified. This is done to avoid data loss due to encodings
       
   794         using 8-bit values. The default file mode is 'rb' meaning to
       
   795         open the file in binary read mode.
       
   796 
       
   797         encoding specifies the encoding which is to be used for the
       
   798         file.
       
   799 
       
   800         errors may be given to define the error handling. It defaults
       
   801         to 'strict' which causes ValueErrors to be raised in case an
       
   802         encoding error occurs.
       
   803 
       
   804         buffering has the same meaning as for the builtin open() API.
       
   805         It defaults to line buffered.
       
   806 
       
   807         The returned wrapped file object provides an extra attribute
       
   808         .encoding which allows querying the used encoding. This
       
   809         attribute is only available if an encoding was specified as
       
   810         parameter.
       
   811 
       
   812     """
       
   813     if encoding is not None and \
       
   814        'b' not in mode:
       
   815         # Force opening of the file in binary mode
       
   816         mode = mode + 'b'
       
   817     file = __builtin__.open(filename, mode, buffering)
       
   818     if encoding is None:
       
   819         return file
       
   820     info = lookup(encoding)
       
   821     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
       
   822     # Add attributes to simplify introspection
       
   823     srw.encoding = encoding
       
   824     return srw
       
   825 
       
   826 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
       
   827 
       
   828     """ Return a wrapped version of file which provides transparent
       
   829         encoding translation.
       
   830 
       
   831         Strings written to the wrapped file are interpreted according
       
   832         to the given data_encoding and then written to the original
       
   833         file as string using file_encoding. The intermediate encoding
       
   834         will usually be Unicode but depends on the specified codecs.
       
   835 
       
   836         Strings are read from the file using file_encoding and then
       
   837         passed back to the caller as string using data_encoding.
       
   838 
       
   839         If file_encoding is not given, it defaults to data_encoding.
       
   840 
       
   841         errors may be given to define the error handling. It defaults
       
   842         to 'strict' which causes ValueErrors to be raised in case an
       
   843         encoding error occurs.
       
   844 
       
   845         The returned wrapped file object provides two extra attributes
       
   846         .data_encoding and .file_encoding which reflect the given
       
   847         parameters of the same name. The attributes can be used for
       
   848         introspection by Python programs.
       
   849 
       
   850     """
       
   851     if file_encoding is None:
       
   852         file_encoding = data_encoding
       
   853     data_info = lookup(data_encoding)
       
   854     file_info = lookup(file_encoding)
       
   855     sr = StreamRecoder(file, data_info.encode, data_info.decode,
       
   856                        file_info.streamreader, file_info.streamwriter, errors)
       
   857     # Add attributes to simplify introspection
       
   858     sr.data_encoding = data_encoding
       
   859     sr.file_encoding = file_encoding
       
   860     return sr
       
   861 
       
   862 ### Helpers for codec lookup
       
   863 
       
   864 def getencoder(encoding):
       
   865 
       
   866     """ Lookup up the codec for the given encoding and return
       
   867         its encoder function.
       
   868 
       
   869         Raises a LookupError in case the encoding cannot be found.
       
   870 
       
   871     """
       
   872     return lookup(encoding).encode
       
   873 
       
   874 def getdecoder(encoding):
       
   875 
       
   876     """ Lookup up the codec for the given encoding and return
       
   877         its decoder function.
       
   878 
       
   879         Raises a LookupError in case the encoding cannot be found.
       
   880 
       
   881     """
       
   882     return lookup(encoding).decode
       
   883 
       
   884 def getincrementalencoder(encoding):
       
   885 
       
   886     """ Lookup up the codec for the given encoding and return
       
   887         its IncrementalEncoder class or factory function.
       
   888 
       
   889         Raises a LookupError in case the encoding cannot be found
       
   890         or the codecs doesn't provide an incremental encoder.
       
   891 
       
   892     """
       
   893     encoder = lookup(encoding).incrementalencoder
       
   894     if encoder is None:
       
   895         raise LookupError(encoding)
       
   896     return encoder
       
   897 
       
   898 def getincrementaldecoder(encoding):
       
   899 
       
   900     """ Lookup up the codec for the given encoding and return
       
   901         its IncrementalDecoder class or factory function.
       
   902 
       
   903         Raises a LookupError in case the encoding cannot be found
       
   904         or the codecs doesn't provide an incremental decoder.
       
   905 
       
   906     """
       
   907     decoder = lookup(encoding).incrementaldecoder
       
   908     if decoder is None:
       
   909         raise LookupError(encoding)
       
   910     return decoder
       
   911 
       
   912 def getreader(encoding):
       
   913 
       
   914     """ Lookup up the codec for the given encoding and return
       
   915         its StreamReader class or factory function.
       
   916 
       
   917         Raises a LookupError in case the encoding cannot be found.
       
   918 
       
   919     """
       
   920     return lookup(encoding).streamreader
       
   921 
       
   922 def getwriter(encoding):
       
   923 
       
   924     """ Lookup up the codec for the given encoding and return
       
   925         its StreamWriter class or factory function.
       
   926 
       
   927         Raises a LookupError in case the encoding cannot be found.
       
   928 
       
   929     """
       
   930     return lookup(encoding).streamwriter
       
   931 
       
   932 def iterencode(iterator, encoding, errors='strict', **kwargs):
       
   933     """
       
   934     Encoding iterator.
       
   935 
       
   936     Encodes the input strings from the iterator using a IncrementalEncoder.
       
   937 
       
   938     errors and kwargs are passed through to the IncrementalEncoder
       
   939     constructor.
       
   940     """
       
   941     encoder = getincrementalencoder(encoding)(errors, **kwargs)
       
   942     for input in iterator:
       
   943         output = encoder.encode(input)
       
   944         if output:
       
   945             yield output
       
   946     output = encoder.encode("", True)
       
   947     if output:
       
   948         yield output
       
   949 
       
   950 def iterdecode(iterator, encoding, errors='strict', **kwargs):
       
   951     """
       
   952     Decoding iterator.
       
   953 
       
   954     Decodes the input strings from the iterator using a IncrementalDecoder.
       
   955 
       
   956     errors and kwargs are passed through to the IncrementalDecoder
       
   957     constructor.
       
   958     """
       
   959     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
       
   960     for input in iterator:
       
   961         output = decoder.decode(input)
       
   962         if output:
       
   963             yield output
       
   964     output = decoder.decode("", True)
       
   965     if output:
       
   966         yield output
       
   967 
       
   968 ### Helpers for charmap-based codecs
       
   969 
       
   970 def make_identity_dict(rng):
       
   971 
       
   972     """ make_identity_dict(rng) -> dict
       
   973 
       
   974         Return a dictionary where elements of the rng sequence are
       
   975         mapped to themselves.
       
   976 
       
   977     """
       
   978     res = {}
       
   979     for i in rng:
       
   980         res[i]=i
       
   981     return res
       
   982 
       
   983 def make_encoding_map(decoding_map):
       
   984 
       
   985     """ Creates an encoding map from a decoding map.
       
   986 
       
   987         If a target mapping in the decoding map occurs multiple
       
   988         times, then that target is mapped to None (undefined mapping),
       
   989         causing an exception when encountered by the charmap codec
       
   990         during translation.
       
   991 
       
   992         One example where this happens is cp875.py which decodes
       
   993         multiple character to \u001a.
       
   994 
       
   995     """
       
   996     m = {}
       
   997     for k,v in decoding_map.items():
       
   998         if not v in m:
       
   999             m[v] = k
       
  1000         else:
       
  1001             m[v] = None
       
  1002     return m
       
  1003 
       
  1004 ### error handlers
       
  1005 
       
  1006 try:
       
  1007     strict_errors = lookup_error("strict")
       
  1008     ignore_errors = lookup_error("ignore")
       
  1009     replace_errors = lookup_error("replace")
       
  1010     xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
       
  1011     backslashreplace_errors = lookup_error("backslashreplace")
       
  1012 except LookupError:
       
  1013     # In --disable-unicode builds, these error handler are missing
       
  1014     strict_errors = None
       
  1015     ignore_errors = None
       
  1016     replace_errors = None
       
  1017     xmlcharrefreplace_errors = None
       
  1018     backslashreplace_errors = None
       
  1019 
       
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import statement only has to be present in the module;
# it is never executed because the guard below is always false.
# NOTE(review): the guard is a variable rather than a literal,
# presumably so that the statement is not dropped at compile time --
# keep this exact form.
_false = 0
if _false:
    import encodings
       
  1025 
       
  1026 ### Tests
       
  1027 
       
# Only runs when the module is executed as a script: both standard
# streams are replaced by EncodedFile wrappers.
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    # (data_encoding 'latin-1' for the caller, file_encoding 'utf-8'
    # on the stream)
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # (file_encoding 'latin-1' on the stream, data_encoding 'utf-8'
    # for the caller)
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
       
  1032 
       
  1033     # Have stdin translate Latin-1 input into UTF-8 input
       
  1034     sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')