symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/lib2to3/pgen2/tokenize.py
changeset 1 2fb8b9db1c86

# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points:
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
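
# --- Illustrative sketch (added for exposition; not part of the upstream file).
# It shows what the three regex helpers above produce when composing pattern
# fragments.  The name _demo_regex_helpers is hypothetical and this function is
# never called by the module itself.
def _demo_regex_helpers():
    print group('a', 'b')    # -> (a|b)
    print any('a', 'b')      # -> (a|b)*
    print maybe('a', 'b')    # -> (a|b)?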

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
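
# --- Illustrative sketch (added for exposition; not part of the upstream file).
# It checks that the composed Number pattern above fully matches common Python 2
# numeric literals.  _demo_number_pattern is a hypothetical helper and is never
# called by the module itself.
def _demo_number_pattern():
    prog = re.compile(Number)
    for literal in ('42', '0x1fL', '0o17', '0b101', '3.14', '1e-9', '2j'):
        m = prog.match(literal)
        # Prints True when the whole literal is recognized as one number token.
        print literal, '->', m is not None and m.group(0) == literal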

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
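
# --- Illustrative sketch (added for exposition; not part of the upstream file).
# It demonstrates the callback-style entry point documented above: tokenize()
# feeds each 5-tuple from generate_tokens() to the supplied tokeneater.  The
# name _demo_tokeneater is hypothetical and is never called by this module.
def _demo_tokeneater():
    from StringIO import StringIO
    readline = StringIO("x = 1 + 2\n").readline
    def eater(type, token, start, end, line):
        # Print the token type name, its text, and its (row, col) span.
        print tok_name[type], repr(token), start, end
    tokenize(readline, eater)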

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
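
# --- Illustrative sketch (added for exposition; not part of the upstream file).
# It exercises the limited-input round trip promised in the docstring above:
# (type, value) pairs survive an untokenize()/generate_tokens() cycle.  The
# name _demo_roundtrip is hypothetical and is never called by this module.
def _demo_roundtrip():
    from StringIO import StringIO
    source = "def f(a, b):\n    return a + b\n"
    t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(StringIO(newcode).readline)]
    assert t1 == t2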

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
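
# --- Illustrative sketch (added for exposition; not part of the upstream file).
# It shows the 5-tuples described in the module docstring for a one-line
# program.  The name _demo_generate_tokens is hypothetical and is never called
# by this module.
def _demo_generate_tokens():
    from StringIO import StringIO
    readline = StringIO("answer = 42\n").readline
    for type, token, start, end, line in generate_tokens(readline):
        print tok_name[type], repr(token), start, end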

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)