# python-2.5.2/win32/Lib/tokenize.py
     1 """Tokenization help for Python programs.
       
     2 
       
     3 generate_tokens(readline) is a generator that breaks a stream of
       
     4 text into Python tokens.  It accepts a readline-like method which is called
       
     5 repeatedly to get the next line of input (or "" for EOF).  It generates
       
     6 5-tuples with these members:
       
     7 
       
     8     the token type (see token.py)
       
     9     the token (a string)
       
    10     the starting (row, column) indices of the token (a 2-tuple of ints)
       
    11     the ending (row, column) indices of the token (a 2-tuple of ints)
       
    12     the original line (string)
       
    13 
       
    14 It is designed to match the working of the Python tokenizer exactly, except
       
    15 that it produces COMMENT tokens for comments and gives type OP for all
       
    16 operators
       
    17 
       
    18 Older entry points
       
    19     tokenize_loop(readline, tokeneater)
       
    20     tokenize(readline, tokeneater=printtoken)
       
    21 are the same, except instead of generating tokens, tokeneater is a callback
       
    22 function to which the 5 fields described above are passed as 5 arguments,
       
    23 each time a new token is found."""
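
# A minimal usage sketch of the generator interface described above (the
# file name is hypothetical):
#
#     import tokenize
#     f = open("example.py")
#     for toktype, tok, start, end, line in tokenize.generate_tokens(f.readline):
#         print tokenize.tok_name[toktype], repr(tok)
#     f.close()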

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
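
# Quick self-checks illustrating the helpers above; these follow directly
# from the definitions, so the asserts are safe to run at import time:
assert group('a', 'b') == '(a|b)'
assert any(r'\d') == r'(\d)*'
assert maybe(r'\d') == r'(\d)?'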

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
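
# A few literals these patterns accept (illustrative, Python 2 syntax):
#     Hexnumber:  0x1f, 0XABCL      Octnumber:   0777, 0L
#     Decnumber:  42, 123456789L    Floatnumber: 3.14, .5e-4, 1e10
#     Imagnumber: 1j, 3.14J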

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
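
# PseudoToken is what the tokenizer's main loop matches against each line:
# optional whitespace followed by either a complete token, a comment, or the
# opening fragment of a construct that may span lines (a backslash
# continuation, a triple-quote opener, or the first line of a string).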

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
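
# endprogs maps an opening quote, with any prefix, to the compiled pattern
# that scans for the matching close.  The None entries for bare prefix
# letters let generate_tokens evaluate
#     endprogs[initial] or endprogs[token[1]] or endprogs[token[2]]
# and skip past prefix characters until it reaches the quote itself.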

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
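
# Sample printtoken output for a NAME token 'x' spanning columns 0-1 of
# line 1 (the separators are literal tabs):
#     1,0-1,1:    NAME    'x'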

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
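
# A minimal sketch of driving tokenize() from an in-memory source (any
# readline-like callable works):
#
#     from StringIO import StringIO
#     tokenize(StringIO("x = 1\n").readline)   # prints each token via printtoken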

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """

    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
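
# A concrete round-trip in the spirit of the docstring's invariant
# (illustrative):
#
#     from StringIO import StringIO
#     source = "if x:\n    y = 1\n"
#     t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
#     newcode = untokenize(t1)
#     readline = iter(newcode.splitlines(1)).next
#     t2 = [tok[:2] for tok in generate_tokens(readline)]
#     assert t1 == t2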


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0        # line no., paren depth, '\' flag
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0              # text of an unfinished string, if any
    contline = None                        # logical line accumulated so far
    indents = [0]                          # stack of indentation column levels

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
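
# What generate_tokens yields for "1 + 2\n" (token types print as ints;
# names shown here for readability):
#
#     from StringIO import StringIO
#     for tok in generate_tokens(StringIO("1 + 2\n").readline):
#         print tok
#     # NUMBER '1' (1,0)-(1,1), OP '+' (1,2)-(1,3), NUMBER '2' (1,4)-(1,5),
#     # NEWLINE '\n' (1,5)-(1,6), then ENDMARKER at (2,0)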

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)