python-2.5.2/win32/Lib/idlelib/PyParse.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 import re
       
     2 import sys
       
     3 
       
     4 # Reason last stmt is continued (or C_NONE if it's not).
       
     5 (C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
       
     6  C_STRING_NEXT_LINES, C_BRACKET) = range(5)
       
     7 
       
     8 if 0:   # for throwaway debugging output
       
     9     def dump(*stuff):
       
    10         sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
       
    11 
       
    12 # Find what looks like the start of a popular stmt.
       
    13 
       
    14 _synchre = re.compile(r"""
       
    15     ^
       
    16     [ \t]*
       
    17     (?: while
       
    18     |   else
       
    19     |   def
       
    20     |   return
       
    21     |   assert
       
    22     |   break
       
    23     |   class
       
    24     |   continue
       
    25     |   elif
       
    26     |   try
       
    27     |   except
       
    28     |   raise
       
    29     |   import
       
    30     |   yield
       
    31     )
       
    32     \b
       
    33 """, re.VERBOSE | re.MULTILINE).search
       
    34 
       
    35 # Match blank line or non-indenting comment line.
       
    36 
       
    37 _junkre = re.compile(r"""
       
    38     [ \t]*
       
    39     (?: \# \S .* )?
       
    40     \n
       
    41 """, re.VERBOSE).match
       
    42 
       
    43 # Match any flavor of string; the terminating quote is optional
       
    44 # so that we're robust in the face of incomplete program text.
       
    45 
       
    46 _match_stringre = re.compile(r"""
       
    47     \""" [^"\\]* (?:
       
    48                      (?: \\. | "(?!"") )
       
    49                      [^"\\]*
       
    50                  )*
       
    51     (?: \""" )?
       
    52 
       
    53 |   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
       
    54 
       
    55 |   ''' [^'\\]* (?:
       
    56                    (?: \\. | '(?!'') )
       
    57                    [^'\\]*
       
    58                 )*
       
    59     (?: ''' )?
       
    60 
       
    61 |   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
       
    62 """, re.VERBOSE | re.DOTALL).match
       
    63 
       
    64 # Match a line that starts with something interesting;
       
    65 # used to find the first item of a bracket structure.
       
    66 
       
    67 _itemre = re.compile(r"""
       
    68     [ \t]*
       
    69     [^\s#\\]    # if we match, m.end()-1 is the interesting char
       
    70 """, re.VERBOSE).match
       
    71 
       
    72 # Match start of stmts that should be followed by a dedent.
       
    73 
       
    74 _closere = re.compile(r"""
       
    75     \s*
       
    76     (?: return
       
    77     |   break
       
    78     |   continue
       
    79     |   raise
       
    80     |   pass
       
    81     )
       
    82     \b
       
    83 """, re.VERBOSE).match
       
    84 
       
    85 # Chew up non-special chars as quickly as possible.  If match is
       
    86 # successful, m.end() less 1 is the index of the last boring char
       
    87 # matched.  If match is unsuccessful, the string starts with an
       
    88 # interesting char.
       
    89 
       
    90 _chew_ordinaryre = re.compile(r"""
       
    91     [^[\](){}#'"\\]+
       
    92 """, re.VERBOSE).match
       
    93 
       
    94 # Build translation table to map uninteresting chars to "x", open
       
    95 # brackets to "(", and close brackets to ")".
       
    96 
       
    97 _tran = ['x'] * 256
       
    98 for ch in "({[":
       
    99     _tran[ord(ch)] = '('
       
   100 for ch in ")}]":
       
   101     _tran[ord(ch)] = ')'
       
   102 for ch in "\"'\\\n#":
       
   103     _tran[ord(ch)] = ch
       
   104 _tran = ''.join(_tran)
       
   105 del ch
       
   106 
       
   107 try:
       
   108     UnicodeType = type(unicode(""))
       
   109 except NameError:
       
   110     UnicodeType = None
       
   111 
       
   112 class Parser:
       
   113 
       
   114     def __init__(self, indentwidth, tabwidth):
       
   115         self.indentwidth = indentwidth
       
   116         self.tabwidth = tabwidth
       
   117 
       
   118     def set_str(self, str):
       
   119         assert len(str) == 0 or str[-1] == '\n'
       
   120         if type(str) is UnicodeType:
       
   121             # The parse functions have no idea what to do with Unicode, so
       
   122             # replace all Unicode characters with "x".  This is "safe"
       
   123             # so long as the only characters germane to parsing the structure
       
   124             # of Python are 7-bit ASCII.  It's *necessary* because Unicode
       
   125             # strings don't have a .translate() method that supports
       
   126             # deletechars.
       
   127             uniphooey = str
       
   128             str = []
       
   129             push = str.append
       
   130             for raw in map(ord, uniphooey):
       
   131                 push(raw < 127 and chr(raw) or "x")
       
   132             str = "".join(str)
       
   133         self.str = str
       
   134         self.study_level = 0
       
   135 
       
   136     # Return index of a good place to begin parsing, as close to the
       
   137     # end of the string as possible.  This will be the start of some
       
   138     # popular stmt like "if" or "def".  Return None if none found:
       
   139     # the caller should pass more prior context then, if possible, or
       
   140     # if not (the entire program text up until the point of interest
       
   141     # has already been tried) pass 0 to set_lo.
       
   142     #
       
   143     # This will be reliable iff given a reliable is_char_in_string
       
   144     # function, meaning that when it says "no", it's absolutely
       
   145     # guaranteed that the char is not in a string.
       
   146 
       
   147     def find_good_parse_start(self, is_char_in_string=None,
       
   148                               _synchre=_synchre):
       
   149         str, pos = self.str, None
       
   150 
       
   151         if not is_char_in_string:
       
   152             # no clue -- make the caller pass everything
       
   153             return None
       
   154 
       
   155         # Peek back from the end for a good place to start,
       
   156         # but don't try too often; pos will be left None, or
       
   157         # bumped to a legitimate synch point.
       
   158         limit = len(str)
       
   159         for tries in range(5):
       
   160             i = str.rfind(":\n", 0, limit)
       
   161             if i < 0:
       
   162                 break
       
   163             i = str.rfind('\n', 0, i) + 1  # start of colon line
       
   164             m = _synchre(str, i, limit)
       
   165             if m and not is_char_in_string(m.start()):
       
   166                 pos = m.start()
       
   167                 break
       
   168             limit = i
       
   169         if pos is None:
       
   170             # Nothing looks like a block-opener, or stuff does
       
   171             # but is_char_in_string keeps returning true; most likely
       
   172             # we're in or near a giant string, the colorizer hasn't
       
   173             # caught up enough to be helpful, or there simply *aren't*
       
   174             # any interesting stmts.  In any of these cases we're
       
   175             # going to have to parse the whole thing to be sure, so
       
   176             # give it one last try from the start, but stop wasting
       
   177             # time here regardless of the outcome.
       
   178             m = _synchre(str)
       
   179             if m and not is_char_in_string(m.start()):
       
   180                 pos = m.start()
       
   181             return pos
       
   182 
       
   183         # Peeking back worked; look forward until _synchre no longer
       
   184         # matches.
       
   185         i = pos + 1
       
   186         while 1:
       
   187             m = _synchre(str, i)
       
   188             if m:
       
   189                 s, i = m.span()
       
   190                 if not is_char_in_string(s):
       
   191                     pos = s
       
   192             else:
       
   193                 break
       
   194         return pos
       
   195 
       
   196     # Throw away the start of the string.  Intended to be called with
       
   197     # find_good_parse_start's result.
       
   198 
       
   199     def set_lo(self, lo):
       
   200         assert lo == 0 or self.str[lo-1] == '\n'
       
   201         if lo > 0:
       
   202             self.str = self.str[lo:]
       
   203 
       
   204     # As quickly as humanly possible <wink>, find the line numbers (0-
       
   205     # based) of the non-continuation lines.
       
   206     # Creates self.{goodlines, continuation}.
       
   207 
       
   208     def _study1(self):
       
   209         if self.study_level >= 1:
       
   210             return
       
   211         self.study_level = 1
       
   212 
       
   213         # Map all uninteresting characters to "x", all open brackets
       
   214         # to "(", all close brackets to ")", then collapse runs of
       
   215         # uninteresting characters.  This can cut the number of chars
       
   216         # by a factor of 10-40, and so greatly speed the following loop.
       
   217         str = self.str
       
   218         str = str.translate(_tran)
       
   219         str = str.replace('xxxxxxxx', 'x')
       
   220         str = str.replace('xxxx', 'x')
       
   221         str = str.replace('xx', 'x')
       
   222         str = str.replace('xx', 'x')
       
   223         str = str.replace('\nx', '\n')
       
   224         # note that replacing x\n with \n would be incorrect, because
       
   225         # x may be preceded by a backslash
       
   226 
       
   227         # March over the squashed version of the program, accumulating
       
   228         # the line numbers of non-continued stmts, and determining
       
   229         # whether & why the last stmt is a continuation.
       
   230         continuation = C_NONE
       
   231         level = lno = 0     # level is nesting level; lno is line number
       
   232         self.goodlines = goodlines = [0]
       
   233         push_good = goodlines.append
       
   234         i, n = 0, len(str)
       
   235         while i < n:
       
   236             ch = str[i]
       
   237             i = i+1
       
   238 
       
   239             # cases are checked in decreasing order of frequency
       
   240             if ch == 'x':
       
   241                 continue
       
   242 
       
   243             if ch == '\n':
       
   244                 lno = lno + 1
       
   245                 if level == 0:
       
   246                     push_good(lno)
       
   247                     # else we're in an unclosed bracket structure
       
   248                 continue
       
   249 
       
   250             if ch == '(':
       
   251                 level = level + 1
       
   252                 continue
       
   253 
       
   254             if ch == ')':
       
   255                 if level:
       
   256                     level = level - 1
       
   257                     # else the program is invalid, but we can't complain
       
   258                 continue
       
   259 
       
   260             if ch == '"' or ch == "'":
       
   261                 # consume the string
       
   262                 quote = ch
       
   263                 if str[i-1:i+2] == quote * 3:
       
   264                     quote = quote * 3
       
   265                 firstlno = lno
       
   266                 w = len(quote) - 1
       
   267                 i = i+w
       
   268                 while i < n:
       
   269                     ch = str[i]
       
   270                     i = i+1
       
   271 
       
   272                     if ch == 'x':
       
   273                         continue
       
   274 
       
   275                     if str[i-1:i+w] == quote:
       
   276                         i = i+w
       
   277                         break
       
   278 
       
   279                     if ch == '\n':
       
   280                         lno = lno + 1
       
   281                         if w == 0:
       
   282                             # unterminated single-quoted string
       
   283                             if level == 0:
       
   284                                 push_good(lno)
       
   285                             break
       
   286                         continue
       
   287 
       
   288                     if ch == '\\':
       
   289                         assert i < n
       
   290                         if str[i] == '\n':
       
   291                             lno = lno + 1
       
   292                         i = i+1
       
   293                         continue
       
   294 
       
   295                     # else comment char or paren inside string
       
   296 
       
   297                 else:
       
   298                     # didn't break out of the loop, so we're still
       
   299                     # inside a string
       
   300                     if (lno - 1) == firstlno:
       
   301                         # before the previous \n in str, we were in the first
       
   302                         # line of the string
       
   303                         continuation = C_STRING_FIRST_LINE
       
   304                     else:
       
   305                         continuation = C_STRING_NEXT_LINES
       
   306                 continue    # with outer loop
       
   307 
       
   308             if ch == '#':
       
   309                 # consume the comment
       
   310                 i = str.find('\n', i)
       
   311                 assert i >= 0
       
   312                 continue
       
   313 
       
   314             assert ch == '\\'
       
   315             assert i < n
       
   316             if str[i] == '\n':
       
   317                 lno = lno + 1
       
   318                 if i+1 == n:
       
   319                     continuation = C_BACKSLASH
       
   320             i = i+1
       
   321 
       
   322         # The last stmt may be continued for all 3 reasons.
       
   323         # String continuation takes precedence over bracket
       
   324         # continuation, which beats backslash continuation.
       
   325         if (continuation != C_STRING_FIRST_LINE
       
   326             and continuation != C_STRING_NEXT_LINES and level > 0):
       
   327             continuation = C_BRACKET
       
   328         self.continuation = continuation
       
   329 
       
   330         # Push the final line number as a sentinel value, regardless of
       
   331         # whether it's continued.
       
   332         assert (continuation == C_NONE) == (goodlines[-1] == lno)
       
   333         if goodlines[-1] != lno:
       
   334             push_good(lno)
       
   335 
       
   336     def get_continuation_type(self):
       
   337         self._study1()
       
   338         return self.continuation
       
   339 
       
   340     # study1 was sufficient to determine the continuation status,
       
   341     # but doing more requires looking at every character.  study2
       
   342     # does this for the last interesting statement in the block.
       
   343     # Creates:
       
   344     #     self.stmt_start, stmt_end
       
   345     #         slice indices of last interesting stmt
       
   346     #     self.stmt_bracketing
       
   347     #         the bracketing structure of the last interesting stmt;
       
   348     #         for example, for the statement "say(boo) or die", stmt_bracketing
       
   349     #         will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are
       
   350     #         treated as brackets, for the matter.
       
   351     #     self.lastch
       
   352     #         last non-whitespace character before optional trailing
       
   353     #         comment
       
   354     #     self.lastopenbracketpos
       
   355     #         if continuation is C_BRACKET, index of last open bracket
       
   356 
       
   357     def _study2(self):
       
   358         if self.study_level >= 2:
       
   359             return
       
   360         self._study1()
       
   361         self.study_level = 2
       
   362 
       
   363         # Set p and q to slice indices of last interesting stmt.
       
   364         str, goodlines = self.str, self.goodlines
       
   365         i = len(goodlines) - 1
       
   366         p = len(str)    # index of newest line
       
   367         while i:
       
   368             assert p
       
   369             # p is the index of the stmt at line number goodlines[i].
       
   370             # Move p back to the stmt at line number goodlines[i-1].
       
   371             q = p
       
   372             for nothing in range(goodlines[i-1], goodlines[i]):
       
   373                 # tricky: sets p to 0 if no preceding newline
       
   374                 p = str.rfind('\n', 0, p-1) + 1
       
   375             # The stmt str[p:q] isn't a continuation, but may be blank
       
   376             # or a non-indenting comment line.
       
   377             if  _junkre(str, p):
       
   378                 i = i-1
       
   379             else:
       
   380                 break
       
   381         if i == 0:
       
   382             # nothing but junk!
       
   383             assert p == 0
       
   384             q = p
       
   385         self.stmt_start, self.stmt_end = p, q
       
   386 
       
   387         # Analyze this stmt, to find the last open bracket (if any)
       
   388         # and last interesting character (if any).
       
   389         lastch = ""
       
   390         stack = []  # stack of open bracket indices
       
   391         push_stack = stack.append
       
   392         bracketing = [(p, 0)]
       
   393         while p < q:
       
   394             # suck up all except ()[]{}'"#\\
       
   395             m = _chew_ordinaryre(str, p, q)
       
   396             if m:
       
   397                 # we skipped at least one boring char
       
   398                 newp = m.end()
       
   399                 # back up over totally boring whitespace
       
   400                 i = newp - 1    # index of last boring char
       
   401                 while i >= p and str[i] in " \t\n":
       
   402                     i = i-1
       
   403                 if i >= p:
       
   404                     lastch = str[i]
       
   405                 p = newp
       
   406                 if p >= q:
       
   407                     break
       
   408 
       
   409             ch = str[p]
       
   410 
       
   411             if ch in "([{":
       
   412                 push_stack(p)
       
   413                 bracketing.append((p, len(stack)))
       
   414                 lastch = ch
       
   415                 p = p+1
       
   416                 continue
       
   417 
       
   418             if ch in ")]}":
       
   419                 if stack:
       
   420                     del stack[-1]
       
   421                 lastch = ch
       
   422                 p = p+1
       
   423                 bracketing.append((p, len(stack)))
       
   424                 continue
       
   425 
       
   426             if ch == '"' or ch == "'":
       
   427                 # consume string
       
   428                 # Note that study1 did this with a Python loop, but
       
   429                 # we use a regexp here; the reason is speed in both
       
   430                 # cases; the string may be huge, but study1 pre-squashed
       
   431                 # strings to a couple of characters per line.  study1
       
   432                 # also needed to keep track of newlines, and we don't
       
   433                 # have to.
       
   434                 bracketing.append((p, len(stack)+1))
       
   435                 lastch = ch
       
   436                 p = _match_stringre(str, p, q).end()
       
   437                 bracketing.append((p, len(stack)))
       
   438                 continue
       
   439 
       
   440             if ch == '#':
       
   441                 # consume comment and trailing newline
       
   442                 bracketing.append((p, len(stack)+1))
       
   443                 p = str.find('\n', p, q) + 1
       
   444                 assert p > 0
       
   445                 bracketing.append((p, len(stack)))
       
   446                 continue
       
   447 
       
   448             assert ch == '\\'
       
   449             p = p+1     # beyond backslash
       
   450             assert p < q
       
   451             if str[p] != '\n':
       
   452                 # the program is invalid, but can't complain
       
   453                 lastch = ch + str[p]
       
   454             p = p+1     # beyond escaped char
       
   455 
       
   456         # end while p < q:
       
   457 
       
   458         self.lastch = lastch
       
   459         if stack:
       
   460             self.lastopenbracketpos = stack[-1]
       
   461         self.stmt_bracketing = tuple(bracketing)
       
   462 
       
   463     # Assuming continuation is C_BRACKET, return the number
       
   464     # of spaces the next line should be indented.
       
   465 
       
   466     def compute_bracket_indent(self):
       
   467         self._study2()
       
   468         assert self.continuation == C_BRACKET
       
   469         j = self.lastopenbracketpos
       
   470         str = self.str
       
   471         n = len(str)
       
   472         origi = i = str.rfind('\n', 0, j) + 1
       
   473         j = j+1     # one beyond open bracket
       
   474         # find first list item; set i to start of its line
       
   475         while j < n:
       
   476             m = _itemre(str, j)
       
   477             if m:
       
   478                 j = m.end() - 1     # index of first interesting char
       
   479                 extra = 0
       
   480                 break
       
   481             else:
       
   482                 # this line is junk; advance to next line
       
   483                 i = j = str.find('\n', j) + 1
       
   484         else:
       
   485             # nothing interesting follows the bracket;
       
   486             # reproduce the bracket line's indentation + a level
       
   487             j = i = origi
       
   488             while str[j] in " \t":
       
   489                 j = j+1
       
   490             extra = self.indentwidth
       
   491         return len(str[i:j].expandtabs(self.tabwidth)) + extra
       
   492 
       
   493     # Return number of physical lines in last stmt (whether or not
       
   494     # it's an interesting stmt!  this is intended to be called when
       
   495     # continuation is C_BACKSLASH).
       
   496 
       
   497     def get_num_lines_in_stmt(self):
       
   498         self._study1()
       
   499         goodlines = self.goodlines
       
   500         return goodlines[-1] - goodlines[-2]
       
   501 
       
   502     # Assuming continuation is C_BACKSLASH, return the number of spaces
       
   503     # the next line should be indented.  Also assuming the new line is
       
   504     # the first one following the initial line of the stmt.
       
   505 
       
   506     def compute_backslash_indent(self):
       
   507         self._study2()
       
   508         assert self.continuation == C_BACKSLASH
       
   509         str = self.str
       
   510         i = self.stmt_start
       
   511         while str[i] in " \t":
       
   512             i = i+1
       
   513         startpos = i
       
   514 
       
   515         # See whether the initial line starts an assignment stmt; i.e.,
       
   516         # look for an = operator
       
   517         endpos = str.find('\n', startpos) + 1
       
   518         found = level = 0
       
   519         while i < endpos:
       
   520             ch = str[i]
       
   521             if ch in "([{":
       
   522                 level = level + 1
       
   523                 i = i+1
       
   524             elif ch in ")]}":
       
   525                 if level:
       
   526                     level = level - 1
       
   527                 i = i+1
       
   528             elif ch == '"' or ch == "'":
       
   529                 i = _match_stringre(str, i, endpos).end()
       
   530             elif ch == '#':
       
   531                 break
       
   532             elif level == 0 and ch == '=' and \
       
   533                    (i == 0 or str[i-1] not in "=<>!") and \
       
   534                    str[i+1] != '=':
       
   535                 found = 1
       
   536                 break
       
   537             else:
       
   538                 i = i+1
       
   539 
       
   540         if found:
       
   541             # found a legit =, but it may be the last interesting
       
   542             # thing on the line
       
   543             i = i+1     # move beyond the =
       
   544             found = re.match(r"\s*\\", str[i:endpos]) is None
       
   545 
       
   546         if not found:
       
   547             # oh well ... settle for moving beyond the first chunk
       
   548             # of non-whitespace chars
       
   549             i = startpos
       
   550             while str[i] not in " \t\n":
       
   551                 i = i+1
       
   552 
       
   553         return len(str[self.stmt_start:i].expandtabs(\
       
   554                                      self.tabwidth)) + 1
       
   555 
       
   556     # Return the leading whitespace on the initial line of the last
       
   557     # interesting stmt.
       
   558 
       
   559     def get_base_indent_string(self):
       
   560         self._study2()
       
   561         i, n = self.stmt_start, self.stmt_end
       
   562         j = i
       
   563         str = self.str
       
   564         while j < n and str[j] in " \t":
       
   565             j = j + 1
       
   566         return str[i:j]
       
   567 
       
   568     # Did the last interesting stmt open a block?
       
   569 
       
   570     def is_block_opener(self):
       
   571         self._study2()
       
   572         return self.lastch == ':'
       
   573 
       
   574     # Did the last interesting stmt close a block?
       
   575 
       
   576     def is_block_closer(self):
       
   577         self._study2()
       
   578         return _closere(self.str, self.stmt_start) is not None
       
   579 
       
   580     # index of last open bracket ({[, or None if none
       
   581     lastopenbracketpos = None
       
   582 
       
   583     def get_last_open_bracket_pos(self):
       
   584         self._study2()
       
   585         return self.lastopenbracketpos
       
   586 
       
   587     # the structure of the bracketing of the last interesting statement,
       
   588     # in the format defined in _study2, or None if the text didn't contain
       
   589     # anything
       
   590     stmt_bracketing = None
       
   591 
       
   592     def get_last_stmt_bracketing(self):
       
   593         self._study2()
       
   594         return self.stmt_bracketing