python-2.5.2/win32/Lib/urllib2.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 """An extensible library for opening URLs using a variety of protocols
       
     2 
       
     3 The simplest way to use this module is to call the urlopen function,
       
     4 which accepts a string containing a URL or a Request object (described
       
     5 below).  It opens the URL and returns the results as a file-like
       
     6 object; the returned object has some extra methods described below.
       
     7 
       
     8 The OpenerDirector manages a collection of Handler objects that do
       
     9 all the actual work.  Each Handler implements a particular protocol or
       
    10 option.  The OpenerDirector is a composite object that invokes the
       
    11 Handlers needed to open the requested URL.  For example, the
       
    12 HTTPHandler performs HTTP GET and POST requests and deals with
       
    13 non-error returns.  The HTTPRedirectHandler automatically deals with
       
    14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
       
    15 deals with digest authentication.
       
    16 
       
    17 urlopen(url, data=None) -- basic usage is the same as original
       
    18 urllib.  pass the url and optionally data to post to an HTTP URL, and
       
    19 get a file-like object back.  One difference is that you can also pass
       
    20 a Request instance instead of URL.  Raises a URLError (subclass of
       
    21 IOError); for HTTP errors, raises an HTTPError, which can also be
       
    22 treated as a valid response.
       
    23 
       
    24 build_opener -- function that creates a new OpenerDirector instance.
       
    25 will install the default handlers.  accepts one or more Handlers as
       
    26 arguments, either instances or Handler classes that it will
       
    27 instantiate.  if one of the arguments is a subclass of the default
       
    28 handler, the argument will be installed instead of the default.
       
    29 
       
    30 install_opener -- installs a new opener as the default opener.
       
    31 
       
    32 objects of interest:
       
    33 OpenerDirector --
       
    34 
       
    35 Request -- an object that encapsulates the state of a request.  the
       
    36 state can be as simple as the URL.  it can also include extra HTTP
       
    37 headers, e.g. a User-Agent.
       
    38 
       
    39 BaseHandler --
       
    40 
       
    41 exceptions:
       
    42 URLError-- a subclass of IOError, individual protocols have their own
       
    43 specific subclass
       
    44 
       
    45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
       
    46 as an exceptional event or valid response
       
    47 
       
    48 internals:
       
    49 BaseHandler and parent
       
    50 _call_chain conventions
       
    51 
       
    52 Example usage:
       
    53 
       
    54 import urllib2
       
    55 
       
    56 # set up authentication info
       
    57 authinfo = urllib2.HTTPBasicAuthHandler()
       
    58 authinfo.add_password(realm='PDQ Application',
       
    59                       uri='https://mahler:8092/site-updates.py',
       
    60                       user='klem',
       
    61                       passwd='geheim$parole')
       
    62 
       
    63 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
       
    64 
       
    65 # build a new opener that adds authentication and caching FTP handlers
       
    66 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
       
    67 
       
    68 # install it
       
    69 urllib2.install_opener(opener)
       
    70 
       
    71 f = urllib2.urlopen('http://www.python.org/')
       
    72 
       
    73 
       
    74 """
       
    75 
       
    76 # XXX issues:
       
    77 # If an authentication error handler that tries to perform
       
    78 # authentication for some reason but fails, how should the error be
       
    79 # signalled?  The client needs to know the HTTP error code.  But if
       
    80 # the handler knows that the problem was, e.g., that it didn't know
       
    81 # the hash algo that was requested in the challenge, it would be good to
       
    82 # pass that information along to the client, too.
       
    83 # ftp errors aren't handled cleanly
       
    84 # check digest against correct (i.e. non-apache) implementation
       
    85 
       
    86 # Possible extensions:
       
    87 # complex proxies  XXX not sure what exactly was meant by this
       
    88 # abstract factory for opener
       
    89 
       
    90 import base64
       
    91 import hashlib
       
    92 import httplib
       
    93 import mimetools
       
    94 import os
       
    95 import posixpath
       
    96 import random
       
    97 import re
       
    98 import socket
       
    99 import sys
       
   100 import time
       
   101 import urlparse
       
   102 import bisect
       
   103 
       
   104 try:
       
   105     from cStringIO import StringIO
       
   106 except ImportError:
       
   107     from StringIO import StringIO
       
   108 
       
   109 from urllib import (unwrap, unquote, splittype, splithost, quote,
       
   110      addinfourl, splitport, splitgophertype, splitquery,
       
   111      splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
       
   112 
       
   113 # support for FileHandler, proxies via environment variables
       
   114 from urllib import localhost, url2pathname, getproxies
       
   115 
       
   116 # used in User-Agent header sent
       
   117 __version__ = sys.version[:3]
       
   118 
       
   119 _opener = None
       
   120 def urlopen(url, data=None):
       
   121     global _opener
       
   122     if _opener is None:
       
   123         _opener = build_opener()
       
   124     return _opener.open(url, data)
       
   125 
       
   126 def install_opener(opener):
       
   127     global _opener
       
   128     _opener = opener
       
   129 
       
   130 # do these error classes make sense?
       
   131 # make sure all of the IOError stuff is overridden.  we just want to be
       
   132 # subtypes.
       
   133 
       
   134 class URLError(IOError):
       
   135     # URLError is a sub-type of IOError, but it doesn't share any of
       
   136     # the implementation.  need to override __init__ and __str__.
       
   137     # It sets self.args for compatibility with other EnvironmentError
       
   138     # subclasses, but args doesn't have the typical format with errno in
       
   139     # slot 0 and strerror in slot 1.  This may be better than nothing.
       
   140     def __init__(self, reason):
       
   141         self.args = reason,
       
   142         self.reason = reason
       
   143 
       
   144     def __str__(self):
       
   145         return '<urlopen error %s>' % self.reason
       
   146 
       
   147 class HTTPError(URLError, addinfourl):
       
   148     """Raised when HTTP error occurs, but also acts like non-error return"""
       
   149     __super_init = addinfourl.__init__
       
   150 
       
   151     def __init__(self, url, code, msg, hdrs, fp):
       
   152         self.code = code
       
   153         self.msg = msg
       
   154         self.hdrs = hdrs
       
   155         self.fp = fp
       
   156         self.filename = url
       
   157         # The addinfourl classes depend on fp being a valid file
       
   158         # object.  In some cases, the HTTPError may not have a valid
       
   159         # file object.  If this happens, the simplest workaround is to
       
   160         # not initialize the base classes.
       
   161         if fp is not None:
       
   162             self.__super_init(fp, hdrs, url)
       
   163 
       
   164     def __str__(self):
       
   165         return 'HTTP Error %s: %s' % (self.code, self.msg)
       
   166 
       
   167 class GopherError(URLError):
       
   168     pass
       
   169 
       
   170 # copied from cookielib.py
       
   171 _cut_port_re = re.compile(r":\d+$")
       
   172 def request_host(request):
       
   173     """Return request-host, as defined by RFC 2965.
       
   174 
       
   175     Variation from RFC: returned value is lowercased, for convenient
       
   176     comparison.
       
   177 
       
   178     """
       
   179     url = request.get_full_url()
       
   180     host = urlparse.urlparse(url)[1]
       
   181     if host == "":
       
   182         host = request.get_header("Host", "")
       
   183 
       
   184     # remove port, if present
       
   185     host = _cut_port_re.sub("", host, 1)
       
   186     return host.lower()
       
   187 
       
   188 class Request:
       
   189 
       
   190     def __init__(self, url, data=None, headers={},
       
   191                  origin_req_host=None, unverifiable=False):
       
   192         # unwrap('<URL:type://host/path>') --> 'type://host/path'
       
   193         self.__original = unwrap(url)
       
   194         self.type = None
       
   195         # self.__r_type is what's left after doing the splittype
       
   196         self.host = None
       
   197         self.port = None
       
   198         self.data = data
       
   199         self.headers = {}
       
   200         for key, value in headers.items():
       
   201             self.add_header(key, value)
       
   202         self.unredirected_hdrs = {}
       
   203         if origin_req_host is None:
       
   204             origin_req_host = request_host(self)
       
   205         self.origin_req_host = origin_req_host
       
   206         self.unverifiable = unverifiable
       
   207 
       
   208     def __getattr__(self, attr):
       
   209         # XXX this is a fallback mechanism to guard against these
       
   210         # methods getting called in a non-standard order.  this may be
       
   211         # too complicated and/or unnecessary.
       
   212         # XXX should the __r_XXX attributes be public?
       
   213         if attr[:12] == '_Request__r_':
       
   214             name = attr[12:]
       
   215             if hasattr(Request, 'get_' + name):
       
   216                 getattr(self, 'get_' + name)()
       
   217                 return getattr(self, attr)
       
   218         raise AttributeError, attr
       
   219 
       
   220     def get_method(self):
       
   221         if self.has_data():
       
   222             return "POST"
       
   223         else:
       
   224             return "GET"
       
   225 
       
   226     # XXX these helper methods are lame
       
   227 
       
   228     def add_data(self, data):
       
   229         self.data = data
       
   230 
       
   231     def has_data(self):
       
   232         return self.data is not None
       
   233 
       
   234     def get_data(self):
       
   235         return self.data
       
   236 
       
   237     def get_full_url(self):
       
   238         return self.__original
       
   239 
       
   240     def get_type(self):
       
   241         if self.type is None:
       
   242             self.type, self.__r_type = splittype(self.__original)
       
   243             if self.type is None:
       
   244                 raise ValueError, "unknown url type: %s" % self.__original
       
   245         return self.type
       
   246 
       
   247     def get_host(self):
       
   248         if self.host is None:
       
   249             self.host, self.__r_host = splithost(self.__r_type)
       
   250             if self.host:
       
   251                 self.host = unquote(self.host)
       
   252         return self.host
       
   253 
       
   254     def get_selector(self):
       
   255         return self.__r_host
       
   256 
       
   257     def set_proxy(self, host, type):
       
   258         self.host, self.type = host, type
       
   259         self.__r_host = self.__original
       
   260 
       
   261     def get_origin_req_host(self):
       
   262         return self.origin_req_host
       
   263 
       
   264     def is_unverifiable(self):
       
   265         return self.unverifiable
       
   266 
       
   267     def add_header(self, key, val):
       
   268         # useful for something like authentication
       
   269         self.headers[key.capitalize()] = val
       
   270 
       
   271     def add_unredirected_header(self, key, val):
       
   272         # will not be added to a redirected request
       
   273         self.unredirected_hdrs[key.capitalize()] = val
       
   274 
       
   275     def has_header(self, header_name):
       
   276         return (header_name in self.headers or
       
   277                 header_name in self.unredirected_hdrs)
       
   278 
       
   279     def get_header(self, header_name, default=None):
       
   280         return self.headers.get(
       
   281             header_name,
       
   282             self.unredirected_hdrs.get(header_name, default))
       
   283 
       
   284     def header_items(self):
       
   285         hdrs = self.unredirected_hdrs.copy()
       
   286         hdrs.update(self.headers)
       
   287         return hdrs.items()
       
   288 
       
   289 class OpenerDirector:
       
   290     def __init__(self):
       
   291         client_version = "Python-urllib/%s" % __version__
       
   292         self.addheaders = [('User-agent', client_version)]
       
   293         # manage the individual handlers
       
   294         self.handlers = []
       
   295         self.handle_open = {}
       
   296         self.handle_error = {}
       
   297         self.process_response = {}
       
   298         self.process_request = {}
       
   299 
       
   300     def add_handler(self, handler):
       
   301         if not hasattr(handler, "add_parent"):
       
   302             raise TypeError("expected BaseHandler instance, got %r" %
       
   303                             type(handler))
       
   304 
       
   305         added = False
       
   306         for meth in dir(handler):
       
   307             if meth in ["redirect_request", "do_open", "proxy_open"]:
       
   308                 # oops, coincidental match
       
   309                 continue
       
   310 
       
   311             i = meth.find("_")
       
   312             protocol = meth[:i]
       
   313             condition = meth[i+1:]
       
   314 
       
   315             if condition.startswith("error"):
       
   316                 j = condition.find("_") + i + 1
       
   317                 kind = meth[j+1:]
       
   318                 try:
       
   319                     kind = int(kind)
       
   320                 except ValueError:
       
   321                     pass
       
   322                 lookup = self.handle_error.get(protocol, {})
       
   323                 self.handle_error[protocol] = lookup
       
   324             elif condition == "open":
       
   325                 kind = protocol
       
   326                 lookup = self.handle_open
       
   327             elif condition == "response":
       
   328                 kind = protocol
       
   329                 lookup = self.process_response
       
   330             elif condition == "request":
       
   331                 kind = protocol
       
   332                 lookup = self.process_request
       
   333             else:
       
   334                 continue
       
   335 
       
   336             handlers = lookup.setdefault(kind, [])
       
   337             if handlers:
       
   338                 bisect.insort(handlers, handler)
       
   339             else:
       
   340                 handlers.append(handler)
       
   341             added = True
       
   342 
       
   343         if added:
       
   344             # XXX why does self.handlers need to be sorted?
       
   345             bisect.insort(self.handlers, handler)
       
   346             handler.add_parent(self)
       
   347 
       
   348     def close(self):
       
   349         # Only exists for backwards compatibility.
       
   350         pass
       
   351 
       
   352     def _call_chain(self, chain, kind, meth_name, *args):
       
   353         # Handlers raise an exception if no one else should try to handle
       
   354         # the request, or return None if they can't but another handler
       
   355         # could.  Otherwise, they return the response.
       
   356         handlers = chain.get(kind, ())
       
   357         for handler in handlers:
       
   358             func = getattr(handler, meth_name)
       
   359 
       
   360             result = func(*args)
       
   361             if result is not None:
       
   362                 return result
       
   363 
       
   364     def open(self, fullurl, data=None):
       
   365         # accept a URL or a Request object
       
   366         if isinstance(fullurl, basestring):
       
   367             req = Request(fullurl, data)
       
   368         else:
       
   369             req = fullurl
       
   370             if data is not None:
       
   371                 req.add_data(data)
       
   372 
       
   373         protocol = req.get_type()
       
   374 
       
   375         # pre-process request
       
   376         meth_name = protocol+"_request"
       
   377         for processor in self.process_request.get(protocol, []):
       
   378             meth = getattr(processor, meth_name)
       
   379             req = meth(req)
       
   380 
       
   381         response = self._open(req, data)
       
   382 
       
   383         # post-process response
       
   384         meth_name = protocol+"_response"
       
   385         for processor in self.process_response.get(protocol, []):
       
   386             meth = getattr(processor, meth_name)
       
   387             response = meth(req, response)
       
   388 
       
   389         return response
       
   390 
       
   391     def _open(self, req, data=None):
       
   392         result = self._call_chain(self.handle_open, 'default',
       
   393                                   'default_open', req)
       
   394         if result:
       
   395             return result
       
   396 
       
   397         protocol = req.get_type()
       
   398         result = self._call_chain(self.handle_open, protocol, protocol +
       
   399                                   '_open', req)
       
   400         if result:
       
   401             return result
       
   402 
       
   403         return self._call_chain(self.handle_open, 'unknown',
       
   404                                 'unknown_open', req)
       
   405 
       
   406     def error(self, proto, *args):
       
   407         if proto in ('http', 'https'):
       
   408             # XXX http[s] protocols are special-cased
       
   409             dict = self.handle_error['http'] # https is not different than http
       
   410             proto = args[2]  # YUCK!
       
   411             meth_name = 'http_error_%s' % proto
       
   412             http_err = 1
       
   413             orig_args = args
       
   414         else:
       
   415             dict = self.handle_error
       
   416             meth_name = proto + '_error'
       
   417             http_err = 0
       
   418         args = (dict, proto, meth_name) + args
       
   419         result = self._call_chain(*args)
       
   420         if result:
       
   421             return result
       
   422 
       
   423         if http_err:
       
   424             args = (dict, 'default', 'http_error_default') + orig_args
       
   425             return self._call_chain(*args)
       
   426 
       
   427 # XXX probably also want an abstract factory that knows when it makes
       
   428 # sense to skip a superclass in favor of a subclass and when it might
       
   429 # make sense to include both
       
   430 
       
   431 def build_opener(*handlers):
       
   432     """Create an opener object from a list of handlers.
       
   433 
       
   434     The opener will use several default handlers, including support
       
   435     for HTTP and FTP.
       
   436 
       
   437     If any of the handlers passed as arguments are subclasses of the
       
   438     default handlers, the default handlers will not be used.
       
   439     """
       
   440     import types
       
   441     def isclass(obj):
       
   442         return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")
       
   443 
       
   444     opener = OpenerDirector()
       
   445     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
       
   446                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
       
   447                        FTPHandler, FileHandler, HTTPErrorProcessor]
       
   448     if hasattr(httplib, 'HTTPS'):
       
   449         default_classes.append(HTTPSHandler)
       
   450     skip = []
       
   451     for klass in default_classes:
       
   452         for check in handlers:
       
   453             if isclass(check):
       
   454                 if issubclass(check, klass):
       
   455                     skip.append(klass)
       
   456             elif isinstance(check, klass):
       
   457                 skip.append(klass)
       
   458     for klass in skip:
       
   459         default_classes.remove(klass)
       
   460 
       
   461     for klass in default_classes:
       
   462         opener.add_handler(klass())
       
   463 
       
   464     for h in handlers:
       
   465         if isclass(h):
       
   466             h = h()
       
   467         opener.add_handler(h)
       
   468     return opener
       
   469 
       
   470 class BaseHandler:
       
   471     handler_order = 500
       
   472 
       
   473     def add_parent(self, parent):
       
   474         self.parent = parent
       
   475 
       
   476     def close(self):
       
   477         # Only exists for backwards compatibility
       
   478         pass
       
   479 
       
   480     def __lt__(self, other):
       
   481         if not hasattr(other, "handler_order"):
       
   482             # Try to preserve the old behavior of having custom classes
       
   483             # inserted after default ones (works only for custom user
       
   484             # classes which are not aware of handler_order).
       
   485             return True
       
   486         return self.handler_order < other.handler_order
       
   487 
       
   488 
       
   489 class HTTPErrorProcessor(BaseHandler):
       
   490     """Process HTTP error responses."""
       
   491     handler_order = 1000  # after all other processing
       
   492 
       
   493     def http_response(self, request, response):
       
   494         code, msg, hdrs = response.code, response.msg, response.info()
       
   495 
       
   496         if code not in (200, 206):
       
   497             response = self.parent.error(
       
   498                 'http', request, response, code, msg, hdrs)
       
   499 
       
   500         return response
       
   501 
       
   502     https_response = http_response
       
   503 
       
   504 class HTTPDefaultErrorHandler(BaseHandler):
       
   505     def http_error_default(self, req, fp, code, msg, hdrs):
       
   506         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
       
   507 
       
   508 class HTTPRedirectHandler(BaseHandler):
       
   509     # maximum number of redirections to any single URL
       
   510     # this is needed because of the state that cookies introduce
       
   511     max_repeats = 4
       
   512     # maximum total number of redirections (regardless of URL) before
       
   513     # assuming we're in a loop
       
   514     max_redirections = 10
       
   515 
       
   516     def redirect_request(self, req, fp, code, msg, headers, newurl):
       
   517         """Return a Request or None in response to a redirect.
       
   518 
       
   519         This is called by the http_error_30x methods when a
       
   520         redirection response is received.  If a redirection should
       
   521         take place, return a new Request to allow http_error_30x to
       
   522         perform the redirect.  Otherwise, raise HTTPError if no-one
       
   523         else should try to handle this url.  Return None if you can't
       
   524         but another Handler might.
       
   525         """
       
   526         m = req.get_method()
       
   527         if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
       
   528             or code in (301, 302, 303) and m == "POST"):
       
   529             # Strictly (according to RFC 2616), 301 or 302 in response
       
   530             # to a POST MUST NOT cause a redirection without confirmation
       
   531             # from the user (of urllib2, in this case).  In practice,
       
   532             # essentially all clients do redirect in this case, so we
       
   533             # do the same.
       
   534             # be conciliant with URIs containing a space
       
   535             newurl = newurl.replace(' ', '%20')
       
   536             return Request(newurl,
       
   537                            headers=req.headers,
       
   538                            origin_req_host=req.get_origin_req_host(),
       
   539                            unverifiable=True)
       
   540         else:
       
   541             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
       
   542 
       
   543     # Implementation note: To avoid the server sending us into an
       
   544     # infinite loop, the request object needs to track what URLs we
       
   545     # have already seen.  Do this by adding a handler-specific
       
   546     # attribute to the Request object.
       
   547     def http_error_302(self, req, fp, code, msg, headers):
       
   548         # Some servers (incorrectly) return multiple Location headers
       
   549         # (so probably same goes for URI).  Use first header.
       
   550         if 'location' in headers:
       
   551             newurl = headers.getheaders('location')[0]
       
   552         elif 'uri' in headers:
       
   553             newurl = headers.getheaders('uri')[0]
       
   554         else:
       
   555             return
       
   556         newurl = urlparse.urljoin(req.get_full_url(), newurl)
       
   557 
       
   558         # XXX Probably want to forget about the state of the current
       
   559         # request, although that might interact poorly with other
       
   560         # handlers that also use handler-specific request attributes
       
   561         new = self.redirect_request(req, fp, code, msg, headers, newurl)
       
   562         if new is None:
       
   563             return
       
   564 
       
   565         # loop detection
       
   566         # .redirect_dict has a key url if url was previously visited.
       
   567         if hasattr(req, 'redirect_dict'):
       
   568             visited = new.redirect_dict = req.redirect_dict
       
   569             if (visited.get(newurl, 0) >= self.max_repeats or
       
   570                 len(visited) >= self.max_redirections):
       
   571                 raise HTTPError(req.get_full_url(), code,
       
   572                                 self.inf_msg + msg, headers, fp)
       
   573         else:
       
   574             visited = new.redirect_dict = req.redirect_dict = {}
       
   575         visited[newurl] = visited.get(newurl, 0) + 1
       
   576 
       
   577         # Don't close the fp until we are sure that we won't use it
       
   578         # with HTTPError.
       
   579         fp.read()
       
   580         fp.close()
       
   581 
       
   582         return self.parent.open(new)
       
   583 
       
   584     http_error_301 = http_error_303 = http_error_307 = http_error_302
       
   585 
       
   586     inf_msg = "The HTTP server returned a redirect error that would " \
       
   587               "lead to an infinite loop.\n" \
       
   588               "The last 30x error message was:\n"
       
   589 
       
   590 
       
   591 def _parse_proxy(proxy):
       
   592     """Return (scheme, user, password, host/port) given a URL or an authority.
       
   593 
       
   594     If a URL is supplied, it must have an authority (host:port) component.
       
   595     According to RFC 3986, having an authority component means the URL must
       
   596     have two slashes after the scheme:
       
   597 
       
   598     >>> _parse_proxy('file:/ftp.example.com/')
       
   599     Traceback (most recent call last):
       
   600     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
       
   601 
       
   602     The first three items of the returned tuple may be None.
       
   603 
       
   604     Examples of authority parsing:
       
   605 
       
   606     >>> _parse_proxy('proxy.example.com')
       
   607     (None, None, None, 'proxy.example.com')
       
   608     >>> _parse_proxy('proxy.example.com:3128')
       
   609     (None, None, None, 'proxy.example.com:3128')
       
   610 
       
   611     The authority component may optionally include userinfo (assumed to be
       
   612     username:password):
       
   613 
       
   614     >>> _parse_proxy('joe:password@proxy.example.com')
       
   615     (None, 'joe', 'password', 'proxy.example.com')
       
   616     >>> _parse_proxy('joe:password@proxy.example.com:3128')
       
   617     (None, 'joe', 'password', 'proxy.example.com:3128')
       
   618 
       
   619     Same examples, but with URLs instead:
       
   620 
       
   621     >>> _parse_proxy('http://proxy.example.com/')
       
   622     ('http', None, None, 'proxy.example.com')
       
   623     >>> _parse_proxy('http://proxy.example.com:3128/')
       
   624     ('http', None, None, 'proxy.example.com:3128')
       
   625     >>> _parse_proxy('http://joe:password@proxy.example.com/')
       
   626     ('http', 'joe', 'password', 'proxy.example.com')
       
   627     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
       
   628     ('http', 'joe', 'password', 'proxy.example.com:3128')
       
   629 
       
   630     Everything after the authority is ignored:
       
   631 
       
   632     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
       
   633     ('ftp', 'joe', 'password', 'proxy.example.com')
       
   634 
       
   635     Test for no trailing '/' case:
       
   636 
       
   637     >>> _parse_proxy('http://joe:password@proxy.example.com')
       
   638     ('http', 'joe', 'password', 'proxy.example.com')
       
   639 
       
   640     """
       
   641     scheme, r_scheme = splittype(proxy)
       
   642     if not r_scheme.startswith("/"):
       
   643         # authority
       
   644         scheme = None
       
   645         authority = proxy
       
   646     else:
       
   647         # URL
       
   648         if not r_scheme.startswith("//"):
       
   649             raise ValueError("proxy URL with no authority: %r" % proxy)
       
   650         # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
       
   651         # and 3.3.), path is empty or starts with '/'
       
   652         end = r_scheme.find("/", 2)
       
   653         if end == -1:
       
   654             end = None
       
   655         authority = r_scheme[2:end]
       
   656     userinfo, hostport = splituser(authority)
       
   657     if userinfo is not None:
       
   658         user, password = splitpasswd(userinfo)
       
   659     else:
       
   660         user = password = None
       
   661     return scheme, user, password, hostport
       
   662 
       
   663 class ProxyHandler(BaseHandler):
       
   664     # Proxies must be in front
       
   665     handler_order = 100
       
   666 
       
   667     def __init__(self, proxies=None):
       
   668         if proxies is None:
       
   669             proxies = getproxies()
       
   670         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
       
   671         self.proxies = proxies
       
   672         for type, url in proxies.items():
       
   673             setattr(self, '%s_open' % type,
       
   674                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
       
   675                     meth(r, proxy, type))
       
   676 
       
   677     def proxy_open(self, req, proxy, type):
       
   678         orig_type = req.get_type()
       
   679         proxy_type, user, password, hostport = _parse_proxy(proxy)
       
   680         if proxy_type is None:
       
   681             proxy_type = orig_type
       
   682         if user and password:
       
   683             user_pass = '%s:%s' % (unquote(user), unquote(password))
       
   684             creds = base64.b64encode(user_pass).strip()
       
   685             req.add_header('Proxy-authorization', 'Basic ' + creds)
       
   686         hostport = unquote(hostport)
       
   687         req.set_proxy(hostport, proxy_type)
       
   688         if orig_type == proxy_type:
       
   689             # let other handlers take care of it
       
   690             return None
       
   691         else:
       
   692             # need to start over, because the other handlers don't
       
   693             # grok the proxy's URL type
       
   694             # e.g. if we have a constructor arg proxies like so:
       
   695             # {'http': 'ftp://proxy.example.com'}, we may end up turning
       
   696             # a request for http://acme.example.com/a into one for
       
   697             # ftp://proxy.example.com/a
       
   698             return self.parent.open(req)
       
   699 
       
   700 class HTTPPasswordMgr:
       
   701 
       
   702     def __init__(self):
       
   703         self.passwd = {}
       
   704 
       
   705     def add_password(self, realm, uri, user, passwd):
       
   706         # uri could be a single URI or a sequence
       
   707         if isinstance(uri, basestring):
       
   708             uri = [uri]
       
   709         if not realm in self.passwd:
       
   710             self.passwd[realm] = {}
       
   711         for default_port in True, False:
       
   712             reduced_uri = tuple(
       
   713                 [self.reduce_uri(u, default_port) for u in uri])
       
   714             self.passwd[realm][reduced_uri] = (user, passwd)
       
   715 
       
   716     def find_user_password(self, realm, authuri):
       
   717         domains = self.passwd.get(realm, {})
       
   718         for default_port in True, False:
       
   719             reduced_authuri = self.reduce_uri(authuri, default_port)
       
   720             for uris, authinfo in domains.iteritems():
       
   721                 for uri in uris:
       
   722                     if self.is_suburi(uri, reduced_authuri):
       
   723                         return authinfo
       
   724         return None, None
       
   725 
       
   726     def reduce_uri(self, uri, default_port=True):
       
   727         """Accept authority or URI and extract only the authority and path."""
       
   728         # note HTTP URLs do not have a userinfo component
       
   729         parts = urlparse.urlsplit(uri)
       
   730         if parts[1]:
       
   731             # URI
       
   732             scheme = parts[0]
       
   733             authority = parts[1]
       
   734             path = parts[2] or '/'
       
   735         else:
       
   736             # host or host:port
       
   737             scheme = None
       
   738             authority = uri
       
   739             path = '/'
       
   740         host, port = splitport(authority)
       
   741         if default_port and port is None and scheme is not None:
       
   742             dport = {"http": 80,
       
   743                      "https": 443,
       
   744                      }.get(scheme)
       
   745             if dport is not None:
       
   746                 authority = "%s:%d" % (host, dport)
       
   747         return authority, path
       
   748 
       
   749     def is_suburi(self, base, test):
       
   750         """Check if test is below base in a URI tree
       
   751 
       
   752         Both args must be URIs in reduced form.
       
   753         """
       
   754         if base == test:
       
   755             return True
       
   756         if base[0] != test[0]:
       
   757             return False
       
   758         common = posixpath.commonprefix((base[1], test[1]))
       
   759         if len(common) == len(base[1]):
       
   760             return True
       
   761         return False
       
   762 
       
   763 
       
   764 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
       
   765 
       
   766     def find_user_password(self, realm, authuri):
       
   767         user, password = HTTPPasswordMgr.find_user_password(self, realm,
       
   768                                                             authuri)
       
   769         if user is not None:
       
   770             return user, password
       
   771         return HTTPPasswordMgr.find_user_password(self, None, authuri)
       
   772 
       
   773 
       
   774 class AbstractBasicAuthHandler:
       
   775 
       
   776     # XXX this allows for multiple auth-schemes, but will stupidly pick
       
   777     # the last one with a realm specified.
       
   778 
       
   779     rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
       
   780 
       
   781     # XXX could pre-emptively send auth info already accepted (RFC 2617,
       
   782     # end of section 2, and section 1.2 immediately after "credentials"
       
   783     # production).
       
   784 
       
   785     def __init__(self, password_mgr=None):
       
   786         if password_mgr is None:
       
   787             password_mgr = HTTPPasswordMgr()
       
   788         self.passwd = password_mgr
       
   789         self.add_password = self.passwd.add_password
       
   790 
       
   791     def http_error_auth_reqed(self, authreq, host, req, headers):
       
   792         # host may be an authority (without userinfo) or a URL with an
       
   793         # authority
       
   794         # XXX could be multiple headers
       
   795         authreq = headers.get(authreq, None)
       
   796         if authreq:
       
   797             mo = AbstractBasicAuthHandler.rx.search(authreq)
       
   798             if mo:
       
   799                 scheme, realm = mo.groups()
       
   800                 if scheme.lower() == 'basic':
       
   801                     return self.retry_http_basic_auth(host, req, realm)
       
   802 
       
   803     def retry_http_basic_auth(self, host, req, realm):
       
   804         user, pw = self.passwd.find_user_password(realm, host)
       
   805         if pw is not None:
       
   806             raw = "%s:%s" % (user, pw)
       
   807             auth = 'Basic %s' % base64.b64encode(raw).strip()
       
   808             if req.headers.get(self.auth_header, None) == auth:
       
   809                 return None
       
   810             req.add_header(self.auth_header, auth)
       
   811             return self.parent.open(req)
       
   812         else:
       
   813             return None
       
   814 
       
   815 
       
   816 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
       
   817 
       
   818     auth_header = 'Authorization'
       
   819 
       
   820     def http_error_401(self, req, fp, code, msg, headers):
       
   821         url = req.get_full_url()
       
   822         return self.http_error_auth_reqed('www-authenticate',
       
   823                                           url, req, headers)
       
   824 
       
   825 
       
   826 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
       
   827 
       
   828     auth_header = 'Proxy-authorization'
       
   829 
       
   830     def http_error_407(self, req, fp, code, msg, headers):
       
   831         # http_error_auth_reqed requires that there is no userinfo component in
       
   832         # authority.  Assume there isn't one, since urllib2 does not (and
       
   833         # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
       
   834         # userinfo.
       
   835         authority = req.get_host()
       
   836         return self.http_error_auth_reqed('proxy-authenticate',
       
   837                                           authority, req, headers)
       
   838 
       
   839 
       
   840 def randombytes(n):
       
   841     """Return n random bytes."""
       
   842     # Use /dev/urandom if it is available.  Fall back to random module
       
   843     # if not.  It might be worthwhile to extend this function to use
       
   844     # other platform-specific mechanisms for getting random bytes.
       
   845     if os.path.exists("/dev/urandom"):
       
   846         f = open("/dev/urandom")
       
   847         s = f.read(n)
       
   848         f.close()
       
   849         return s
       
   850     else:
       
   851         L = [chr(random.randrange(0, 256)) for i in range(n)]
       
   852         return "".join(L)
       
   853 
       
   854 class AbstractDigestAuthHandler:
       
   855     # Digest authentication is specified in RFC 2617.
       
   856 
       
   857     # XXX The client does not inspect the Authentication-Info header
       
   858     # in a successful response.
       
   859 
       
   860     # XXX It should be possible to test this implementation against
       
   861     # a mock server that just generates a static set of challenges.
       
   862 
       
   863     # XXX qop="auth-int" supports is shaky
       
   864 
       
   865     def __init__(self, passwd=None):
       
   866         if passwd is None:
       
   867             passwd = HTTPPasswordMgr()
       
   868         self.passwd = passwd
       
   869         self.add_password = self.passwd.add_password
       
   870         self.retried = 0
       
   871         self.nonce_count = 0
       
   872 
       
   873     def reset_retry_count(self):
       
   874         self.retried = 0
       
   875 
       
   876     def http_error_auth_reqed(self, auth_header, host, req, headers):
       
   877         authreq = headers.get(auth_header, None)
       
   878         if self.retried > 5:
       
   879             # Don't fail endlessly - if we failed once, we'll probably
       
   880             # fail a second time. Hm. Unless the Password Manager is
       
   881             # prompting for the information. Crap. This isn't great
       
   882             # but it's better than the current 'repeat until recursion
       
   883             # depth exceeded' approach <wink>
       
   884             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
       
   885                             headers, None)
       
   886         else:
       
   887             self.retried += 1
       
   888         if authreq:
       
   889             scheme = authreq.split()[0]
       
   890             if scheme.lower() == 'digest':
       
   891                 return self.retry_http_digest_auth(req, authreq)
       
   892 
       
   893     def retry_http_digest_auth(self, req, auth):
       
   894         token, challenge = auth.split(' ', 1)
       
   895         chal = parse_keqv_list(parse_http_list(challenge))
       
   896         auth = self.get_authorization(req, chal)
       
   897         if auth:
       
   898             auth_val = 'Digest %s' % auth
       
   899             if req.headers.get(self.auth_header, None) == auth_val:
       
   900                 return None
       
   901             req.add_unredirected_header(self.auth_header, auth_val)
       
   902             resp = self.parent.open(req)
       
   903             return resp
       
   904 
       
   905     def get_cnonce(self, nonce):
       
   906         # The cnonce-value is an opaque
       
   907         # quoted string value provided by the client and used by both client
       
   908         # and server to avoid chosen plaintext attacks, to provide mutual
       
   909         # authentication, and to provide some message integrity protection.
       
   910         # This isn't a fabulous effort, but it's probably Good Enough.
       
   911         dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
       
   912                                             randombytes(8))).hexdigest()
       
   913         return dig[:16]
       
   914 
       
   915     def get_authorization(self, req, chal):
       
   916         try:
       
   917             realm = chal['realm']
       
   918             nonce = chal['nonce']
       
   919             qop = chal.get('qop')
       
   920             algorithm = chal.get('algorithm', 'MD5')
       
   921             # mod_digest doesn't send an opaque, even though it isn't
       
   922             # supposed to be optional
       
   923             opaque = chal.get('opaque', None)
       
   924         except KeyError:
       
   925             return None
       
   926 
       
   927         H, KD = self.get_algorithm_impls(algorithm)
       
   928         if H is None:
       
   929             return None
       
   930 
       
   931         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
       
   932         if user is None:
       
   933             return None
       
   934 
       
   935         # XXX not implemented yet
       
   936         if req.has_data():
       
   937             entdig = self.get_entity_digest(req.get_data(), chal)
       
   938         else:
       
   939             entdig = None
       
   940 
       
   941         A1 = "%s:%s:%s" % (user, realm, pw)
       
   942         A2 = "%s:%s" % (req.get_method(),
       
   943                         # XXX selector: what about proxies and full urls
       
   944                         req.get_selector())
       
   945         if qop == 'auth':
       
   946             self.nonce_count += 1
       
   947             ncvalue = '%08x' % self.nonce_count
       
   948             cnonce = self.get_cnonce(nonce)
       
   949             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
       
   950             respdig = KD(H(A1), noncebit)
       
   951         elif qop is None:
       
   952             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
       
   953         else:
       
   954             # XXX handle auth-int.
       
   955             raise URLError("qop '%s' is not supported." % qop)
       
   956 
       
   957         # XXX should the partial digests be encoded too?
       
   958 
       
   959         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
       
   960                'response="%s"' % (user, realm, nonce, req.get_selector(),
       
   961                                   respdig)
       
   962         if opaque:
       
   963             base += ', opaque="%s"' % opaque
       
   964         if entdig:
       
   965             base += ', digest="%s"' % entdig
       
   966         base += ', algorithm="%s"' % algorithm
       
   967         if qop:
       
   968             base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
       
   969         return base
       
   970 
       
   971     def get_algorithm_impls(self, algorithm):
       
   972         # lambdas assume digest modules are imported at the top level
       
   973         if algorithm == 'MD5':
       
   974             H = lambda x: hashlib.md5(x).hexdigest()
       
   975         elif algorithm == 'SHA':
       
   976             H = lambda x: hashlib.sha1(x).hexdigest()
       
   977         # XXX MD5-sess
       
   978         KD = lambda s, d: H("%s:%s" % (s, d))
       
   979         return H, KD
       
   980 
       
   981     def get_entity_digest(self, data, chal):
       
   982         # XXX not implemented yet
       
   983         return None
       
   984 
       
   985 
       
   986 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
       
   987     """An authentication protocol defined by RFC 2069
       
   988 
       
   989     Digest authentication improves on basic authentication because it
       
   990     does not transmit passwords in the clear.
       
   991     """
       
   992 
       
   993     auth_header = 'Authorization'
       
   994     handler_order = 490  # before Basic auth
       
   995 
       
   996     def http_error_401(self, req, fp, code, msg, headers):
       
   997         host = urlparse.urlparse(req.get_full_url())[1]
       
   998         retry = self.http_error_auth_reqed('www-authenticate',
       
   999                                            host, req, headers)
       
  1000         self.reset_retry_count()
       
  1001         return retry
       
  1002 
       
  1003 
       
  1004 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
       
  1005 
       
  1006     auth_header = 'Proxy-Authorization'
       
  1007     handler_order = 490  # before Basic auth
       
  1008 
       
  1009     def http_error_407(self, req, fp, code, msg, headers):
       
  1010         host = req.get_host()
       
  1011         retry = self.http_error_auth_reqed('proxy-authenticate',
       
  1012                                            host, req, headers)
       
  1013         self.reset_retry_count()
       
  1014         return retry
       
  1015 
       
  1016 class AbstractHTTPHandler(BaseHandler):
       
  1017 
       
  1018     def __init__(self, debuglevel=0):
       
  1019         self._debuglevel = debuglevel
       
  1020 
       
  1021     def set_http_debuglevel(self, level):
       
  1022         self._debuglevel = level
       
  1023 
       
  1024     def do_request_(self, request):
       
  1025         host = request.get_host()
       
  1026         if not host:
       
  1027             raise URLError('no host given')
       
  1028 
       
  1029         if request.has_data():  # POST
       
  1030             data = request.get_data()
       
  1031             if not request.has_header('Content-type'):
       
  1032                 request.add_unredirected_header(
       
  1033                     'Content-type',
       
  1034                     'application/x-www-form-urlencoded')
       
  1035             if not request.has_header('Content-length'):
       
  1036                 request.add_unredirected_header(
       
  1037                     'Content-length', '%d' % len(data))
       
  1038 
       
  1039         scheme, sel = splittype(request.get_selector())
       
  1040         sel_host, sel_path = splithost(sel)
       
  1041         if not request.has_header('Host'):
       
  1042             request.add_unredirected_header('Host', sel_host or host)
       
  1043         for name, value in self.parent.addheaders:
       
  1044             name = name.capitalize()
       
  1045             if not request.has_header(name):
       
  1046                 request.add_unredirected_header(name, value)
       
  1047 
       
  1048         return request
       
  1049 
       
  1050     def do_open(self, http_class, req):
       
  1051         """Return an addinfourl object for the request, using http_class.
       
  1052 
       
  1053         http_class must implement the HTTPConnection API from httplib.
       
  1054         The addinfourl return value is a file-like object.  It also
       
  1055         has methods and attributes including:
       
  1056             - info(): return a mimetools.Message object for the headers
       
  1057             - geturl(): return the original request URL
       
  1058             - code: HTTP status code
       
  1059         """
       
  1060         host = req.get_host()
       
  1061         if not host:
       
  1062             raise URLError('no host given')
       
  1063 
       
  1064         h = http_class(host) # will parse host:port
       
  1065         h.set_debuglevel(self._debuglevel)
       
  1066 
       
  1067         headers = dict(req.headers)
       
  1068         headers.update(req.unredirected_hdrs)
       
  1069         # We want to make an HTTP/1.1 request, but the addinfourl
       
  1070         # class isn't prepared to deal with a persistent connection.
       
  1071         # It will try to read all remaining data from the socket,
       
  1072         # which will block while the server waits for the next request.
       
  1073         # So make sure the connection gets closed after the (only)
       
  1074         # request.
       
  1075         headers["Connection"] = "close"
       
  1076         headers = dict(
       
  1077             (name.title(), val) for name, val in headers.items())
       
  1078         try:
       
  1079             h.request(req.get_method(), req.get_selector(), req.data, headers)
       
  1080             r = h.getresponse()
       
  1081         except socket.error, err: # XXX what error?
       
  1082             raise URLError(err)
       
  1083 
       
  1084         # Pick apart the HTTPResponse object to get the addinfourl
       
  1085         # object initialized properly.
       
  1086 
       
  1087         # Wrap the HTTPResponse object in socket's file object adapter
       
  1088         # for Windows.  That adapter calls recv(), so delegate recv()
       
  1089         # to read().  This weird wrapping allows the returned object to
       
  1090         # have readline() and readlines() methods.
       
  1091 
       
  1092         # XXX It might be better to extract the read buffering code
       
  1093         # out of socket._fileobject() and into a base class.
       
  1094 
       
  1095         r.recv = r.read
       
  1096         fp = socket._fileobject(r, close=True)
       
  1097 
       
  1098         resp = addinfourl(fp, r.msg, req.get_full_url())
       
  1099         resp.code = r.status
       
  1100         resp.msg = r.reason
       
  1101         return resp
       
  1102 
       
  1103 
       
  1104 class HTTPHandler(AbstractHTTPHandler):
       
  1105 
       
  1106     def http_open(self, req):
       
  1107         return self.do_open(httplib.HTTPConnection, req)
       
  1108 
       
  1109     http_request = AbstractHTTPHandler.do_request_
       
  1110 
       
  1111 if hasattr(httplib, 'HTTPS'):
       
  1112     class HTTPSHandler(AbstractHTTPHandler):
       
  1113 
       
  1114         def https_open(self, req):
       
  1115             return self.do_open(httplib.HTTPSConnection, req)
       
  1116 
       
  1117         https_request = AbstractHTTPHandler.do_request_
       
  1118 
       
  1119 class HTTPCookieProcessor(BaseHandler):
       
  1120     def __init__(self, cookiejar=None):
       
  1121         import cookielib
       
  1122         if cookiejar is None:
       
  1123             cookiejar = cookielib.CookieJar()
       
  1124         self.cookiejar = cookiejar
       
  1125 
       
  1126     def http_request(self, request):
       
  1127         self.cookiejar.add_cookie_header(request)
       
  1128         return request
       
  1129 
       
  1130     def http_response(self, request, response):
       
  1131         self.cookiejar.extract_cookies(response, request)
       
  1132         return response
       
  1133 
       
  1134     https_request = http_request
       
  1135     https_response = http_response
       
  1136 
       
  1137 class UnknownHandler(BaseHandler):
       
  1138     def unknown_open(self, req):
       
  1139         type = req.get_type()
       
  1140         raise URLError('unknown url type: %s' % type)
       
  1141 
       
  1142 def parse_keqv_list(l):
       
  1143     """Parse list of key=value strings where keys are not duplicated."""
       
  1144     parsed = {}
       
  1145     for elt in l:
       
  1146         k, v = elt.split('=', 1)
       
  1147         if v[0] == '"' and v[-1] == '"':
       
  1148             v = v[1:-1]
       
  1149         parsed[k] = v
       
  1150     return parsed
       
  1151 
       
  1152 def parse_http_list(s):
       
  1153     """Parse lists as described by RFC 2068 Section 2.
       
  1154 
       
  1155     In particular, parse comma-separated lists where the elements of
       
  1156     the list may include quoted-strings.  A quoted-string could
       
  1157     contain a comma.  A non-quoted string could have quotes in the
       
  1158     middle.  Neither commas nor quotes count if they are escaped.
       
  1159     Only double-quotes count, not single-quotes.
       
  1160     """
       
  1161     res = []
       
  1162     part = ''
       
  1163 
       
  1164     escape = quote = False
       
  1165     for cur in s:
       
  1166         if escape:
       
  1167             part += cur
       
  1168             escape = False
       
  1169             continue
       
  1170         if quote:
       
  1171             if cur == '\\':
       
  1172                 escape = True
       
  1173                 continue
       
  1174             elif cur == '"':
       
  1175                 quote = False
       
  1176             part += cur
       
  1177             continue
       
  1178 
       
  1179         if cur == ',':
       
  1180             res.append(part)
       
  1181             part = ''
       
  1182             continue
       
  1183 
       
  1184         if cur == '"':
       
  1185             quote = True
       
  1186 
       
  1187         part += cur
       
  1188 
       
  1189     # append last part
       
  1190     if part:
       
  1191         res.append(part)
       
  1192 
       
  1193     return [part.strip() for part in res]
       
  1194 
       
  1195 class FileHandler(BaseHandler):
       
  1196     # Use local file or FTP depending on form of URL
       
  1197     def file_open(self, req):
       
  1198         url = req.get_selector()
       
  1199         if url[:2] == '//' and url[2:3] != '/':
       
  1200             req.type = 'ftp'
       
  1201             return self.parent.open(req)
       
  1202         else:
       
  1203             return self.open_local_file(req)
       
  1204 
       
  1205     # names for the localhost
       
  1206     names = None
       
  1207     def get_names(self):
       
  1208         if FileHandler.names is None:
       
  1209             try:
       
  1210                 FileHandler.names = (socket.gethostbyname('localhost'),
       
  1211                                     socket.gethostbyname(socket.gethostname()))
       
  1212             except socket.gaierror:
       
  1213                 FileHandler.names = (socket.gethostbyname('localhost'),)
       
  1214         return FileHandler.names
       
  1215 
       
  1216     # not entirely sure what the rules are here
       
  1217     def open_local_file(self, req):
       
  1218         import email.Utils
       
  1219         import mimetypes
       
  1220         host = req.get_host()
       
  1221         file = req.get_selector()
       
  1222         localfile = url2pathname(file)
       
  1223         stats = os.stat(localfile)
       
  1224         size = stats.st_size
       
  1225         modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
       
  1226         mtype = mimetypes.guess_type(file)[0]
       
  1227         headers = mimetools.Message(StringIO(
       
  1228             'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
       
  1229             (mtype or 'text/plain', size, modified)))
       
  1230         if host:
       
  1231             host, port = splitport(host)
       
  1232         if not host or \
       
  1233            (not port and socket.gethostbyname(host) in self.get_names()):
       
  1234             return addinfourl(open(localfile, 'rb'),
       
  1235                               headers, 'file:'+file)
       
  1236         raise URLError('file not on local host')
       
  1237 
       
  1238 class FTPHandler(BaseHandler):
       
  1239     def ftp_open(self, req):
       
  1240         import ftplib
       
  1241         import mimetypes
       
  1242         host = req.get_host()
       
  1243         if not host:
       
  1244             raise IOError, ('ftp error', 'no host given')
       
  1245         host, port = splitport(host)
       
  1246         if port is None:
       
  1247             port = ftplib.FTP_PORT
       
  1248         else:
       
  1249             port = int(port)
       
  1250 
       
  1251         # username/password handling
       
  1252         user, host = splituser(host)
       
  1253         if user:
       
  1254             user, passwd = splitpasswd(user)
       
  1255         else:
       
  1256             passwd = None
       
  1257         host = unquote(host)
       
  1258         user = unquote(user or '')
       
  1259         passwd = unquote(passwd or '')
       
  1260 
       
  1261         try:
       
  1262             host = socket.gethostbyname(host)
       
  1263         except socket.error, msg:
       
  1264             raise URLError(msg)
       
  1265         path, attrs = splitattr(req.get_selector())
       
  1266         dirs = path.split('/')
       
  1267         dirs = map(unquote, dirs)
       
  1268         dirs, file = dirs[:-1], dirs[-1]
       
  1269         if dirs and not dirs[0]:
       
  1270             dirs = dirs[1:]
       
  1271         try:
       
  1272             fw = self.connect_ftp(user, passwd, host, port, dirs)
       
  1273             type = file and 'I' or 'D'
       
  1274             for attr in attrs:
       
  1275                 attr, value = splitvalue(attr)
       
  1276                 if attr.lower() == 'type' and \
       
  1277                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
       
  1278                     type = value.upper()
       
  1279             fp, retrlen = fw.retrfile(file, type)
       
  1280             headers = ""
       
  1281             mtype = mimetypes.guess_type(req.get_full_url())[0]
       
  1282             if mtype:
       
  1283                 headers += "Content-type: %s\n" % mtype
       
  1284             if retrlen is not None and retrlen >= 0:
       
  1285                 headers += "Content-length: %d\n" % retrlen
       
  1286             sf = StringIO(headers)
       
  1287             headers = mimetools.Message(sf)
       
  1288             return addinfourl(fp, headers, req.get_full_url())
       
  1289         except ftplib.all_errors, msg:
       
  1290             raise IOError, ('ftp error', msg), sys.exc_info()[2]
       
  1291 
       
  1292     def connect_ftp(self, user, passwd, host, port, dirs):
       
  1293         fw = ftpwrapper(user, passwd, host, port, dirs)
       
  1294 ##        fw.ftp.set_debuglevel(1)
       
  1295         return fw
       
  1296 
       
  1297 class CacheFTPHandler(FTPHandler):
       
  1298     # XXX would be nice to have pluggable cache strategies
       
  1299     # XXX this stuff is definitely not thread safe
       
  1300     def __init__(self):
       
  1301         self.cache = {}
       
  1302         self.timeout = {}
       
  1303         self.soonest = 0
       
  1304         self.delay = 60
       
  1305         self.max_conns = 16
       
  1306 
       
  1307     def setTimeout(self, t):
       
  1308         self.delay = t
       
  1309 
       
  1310     def setMaxConns(self, m):
       
  1311         self.max_conns = m
       
  1312 
       
  1313     def connect_ftp(self, user, passwd, host, port, dirs):
       
  1314         key = user, host, port, '/'.join(dirs)
       
  1315         if key in self.cache:
       
  1316             self.timeout[key] = time.time() + self.delay
       
  1317         else:
       
  1318             self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
       
  1319             self.timeout[key] = time.time() + self.delay
       
  1320         self.check_cache()
       
  1321         return self.cache[key]
       
  1322 
       
  1323     def check_cache(self):
       
  1324         # first check for old ones
       
  1325         t = time.time()
       
  1326         if self.soonest <= t:
       
  1327             for k, v in self.timeout.items():
       
  1328                 if v < t:
       
  1329                     self.cache[k].close()
       
  1330                     del self.cache[k]
       
  1331                     del self.timeout[k]
       
  1332         self.soonest = min(self.timeout.values())
       
  1333 
       
  1334         # then check the size
       
  1335         if len(self.cache) == self.max_conns:
       
  1336             for k, v in self.timeout.items():
       
  1337                 if v == self.soonest:
       
  1338                     del self.cache[k]
       
  1339                     del self.timeout[k]
       
  1340                     break
       
  1341             self.soonest = min(self.timeout.values())
       
  1342 
       
  1343 class GopherHandler(BaseHandler):
       
  1344     def gopher_open(self, req):
       
  1345         # XXX can raise socket.error
       
  1346         import gopherlib  # this raises DeprecationWarning in 2.5
       
  1347         host = req.get_host()
       
  1348         if not host:
       
  1349             raise GopherError('no host given')
       
  1350         host = unquote(host)
       
  1351         selector = req.get_selector()
       
  1352         type, selector = splitgophertype(selector)
       
  1353         selector, query = splitquery(selector)
       
  1354         selector = unquote(selector)
       
  1355         if query:
       
  1356             query = unquote(query)
       
  1357             fp = gopherlib.send_query(selector, query, host)
       
  1358         else:
       
  1359             fp = gopherlib.send_selector(selector, host)
       
  1360         return addinfourl(fp, noheaders(), req.get_full_url())