|
1 """An extensible library for opening URLs using a variety of protocols |
|
2 |
|
3 The simplest way to use this module is to call the urlopen function, |
|
4 which accepts a string containing a URL or a Request object (described |
|
5 below). It opens the URL and returns the results as a file-like |
|
6 object; the returned object has some extra methods described below. |
|
7 |
|
8 The OpenerDirector manages a collection of Handler objects that do |
|
9 all the actual work. Each Handler implements a particular protocol or |
|
10 option. The OpenerDirector is a composite object that invokes the |
|
11 Handlers needed to open the requested URL. For example, the |
|
12 HTTPHandler performs HTTP GET and POST requests and deals with |
|
13 non-error returns. The HTTPRedirectHandler automatically deals with |
|
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler |
|
15 deals with digest authentication. |
|
16 |
|
17 urlopen(url, data=None) -- basic usage is the same as original |
|
18 urllib. pass the url and optionally data to post to an HTTP URL, and |
|
19 get a file-like object back. One difference is that you can also pass |
|
20 a Request instance instead of URL. Raises a URLError (subclass of |
|
21 IOError); for HTTP errors, raises an HTTPError, which can also be |
|
22 treated as a valid response. |
|
23 |
|
24 build_opener -- function that creates a new OpenerDirector instance. |
|
25 will install the default handlers. accepts one or more Handlers as |
|
26 arguments, either instances or Handler classes that it will |
|
27 instantiate. if one of the arguments is a subclass of the default |
|
28 handler, the argument will be installed instead of the default. |
|
29 |
|
30 install_opener -- installs a new opener as the default opener. |
|
31 |
|
32 objects of interest: |
|
33 OpenerDirector -- |
|
34 |
|
35 Request -- an object that encapsulates the state of a request. the |
|
36 state can be as simple as the URL. it can also include extra HTTP |
|
37 headers, e.g. a User-Agent. |
|
38 |
|
39 BaseHandler -- |
|
40 |
|
41 exceptions: |
|
42 URLError-- a subclass of IOError, individual protocols have their own |
|
43 specific subclass |
|
44 |
|
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error |
|
46 as an exceptional event or valid response |
|
47 |
|
48 internals: |
|
49 BaseHandler and parent |
|
50 _call_chain conventions |
|
51 |
|
52 Example usage: |
|
53 |
|
54 import urllib2 |
|
55 |
|
56 # set up authentication info |
|
57 authinfo = urllib2.HTTPBasicAuthHandler() |
|
58 authinfo.add_password(realm='PDQ Application', |
|
59 uri='https://mahler:8092/site-updates.py', |
|
60 user='klem', |
|
61 passwd='geheim$parole') |
|
62 |
|
63 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"}) |
|
64 |
|
65 # build a new opener that adds authentication and caching FTP handlers |
|
66 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler) |
|
67 |
|
68 # install it |
|
69 urllib2.install_opener(opener) |
|
70 |
|
71 f = urllib2.urlopen('http://www.python.org/') |
|
72 |
|
73 |
|
74 """ |
|
75 |
|
76 # XXX issues: |
|
77 # If an authentication error handler tries to perform |
|
78 # authentication for some reason but fails, how should the error be |
|
79 # signalled? The client needs to know the HTTP error code. But if |
|
80 # the handler knows that the problem was, e.g., that it didn't know |
|
81 # the hash algo that was requested in the challenge, it would be good to |
|
82 # pass that information along to the client, too. |
|
83 # ftp errors aren't handled cleanly |
|
84 # check digest against correct (i.e. non-apache) implementation |
|
85 |
|
86 # Possible extensions: |
|
87 # complex proxies XXX not sure what exactly was meant by this |
|
88 # abstract factory for opener |
|
89 |
|
90 import base64 |
|
91 import hashlib |
|
92 import httplib |
|
93 import mimetools |
|
94 import os |
|
95 import posixpath |
|
96 import random |
|
97 import re |
|
98 import socket |
|
99 import sys |
|
100 import time |
|
101 import urlparse |
|
102 import bisect |
|
103 |
|
104 try: |
|
105 from cStringIO import StringIO |
|
106 except ImportError: |
|
107 from StringIO import StringIO |
|
108 |
|
109 from urllib import (unwrap, unquote, splittype, splithost, quote, |
|
110 addinfourl, splitport, splitgophertype, splitquery, |
|
111 splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue) |
|
112 |
|
113 # support for FileHandler, proxies via environment variables |
|
114 from urllib import localhost, url2pathname, getproxies |
|
115 |
|
# Version string advertised in the User-Agent header.  It is built from
# sys.version_info instead of slicing sys.version so that a two-digit minor
# version (e.g. "3.10") is not truncated to "3.1".
__version__ = '%d.%d' % sys.version_info[:2]
|
118 |
|
# Module-wide default opener.  It is created lazily by urlopen() and can be
# replaced through install_opener().
_opener = None
def urlopen(url, data=None):
    """Open url with the module-wide default opener and return the result.

    url and data are handed unchanged to the opener's open() method.  The
    default opener is created with build_opener() the first time it is
    needed.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
|
125 |
|
def install_opener(opener):
    """Store opener as the module-wide default held in _opener."""
    globals()['_opener'] = opener
|
129 |
|
130 # do these error classes make sense? |
|
131 # make sure all of the IOError stuff is overridden. we just want to be |
|
132 # subtypes. |
|
133 |
|
class URLError(IOError):
    """IOError subtype carrying a single `reason`.

    Only the subtype relationship with IOError is wanted, not its
    implementation, so both __init__ and __str__ are overridden.  args is
    still populated for compatibility with other EnvironmentError
    subclasses, but it holds just the reason rather than the usual
    (errno, strerror) pair.
    """

    def __init__(self, reason):
        self.reason = reason
        self.args = (reason,)

    def __str__(self):
        return '<urlopen error %s>' % self.reason
|
146 |
|
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.filename = url
        self.code, self.msg = code, msg
        self.hdrs, self.fp = hdrs, fp
        # addinfourl depends on fp being a valid file object.  An HTTPError
        # may be created without one; in that case the base class is simply
        # left uninitialised.
        if fp is None:
            return
        self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)
|
166 |
|
class GopherError(URLError):
    """URLError subclass that adds no behavior of its own."""
|
169 |
|
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of the request's full
    URL; when that part is empty the request's "Host" header is used
    instead.  A trailing ":port" is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # strip the port, if present, then normalise case
    return _cut_port_re.sub("", netloc, 1).lower()
|
187 |
|
class Request:
    """State of one request: the URL, an optional body and the headers.

    The URL is split lazily: get_type() and get_host() parse it on first
    use and cache the pieces in private __r_* attributes.  Header names are
    stored capitalize()d; the lookup methods do not normalise the name they
    are given.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # NOTE: the shared default dict is only iterated here, never mutated.
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        # type/host are filled in lazily; self.__r_type is what is left
        # after doing the splittype
        self.type = None
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        for name, value in headers.items():
            self.add_header(name, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        prefix = '_Request__r_'
        if attr.startswith(prefix):
            getter = 'get_' + attr[len(prefix):]
            if hasattr(Request, getter):
                # the getter is expected to populate the attribute
                getattr(self, getter)()
                return getattr(self, attr)
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" when a body is set, "GET" otherwise."""
        if not self.has_data():
            return "GET"
        return "POST"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        """Return the URL scheme; raise ValueError if the URL has none."""
        if self.type is not None:
            return self.type
        self.type, self.__r_type = splittype(self.__original)
        if self.type is None:
            raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the (unquoted) host part, splitting it off on first use."""
        if self.host is not None:
            return self.host
        self.host, self.__r_host = splithost(self.__r_type)
        if self.host:
            self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        return self.__r_host

    def set_proxy(self, host, type):
        """Point the request at a proxy; the selector becomes the full URL."""
        self.host = host
        self.type = type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        for table in (self.headers, self.unredirected_hdrs):
            if header_name in table:
                return True
        return False

    def get_header(self, header_name, default=None):
        # regular headers win over unredirected ones
        fallback = self.unredirected_hdrs.get(header_name, default)
        return self.headers.get(header_name, fallback)

    def header_items(self):
        """Return (name, value) pairs of both header sets combined."""
        merged = dict(self.unredirected_hdrs)
        merged.update(self.headers)
        return merged.items()
|
288 |
|
class OpenerDirector:
    """Dispatch requests to registered handlers by method-name convention.

    Handlers are indexed by what their method names advertise:
    <protocol>_open, <protocol>_error[_<kind>], <protocol>_request and
    <protocol>_response.
    """

    def __init__(self):
        self.addheaders = [('User-agent', "Python-urllib/%s" % __version__)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register handler under every protocol/condition its methods name.

        Raises TypeError when handler has no add_parent attribute.  A
        handler with no recognised method is not registered at all.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        # condition suffix -> registry, for everything except errors
        simple_tables = {
            "open": self.handle_open,
            "response": self.process_response,
            "request": self.process_request,
        }

        registered = False
        for name in dir(handler):
            if name in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            split_at = name.find("_")
            protocol = name[:split_at]
            condition = name[split_at + 1:]

            if condition.startswith("error"):
                # whatever follows "<protocol>_error_" is the kind; numeric
                # kinds (HTTP status codes) are stored as ints
                kind_at = condition.find("_") + split_at + 1
                kind = name[kind_at + 1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                table = self.handle_error.setdefault(protocol, {})
            elif condition in simple_tables:
                kind = protocol
                table = simple_tables[condition]
            else:
                continue

            chain = table.setdefault(kind, [])
            if not chain:
                chain.append(handler)
            else:
                bisect.insort(chain, handler)
            registered = True

        if registered:
            # XXX why does self.handlers need to be sorted?
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            result = getattr(handler, meth_name)(*args)
            if result is not None:
                return result
        return None

    def open(self, fullurl, data=None):
        """Open fullurl (a URL string or a Request) and return the response.

        The request is run through the <protocol>_request processors,
        opened, and the response is then run through the
        <protocol>_response processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        protocol = req.get_type()

        # pre-process request
        request_hook = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            response = getattr(processor, response_hook)(req, response)

        return response

    def _open(self, req, data=None):
        # try, in order: the 'default' openers, the protocol's own openers,
        # and finally the 'unknown' openers
        opened = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if opened:
            return opened

        protocol = req.get_type()
        opened = self._call_chain(self.handle_open, protocol,
                                  protocol + '_open', req)
        if opened:
            return opened

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the handlers registered for it.

        For 'http' and 'https' the third positional argument is used as
        the error kind, and 'http_error_default' is tried when no specific
        handler returns a true result.
        """
        is_http = proto in ('http', 'https')
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
        else:
            table = self.handle_error
            meth_name = proto + '_error'

        result = self._call_chain(table, proto, meth_name, *args)
        if result:
            return result

        if is_http:
            return self._call_chain(table, 'default',
                                    'http_error_default', *args)
|
426 |
|
427 # XXX probably also want an abstract factory that knows when it makes |
|
428 # sense to skip a superclass in favor of a subclass and when it might |
|
429 # make sense to include both |
|
430 |
|
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    Each argument may be a handler instance or a handler class (which is
    instantiated with no arguments).  If any of the handlers passed as
    arguments are subclasses (or instances of subclasses) of a default
    handler, that default handler will not be used.

    Returns the populated OpenerDirector.
    """
    import types
    def isclass(obj):
        return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")

    def overrides(check, klass):
        # True when the user-supplied handler stands in for default klass
        if isclass(check):
            return issubclass(check, klass)
        return isinstance(check, klass)

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    # Filter the defaults in a single pass.  Collecting one entry per match
    # and calling list.remove() for each would raise ValueError as soon as
    # two supplied handlers override the same default class.
    kept = []
    for klass in default_classes:
        for check in handlers:
            if overrides(check, klass):
                break
        else:
            kept.append(klass)

    for klass in kept:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
|
469 |
|
class BaseHandler:
    """Common base for handlers: parent bookkeeping and ordering."""

    # position used by __lt__ when handlers are sorted; lower sorts first
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
|
487 |
|
488 |
|
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        """Return response, or the result of error handling for it.

        Responses whose status code is outside the 2xx range are handed to
        the parent's error() machinery, and whatever that returns replaces
        the response.
        """
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, every "2xx" code indicates that the
        # request was successfully received, understood and accepted, so
        # none of them (not just 200 and 206) is treated as an error.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
|
503 |
|
class HTTPDefaultErrorHandler(BaseHandler):
    """Turn an HTTP error into a raised HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
|
507 |
|
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop protection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            # The new request is created without a body, so headers that
            # describe the original body must not be carried over.
            newheaders = dict((k, v) for k, v in req.headers.items()
                              if k.lower() not in ("content-length",
                                                   "content-type"))
            return Request(newurl,
                           headers=newheaders,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow the redirect named by the Location (or URI) header.

        Returns the response of the redirected request, or None when there
        is no target or redirect_request() declines.  Raises HTTPError when
        the target scheme is not allowed or a redirect loop is detected.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # The redirect target is chosen by the server, so only follow it
        # to http, https or ftp URLs; anything else (e.g. file:) is
        # refused rather than opened on the server's say-so.
        newurl_lower = newurl.lower()
        if not (newurl_lower.startswith('http://') or
                newurl_lower.startswith('https://') or
                newurl_lower.startswith('ftp://')):
            raise HTTPError(newurl, code,
                            msg + " - Redirection to url '%s' is not allowed" %
                            newurl,
                            headers, fp)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
|
589 |
|
590 |
|
591 def _parse_proxy(proxy): |
|
592 """Return (scheme, user, password, host/port) given a URL or an authority. |
|
593 |
|
594 If a URL is supplied, it must have an authority (host:port) component. |
|
595 According to RFC 3986, having an authority component means the URL must |
|
596 have two slashes after the scheme: |
|
597 |
|
598 >>> _parse_proxy('file:/ftp.example.com/') |
|
599 Traceback (most recent call last): |
|
600 ValueError: proxy URL with no authority: 'file:/ftp.example.com/' |
|
601 |
|
602 The first three items of the returned tuple may be None. |
|
603 |
|
604 Examples of authority parsing: |
|
605 |
|
606 >>> _parse_proxy('proxy.example.com') |
|
607 (None, None, None, 'proxy.example.com') |
|
608 >>> _parse_proxy('proxy.example.com:3128') |
|
609 (None, None, None, 'proxy.example.com:3128') |
|
610 |
|
611 The authority component may optionally include userinfo (assumed to be |
|
612 username:password): |
|
613 |
|
614 >>> _parse_proxy('joe:password@proxy.example.com') |
|
615 (None, 'joe', 'password', 'proxy.example.com') |
|
616 >>> _parse_proxy('joe:password@proxy.example.com:3128') |
|
617 (None, 'joe', 'password', 'proxy.example.com:3128') |
|
618 |
|
619 Same examples, but with URLs instead: |
|
620 |
|
621 >>> _parse_proxy('http://proxy.example.com/') |
|
622 ('http', None, None, 'proxy.example.com') |
|
623 >>> _parse_proxy('http://proxy.example.com:3128/') |
|
624 ('http', None, None, 'proxy.example.com:3128') |
|
625 >>> _parse_proxy('http://joe:password@proxy.example.com/') |
|
626 ('http', 'joe', 'password', 'proxy.example.com') |
|
627 >>> _parse_proxy('http://joe:password@proxy.example.com:3128') |
|
628 ('http', 'joe', 'password', 'proxy.example.com:3128') |
|
629 |
|
630 Everything after the authority is ignored: |
|
631 |
|
632 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') |
|
633 ('ftp', 'joe', 'password', 'proxy.example.com') |
|
634 |
|
635 Test for no trailing '/' case: |
|
636 |
|
637 >>> _parse_proxy('http://joe:password@proxy.example.com') |
|
638 ('http', 'joe', 'password', 'proxy.example.com') |
|
639 |
|
640 """ |
|
641 scheme, r_scheme = splittype(proxy) |
|
642 if not r_scheme.startswith("/"): |
|
643 # authority |
|
644 scheme = None |
|
645 authority = proxy |
|
646 else: |
|
647 # URL |
|
648 if not r_scheme.startswith("//"): |
|
649 raise ValueError("proxy URL with no authority: %r" % proxy) |
|
650 # We have an authority, so for RFC 3986-compliant URLs (by ss 3. |
|
651 # and 3.3.), path is empty or starts with '/' |
|
652 end = r_scheme.find("/", 2) |
|
653 if end == -1: |
|
654 end = None |
|
655 authority = r_scheme[2:end] |
|
656 userinfo, hostport = splituser(authority) |
|
657 if userinfo is not None: |
|
658 user, password = splitpasswd(userinfo) |
|
659 else: |
|
660 user = password = None |
|
661 return scheme, user, password, hostport |
|
662 |
|
class ProxyHandler(BaseHandler):
    """Route requests through per-scheme proxies.

    For every scheme in the proxies mapping an instance attribute
    <scheme>_open is created that forwards to proxy_open().
    """

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for scheme, url in proxies.items():
            # default arguments bind the current loop values to the lambda
            forwarder = lambda r, proxy=url, type=scheme, \
                        meth=self.proxy_open: meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, forwarder)

    def proxy_open(self, req, proxy, type):
        """Re-target req at proxy; reopen it if the proxy's scheme differs."""
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            token = base64.b64encode(credentials).strip()
            req.add_header('Proxy-authorization', 'Basic ' + token)
        req.set_proxy(unquote(hostport), proxy_type)
        if orig_type != proxy_type:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
        # let other handlers take care of it
        return None
|
699 |
|
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register user/passwd for realm at uri (one URI or a sequence).

        Each URI is stored twice: once with the scheme's default port
        filled in and once exactly as given.
        """
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for authuri in realm.

        Returns (None, None) when nothing stored covers authuri.
        """
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form, i.e. (authority, path)
        pairs as returned by reduce_uri().
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Compare whole path segments: a plain character-wise common
        # prefix would wrongly treat "/foobar" as lying below "/foo".
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)
|
762 |
|
763 |
|
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the realm None."""

    def find_user_password(self, realm, authuri):
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            # nothing stored for this realm: try the default (None) realm
            return lookup(self, None, authuri)
        return user, password
|
772 |
|
773 |
|
class AbstractBasicAuthHandler:
    """Shared logic for answering Basic authentication challenges.

    Subclasses supply `auth_header` (the request header to set) and, via
    the handler machinery, `parent`.
    """

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.
    # NOTE(review): the leading (?:.*,)* nests a greedy .* inside a repeated
    # group, which can backtrack heavily on long headers containing many
    # commas -- consider tightening the pattern.

    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with Basic credentials if the challenge asks for them.

        authreq names the challenge header to read from headers.  Returns
        None when there is no usable Basic challenge.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.search(challenge)
        if mo is None:
            return None
        scheme, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Reopen req with an auth header for realm, or return None.

        None is returned when no password is known, or when the request
        already carries exactly the header that would be sent.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.b64encode(raw).strip()
        if req.headers.get(self.auth_header, None) == auth:
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
|
814 |
|
815 |
|
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses using the www-authenticate challenge."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # the full request URL is what gets matched against stored URIs
        target = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          target, req, headers)
|
824 |
|
825 |
|
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses using the proxy-authenticate challenge."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        proxy_authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_authority, req, headers)
|
838 |
|
839 |
|
def randombytes(n):
    """Return n random bytes.

    The bytes come from os.urandom(), which reads the operating system's
    randomness source.  If the platform offers none (os.urandom raises
    NotImplementedError) the random module is used instead.
    """
    try:
        # os.urandom replaces the manual open/read/close of /dev/urandom,
        # which opened the device in text mode and leaked the file handle
        # if the read failed.
        return os.urandom(n)
    except NotImplementedError:
        chars = [chr(random.randrange(0, 256)) for i in range(n)]
        return "".join(chars)
|
853 |
|
class AbstractDigestAuthHandler:
    """Shared client-side logic for HTTP Digest authentication.

    The class is meant to be mixed into a handler that supplies an
    `auth_header` attribute (the name of the request header carrying
    the credentials) and a `parent` opener used to re-send the request.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        """Set up the handler.

        passwd is the password manager to query for credentials; a new
        HTTPPasswordMgr is created when it is None.
        """
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        """Forget how many authentication attempts have been made."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer an authentication challenge found in headers.

        auth_header names the response header holding the challenge.
        host is accepted but not used here.  Returns the response of
        the retried request, or None when there is no digest challenge
        or no usable credentials.  Raises HTTPError once more than five
        attempts have been made without a reset.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-send req with credentials computed from the challenge auth.

        Returns None when no credentials can be produced or when the
        request already carries exactly these credentials (sending them
        again would not help).
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce,
                                            time.ctime(),
                                            randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the credentials string for req from the challenge chal.

        chal is a dictionary of challenge parameters.  Returns the text
        to place after 'Digest ' in the authorization header, or None
        when the challenge lacks a realm or nonce, names an unsupported
        algorithm, or no user is known for the realm.  Raises URLError
        for a qop value other than 'auth'.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            # Only reachable with qop == 'auth'; any other value raised above.
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) hash helpers for the named algorithm.

        H hashes a string to a hex digest and KD(secret, data) hashes
        "secret:data".  The algorithm name is matched without regard to
        case.  (None, None) is returned for an unsupported algorithm so
        that callers can decline the challenge instead of crashing.
        """
        # lambdas assume digest modules are imported at the top level
        name = algorithm.upper()
        if name == 'MD5':
            H = lambda x: hashlib.md5(x).hexdigest()
        elif name == 'SHA':
            H = lambda x: hashlib.sha1(x).hexdigest()
        # XXX MD5-sess
        else:
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        """Return the entity digest for data; currently always None."""
        # XXX not implemented yet
        return None
|
984 |
|
985 |
|
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 response by retrying with digest credentials.

        Returns whatever http_error_auth_reqed returns.  The retry
        counter is reset even when the retry raises, so one failed
        exchange does not leave the handler refusing later challenges.
        """
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            return self.http_error_auth_reqed('www-authenticate',
                                              host, req, headers)
        finally:
            self.reset_retry_count()
|
1002 |
|
1003 |
|
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication handler for 407 (proxy) challenges."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 407 response by retrying with digest credentials.

        Returns whatever http_error_auth_reqed returns.  The retry
        counter is reset even when the retry raises, so one failed
        exchange does not leave the handler refusing later challenges.
        """
        host = req.get_host()
        try:
            return self.http_error_auth_reqed('proxy-authenticate',
                                              host, req, headers)
        finally:
            self.reset_retry_count()
|
1015 |
|
1016 class AbstractHTTPHandler(BaseHandler): |
|
1017 |
|
1018 def __init__(self, debuglevel=0): |
|
1019 self._debuglevel = debuglevel |
|
1020 |
|
1021 def set_http_debuglevel(self, level): |
|
1022 self._debuglevel = level |
|
1023 |
|
1024 def do_request_(self, request): |
|
1025 host = request.get_host() |
|
1026 if not host: |
|
1027 raise URLError('no host given') |
|
1028 |
|
1029 if request.has_data(): # POST |
|
1030 data = request.get_data() |
|
1031 if not request.has_header('Content-type'): |
|
1032 request.add_unredirected_header( |
|
1033 'Content-type', |
|
1034 'application/x-www-form-urlencoded') |
|
1035 if not request.has_header('Content-length'): |
|
1036 request.add_unredirected_header( |
|
1037 'Content-length', '%d' % len(data)) |
|
1038 |
|
1039 scheme, sel = splittype(request.get_selector()) |
|
1040 sel_host, sel_path = splithost(sel) |
|
1041 if not request.has_header('Host'): |
|
1042 request.add_unredirected_header('Host', sel_host or host) |
|
1043 for name, value in self.parent.addheaders: |
|
1044 name = name.capitalize() |
|
1045 if not request.has_header(name): |
|
1046 request.add_unredirected_header(name, value) |
|
1047 |
|
1048 return request |
|
1049 |
|
1050 def do_open(self, http_class, req): |
|
1051 """Return an addinfourl object for the request, using http_class. |
|
1052 |
|
1053 http_class must implement the HTTPConnection API from httplib. |
|
1054 The addinfourl return value is a file-like object. It also |
|
1055 has methods and attributes including: |
|
1056 - info(): return a mimetools.Message object for the headers |
|
1057 - geturl(): return the original request URL |
|
1058 - code: HTTP status code |
|
1059 """ |
|
1060 host = req.get_host() |
|
1061 if not host: |
|
1062 raise URLError('no host given') |
|
1063 |
|
1064 h = http_class(host) # will parse host:port |
|
1065 h.set_debuglevel(self._debuglevel) |
|
1066 |
|
1067 headers = dict(req.headers) |
|
1068 headers.update(req.unredirected_hdrs) |
|
1069 # We want to make an HTTP/1.1 request, but the addinfourl |
|
1070 # class isn't prepared to deal with a persistent connection. |
|
1071 # It will try to read all remaining data from the socket, |
|
1072 # which will block while the server waits for the next request. |
|
1073 # So make sure the connection gets closed after the (only) |
|
1074 # request. |
|
1075 headers["Connection"] = "close" |
|
1076 headers = dict( |
|
1077 (name.title(), val) for name, val in headers.items()) |
|
1078 try: |
|
1079 h.request(req.get_method(), req.get_selector(), req.data, headers) |
|
1080 r = h.getresponse() |
|
1081 except socket.error, err: # XXX what error? |
|
1082 raise URLError(err) |
|
1083 |
|
1084 # Pick apart the HTTPResponse object to get the addinfourl |
|
1085 # object initialized properly. |
|
1086 |
|
1087 # Wrap the HTTPResponse object in socket's file object adapter |
|
1088 # for Windows. That adapter calls recv(), so delegate recv() |
|
1089 # to read(). This weird wrapping allows the returned object to |
|
1090 # have readline() and readlines() methods. |
|
1091 |
|
1092 # XXX It might be better to extract the read buffering code |
|
1093 # out of socket._fileobject() and into a base class. |
|
1094 |
|
1095 r.recv = r.read |
|
1096 fp = socket._fileobject(r, close=True) |
|
1097 |
|
1098 resp = addinfourl(fp, r.msg, req.get_full_url()) |
|
1099 resp.code = r.status |
|
1100 resp.msg = r.reason |
|
1101 return resp |
|
1102 |
|
1103 |
|
class HTTPHandler(AbstractHTTPHandler):
    """Opens requests through httplib.HTTPConnection."""

    # Outgoing requests get the inherited header preparation unchanged.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open req and return the response object from do_open."""
        connection_class = httplib.HTTPConnection
        return self.do_open(connection_class, req)
|
1110 |
|
# HTTPSHandler is only defined when httplib exposes the HTTPS attribute.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Opens requests through httplib.HTTPSConnection."""

        # Outgoing requests get the inherited header preparation unchanged.
        https_request = AbstractHTTPHandler.do_request_

        def https_open(self, req):
            """Open req and return the response object from do_open."""
            connection_class = httplib.HTTPSConnection
            return self.do_open(connection_class, req)
|
1118 |
|
class HTTPCookieProcessor(BaseHandler):
    """Attaches cookies to requests and collects them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar for storage; create a cookielib.CookieJar if None."""
        import cookielib
        if cookiejar is None:
            self.cookiejar = cookielib.CookieJar()
        else:
            self.cookiejar = cookiejar

    def http_request(self, request):
        """Let the jar add its cookie header to request, then return it."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from response, then return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is processed exactly like http traffic.
    https_request = http_request
    https_response = http_response
|
1136 |
|
class UnknownHandler(BaseHandler):
    """Rejects every request it is asked to open."""

    def unknown_open(self, req):
        """Raise URLError naming the request's URL type."""
        message = 'unknown url type: %s' % req.get_type()
        raise URLError(message)
|
1141 |
|
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dictionary mapping each key to its value.  A value
    enclosed in double quotes has the quotes removed.  An empty value
    ("key=") is stored as the empty string.  An element without an '='
    raises ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Compare slices rather than v[0] / v[-1] so that an empty value
        # does not raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
|
1151 |
|
def parse_http_list(s):
    """Split a comma-separated HTTP list (RFC 2068 Section 2) into items.

    Commas inside double-quoted strings do not separate items.  Inside
    a quoted string a backslash is dropped and the character after it
    is kept literally, so an escaped quote does not end the string.
    Single quotes have no special meaning.  Each returned item has its
    surrounding whitespace stripped.
    """
    items = []
    buf = []            # characters of the item being collected
    escaped = False     # previous character was a backslash in quotes
    in_quotes = False

    for ch in s:
        if escaped:
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # Whatever follows the final comma is the last item, unless empty.
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
|
1194 |
|
class FileHandler(BaseHandler):
    """Opens file: requests from the local disk, or hands them to FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req locally, or re-dispatch it as an ftp request.

        A selector that starts with exactly two slashes is re-opened
        through the parent opener with its type changed to 'ftp'.
        """
        url = req.get_selector()
        names_a_host = url[:2] == '//' and url[2:3] != '/'
        if not names_a_host:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None

    def get_names(self):
        """Return the addresses of this machine, cached on the class."""
        if FileHandler.names is not None:
            return FileHandler.names
        try:
            FileHandler.names = (socket.gethostbyname('localhost'),
                                 socket.gethostbyname(socket.gethostname()))
        except socket.gaierror:
            # Only the 'localhost' lookup is retried on failure.
            FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        The response headers carry the guessed content type (default
        text/plain), the file size and the modification time.  Raises
        URLError when the request names a port or a host that is not
        one of this machine's addresses.
        """
        import email.Utils
        import mimetypes
        host = req.get_host()
        selector = req.get_selector()
        path = url2pathname(selector)
        info = os.stat(path)
        last_modified = email.Utils.formatdate(info.st_mtime, usegmt=True)
        content_type = mimetypes.guess_type(selector)[0] or 'text/plain'
        header_text = (
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (content_type, info.st_size, last_modified))
        headers = mimetools.Message(StringIO(header_text))
        if host:
            host, port = splitport(host)
        # Refuse a host with an explicit port, or one that is not local.
        if host and (port or
                     socket.gethostbyname(host) not in self.get_names()):
            raise URLError('file not on local host')
        return addinfourl(open(path, 'rb'), headers, 'file:' + selector)
|
1237 |
|
1238 class FTPHandler(BaseHandler): |
|
1239 def ftp_open(self, req): |
|
1240 import ftplib |
|
1241 import mimetypes |
|
1242 host = req.get_host() |
|
1243 if not host: |
|
1244 raise IOError, ('ftp error', 'no host given') |
|
1245 host, port = splitport(host) |
|
1246 if port is None: |
|
1247 port = ftplib.FTP_PORT |
|
1248 else: |
|
1249 port = int(port) |
|
1250 |
|
1251 # username/password handling |
|
1252 user, host = splituser(host) |
|
1253 if user: |
|
1254 user, passwd = splitpasswd(user) |
|
1255 else: |
|
1256 passwd = None |
|
1257 host = unquote(host) |
|
1258 user = unquote(user or '') |
|
1259 passwd = unquote(passwd or '') |
|
1260 |
|
1261 try: |
|
1262 host = socket.gethostbyname(host) |
|
1263 except socket.error, msg: |
|
1264 raise URLError(msg) |
|
1265 path, attrs = splitattr(req.get_selector()) |
|
1266 dirs = path.split('/') |
|
1267 dirs = map(unquote, dirs) |
|
1268 dirs, file = dirs[:-1], dirs[-1] |
|
1269 if dirs and not dirs[0]: |
|
1270 dirs = dirs[1:] |
|
1271 try: |
|
1272 fw = self.connect_ftp(user, passwd, host, port, dirs) |
|
1273 type = file and 'I' or 'D' |
|
1274 for attr in attrs: |
|
1275 attr, value = splitvalue(attr) |
|
1276 if attr.lower() == 'type' and \ |
|
1277 value in ('a', 'A', 'i', 'I', 'd', 'D'): |
|
1278 type = value.upper() |
|
1279 fp, retrlen = fw.retrfile(file, type) |
|
1280 headers = "" |
|
1281 mtype = mimetypes.guess_type(req.get_full_url())[0] |
|
1282 if mtype: |
|
1283 headers += "Content-type: %s\n" % mtype |
|
1284 if retrlen is not None and retrlen >= 0: |
|
1285 headers += "Content-length: %d\n" % retrlen |
|
1286 sf = StringIO(headers) |
|
1287 headers = mimetools.Message(sf) |
|
1288 return addinfourl(fp, headers, req.get_full_url()) |
|
1289 except ftplib.all_errors, msg: |
|
1290 raise IOError, ('ftp error', msg), sys.exc_info()[2] |
|
1291 |
|
1292 def connect_ftp(self, user, passwd, host, port, dirs): |
|
1293 fw = ftpwrapper(user, passwd, host, port, dirs) |
|
1294 ## fw.ftp.set_debuglevel(1) |
|
1295 return fw |
|
1296 |
|
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper connections for reuse."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        """Start with an empty cache, a 60 second delay and 16 slots."""
        self.cache = {}      # connection key -> ftpwrapper
        self.timeout = {}    # connection key -> expiry time
        self.soonest = 0     # earliest expiry time among cached entries
        self.delay = 60
        self.max_conns = 16

    def setTimeout(self, t):
        """Set how long past its last use a connection stays cached."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which an entry is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return the cached connection for these parameters.

        A new ftpwrapper is created when none is cached.  Either way
        the entry's expiry time is pushed back and the cache is pruned.
        """
        key = user, host, port, '/'.join(dirs)
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Close expired connections and drop one entry if the cache is full."""
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            expired = [key for key, deadline in self.timeout.items()
                       if deadline < now]
            for key in expired:
                self.cache[key].close()
                del self.cache[key]
                del self.timeout[key]
        self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for key, deadline in self.timeout.items():
                if deadline == self.soonest:
                    # Drop the first entry found with the earliest expiry.
                    del self.cache[key]
                    del self.timeout[key]
                    break
            self.soonest = min(self.timeout.values())
|
1342 |
|
class GopherHandler(BaseHandler):
    """Opens gopher: requests through gopherlib."""

    def gopher_open(self, req):
        """Send the selector (and query, if any) named by req.

        Returns an addinfourl wrapping the gopherlib result with empty
        headers.  Raises GopherError when the request has no host.
        """
        # XXX can raise socket.error
        import gopherlib  # this raises DeprecationWarning in 2.5
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        # The gopher type is split off the selector but not used further.
        gtype, remainder = splitgophertype(req.get_selector())
        remainder, query = splitquery(remainder)
        selector = unquote(remainder)
        if not query:
            fp = gopherlib.send_selector(selector, host)
        else:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        return addinfourl(fp, noheaders(), req.get_full_url())