diff -r 000000000000 -r ae805ac0140d python-2.5.2/win32/Lib/urlparse.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/python-2.5.2/win32/Lib/urlparse.py Fri Apr 03 17:19:34 2009 +0100 @@ -0,0 +1,374 @@ +"""Parse (absolute and relative) URLs. + +See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, +UC Irvine, June 1995. +""" + +__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", + "urlsplit", "urlunsplit"] + +# A classification of schemes ('' means apply by default) +uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', + 'wais', 'file', 'https', 'shttp', 'mms', + 'prospero', 'rtsp', 'rtspu', '', 'sftp'] +uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', + 'imap', 'wais', 'file', 'mms', 'https', 'shttp', + 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', + 'svn', 'svn+ssh', 'sftp'] +non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', + 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] +uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', + 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', + 'mms', '', 'sftp'] +uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', + 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] +uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', + 'nntp', 'wais', 'https', 'shttp', 'snews', + 'file', 'prospero', ''] + +# Characters valid in scheme names +scheme_chars = ('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '0123456789' + '+-.') + +MAX_CACHE_SIZE = 20 +_parse_cache = {} + +def clear_cache(): + """Clear the parse cache.""" + global _parse_cache + _parse_cache = {} + + +class BaseResult(tuple): + """Base class for the parsed result objects. + + This provides the attributes shared by the two derived result + objects as read-only properties. The derived classes are + responsible for checking the right number of arguments were + supplied to the constructor. + + """ + + __slots__ = () + + # Attributes that access the basic components of the URL: + + @property + def scheme(self): + return self[0] + + @property + def netloc(self): + return self[1] + + @property + def path(self): + return self[2] + + @property + def query(self): + return self[-2] + + @property + def fragment(self): + return self[-1] + + # Additional attributes that provide access to parsed-out portions + # of the netloc: + + @property + def username(self): + netloc = self.netloc + if "@" in netloc: + userinfo = netloc.split("@", 1)[0] + if ":" in userinfo: + userinfo = userinfo.split(":", 1)[0] + return userinfo + return None + + @property + def password(self): + netloc = self.netloc + if "@" in netloc: + userinfo = netloc.split("@", 1)[0] + if ":" in userinfo: + return userinfo.split(":", 1)[1] + return None + + @property + def hostname(self): + netloc = self.netloc + if "@" in netloc: + netloc = netloc.split("@", 1)[1] + if ":" in netloc: + netloc = netloc.split(":", 1)[0] + return netloc.lower() or None + + @property + def port(self): + netloc = self.netloc + if "@" in netloc: + netloc = netloc.split("@", 1)[1] + if ":" in netloc: + port = netloc.split(":", 1)[1] + return int(port, 10) + return None + + +class SplitResult(BaseResult): + + __slots__ = () + + def __new__(cls, scheme, netloc, path, query, fragment): + return BaseResult.__new__( + cls, (scheme, netloc, path, query, fragment)) + + def geturl(self): + return urlunsplit(self) + + +class ParseResult(BaseResult): + + __slots__ = () + + def __new__(cls, scheme, netloc, path, params, query, fragment): + return BaseResult.__new__( + cls, (scheme, netloc, path, params, query, fragment)) + + @property + def params(self): + return self[3] + + def geturl(self): + return urlunparse(self) + + +def urlparse(url, scheme='', allow_fragments=True): + """Parse a URL into 6 components: + :///;?# + Return a 6-tuple: (scheme, netloc, path, params, query, fragment). + Note that we don't break the components up in smaller bits + (e.g. netloc is a single string) and we don't expand % escapes.""" + tuple = urlsplit(url, scheme, allow_fragments) + scheme, netloc, url, query, fragment = tuple + if scheme in uses_params and ';' in url: + url, params = _splitparams(url) + else: + params = '' + return ParseResult(scheme, netloc, url, params, query, fragment) + +def _splitparams(url): + if '/' in url: + i = url.find(';', url.rfind('/')) + if i < 0: + return url, '' + else: + i = url.find(';') + return url[:i], url[i+1:] + +def _splitnetloc(url, start=0): + delim = len(url) # position of end of domain part of url, default is end + for c in '/?#': # look for delimiters; the order is NOT important + wdelim = url.find(c, start) # find first of this delim + if wdelim >= 0: # if found + delim = min(delim, wdelim) # use earliest delim position + return url[start:delim], url[delim:] # return (domain, rest) + +def urlsplit(url, scheme='', allow_fragments=True): + """Parse a URL into 5 components: + :///?# + Return a 5-tuple: (scheme, netloc, path, query, fragment). + Note that we don't break the components up in smaller bits + (e.g. netloc is a single string) and we don't expand % escapes.""" + allow_fragments = bool(allow_fragments) + key = url, scheme, allow_fragments, type(url), type(scheme) + cached = _parse_cache.get(key, None) + if cached: + return cached + if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth + clear_cache() + netloc = query = fragment = '' + i = url.find(':') + if i > 0: + if url[:i] == 'http': # optimize the common case + scheme = url[:i].lower() + url = url[i+1:] + if url[:2] == '//': + netloc, url = _splitnetloc(url, 2) + if allow_fragments and '#' in url: + url, fragment = url.split('#', 1) + if '?' in url: + url, query = url.split('?', 1) + v = SplitResult(scheme, netloc, url, query, fragment) + _parse_cache[key] = v + return v + for c in url[:i]: + if c not in scheme_chars: + break + else: + scheme, url = url[:i].lower(), url[i+1:] + if scheme in uses_netloc and url[:2] == '//': + netloc, url = _splitnetloc(url, 2) + if allow_fragments and scheme in uses_fragment and '#' in url: + url, fragment = url.split('#', 1) + if scheme in uses_query and '?' in url: + url, query = url.split('?', 1) + v = SplitResult(scheme, netloc, url, query, fragment) + _parse_cache[key] = v + return v + +def urlunparse((scheme, netloc, url, params, query, fragment)): + """Put a parsed URL back together again. This may result in a + slightly different, but equivalent URL, if the URL that was parsed + originally had redundant delimiters, e.g. a ? with an empty query + (the draft states that these are equivalent).""" + if params: + url = "%s;%s" % (url, params) + return urlunsplit((scheme, netloc, url, query, fragment)) + +def urlunsplit((scheme, netloc, url, query, fragment)): + if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): + if url and url[:1] != '/': url = '/' + url + url = '//' + (netloc or '') + url + if scheme: + url = scheme + ':' + url + if query: + url = url + '?' + query + if fragment: + url = url + '#' + fragment + return url + +def urljoin(base, url, allow_fragments=True): + """Join a base URL and a possibly relative URL to form an absolute + interpretation of the latter.""" + if not base: + return url + if not url: + return base + bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ + urlparse(base, '', allow_fragments) + scheme, netloc, path, params, query, fragment = \ + urlparse(url, bscheme, allow_fragments) + if scheme != bscheme or scheme not in uses_relative: + return url + if scheme in uses_netloc: + if netloc: + return urlunparse((scheme, netloc, path, + params, query, fragment)) + netloc = bnetloc + if path[:1] == '/': + return urlunparse((scheme, netloc, path, + params, query, fragment)) + if not (path or params or query): + return urlunparse((scheme, netloc, bpath, + bparams, bquery, fragment)) + segments = bpath.split('/')[:-1] + path.split('/') + # XXX The stuff below is bogus in various ways... + if segments[-1] == '.': + segments[-1] = '' + while '.' in segments: + segments.remove('.') + while 1: + i = 1 + n = len(segments) - 1 + while i < n: + if (segments[i] == '..' + and segments[i-1] not in ('', '..')): + del segments[i-1:i+1] + break + i = i+1 + else: + break + if segments == ['', '..']: + segments[-1] = '' + elif len(segments) >= 2 and segments[-1] == '..': + segments[-2:] = [''] + return urlunparse((scheme, netloc, '/'.join(segments), + params, query, fragment)) + +def urldefrag(url): + """Removes any existing fragment from URL. + + Returns a tuple of the defragmented URL and the fragment. If + the URL contained no fragments, the second element is the + empty string. + """ + if '#' in url: + s, n, p, a, q, frag = urlparse(url) + defrag = urlunparse((s, n, p, a, q, '')) + return defrag, frag + else: + return url, '' + + +test_input = """ + http://a/b/c/d + + g:h = + http:g = + http: = + g = + ./g = + g/ = + /g = + //g = + ?y = + g?y = + g?y/./x = + . = + ./ = + .. = + ../ = + ../g = + ../.. = + ../../g = + ../../../g = + ./../g = + ./g/. = + /./g = + g/./h = + g/../h = + http:g = + http: = + http:?y = + http:g?y = + http:g?y/./x = +""" + +def test(): + import sys + base = '' + if sys.argv[1:]: + fn = sys.argv[1] + if fn == '-': + fp = sys.stdin + else: + fp = open(fn) + else: + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO + fp = StringIO(test_input) + while 1: + line = fp.readline() + if not line: break + words = line.split() + if not words: + continue + url = words[0] + parts = urlparse(url) + print '%-10s : %s' % (url, parts) + abs = urljoin(base, url) + if not base: + base = abs + wrapped = '' % abs + print '%-10s = %s' % (url, wrapped) + if len(words) == 3 and words[1] == '=': + if wrapped != words[2]: + print 'EXPECTED', words[2], '!!!!!!!!!!' + +if __name__ == '__main__': + test()