python-2.5.2/win32/Tools/webchecker/websucker.py
changeset 0 ae805ac0140d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python-2.5.2/win32/Tools/webchecker/websucker.py	Fri Apr 03 17:19:34 2009 +0100
@@ -0,0 +1,125 @@
+#! /usr/bin/env python
+
+"""A variant on webchecker that creates a mirror copy of a remote site."""
+
+__version__ = "$Revision: 28654 $"
+
+import os
+import sys
+import urllib
+import getopt
+
+import webchecker
+
+# Reduce an RCS keyword string such as "$Revision: N $" to the bare N
+if __version__.startswith('$'):
+    _v = __version__.split()
+    if len(_v) == 3:
+        __version__ = _v[1]
+
+def main():
+    """Parse -q/-v, add each argument as a root of a Sucker and run it."""
+    try:
+        options, roots = getopt.getopt(sys.argv[1:], "qv")
+    except getopt.error, err:
+        print err
+        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
+        return 2    # handed to sys.exit() by the script entry point
+    verbosity = webchecker.VERBOSE
+    for flag, _value in options:
+        if flag == "-q":
+            verbosity = 0   # a later -v counts up again from zero
+        elif flag == "-v":
+            verbosity += 1
+    sucker = Sucker()
+    sucker.setflags(verbose=verbosity)
+    # The opener's only extra header: a User-agent naming this tool.
+    sucker.urlopener.addheaders = [('User-agent', 'websucker/%s' % __version__)]
+    for root in roots:
+        print "Adding root", root
+        sucker.addroot(root)
+    print "Run..."
+    sucker.run()
+
+class Sucker(webchecker.Checker):
+    """Checker subclass that writes every page it fetches to a local file."""
+
+    checkext = 0
+    nonames = 1
+
+    # SAM 11/13/99: in general, URLs are now URL pairs.  Name anchor
+    # checking is suppressed, so only the first element is used here.
+
+    def readhtml(self, url_pair):
+        """Return (text, url); text is None if unopened or checkforhtml() fails."""
+        url = url_pair[0]
+        text = None
+        try:
+            f = open(self.savefilename(url), "rb")
+        except IOError:     # no readable local copy: fetch the page and save it
+            f = self.openpage(url_pair)
+            if f:
+                try:
+                    info, url = f.info(), f.geturl()    # geturl() may differ
+                    text = f.read()
+                finally:
+                    f.close()   # closed even when read() raises
+                self.savefile(text, self.savefilename(url))
+                if not self.checkforhtml(info, url):
+                    text = None
+        else:
+            try:
+                if self.checkforhtml({}, url):
+                    text = f.read()
+            finally:
+                f.close()
+        return text, url
+
+    def savefile(self, text, path):
+        """Write text to path, making directories; log IOError/os.error."""
+        try:
+            makedirs(os.path.dirname(path))
+            f = open(path, "wb")
+            try:
+                f.write(text)
+            finally:
+                f.close()   # closed even when write() raises
+            self.message("saved %s", path)
+        except (IOError, os.error), msg:
+            self.message("didn't save %s: %s", path, str(msg))
+
+    def savefilename(self, url):
+        """Return the local path <lowercased host>/<url path> for url."""
+        host, path = urllib.splithost(urllib.splittype(url)[1])
+        host = urllib.splituser(host)[1]            # drop "user:passwd@"
+        host = urllib.splitnport(host)[0].lower()   # drop ":port"
+        # Drop "/../" segments, which would otherwise escape the host dir.
+        parts = [seg for seg in path.lstrip("/").split("/") if seg != ".."]
+        if not parts or not parts[-1]:      # empty path or trailing "/"
+            parts[-1:] = ["index.html"]
+        path = os.sep.join(parts)
+        if os.sep != "/" and os.name == "mac":
+            path = os.sep + path
+        return os.path.join(host, path)
+
+def makedirs(dir):
+    """Make dir and missing parents; a file named dir becomes dir/index.html."""
+    if not dir or os.path.isdir(dir):
+        return
+    if os.path.exists(dir):     # something that is not a directory is in the way
+        try:
+            os.rename(dir, dir + ".bak")
+            os.mkdir(dir)
+            os.rename(dir + ".bak", os.path.join(dir, "index.html"))
+        except os.error:
+            pass    # failures here are ignored
+        return
+    parent, leaf = os.path.split(dir)
+    if not leaf:
+        print "Huh?  Don't know how to make dir", dir
+        return
+    makedirs(parent)
+    os.mkdir(dir, 0777)
+
+if __name__ == "__main__":
+    raise SystemExit(main() or 0)   # as sys.exit(); a None result becomes 0