1997-10-06 15:54:25 -03:00
|
|
|
#! /usr/bin/env python
|
|
|
|
|
|
|
|
"""A variant on webchecker that creates a mirror copy of a remote site."""
|
|
|
|
|
1998-02-21 16:08:39 -04:00
|
|
|
# Version-control keyword placeholder.  If the checkout expands it to
# three whitespace-separated tokens, the code after the imports keeps
# only the middle token (the bare revision number).
__version__ = "$Revision$"
|
1997-10-06 15:54:25 -03:00
|
|
|
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import urllib
|
|
|
|
import getopt
|
|
|
|
|
1999-11-17 11:40:48 -04:00
|
|
|
import webchecker
|
1998-02-21 16:08:39 -04:00
|
|
|
|
|
|
|
# Reduce an expanded revision keyword to just its number.  The string is
# only touched when it still starts with "$" and splits into exactly three
# whitespace-separated tokens; in that case the middle token is kept.
if __version__.startswith('$'):
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
|
1997-10-06 15:54:25 -03:00
|
|
|
|
|
|
|
def main():
    """Parse the command line and mirror every root URL given on it.

    Options:
        -q  set verbosity to 0
        -v  raise verbosity by one (may be repeated)

    The starting verbosity is ``webchecker.VERBOSE``.  Every remaining
    argument is registered as a root URL on a ``Sucker`` instance, which
    is then run.

    Returns 2 when the options cannot be parsed (after printing the
    getopt message and a usage line); otherwise returns None.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        # Single-argument print(...) calls produce the same output under
        # both the Python 2 print statement and the Python 3 function.
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % (sys.argv[0],))
        return 2
    for o, _value in opts:
        if o == "-q":
            verbose = 0
        elif o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    # Identify this tool (and its version) to the servers being mirrored.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % (arg,))
        c.addroot(arg)
    print("Run...")
    c.run()
|
|
|
|
|
|
|
|
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it reads.

    Each URL is mapped by savefilename() to a relative path of the form
    ``<host>/<url path>``.  readhtml() serves a page from that file when
    it already exists and otherwise downloads it and stores it there.
    """

    # NOTE(review): presumably switches off checking of external links in
    # the base Checker -- confirm against webchecker.
    checkext = 0
    # NOTE(review): presumably what suppresses name-anchor checking, as the
    # SAM note below relies on -- confirm against webchecker.
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return ``(text, url)`` for the page identified by *url_pair*.

        Only ``url_pair[0]`` is used to locate the local copy.  If that
        copy exists it is read instead of the network, and *text* is its
        content when checkforhtml() accepts the URL, else None.

        Otherwise the page is opened with openpage().  When that yields
        an object, its content is saved under the name derived from the
        final URL (geturl() may differ from the one requested), and
        *text* is the content if checkforhtml() accepts it, else None.
        *url* in the result is the final URL.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = None

        if f is not None:
            # A local copy exists; no headers are available for it, so
            # the HTML check gets an empty mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
            return text, url

        f = self.openpage(url_pair)
        if f:
            # try/finally so the handle is released even if reading fails.
            try:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
            finally:
                f.close()
            # Everything fetched is stored, HTML or not; only HTML text is
            # handed back to the caller.
            self.savefile(text, path)
            if not self.checkforhtml(info, url):
                text = None
        return text, url

    def savefile(self, text, path):
        """Write *text* to *path*, creating parent directories first.

        Best effort: a failure to create the directories or to write the
        file is reported through self.message() and not raised.
        """
        dirname = os.path.dirname(path)
        try:
            makedirs(dirname)
            # "with" closes the file even when write() fails.
            with open(path, "wb") as f:
                f.write(text)
        except (IOError, os.error) as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Return the relative local path under which *url* is stored.

        The result is ``<lower-cased host>/<url path>``; user info and
        port are dropped from the host.  An empty path, or one ending in
        "/", gets "index.html" appended.  On platforms whose separator is
        not "/", the URL's slashes are converted to ``os.sep``.
        """
        _scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        _user, host = urllib.splituser(host)
        host, _port = urllib.splitnport(host)
        host = host.lower()
        # URLs come from remote pages and are untrusted: discard ".."
        # segments so the result cannot climb out of the host directory.
        # Strip leading slashes afterwards, because removing a segment can
        # expose one, and os.path.join() would treat that as absolute.
        segments = [seg for seg in path.split("/") if seg != ".."]
        path = "/".join(segments).lstrip("/")
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                # NOTE(review): presumably needed because a leading
                # separator marks a relative path on that platform.
                path = os.sep + path
        return os.path.join(host, path)
|
1997-10-06 15:54:25 -03:00
|
|
|
|
|
|
|
def makedirs(dir):
    """Create directory *dir* together with any missing parent directories.

    * An empty *dir* is a no-op.
    * If *dir* already exists as a directory, nothing happens.
    * If *dir* exists but is not a directory, the existing entry is moved
      to ``<dir>/index.html`` and a directory takes its place.  Errors in
      this step are ignored (best effort).
    * If *dir* has no final path component to create, a message is
      printed and nothing is created.

    Errors from creating a new directory (os.error) propagate to the
    caller.  Always returns None.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            # Something that is not a directory occupies the name we need
            # (presumably a page saved earlier under this path).  Keep its
            # content as the new directory's index.html.
            backup = dir + ".bak"
            try:
                os.rename(dir, backup)
                os.mkdir(dir)
                os.rename(backup, os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        # Nothing left to split off, so there is no name to create.
        print("Huh? Don't know how to make dir %s" % (dir,))
        return
    # Make sure the parent exists before creating the leaf.
    makedirs(head)
    os.mkdir(dir, 0o777)
|
|
|
|
|
|
|
|
# Script entry point: run main() and use its result as the process exit
# status, substituting 0 when main() returns nothing.
if __name__ == '__main__':
    raise SystemExit(main() or 0)
|