A variant on webchecker that creates a mirror copy of a remote site.
parent 2237b73baf
commit d57548023f

websucker.py
@@ -0,0 +1,131 @@
#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "0.1"

import os
import sys
import string
import urllib
import getopt

import webchecker
verbose = webchecker.verbose

def main():
    global verbose
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            webchecker.verbose = verbose = 0
        if o == "-v":
            webchecker.verbose = verbose = verbose + 1
    c = Sucker(0)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()
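
# main() returns None on success; the "main() or 0" at the bottom of the
# file maps that to a zero exit status.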

class Sucker(webchecker.Checker):

    # Alas, had to copy this to make one change...
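    # The change: webchecker.Checker.getpage only fetches a page, while
    # this version first looks for a local copy saved by an earlier run,
    # and saves every newly fetched page via savefile() below.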
    def getpage(self, url):
        if url[:7] == 'mailto:' or url[:5] == 'news:':
            if verbose > 1: print " Not checking mailto/news URL"
            return None
        isint = self.inroots(url)
        if not isint and not self.checkext:
            if verbose > 1: print " Not checking ext link"
            return None
        path = self.savefilename(url)
        saved = 0
        try:
            f = open(path, "rb")
        except IOError:
            try:
                f = self.urlopener.open(url)
            except IOError, msg:
                msg = webchecker.sanitize(msg)
                if verbose > 0:
                    print "Error ", msg
                    webchecker.show(" HREF ", url, " from", self.todo[url])
                self.setbad(url, msg)
                return None
            if not isint:
                if verbose > 1: print " Not gathering links from ext URL"
                webchecker.safeclose(f)
                return None
            nurl = f.geturl()
            if nurl != url:
                path = self.savefilename(nurl)
            info = f.info()
        else:
            # The open() above succeeded: serve the page from the mirror.
            if verbose: print "Loading cached URL", url
            saved = 1
            nurl = url
            info = {}
            if url[-1:] == "/":
                info["content-type"] = "text/html"
        text = f.read()
        if not saved: self.savefile(text, path)
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            ctype = None
        if nurl != url:
            if verbose > 1:
                print " Redirected to", nurl
        # Cached pages (and servers that send no Content-type) leave ctype
        # unset; fall back to guessing the type from the URL.
        if not ctype:
            ctype, encoding = webchecker.mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            webchecker.safeclose(f)
            if verbose > 1:
                print " Not HTML, mime type", ctype
            return None
        f.close()
        return webchecker.Page(text, nurl)

    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        f = open(path, "wb")
        f.write(text)
        f.close()
        print "saved", path

    def savefilename(self, url):
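        # Turn a URL into a relative file path.  For a hypothetical URL
        # such as http://www.example.com/docs/ the split goes:
        #   urllib.splittype(url)  -> ('http', '//www.example.com/docs/')
        #   urllib.splithost(rest) -> ('www.example.com', '/docs/')
        # and the final result is www.example.com/docs/index.html.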
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        while path[:1] == "/": path = path[1:]
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = string.lower(host)
        path = os.path.join(host, path)
        if path[-1:] == "/": path = path + "index.html"
        if os.sep != "/":
            path = string.join(string.split(path, "/"), os.sep)
        return path

def makedirs(dir):
    # A hand-rolled recursive mkdir (compare os.makedirs): create all
    # missing ancestors of dir, then dir itself.
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)

if __name__ == '__main__':
    sys.exit(main() or 0)