cpython/Tools/webchecker/websucker.py

#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "0.1"

import os
import sys
import string
import urllib
import getopt

import webchecker

verbose = webchecker.verbose
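
# Command-line interface: -q silences progress output, -v makes it more
# verbose; the remaining arguments are the root URLs of the sites to mirror.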
def main():
    global verbose
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            webchecker.verbose = verbose = 0
        if o == "-v":
            webchecker.verbose = verbose = verbose + 1
    c = Sucker(0)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()
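
# Subclass Checker so that everything fetched is also written to a local
# mirror tree; queueing and link extraction are inherited from webchecker.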
class Sucker(webchecker.Checker):

    # Alas, had to copy this to make one change...
    def getpage(self, url):
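        # Like Checker.getpage(), but a previously saved copy of the URL
        # is reused when one exists, and anything newly fetched is saved
        # to the mirror before it is (possibly) parsed for links.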
        if url[:7] == 'mailto:' or url[:5] == 'news:':
            if verbose > 1: print " Not checking mailto/news URL"
            return None
        isint = self.inroots(url)
        if not isint and not self.checkext:
            if verbose > 1: print " Not checking ext link"
            return None
        path = self.savefilename(url)
        saved = 0
        try:
            f = open(path, "rb")
        except IOError:
            try:
                f = self.urlopener.open(url)
            except IOError, msg:
                msg = webchecker.sanitize(msg)
                if verbose > 0:
                    print "Error ", msg
                    if verbose > 0:
                        webchecker.show(" HREF ", url, " from", self.todo[url])
                self.setbad(url, msg)
                return None
            if not isint:
                if verbose > 1: print " Not gathering links from ext URL"
                webchecker.safeclose(f)
                return None
            nurl = f.geturl()
            if nurl != url:
                path = self.savefilename(nurl)
            info = f.info()
        else:
            if verbose: print "Loading cached URL", url
            saved = 1
            nurl = url
            info = {}
            if url[-1:] == "/":
                info["content-type"] = "text/html"
        text = f.read()
        if not saved: self.savefile(text, path)
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            ctype = None
        if nurl != url:
            if verbose > 1:
                print " Redirected to", nurl
        if not ctype:
            ctype, encoding = webchecker.mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            webchecker.safeclose(f)
            if verbose > 1:
                print " Not HTML, mime type", ctype
            return None
        f.close()
        return webchecker.Page(text, nurl)

    def savefile(self, text, path):
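        # Write the downloaded bytes to path, creating any missing
        # directories in the mirror tree first.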
        dir, base = os.path.split(path)
        makedirs(dir)
        f = open(path, "wb")
        f.write(text)
        f.close()
        print "saved", path

    def savefilename(self, url):
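        # Map a URL onto a relative file path of the form host/path,
        # using index.html for directory URLs and translating "/" to
        # the local directory separator where necessary.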
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        while path[:1] == "/": path = path[1:]
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = string.lower(host)
        path = os.path.join(host, path)
        if path[-1:] == "/": path = path + "index.html"
        if os.sep != "/":
            path = string.join(string.split(path, "/"), os.sep)
        return path

def makedirs(dir):
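    # Recursive mkdir: create dir and any missing parent directories,
    # doing nothing for directories that already exist.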
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)

if __name__ == '__main__':
    sys.exit(main() or 0)