# Lib/urllib.py
#
# Open an arbitrary URL
#
# See the following document for a tentative description of URLs:
#     Uniform Resource Locators              Tim Berners-Lee
#     INTERNET DRAFT                                    CERN
#     IETF URL Working Group                    14 July 1993
#     draft-ietf-uri-url-01.txt
#
# The object returned by URLopener().open(file) will differ per
# protocol.  All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info().  The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns an rfc822.Message object which can be
# used to query various info about the object, if available.
# (rfc822.Message objects are queried with the getheader() method.)

import socket
import regex


# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcuts for basic usage.  They all share one module-level
# URLopener instance, which is created the first time it is needed.
_urlopener = None

# Return the shared URLopener, creating it on first use
def _sharedopener():
    global _urlopener
    if not _urlopener:
        _urlopener = URLopener()
    return _urlopener

# Open a URL with the shared opener; returns what URLopener.open() returns
def urlopen(url):
    return _sharedopener().open(url)

# Retrieve a URL with the shared opener; returns what
# URLopener.retrieve() returns
def urlretrieve(url):
    return _sharedopener().retrieve(url)

# Clean up after the shared opener; does nothing if it was never created
def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()


# Class to open URLs.
# This is a class rather than just a subroutine because we may need
# more than one set of global protocol-specific options.
+ftpcache = {} +class URLopener: + + # Constructor + def __init__(self): + self.addheaders = [] + self.tempcache = {} + self.ftpcache = ftpcache + # Undocumented feature: you can use a different + # ftp cache by assigning to the .ftpcache member; + # in case you want logically independent URL openers + + def __del__(self): + self.close() + + def close(self): + self.cleanup() + + def cleanup(self): + import os + for url in self.tempcache.keys(): + try: + os.unlink(self.tempcache[url][0]) + except os.error: + pass + del self.tempcache[url] + + # Add a header to be used by the HTTP interface only + # e.g. u.addheader('Accept', 'sound/basic') + def addheader(self, *args): + self.addheaders.append(args) + + # External interface + # Use URLopener().open(file) instead of open(file, 'r') + def open(self, url): + type, url = splittype(unwrap(url)) + if not type: type = 'file' + name = 'open_' + type + if '-' in name: + import regsub + name = regsub.gsub('-', '_', name) + if not hasattr(self, name): + raise IOError, ('url error', 'unknown url type', type) + try: + return getattr(self, name)(url) + except socket.error, msg: + raise IOError, ('socket error', msg) + + # External interface + # retrieve(url) returns (filename, None) for a local object + # or (tempfilename, headers) for a remote object + def retrieve(self, url): + if self.tempcache.has_key(url): + return self.tempcache[url] + url1 = unwrap(url) + if self.tempcache.has_key(url1): + self.tempcache[url] = self.tempcache[url1] + return self.tempcache[url1] + type, url1 = splittype(url1) + if not type or type == 'file': + try: + fp = self.open_local_file(url1) + del fp + return splithost(url1)[1], None + except IOError, msg: + pass + fp = self.open(url) + headers = fp.info() + import tempfile + tfn = tempfile.mktemp() + self.tempcache[url] = result = tfn, headers + tfp = open(tfn, 'w') + bs = 1024*8 + block = fp.read(bs) + while block: + tfp.write(block) + block = fp.read(bs) + del fp + del tfp + return result + + # 
Each method named open_ knows how to open that type of URL + + # Use HTTP protocol + def open_http(self, url): + import httplib + host, selector = splithost(url) + h = httplib.HTTP(host) + h.putrequest('GET', selector) + for args in self.addheaders: apply(h.putheader, args) + errcode, errmsg, headers = h.getreply() + if errcode == 200: return addinfo(h.getfile(), headers) + else: raise IOError, ('http error', errcode, errmsg, headers) + + # Use Gopher protocol + def open_gopher(self, url): + import gopherlib + host, selector = splithost(url) + type, selector = splitgophertype(selector) + selector, query = splitquery(selector) + if query: fp = gopherlib.send_query(selector, query, host) + else: fp = gopherlib.send_selector(selector, host) + return addinfo(fp, noheaders()) + + # Use local file or FTP depending on form of URL + def open_file(self, url): + try: + return self.open_local_file(url) + except IOError: + return self.open_ftp(url) + + # Use local file + def open_local_file(self, url): + host, file = splithost(url) + if not host: return addinfo(open(file, 'r'), noheaders()) + host, port = splitport(host) + if not port and socket.gethostbyname(host) in ( + localhost(), thishost()): + return addinfo(open(file, 'r'), noheaders()) + raise IOError, ('local file error', 'not on local host') + + # Use FTP protocol + def open_ftp(self, url): + host, file = splithost(url) + if not host: raise IOError, ('ftp error', 'no host given') + host, port = splitport(host) + host = socket.gethostbyname(host) + if not port: + import ftplib + port = ftplib.FTP_PORT + key = (host, port) + try: + if not self.ftpcache.has_key(key): + self.ftpcache[key] = ftpwrapper(host, port) + return addinfo(self.ftpcache[key].retrfile(file), + noheaders()) + except ftperrors(), msg: + raise IOError, ('ftp error', msg) + + +# Utility functions + +# Return the IP address of the magic hostname 'localhost' +_localhost = None +def localhost(): + global _localhost + if not _localhost: + _localhost = 
socket.gethostbyname('localhost') + return _localhost + +# Return the IP address of the current host +_thishost = None +def thishost(): + global _thishost + if not _thishost: + _thishost = socket.gethostbyname(socket.gethostname()) + return _thishost + +# Return the set of errors raised by the FTP class +_ftperrors = None +def ftperrors(): + global _ftperrors + if not _ftperrors: + import ftplib + _ftperrors = (ftplib.error_reply, + ftplib.error_temp, + ftplib.error_perm, + ftplib.error_proto) + return _ftperrors + +# Return an empty rfc822.Message object +_noheaders = None +def noheaders(): + global _noheaders + if not _noheaders: + import rfc822 + _noheaders = rfc822.Message(open('/dev/null', 'r')) + _noheaders.fp.close() # Recycle file descriptor + return _noheaders + + +# Utility classes + +# Class used by open_ftp() for cache of open FTP connections +class ftpwrapper: + def __init__(self, host, port): + self.host = host + self.port = port + self.init() + def init(self): + import ftplib + self.ftp = ftplib.FTP() + self.ftp.connect(self.host, self.port) + self.ftp.login() + def retrfile(self, file): + import ftplib + try: + self.ftp.voidcmd('TYPE I') + except ftplib.all_errors: + self.init() + self.ftp.voidcmd('TYPE I') + conn = None + if file: + try: + cmd = 'RETR ' + file + conn = self.ftp.transfercmd(cmd) + except ftplib.error_perm, reason: + if reason[:3] != '550': + raise IOError, ('ftp error', reason) + if not conn: + # Try a directory listing + if file: cmd = 'LIST ' + file + else: cmd = 'LIST' + conn = self.ftp.transfercmd(cmd) + return addclosehook(conn.makefile('r'), self.ftp.voidresp) + +# Base class for addinfo and addclosehook +class addbase: + def __init__(self, fp): + self.fp = fp + self.read = self.fp.read + self.readline = self.fp.readline + self.readlines = self.fp.readlines + self.fileno = self.fp.fileno + def __repr__(self): + return '<%s at %s whose fp = %s>' % ( + self.__class__.__name__, `id(self)`, `self.fp`) + def __del__(self): + 
self.close() + def close(self): + self.read = None + self.readline = None + self.readlines = None + self.fileno = None + self.fp = None + +# Class to add a close hook to an open file +class addclosehook(addbase): + def __init__(self, fp, closehook, *hookargs): + addbase.__init__(self, fp) + self.closehook = closehook + self.hookargs = hookargs + def close(self): + if self.closehook: + apply(self.closehook, self.hookargs) + self.closehook = None + self.hookargs = None + addbase.close(self) + +# class to add an info() method to an open file +class addinfo(addbase): + def __init__(self, fp, headers): + addbase.__init__(self, fp) + self.headers = headers + def info(self): + return self.headers + + +# Utility to combine a URL with a base URL to form a new URL + +def basejoin(base, url): + type, path = splittype(url) + if type: return url + host, path = splithost(path) + basetype, basepath = splittype(base) + basehost, basepath = splithost(basepath) + basepath, basetag = splittag(basepath) + basepath, basequery = splitquery(basepath) + type = basetype or 'file' + if path[:1] != '/': + import string + i = string.rfind(basepath, '/') + if i < 0: basepath = '/' + else: basepath = basepath[:i+1] + path = basepath + path + if not host: host = basehost + if host: return type + '://' + host + path + else: return type + ':' + path + + +# Utilities to parse URLs: +# unwrap('') --> 'type//host/path' +# splittype('type:opaquestring') --> 'type', 'opaquestring' +# splithost('//host[:port]/path') --> 'host[:port]', '/path' +# splitport('host:port') --> 'host', 'port' +# splitquery('/path?query') --> '/path', 'query' +# splittag('/path#tag') --> '/path', 'tag' +# splitgophertype('/Xselector') --> 'X', 'selector' +# unquote('abc%20def') -> 'abc def' +# quote('abc def') -> 'abc%20def') + +def unwrap(url): + import string + url = string.strip(url) + if url[:1] == '<' and url[-1:] == '>': + url = string.strip(url[1:-1]) + if url[:4] == 'URL:': url = string.strip(url[4:]) + return url + 
+_typeprog = regex.compile('^\([^/:]+\):\(.*\)$') +def splittype(url): + if _typeprog.match(url) >= 0: return _typeprog.group(1, 2) + return None, url + +_hostprog = regex.compile('^//\([^/]+\)\(.*\)$') +def splithost(url): + if _hostprog.match(url) >= 0: return _hostprog.group(1, 2) + return None, url + +_portprog = regex.compile('^\(.*\):\([0-9]+\)$') +def splitport(host): + if _portprog.match(host) >= 0: return _portprog.group(1, 2) + return host, None + +_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$') +def splitquery(url): + if _queryprog.match(url) >= 0: return _queryprog.group(1, 2) + return url, None + +_tagprog = regex.compile('^\(.*\)#\([^#]*\)$') +def splittag(url): + if _tagprog.match(url) >= 0: return _tagprog.group(1, 2) + return url, None + +def splitgophertype(selector): + if selector[:1] == '/' and selector[1:2]: + return selector[1], selector[2:] + return None, selector + +_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]') +def unquote(s): + import string + i = 0 + n = len(s) + res = '' + while 0 <= i < n: + j = _quoteprog.search(s, i) + if j < 0: + res = res + s[i:] + break + res = res + (s[i:j] + chr(eval('0x' + s[j+1:j+3]))) + i = j+3 + return res + +_acceptable = \ + 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._@' +def quote(s): + res = '' + for c in s: + if c in _acceptable: res = res + c + else: res = res + '%%%02x' % ord(c) + return res + +# Test and time quote() and unquote() +def test1(): + import time + s = '' + for i in range(256): s = s + chr(i) + s = s*4 + t0 = time.time() + qs = quote(s) + uqs = unquote(qs) + t1 = time.time() + if uqs != s: + print 'Wrong!' 
+ print `s` + print `qs` + print `uqs` + print round(t1 - t0, 3), 'sec' + + +# Test program +def test(): + import sys + import regsub + args = sys.argv[1:] + if not args: + args = [ + '/etc/passwd', + 'file:/etc/passwd', + 'file://localhost/etc/passwd', + 'ftp://ftp.cwi.nl/etc/passwd', + 'gopher://gopher.cwi.nl/11/', + 'http://www.cwi.nl/index.html', + ] + try: + for url in args: + print '-'*10, url, '-'*10 + fn, h = urlretrieve(url) + print fn, h + if h: + print '======' + for k in h.keys(): print k + ':', h[k] + print '======' + fp = open(fn, 'r') + data = fp.read() + del fp + print regsub.gsub('\r', '', data) + fn, h = None, None + print '-'*40 + finally: + urlcleanup() + +# Run test program when run as a script +if __name__ == '__main__': + test1() + test()