From 107771a228ee73b4683242cb696e3024f93b74d5 Mon Sep 17 00:00:00 2001 From: Barry Warsaw Date: Thu, 25 Oct 2001 21:49:18 +0000 Subject: [PATCH] Applying proposed patch for bug #474583, optional support for non-standard but common types. Including Martin's suggestion to add rejected non-standard types from patch #438790. Specifically, guess_type(), guess_extension(): Both the functions and the methods grow an optional "strict" flag, defaulting to true, which determines whether to recognize non-standard, but commonly found types or not. Also, I sorted, reformatted, and culled duplicates from the big types_map dictionary. Note that there are a few non-equivalent duplicates (e.g. .cdf and .xls) for which the first will just get thrown away. I didn't remove those though. Finally, use of the module as a script has grown the -l and -e options to toggle strictness and to do guess_extension(), respectively. Doc and unittest updates too. --- Doc/lib/libmimetypes.tex | 55 ++++-- Lib/mimetypes.py | 344 +++++++++++++++++++++++-------------- Lib/test/test_mimetypes.py | 12 ++ 3 files changed, 263 insertions(+), 148 deletions(-) diff --git a/Doc/lib/libmimetypes.tex b/Doc/lib/libmimetypes.tex index 327b2ba7fe7..3747fe1e1bd 100644 --- a/Doc/lib/libmimetypes.tex +++ b/Doc/lib/libmimetypes.tex @@ -8,10 +8,10 @@ \indexii{MIME}{content type} -The \module{mimetypes} converts between a filename or URL and the MIME -type associated with the filename extension. Conversions are provided -from filename to MIME type and from MIME type to filename extension; -encodings are not supported for the later conversion. +The \module{mimetypes} module converts between a filename or URL and +the MIME type associated with the filename extension. Conversions are +provided from filename to MIME type and from MIME type to filename +extension; encodings are not supported for the latter conversion. The module provides one class and a number of convenience functions. 
The functions are the normal interface to this module, but some @@ -23,22 +23,31 @@ module. If the module has not been initialized, they will call sets up. -\begin{funcdesc}{guess_type}{filename} +\begin{funcdesc}{guess_type}{filename\optional{, strict}} Guess the type of a file based on its filename or URL, given by \var{filename}. The return value is a tuple \code{(\var{type}, \var{encoding})} where \var{type} is \code{None} if the type can't be -guessed (no or unknown suffix) or a string of the form +guessed (missing or unknown suffix) or a string of the form \code{'\var{type}/\var{subtype}'}, usable for a MIME -\mailheader{content-type} header\indexii{MIME}{headers}; and encoding -is \code{None} for no encoding or the name of the program used to -encode (e.g. \program{compress} or \program{gzip}). The encoding is -suitable for use as a \mailheader{Content-Encoding} header, \emph{not} -as a \mailheader{Content-Transfer-Encoding} header. The mappings are -table driven. Encoding suffixes are case sensitive; type suffixes are -first tried case sensitive, then case insensitive. +\mailheader{content-type} header\indexii{MIME}{headers}. + +\var{encoding} is \code{None} for no encoding or the name of the +program used to encode (e.g. \program{compress} or \program{gzip}). +The encoding is suitable for use as a \mailheader{Content-Encoding} +header, \emph{not} as a \mailheader{Content-Transfer-Encoding} header. +The mappings are table driven. Encoding suffixes are case sensitive; +type suffixes are first tried case sensitively, then case +insensitively. + +Optional \var{strict} is a flag specifying whether the list of known +MIME types is limited to only the official types \ulink{registered +with IANA}{http://www.isi.edu/in-notes/iana/assignments/media-types}. +When \var{strict} is true (the default), only the +IANA types are supported; when \var{strict} is false, some additional +non-standard but commonly used MIME types are also recognized. 
\end{funcdesc} -\begin{funcdesc}{guess_extension}{type} +\begin{funcdesc}{guess_extension}{type\optional{, strict}} Guess the extension for a file based on its MIME type, given by \var{type}. The return value is a string giving a filename extension, including the @@ -46,6 +55,9 @@ leading dot (\character{.}). The extension is not guaranteed to have been associated with any particular data stream, but would be mapped to the MIME type \var{type} by \function{guess_type()}. If no extension can be guessed for \var{type}, \code{None} is returned. + +Optional \var{strict} has the same meaning as with the +\function{guess_type()} function. \end{funcdesc} @@ -98,6 +110,11 @@ Dictionary mapping filename extensions to encoding types. Dictionary mapping filename extensions to MIME types. \end{datadesc} +\begin{datadesc}{common_types} +Dictionary mapping filename extensions to non-standard, but commonly +found MIME types. +\end{datadesc} + The \class{MimeTypes} class may be useful for applications which may want more than one MIME-type database: @@ -144,12 +161,18 @@ that of the \refmodule{mimetypes} module. module. \end{datadesc} -\begin{methoddesc}{guess_extension}{type} +\begin{datadesc}{common_types} + Dictionary mapping filename extensions to non-standard, but commonly + found MIME types. This is initially a copy of the global + \code{common_types} defined in the module. +\end{datadesc} + +\begin{methoddesc}{guess_extension}{type\optional{, strict}} Similar to the \function{guess_extension()} function, using the tables stored as part of the object. \end{methoddesc} -\begin{methoddesc}{guess_type}{url} +\begin{methoddesc}{guess_type}{url\optional{, strict}} Similar to the \function{guess_type()} function, using the tables stored as part of the object. 
\end{methoddesc} diff --git a/Lib/mimetypes.py b/Lib/mimetypes.py index 06b450bec48..1cd424acb05 100644 --- a/Lib/mimetypes.py +++ b/Lib/mimetypes.py @@ -2,9 +2,9 @@ This module defines two useful functions: -guess_type(url) -- guess the MIME type and encoding of a URL. +guess_type(url, strict=1) -- guess the MIME type and encoding of a URL. -guess_extension(type) -- guess the extension for a given MIME type. +guess_extension(type, strict=1) -- guess the extension for a given MIME type. It also contains the following, for tuning the behavior: @@ -21,6 +21,16 @@ Functions: init([files]) -- parse a list of files, default knownfiles read_mime_types(file) -- parse one file, return a dictionary or None +When run as a script, the following command line options are recognized: + +Usage: mimetypes.py [options] type +Options: + --help / -h -- print this message and exit + --lenient / -l -- additionally search of some common, but non-standard + types. + --extension / -e -- guess extension instead of type + +More than one type argument may be given. """ import os @@ -53,10 +63,11 @@ class MimeTypes: self.encodings_map = encodings_map.copy() self.suffix_map = suffix_map.copy() self.types_map = types_map.copy() + self.common_types = common_types.copy() for name in filenames: self.read(name) - def guess_type(self, url): + def guess_type(self, url, strict=1): """Guess the type of a file based on its URL. Return value is a tuple (type, encoding) where type is None if @@ -71,6 +82,9 @@ class MimeTypes: The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped to '.tar.gz'. (This is table-driven too, using the dictionary suffix_map.) + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. 
""" scheme, url = urllib.splittype(url) if scheme == 'data': @@ -101,14 +115,21 @@ class MimeTypes: else: encoding = None types_map = self.types_map + common_types = self.common_types if types_map.has_key(ext): return types_map[ext], encoding elif types_map.has_key(ext.lower()): return types_map[ext.lower()], encoding + elif strict: + return None, encoding + elif common_types.has_key(ext): + return common_types[ext], encoding + elif common_types.has_key(ext.lower()): + return common_types[ext.lower()], encoding else: return None, encoding - def guess_extension(self, type): + def guess_extension(self, type, strict=1): """Guess the extension for a file based on its MIME type. Return value is a string giving a filename extension, @@ -117,11 +138,18 @@ class MimeTypes: stream, but would be mapped to the MIME type `type' by guess_type(). If no extension can be guessed for `type', None is returned. + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. """ type = type.lower() for ext, stype in self.types_map.items(): if type == stype: return ext + if not strict: + for ext, stype in common_types.items(): + if type == stype: + return ext return None def read(self, filename): @@ -149,7 +177,7 @@ class MimeTypes: map['.' + suff] = type -def guess_type(url): +def guess_type(url, strict=1): """Guess the type of a file based on its URL. Return value is a tuple (type, encoding) where type is None if the @@ -163,12 +191,15 @@ def guess_type(url): The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped to ".tar.gz". (This is table-driven too, using the dictionary suffix_map). + + Optional `strict' argument when false adds a bunch of commonly found, but + non-standard types. """ init() - return guess_type(url) + return guess_type(url, strict) -def guess_extension(type): +def guess_extension(type, strict=1): """Guess the extension for a file based on its MIME type. 
Return value is a string giving a filename extension, including the @@ -176,14 +207,17 @@ def guess_extension(type): associated with any particular data stream, but would be mapped to the MIME type `type' by guess_type(). If no extension can be guessed for `type', None is returned. + + Optional `strict' argument when false adds a bunch of commonly found, + but non-standard types. """ init() - return guess_extension(type) + return guess_extension(type, strict) def init(files=None): global guess_extension, guess_type - global suffix_map, types_map, encodings_map + global suffix_map, types_map, encodings_map, common_types global inited inited = 1 db = MimeTypes() @@ -197,6 +231,7 @@ def init(files=None): types_map = db.types_map guess_extension = db.guess_extension guess_type = db.guess_type + common_types = db.common_types def read_mime_types(file): @@ -223,133 +258,178 @@ encodings_map = { # Before adding new types, make sure they are either registered with IANA, at # http://www.isi.edu/in-notes/iana/assignments/media-types # or extensions, i.e. using the x- prefix + +# If you add to these, please keep them sorted! 
types_map = { - '.a': 'application/octet-stream', - '.ai': 'application/postscript', - '.aif': 'audio/x-aiff', - '.aifc': 'audio/x-aiff', - '.aiff': 'audio/x-aiff', - '.au': 'audio/basic', - '.avi': 'video/x-msvideo', - '.bcpio': 'application/x-bcpio', - '.bin': 'application/octet-stream', - '.bmp': 'image/x-ms-bmp', - '.cdf': 'application/x-netcdf', - '.cpio': 'application/x-cpio', - '.csh': 'application/x-csh', - '.css': 'text/css', - '.dll': 'application/octet-stream', - '.doc': 'application/msword', - '.dvi': 'application/x-dvi', - '.exe': 'application/octet-stream', - '.eps': 'application/postscript', - '.etx': 'text/x-setext', - '.gif': 'image/gif', - '.gtar': 'application/x-gtar', - '.hdf': 'application/x-hdf', - '.htm': 'text/html', - '.html': 'text/html', - '.ief': 'image/ief', - '.jpe': 'image/jpeg', - '.jpeg': 'image/jpeg', - '.jpg': 'image/jpeg', - '.js': 'application/x-javascript', - '.latex': 'application/x-latex', - '.man': 'application/x-troff-man', - '.me': 'application/x-troff-me', - '.mif': 'application/x-mif', - '.mov': 'video/quicktime', - '.movie': 'video/x-sgi-movie', - '.mp2': 'audio/mpeg', - '.mp3': 'audio/mpeg', - '.mpe': 'video/mpeg', - '.mpeg': 'video/mpeg', - '.mpg': 'video/mpeg', - '.ms': 'application/x-troff-ms', - '.nc': 'application/x-netcdf', - '.o': 'application/octet-stream', - '.obj': 'application/octet-stream', - '.oda': 'application/oda', - '.pbm': 'image/x-portable-bitmap', - '.pdf': 'application/pdf', - '.pgm': 'image/x-portable-graymap', - '.pnm': 'image/x-portable-anymap', - '.png': 'image/png', - '.ppm': 'image/x-portable-pixmap', - '.ps': 'application/postscript', - '.py': 'text/x-python', - '.pyc': 'application/x-python-code', - '.pyo': 'application/x-python-code', - '.qt': 'video/quicktime', - '.ras': 'image/x-cmu-raster', - '.rgb': 'image/x-rgb', - '.rdf': 'application/xml', - '.roff': 'application/x-troff', - '.rtx': 'text/richtext', - '.sgm': 'text/x-sgml', - '.sgml': 'text/x-sgml', - '.sh': 'application/x-sh', - 
'.shar': 'application/x-shar', - '.snd': 'audio/basic', - '.so': 'application/octet-stream', - '.src': 'application/x-wais-source', + '.a' : 'application/octet-stream', + '.ai' : 'application/postscript', + '.aif' : 'audio/x-aiff', + '.aifc' : 'audio/x-aiff', + '.aiff' : 'audio/x-aiff', + '.au' : 'audio/basic', + '.avi' : 'video/x-msvideo', + '.bat' : 'text/plain', + '.bcpio' : 'application/x-bcpio', + '.bin' : 'application/octet-stream', + '.bmp' : 'image/x-ms-bmp', + '.c' : 'text/plain', + # Duplicates :( + '.cdf' : 'application/x-cdf', + '.cdf' : 'application/x-netcdf', + '.cpio' : 'application/x-cpio', + '.csh' : 'application/x-csh', + '.css' : 'text/css', + '.dll' : 'application/octet-stream', + '.doc' : 'application/msword', + '.dot' : 'application/msword', + '.dvi' : 'application/x-dvi', + '.eml' : 'message/rfc822', + '.eps' : 'application/postscript', + '.etx' : 'text/x-setext', + '.exe' : 'application/octet-stream', + '.gif' : 'image/gif', + '.gtar' : 'application/x-gtar', + '.h' : 'text/plain', + '.hdf' : 'application/x-hdf', + '.htm' : 'text/html', + '.html' : 'text/html', + '.ief' : 'image/ief', + '.jpe' : 'image/jpeg', + '.jpeg' : 'image/jpeg', + '.jpg' : 'image/jpeg', + '.js' : 'application/x-javascript', + '.ksh' : 'text/plain', + '.latex' : 'application/x-latex', + '.m1v' : 'video/mpeg', + '.man' : 'application/x-troff-man', + '.me' : 'application/x-troff-me', + '.mht' : 'message/rfc822', + '.mhtml' : 'message/rfc822', + '.mif' : 'application/x-mif', + '.mov' : 'video/quicktime', + '.movie' : 'video/x-sgi-movie', + '.mp2' : 'audio/mpeg', + '.mp3' : 'audio/mpeg', + '.mpa' : 'video/mpeg', + '.mpe' : 'video/mpeg', + '.mpeg' : 'video/mpeg', + '.mpg' : 'video/mpeg', + '.ms' : 'application/x-troff-ms', + '.nc' : 'application/x-netcdf', + '.nws' : 'message/rfc822', + '.o' : 'application/octet-stream', + '.obj' : 'application/octet-stream', + '.oda' : 'application/oda', + '.p12' : 'application/x-pkcs12', + '.p7c' : 'application/pkcs7-mime', + '.pbm' : 
'image/x-portable-bitmap', + '.pdf' : 'application/pdf', + '.pfx' : 'application/x-pkcs12', + '.pgm' : 'image/x-portable-graymap', + '.pl' : 'text/plain', + '.png' : 'image/png', + '.pnm' : 'image/x-portable-anymap', + '.pot' : 'application/vnd.ms-powerpoint', + '.ppa' : 'application/vnd.ms-powerpoint', + '.ppm' : 'image/x-portable-pixmap', + '.pps' : 'application/vnd.ms-powerpoint', + '.ppt' : 'application/vnd.ms-powerpoint', + '.ps' : 'application/postscript', + '.pwz' : 'application/vnd.ms-powerpoint', + '.py' : 'text/x-python', + '.pyc' : 'application/x-python-code', + '.pyo' : 'application/x-python-code', + '.qt' : 'video/quicktime', + '.ra' : 'audio/x-pn-realaudio', + '.ram' : 'application/x-pn-realaudio', + '.ras' : 'image/x-cmu-raster', + '.rdf' : 'application/xml', + '.rgb' : 'image/x-rgb', + '.roff' : 'application/x-troff', + '.rtx' : 'text/richtext', + '.sgm' : 'text/x-sgml', + '.sgml' : 'text/x-sgml', + '.sh' : 'application/x-sh', + '.shar' : 'application/x-shar', + '.snd' : 'audio/basic', + '.so' : 'application/octet-stream', + '.src' : 'application/x-wais-source', '.sv4cpio': 'application/x-sv4cpio', - '.sv4crc': 'application/x-sv4crc', - '.t': 'application/x-troff', - '.tar': 'application/x-tar', - '.tcl': 'application/x-tcl', - '.tex': 'application/x-tex', - '.texi': 'application/x-texinfo', + '.sv4crc' : 'application/x-sv4crc', + '.t' : 'application/x-troff', + '.tar' : 'application/x-tar', + '.tcl' : 'application/x-tcl', + '.tex' : 'application/x-tex', + '.texi' : 'application/x-texinfo', '.texinfo': 'application/x-texinfo', - '.tif': 'image/tiff', - '.tiff': 'image/tiff', - '.tr': 'application/x-troff', - '.tsv': 'text/tab-separated-values', - '.txt': 'text/plain', - '.ustar': 'application/x-ustar', - '.wav': 'audio/x-wav', - '.xbm': 'image/x-xbitmap', - '.xls': 'application/excel', - '.xml': 'text/xml', - '.xsl': 'application/xml', - '.xpm': 'image/x-xpixmap', - '.xwd': 'image/x-xwindowdump', - '.zip': 'application/zip', - '.mp3': 'audio/mpeg', 
- '.ra': 'audio/x-pn-realaudio', - '.pdf': 'application/pdf', - '.c': 'text/plain', - '.bat': 'text/plain', - '.h': 'text/plain', - '.pl': 'text/plain', - '.ksh': 'text/plain', - '.ram': 'application/x-pn-realaudio', - '.cdf': 'application/x-cdf', - '.doc': 'application/msword', - '.dot': 'application/msword', - '.wiz': 'application/msword', - '.xlb': 'application/vnd.ms-excel', - '.xls': 'application/vnd.ms-excel', - '.ppa': 'application/vnd.ms-powerpoint', - '.ppt': 'application/vnd.ms-powerpoint', - '.pps': 'application/vnd.ms-powerpoint', - '.pot': 'application/vnd.ms-powerpoint', - '.pwz': 'application/vnd.ms-powerpoint', - '.eml': 'message/rfc822', - '.nws': 'message/rfc822', - '.mht': 'message/rfc822', - '.mhtml': 'message/rfc822', - '.css': 'text/css', - '.p7c': 'application/pkcs7-mime', - '.p12': 'application/x-pkcs12', - '.pfx': 'application/x-pkcs12', - '.js': 'application/x-javascript', - '.m1v': 'video/mpeg', - '.mpa': 'video/mpeg', - '.vcf': 'text/x-vcard', - '.xml': 'text/xml', + '.tif' : 'image/tiff', + '.tiff' : 'image/tiff', + '.tr' : 'application/x-troff', + '.tsv' : 'text/tab-separated-values', + '.txt' : 'text/plain', + '.ustar' : 'application/x-ustar', + '.vcf' : 'text/x-vcard', + '.wav' : 'audio/x-wav', + '.wiz' : 'application/msword', + '.xbm' : 'image/x-xbitmap', + '.xlb' : 'application/vnd.ms-excel', + # Duplicates :( + '.xls' : 'application/excel', + '.xls' : 'application/vnd.ms-excel', + '.xml' : 'text/xml', + '.xpm' : 'image/x-xpixmap', + '.xsl' : 'application/xml', + '.xwd' : 'image/x-xwindowdump', + '.zip' : 'application/zip', } +# These are non-standard types, commonly found in the wild. They will only +# match if strict=0 flag is given to the API methods. 
+ +# Please sort these too +common_types = { + '.jpg' : 'image/jpg', + '.mid' : 'audio/midi', + '.midi': 'audio/midi', + '.pct' : 'image/pict', + '.pic' : 'image/pict', + '.pict': 'image/pict', + '.rtf' : 'application/rtf', + '.xul' : 'text/xul' + } + + +def usage(code, msg=''): + print __doc__ + if msg: print msg + sys.exit(code) + + if __name__ == '__main__': import sys - print guess_type(sys.argv[1]) + import getopt + + try: + opts, args = getopt.getopt(sys.argv[1:], 'hle', + ['help', 'lenient', 'extension']) + except getopt.error, msg: + usage(1, msg) + + strict = 1 + extension = 0 + for opt, arg in opts: + if opt in ('-h', '--help'): + usage(0) + elif opt in ('-l', '--lenient'): + strict = 0 + elif opt in ('-e', '--extension'): + extension = 1 + for gtype in args: + if extension: + guess = guess_extension(gtype, strict) + if not guess: print "I don't know anything about type", gtype + else: print guess + else: + guess, encoding = guess_type(gtype, strict) + if not guess: print "I don't know anything about type", gtype + else: print 'type:', guess, 'encoding:', encoding diff --git a/Lib/test/test_mimetypes.py b/Lib/test/test_mimetypes.py index 8735e278ab2..bca5766a729 100644 --- a/Lib/test/test_mimetypes.py +++ b/Lib/test/test_mimetypes.py @@ -38,6 +38,18 @@ class MimeTypesTestCase(unittest.TestCase): self.assertEqual(self.db.guess_extension("x-application/x-unittest"), ".pyunit") + def test_non_standard_types(self): + # First try strict + self.assertEqual(self.db.guess_type('foo.xul', strict=1), + (None, None)) + self.assertEqual(self.db.guess_extension('image/jpg', strict=1), + None) + # And then non-strict + self.assertEqual(self.db.guess_type('foo.xul', strict=0), + ('text/xul', None)) + self.assertEqual(self.db.guess_extension('image/jpg', strict=0), + '.jpg') + def test_main(): test_support.run_unittest(MimeTypesTestCase)