mirror of https://github.com/python/cpython
Applying proposed patch for bug #474583, optional support for
non-standard but common types. Including Martin's suggestion to add rejected non-standard types from patch #438790. Specifically, guess_type(), guess_extension(): Both the functions and the methods grow an optional "strict" flag, defaulting to true, which determines whether to recognize non-standard, but commonly found types or not. Also, I sorted, reformatted, and culled duplicates from the big types_map dictionary. Note that there are a few non-equivalent duplicates (e.g. .cdf and .xls) for which the first will just get thrown away. I didn't remove those though. Finally, use of the module as a script has grown the -l and -e options to toggle strictness and to do guess_extension(), respectively. Doc and unittest updates too.
This commit is contained in:
parent
9cd0efcee9
commit
107771a228
|
@ -8,10 +8,10 @@
|
|||
|
||||
\indexii{MIME}{content type}
|
||||
|
||||
The \module{mimetypes} converts between a filename or URL and the MIME
|
||||
type associated with the filename extension. Conversions are provided
|
||||
from filename to MIME type and from MIME type to filename extension;
|
||||
encodings are not supported for the later conversion.
|
||||
The \module{mimetypes} module converts between a filename or URL and
|
||||
the MIME type associated with the filename extension. Conversions are
|
||||
provided from filename to MIME type and from MIME type to filename
|
||||
extension; encodings are not supported for the latter conversion.
|
||||
|
||||
The module provides one class and a number of convenience functions.
|
||||
The functions are the normal interface to this module, but some
|
||||
|
@ -23,22 +23,31 @@ module. If the module has not been initialized, they will call
|
|||
sets up.
|
||||
|
||||
|
||||
\begin{funcdesc}{guess_type}{filename}
|
||||
\begin{funcdesc}{guess_type}{filename\optional{, strict}}
|
||||
Guess the type of a file based on its filename or URL, given by
|
||||
\var{filename}. The return value is a tuple \code{(\var{type},
|
||||
\var{encoding})} where \var{type} is \code{None} if the type can't be
|
||||
guessed (no or unknown suffix) or a string of the form
|
||||
guessed (missing or unknown suffix) or a string of the form
|
||||
\code{'\var{type}/\var{subtype}'}, usable for a MIME
|
||||
\mailheader{content-type} header\indexii{MIME}{headers}; and encoding
|
||||
is \code{None} for no encoding or the name of the program used to
|
||||
encode (e.g. \program{compress} or \program{gzip}). The encoding is
|
||||
suitable for use as a \mailheader{Content-Encoding} header, \emph{not}
|
||||
as a \mailheader{Content-Transfer-Encoding} header. The mappings are
|
||||
table driven. Encoding suffixes are case sensitive; type suffixes are
|
||||
first tried case sensitive, then case insensitive.
|
||||
\mailheader{content-type} header\indexii{MIME}{headers}.
|
||||
|
||||
\var{encoding} is \code{None} for no encoding or the name of the
|
||||
program used to encode (e.g. \program{compress} or \program{gzip}).
|
||||
The encoding is suitable for use as a \mailheader{Content-Encoding}
|
||||
header, \emph{not} as a \mailheader{Content-Transfer-Encoding} header.
|
||||
The mappings are table driven. Encoding suffixes are case sensitive;
|
||||
type suffixes are first tried case sensitively, then case
|
||||
insensitively.
|
||||
|
||||
Optional \var{strict} is a flag specifying whether the list of known
|
||||
MIME types is limited to only the official types \ulink{registered
|
||||
with IANA}{http://www.isi.edu/in-notes/iana/assignments/media-types}.
|
||||
When \var{strict} is true (the default), only the
|
||||
IANA types are supported; when \var{strict} is false, some additional
|
||||
non-standard but commonly used MIME types are also recognized.
|
||||
\end{funcdesc}
|
||||
|
||||
\begin{funcdesc}{guess_extension}{type}
|
||||
\begin{funcdesc}{guess_extension}{type\optional{, strict}}
|
||||
Guess the extension for a file based on its MIME type, given by
|
||||
\var{type}.
|
||||
The return value is a string giving a filename extension, including the
|
||||
|
@ -46,6 +55,9 @@ leading dot (\character{.}). The extension is not guaranteed to have been
|
|||
associated with any particular data stream, but would be mapped to the
|
||||
MIME type \var{type} by \function{guess_type()}. If no extension can
|
||||
be guessed for \var{type}, \code{None} is returned.
|
||||
|
||||
Optional \var{strict} has the same meaning as with the
|
||||
\function{guess_type()} function.
|
||||
\end{funcdesc}
|
||||
|
||||
|
||||
|
@ -98,6 +110,11 @@ Dictionary mapping filename extensions to encoding types.
|
|||
Dictionary mapping filename extensions to MIME types.
|
||||
\end{datadesc}
|
||||
|
||||
\begin{datadesc}{common_types}
|
||||
Dictionary mapping filename extensions to non-standard, but commonly
|
||||
found MIME types.
|
||||
\end{datadesc}
|
||||
|
||||
|
||||
The \class{MimeTypes} class may be useful for applications which may
|
||||
want more than one MIME-type database:
|
||||
|
@ -144,12 +161,18 @@ that of the \refmodule{mimetypes} module.
|
|||
module.
|
||||
\end{datadesc}
|
||||
|
||||
\begin{methoddesc}{guess_extension}{type}
|
||||
\begin{datadesc}{common_types}
|
||||
Dictionary mapping filename extensions to non-standard, but commonly
|
||||
found MIME types. This is initially a copy of the global
|
||||
\code{common_types} defined in the module.
|
||||
\end{datadesc}
|
||||
|
||||
\begin{methoddesc}{guess_extension}{type\optional{, strict}}
|
||||
Similar to the \function{guess_extension()} function, using the
|
||||
tables stored as part of the object.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{guess_type}{url}
|
||||
\begin{methoddesc}{guess_type}{url\optional{, strict}}
|
||||
Similar to the \function{guess_type()} function, using the tables
|
||||
stored as part of the object.
|
||||
\end{methoddesc}
|
||||
|
|
174
Lib/mimetypes.py
174
Lib/mimetypes.py
|
@ -2,9 +2,9 @@
|
|||
|
||||
This module defines two useful functions:
|
||||
|
||||
guess_type(url) -- guess the MIME type and encoding of a URL.
|
||||
guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
|
||||
|
||||
guess_extension(type) -- guess the extension for a given MIME type.
|
||||
guess_extension(type, strict=1) -- guess the extension for a given MIME type.
|
||||
|
||||
It also contains the following, for tuning the behavior:
|
||||
|
||||
|
@ -21,6 +21,16 @@ Functions:
|
|||
init([files]) -- parse a list of files, default knownfiles
|
||||
read_mime_types(file) -- parse one file, return a dictionary or None
|
||||
|
||||
When run as a script, the following command line options are recognized:
|
||||
|
||||
Usage: mimetypes.py [options] type
|
||||
Options:
|
||||
--help / -h -- print this message and exit
|
||||
--lenient / -l -- additionally search some common, but non-standard
|
||||
types.
|
||||
--extension / -e -- guess extension instead of type
|
||||
|
||||
More than one type argument may be given.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
@ -53,10 +63,11 @@ class MimeTypes:
|
|||
self.encodings_map = encodings_map.copy()
|
||||
self.suffix_map = suffix_map.copy()
|
||||
self.types_map = types_map.copy()
|
||||
self.common_types = common_types.copy()
|
||||
for name in filenames:
|
||||
self.read(name)
|
||||
|
||||
def guess_type(self, url):
|
||||
def guess_type(self, url, strict=1):
|
||||
"""Guess the type of a file based on its URL.
|
||||
|
||||
Return value is a tuple (type, encoding) where type is None if
|
||||
|
@ -71,6 +82,9 @@ class MimeTypes:
|
|||
The suffixes .tgz, .taz and .tz (case sensitive!) are all
|
||||
mapped to '.tar.gz'. (This is table-driven too, using the
|
||||
dictionary suffix_map.)
|
||||
|
||||
Optional `strict' argument when false adds a bunch of commonly found,
|
||||
but non-standard types.
|
||||
"""
|
||||
scheme, url = urllib.splittype(url)
|
||||
if scheme == 'data':
|
||||
|
@ -101,14 +115,21 @@ class MimeTypes:
|
|||
else:
|
||||
encoding = None
|
||||
types_map = self.types_map
|
||||
common_types = self.common_types
|
||||
if types_map.has_key(ext):
|
||||
return types_map[ext], encoding
|
||||
elif types_map.has_key(ext.lower()):
|
||||
return types_map[ext.lower()], encoding
|
||||
elif strict:
|
||||
return None, encoding
|
||||
elif common_types.has_key(ext):
|
||||
return common_types[ext], encoding
|
||||
elif common_types.has_key(ext.lower()):
|
||||
return common_types[ext.lower()], encoding
|
||||
else:
|
||||
return None, encoding
|
||||
|
||||
def guess_extension(self, type):
|
||||
def guess_extension(self, type, strict=1):
|
||||
"""Guess the extension for a file based on its MIME type.
|
||||
|
||||
Return value is a string giving a filename extension,
|
||||
|
@ -117,11 +138,18 @@ class MimeTypes:
|
|||
stream, but would be mapped to the MIME type `type' by
|
||||
guess_type(). If no extension can be guessed for `type', None
|
||||
is returned.
|
||||
|
||||
Optional `strict' argument when false adds a bunch of commonly found,
|
||||
but non-standard types.
|
||||
"""
|
||||
type = type.lower()
|
||||
for ext, stype in self.types_map.items():
|
||||
if type == stype:
|
||||
return ext
|
||||
if not strict:
|
||||
for ext, stype in common_types.items():
|
||||
if type == stype:
|
||||
return ext
|
||||
return None
|
||||
|
||||
def read(self, filename):
|
||||
|
@ -149,7 +177,7 @@ class MimeTypes:
|
|||
map['.' + suff] = type
|
||||
|
||||
|
||||
def guess_type(url):
|
||||
def guess_type(url, strict=1):
|
||||
"""Guess the type of a file based on its URL.
|
||||
|
||||
Return value is a tuple (type, encoding) where type is None if the
|
||||
|
@ -163,12 +191,15 @@ def guess_type(url):
|
|||
The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
|
||||
to ".tar.gz". (This is table-driven too, using the dictionary
|
||||
suffix_map).
|
||||
|
||||
Optional `strict' argument when false adds a bunch of commonly found, but
|
||||
non-standard types.
|
||||
"""
|
||||
init()
|
||||
return guess_type(url)
|
||||
return guess_type(url, strict)
|
||||
|
||||
|
||||
def guess_extension(type):
|
||||
def guess_extension(type, strict=1):
|
||||
"""Guess the extension for a file based on its MIME type.
|
||||
|
||||
Return value is a string giving a filename extension, including the
|
||||
|
@ -176,14 +207,17 @@ def guess_extension(type):
|
|||
associated with any particular data stream, but would be mapped to the
|
||||
MIME type `type' by guess_type(). If no extension can be guessed for
|
||||
`type', None is returned.
|
||||
|
||||
Optional `strict' argument when false adds a bunch of commonly found,
|
||||
but non-standard types.
|
||||
"""
|
||||
init()
|
||||
return guess_extension(type)
|
||||
return guess_extension(type, strict)
|
||||
|
||||
|
||||
def init(files=None):
|
||||
global guess_extension, guess_type
|
||||
global suffix_map, types_map, encodings_map
|
||||
global suffix_map, types_map, encodings_map, common_types
|
||||
global inited
|
||||
inited = 1
|
||||
db = MimeTypes()
|
||||
|
@ -197,6 +231,7 @@ def init(files=None):
|
|||
types_map = db.types_map
|
||||
guess_extension = db.guess_extension
|
||||
guess_type = db.guess_type
|
||||
common_types = db.common_types
|
||||
|
||||
|
||||
def read_mime_types(file):
|
||||
|
@ -223,6 +258,8 @@ encodings_map = {
|
|||
# Before adding new types, make sure they are either registered with IANA, at
|
||||
# http://www.isi.edu/in-notes/iana/assignments/media-types
|
||||
# or extensions, i.e. using the x- prefix
|
||||
|
||||
# If you add to these, please keep them sorted!
|
||||
types_map = {
|
||||
'.a' : 'application/octet-stream',
|
||||
'.ai' : 'application/postscript',
|
||||
|
@ -231,21 +268,28 @@ types_map = {
|
|||
'.aiff' : 'audio/x-aiff',
|
||||
'.au' : 'audio/basic',
|
||||
'.avi' : 'video/x-msvideo',
|
||||
'.bat' : 'text/plain',
|
||||
'.bcpio' : 'application/x-bcpio',
|
||||
'.bin' : 'application/octet-stream',
|
||||
'.bmp' : 'image/x-ms-bmp',
|
||||
'.c' : 'text/plain',
|
||||
# Duplicates :(
|
||||
'.cdf' : 'application/x-cdf',
|
||||
'.cdf' : 'application/x-netcdf',
|
||||
'.cpio' : 'application/x-cpio',
|
||||
'.csh' : 'application/x-csh',
|
||||
'.css' : 'text/css',
|
||||
'.dll' : 'application/octet-stream',
|
||||
'.doc' : 'application/msword',
|
||||
'.dot' : 'application/msword',
|
||||
'.dvi' : 'application/x-dvi',
|
||||
'.exe': 'application/octet-stream',
|
||||
'.eml' : 'message/rfc822',
|
||||
'.eps' : 'application/postscript',
|
||||
'.etx' : 'text/x-setext',
|
||||
'.exe' : 'application/octet-stream',
|
||||
'.gif' : 'image/gif',
|
||||
'.gtar' : 'application/x-gtar',
|
||||
'.h' : 'text/plain',
|
||||
'.hdf' : 'application/x-hdf',
|
||||
'.htm' : 'text/html',
|
||||
'.html' : 'text/html',
|
||||
|
@ -254,36 +298,53 @@ types_map = {
|
|||
'.jpeg' : 'image/jpeg',
|
||||
'.jpg' : 'image/jpeg',
|
||||
'.js' : 'application/x-javascript',
|
||||
'.ksh' : 'text/plain',
|
||||
'.latex' : 'application/x-latex',
|
||||
'.m1v' : 'video/mpeg',
|
||||
'.man' : 'application/x-troff-man',
|
||||
'.me' : 'application/x-troff-me',
|
||||
'.mht' : 'message/rfc822',
|
||||
'.mhtml' : 'message/rfc822',
|
||||
'.mif' : 'application/x-mif',
|
||||
'.mov' : 'video/quicktime',
|
||||
'.movie' : 'video/x-sgi-movie',
|
||||
'.mp2' : 'audio/mpeg',
|
||||
'.mp3' : 'audio/mpeg',
|
||||
'.mpa' : 'video/mpeg',
|
||||
'.mpe' : 'video/mpeg',
|
||||
'.mpeg' : 'video/mpeg',
|
||||
'.mpg' : 'video/mpeg',
|
||||
'.ms' : 'application/x-troff-ms',
|
||||
'.nc' : 'application/x-netcdf',
|
||||
'.nws' : 'message/rfc822',
|
||||
'.o' : 'application/octet-stream',
|
||||
'.obj' : 'application/octet-stream',
|
||||
'.oda' : 'application/oda',
|
||||
'.p12' : 'application/x-pkcs12',
|
||||
'.p7c' : 'application/pkcs7-mime',
|
||||
'.pbm' : 'image/x-portable-bitmap',
|
||||
'.pdf' : 'application/pdf',
|
||||
'.pfx' : 'application/x-pkcs12',
|
||||
'.pgm' : 'image/x-portable-graymap',
|
||||
'.pnm': 'image/x-portable-anymap',
|
||||
'.pl' : 'text/plain',
|
||||
'.png' : 'image/png',
|
||||
'.pnm' : 'image/x-portable-anymap',
|
||||
'.pot' : 'application/vnd.ms-powerpoint',
|
||||
'.ppa' : 'application/vnd.ms-powerpoint',
|
||||
'.ppm' : 'image/x-portable-pixmap',
|
||||
'.pps' : 'application/vnd.ms-powerpoint',
|
||||
'.ppt' : 'application/vnd.ms-powerpoint',
|
||||
'.ps' : 'application/postscript',
|
||||
'.pwz' : 'application/vnd.ms-powerpoint',
|
||||
'.py' : 'text/x-python',
|
||||
'.pyc' : 'application/x-python-code',
|
||||
'.pyo' : 'application/x-python-code',
|
||||
'.qt' : 'video/quicktime',
|
||||
'.ra' : 'audio/x-pn-realaudio',
|
||||
'.ram' : 'application/x-pn-realaudio',
|
||||
'.ras' : 'image/x-cmu-raster',
|
||||
'.rgb': 'image/x-rgb',
|
||||
'.rdf' : 'application/xml',
|
||||
'.rgb' : 'image/x-rgb',
|
||||
'.roff' : 'application/x-troff',
|
||||
'.rtx' : 'text/richtext',
|
||||
'.sgm' : 'text/x-sgml',
|
||||
|
@ -307,49 +368,68 @@ types_map = {
|
|||
'.tsv' : 'text/tab-separated-values',
|
||||
'.txt' : 'text/plain',
|
||||
'.ustar' : 'application/x-ustar',
|
||||
'.vcf' : 'text/x-vcard',
|
||||
'.wav' : 'audio/x-wav',
|
||||
'.wiz' : 'application/msword',
|
||||
'.xbm' : 'image/x-xbitmap',
|
||||
'.xlb' : 'application/vnd.ms-excel',
|
||||
# Duplicates :(
|
||||
'.xls' : 'application/excel',
|
||||
'.xls' : 'application/vnd.ms-excel',
|
||||
'.xml' : 'text/xml',
|
||||
'.xsl': 'application/xml',
|
||||
'.xpm' : 'image/x-xpixmap',
|
||||
'.xsl' : 'application/xml',
|
||||
'.xwd' : 'image/x-xwindowdump',
|
||||
'.zip' : 'application/zip',
|
||||
'.mp3': 'audio/mpeg',
|
||||
'.ra': 'audio/x-pn-realaudio',
|
||||
'.pdf': 'application/pdf',
|
||||
'.c': 'text/plain',
|
||||
'.bat': 'text/plain',
|
||||
'.h': 'text/plain',
|
||||
'.pl': 'text/plain',
|
||||
'.ksh': 'text/plain',
|
||||
'.ram': 'application/x-pn-realaudio',
|
||||
'.cdf': 'application/x-cdf',
|
||||
'.doc': 'application/msword',
|
||||
'.dot': 'application/msword',
|
||||
'.wiz': 'application/msword',
|
||||
'.xlb': 'application/vnd.ms-excel',
|
||||
'.xls': 'application/vnd.ms-excel',
|
||||
'.ppa': 'application/vnd.ms-powerpoint',
|
||||
'.ppt': 'application/vnd.ms-powerpoint',
|
||||
'.pps': 'application/vnd.ms-powerpoint',
|
||||
'.pot': 'application/vnd.ms-powerpoint',
|
||||
'.pwz': 'application/vnd.ms-powerpoint',
|
||||
'.eml': 'message/rfc822',
|
||||
'.nws': 'message/rfc822',
|
||||
'.mht': 'message/rfc822',
|
||||
'.mhtml': 'message/rfc822',
|
||||
'.css': 'text/css',
|
||||
'.p7c': 'application/pkcs7-mime',
|
||||
'.p12': 'application/x-pkcs12',
|
||||
'.pfx': 'application/x-pkcs12',
|
||||
'.js': 'application/x-javascript',
|
||||
'.m1v': 'video/mpeg',
|
||||
'.mpa': 'video/mpeg',
|
||||
'.vcf': 'text/x-vcard',
|
||||
'.xml': 'text/xml',
|
||||
}
|
||||
|
||||
# These are non-standard types, commonly found in the wild. They will only
|
||||
# match if strict=0 flag is given to the API methods.
|
||||
|
||||
# Please sort these too
|
||||
common_types = {
|
||||
'.jpg' : 'image/jpg',
|
||||
'.mid' : 'audio/midi',
|
||||
'.midi': 'audio/midi',
|
||||
'.pct' : 'image/pict',
|
||||
'.pic' : 'image/pict',
|
||||
'.pict': 'image/pict',
|
||||
'.rtf' : 'application/rtf',
|
||||
'.xul' : 'text/xul'
|
||||
}
|
||||
|
||||
|
||||
def usage(code, msg=''):
|
||||
print __doc__
|
||||
if msg: print msg
|
||||
sys.exit(code)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
print guess_type(sys.argv[1])
|
||||
import getopt
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], 'hle',
|
||||
['help', 'lenient', 'extension'])
|
||||
except getopt.error, msg:
|
||||
usage(1, msg)
|
||||
|
||||
strict = 1
|
||||
extension = 0
|
||||
for opt, arg in opts:
|
||||
if opt in ('-h', '--help'):
|
||||
usage(0)
|
||||
elif opt in ('-l', '--lenient'):
|
||||
strict = 0
|
||||
elif opt in ('-e', '--extension'):
|
||||
extension = 1
|
||||
for gtype in args:
|
||||
if extension:
|
||||
guess = guess_extension(gtype, strict)
|
||||
if not guess: print "I don't know anything about type", gtype
|
||||
else: print guess
|
||||
else:
|
||||
guess, encoding = guess_type(gtype, strict)
|
||||
if not guess: print "I don't know anything about type", gtype
|
||||
else: print 'type:', guess, 'encoding:', encoding
|
||||
|
|
|
@ -38,6 +38,18 @@ class MimeTypesTestCase(unittest.TestCase):
|
|||
self.assertEqual(self.db.guess_extension("x-application/x-unittest"),
|
||||
".pyunit")
|
||||
|
||||
def test_non_standard_types(self):
|
||||
# First try strict
|
||||
self.assertEqual(self.db.guess_type('foo.xul', strict=1),
|
||||
(None, None))
|
||||
self.assertEqual(self.db.guess_extension('image/jpg', strict=1),
|
||||
None)
|
||||
# And then non-strict
|
||||
self.assertEqual(self.db.guess_type('foo.xul', strict=0),
|
||||
('text/xul', None))
|
||||
self.assertEqual(self.db.guess_extension('image/jpg', strict=0),
|
||||
'.jpg')
|
||||
|
||||
|
||||
def test_main():
|
||||
test_support.run_unittest(MimeTypesTestCase)
|
||||
|
|
Loading…
Reference in New Issue