# cpython/Lib/mimetypes.py

"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=1) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
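types_map -- dictionary mapping suffixes to types
common_types -- dictionary mapping suffixes to non-standard types,
                consulted only when strict is false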

Functions:

init([files]) -- parse a list of files, default knownfiles
read_mime_types(file) -- parse one file, return a dictionary or None
"""

import os
import posixpath
import urllib

# Names exported by "from mimetypes import *".
__all__ = [
    "guess_type",
    "guess_extension",
    "read_mime_types",
    "init",
    ]

# Candidate mime.types files, parsed in order by init(); entries from a
# later file override those from an earlier one.  Each path is listed
# exactly once: the Apache 1.2 path used to appear twice, which only
# made init() parse the same file a second time.  It keeps its later
# position so it still takes precedence over the Netscape file.
knownfiles = [
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Flag set to a true value once init() has been called.
inited = 0


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Each instance works on its own copies of the module-level tables
    (encodings_map, suffix_map, types_map and common_types), so files
    read into one instance do not change those tables.
    """

    def __init__(self, filenames=()):
        """Copy the module-level tables, then read each of `filenames'.

        init() is called first if the module has not been initialized
        yet.  Every name in `filenames' is passed to read().
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = types_map.copy()
        self.common_types = common_types.copy()
        for name in filenames:
            self.read(name)

    def guess_type(self, url, strict=1):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mime_type = url[:semi]
            else:
                mime_type = url[:comma]
            # A leading parameter (contains '=') or anything without a
            # '/' is not a type/subtype pair: fall back to the default.
            if '=' in mime_type or '/' not in mime_type:
                mime_type = 'text/plain'
            return mime_type, None      # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # extension is no longer an alias.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        # Peel off an encoding suffix so that the type is looked up on
        # the extension underneath it.
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        types_map = self.types_map
        common_types = self.common_types
        lowered = ext.lower()
        if ext in types_map:
            return types_map[ext], encoding
        if lowered in types_map:
            return types_map[lowered], encoding
        if strict:
            return None, encoding
        # Non-strict mode: also consult the non-standard table.
        if ext in common_types:
            return common_types[ext], encoding
        if lowered in common_types:
            return common_types[lowered], encoding
        return None, encoding

    def guess_extension(self, type, strict=1):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        for ext, stype in self.types_map.items():
            if type == stype:
                return ext
        if not strict:
            # Use this instance's table, not the module-level one, so
            # that per-instance data is honoured.
            for ext, stype in self.common_types.items():
                if type == stype:
                    return ext
        return None

    def read(self, filename):
        """Read a single mime.types-format file, specified by pathname."""
        fp = open(filename)
        try:
            self.readfp(fp)
        finally:
            # Close the file even when parsing raises.
            fp.close()

    def readfp(self, fp):
        """Read a single mime.types-format file.

        `fp' is an open file-like object; only its readline() method is
        used.  Each non-empty line has the form "type suffix ...", and a
        word starting with '#' begins a comment running to the end of
        the line.  Every suffix is stored in types_map with a leading
        dot, replacing any existing entry for that suffix.
        """
        table = self.types_map
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first word starting with '#' and everything after.
            for i in range(len(words)):
                if words[i][0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            mime_type, suffixes = words[0], words[1:]
            for suff in suffixes:
                table['.' + suff] = mime_type


def guess_type(url, strict=1):
    """Guess the MIME type and encoding of a file from its URL.

    The result is a (type, encoding) tuple.  `type' is a string of the
    form type/subtype, usable for a MIME Content-type header, or None
    when it can't be guessed (no suffix, or an unknown one).
    `encoding' is the name of the program used to encode the file
    (e.g. compress or gzip), or None for no encoding.  All mappings are
    table driven.  Encoding suffixes are matched case sensitively; type
    suffixes are tried case sensitively first, then case insensitively.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all treated
    as ".tar.gz"; this too is table driven, through the dictionary
    suffix_map.

    When the optional `strict' argument is false, a set of commonly
    found but non-standard types is searched as well.
    """
    init()
    # init() rebinds the global name guess_type to the bound method of
    # the datastore it builds, so the lookup below finds that method and
    # does not recurse into this function.
    finder = guess_type
    return finder(url, strict)


def guess_extension(type, strict=1):
    """Guess a filename extension for the MIME type `type'.

    The result is an extension string including its leading dot ('.'),
    or None when no extension can be guessed.  The extension is not
    guaranteed to have been associated with any particular data
    stream, but guess_type() would map it back to `type'.

    When the optional `strict' argument is false, a set of commonly
    found but non-standard types is searched as well.
    """
    init()
    # init() rebinds the global name guess_extension to the bound
    # method of the datastore it builds, so the lookup below finds that
    # method and does not recurse into this function.
    finder = guess_extension
    return finder(type, strict)


def init(files=None):
    """(Re)build the module-level tables and lookup functions.

    A new MimeTypes datastore is created and every existing regular
    file named in `files' (default: knownfiles) is read into it.  The
    globals encodings_map, suffix_map, types_map and common_types are
    then rebound to the datastore's tables, and guess_type and
    guess_extension to its bound methods.
    """
    global guess_extension, guess_type
    global suffix_map, types_map, encodings_map, common_types
    global inited
    # Set the flag before building the datastore: MimeTypes.__init__
    # calls init() while the flag is clear, which would recurse forever.
    inited = 1
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for filename in files:
        if os.path.isfile(filename):
            fp = open(filename)
            try:
                db.readfp(fp)
            finally:
                # Do not leak the handle, even if parsing fails.
                fp.close()
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map
    guess_extension = db.guess_extension
    guess_type = db.guess_type
    common_types = db.common_types


def read_mime_types(file):
    """Parse one mime.types-format file, given by pathname.

    Return the types_map dictionary of a new MimeTypes datastore after
    the file has been read into it, or None if the file cannot be
    opened.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f)
    finally:
        # The original never closed the file; release it in all cases.
        f.close()
    return db.types_map


# Shorthand suffixes: each key is replaced by the compound suffix it
# maps to before the type lookup takes place.
suffix_map = {'.tgz': '.tar.gz', '.taz': '.tar.gz', '.tz': '.tar.gz'}

# Suffixes naming an encoding applied on top of the real file type.
encodings_map = {'.gz': 'gzip', '.Z': 'compress'}

# Before adding new types, make sure they are either registered with IANA, at
# http://www.isi.edu/in-notes/iana/assignments/media-types
# or extensions, i.e. using the x- prefix

# If you add to these, please keep them sorted!
# Maps a suffix (with its leading dot) to a MIME type.  Each suffix
# appears once: a dict literal silently keeps only the last value for a
# repeated key, so the two entries that used to be shadowed are kept
# below as comments only.
types_map = {
    '.a'      : 'application/octet-stream',
    '.ai'     : 'application/postscript',
    '.aif'    : 'audio/x-aiff',
    '.aifc'   : 'audio/x-aiff',
    '.aiff'   : 'audio/x-aiff',
    '.au'     : 'audio/basic',
    '.avi'    : 'video/x-msvideo',
    '.bat'    : 'text/plain',
    '.bcpio'  : 'application/x-bcpio',
    '.bin'    : 'application/octet-stream',
    '.bmp'    : 'image/x-ms-bmp',
    '.c'      : 'text/plain',
    # '.cdf' is also seen as 'application/x-cdf'; that entry was
    # always overridden by the one below.
    '.cdf'    : 'application/x-netcdf',
    '.cpio'   : 'application/x-cpio',
    '.csh'    : 'application/x-csh',
    '.css'    : 'text/css',
    '.dll'    : 'application/octet-stream',
    '.doc'    : 'application/msword',
    '.dot'    : 'application/msword',
    '.dvi'    : 'application/x-dvi',
    '.eml'    : 'message/rfc822',
    '.eps'    : 'application/postscript',
    '.etx'    : 'text/x-setext',
    '.exe'    : 'application/octet-stream',
    '.gif'    : 'image/gif',
    '.gtar'   : 'application/x-gtar',
    '.h'      : 'text/plain',
    '.hdf'    : 'application/x-hdf',
    '.htm'    : 'text/html',
    '.html'   : 'text/html',
    '.ief'    : 'image/ief',
    '.jpe'    : 'image/jpeg',
    '.jpeg'   : 'image/jpeg',
    '.jpg'    : 'image/jpeg',
    '.js'     : 'application/x-javascript',
    '.ksh'    : 'text/plain',
    '.latex'  : 'application/x-latex',
    '.m1v'    : 'video/mpeg',
    '.man'    : 'application/x-troff-man',
    '.me'     : 'application/x-troff-me',
    '.mht'    : 'message/rfc822',
    '.mhtml'  : 'message/rfc822',
    '.mif'    : 'application/x-mif',
    '.mov'    : 'video/quicktime',
    '.movie'  : 'video/x-sgi-movie',
    '.mp2'    : 'audio/mpeg',
    '.mp3'    : 'audio/mpeg',
    '.mpa'    : 'video/mpeg',
    '.mpe'    : 'video/mpeg',
    '.mpeg'   : 'video/mpeg',
    '.mpg'    : 'video/mpeg',
    '.ms'     : 'application/x-troff-ms',
    '.nc'     : 'application/x-netcdf',
    '.nws'    : 'message/rfc822',
    '.o'      : 'application/octet-stream',
    '.obj'    : 'application/octet-stream',
    '.oda'    : 'application/oda',
    '.p12'    : 'application/x-pkcs12',
    '.p7c'    : 'application/pkcs7-mime',
    '.pbm'    : 'image/x-portable-bitmap',
    '.pdf'    : 'application/pdf',
    '.pfx'    : 'application/x-pkcs12',
    '.pgm'    : 'image/x-portable-graymap',
    '.pl'     : 'text/plain',
    '.png'    : 'image/png',
    '.pnm'    : 'image/x-portable-anymap',
    '.pot'    : 'application/vnd.ms-powerpoint',
    '.ppa'    : 'application/vnd.ms-powerpoint',
    '.ppm'    : 'image/x-portable-pixmap',
    '.pps'    : 'application/vnd.ms-powerpoint',
    '.ppt'    : 'application/vnd.ms-powerpoint',
    '.ps'     : 'application/postscript',
    '.pwz'    : 'application/vnd.ms-powerpoint',
    '.py'     : 'text/x-python',
    '.pyc'    : 'application/x-python-code',
    '.pyo'    : 'application/x-python-code',
    '.qt'     : 'video/quicktime',
    '.ra'     : 'audio/x-pn-realaudio',
    '.ram'    : 'application/x-pn-realaudio',
    '.ras'    : 'image/x-cmu-raster',
    '.rdf'    : 'application/xml',
    '.rgb'    : 'image/x-rgb',
    '.roff'   : 'application/x-troff',
    '.rtx'    : 'text/richtext',
    '.sgm'    : 'text/x-sgml',
    '.sgml'   : 'text/x-sgml',
    '.sh'     : 'application/x-sh',
    '.shar'   : 'application/x-shar',
    '.snd'    : 'audio/basic',
    '.so'     : 'application/octet-stream',
    '.src'    : 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc' : 'application/x-sv4crc',
    '.t'      : 'application/x-troff',
    '.tar'    : 'application/x-tar',
    '.tcl'    : 'application/x-tcl',
    '.tex'    : 'application/x-tex',
    '.texi'   : 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif'    : 'image/tiff',
    '.tiff'   : 'image/tiff',
    '.tr'     : 'application/x-troff',
    '.tsv'    : 'text/tab-separated-values',
    '.txt'    : 'text/plain',
    '.ustar'  : 'application/x-ustar',
    '.vcf'    : 'text/x-vcard',
    '.wav'    : 'audio/x-wav',
    '.wiz'    : 'application/msword',
    '.xbm'    : 'image/x-xbitmap',
    '.xlb'    : 'application/vnd.ms-excel',
    # '.xls' is also seen as 'application/excel'; that entry was
    # always overridden by the one below.
    '.xls'    : 'application/vnd.ms-excel',
    '.xml'    : 'text/xml',
    '.xpm'    : 'image/x-xpixmap',
    '.xsl'    : 'application/xml',
    '.xwd'    : 'image/x-xwindowdump',
    '.zip'    : 'application/zip',
    }

# These are non-standard types, commonly found in the wild.  They will only
# match if strict=0 flag is given to the API methods.

# Please sort these too
# Suffix -> non-standard MIME type; looked up only in non-strict mode.
common_types = {
    '.jpg': 'image/jpg',
    '.mid': 'audio/midi',
    '.midi': 'audio/midi',
    '.pct': 'image/pict',
    '.pic': 'image/pict',
    '.pict': 'image/pict',
    '.rtf': 'application/rtf',
    '.xul': 'text/xul',
}


if __name__ == '__main__':
    import sys
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        print USAGE
        if msg: print msg
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error, msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess: print "I don't know anything about type", gtype
            else: print guess
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess: print "I don't know anything about type", gtype
            else: print 'type:', guess, 'encoding:', encoding