Refactor so that it is easier to work with alternate MIME types databases,

and programmatically extend the database in different ways. This closes the SF bug (feature request) #439710.
2001-08-03 21:01:44 +00:00 · 2001-08-03 21:01:44 +00:00 · eeee4ec4f1
parent e861365dab
commit eeee4ec4f1
1 changed files with 138 additions and 67 deletions
--- a/Lib/mimetypes.py
+++ b/Lib/mimetypes.py
@ -12,7 +12,7 @@ Data:

 knownfiles -- list of files to parse
 inited -- flag set when init() has been called
-suffixes_map -- dictionary mapping suffixes to suffixes
+suffix_map -- dictionary mapping suffixes to suffixes
 encodings_map -- dictionary mapping suffixes to encodings
 types_map -- dictionary mapping suffixes to types

@ -23,6 +23,7 @@ read_mime_types(file) -- parse one file, return a dictionary or None

 """

+import os
 import posixpath
 import urllib

@ -37,6 +38,117 @@ knownfiles = [

 inited = 0

+
+class MimeTypes:
+    """MIME-types datastore.
+
+    This datastore can handle information from mime.types-style files
+    and supports basic determination of MIME type from a filename or
+    URL, and can guess a reasonable extension given a MIME type.
+    """
+
+    def __init__(self, filenames=()):
+        if not inited:
+            init()
+        self.encodings_map = encodings_map.copy()
+        self.suffix_map = suffix_map.copy()
+        self.types_map = types_map.copy()
+        for name in filenames:
+            self.read(name)
+
+    def guess_type(self, url):
+        """Guess the type of a file based on its URL.
+
+        Return value is a tuple (type, encoding) where type is None if
+        the type can't be guessed (no or unknown suffix) or a string
+        of the form type/subtype, usable for a MIME Content-type
+        header; and encoding is None for no encoding or the name of
+        the program used to encode (e.g. compress or gzip).  The
+        mappings are table driven.  Encoding suffixes are case
+        sensitive; type suffixes are first tried case sensitive, then
+        case insensitive.
+
+        The suffixes .tgz, .taz and .tz (case sensitive!) are all
+        mapped to '.tar.gz'.  (This is table-driven too, using the
+        dictionary suffix_map.)
+        """
+        scheme, url = urllib.splittype(url)
+        if scheme == 'data':
+            # syntax of data URLs:
+            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
+            # mediatype := [ type "/" subtype ] *( ";" parameter )
+            # data      := *urlchar
+            # parameter := attribute "=" value
+            # type/subtype defaults to "text/plain"
+            comma = url.find(',')
+            if comma < 0:
+                # bad data URL
+                return None, None
+            semi = url.find(';', 0, comma)
+            if semi >= 0:
+                type = url[:semi]
+            else:
+                type = url[:comma]
+            if '=' in type or '/' not in type:
+                type = 'text/plain'
+            return type, None           # never compressed, so encoding is None
+        base, ext = posixpath.splitext(url)
+        while self.suffix_map.has_key(ext):
+            base, ext = posixpath.splitext(base + self.suffix_map[ext])
+        if self.encodings_map.has_key(ext):
+            encoding = self.encodings_map[ext]
+            base, ext = posixpath.splitext(base)
+        else:
+            encoding = None
+        types_map = self.types_map
+        if types_map.has_key(ext):
+            return types_map[ext], encoding
+        elif types_map.has_key(ext.lower()):
+            return types_map[ext.lower()], encoding
+        else:
+            return None, encoding
+
+    def guess_extension(self, type):
+        """Guess the extension for a file based on its MIME type.
+
+        Return value is a string giving a filename extension,
+        including the leading dot ('.').  The extension is not
+        guaranteed to have been associated with any particular data
+        stream, but would be mapped to the MIME type `type' by
+        guess_type().  If no extension can be guessed for `type', None
+        is returned.
+        """
+        type = type.lower()
+        for ext, stype in self.types_map.items():
+            if type == stype:
+                return ext
+        return None
+
+    def read(self, filename):
+        """Read a single mime.types-format file, specified by pathname."""
+        fp = open(filename)
+        self.readfp(fp)
+        fp.close()
+
+    def readfp(self, fp):
+        """Read mime.types-format data from the open file object fp."""
+        map = self.types_map
+        while 1:
+            line = fp.readline()
+            if not line:
+                break
+            words = line.split()
+            for i in range(len(words)):
+                if words[i][0] == '#':
+                    del words[i:]       # '#' starts a comment; drop the rest
+                    break
+            if not words:
+                continue
+            type, suffixes = words[0], words[1:]
+            for suff in suffixes:
+                map['.' + suff] = type  # later entries override earlier ones
+
+
 def guess_type(url):
    """Guess the type of a file based on its URL.

@ -51,44 +163,10 @@ def guess_type(url):
    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).
-
    """
-    if not inited:
-        init()
-    scheme, url = urllib.splittype(url)
-    if scheme == 'data':
-        # syntax of data URLs:
-        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
-        # mediatype := [ type "/" subtype ] *( ";" parameter )
-        # data      := *urlchar
-        # parameter := attribute "=" value
-        # type/subtype defaults to "text/plain"
-        comma = url.find(',')
-        if comma < 0:
-            # bad data URL
-            return None, None
-        semi = url.find(';', 0, comma)
-        if semi >= 0:
-            type = url[:semi]
-        else:
-            type = url[:comma]
-        if '=' in type or '/' not in type:
-            type = 'text/plain'
-        return type, None               # never compressed, so encoding is None
-    base, ext = posixpath.splitext(url)
-    while suffix_map.has_key(ext):
-        base, ext = posixpath.splitext(base + suffix_map[ext])
-    if encodings_map.has_key(ext):
-        encoding = encodings_map[ext]
-        base, ext = posixpath.splitext(base)
-    else:
-        encoding = None
-    if types_map.has_key(ext):
-        return types_map[ext], encoding
-    elif types_map.has_key(ext.lower()):
-        return types_map[ext.lower()], encoding
-    else:
-        return None, encoding
+    init()
+    return guess_type(url)
+

 def guess_extension(type):
    """Guess the extension for a file based on its MIME type.
@ -99,50 +177,43 @@ def guess_extension(type):
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.
    """
-    global inited
-    if not inited:
-        init()
-    type = type.lower()
-    for ext, stype in types_map.items():
-        if type == stype:
-            return ext
-    return None
+    init()
+    return guess_extension(type)
+

 def init(files=None):
+    global guess_extension, guess_type
+    global suffix_map, types_map, encodings_map
     global inited
-    for file in files or knownfiles:
-        s = read_mime_types(file)
-        if s:
-            for key, value in s.items():
-                types_map[key] = value
     inited = 1
+    db = MimeTypes()
+    if files is None:
+        files = knownfiles
+    for file in files:
+        if os.path.isfile(file):
+            db.read(file)
+    encodings_map = db.encodings_map
+    suffix_map = db.suffix_map
+    types_map = db.types_map
+    guess_extension = db.guess_extension
+    guess_type = db.guess_type
+

 def read_mime_types(file):
    try:
        f = open(file)
    except IOError:
        return None
-    map = {}
-    while 1:
-        line = f.readline()
-        if not line: break
-        words = line.split()
-        for i in range(len(words)):
-            if words[i][0] == '#':
-                del words[i:]
-                break
-        if not words: continue
-        type, suffixes = words[0], words[1:]
-        for suff in suffixes:
-            map['.'+suff] = type
-    f.close()
-    return map
+    db = MimeTypes()
+    db.readfp(f)
+    return db.types_map
+

 suffix_map = {
    '.tgz': '.tar.gz',
    '.taz': '.tar.gz',
    '.tz': '.tar.gz',
-}
+    }

 encodings_map = {
    '.gz': 'gzip',