From d6fa1d4beef2bf9d83048469667e0ba5f2b41068 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 6 May 2024 15:50:52 +0300 Subject: [PATCH] gh-66543: Add mimetypes.guess_file_type() (GH-117258) --- Doc/includes/email-dir.py | 2 +- Doc/library/mimetypes.rst | 31 +++++++- Doc/library/wsgiref.rst | 2 +- Doc/whatsnew/3.13.rst | 11 +++ Lib/http/server.py | 2 +- Lib/mimetypes.py | 34 ++++++-- Lib/test/test_mimetypes.py | 79 +++++++++++++------ ...4-03-26-15-29-39.gh-issue-66543.OZBhU5.rst | 3 + 8 files changed, 129 insertions(+), 35 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-03-26-15-29-39.gh-issue-66543.OZBhU5.rst diff --git a/Doc/includes/email-dir.py b/Doc/includes/email-dir.py index 2fc1570e654..aa2a5c7cda5 100644 --- a/Doc/includes/email-dir.py +++ b/Doc/includes/email-dir.py @@ -53,7 +53,7 @@ must be running an SMTP server. # Guess the content type based on the file's extension. Encoding # will be ignored, although we should check for simple things like # gzip'd or compressed files. - ctype, encoding = mimetypes.guess_type(path) + ctype, encoding = mimetypes.guess_file_type(path) if ctype is None or encoding is not None: # No guess could be made, or the file is encoded (compressed), so # use a generic bag-of-bits type. diff --git a/Doc/library/mimetypes.rst b/Doc/library/mimetypes.rst index f610032acbe..a24eab21d57 100644 --- a/Doc/library/mimetypes.rst +++ b/Doc/library/mimetypes.rst @@ -52,7 +52,22 @@ the information :func:`init` sets up. are also recognized. .. versionchanged:: 3.8 - Added support for url being a :term:`path-like object`. + Added support for *url* being a :term:`path-like object`. + + .. deprecated:: 3.13 + Passing a file path instead of URL is :term:`soft deprecated`. + Use :func:`guess_file_type` for this. + + +.. function:: guess_file_type(path, *, strict=True) + + .. index:: pair: MIME; headers + + Guess the type of a file based on its path, given by *path*. 
+ Similar to the :func:`guess_type` function, but accepts a path instead of URL. + Path can be a string, a bytes object or a :term:`path-like object`. + + .. versionadded:: 3.13 .. function:: guess_all_extensions(type, strict=True) @@ -61,7 +76,7 @@ the information :func:`init` sets up. return value is a list of strings giving all possible filename extensions, including the leading dot (``'.'``). The extensions are not guaranteed to have been associated with any particular data stream, but would be mapped to the MIME - type *type* by :func:`guess_type`. + type *type* by :func:`guess_type` and :func:`guess_file_type`. The optional *strict* argument has the same meaning as with the :func:`guess_type` function. @@ -72,8 +87,8 @@ the information :func:`init` sets up. return value is a string giving a filename extension, including the leading dot (``'.'``). The extension is not guaranteed to have been associated with any particular data stream, but would be mapped to the MIME type *type* by - :func:`guess_type`. If no extension can be guessed for *type*, ``None`` is - returned. + :func:`guess_type` and :func:`guess_file_type`. + If no extension can be guessed for *type*, ``None`` is returned. The optional *strict* argument has the same meaning as with the :func:`guess_type` function. @@ -238,6 +253,14 @@ than one MIME-type database; it provides an interface similar to the one of the the object. + .. method:: MimeTypes.guess_file_type(path, *, strict=True) + + Similar to the :func:`guess_file_type` function, using the tables stored + as part of the object. + + .. versionadded:: 3.13 + + .. 
method:: MimeTypes.guess_all_extensions(type, strict=True) Similar to the :func:`guess_all_extensions` function, using the tables stored diff --git a/Doc/library/wsgiref.rst b/Doc/library/wsgiref.rst index c2b0ba70469..7fe84a2de1f 100644 --- a/Doc/library/wsgiref.rst +++ b/Doc/library/wsgiref.rst @@ -865,7 +865,7 @@ directory and port number (default: 8000) on the command line:: fn = os.path.join(path, environ["PATH_INFO"][1:]) if "." not in fn.split(os.path.sep)[-1]: fn = os.path.join(fn, "index.html") - mime_type = mimetypes.guess_type(fn)[0] + mime_type = mimetypes.guess_file_type(fn)[0] # Return 200 OK if file exists, otherwise 404 Not Found if os.path.exists(fn): diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 558565ccbbe..0b75665ab9c 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -623,6 +623,13 @@ math "fusedMultiplyAdd" operation for special cases. (Contributed by Mark Dickinson and Victor Stinner in :gh:`73468`.) +mimetypes +--------- + +* Add the :func:`~mimetypes.guess_file_type` function which works with file path. + Passing file path instead of URL in :func:`~mimetypes.guess_type` is :term:`soft deprecated`. + (Contributed by Serhiy Storchaka in :gh:`66543`.) + mmap ---- @@ -1167,6 +1174,10 @@ Deprecated .. Add deprecations above alphabetically, not here at the end. +* Passing file path instead of URL in :func:`~mimetypes.guess_type` is :term:`soft deprecated`. + Use :func:`~mimetypes.guess_file_type` instead. + (Contributed by Serhiy Storchaka in :gh:`66543`.) 
+ Pending Removal in Python 3.14 ------------------------------ diff --git a/Lib/http/server.py b/Lib/http/server.py index ee7a9b6aa55..7d0da5052d2 100644 --- a/Lib/http/server.py +++ b/Lib/http/server.py @@ -897,7 +897,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): ext = ext.lower() if ext in self.extensions_map: return self.extensions_map[ext] - guess, _ = mimetypes.guess_type(path) + guess, _ = mimetypes.guess_file_type(path) if guess: return guess return 'application/octet-stream' diff --git a/Lib/mimetypes.py b/Lib/mimetypes.py index 65a049ae994..8604000ed77 100644 --- a/Lib/mimetypes.py +++ b/Lib/mimetypes.py @@ -40,7 +40,7 @@ except ImportError: __all__ = [ "knownfiles", "inited", "MimeTypes", - "guess_type", "guess_all_extensions", "guess_extension", + "guess_type", "guess_file_type", "guess_all_extensions", "guess_extension", "add_type", "init", "read_mime_types", "suffix_map", "encodings_map", "types_map", "common_types" ] @@ -119,14 +119,14 @@ class MimeTypes: Optional `strict' argument when False adds a bunch of commonly found, but non-standard types. """ + # TODO: Deprecate accepting file paths (in particular path-like objects). url = os.fspath(url) p = urllib.parse.urlparse(url) if p.scheme and len(p.scheme) > 1: scheme = p.scheme url = p.path else: - scheme = None - url = os.path.splitdrive(url)[1] + return self.guess_file_type(url, strict=strict) if scheme == 'data': # syntax of data URLs: # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data @@ -146,13 +146,25 @@ class MimeTypes: if '=' in type or '/' not in type: type = 'text/plain' return type, None # never compressed, so encoding is None - base, ext = posixpath.splitext(url) + return self._guess_file_type(url, strict, posixpath.splitext) + + def guess_file_type(self, path, *, strict=True): + """Guess the type of a file based on its path. + + Similar to guess_type(), but takes file path instead of URL. 
+ """ + path = os.fsdecode(path) + path = os.path.splitdrive(path)[1] + return self._guess_file_type(path, strict, os.path.splitext) + + def _guess_file_type(self, path, strict, splitext): + base, ext = splitext(path) while (ext_lower := ext.lower()) in self.suffix_map: - base, ext = posixpath.splitext(base + self.suffix_map[ext_lower]) + base, ext = splitext(base + self.suffix_map[ext_lower]) # encodings_map is case sensitive if ext in self.encodings_map: encoding = self.encodings_map[ext] - base, ext = posixpath.splitext(base) + base, ext = splitext(base) else: encoding = None ext = ext.lower() @@ -310,6 +322,16 @@ def guess_type(url, strict=True): return _db.guess_type(url, strict) +def guess_file_type(path, *, strict=True): + """Guess the type of a file based on its path. + + Similar to guess_type(), but takes file path instead of URL. + """ + if _db is None: + init() + return _db.guess_file_type(path, strict=strict) + + def guess_all_extensions(type, strict=True): """Guess the extensions for a file based on its MIME type. 
diff --git a/Lib/test/test_mimetypes.py b/Lib/test/test_mimetypes.py index bf6eae7d0ac..2e0ad0606ae 100644 --- a/Lib/test/test_mimetypes.py +++ b/Lib/test/test_mimetypes.py @@ -36,20 +36,28 @@ class MimeTypesTestCase(unittest.TestCase): def test_case_sensitivity(self): eq = self.assertEqual - eq(self.db.guess_type("foobar.HTML"), self.db.guess_type("foobar.html")) - eq(self.db.guess_type("foobar.TGZ"), self.db.guess_type("foobar.tgz")) - eq(self.db.guess_type("foobar.tar.Z"), ("application/x-tar", "compress")) - eq(self.db.guess_type("foobar.tar.z"), (None, None)) + eq(self.db.guess_file_type("foobar.html"), ("text/html", None)) + eq(self.db.guess_type("scheme:foobar.html"), ("text/html", None)) + eq(self.db.guess_file_type("foobar.HTML"), ("text/html", None)) + eq(self.db.guess_type("scheme:foobar.HTML"), ("text/html", None)) + eq(self.db.guess_file_type("foobar.tgz"), ("application/x-tar", "gzip")) + eq(self.db.guess_type("scheme:foobar.tgz"), ("application/x-tar", "gzip")) + eq(self.db.guess_file_type("foobar.TGZ"), ("application/x-tar", "gzip")) + eq(self.db.guess_type("scheme:foobar.TGZ"), ("application/x-tar", "gzip")) + eq(self.db.guess_file_type("foobar.tar.Z"), ("application/x-tar", "compress")) + eq(self.db.guess_type("scheme:foobar.tar.Z"), ("application/x-tar", "compress")) + eq(self.db.guess_file_type("foobar.tar.z"), (None, None)) + eq(self.db.guess_type("scheme:foobar.tar.z"), (None, None)) def test_default_data(self): eq = self.assertEqual - eq(self.db.guess_type("foo.html"), ("text/html", None)) - eq(self.db.guess_type("foo.HTML"), ("text/html", None)) - eq(self.db.guess_type("foo.tgz"), ("application/x-tar", "gzip")) - eq(self.db.guess_type("foo.tar.gz"), ("application/x-tar", "gzip")) - eq(self.db.guess_type("foo.tar.Z"), ("application/x-tar", "compress")) - eq(self.db.guess_type("foo.tar.bz2"), ("application/x-tar", "bzip2")) - eq(self.db.guess_type("foo.tar.xz"), ("application/x-tar", "xz")) + eq(self.db.guess_file_type("foo.html"), 
("text/html", None)) + eq(self.db.guess_file_type("foo.HTML"), ("text/html", None)) + eq(self.db.guess_file_type("foo.tgz"), ("application/x-tar", "gzip")) + eq(self.db.guess_file_type("foo.tar.gz"), ("application/x-tar", "gzip")) + eq(self.db.guess_file_type("foo.tar.Z"), ("application/x-tar", "compress")) + eq(self.db.guess_file_type("foo.tar.bz2"), ("application/x-tar", "bzip2")) + eq(self.db.guess_file_type("foo.tar.xz"), ("application/x-tar", "xz")) def test_data_urls(self): eq = self.assertEqual @@ -63,7 +71,7 @@ class MimeTypesTestCase(unittest.TestCase): eq = self.assertEqual sio = io.StringIO("x-application/x-unittest pyunit\n") self.db.readfp(sio) - eq(self.db.guess_type("foo.pyunit"), + eq(self.db.guess_file_type("foo.pyunit"), ("x-application/x-unittest", None)) eq(self.db.guess_extension("x-application/x-unittest"), ".pyunit") @@ -95,12 +103,12 @@ class MimeTypesTestCase(unittest.TestCase): def test_non_standard_types(self): eq = self.assertEqual # First try strict - eq(self.db.guess_type('foo.xul', strict=True), (None, None)) + eq(self.db.guess_file_type('foo.xul', strict=True), (None, None)) eq(self.db.guess_extension('image/jpg', strict=True), None) # And then non-strict - eq(self.db.guess_type('foo.xul', strict=False), ('text/xul', None)) - eq(self.db.guess_type('foo.XUL', strict=False), ('text/xul', None)) - eq(self.db.guess_type('foo.invalid', strict=False), (None, None)) + eq(self.db.guess_file_type('foo.xul', strict=False), ('text/xul', None)) + eq(self.db.guess_file_type('foo.XUL', strict=False), ('text/xul', None)) + eq(self.db.guess_file_type('foo.invalid', strict=False), (None, None)) eq(self.db.guess_extension('image/jpg', strict=False), '.jpg') eq(self.db.guess_extension('image/JPG', strict=False), '.jpg') @@ -124,15 +132,26 @@ class MimeTypesTestCase(unittest.TestCase): '//share/server/', '\\\\share\\server\\'): path = prefix + name with self.subTest(path=path): + eq(self.db.guess_file_type(path), gzip_expected) 
eq(self.db.guess_type(path), gzip_expected) expected = (None, None) if os.name == 'nt' else gzip_expected for prefix in ('//', '\\\\', '//share/', '\\\\share\\'): path = prefix + name with self.subTest(path=path): + eq(self.db.guess_file_type(path), expected) eq(self.db.guess_type(path), expected) + eq(self.db.guess_file_type(r" \"\`;b&b&c |.tar.gz"), gzip_expected) eq(self.db.guess_type(r" \"\`;b&b&c |.tar.gz"), gzip_expected) + eq(self.db.guess_file_type(r'foo/.tar.gz'), (None, 'gzip')) + eq(self.db.guess_type(r'foo/.tar.gz'), (None, 'gzip')) + expected = (None, 'gzip') if os.name == 'nt' else gzip_expected + eq(self.db.guess_file_type(r'foo\.tar.gz'), expected) + eq(self.db.guess_type(r'foo\.tar.gz'), expected) + eq(self.db.guess_type(r'scheme:foo\.tar.gz'), gzip_expected) + def test_url(self): + result = self.db.guess_type('http://example.com/host.html') result = self.db.guess_type('http://host.html') msg = 'URL only has a host name, not a file' self.assertSequenceEqual(result, (None, None), msg) @@ -242,22 +261,38 @@ class MimeTypesTestCase(unittest.TestCase): def test_path_like_ob(self): filename = "LICENSE.txt" - filepath = pathlib.Path(filename) - filepath_with_abs_dir = pathlib.Path('/dir/'+filename) - filepath_relative = pathlib.Path('../dir/'+filename) - path_dir = pathlib.Path('./') + filepath = os_helper.FakePath(filename) + filepath_with_abs_dir = os_helper.FakePath('/dir/'+filename) + filepath_relative = os_helper.FakePath('../dir/'+filename) + path_dir = os_helper.FakePath('./') - expected = self.db.guess_type(filename) + expected = self.db.guess_file_type(filename) + self.assertEqual(self.db.guess_file_type(filepath), expected) self.assertEqual(self.db.guess_type(filepath), expected) + self.assertEqual(self.db.guess_file_type( + filepath_with_abs_dir), expected) self.assertEqual(self.db.guess_type( filepath_with_abs_dir), expected) + self.assertEqual(self.db.guess_file_type(filepath_relative), expected) 
self.assertEqual(self.db.guess_type(filepath_relative), expected) + + self.assertEqual(self.db.guess_file_type(path_dir), (None, None)) self.assertEqual(self.db.guess_type(path_dir), (None, None)) + def test_bytes_path(self): + self.assertEqual(self.db.guess_file_type(b'foo.html'), + self.db.guess_file_type('foo.html')) + self.assertEqual(self.db.guess_file_type(b'foo.tar.gz'), + self.db.guess_file_type('foo.tar.gz')) + self.assertEqual(self.db.guess_file_type(b'foo.tgz'), + self.db.guess_file_type('foo.tgz')) + def test_keywords_args_api(self): + self.assertEqual(self.db.guess_file_type( + path="foo.html", strict=True), ("text/html", None)) self.assertEqual(self.db.guess_type( - url="foo.html", strict=True), ("text/html", None)) + url="scheme:foo.html", strict=True), ("text/html", None)) self.assertEqual(self.db.guess_all_extensions( type='image/jpg', strict=True), []) self.assertEqual(self.db.guess_extension( diff --git a/Misc/NEWS.d/next/Library/2024-03-26-15-29-39.gh-issue-66543.OZBhU5.rst b/Misc/NEWS.d/next/Library/2024-03-26-15-29-39.gh-issue-66543.OZBhU5.rst new file mode 100644 index 00000000000..12ea5085814 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-03-26-15-29-39.gh-issue-66543.OZBhU5.rst @@ -0,0 +1,3 @@ +Add the :func:`mimetypes.guess_file_type` function which works with file +path. Passing file path instead of URL in :func:`~mimetypes.guess_type` is +:term:`soft deprecated`.