bpo-28080: Add support for the fallback encoding in ZIP files (GH-32007)

* Add the metadata_encoding parameter in the zipfile.ZipFile constructor.
* Add the --metadata-encoding option in the zipfile CLI.

Co-authored-by: Stephen J. Turnbull <stephen@xemacs.org>
This commit is contained in:
Serhiy Storchaka 2022-03-22 11:52:55 +02:00 committed by GitHub
parent c6cd3cc93c
commit a25a985535
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 211 additions and 11 deletions

View File

@ -139,7 +139,8 @@ ZipFile Objects
.. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True, \
compresslevel=None, *, strict_timestamps=True)
compresslevel=None, *, strict_timestamps=True,
metadata_encoding=None)
Open a ZIP file, where *file* can be a path to a file (a string), a
file-like object or a :term:`path-like object`.
@ -183,6 +184,10 @@ ZipFile Objects
Similar behavior occurs with files newer than 2107-12-31,
the timestamp is also set to the limit.
When mode is ``'r'``, *metadata_encoding* may be set to the name of a codec,
which will be used to decode metadata such as the names of members and ZIP
comments.
If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then
:meth:`closed <close>` without adding any files to the archive, the appropriate
ZIP structures for an empty archive will be written to the file.
@ -194,6 +199,19 @@ ZipFile Objects
with ZipFile('spam.zip', 'w') as myzip:
myzip.write('eggs.txt')
.. note::
*metadata_encoding* is an instance-wide setting for the ZipFile.
It is not currently possible to set this on a per-member basis.
This attribute is a workaround for legacy implementations which produce
archives with names in the current locale encoding or code page (mostly
on Windows). According to the .ZIP standard, the encoding of metadata
may be specified to be either IBM code page (default) or UTF-8 by a flag
in the archive header.
That flag takes precedence over *metadata_encoding*, which is
a Python-specific extension.
.. versionadded:: 3.2
Added the ability to use :class:`ZipFile` as a context manager.
@ -220,6 +238,10 @@ ZipFile Objects
.. versionadded:: 3.8
The *strict_timestamps* keyword-only argument
.. versionchanged:: 3.11
Added support for specifying member name encoding for reading
metadata in the zipfile's directory and file headers.
.. method:: ZipFile.close()
@ -395,6 +417,15 @@ ZipFile Objects
given.
The archive must be open with mode ``'w'``, ``'x'`` or ``'a'``.
.. note::
The ZIP file standard historically did not specify a metadata encoding,
but strongly recommended CP437 (the original IBM PC encoding) for
interoperability. Recent versions allow use of UTF-8 (only). In this
module, UTF-8 will automatically be used to write the member names if
they contain any non-ASCII characters. It is not possible to write
member names in any encoding other than ASCII or UTF-8.
.. note::
Archive names should be relative to the archive root, that is, they should not
@ -868,6 +899,14 @@ Command-line options
Test whether the zipfile is valid or not.
.. cmdoption:: --metadata-encoding <encoding>
Specify encoding of member names for :option:`-l`, :option:`-e` and
:option:`-t`.
.. versionadded:: 3.11
Decompression pitfalls
----------------------

View File

@ -432,6 +432,12 @@ venv
Third party code that also creates new virtual environments should do the same.
(Contributed by Miro Hrončok in :issue:`45413`.)
zipfile
-------
* Added support for specifying member name encoding for reading
metadata in the zipfile's directory and file headers.
(Contributed by Stephen J. Turnbull and Serhiy Storchaka in :issue:`28080`.)
fcntl
-----

View File

@ -21,8 +21,10 @@ from tempfile import TemporaryFile
from random import randint, random, randbytes
from test.support import script_helper
from test.support import (findfile, requires_zlib, requires_bz2,
requires_lzma, captured_stdout, requires_subprocess)
from test.support import (
findfile, requires_zlib, requires_bz2, requires_lzma,
captured_stdout, captured_stderr, requires_subprocess
)
from test.support.os_helper import (
TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count
)
@ -3210,5 +3212,139 @@ class TestPath(unittest.TestCase):
assert isinstance(file, cls)
class EncodedMetadataTests(unittest.TestCase):
file_names = ['\u4e00', '\u4e8c', '\u4e09'] # Han 'one', 'two', 'three'
file_content = [
"This is pure ASCII.\n".encode('ascii'),
# This is modern Japanese. (UTF-8)
"\u3053\u308c\u306f\u73fe\u4ee3\u7684\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('utf-8'),
# This is obsolete Japanese. (Shift JIS)
"\u3053\u308c\u306f\u53e4\u3044\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('shift_jis'),
]
def setUp(self):
self.addCleanup(unlink, TESTFN)
# Create .zip of 3 members with Han names encoded in Shift JIS.
# Each name is 1 Han character encoding to 2 bytes in Shift JIS.
# The ASCII names are arbitrary as long as they are length 2 and
# not otherwise contained in the zip file.
# Data elements are encoded bytes (ascii, utf-8, shift_jis).
placeholders = ["n1", "n2"] + self.file_names[2:]
with zipfile.ZipFile(TESTFN, mode="w") as tf:
for temp, content in zip(placeholders, self.file_content):
tf.writestr(temp, content, zipfile.ZIP_STORED)
# Hack in the Shift JIS names with flag bit 11 (UTF-8) unset.
with open(TESTFN, "rb") as tf:
data = tf.read()
for name, temp in zip(self.file_names, placeholders[:2]):
data = data.replace(temp.encode('ascii'),
name.encode('shift_jis'))
with open(TESTFN, "wb") as tf:
tf.write(data)
def _test_read(self, zipfp, expected_names, expected_content):
# Check the namelist
names = zipfp.namelist()
self.assertEqual(sorted(names), sorted(expected_names))
# Check infolist
infos = zipfp.infolist()
names = [zi.filename for zi in infos]
self.assertEqual(sorted(names), sorted(expected_names))
# check getinfo
for name, content in zip(expected_names, expected_content):
info = zipfp.getinfo(name)
self.assertEqual(info.filename, name)
self.assertEqual(info.file_size, len(content))
self.assertEqual(zipfp.read(name), content)
def test_read_with_metadata_encoding(self):
# Read the ZIP archive with correct metadata_encoding
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
self._test_read(zipfp, self.file_names, self.file_content)
def test_read_without_metadata_encoding(self):
# Read the ZIP archive without metadata_encoding
expected_names = [name.encode('shift_jis').decode('cp437')
for name in self.file_names[:2]] + self.file_names[2:]
with zipfile.ZipFile(TESTFN, "r") as zipfp:
self._test_read(zipfp, expected_names, self.file_content)
def test_read_with_incorrect_metadata_encoding(self):
# Read the ZIP archive with incorrect metadata_encoding
expected_names = [name.encode('shift_jis').decode('koi8-u')
for name in self.file_names[:2]] + self.file_names[2:]
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='koi8-u') as zipfp:
self._test_read(zipfp, expected_names, self.file_content)
def test_read_with_unsuitable_metadata_encoding(self):
# Read the ZIP archive with metadata_encoding unsuitable for
# decoding metadata
with self.assertRaises(UnicodeDecodeError):
zipfile.ZipFile(TESTFN, "r", metadata_encoding='ascii')
with self.assertRaises(UnicodeDecodeError):
zipfile.ZipFile(TESTFN, "r", metadata_encoding='utf-8')
def test_read_after_append(self):
newname = '\u56db' # Han 'four'
expected_names = [name.encode('shift_jis').decode('cp437')
for name in self.file_names[:2]] + self.file_names[2:]
expected_names.append(newname)
expected_content = (*self.file_content, b"newcontent")
with zipfile.ZipFile(TESTFN, "a") as zipfp:
zipfp.writestr(newname, "newcontent")
self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
with zipfile.ZipFile(TESTFN, "r") as zipfp:
self._test_read(zipfp, expected_names, expected_content)
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
for i, (name, content) in enumerate(zip(expected_names, expected_content)):
info = zipfp.getinfo(name)
self.assertEqual(info.filename, name)
self.assertEqual(info.file_size, len(content))
if i < 2:
with self.assertRaises(zipfile.BadZipFile):
zipfp.read(name)
else:
self.assertEqual(zipfp.read(name), content)
def test_write_with_metadata_encoding(self):
ZF = zipfile.ZipFile
for mode in ("w", "x", "a"):
with self.assertRaisesRegex(ValueError,
"^metadata_encoding is only"):
ZF("nonesuch.zip", mode, metadata_encoding="shift_jis")
def test_cli_with_metadata_encoding(self):
errmsg = "Non-conforming encodings not supported with -c."
args = ["--metadata-encoding=shift_jis", "-c", "nonesuch", "nonesuch"]
with captured_stdout() as stdout:
with captured_stderr() as stderr:
self.assertRaises(SystemExit, zipfile.main, args)
self.assertEqual(stdout.getvalue(), "")
self.assertIn(errmsg, stderr.getvalue())
with captured_stdout() as stdout:
zipfile.main(["--metadata-encoding=shift_jis", "-t", TESTFN])
listing = stdout.getvalue()
with captured_stdout() as stdout:
zipfile.main(["--metadata-encoding=shift_jis", "-l", TESTFN])
listing = stdout.getvalue()
for name in self.file_names:
self.assertIn(name, listing)
os.mkdir(TESTFN2)
self.addCleanup(rmtree, TESTFN2)
zipfile.main(["--metadata-encoding=shift_jis", "-e", TESTFN, TESTFN2])
listing = os.listdir(TESTFN2)
for name in self.file_names:
self.assertIn(name, listing)
if __name__ == "__main__":
unittest.main()

View File

@ -480,7 +480,7 @@ class ZipInfo (object):
def _encodeFilenameFlags(self):
try:
return self.filename.encode('ascii'), self.flag_bits
return self.filename.encode('ascii'), self.flag_bits & ~_MASK_UTF_FILENAME
except UnicodeEncodeError:
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
@ -1240,7 +1240,7 @@ class ZipFile:
_windows_illegal_name_trans_table = None
def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
compresslevel=None, *, strict_timestamps=True):
compresslevel=None, *, strict_timestamps=True, metadata_encoding=None):
"""Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
or append 'a'."""
if mode not in ('r', 'w', 'x', 'a'):
@ -1259,6 +1259,12 @@ class ZipFile:
self.pwd = None
self._comment = b''
self._strict_timestamps = strict_timestamps
self.metadata_encoding = metadata_encoding
# Check that we don't try to write with nonconforming codecs
if self.metadata_encoding and mode != 'r':
raise ValueError(
"metadata_encoding is only supported for reading files")
# Check if we were passed a file-like object
if isinstance(file, os.PathLike):
@ -1389,13 +1395,13 @@ class ZipFile:
if self.debug > 2:
print(centdir)
filename = fp.read(centdir[_CD_FILENAME_LENGTH])
flags = centdir[5]
flags = centdir[_CD_FLAG_BITS]
if flags & _MASK_UTF_FILENAME:
# UTF-8 file names extension
filename = filename.decode('utf-8')
else:
# Historical ZIP filename encoding
filename = filename.decode('cp437')
filename = filename.decode(self.metadata_encoding or 'cp437')
# Create ZipInfo instance to store file information
x = ZipInfo(filename)
x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
@ -1572,7 +1578,7 @@ class ZipFile:
# UTF-8 filename
fname_str = fname.decode("utf-8")
else:
fname_str = fname.decode("cp437")
fname_str = fname.decode(self.metadata_encoding or "cp437")
if fname_str != zinfo.orig_filename:
raise BadZipFile(
@ -2461,11 +2467,15 @@ def main(args=None):
help='Create zipfile from sources')
group.add_argument('-t', '--test', metavar='<zipfile>',
help='Test if a zipfile is valid')
parser.add_argument('--metadata-encoding', metavar='<encoding>',
help='Specify encoding of member names for -l, -e and -t')
args = parser.parse_args(args)
encoding = args.metadata_encoding
if args.test is not None:
src = args.test
with ZipFile(src, 'r') as zf:
with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
badfile = zf.testzip()
if badfile:
print("The following enclosed file is corrupted: {!r}".format(badfile))
@ -2473,15 +2483,20 @@ def main(args=None):
elif args.list is not None:
src = args.list
with ZipFile(src, 'r') as zf:
with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
zf.printdir()
elif args.extract is not None:
src, curdir = args.extract
with ZipFile(src, 'r') as zf:
with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
zf.extractall(curdir)
elif args.create is not None:
if encoding:
print("Non-conforming encodings not supported with -c.",
file=sys.stderr)
sys.exit(1)
zip_name = args.create.pop(0)
files = args.create

View File

@ -0,0 +1,4 @@
Add the *metadata_encoding* parameter in the :class:`zipfile.ZipFile`
constructor and the ``--metadata-encoding`` option in the :mod:`zipfile`
CLI to allow reading zipfiles using non-standard codecs to encode the
filenames within the archive.