From 2da9363f253e5086e45b677e4edc916f04bab3db Mon Sep 17 00:00:00 2001 From: John Jolly Date: Sat, 30 Dec 2017 11:07:51 -0700 Subject: [PATCH] bpo-28494: Improve zipfile.is_zipfile reliability The zipfile.is_zipfile function would only search for the EndOfZipfile section header. This failed to correctly identify non-zipfiles that contained this header. Now the zipfile.is_zipfile function verifies the first central directory entry. Changes: * Extended zipfile.is_zipfile to verify zipfile catalog * Added tests to validate failure of binary non-zipfiles --- Lib/test/test_zipfile.py | 19 +++++++++++++++++++ Lib/zipfile.py | 14 ++++++++++++-- .../2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst | 13 +++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index bf5bb4d0f13..9fd510559a3 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -1411,6 +1411,25 @@ class OtherTests(unittest.TestCase): self.assertFalse(zipfile.is_zipfile(fp)) fp.seek(0, 0) self.assertFalse(zipfile.is_zipfile(fp)) + # - passing non-zipfile with ZIP header elements + # data created using pyPNG like so: + # d = [(ord('P'), ord('K'), 5, 6), (ord('P'), ord('K'), 6, 6)] + # w = png.Writer(1,2,alpha=True,compression=0) + # f = open('onepix.png', 'wb') + # w.write(f, d) + # w.close() + data = (b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00" + b"\x00\x02\x08\x06\x00\x00\x00\x99\x81\xb6'\x00\x00\x00\x15I" + b"DATx\x01\x01\n\x00\xf5\xff\x00PK\x05\x06\x00PK\x06\x06\x07" + b"\xac\x01N\xc6|a\r\x00\x00\x00\x00IEND\xaeB`\x82") + # - passing a filename + with open(TESTFN, "wb") as fp: + fp.write(data) + self.assertFalse(zipfile.is_zipfile(TESTFN)) + # - passing a file-like object + fp = io.BytesIO() + fp.write(data) + self.assertFalse(zipfile.is_zipfile(fp)) def test_damaged_zipfile(self): """Check that zipfiles with missing bytes at the end raise BadZipFile.""" diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 8f8cb863b00..d2adee373aa 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -186,8 +186,18 @@ def _strip_extra(extra, xids): def _check_zipfile(fp): try: - if _EndRecData(fp): - return True # file has correct magic number + endrec = _EndRecData(fp) + if endrec: + if endrec[_ECD_ENTRIES_TOTAL] == 0 and endrec[_ECD_SIZE] == 0 and endrec[_ECD_OFFSET] == 0: + return True # Empty zipfiles are still zipfiles + elif endrec[_ECD_DISK_NUMBER] == endrec[_ECD_DISK_START]: + fp.seek(endrec[_ECD_OFFSET]) # Central directory is on the same disk + if fp.tell() == endrec[_ECD_OFFSET] and endrec[_ECD_SIZE] >= sizeCentralDir: + data = fp.read(sizeCentralDir) # CD is where we expect it to be + if len(data) == sizeCentralDir: + centdir = struct.unpack(structCentralDir, data) # CD is the right size + if centdir[_CD_SIGNATURE] == stringCentralDir: + return True # First central directory entry has correct magic number except OSError: pass return False diff --git a/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst new file mode 100644 index 00000000000..d1aaf5d0de3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-12-30-18-21-00.bpo-28494.Dt_Wks.rst @@ -0,0 +1,13 @@ +Improve zipfile validation in `zipfile.is_zipfile`. + +Before this change `zipfile.is_zipfile()` only checked the End Central Directory +signature. If the signature could be found in the last 64k of the file, +success! This produced false positives on any file with `'PK\x05\x06'` in the +last 64k of the file - including PDFs and PNGs. + +This is now corrected by actually validating the Central Directory location +and size based on the information provided by the End Central Directory +along with verifying the Central Directory signature of the first entry. + +This should be sufficient for the vast number of zipfiles with fewer +false positives.