diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index a0cd6732997..a48a8294042 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -272,13 +272,14 @@ object, see :ref:`tarinfo-objects` for details. :exc:`IOError` exceptions. If ``2``, all *non-fatal* errors are raised as :exc:`TarError` exceptions as well. - The *encoding* and *errors* arguments control the way strings are converted to - unicode objects and vice versa. The default settings will work for most users. + The *encoding* and *errors* arguments define the character encoding to be + used for reading or writing the archive and how conversion errors are going + to be handled. The default settings will work for most users. See section :ref:`tar-unicode` for in-depth information. .. versionadded:: 2.6 - The *pax_headers* argument is an optional dictionary of unicode strings which + The *pax_headers* argument is an optional dictionary of strings which will be added as a pax global header if *format* is :const:`PAX_FORMAT`. .. versionadded:: 2.6 @@ -703,36 +704,30 @@ Unicode issues The tar format was originally conceived to make backups on tape drives with the main focus on preserving file system information. Nowadays tar archives are commonly used for file distribution and exchanging archives over networks. One -problem of the original format (that all other formats are merely variants of) -is that there is no concept of supporting different character encodings. For +problem of the original format (which is the basis of all other formats) is +that there is no concept of supporting different character encodings. For example, an ordinary tar archive created on a *UTF-8* system cannot be read -correctly on a *Latin-1* system if it contains non-ASCII characters. Names (i.e. -filenames, linknames, user/group names) containing these characters will appear -damaged. Unfortunately, there is no way to autodetect the encoding of an -archive. 
+correctly on a *Latin-1* system if it contains non-*ASCII* characters. Textual +metadata (like filenames, linknames, user/group names) will appear damaged. +Unfortunately, there is no way to autodetect the encoding of an archive. The +pax format was designed to solve this problem. It stores non-ASCII metadata +using the universal character encoding *UTF-8*. -The pax format was designed to solve this problem. It stores non-ASCII names -using the universal character encoding *UTF-8*. When a pax archive is read, -these *UTF-8* names are converted to the encoding of the local file system. +The details of character conversion in :mod:`tarfile` are controlled by the +*encoding* and *errors* keyword arguments of the :class:`TarFile` class. -The details of unicode conversion are controlled by the *encoding* and *errors* -keyword arguments of the :class:`TarFile` class. - -The default value for *encoding* is the local character encoding. It is deduced -from :func:`sys.getfilesystemencoding` and :func:`sys.getdefaultencoding`. In -read mode, *encoding* is used exclusively to convert unicode names from a pax -archive to strings in the local character encoding. In write mode, the use of -*encoding* depends on the chosen archive format. In case of :const:`PAX_FORMAT`, -input names that contain non-ASCII characters need to be decoded before being -stored as *UTF-8* strings. The other formats do not make use of *encoding* -unless unicode objects are used as input names. These are converted to 8-bit -character strings before they are added to the archive. +*encoding* defines the character encoding to use for the metadata in the +archive. The default value is the result of :func:`sys.getfilesystemencoding`, +or ``'ascii'`` as a fallback. Depending on whether the archive is read or +written, the metadata must be either decoded or encoded. If *encoding* is not +set appropriately, this conversion may fail. 
The *errors* argument defines how characters are treated that cannot be -converted to or from *encoding*. Possible values are listed in section -:ref:`codec-base-classes`. In read mode, there is an additional scheme -``'utf-8'`` which means that bad characters are replaced by their *UTF-8* -representation. This is the default scheme. In write mode the default value for -*errors* is ``'strict'`` to ensure that name information is not altered -unnoticed. +converted. Possible values are listed in section :ref:`codec-base-classes`. In +read mode the default scheme is ``'replace'``. This avoids unexpected +:exc:`UnicodeError` exceptions and guarantees that an archive can always be +read. In write mode the default value for *errors* is ``'strict'``. This +ensures that name information is not altered unnoticed. +In case of writing :const:`PAX_FORMAT` archives, *encoding* is ignored because +non-ASCII metadata is stored using *UTF-8*. diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 2f05618bed4..bf67eab3cfd 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -167,7 +167,7 @@ TOEXEC = 0o001 # execute/search by other #--------------------------------------------------------- ENCODING = sys.getfilesystemencoding() if ENCODING is None: - ENCODING = sys.getdefaultencoding() + ENCODING = "ascii" #--------------------------------------------------------- # Some useful functions @@ -982,7 +982,7 @@ class TarInfo(object): elif format == GNU_FORMAT: return self.create_gnu_header(info, encoding, errors) elif format == PAX_FORMAT: - return self.create_pax_header(info, encoding, errors) + return self.create_pax_header(info) else: raise ValueError("invalid format") @@ -1013,7 +1013,7 @@ class TarInfo(object): return buf + self._create_header(info, GNU_FORMAT, encoding, errors) - def create_pax_header(self, info, encoding, errors): + def create_pax_header(self, info): """Return the object as a ustar header block. 
If it cannot be represented this way, prepend a pax extended header sequence with supplement information. @@ -1056,17 +1056,17 @@ class TarInfo(object): # Create a pax extended header if necessary. if pax_headers: - buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding, errors) + buf = self._create_pax_generic_header(pax_headers, XHDTYPE) else: buf = b"" - return buf + self._create_header(info, USTAR_FORMAT, encoding, errors) + return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace") @classmethod - def create_pax_global_header(cls, pax_headers, encoding, errors): + def create_pax_global_header(cls, pax_headers): """Return the object as a pax global header block sequence. """ - return cls._create_pax_generic_header(pax_headers, XGLTYPE, encoding, errors) + return cls._create_pax_generic_header(pax_headers, XGLTYPE) def _posix_split_name(self, name): """Split a name longer than 100 chars into a prefix @@ -1139,7 +1139,7 @@ class TarInfo(object): cls._create_payload(name) @classmethod - def _create_pax_generic_header(cls, pax_headers, type, encoding, errors): + def _create_pax_generic_header(cls, pax_headers, type): """Return a POSIX.1-2001 extended or global header sequence that contains a list of keyword, value pairs. The values must be strings. @@ -1166,7 +1166,7 @@ class TarInfo(object): info["magic"] = POSIX_MAGIC # Create pax header + record blocks. 
- return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \ + return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \ cls._create_payload(records) @classmethod @@ -1566,8 +1566,7 @@ class TarFile(object): self._loaded = True if self.pax_headers: - buf = self.tarinfo.create_pax_global_header( - self.pax_headers.copy(), self.encoding, self.errors) + buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy()) self.fileobj.write(buf) self.offset += len(buf) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 05851317d58..913ab60c891 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -780,8 +780,8 @@ class PaxWriteTest(GNUWriteTest): tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT, encoding="iso8859-1") t = tarfile.TarInfo() - t.name = "\xe4\xf6\xfc" # non-ASCII - t.uid = 8**8 # too large + t.name = "\xe4\xf6\xfc" # non-ASCII + t.uid = 8**8 # too large t.pax_headers = pax_headers tar.addfile(t) tar.close() @@ -794,7 +794,6 @@ class PaxWriteTest(GNUWriteTest): class UstarUnicodeTest(unittest.TestCase): - # All *UnicodeTests FIXME format = tarfile.USTAR_FORMAT @@ -814,11 +813,14 @@ class UstarUnicodeTest(unittest.TestCase): tar.close() tar = tarfile.open(tmpname, encoding=encoding) - self.assert_(type(tar.getnames()[0]) is not bytes) self.assertEqual(tar.getmembers()[0].name, name) tar.close() def test_unicode_filename_error(self): + if self.format == tarfile.PAX_FORMAT: + # PAX_FORMAT ignores encoding in write mode. 
+ return + tar = tarfile.open(tmpname, "w", format=self.format, encoding="ascii", errors="strict") tarinfo = tarfile.TarInfo() @@ -839,21 +841,24 @@ class UstarUnicodeTest(unittest.TestCase): tar.close() def test_uname_unicode(self): - for name in ("\xe4\xf6\xfc", "\xe4\xf6\xfc"): - t = tarfile.TarInfo("foo") - t.uname = name - t.gname = name + t = tarfile.TarInfo("foo") + t.uname = "\xe4\xf6\xfc" + t.gname = "\xe4\xf6\xfc" - fobj = io.BytesIO() - tar = tarfile.open("foo.tar", mode="w", fileobj=fobj, format=self.format, encoding="iso8859-1") - tar.addfile(t) - tar.close() - fobj.seek(0) + tar = tarfile.open(tmpname, mode="w", format=self.format, encoding="iso8859-1") + tar.addfile(t) + tar.close() - tar = tarfile.open("foo.tar", fileobj=fobj, encoding="iso8859-1") + tar = tarfile.open(tmpname, encoding="iso8859-1") + t = tar.getmember("foo") + self.assertEqual(t.uname, "\xe4\xf6\xfc") + self.assertEqual(t.gname, "\xe4\xf6\xfc") + + if self.format != tarfile.PAX_FORMAT: + tar = tarfile.open(tmpname, encoding="ascii") t = tar.getmember("foo") - self.assertEqual(t.uname, "\xe4\xf6\xfc") - self.assertEqual(t.gname, "\xe4\xf6\xfc") + self.assertEqual(t.uname, "\ufffd\ufffd\ufffd") + self.assertEqual(t.gname, "\ufffd\ufffd\ufffd") class GNUUnicodeTest(UstarUnicodeTest): @@ -861,6 +866,11 @@ class GNUUnicodeTest(UstarUnicodeTest): format = tarfile.GNU_FORMAT +class PAXUnicodeTest(UstarUnicodeTest): + + format = tarfile.PAX_FORMAT + + class AppendTest(unittest.TestCase): # Test append mode (cp. patch #1652681). @@ -1047,6 +1057,7 @@ def test_main(): PaxWriteTest, UstarUnicodeTest, GNUUnicodeTest, + PAXUnicodeTest, AppendTest, LimitsTest, MiscTest,