Issue #8390: tarfile uses surrogateescape as the default error handler

(instead of replace in read mode or strict in write mode)
This commit is contained in:
Victor Stinner 2010-05-05 21:43:57 +00:00
parent aac786e586
commit de629d46f2
4 changed files with 21 additions and 19 deletions

View File

@ -218,7 +218,7 @@ be finalized; only the internally used file object will be closed. See the
.. versionadded:: 3.2 .. versionadded:: 3.2
Added support for the context manager protocol. Added support for the context manager protocol.
.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors=None, pax_headers=None, debug=0, errorlevel=0) .. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=0)
All following arguments are optional and can be accessed as instance attributes All following arguments are optional and can be accessed as instance attributes
as well. as well.
@ -267,6 +267,9 @@ be finalized; only the internally used file object will be closed. See the
to be handled. The default settings will work for most users. to be handled. The default settings will work for most users.
See section :ref:`tar-unicode` for in-depth information. See section :ref:`tar-unicode` for in-depth information.
.. versionchanged:: 3.2
Use ``'surrogateescape'`` as the default for the *errors* argument.
The *pax_headers* argument is an optional dictionary of strings which The *pax_headers* argument is an optional dictionary of strings which
will be added as a pax global header if *format* is :const:`PAX_FORMAT`. will be added as a pax global header if *format* is :const:`PAX_FORMAT`.
@ -449,11 +452,14 @@ It does *not* contain the file's data itself.
a :class:`TarInfo` object. a :class:`TarInfo` object.
.. method:: TarInfo.tobuf(format=DEFAULT_FORMAT, encoding=ENCODING, errors='strict') .. method:: TarInfo.tobuf(format=DEFAULT_FORMAT, encoding=ENCODING, errors='surrogateescape')
Create a string buffer from a :class:`TarInfo` object. For information on the Create a string buffer from a :class:`TarInfo` object. For information on the
arguments see the constructor of the :class:`TarFile` class. arguments see the constructor of the :class:`TarFile` class.
.. versionchanged:: 3.2
Use ``'surrogateescape'`` as the default for the *errors* argument.
A ``TarInfo`` object has the following public data attributes: A ``TarInfo`` object has the following public data attributes:
@ -701,11 +707,10 @@ metadata must be either decoded or encoded. If *encoding* is not set
appropriately, this conversion may fail. appropriately, this conversion may fail.
The *errors* argument defines how characters are treated that cannot be The *errors* argument defines how characters are treated that cannot be
converted. Possible values are listed in section :ref:`codec-base-classes`. In converted. Possible values are listed in section :ref:`codec-base-classes`.
read mode the default scheme is ``'replace'``. This avoids unexpected The default scheme is ``'surrogateescape'`` which Python also uses for its
:exc:`UnicodeError` exceptions and guarantees that an archive can always be file system calls, see :ref:`os-filenames`.
read. In write mode the default value for *errors* is ``'strict'``. This
ensures that name information is not altered unnoticed.
In case of writing :const:`PAX_FORMAT` archives, *encoding* is ignored because In case of writing :const:`PAX_FORMAT` archives, *encoding* is ignored because
non-ASCII metadata is stored using *UTF-8*. non-ASCII metadata is stored using *UTF-8*. Storing surrogate characters is not
possible and will raise a :exc:`UnicodeEncodeError`.

View File

@ -978,7 +978,7 @@ class TarInfo(object):
return info return info
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"): def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
"""Return a tar header as a string of 512 byte blocks. """Return a tar header as a string of 512 byte blocks.
""" """
info = self.get_info() info = self.get_info()
@ -1490,7 +1490,7 @@ class TarFile(object):
def __init__(self, name=None, mode="r", fileobj=None, format=None, def __init__(self, name=None, mode="r", fileobj=None, format=None,
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
errors=None, pax_headers=None, debug=None, errorlevel=None): errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
"""Open an (uncompressed) tar archive `name'. `mode' is either 'r' to """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
read from an existing archive, 'a' to append data to an existing read from an existing archive, 'a' to append data to an existing
file or 'w' to create a new file overwriting an existing one. `mode' file or 'w' to create a new file overwriting an existing one. `mode'
@ -1531,13 +1531,7 @@ class TarFile(object):
self.ignore_zeros = ignore_zeros self.ignore_zeros = ignore_zeros
if encoding is not None: if encoding is not None:
self.encoding = encoding self.encoding = encoding
self.errors = errors
if errors is not None:
self.errors = errors
elif mode == "r":
self.errors = "replace"
else:
self.errors = "strict"
if pax_headers is not None and self.format == PAX_FORMAT: if pax_headers is not None and self.format == PAX_FORMAT:
self.pax_headers = pax_headers self.pax_headers = pax_headers

View File

@ -1118,8 +1118,8 @@ class UstarUnicodeTest(unittest.TestCase):
if self.format != tarfile.PAX_FORMAT: if self.format != tarfile.PAX_FORMAT:
tar = tarfile.open(tmpname, encoding="ascii") tar = tarfile.open(tmpname, encoding="ascii")
t = tar.getmember("foo") t = tar.getmember("foo")
self.assertEqual(t.uname, "\ufffd\ufffd\ufffd") self.assertEqual(t.uname, "\udce4\udcf6\udcfc")
self.assertEqual(t.gname, "\ufffd\ufffd\ufffd") self.assertEqual(t.gname, "\udce4\udcf6\udcfc")
class GNUUnicodeTest(UstarUnicodeTest): class GNUUnicodeTest(UstarUnicodeTest):

View File

@ -348,6 +348,9 @@ C-API
Library Library
------- -------
- Issue #8390: tarfile uses surrogateescape as the default error handler
(instead of replace in read mode or strict in write mode)
- Issue #7755: Use an unencumbered audio file for tests. - Issue #7755: Use an unencumbered audio file for tests.
- Issue #8621: uuid.uuid4() returned the same sequence of values in the - Issue #8621: uuid.uuid4() returned the same sequence of values in the