From a1a0eb4a394a5ac7a8422616ce1ee4125a3ef74f Mon Sep 17 00:00:00 2001 From: Sebastian Pedersen <31063917+SebastianGPedersen@users.noreply.github.com> Date: Tue, 14 Apr 2020 01:07:56 +0200 Subject: [PATCH] bpo-39380: Change ftplib encoding from latin-1 to utf-8 (GH-18048) Add the encoding in ftplib.FTP and ftplib.FTP_TLS to the constructor as keyword-only and change the default from "latin-1" to "utf-8" to follow RFC 2640. --- Doc/library/ftplib.rst | 33 ++++--- Doc/whatsnew/3.9.rst | 3 + Lib/ftplib.py | 22 +++-- Lib/test/test_ftplib.py | 94 ++++++++++++------- .../2020-03-22-20-00-04.bpo-39380.ZXlRQU.rst | 3 + 5 files changed, 101 insertions(+), 54 deletions(-) create mode 100644 Misc/NEWS.d/next/Tests/2020-03-22-20-00-04.bpo-39380.ZXlRQU.rst diff --git a/Doc/library/ftplib.rst b/Doc/library/ftplib.rst index a4bb695a9ab..f4d4cdf9ada 100644 --- a/Doc/library/ftplib.rst +++ b/Doc/library/ftplib.rst @@ -19,6 +19,8 @@ as mirroring other FTP servers. It is also used by the module :mod:`urllib.request` to handle URLs that use FTP. For more information on FTP (File Transfer Protocol), see Internet :rfc:`959`. +The default encoding is UTF-8, following :rfc:`2640`. + Here's a sample session using the :mod:`ftplib` module:: >>> from ftplib import FTP @@ -41,7 +43,7 @@ Here's a sample session using the :mod:`ftplib` module:: The module defines the following items: -.. class:: FTP(host='', user='', passwd='', acct='', timeout=None, source_address=None) +.. class:: FTP(host='', user='', passwd='', acct='', timeout=None, source_address=None, *, encoding='utf-8') Return a new instance of the :class:`FTP` class. When *host* is given, the method call ``connect(host)`` is made. When *user* is given, additionally @@ -50,7 +52,8 @@ The module defines the following items: parameter specifies a timeout in seconds for blocking operations like the connection attempt (if is not specified, the global default timeout setting will be used). *source_address* is a 2-tuple ``(host, port)`` for the socket - to bind to as its source address before connecting. + to bind to as its source address before connecting. The *encoding* parameter + specifies the encoding for directories and filenames. The :class:`FTP` class supports the :keyword:`with` statement, e.g.: @@ -74,9 +77,11 @@ The module defines the following items: .. versionchanged:: 3.9 If the *timeout* parameter is set to be zero, it will raise a - :class:`ValueError` to prevent the creation of a non-blocking socket + :class:`ValueError` to prevent the creation of a non-blocking socket. + The *encoding* parameter was added, and the default was changed from + Latin-1 to UTF-8 to follow :rfc:`2640`. -.. class:: FTP_TLS(host='', user='', passwd='', acct='', keyfile=None, certfile=None, context=None, timeout=None, source_address=None) +.. class:: FTP_TLS(host='', user='', passwd='', acct='', keyfile=None, certfile=None, context=None, timeout=None, source_address=None, *, encoding='utf-8') A :class:`FTP` subclass which adds TLS support to FTP as described in :rfc:`4217`. @@ -110,7 +115,9 @@ The module defines the following items: .. versionchanged:: 3.9 If the *timeout* parameter is set to be zero, it will raise a - :class:`ValueError` to prevent the creation of a non-blocking socket + :class:`ValueError` to prevent the creation of a non-blocking socket. + The *encoding* parameter was added, and the default was changed from + Latin-1 to UTF-8 to follow :rfc:`2640`. Here's a sample session using the :class:`FTP_TLS` class:: @@ -259,9 +266,10 @@ followed by ``lines`` for the text version or ``binary`` for the binary version. .. method:: FTP.retrlines(cmd, callback=None) - Retrieve a file or directory listing in ASCII transfer mode. *cmd* should be - an appropriate ``RETR`` command (see :meth:`retrbinary`) or a command such as - ``LIST`` or ``NLST`` (usually just the string ``'LIST'``). + Retrieve a file or directory listing in the encoding specified by the + *encoding* parameter at initialization. + *cmd* should be an appropriate ``RETR`` command (see :meth:`retrbinary`) or + a command such as ``LIST`` or ``NLST`` (usually just the string ``'LIST'``). ``LIST`` retrieves a list of files and information about those files. ``NLST`` retrieves a list of file names. The *callback* function is called for each line with a string argument @@ -291,7 +299,7 @@ followed by ``lines`` for the text version or ``binary`` for the binary version. .. method:: FTP.storlines(cmd, fp, callback=None) - Store a file in ASCII transfer mode. *cmd* should be an appropriate + Store a file in line mode. *cmd* should be an appropriate ``STOR`` command (see :meth:`storbinary`). Lines are read until EOF from the :term:`file object` *fp* (opened in binary mode) using its :meth:`~io.IOBase.readline` method to provide the data to be stored. *callback* is an optional single @@ -309,10 +317,9 @@ followed by ``lines`` for the text version or ``binary`` for the binary version. If optional *rest* is given, a ``REST`` command is sent to the server, passing *rest* as an argument. *rest* is usually a byte offset into the requested file, telling the server to restart sending the file's bytes at the requested offset, - skipping over the initial bytes. Note however that :rfc:`959` requires only that - *rest* be a string containing characters in the printable range from ASCII code - 33 to ASCII code 126. The :meth:`transfercmd` method, therefore, converts - *rest* to a string, but no check is performed on the string's contents. If the + skipping over the initial bytes. Note however that the :meth:`transfercmd` + method converts *rest* to a string with the *encoding* parameter specified + at initialization, but no check is performed on the string's contents. If the server does not recognize the ``REST`` command, an :exc:`error_reply` exception will be raised. If this happens, simply call :meth:`transfercmd` without a *rest* argument. diff --git a/Doc/whatsnew/3.9.rst b/Doc/whatsnew/3.9.rst index 020a86958f7..1bbcae36a1a 100644 --- a/Doc/whatsnew/3.9.rst +++ b/Doc/whatsnew/3.9.rst @@ -790,6 +790,9 @@ Changes in the Python API environment variable when the :option:`-E` or :option:`-I` command line options are being used. +* The *encoding* parameter has been added to the classes :class:`ftplib.FTP` and + :class:`ftplib.FTP_TLS` as a keyword-only parameter, and the default encoding + is changed from Latin-1 to UTF-8 to follow :rfc:`2640`. CPython bytecode changes ------------------------ diff --git a/Lib/ftplib.py b/Lib/ftplib.py index 71b3c289551..1f760ed1ce0 100644 --- a/Lib/ftplib.py +++ b/Lib/ftplib.py @@ -75,13 +75,14 @@ class FTP: '''An FTP client class. To create a connection, call the class using these arguments: - host, user, passwd, acct, timeout + host, user, passwd, acct, timeout, source_address, encoding The first four arguments are all strings, and have default value ''. - timeout must be numeric and defaults to None if not passed, - meaning that no timeout will be set on any ftp socket(s) + The parameter ´timeout´ must be numeric and defaults to None if not + passed, meaning that no timeout will be set on any ftp socket(s). If a timeout is passed, then this is now the default timeout for all ftp socket operations for this instance. + The last parameter is the encoding of filenames, which defaults to utf-8. Then use self.connect() with optional host and port argument. @@ -102,15 +103,16 @@ class FTP: file = None welcome = None passiveserver = 1 - encoding = "latin-1" def __init__(self, host='', user='', passwd='', acct='', - timeout=_GLOBAL_DEFAULT_TIMEOUT, source_address=None): + timeout=_GLOBAL_DEFAULT_TIMEOUT, source_address=None, *, + encoding='utf-8'): """Initialization method (called by class instantiation). Initialize host to localhost, port to standard ftp port. Optional arguments are host (for connect()), and user, passwd, acct (for login()). """ + self.encoding = encoding self.source_address = source_address self.timeout = timeout if host: @@ -706,9 +708,10 @@ else: ''' ssl_version = ssl.PROTOCOL_TLS_CLIENT - def __init__(self, host='', user='', passwd='', acct='', keyfile=None, - certfile=None, context=None, - timeout=_GLOBAL_DEFAULT_TIMEOUT, source_address=None): + def __init__(self, host='', user='', passwd='', acct='', + keyfile=None, certfile=None, context=None, + timeout=_GLOBAL_DEFAULT_TIMEOUT, source_address=None, *, + encoding='utf-8'): if context is not None and keyfile is not None: raise ValueError("context and keyfile arguments are mutually " "exclusive") @@ -727,7 +730,8 @@ else: keyfile=keyfile) self.context = context self._prot_p = False - super().__init__(host, user, passwd, acct, timeout, source_address) + super().__init__(host, user, passwd, acct, + timeout, source_address, encoding=encoding) def login(self, user='', passwd='', acct='', secure=True): if secure and not isinstance(self.sock, ssl.SSLSocket): diff --git a/Lib/test/test_ftplib.py b/Lib/test/test_ftplib.py index f40f3a4d9f7..cf30a3df35f 100644 --- a/Lib/test/test_ftplib.py +++ b/Lib/test/test_ftplib.py @@ -22,11 +22,12 @@ from test import support from test.support import HOST, HOSTv6 TIMEOUT = support.LOOPBACK_TIMEOUT +DEFAULT_ENCODING = 'utf-8' # the dummy data returned by server over the data channel when # RETR, LIST, NLST, MLSD commands are issued -RETR_DATA = 'abcde12345\r\n' * 1000 -LIST_DATA = 'foo\r\nbar\r\n' -NLST_DATA = 'foo\r\nbar\r\n' +RETR_DATA = 'abcde12345\r\n' * 1000 + 'non-ascii char \xAE\r\n' +LIST_DATA = 'foo\r\nbar\r\n non-ascii char \xAE\r\n' +NLST_DATA = 'foo\r\nbar\r\n non-ascii char \xAE\r\n' MLSD_DATA = ("type=cdir;perm=el;unique==keVO1+ZF4; test\r\n" "type=pdir;perm=e;unique==keVO1+d?3; ..\r\n" "type=OS.unix=slink:/foobar;perm=;unique==keVO1+4G4; foobar\r\n" @@ -41,7 +42,9 @@ MLSD_DATA = ("type=cdir;perm=el;unique==keVO1+ZF4; test\r\n" "type=dir;perm=cpmel;unique==keVO1+7G4; incoming\r\n" "type=file;perm=r;unique==keVO1+1G4; file2\r\n" "type=file;perm=r;unique==keVO1+1G4; file3\r\n" - "type=file;perm=r;unique==keVO1+1G4; file4\r\n") + "type=file;perm=r;unique==keVO1+1G4; file4\r\n" + "type=dir;perm=cpmel;unique==SGP1; dir \xAE non-ascii char\r\n" + "type=file;perm=r;unique==SGP2; file \xAE non-ascii char\r\n") class DummyDTPHandler(asynchat.async_chat): @@ -51,9 +54,11 @@ class DummyDTPHandler(asynchat.async_chat): asynchat.async_chat.__init__(self, conn) self.baseclass = baseclass self.baseclass.last_received_data = '' + self.encoding = baseclass.encoding def handle_read(self): - self.baseclass.last_received_data += self.recv(1024).decode('ascii') + new_data = self.recv(1024).decode(self.encoding, 'replace') + self.baseclass.last_received_data += new_data def handle_close(self): # XXX: this method can be called many times in a row for a single @@ -70,7 +75,7 @@ class DummyDTPHandler(asynchat.async_chat): self.baseclass.next_data = None if not what: return self.close_when_done() - super(DummyDTPHandler, self).push(what.encode('ascii')) + super(DummyDTPHandler, self).push(what.encode(self.encoding)) def handle_error(self): raise Exception @@ -80,7 +85,7 @@ class DummyFTPHandler(asynchat.async_chat): dtp_handler = DummyDTPHandler - def __init__(self, conn): + def __init__(self, conn, encoding=DEFAULT_ENCODING): asynchat.async_chat.__init__(self, conn) # tells the socket to handle urgent data inline (ABOR command) self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_OOBINLINE, 1) @@ -94,12 +99,13 @@ class DummyFTPHandler(asynchat.async_chat): self.rest = None self.next_retr_data = RETR_DATA self.push('220 welcome') + self.encoding = encoding def collect_incoming_data(self, data): self.in_buffer.append(data) def found_terminator(self): - line = b''.join(self.in_buffer).decode('ascii') + line = b''.join(self.in_buffer).decode(self.encoding) self.in_buffer = [] if self.next_response: self.push(self.next_response) @@ -121,7 +127,7 @@ class DummyFTPHandler(asynchat.async_chat): raise Exception def push(self, data): - asynchat.async_chat.push(self, data.encode('ascii') + b'\r\n') + asynchat.async_chat.push(self, data.encode(self.encoding) + b'\r\n') def cmd_port(self, arg): addr = list(map(int, arg.split(','))) @@ -251,7 +257,7 @@ class DummyFTPServer(asyncore.dispatcher, threading.Thread): handler = DummyFTPHandler - def __init__(self, address, af=socket.AF_INET): + def __init__(self, address, af=socket.AF_INET, encoding=DEFAULT_ENCODING): threading.Thread.__init__(self) asyncore.dispatcher.__init__(self) self.daemon = True @@ -262,6 +268,7 @@ class DummyFTPServer(asyncore.dispatcher, threading.Thread): self.active_lock = threading.Lock() self.host, self.port = self.socket.getsockname()[:2] self.handler_instance = None + self.encoding = encoding def start(self): assert not self.active @@ -284,7 +291,7 @@ class DummyFTPServer(asyncore.dispatcher, threading.Thread): self.join() def handle_accepted(self, conn, addr): - self.handler_instance = self.handler(conn) + self.handler_instance = self.handler(conn, encoding=self.encoding) def handle_connect(self): self.close() @@ -421,8 +428,8 @@ if ssl is not None: dtp_handler = DummyTLS_DTPHandler - def __init__(self, conn): - DummyFTPHandler.__init__(self, conn) + def __init__(self, conn, encoding=DEFAULT_ENCODING): + DummyFTPHandler.__init__(self, conn, encoding=encoding) self.secure_data_channel = False self._ccc = False @@ -462,10 +469,10 @@ if ssl is not None: class TestFTPClass(TestCase): - def setUp(self): - self.server = DummyFTPServer((HOST, 0)) + def setUp(self, encoding=DEFAULT_ENCODING): + self.server = DummyFTPServer((HOST, 0), encoding=encoding) self.server.start() - self.client = ftplib.FTP(timeout=TIMEOUT) + self.client = ftplib.FTP(timeout=TIMEOUT, encoding=encoding) self.client.connect(self.server.host, self.server.port) def tearDown(self): @@ -565,14 +572,14 @@ class TestFTPClass(TestCase): def test_retrbinary(self): def callback(data): - received.append(data.decode('ascii')) + received.append(data.decode(self.client.encoding)) received = [] self.client.retrbinary('retr', callback) self.check_data(''.join(received), RETR_DATA) def test_retrbinary_rest(self): def callback(data): - received.append(data.decode('ascii')) + received.append(data.decode(self.client.encoding)) for rest in (0, 10, 20): received = [] self.client.retrbinary('retr', callback, rest=rest) @@ -584,7 +591,7 @@ class TestFTPClass(TestCase): self.check_data(''.join(received), RETR_DATA.replace('\r\n', '')) def test_storbinary(self): - f = io.BytesIO(RETR_DATA.encode('ascii')) + f = io.BytesIO(RETR_DATA.encode(self.client.encoding)) self.client.storbinary('stor', f) self.check_data(self.server.handler_instance.last_received_data, RETR_DATA) # test new callback arg @@ -594,14 +601,16 @@ class TestFTPClass(TestCase): self.assertTrue(flag) def test_storbinary_rest(self): - f = io.BytesIO(RETR_DATA.replace('\r\n', '\n').encode('ascii')) + data = RETR_DATA.replace('\r\n', '\n').encode(self.client.encoding) + f = io.BytesIO(data) for r in (30, '30'): f.seek(0) self.client.storbinary('stor', f, rest=r) self.assertEqual(self.server.handler_instance.rest, str(r)) def test_storlines(self): - f = io.BytesIO(RETR_DATA.replace('\r\n', '\n').encode('ascii')) + data = RETR_DATA.replace('\r\n', '\n').encode(self.client.encoding) + f = io.BytesIO(data) self.client.storlines('stor', f) self.check_data(self.server.handler_instance.last_received_data, RETR_DATA) # test new callback arg @@ -790,14 +799,32 @@ class TestFTPClass(TestCase): f = io.BytesIO(b'x' * self.client.maxline * 2) self.assertRaises(ftplib.Error, self.client.storlines, 'stor', f) + def test_encoding_param(self): + encodings = ['latin-1', 'utf-8'] + for encoding in encodings: + with self.subTest(encoding=encoding): + self.tearDown() + self.setUp(encoding=encoding) + self.assertEqual(encoding, self.client.encoding) + self.test_retrbinary() + self.test_storbinary() + self.test_retrlines() + new_dir = self.client.mkd('/non-ascii dir \xAE') + self.check_data(new_dir, '/non-ascii dir \xAE') + # Check default encoding + client = ftplib.FTP(timeout=TIMEOUT) + self.assertEqual(DEFAULT_ENCODING, client.encoding) + @skipUnless(support.IPV6_ENABLED, "IPv6 not enabled") class TestIPv6Environment(TestCase): def setUp(self): - self.server = DummyFTPServer((HOSTv6, 0), af=socket.AF_INET6) + self.server = DummyFTPServer((HOSTv6, 0), + af=socket.AF_INET6, + encoding=DEFAULT_ENCODING) self.server.start() - self.client = ftplib.FTP(timeout=TIMEOUT) + self.client = ftplib.FTP(timeout=TIMEOUT, encoding=DEFAULT_ENCODING) self.client.connect(self.server.host, self.server.port) def tearDown(self): @@ -824,7 +851,7 @@ class TestIPv6Environment(TestCase): def test_transfer(self): def retr(): def callback(data): - received.append(data.decode('ascii')) + received.append(data.decode(self.client.encoding)) received = [] self.client.retrbinary('retr', callback) self.assertEqual(len(''.join(received)), len(RETR_DATA)) @@ -841,10 +868,10 @@ class TestTLS_FTPClassMixin(TestFTPClass): and data connections first. """ - def setUp(self): - self.server = DummyTLS_FTPServer((HOST, 0)) + def setUp(self, encoding=DEFAULT_ENCODING): + self.server = DummyTLS_FTPServer((HOST, 0), encoding=encoding) self.server.start() - self.client = ftplib.FTP_TLS(timeout=TIMEOUT) + self.client = ftplib.FTP_TLS(timeout=TIMEOUT, encoding=encoding) self.client.connect(self.server.host, self.server.port) # enable TLS self.client.auth() @@ -855,8 +882,8 @@ class TestTLS_FTPClassMixin(TestFTPClass): class TestTLS_FTPClass(TestCase): """Specific TLS_FTP class tests.""" - def setUp(self): - self.server = DummyTLS_FTPServer((HOST, 0)) + def setUp(self, encoding=DEFAULT_ENCODING): + self.server = DummyTLS_FTPServer((HOST, 0), encoding=encoding) self.server.start() self.client = ftplib.FTP_TLS(timeout=TIMEOUT) self.client.connect(self.server.host, self.server.port) @@ -877,7 +904,8 @@ class TestTLS_FTPClass(TestCase): # clear text with self.client.transfercmd('list') as sock: self.assertNotIsInstance(sock, ssl.SSLSocket) - self.assertEqual(sock.recv(1024), LIST_DATA.encode('ascii')) + self.assertEqual(sock.recv(1024), + LIST_DATA.encode(self.client.encoding)) self.assertEqual(self.client.voidresp(), "226 transfer complete") # secured, after PROT P @@ -886,14 +914,16 @@ class TestTLS_FTPClass(TestCase): self.assertIsInstance(sock, ssl.SSLSocket) # consume from SSL socket to finalize handshake and avoid # "SSLError [SSL] shutdown while in init" - self.assertEqual(sock.recv(1024), LIST_DATA.encode('ascii')) + self.assertEqual(sock.recv(1024), + LIST_DATA.encode(self.client.encoding)) self.assertEqual(self.client.voidresp(), "226 transfer complete") # PROT C is issued, the connection must be in cleartext again self.client.prot_c() with self.client.transfercmd('list') as sock: self.assertNotIsInstance(sock, ssl.SSLSocket) - self.assertEqual(sock.recv(1024), LIST_DATA.encode('ascii')) + self.assertEqual(sock.recv(1024), + LIST_DATA.encode(self.client.encoding)) self.assertEqual(self.client.voidresp(), "226 transfer complete") def test_login(self): diff --git a/Misc/NEWS.d/next/Tests/2020-03-22-20-00-04.bpo-39380.ZXlRQU.rst b/Misc/NEWS.d/next/Tests/2020-03-22-20-00-04.bpo-39380.ZXlRQU.rst new file mode 100644 index 00000000000..1ac9ead0eb3 --- /dev/null +++ b/Misc/NEWS.d/next/Tests/2020-03-22-20-00-04.bpo-39380.ZXlRQU.rst @@ -0,0 +1,3 @@ +Add the encoding in :class:`ftplib.FTP` and :class:`ftplib.FTP_TLS` to the +constructor as keyword-only and change the default from ``latin-1`` to ``utf-8`` +to follow :rfc:`2640`.