From cc3fa204d357be5fafc10eb8c2a80fe0bca998f1 Mon Sep 17 00:00:00 2001 From: Pierre Quentel Date: Mon, 8 May 2017 14:08:34 +0200 Subject: [PATCH] bpo-29979: Rewrite cgi.parse_multipart to make it consistent with FieldStorage (#991) --- Doc/library/cgi.rst | 19 ++++---- Doc/whatsnew/3.7.rst | 8 ++++ Lib/cgi.py | 105 ++++++------------------------------------- Lib/test/test_cgi.py | 4 +- Misc/NEWS | 4 ++ 5 files changed, 38 insertions(+), 102 deletions(-) diff --git a/Doc/library/cgi.rst b/Doc/library/cgi.rst index 41219eeaaba..b60e1cc41e4 100644 --- a/Doc/library/cgi.rst +++ b/Doc/library/cgi.rst @@ -294,19 +294,20 @@ algorithms implemented in this module in other circumstances. This function is deprecated in this module. Use :func:`urllib.parse.parse_qsl` instead. It is maintained here only for backward compatibility. -.. function:: parse_multipart(fp, pdict) +.. function:: parse_multipart(fp, pdict, encoding="utf-8") Parse input of type :mimetype:`multipart/form-data` (for file uploads). - Arguments are *fp* for the input file and *pdict* for a dictionary containing - other parameters in the :mailheader:`Content-Type` header. + Arguments are *fp* for the input file, *pdict* for a dictionary containing + other parameters in the :mailheader:`Content-Type` header, and *encoding*, + the request encoding. - Returns a dictionary just like :func:`urllib.parse.parse_qs` keys are the field names, each - value is a list of values for that field. This is easy to use but not much good - if you are expecting megabytes to be uploaded --- in that case, use the - :class:`FieldStorage` class instead which is much more flexible. + Returns a dictionary just like :func:`urllib.parse.parse_qs`: keys are the + field names, each value is a list of values for that field. For non-file + fields, the value is a list of strings. - Note that this does not parse nested multipart parts --- use - :class:`FieldStorage` for that. + This is easy to use but not much good if you are expecting megabytes to be + uploaded --- in that case, use the :class:`FieldStorage` class instead + which is much more flexible. .. function:: parse_header(string) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 7edf4fc3cf4..b2dc995dced 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -95,6 +95,14 @@ New Modules Improved Modules ================ +cgi +--- + +:func:`~cgi.parse_multipart` returns the same results as +:class:`~FieldStorage` : for non-file fields, the value associated to a key +is a list of strings, not bytes. +(Contributed by Pierre Quentel in :issue:`29979`.) + binascii -------- diff --git a/Lib/cgi.py b/Lib/cgi.py index 14d15a692ba..f5e85aa263b 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -198,105 +198,28 @@ def parse_qsl(qs, keep_blank_values=0, strict_parsing=0): DeprecationWarning, 2) return urllib.parse.parse_qsl(qs, keep_blank_values, strict_parsing) -def parse_multipart(fp, pdict): +def parse_multipart(fp, pdict, encoding="utf-8"): """Parse multipart input. Arguments: fp : input file pdict: dictionary containing other parameters of content-type header + encoding: request encoding Returns a dictionary just like parse_qs(): keys are the field names, each - value is a list of values for that field. This is easy to use but not - much good if you are expecting megabytes to be uploaded -- in that case, - use the FieldStorage class instead which is much more flexible. Note - that content-type is the raw, unparsed contents of the content-type - header. - - XXX This does not parse nested multipart parts -- use FieldStorage for - that. - - XXX This should really be subsumed by FieldStorage altogether -- no - point in having two implementations of the same parsing algorithm. - Also, FieldStorage protects itself better against certain DoS attacks - by limiting the size of the data read in one chunk. The API here - does not support that kind of protection. This also affects parse() - since it can call parse_multipart(). - + value is a list of values for that field. For non-file fields, the value + is a list of strings. """ - import http.client - - boundary = b"" - if 'boundary' in pdict: - boundary = pdict['boundary'] - if not valid_boundary(boundary): - raise ValueError('Invalid boundary in multipart form: %r' - % (boundary,)) - - nextpart = b"--" + boundary - lastpart = b"--" + boundary + b"--" - partdict = {} - terminator = b"" - - while terminator != lastpart: - bytes = -1 - data = None - if terminator: - # At start of next part. Read headers first. - headers = http.client.parse_headers(fp) - clength = headers.get('content-length') - if clength: - try: - bytes = int(clength) - except ValueError: - pass - if bytes > 0: - if maxlen and bytes > maxlen: - raise ValueError('Maximum content length exceeded') - data = fp.read(bytes) - else: - data = b"" - # Read lines until end of part. - lines = [] - while 1: - line = fp.readline() - if not line: - terminator = lastpart # End outer loop - break - if line.startswith(b"--"): - terminator = line.rstrip() - if terminator in (nextpart, lastpart): - break - lines.append(line) - # Done with part. - if data is None: - continue - if bytes < 0: - if lines: - # Strip final line terminator - line = lines[-1] - if line[-2:] == b"\r\n": - line = line[:-2] - elif line[-1:] == b"\n": - line = line[:-1] - lines[-1] = line - data = b"".join(lines) - line = headers['content-disposition'] - if not line: - continue - key, params = parse_header(line) - if key != 'form-data': - continue - if 'name' in params: - name = params['name'] - else: - continue - if name in partdict: - partdict[name].append(data) - else: - partdict[name] = [data] - - return partdict - + # RFC 2026, Section 5.1 : The "multipart" boundary delimiters are always + # represented as 7bit US-ASCII. + boundary = pdict['boundary'].decode('ascii') + ctype = "multipart/form-data; boundary={}".format(boundary) + headers = Message() + headers.set_type(ctype) + headers['Content-Length'] = pdict['CONTENT-LENGTH'] + fs = FieldStorage(fp, headers=headers, encoding=encoding, + environ={'REQUEST_METHOD': 'POST'}) + return {k: fs.getlist(k) for k in fs} def _parseparam(s): while s[:1] == ';': diff --git a/Lib/test/test_cgi.py b/Lib/test/test_cgi.py index 637322137d6..903d0731f97 100644 --- a/Lib/test/test_cgi.py +++ b/Lib/test/test_cgi.py @@ -126,8 +126,8 @@ class CgiTests(unittest.TestCase): env = {'boundary': BOUNDARY.encode('latin1'), 'CONTENT-LENGTH': '558'} result = cgi.parse_multipart(fp, env) - expected = {'submit': [b' Add '], 'id': [b'1234'], - 'file': [b'Testing 123.\n'], 'title': [b'']} + expected = {'submit': [' Add '], 'id': ['1234'], + 'file': [b'Testing 123.\n'], 'title': ['']} self.assertEqual(result, expected) def test_fieldstorage_properties(self): diff --git a/Misc/NEWS b/Misc/NEWS index a72fceff10d..5e5ce59e29d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -317,6 +317,10 @@ Extension Modules Library ------- +- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and + making its results consistent with those of FieldStorage for + multipart/form-data requests. Patch by Pierre Quentel. + - bpo-30243: Removed the __init__ methods of _json's scanner and encoder. Misusing them could cause memory leaks or crashes. Now scanner and encoder objects are completely initialized in the __new__ methods.