bpo-29979: Rewrite cgi.parse_multipart to make it consistent with FieldStorage (#991)

Pierre Quentel 2017-05-08 14:08:34 +02:00 committed by Senthil Kumaran
parent f34c685020
commit cc3fa204d3
5 changed files with 38 additions and 102 deletions

Doc/library/cgi.rst

@@ -294,19 +294,20 @@ algorithms implemented in this module in other circumstances.
    This function is deprecated in this module. Use :func:`urllib.parse.parse_qsl`
    instead. It is maintained here only for backward compatibility.

-.. function:: parse_multipart(fp, pdict)
+.. function:: parse_multipart(fp, pdict, encoding="utf-8")

    Parse input of type :mimetype:`multipart/form-data` (for file uploads).
-   Arguments are *fp* for the input file and *pdict* for a dictionary containing
-   other parameters in the :mailheader:`Content-Type` header.
+   Arguments are *fp* for the input file, *pdict* for a dictionary containing
+   other parameters in the :mailheader:`Content-Type` header, and *encoding*,
+   the request encoding.

-   Returns a dictionary just like :func:`urllib.parse.parse_qs` keys are the field names, each
-   value is a list of values for that field. This is easy to use but not much good
-   if you are expecting megabytes to be uploaded --- in that case, use the
-   :class:`FieldStorage` class instead which is much more flexible.
+   Returns a dictionary just like :func:`urllib.parse.parse_qs`: keys are the
+   field names, each value is a list of values for that field. For non-file
+   fields, the value is a list of strings.

-   Note that this does not parse nested multipart parts --- use
-   :class:`FieldStorage` for that.
+   This is easy to use but not much good if you are expecting megabytes to be
+   uploaded --- in that case, use the :class:`FieldStorage` class instead
+   which is much more flexible.


 .. function:: parse_header(string)
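
As a quick illustration of the new signature, here is a minimal sketch against the module as patched here (later releases changed the signature again). The boundary token, field name, and body are invented for the example, and the CONTENT-LENGTH key mirrors what the updated test further down supplies:

    import cgi
    from io import BytesIO

    # A tiny multipart/form-data body, made up for illustration.
    body = (b'--BOUNDARY\r\n'
            b'Content-Disposition: form-data; name="title"\r\n'
            b'\r\n'
            b'Hello\r\n'
            b'--BOUNDARY--\r\n')
    pdict = {'boundary': b'BOUNDARY',           # bytes, as found in the Content-Type header
             'CONTENT-LENGTH': str(len(body))}  # required by this version
    print(cgi.parse_multipart(BytesIO(body), pdict, encoding="utf-8"))
    # {'title': ['Hello']} -- a decoded str; before this change it was [b'Hello']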

Doc/whatsnew/3.7.rst

@@ -95,6 +95,14 @@ New Modules
 Improved Modules
 ================

+cgi
+---
+
+:func:`~cgi.parse_multipart` returns the same results as
+:class:`~cgi.FieldStorage`: for non-file fields, the value associated with a
+key is a list of strings, not bytes.
+(Contributed by Pierre Quentel in :issue:`29979`.)
+
 binascii
 --------

Lib/cgi.py

@@ -198,105 +198,28 @@ def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
          DeprecationWarning, 2)
     return urllib.parse.parse_qsl(qs, keep_blank_values, strict_parsing)

-def parse_multipart(fp, pdict):
+def parse_multipart(fp, pdict, encoding="utf-8"):
     """Parse multipart input.

     Arguments:
     fp   : input file
     pdict: dictionary containing other parameters of content-type header
+    encoding: request encoding

     Returns a dictionary just like parse_qs(): keys are the field names, each
-    value is a list of values for that field. This is easy to use but not
-    much good if you are expecting megabytes to be uploaded -- in that case,
-    use the FieldStorage class instead which is much more flexible. Note
-    that content-type is the raw, unparsed contents of the content-type
-    header.
-
-    XXX This does not parse nested multipart parts -- use FieldStorage for
-    that.
-
-    XXX This should really be subsumed by FieldStorage altogether -- no
-    point in having two implementations of the same parsing algorithm.
-    Also, FieldStorage protects itself better against certain DoS attacks
-    by limiting the size of the data read in one chunk. The API here
-    does not support that kind of protection. This also affects parse()
-    since it can call parse_multipart().
-
+    value is a list of values for that field. For non-file fields, the value
+    is a list of strings.
     """
-    import http.client
-
-    boundary = b""
-    if 'boundary' in pdict:
-        boundary = pdict['boundary']
-    if not valid_boundary(boundary):
-        raise ValueError('Invalid boundary in multipart form: %r'
-                            % (boundary,))
-
-    nextpart = b"--" + boundary
-    lastpart = b"--" + boundary + b"--"
-    partdict = {}
-    terminator = b""
-
-    while terminator != lastpart:
-        bytes = -1
-        data = None
-        if terminator:
-            # At start of next part. Read headers first.
-            headers = http.client.parse_headers(fp)
-            clength = headers.get('content-length')
-            if clength:
-                try:
-                    bytes = int(clength)
-                except ValueError:
-                    pass
-            if bytes > 0:
-                if maxlen and bytes > maxlen:
-                    raise ValueError('Maximum content length exceeded')
-                data = fp.read(bytes)
-            else:
-                data = b""
-        # Read lines until end of part.
-        lines = []
-        while 1:
-            line = fp.readline()
-            if not line:
-                terminator = lastpart # End outer loop
-                break
-            if line.startswith(b"--"):
-                terminator = line.rstrip()
-                if terminator in (nextpart, lastpart):
-                    break
-            lines.append(line)
-        # Done with part.
-        if data is None:
-            continue
-        if bytes < 0:
-            if lines:
-                # Strip final line terminator
-                line = lines[-1]
-                if line[-2:] == b"\r\n":
-                    line = line[:-2]
-                elif line[-1:] == b"\n":
-                    line = line[:-1]
-                lines[-1] = line
-            data = b"".join(lines)
-        line = headers['content-disposition']
-        if not line:
-            continue
-        key, params = parse_header(line)
-        if key != 'form-data':
-            continue
-        if 'name' in params:
-            name = params['name']
-        else:
-            continue
-        if name in partdict:
-            partdict[name].append(data)
-        else:
-            partdict[name] = [data]
-
-    return partdict
-
+    # RFC 2046, Section 5.1.1: the "multipart" boundary delimiters are always
+    # represented as 7bit US-ASCII.
+    boundary = pdict['boundary'].decode('ascii')
+    ctype = "multipart/form-data; boundary={}".format(boundary)
+    headers = Message()
+    headers.set_type(ctype)
+    headers['Content-Length'] = pdict['CONTENT-LENGTH']
+    fs = FieldStorage(fp, headers=headers, encoding=encoding,
+        environ={'REQUEST_METHOD': 'POST'})
+    return {k: fs.getlist(k) for k in fs}

 def _parseparam(s):
     while s[:1] == ';':
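
The interesting move in the replacement body is the synthetic header block: FieldStorage normally pulls its metadata out of a CGI environment, but it also accepts a ready-made email.message.Message, so the patch fabricates one carrying just the reassembled Content-Type and the Content-Length, plus a one-key environ. A standalone sketch of that technique, with an invented body (set_type() tolerates the parameterised "type; boundary=..." form, which is what lets the boundary ride along):

    import cgi
    from email.message import Message
    from io import BytesIO

    body = (b'--BOUNDARY\r\n'
            b'Content-Disposition: form-data; name="id"\r\n'
            b'\r\n'
            b'1234\r\n'
            b'--BOUNDARY--\r\n')

    headers = Message()
    headers.set_type('multipart/form-data; boundary=BOUNDARY')
    headers['Content-Length'] = str(len(body))

    # REQUEST_METHOD is the only environment key FieldStorage still needs.
    fs = cgi.FieldStorage(BytesIO(body), headers=headers, encoding='utf-8',
                          environ={'REQUEST_METHOD': 'POST'})
    print({k: fs.getlist(k) for k in fs})   # {'id': ['1234']}

getlist() returns the .value of every part with a matching name, which is what makes the two code paths agree: plain fields come back decoded with *encoding*, while parts carrying a filename stay bytes.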

Lib/test/test_cgi.py

@@ -126,8 +126,8 @@ class CgiTests(unittest.TestCase):
         env = {'boundary': BOUNDARY.encode('latin1'),
                'CONTENT-LENGTH': '558'}
         result = cgi.parse_multipart(fp, env)
-        expected = {'submit': [b' Add '], 'id': [b'1234'],
-                    'file': [b'Testing 123.\n'], 'title': [b'']}
+        expected = {'submit': [' Add '], 'id': ['1234'],
+                    'file': [b'Testing 123.\n'], 'title': ['']}
         self.assertEqual(result, expected)

     def test_fieldstorage_properties(self):
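
The mixed expectation is the point of the test: only non-file fields switch to str, and the uploaded file stays bytes. A reduced sketch of that split, using a hand-rolled two-part body rather than the 558-byte fixture the test really reads:

    import cgi
    from io import BytesIO

    body = (b'--BOUNDARY\r\n'
            b'Content-Disposition: form-data; name="title"\r\n'
            b'\r\n'
            b'spam\r\n'
            b'--BOUNDARY\r\n'
            b'Content-Disposition: form-data; name="file"; filename="t.txt"\r\n'
            b'Content-Type: text/plain\r\n'
            b'\r\n'
            b'Testing 123.\n'
            b'\r\n'
            b'--BOUNDARY--\r\n')
    pdict = {'boundary': b'BOUNDARY', 'CONTENT-LENGTH': str(len(body))}
    result = cgi.parse_multipart(BytesIO(body), pdict)
    # The plain field is decoded; the file part keeps its raw bytes.
    assert result == {'title': ['spam'], 'file': [b'Testing 123.\n']}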

Misc/NEWS

@@ -317,6 +317,10 @@ Extension Modules
 Library
 -------

+- bpo-29979: Rewrite cgi.parse_multipart, reusing the FieldStorage class and
+  making its results consistent with those of FieldStorage for
+  multipart/form-data requests. Patch by Pierre Quentel.
+
 - bpo-30243: Removed the __init__ methods of _json's scanner and encoder.
   Misusing them could cause memory leaks or crashes. Now scanner and encoder
   objects are completely initialized in the __new__ methods.