Issue #21448: Fixed FeedParser feed() to avoid O(N**2) behavior when parsing long line.
Original patch by Raymond Hettinger.
This commit is contained in:
parent
6f20170762
commit
320a1c0ff7
|
@ -50,8 +50,8 @@ class BufferedSubFile(object):
|
|||
simple abstraction -- it parses until EOF closes the current message.
|
||||
"""
|
||||
def __init__(self):
|
||||
# The last partial line pushed into this object.
|
||||
self._partial = ''
|
||||
# Chunks of the last partial line pushed into this object.
|
||||
self._partial = []
|
||||
# The list of full, pushed lines, in reverse order
|
||||
self._lines = []
|
||||
# The stack of false-EOF checking predicates.
|
||||
|
@ -67,8 +67,8 @@ class BufferedSubFile(object):
|
|||
|
||||
def close(self):
|
||||
# Don't forget any trailing partial line.
|
||||
self._lines.append(self._partial)
|
||||
self._partial = ''
|
||||
self.pushlines(''.join(self._partial).splitlines(True))
|
||||
self._partial = []
|
||||
self._closed = True
|
||||
|
||||
def readline(self):
|
||||
|
@ -96,16 +96,26 @@ class BufferedSubFile(object):
|
|||
|
||||
def push(self, data):
|
||||
"""Push some new data into this object."""
|
||||
# Handle any previous leftovers
|
||||
data, self._partial = self._partial + data, ''
|
||||
# Crack into lines, but preserve the linesep characters on the end of each
|
||||
parts = data.splitlines(True)
|
||||
|
||||
if not parts or not parts[0].endswith(('\n', '\r')):
|
||||
# No new complete lines, so just accumulate partials
|
||||
self._partial += parts
|
||||
return
|
||||
|
||||
if self._partial:
|
||||
# If there are previous leftovers, complete them now
|
||||
self._partial.append(parts[0])
|
||||
parts[0:1] = ''.join(self._partial).splitlines(True)
|
||||
del self._partial[:]
|
||||
|
||||
# If the last element of the list does not end in a newline, then treat
|
||||
# it as a partial line. We only check for '\n' here because a line
|
||||
# ending with '\r' might be a line that was split in the middle of a
|
||||
# '\r\n' sequence (see bugs 1555570 and 1721862).
|
||||
if parts and not parts[-1].endswith('\n'):
|
||||
self._partial = parts.pop()
|
||||
if not parts[-1].endswith('\n'):
|
||||
self._partial = [parts.pop()]
|
||||
self.pushlines(parts)
|
||||
|
||||
def pushlines(self, lines):
|
||||
|
|
|
@ -10,6 +10,7 @@ import textwrap
|
|||
|
||||
from io import StringIO, BytesIO
|
||||
from itertools import chain
|
||||
from random import choice
|
||||
|
||||
import email
|
||||
import email.policy
|
||||
|
@ -3353,16 +3354,70 @@ Do you like this message?
|
|||
bsf.push(il)
|
||||
nt += n
|
||||
n1 = 0
|
||||
while True:
|
||||
ol = bsf.readline()
|
||||
if ol == NeedMoreData:
|
||||
break
|
||||
for ol in iter(bsf.readline, NeedMoreData):
|
||||
om.append(ol)
|
||||
n1 += 1
|
||||
self.assertEqual(n, n1)
|
||||
self.assertEqual(len(om), nt)
|
||||
self.assertEqual(''.join([il for il, n in imt]), ''.join(om))
|
||||
|
||||
def test_push_random(self):
|
||||
from email.feedparser import BufferedSubFile, NeedMoreData
|
||||
|
||||
n = 10000
|
||||
chunksize = 5
|
||||
chars = 'abcd \t\r\n'
|
||||
|
||||
s = ''.join(choice(chars) for i in range(n)) + '\n'
|
||||
target = s.splitlines(True)
|
||||
|
||||
bsf = BufferedSubFile()
|
||||
lines = []
|
||||
for i in range(0, len(s), chunksize):
|
||||
chunk = s[i:i+chunksize]
|
||||
bsf.push(chunk)
|
||||
lines.extend(iter(bsf.readline, NeedMoreData))
|
||||
self.assertEqual(lines, target)
|
||||
|
||||
|
||||
class TestFeedParsers(TestEmailBase):
|
||||
|
||||
def parse(self, chunks):
|
||||
from email.feedparser import FeedParser
|
||||
feedparser = FeedParser()
|
||||
for chunk in chunks:
|
||||
feedparser.feed(chunk)
|
||||
return feedparser.close()
|
||||
|
||||
def test_newlines(self):
|
||||
m = self.parse(['a:\nb:\rc:\r\nd:\n'])
|
||||
self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
|
||||
m = self.parse(['a:\nb:\rc:\r\nd:'])
|
||||
self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
|
||||
m = self.parse(['a:\rb', 'c:\n'])
|
||||
self.assertEqual(m.keys(), ['a', 'bc'])
|
||||
m = self.parse(['a:\r', 'b:\n'])
|
||||
self.assertEqual(m.keys(), ['a', 'b'])
|
||||
m = self.parse(['a:\r', '\nb:\n'])
|
||||
self.assertEqual(m.keys(), ['a', 'b'])
|
||||
m = self.parse(['a:\x85b:\u2028c:\n'])
|
||||
self.assertEqual(m.items(), [('a', '\x85'), ('b', '\u2028'), ('c', '')])
|
||||
m = self.parse(['a:\r', 'b:\x85', 'c:\n'])
|
||||
self.assertEqual(m.items(), [('a', ''), ('b', '\x85'), ('c', '')])
|
||||
|
||||
def test_long_lines(self):
|
||||
M, N = 1000, 100000
|
||||
m = self.parse(['a:b\n\n'] + ['x'*M] * N)
|
||||
self.assertEqual(m.items(), [('a', 'b')])
|
||||
self.assertEqual(m.get_payload(), 'x'*M*N)
|
||||
m = self.parse(['a:b\r\r'] + ['x'*M] * N)
|
||||
self.assertEqual(m.items(), [('a', 'b')])
|
||||
self.assertEqual(m.get_payload(), 'x'*M*N)
|
||||
m = self.parse(['a:b\r\r'] + ['x'*M+'\x85'] * N)
|
||||
self.assertEqual(m.items(), [('a', 'b')])
|
||||
self.assertEqual(m.get_payload(), ('x'*M+'\x85')*N)
|
||||
m = self.parse(['a:\r', 'b: '] + ['x'*M] * N)
|
||||
self.assertEqual(m.items(), [('a', ''), ('b', 'x'*M*N)])
|
||||
|
||||
|
||||
class TestParsers(TestEmailBase):
|
||||
|
|
|
@ -27,6 +27,9 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #21448: Changed FeedParser feed() to avoid O(N**2) behavior when
|
||||
parsing long line. Original patch by Raymond Hettinger.
|
||||
|
||||
- Issue #17923: glob() patterns ending with a slash no longer match non-dirs on
|
||||
AIX. Based on patch by Delhallt.
|
||||
|
||||
|
|
Loading…
Reference in New Issue