From f27b9a741ac7771aa1f6c1219d86a61222fdc20a Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Sat, 1 Feb 2014 21:21:01 +0200 Subject: [PATCH] #20288: fix handling of invalid numeric charrefs in HTMLParser. --- Lib/html/parser.py | 6 +++--- Lib/test/test_htmlparser.py | 6 ++++++ Misc/NEWS | 2 ++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 2d3bef351b0..63fe77425bd 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -228,9 +228,9 @@ class HTMLParser(_markupbase.ParserBase): i = self.updatepos(i, k) continue else: - if ";" in rawdata[i:]: #bail by consuming &# - self.handle_data(rawdata[0:2]) - i = self.updatepos(i, 2) + if ";" in rawdata[i:]: # bail by consuming &# + self.handle_data(rawdata[i:i+2]) + i = self.updatepos(i, i+2) break elif startswith('&', i): match = entityref.match(rawdata, i) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index c977a9dd4d7..11d9c9ce8b8 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -151,6 +151,12 @@ text ("data", "&#bad;"), ("endtag", "p"), ]) + # add the [] as a workaround to avoid buffering (see #20288) + self._run_check(["
<div>&#bad;</div>
"], [ + ("starttag", "div", []), + ("data", "&#bad;"), + ("endtag", "div"), + ]) def test_unclosed_entityref(self): self._run_check("&entityref foo", [ diff --git a/Misc/NEWS b/Misc/NEWS index 5a84af86cd7..d2efc232a2d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -45,6 +45,8 @@ Core and Builtins Library ------- +- Issue #20288: fix handling of invalid numeric charrefs in HTMLParser. + - Issue #20424: Python implementation of io.StringIO now supports lone surrogates. - Issue #19456: ntpath.join() now joins relative paths correctly when a drive