From 3f60f09eb23be3289ac5cc019391711dcdf800b3 Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Tue, 28 Dec 2010 16:05:07 +0000 Subject: [PATCH] Fix Issue10759 - HTMLParser.unescape() to handle malformed charrefs. --- Lib/HTMLParser.py | 17 ++++++++++------- Lib/test/test_htmlparser.py | 5 +++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 7cee47a7c5d..4fdc09aa763 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -367,13 +367,16 @@ class HTMLParser(markupbase.ParserBase): return s def replaceEntities(s): s = s.groups()[0] - if s[0] == "#": - s = s[1:] - if s[0] in ['x','X']: - c = int(s[1:], 16) - else: - c = int(s) - return unichr(c) + try: + if s[0] == "#": + s = s[1:] + if s[0] in ['x','X']: + c = int(s[1:], 16) + else: + c = int(s) + return unichr(c) + except ValueError: + return '&#'+s+';' else: # Cannot use name2codepoint directly, because HTMLParser supports apos, # which is not part of HTML 4 diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index c45cf00ecea..717585ca5b9 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -320,6 +320,11 @@ DOCTYPE html [ ("endtag", "p"), ]) + def test_unescape_function(self): + parser = HTMLParser.HTMLParser() + self.assertEqual(parser.unescape('&#bad;'),'&#bad;') + self.assertEqual(parser.unescape('&'),'&') + def test_main(): test_support.run_unittest(HTMLParserTestCase)