diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 194396bb5d4..3ab57c23071 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -33,7 +33,7 @@ endbracket = re.compile('[<>]') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') + r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') class SGMLParseError(RuntimeError): @@ -400,11 +400,11 @@ class SGMLParser(markupbase.ParserBase): def handle_charref(self, name): """Handle character reference, no need to override.""" - replacement = convert_charref(name) + replacement = self.convert_charref(name) if replacement is None: self.unknown_charref(name) else: - self.handle_data(convert_charref(name)) + self.handle_data(replacement) # Definition of entities -- derived classes may override entitydefs = \ diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index 31b54de3845..076df37efb2 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -1,4 +1,6 @@ +import htmlentitydefs import pprint +import re import sgmllib import unittest from test import test_support @@ -65,20 +67,34 @@ class CDATAEventCollector(EventCollector): class HTMLEntityCollector(EventCollector): - import re, htmlentitydefs + entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)' '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)') def convert_charref(self, name): self.append(("charref", "convert", name)) - if name.startswith('x'): - return unichr(int(name[1:],16)) - else: - return unichr(int(name)) + if name[0] != "x": + return EventCollector.convert_charref(self, name) + + def convert_codepoint(self, codepoint): + self.append(("codepoint", "convert", codepoint)) + EventCollector.convert_codepoint(self, codepoint) def convert_entityref(self, name): self.append(("entityref", "convert", name)) - return unichr(self.htmlentitydefs.name2codepoint[name]) + return EventCollector.convert_entityref(self, name) 
+ + # These record that they were called, then pass the call along + # to the default implementation so that its actions can be + # recorded. + + def handle_charref(self, data): + self.append(("charref", data)) + sgmllib.SGMLParser.handle_charref(self, data) + + def handle_entityref(self, data): + self.append(("entityref", data)) + sgmllib.SGMLParser.handle_entityref(self, data) class SGMLParserTestCase(unittest.TestCase): @@ -251,13 +267,23 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ])]) def test_convert_overrides(self): + # This checks that the character and entity reference + # conversion helpers are called at the documented times. No + # attempt is made to really change what the parser accepts. + # self.collector = HTMLEntityCollector - self.check_events('foo', [ + self.check_events(('foo' + '&foobar;*'), [ ('entityref', 'convert', 'ldquo'), ('charref', 'convert', 'x201d'), - ('starttag', 'a', [('title', u'\u201ctest\u201d')]), + ('starttag', 'a', [('title', '“test”')]), ('data', 'foo'), ('endtag', 'a'), + ('entityref', 'foobar'), + ('entityref', 'convert', 'foobar'), + ('charref', '42'), + ('charref', 'convert', '42'), + ('codepoint', 'convert', 42), ]) def test_attr_funky_names(self): @@ -265,6 +291,14 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), ]) + def test_attr_value_ip6_url(self): + # http://www.python.org/sf/853506 + self.check_events(("" + ""), [ + ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]), + ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]), + ]) + def test_illegal_declarations(self): s = 'abcdef' self.check_events(s, [