- SF bug #853506: IP6 address parsing in sgmllib

  ('[' and ']' were not accepted in unquoted attribute values)
- cleaned up tests of character and entity reference decoding so the tests cover the documented relationships among handle_charref, handle_entityref, convert_charref, convert_codepoint, and convert_entityref, without bringing up Unicode issues that sgmllib cannot be involved in
2006-06-23 06:03:45 +00:00 · 2006-06-23 06:03:45 +00:00 · 2f99da636b
parent b114984225
commit 2f99da636b
2 changed files with 45 additions and 11 deletions
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -33,7 +33,7 @@ endbracket = re.compile('[<>]')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
 class SGMLParseError(RuntimeError):
@@ -400,11 +400,11 @@ class SGMLParser(markupbase.ParserBase):
    def handle_charref(self, name):
        """Handle character reference, no need to override."""
-        replacement = convert_charref(name)
+        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
-            self.handle_data(convert_charref(name))
+            self.handle_data(replacement)
    # Definition of entities -- derived classes may override
    entitydefs = \
--- a/Lib/test/test_sgmllib.py
+++ b/Lib/test/test_sgmllib.py
@@ -1,4 +1,6 @@
 import htmlentitydefs
 import pprint
 import re
 import sgmllib
 import unittest
 from test import test_support
@@ -65,20 +67,34 @@ class CDATAEventCollector(EventCollector):
 class HTMLEntityCollector(EventCollector):
-    import re, htmlentitydefs
+
    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
        '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
    def convert_charref(self, name):
        self.append(("charref", "convert", name))
-        if name.startswith('x'):
+        if name[0] != "x":
-            return unichr(int(name[1:],16))
+            return EventCollector.convert_charref(self, name)
-        else:
+
-            return unichr(int(name))
+    def convert_codepoint(self, codepoint):
        self.append(("codepoint", "convert", codepoint))
        EventCollector.convert_codepoint(self, codepoint)
    def convert_entityref(self, name):
        self.append(("entityref", "convert", name))
-        return unichr(self.htmlentitydefs.name2codepoint[name])
+        return EventCollector.convert_entityref(self, name)
    # These to record that they were called, then pass the call along
    # to the default implementation so that its actions can be
    # recorded.
    def handle_charref(self, data):
        self.append(("charref", data))
        sgmllib.SGMLParser.handle_charref(self, data)
    def handle_entityref(self, data):
        self.append(("entityref", data))
        sgmllib.SGMLParser.handle_entityref(self, data)
 class SGMLParserTestCase(unittest.TestCase):
@@ -251,13 +267,23 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
                                ])])
    def test_convert_overrides(self):
        # This checks that the character and entity reference
        # conversion helpers are called at the documented times.  No
        # attempt is made to really change what the parser accepts.
        #
        self.collector = HTMLEntityCollector
-        self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [
+        self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
                           '&foobar;&#42;'), [
            ('entityref', 'convert', 'ldquo'),
            ('charref', 'convert', 'x201d'),
-            ('starttag', 'a', [('title', u'\u201ctest\u201d')]),
+            ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
            ('data', 'foo'),
            ('endtag', 'a'),
            ('entityref', 'foobar'),
            ('entityref', 'convert', 'foobar'),
            ('charref', '42'),
            ('charref', 'convert', '42'),
            ('codepoint', 'convert', 42),
            ])
    def test_attr_funky_names(self):
@@ -265,6 +291,14 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])
    def test_attr_value_ip6_url(self):
        # http://www.python.org/sf/853506
        self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
                           "<a href=http://[1080::8:800:200C:417A]/>"), [
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ])
    def test_illegal_declarations(self):
        s = 'abc<!spacer type="block" height="25">def'
        self.check_events(s, [