diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 194396bb5d4..3ab57c23071 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -33,7 +33,7 @@ endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
@@ -400,11 +400,11 @@ class SGMLParser(markupbase.ParserBase):
def handle_charref(self, name):
"""Handle character reference, no need to override."""
- replacement = convert_charref(name)
+ replacement = self.convert_charref(name)
if replacement is None:
self.unknown_charref(name)
else:
- self.handle_data(convert_charref(name))
+ self.handle_data(replacement)
# Definition of entities -- derived classes may override
entitydefs = \
diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py
index 31b54de3845..076df37efb2 100644
--- a/Lib/test/test_sgmllib.py
+++ b/Lib/test/test_sgmllib.py
@@ -1,4 +1,6 @@
+import htmlentitydefs
import pprint
+import re
import sgmllib
import unittest
from test import test_support
@@ -65,20 +67,34 @@ class CDATAEventCollector(EventCollector):
class HTMLEntityCollector(EventCollector):
- import re, htmlentitydefs
+
entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
'|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
def convert_charref(self, name):
self.append(("charref", "convert", name))
- if name.startswith('x'):
- return unichr(int(name[1:],16))
- else:
- return unichr(int(name))
+ if name[0] != "x":
+ return EventCollector.convert_charref(self, name)
+
+ def convert_codepoint(self, codepoint):
+ self.append(("codepoint", "convert", codepoint))
+ EventCollector.convert_codepoint(self, codepoint)
def convert_entityref(self, name):
self.append(("entityref", "convert", name))
- return unichr(self.htmlentitydefs.name2codepoint[name])
+ return EventCollector.convert_entityref(self, name)
+
+ # These record that they were called, then pass the call along
+ # to the default implementation so that its actions can be
+ # recorded.
+
+ def handle_charref(self, data):
+ self.append(("charref", data))
+ sgmllib.SGMLParser.handle_charref(self, data)
+
+ def handle_entityref(self, data):
+ self.append(("entityref", data))
+ sgmllib.SGMLParser.handle_entityref(self, data)
class SGMLParserTestCase(unittest.TestCase):
@@ -251,13 +267,23 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
])])
def test_convert_overrides(self):
+ # This checks that the character and entity reference
+ # conversion helpers are called at the documented times. No
+ # attempt is made to really change what the parser accepts.
+ #
self.collector = HTMLEntityCollector
- self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [
+ self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
+ '&foobar;&#42;'), [
('entityref', 'convert', 'ldquo'),
('charref', 'convert', 'x201d'),
- ('starttag', 'a', [('title', u'\u201ctest\u201d')]),
+ ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
('data', 'foo'),
('endtag', 'a'),
+ ('entityref', 'foobar'),
+ ('entityref', 'convert', 'foobar'),
+ ('charref', '42'),
+ ('charref', 'convert', '42'),
+ ('codepoint', 'convert', 42),
])
def test_attr_funky_names(self):
@@ -265,6 +291,14 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
])
+ def test_attr_value_ip6_url(self):
+ # http://www.python.org/sf/853506 -- '[' and ']' in quoted and unquoted values
+ self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
+ "<a href=http://[1080::8:800:200C:417A]/>"), [
+ ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
+ ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
+ ])
+
def test_illegal_declarations(self):
s = 'abcdef'
self.check_events(s, [