- SF bug #853506: IPv6 address parsing in sgmllib

('[' and ']' were not accepted in unquoted attribute values)

- cleaned up tests of character and entity reference decoding so the
  tests cover the documented relationships among handle_charref,
  handle_entityref, convert_charref, convert_codepoint, and
  convert_entityref, without bringing up Unicode issues that sgmllib
  cannot be involved in
This commit is contained in:
Fred Drake 2006-06-23 06:03:45 +00:00
parent b114984225
commit 2f99da636b
2 changed files with 45 additions and 11 deletions

View File

@ -33,7 +33,7 @@ endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile( attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError): class SGMLParseError(RuntimeError):
@ -400,11 +400,11 @@ class SGMLParser(markupbase.ParserBase):
def handle_charref(self, name): def handle_charref(self, name):
"""Handle character reference, no need to override.""" """Handle character reference, no need to override."""
replacement = convert_charref(name) replacement = self.convert_charref(name)
if replacement is None: if replacement is None:
self.unknown_charref(name) self.unknown_charref(name)
else: else:
self.handle_data(convert_charref(name)) self.handle_data(replacement)
# Definition of entities -- derived classes may override # Definition of entities -- derived classes may override
entitydefs = \ entitydefs = \

View File

@ -1,4 +1,6 @@
import htmlentitydefs
import pprint import pprint
import re
import sgmllib import sgmllib
import unittest import unittest
from test import test_support from test import test_support
@ -65,20 +67,34 @@ class CDATAEventCollector(EventCollector):
class HTMLEntityCollector(EventCollector): class HTMLEntityCollector(EventCollector):
import re, htmlentitydefs
entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)' entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
'|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)') '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
def convert_charref(self, name): def convert_charref(self, name):
self.append(("charref", "convert", name)) self.append(("charref", "convert", name))
if name.startswith('x'): if name[0] != "x":
return unichr(int(name[1:],16)) return EventCollector.convert_charref(self, name)
else:
return unichr(int(name)) def convert_codepoint(self, codepoint):
self.append(("codepoint", "convert", codepoint))
EventCollector.convert_codepoint(self, codepoint)
def convert_entityref(self, name): def convert_entityref(self, name):
self.append(("entityref", "convert", name)) self.append(("entityref", "convert", name))
return unichr(self.htmlentitydefs.name2codepoint[name]) return EventCollector.convert_entityref(self, name)
# These record that they were called, then pass the call along
# to the default implementation so that its actions can be
# recorded.
def handle_charref(self, data):
self.append(("charref", data))
sgmllib.SGMLParser.handle_charref(self, data)
def handle_entityref(self, data):
self.append(("entityref", data))
sgmllib.SGMLParser.handle_entityref(self, data)
class SGMLParserTestCase(unittest.TestCase): class SGMLParserTestCase(unittest.TestCase):
@ -251,13 +267,23 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
])]) ])])
def test_convert_overrides(self): def test_convert_overrides(self):
# This checks that the character and entity reference
# conversion helpers are called at the documented times. No
# attempt is made to really change what the parser accepts.
#
self.collector = HTMLEntityCollector self.collector = HTMLEntityCollector
self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [ self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
'&foobar;&#42;'), [
('entityref', 'convert', 'ldquo'), ('entityref', 'convert', 'ldquo'),
('charref', 'convert', 'x201d'), ('charref', 'convert', 'x201d'),
('starttag', 'a', [('title', u'\u201ctest\u201d')]), ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
('data', 'foo'), ('data', 'foo'),
('endtag', 'a'), ('endtag', 'a'),
('entityref', 'foobar'),
('entityref', 'convert', 'foobar'),
('charref', '42'),
('charref', 'convert', '42'),
('codepoint', 'convert', 42),
]) ])
def test_attr_funky_names(self): def test_attr_funky_names(self):
@ -265,6 +291,14 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
]) ])
def test_attr_value_ip6_url(self):
# http://www.python.org/sf/853506
self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
"<a href=http://[1080::8:800:200C:417A]/>"), [
("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
])
def test_illegal_declarations(self): def test_illegal_declarations(self):
s = 'abc<!spacer type="block" height="25">def' s = 'abc<!spacer type="block" height="25">def'
self.check_events(s, [ self.check_events(s, [