#7311: fix HTMLParser to accept non-ASCII attribute values.

This commit is contained in:
Ezio Melotti 2011-04-05 20:40:52 +03:00
parent 104c3f1020
commit 9f1ffb2ae9
3 changed files with 20 additions and 1 deletions

View File

@ -26,7 +26,7 @@ commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name

View File

@ -208,6 +208,23 @@ DOCTYPE html [
("starttag", "a", [("href", "mailto:xyz@example.com")]),
])
def test_attr_nonascii(self):
    # Issue 7311: attribute values that contain non-ASCII characters
    # are exercised in three spellings -- unquoted, single-quoted and
    # double-quoted.  Each case pairs the markup fed to the parser with
    # the event list handed to _run_check alongside it.
    unquoted = (
        u"<img src=/foo/bar.png alt=\u4e2d\u6587>",
        [("starttag", "img", [("src", "/foo/bar.png"),
                              ("alt", u"\u4e2d\u6587")])],
    )
    single_quoted = (
        u"<a title='\u30c6\u30b9\u30c8' "
        u"href='\u30c6\u30b9\u30c8.html'>",
        [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
                            ("href", u"\u30c6\u30b9\u30c8.html")])],
    )
    double_quoted = (
        u'<a title="\u30c6\u30b9\u30c8" '
        u'href="\u30c6\u30b9\u30c8.html">',
        [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
                            ("href", u"\u30c6\u30b9\u30c8.html")])],
    )
    # Cases are checked one after another, in the order listed here.
    for markup, events in (unquoted, single_quoted, double_quoted):
        self._run_check(markup, events)
def test_attr_entity_replacement(self):
self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
("starttag", "a", [("b", "&><\"'")]),

View File

@ -47,6 +47,8 @@ Core and Builtins
Library
-------
- Issue #7311: Fix HTMLParser to accept non-ASCII attribute values.
- Issue #10963: Ensure that subprocess.communicate() never raises EPIPE.
- Issue #11662: Make urllib and urllib2 ignore redirections if the