Merge heads.
This commit is contained in:
commit
becb70c329
|
@ -11,10 +11,6 @@
|
||||||
|
|
||||||
This module defines four dictionaries, :data:`html5`,
|
This module defines four dictionaries, :data:`html5`,
|
||||||
:data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`.
|
:data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`.
|
||||||
:data:`entitydefs` is used to provide the :attr:`entitydefs`
|
|
||||||
attribute of the :class:`html.parser.HTMLParser` class. The definition provided
|
|
||||||
here contains all the entities defined by XHTML 1.0 that can be handled using
|
|
||||||
simple textual substitution in the Latin-1 character set (ISO-8859-1).
|
|
||||||
|
|
||||||
|
|
||||||
.. data:: html5
|
.. data:: html5
|
||||||
|
|
|
@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
self.error("unknown declaration: %r" % (data,))
|
self.error("unknown declaration: %r" % (data,))
|
||||||
|
|
||||||
# Internal -- helper to remove special character quoting
|
# Internal -- helper to remove special character quoting
|
||||||
entitydefs = None
|
|
||||||
def unescape(self, s):
|
def unescape(self, s):
|
||||||
if '&' not in s:
|
if '&' not in s:
|
||||||
return s
|
return s
|
||||||
|
@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
if s[0] == "#":
|
if s[0] == "#":
|
||||||
s = s[1:]
|
s = s[1:]
|
||||||
if s[0] in ['x','X']:
|
if s[0] in ['x','X']:
|
||||||
c = int(s[1:], 16)
|
c = int(s[1:].rstrip(';'), 16)
|
||||||
else:
|
else:
|
||||||
c = int(s)
|
c = int(s.rstrip(';'))
|
||||||
return chr(c)
|
return chr(c)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return '&#'+ s +';'
|
return '&#' + s
|
||||||
else:
|
else:
|
||||||
# Cannot use name2codepoint directly, because HTMLParser
|
from html.entities import html5
|
||||||
# supports apos, which is not part of HTML 4
|
if s in html5:
|
||||||
import html.entities
|
return html5[s]
|
||||||
if HTMLParser.entitydefs is None:
|
elif s.endswith(';'):
|
||||||
entitydefs = HTMLParser.entitydefs = {'apos':"'"}
|
return '&' + s
|
||||||
for k, v in html.entities.name2codepoint.items():
|
for x in range(2, len(s)):
|
||||||
entitydefs[k] = chr(v)
|
if s[:x] in html5:
|
||||||
try:
|
return html5[s[:x]] + s[x:]
|
||||||
return self.entitydefs[s]
|
else:
|
||||||
except KeyError:
|
return '&' + s
|
||||||
return '&'+s+';'
|
|
||||||
|
|
||||||
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
|
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
|
||||||
replaceEntities, s, flags=re.ASCII)
|
replaceEntities, s, flags=re.ASCII)
|
||||||
|
|
|
@ -456,7 +456,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
|
||||||
self._run_check('<form action="/xxx.php?a=1&b=2&", '
|
self._run_check('<form action="/xxx.php?a=1&b=2&", '
|
||||||
'method="post">', [
|
'method="post">', [
|
||||||
('starttag', 'form',
|
('starttag', 'form',
|
||||||
[('action', '/xxx.php?a=1&b=2&'),
|
[('action', '/xxx.php?a=1&b=2&'),
|
||||||
(',', None), ('method', 'post')])])
|
(',', None), ('method', 'post')])])
|
||||||
|
|
||||||
def test_weird_chars_in_unquoted_attribute_values(self):
|
def test_weird_chars_in_unquoted_attribute_values(self):
|
||||||
|
@ -541,6 +541,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
|
||||||
self.assertEqual(p.unescape('&'),'&')
|
self.assertEqual(p.unescape('&'),'&')
|
||||||
# see #12888
|
# see #12888
|
||||||
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
|
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
|
||||||
|
# see #15156
|
||||||
|
self.assertEqual(p.unescape('&Eacuteric&Eacute;ric'
|
||||||
|
'&alphacentauri&alpha;centauri'),
|
||||||
|
'ÉricÉric&alphacentauriαcentauri')
|
||||||
|
self.assertEqual(p.unescape('&co;'), '&co;')
|
||||||
|
|
||||||
def test_broken_comments(self):
|
def test_broken_comments(self):
|
||||||
html = ('<! not really a comment >'
|
html = ('<! not really a comment >'
|
||||||
|
|
|
@ -76,6 +76,8 @@ Library
|
||||||
It is used automatically on platforms supporting the necessary os.openat()
|
It is used automatically on platforms supporting the necessary os.openat()
|
||||||
and os.unlinkat() functions. Main code by Martin von Löwis.
|
and os.unlinkat() functions. Main code by Martin von Löwis.
|
||||||
|
|
||||||
|
- Issue #15156: HTMLParser now uses the new "html.entities.html5" dictionary.
|
||||||
|
|
||||||
- Issue #11113: add a new "html5" dictionary containing the named character
|
- Issue #11113: add a new "html5" dictionary containing the named character
|
||||||
references defined by the HTML5 standard and the equivalent Unicode
|
references defined by the HTML5 standard and the equivalent Unicode
|
||||||
character(s) to the html.entities module.
|
character(s) to the html.entities module.
|
||||||
|
|
Loading…
Reference in New Issue