Merge heads.

This commit is contained in:
Ezio Melotti 2012-06-24 22:04:02 +02:00
commit becb70c329
4 changed files with 22 additions and 21 deletions

View File

@ -11,10 +11,6 @@
This module defines four dictionaries, :data:`html5`,
:data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`.
:data:`entitydefs` is used to provide the :attr:`entitydefs`
attribute of the :class:`html.parser.HTMLParser` class. The definition provided
here contains all the entities defined by XHTML 1.0 that can be handled using
simple textual substitution in the Latin-1 character set (ISO-8859-1).
.. data:: html5

View File

@ -500,7 +500,6 @@ class HTMLParser(_markupbase.ParserBase):
self.error("unknown declaration: %r" % (data,)) self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting # Internal -- helper to remove special character quoting
entitydefs = None
def unescape(self, s): def unescape(self, s):
if '&' not in s: if '&' not in s:
return s return s
@ -510,24 +509,23 @@ class HTMLParser(_markupbase.ParserBase):
if s[0] == "#": if s[0] == "#":
s = s[1:] s = s[1:]
if s[0] in ['x','X']: if s[0] in ['x','X']:
c = int(s[1:], 16) c = int(s[1:].rstrip(';'), 16)
else: else:
c = int(s) c = int(s.rstrip(';'))
return chr(c) return chr(c)
except ValueError: except ValueError:
return '&#'+ s +';' return '&#' + s
else: else:
# Cannot use name2codepoint directly, because HTMLParser from html.entities import html5
# supports apos, which is not part of HTML 4 if s in html5:
import html.entities return html5[s]
if HTMLParser.entitydefs is None: elif s.endswith(';'):
entitydefs = HTMLParser.entitydefs = {'apos':"'"} return '&' + s
for k, v in html.entities.name2codepoint.items(): for x in range(2, len(s)):
entitydefs[k] = chr(v) if s[:x] in html5:
try: return html5[s[:x]] + s[x:]
return self.entitydefs[s] else:
except KeyError: return '&' + s
return '&'+s+';'
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
replaceEntities, s, flags=re.ASCII) replaceEntities, s, flags=re.ASCII)

View File

@ -456,7 +456,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", ' self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
'method="post">', [ 'method="post">', [
('starttag', 'form', ('starttag', 'form',
[('action', '/xxx.php?a=1&b=2&amp'), [('action', '/xxx.php?a=1&b=2&'),
(',', None), ('method', 'post')])]) (',', None), ('method', 'post')])])
def test_weird_chars_in_unquoted_attribute_values(self): def test_weird_chars_in_unquoted_attribute_values(self):
@ -541,6 +541,11 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self.assertEqual(p.unescape('&#0038;'),'&') self.assertEqual(p.unescape('&#0038;'),'&')
# see #12888 # see #12888
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
# see #15156
self.assertEqual(p.unescape('&Eacuteric&Eacute;ric'
'&alphacentauri&alpha;centauri'),
'ÉricÉric&alphacentauriαcentauri')
self.assertEqual(p.unescape('&co;'), '&co;')
def test_broken_comments(self): def test_broken_comments(self):
html = ('<! not really a comment >' html = ('<! not really a comment >'

View File

@ -76,6 +76,8 @@ Library
It is used automatically on platforms supporting the necessary os.openat()
and os.unlinkat() functions. Main code by Martin von Löwis.
- Issue #15156: HTMLParser now uses the new "html.entities.html5" dictionary.
- Issue #11113: add a new "html5" dictionary containing the named character
references defined by the HTML5 standard and the equivalent Unicode
character(s) to the html.entities module.