#2927: Added the unescape() function to the html module.

2013-11-19 20:28:45 +02:00 · 2013-11-19 20:28:45 +02:00 · 4a9ee26750
parent 5160da1afc
commit 4a9ee26750
7 changed files with 215 additions and 49 deletions
--- a/Doc/library/html.entities.rst
+++ b/Doc/library/html.entities.rst
@ -20,6 +20,7 @@ This module defines four dictionaries, :data:`html5`,
   Note that the trailing semicolon is included in the name (e.g. ``'gt;'``),
   however some of the names are accepted by the standard even without the
   semicolon: in this case the name is present with and without the ``';'``.
   See also :func:`html.unescape`.
   .. versionadded:: 3.3
--- a/Doc/library/html.rst
+++ b/Doc/library/html.rst
@ -20,6 +20,17 @@ This module defines utilities to manipulate HTML.
   .. versionadded:: 3.2
 .. function:: unescape(s)
   Convert all named and numeric character references (e.g. ``&gt;``,
   ``&#62;``, ``&#x3e;``) in the string *s* to the corresponding Unicode
   characters.  This function uses the rules defined by the HTML 5 standard
   for both valid and invalid character references, and the :data:`list of
   HTML 5 named character references <html.entities.html5>`.
   .. versionadded:: 3.4
 --------------
 Submodules in the ``html`` package are:
--- a/Lib/html/__init__.py
+++ b/Lib/html/__init__.py
@ -2,7 +2,12 @@
 General functions for HTML manipulation.
 """
-# NB: this is a candidate for a bytes/string polymorphic interface
+import re as _re
 from html.entities import html5 as _html5
 __all__ = ['escape', 'unescape']
 def escape(s, quote=True):
    """
@ -18,3 +23,110 @@ def escape(s, quote=True):
        s = s.replace('"', "&quot;")
        s = s.replace('\'', "&#x27;")
    return s
 # see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
 # Maps the numeric value of a character reference to the string that
 # _replace_charref() returns for it.  This table is consulted before any
 # other check on the number, so for the values that are also listed in
 # _invalid_codepoints (0x80 to 0x9f) the entry here is the one that wins.
 # NOTE(review): the 0x80-0x9f replacements look like the Windows-1252
 # reading of those byte values -- confirm against the spec linked above.
 _invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
 }
 # Numeric character references that _replace_charref() drops (returns '')
 # when no earlier rule has already produced a replacement.  The set is
 # assembled from its contiguous runs instead of being listed value by value.
 _invalid_codepoints = set().union(
    range(0x1, 0x9),        # 0x0001 to 0x0008
    range(0xe, 0x20),       # 0x000E to 0x001F
    range(0x7f, 0xa0),      # 0x007F to 0x009F
    range(0xfdd0, 0xfdf0),  # 0xFDD0 to 0xFDEF
    # others: 0x000B ...
    (0xb,),
    # ... and the last two code points of each of the 17 planes
    # (0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ..., 0x10FFFE, 0x10FFFF)
    (plane << 16 | tail
     for plane in range(0x11)
     for tail in (0xfffe, 0xffff)),
 )
 def _replace_charref(s):
    """Return the replacement text for one character reference match.

    *s* is a match object whose first group holds the reference without
    the leading ``&`` (this function is the callback that unescape()
    passes to ``_charref.sub()``).

    Numeric references are looked up in ``_invalid_charrefs`` first, then
    replaced by U+FFFD if they are surrogates or lie beyond U+10FFFF,
    dropped (``''``) if listed in ``_invalid_codepoints``, and otherwise
    converted with ``chr()``.  Named references are resolved through
    ``_html5``, falling back to the longest prefix that is a known name
    with the remaining characters kept as text; an unknown name is
    returned unchanged, ``&`` included.
    """
    ref = s.group(1)
    if ref[0] == '#':
        # numeric charref
        if ref[1] in 'xX':
            digits, base, max_digits = ref[2:].rstrip(';'), 16, 6
        else:
            digits, base, max_digits = ref[1:].rstrip(';'), 10, 7
        # A number with more significant digits than 0x10FFFF (1114111)
        # has is necessarily out of range.  Answering from the digit count
        # alone keeps an absurdly long reference from raising ValueError
        # on interpreters that cap decimal str -> int conversion, and
        # avoids building a huge integer only to discard it.
        if len(digits.lstrip('0')) > max_digits:
            return '\uFFFD'
        num = int(digits, base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)
    # named charref
    if ref in _html5:
        return _html5[ref]
    # find the longest matching name (as defined by the standard)
    for x in range(len(ref)-1, 1, -1):
        if ref[:x] in _html5:
            return _html5[ref[:x]] + ref[x:]
    return '&' + ref
 # Matches one candidate character reference.  Group 1 is everything after
 # the '&' and is one of:
 #   - '#' followed by decimal digits,
 #   - '#x' or '#X' followed by hexadecimal digits,
 #   - 1 to 32 characters other than tab, newline, form feed, space,
 #     '<', '&', '#' and ';' (a possible entity name),
 # each with an optional trailing ';'.
 _charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')
 def unescape(s):
    """
    Replace every named and numeric character reference in the string s
    (e.g. &gt;, &#62;, &#x3e;) with the Unicode character(s) it stands for.

    Valid and invalid references alike are handled according to the
    rules of the HTML 5 standard; names are taken from the list of
    HTML 5 named character references defined in html.entities.html5.
    """
    # A string without '&' cannot hold a reference: hand it back untouched.
    has_ampersand = '&' in s
    return _charref.sub(_replace_charref, s) if has_ampersand else s
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -8,9 +8,12 @@
 # and CDATA (character data -- only end tags are special).
 import _markupbase
 import re
 import warnings
 import _markupbase
 from html import unescape
 __all__ = ['HTMLParser']
@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
-                attrvalue = self.unescape(attrvalue)
+                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()
@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase):
    def unknown_decl(self, data):
        """Call self.error() for an unknown declaration in strict mode."""
        if not self.strict:
            return
        self.error("unknown declaration: %r" % (data,))
    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with character references replaced by their text.

        Numeric references (``&#38;``, ``&#x26;``) are converted with
        ``chr()``; a number that cannot be converted is left in the output
        as written.  Named references are looked up in
        ``html.entities.html5``; a name without a trailing ``;`` may also
        match on a known prefix, the rest of the name being kept as text.
        Unknown names are returned unchanged.
        """
        if '&' not in s:
            return s
        def replaceEntities(s):
            s = s.groups()[0]
            if s[0] == "#":
                s = s[1:]
                try:
                    if s[0] in ['x','X']:
                        c = int(s[1:].rstrip(';'), 16)
                    else:
                        c = int(s.rstrip(';'))
                    return chr(c)
                # On CPython chr() raises OverflowError rather than
                # ValueError once the number no longer fits in a C int.
                # Either way it is not a code point, so the reference is
                # kept as written instead of letting the error escape.
                except (ValueError, OverflowError):
                    return '&#' + s
            from html.entities import html5
            if s in html5:
                return html5[s]
            elif s.endswith(';'):
                return '&' + s
            for x in range(2, len(s)):
                if s[:x] in html5:
                    return html5[s[:x]] + s[x:]
            return '&' + s
        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replaceEntities, s, flags=re.ASCII)
--- a/Lib/test/test_html.py
+++ b/Lib/test/test_html.py
@ -16,9 +16,89 @@ class HtmlTests(unittest.TestCase):
            html.escape('\'<script>"&foo;"</script>\'', False),
            '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')
    def test_unescape(self):
        # Exercises html.unescape() on text without references, on
        # malformed and incomplete references, on numeric references in
        # several decimal and hexadecimal spellings, on invalid code
        # points, and on named references.
        numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
        errmsg = 'unescape(%r) should have returned %r'
        # assert that unescape(text) gives expected; the failure message
        # names both values
        def check(text, expected):
            self.assertEqual(html.unescape(text), expected,
                             msg=errmsg % (text, expected))
        # same check with num spelled in every entry of numeric_formats:
        # decimal and hexadecimal, each with and without the trailing ';'
        def check_num(num, expected):
            for format in numeric_formats:
                text = format % num
                self.assertEqual(html.unescape(text), expected,
                                 msg=errmsg % (text, expected))
        # check text with no character references
        check('no character references', 'no character references')
        # check & followed by invalid chars
        check('&\n&\t& &&', '&\n&\t& &&')
        # check & followed by numbers and letters
        check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
        # check incomplete entities at the end of the string
        for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
            check(x, x)
            check(x+';', x+';')
        # check several combinations of numeric character references,
        # possibly followed by different characters
        formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
                   '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
                   '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
        for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
                             ['A', 'a', '"', '&', '\u2603', '\U00101234']):
            for s in formats:
                check(s % num, char)
                # the character following the reference must be kept as is
                for end in [' ', 'X']:
                    check((s+end) % num, char+end)
        # check invalid codepoints
        # (surrogates and a value past 0x10FFFF become U+FFFD)
        for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
            check_num(cp, '\uFFFD')
        # check more invalid codepoints
        # (these are expected to disappear from the output)
        for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
            check_num(cp, '')
        # check invalid numbers
        # (each one is expected to yield the character paired with it)
        for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
            check_num(num, ch)
        # check small numbers
        check_num(0, '\uFFFD')
        check_num(9, '\t')
        # check a big number
        check_num(1000000000000000000, '\uFFFD')
        # check that multiple trailing semicolons are handled correctly
        for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
            check(e, '";')
        # check that semicolons in the middle don't create problems
        for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
            check(e, '"quot;')
        # check triple adjacent charrefs
        for e in ['&quot', '&#34', '&#x22', '&#X22']:
            check(e*3, '"""')
            check((e+';')*3, '"""')
        # check that the case is respected
        for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
            check(e, '&')
        for e in ['&Amp', '&Amp;']:
            check(e, e)
        # check that non-existent named entities are returned unchanged
        check('&svadilfari;', '&svadilfari;')
        # the following examples are in the html5 specs
        # (the prefix 'not' is replaced and the rest is kept as text,
        # unless the whole name including ';' is itself a known name)
        check('&notit', '¬it')
        check('&notit;', '¬it;')
        check('&notin', '¬in')
        check('&notin;', '∉')
        # a similar example with a long name
        check('&notReallyAnExistingNamedCharacterReference;',
              '¬ReallyAnExistingNamedCharacterReference;')
        # longest valid name
        check('&CounterClockwiseContourIntegral;', '∳')
        # check a charref that maps to two unicode chars
        check('&acE;', '\u223E\u0333')
        # without the trailing ';' the same name is expected to stay as is
        check('&acE', '&acE')
        # see #12888
        check('&#123; ' * 1050, '{ ' * 1050)
        # see #15156
        check('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',
              'ÉricÉric&alphacentauriαcentauri')
        check('&co;', '&co;')
 # Runs the HtmlTests test case through run_unittest().
 def test_main():
    run_unittest(HtmlTests)
 if __name__ == '__main__':
-    test_main()
+    unittest.main()
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -569,18 +569,6 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
        for html, expected in data:
            self._run_check(html, expected)
    def test_unescape_function(self):
        # The parser's unescape() method on a malformed numeric reference,
        # a zero-padded one, a long run of references, and named
        # references that are only partly known or not known at all.
        p = self.get_collector()
        cases = (
            ('&#bad;', '&#bad;'),
            ('&#0038;', '&'),
            # see #12888
            ('&#123; ' * 1050, '{ ' * 1050),
            # see #15156
            ('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',
             'ÉricÉric&alphacentauriαcentauri'),
            ('&co;', '&co;'),
        )
        for text, expected in cases:
            self.assertEqual(p.unescape(text), expected)
    def test_broken_comments(self):
        html = ('<! not really a comment >'
                '<! not a comment either -->'
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -59,6 +59,8 @@ Library
 - Issue #19449: in csv's writerow, handle non-string keys when generating the
  error message that certain keys are not in the 'fieldnames' list.
 - Issue #2927: Added the unescape() function to the html module.
 - Issue #8402: Added the escape() function to the glob module.
 - Issue #17618: Add Base85 and Ascii85 encoding/decoding to the base64 module.