# cpython/Lib/html/__init__.py

"""
General functions for HTML manipulation.
"""

import re as _re
from html.entities import html5 as _html5


__all__ = ['escape', 'unescape']


def escape(s, quote=True):
    """
    Return s with the characters "&", "<" and ">" converted to their
    HTML-safe sequences.

    When the optional flag quote is true (the default), the double quote (")
    and single quote (') characters are converted as well, which makes the
    result safe to place inside a quoted attribute value.
    """
    # "&" has to be handled before anything else: every replacement below
    # introduces new ampersands that must not be escaped a second time.
    for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        s = s.replace(char, entity)
    if quote:
        for char, entity in (('"', "&quot;"), ("'", "&#x27;")):
            s = s.replace(char, entity)
    return s


# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references

# Numeric character references that are replaced by a fixed string instead of
# by chr(num).  _replace_charref() consults this table before it checks
# _invalid_codepoints, so the 0x80-0x9f entries below are substituted rather
# than dropped.  A few entries (0x0d, 0x81, 0x8d, 0x8f, 0x90, 0x9d) map to the
# character that has the same code.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}

# Code points for which a numeric character reference produces no output
# (unless _invalid_charrefs supplies a replacement for the same number).
_invalid_codepoints = {
    0xb,
    *range(0x1, 0x9),        # 0x0001 to 0x0008
    *range(0xe, 0x20),       # 0x000E to 0x001F
    *range(0x7f, 0xa0),      # 0x007F to 0x009F
    *range(0xfdd0, 0xfdf0),  # 0xFDD0 to 0xFDEF
    # the last two code points of each of the 17 planes:
    # 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ..., 0x10FFFE, 0x10FFFF
    *((plane << 16) | low
      for plane in range(0x11)
      for low in (0xfffe, 0xffff)),
}


def _replace_charref(s):
    """
    Return the replacement text for a single character reference.

    s is a match object produced by the _charref pattern; its group 1 holds
    the reference without the leading "&" and including the optional
    trailing ";".

    Numeric references ("#65;", "#x41;") resolve, in this order, to:
    the fixed replacement listed in _invalid_charrefs; U+FFFD for surrogates
    and for values above 0x10FFFF; the empty string for values listed in
    _invalid_codepoints; otherwise chr(num).

    Named references resolve through html.entities.html5.  If the whole name
    is unknown, the longest known prefix of at least two characters is
    replaced and the rest of the text is kept; if no prefix matches, the
    reference is returned unchanged, "&" included.
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            digits, base, max_digits = s[2:].rstrip(';'), 16, 6
        else:
            digits, base, max_digits = s[1:].rstrip(';'), 10, 7
        # Leading zeros carry no value; drop them so that only significant
        # digits are counted and converted.
        digits = digits.lstrip('0')
        # 0x10FFFF has 6 hexadecimal / 7 decimal digits, so anything longer
        # is out of range.  Answering here avoids calling int() on an
        # arbitrarily long digit run, which raises ValueError on
        # interpreters that cap integer string conversion length.
        if len(digits) > max_digits:
            return '\uFFFD'
        num = int(digits or '0', base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)
    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard)
    for x in range(len(s)-1, 1, -1):
        if s[:x] in _html5:
            return _html5[s[:x]] + s[x:]
    # no known entity name is a prefix of s: leave the text as it was
    return '&' + s


# Matches one character reference; group 1 is the text after the "&":
#   - "#" followed by decimal digits,
#   - "#x" or "#X" followed by hexadecimal digits, or
#   - a candidate entity name of 1 to 32 characters, none of which is a
#     tab, newline, form feed, space, "<", "&", "#" or ";".
# In every case a single trailing ";" is optional and, when present, is
# included in the group.
_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')

def unescape(s):
    """
    Replace every named and numeric character reference in the string s
    (e.g. &gt;, &#62;, &#x3e;) with the unicode character(s) it stands for.

    References are resolved following the rules of the HTML 5 standard,
    which cover both valid and invalid character references; named
    references are looked up in html.entities.html5.
    """
    # A string without "&" cannot contain a reference: hand it back as is.
    if '&' in s:
        return _charref.sub(_replace_charref, s)
    return s
#2830: add html.escape() helper and move cgi.escape() uses in the standard library to it. It defaults to quote=True and also escapes single quotes, which makes casual use safer. The cgi.escape() interface is not touched, but emits a (silent) PendingDeprecationWarning. 2010-10-15 12:57:45 -03:00			`"""`
			`General functions for HTML manipulation.`
			`"""`

#2927: Added the unescape() function to the html module. 2013-11-19 14:28:45 -04:00			`import re as _re`
			`from html.entities import html5 as _html5`


			`__all__ = ['escape', 'unescape']`

#2830: add html.escape() helper and move cgi.escape() uses in the standard library to it. It defaults to quote=True and also escapes single quotes, which makes casual use safer. The cgi.escape() interface is not touched, but emits a (silent) PendingDeprecationWarning. 2010-10-15 12:57:45 -03:00
			`def escape(s, quote=True):`
			`"""`
			`Replace special characters "&", "<" and ">" to HTML-safe sequences.`
			`If the optional flag quote is true (the default), the quotation mark`
Fix issue12938 - Update the docstring of html.escape. Include the information on single quote. 2011-09-12 20:14:13 -03:00			`characters, both double quote (") and single quote (') characters are also`
			`translated.`
#2830: add html.escape() helper and move cgi.escape() uses in the standard library to it. It defaults to quote=True and also escapes single quotes, which makes casual use safer. The cgi.escape() interface is not touched, but emits a (silent) PendingDeprecationWarning. 2010-10-15 12:57:45 -03:00			`"""`
#18020: improve html.escape speed by an order of magnitude. Patch by Matt Bryant. 2013-07-07 06:11:24 -03:00			`s = s.replace("&", "&amp;") # Must be done first!`
			`s = s.replace("<", "&lt;")`
			`s = s.replace(">", "&gt;")`
#2830: add html.escape() helper and move cgi.escape() uses in the standard library to it. It defaults to quote=True and also escapes single quotes, which makes casual use safer. The cgi.escape() interface is not touched, but emits a (silent) PendingDeprecationWarning. 2010-10-15 12:57:45 -03:00			`if quote:`
#18020: improve html.escape speed by an order of magnitude. Patch by Matt Bryant. 2013-07-07 06:11:24 -03:00			`s = s.replace('"', "&quot;")`
			`s = s.replace('\'', "&#x27;")`
			`return s`
#2927: Added the unescape() function to the html module. 2013-11-19 14:28:45 -04:00

			`# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references`

			`_invalid_charrefs = {`
			`0x00: '\ufffd', # REPLACEMENT CHARACTER`
			`0x0d: '\r', # CARRIAGE RETURN`
			`0x80: '\u20ac', # EURO SIGN`
			`0x81: '\x81', # <control>`
			`0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK`
			`0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK`
			`0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK`
			`0x85: '\u2026', # HORIZONTAL ELLIPSIS`
			`0x86: '\u2020', # DAGGER`
			`0x87: '\u2021', # DOUBLE DAGGER`
			`0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT`
			`0x89: '\u2030', # PER MILLE SIGN`
			`0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON`
			`0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK`
			`0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE`
			`0x8d: '\x8d', # <control>`
			`0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON`
			`0x8f: '\x8f', # <control>`
			`0x90: '\x90', # <control>`
			`0x91: '\u2018', # LEFT SINGLE QUOTATION MARK`
			`0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK`
			`0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK`
			`0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK`
			`0x95: '\u2022', # BULLET`
			`0x96: '\u2013', # EN DASH`
			`0x97: '\u2014', # EM DASH`
			`0x98: '\u02dc', # SMALL TILDE`
			`0x99: '\u2122', # TRADE MARK SIGN`
			`0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON`
			`0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK`
			`0x9c: '\u0153', # LATIN SMALL LIGATURE OE`
			`0x9d: '\x9d', # <control>`
			`0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON`
			`0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS`
			`}`

			`_invalid_codepoints = {`
			`# 0x0001 to 0x0008`
			`0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,`
			`# 0x000E to 0x001F`
			`0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,`
			`0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,`
			`# 0x007F to 0x009F`
			`0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,`
			`0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,`
			`0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,`
			`# 0xFDD0 to 0xFDEF`
			`0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,`
			`0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,`
			`0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,`
			`0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,`
			`# others`
			`0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,`
			`0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,`
			`0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,`
			`0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,`
			`0x10fffe, 0x10ffff`
			`}`


			`def _replace_charref(s):`
			`s = s.group(1)`
			`if s[0] == '#':`
			`# numeric charref`
			`if s[1] in 'xX':`
			`num = int(s[2:].rstrip(';'), 16)`
			`else:`
			`num = int(s[1:].rstrip(';'))`
			`if num in _invalid_charrefs:`
			`return _invalid_charrefs[num]`
			`if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:`
			`return '\uFFFD'`
			`if num in _invalid_codepoints:`
			`return ''`
			`return chr(num)`
			`else:`
			`# named charref`
			`if s in _html5:`
			`return _html5[s]`
			`# find the longest matching name (as defined by the standard)`
			`for x in range(len(s)-1, 1, -1):`
			`if s[:x] in _html5:`
			`return _html5[s[:x]] + s[x:]`
			`else:`
			`return '&' + s`


			`_charref = _re.compile(r'&(#[0-9]+;?'`
			`r'|#[xX][0-9a-fA-F]+;?'`
			`r'|[^\t\n\f <&#;]{1,32};?)')`

			`def unescape(s):`
			`"""`
			`Convert all named and numeric character references (e.g. &gt;, &#62;,`
			`&x3e;) in the string s to the corresponding unicode characters.`
			`This function uses the rules defined by the HTML 5 standard`
			`for both valid and invalid character references, and the list of`
			`HTML 5 named character references defined in html.entities.html5.`
			`"""`
			`if '&' not in s:`
			`return s`
			`return _charref.sub(_replace_charref, s)`