#13633: Added a new convert_charrefs keyword arg to HTMLParser that, when True, automatically converts all character references.
This commit is contained in:
parent
e7f87e1262
commit
95401c5f6b
|
@ -16,14 +16,21 @@
|
|||
This module defines a class :class:`HTMLParser` which serves as the basis for
|
||||
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
||||
|
||||
.. class:: HTMLParser(strict=False)
|
||||
.. class:: HTMLParser(strict=False, *, convert_charrefs=False)
|
||||
|
||||
Create a parser instance. If *strict* is ``False`` (the default), the parser
|
||||
will accept and parse invalid markup. If *strict* is ``True`` the parser
|
||||
will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
|
||||
it's not able to parse the markup.
|
||||
The use of ``strict=True`` is discouraged and the *strict* argument is
|
||||
deprecated.
|
||||
Create a parser instance.
|
||||
|
||||
If *convert_charrefs* is ``True`` (default: ``False``), all character
|
||||
references (except the ones in ``script``/``style`` elements) are
|
||||
automatically converted to the corresponding Unicode characters.
|
||||
The use of ``convert_charrefs=True`` is encouraged and will become
|
||||
the default in Python 3.5.
|
||||
|
||||
If *strict* is ``False`` (the default), the parser will accept and parse
|
||||
invalid markup. If *strict* is ``True`` the parser will raise an
|
||||
:exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not
|
||||
able to parse the markup. The use of ``strict=True`` is discouraged and
|
||||
the *strict* argument is deprecated.
|
||||
|
||||
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
|
||||
when start tags, end tags, text, comments, and other markup elements are
|
||||
|
@ -34,12 +41,15 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
|||
handler for elements which are closed implicitly by closing an outer element.
|
||||
|
||||
.. versionchanged:: 3.2
|
||||
*strict* keyword added.
|
||||
*strict* argument added.
|
||||
|
||||
.. deprecated-removed:: 3.3 3.5
|
||||
The *strict* argument and the strict mode have been deprecated.
|
||||
The parser is now able to accept and parse invalid markup too.
|
||||
|
||||
.. versionchanged:: 3.4
|
||||
*convert_charrefs* keyword argument added.
|
||||
|
||||
An exception is defined as well:
|
||||
|
||||
|
||||
|
@ -181,7 +191,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
|
|||
|
||||
This method is called to process a named character reference of the form
|
||||
``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
|
||||
(e.g. ``'gt'``).
|
||||
(e.g. ``'gt'``). This method is never called if *convert_charrefs* is
|
||||
``True``.
|
||||
|
||||
|
||||
.. method:: HTMLParser.handle_charref(name)
|
||||
|
@ -189,7 +200,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
|
|||
This method is called to process decimal and hexadecimal numeric character
|
||||
references of the form ``&#NNN;`` and ``&#xNNN;``. For example, the decimal
|
||||
equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
|
||||
in this case the method will receive ``'62'`` or ``'x3E'``.
|
||||
in this case the method will receive ``'62'`` or ``'x3E'``. This method
|
||||
is never called if *convert_charrefs* is ``True``.
|
||||
|
||||
|
||||
.. method:: HTMLParser.handle_comment(data)
|
||||
|
@ -324,7 +336,8 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
|
|||
Num ent : >
|
||||
|
||||
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
|
||||
:meth:`~HTMLParser.handle_data` might be called more than once::
|
||||
:meth:`~HTMLParser.handle_data` might be called more than once
|
||||
(unless *convert_charrefs* is set to ``True``)::
|
||||
|
||||
>>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
|
||||
... parser.feed(chunk)
|
||||
|
|
|
@ -97,7 +97,7 @@ class HTMLParseError(Exception):
|
|||
return result
|
||||
|
||||
|
||||
_strict_sentinel = object()
|
||||
_default_sentinel = object()
|
||||
|
||||
class HTMLParser(_markupbase.ParserBase):
|
||||
"""Find tags and other markup and call handler functions.
|
||||
|
@ -112,28 +112,39 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
self.handle_startendtag(); end tags by self.handle_endtag(). The
|
||||
data between tags is passed from the parser to the derived class
|
||||
by calling self.handle_data() with the data as argument (the data
|
||||
may be split up in arbitrary chunks). Entity references are
|
||||
passed by calling self.handle_entityref() with the entity
|
||||
reference as the argument. Numeric character references are
|
||||
passed to self.handle_charref() with the string containing the
|
||||
reference as the argument.
|
||||
may be split up in arbitrary chunks). If convert_charrefs is
|
||||
True the character references are converted automatically to the
|
||||
corresponding Unicode character (and self.handle_data() is no
|
||||
longer split in chunks), otherwise they are passed by calling
|
||||
self.handle_entityref() or self.handle_charref() with the string
|
||||
containing respectively the named or numeric reference as the
|
||||
argument.
|
||||
"""
|
||||
|
||||
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
||||
|
||||
def __init__(self, strict=_strict_sentinel):
|
||||
def __init__(self, strict=_default_sentinel, *,
|
||||
convert_charrefs=_default_sentinel):
|
||||
"""Initialize and reset this instance.
|
||||
|
||||
If convert_charrefs is True (default: False), all character references
|
||||
are automatically converted to the corresponding Unicode characters.
|
||||
If strict is set to False (the default) the parser will parse invalid
|
||||
markup, otherwise it will raise an error. Note that the strict mode
|
||||
and argument are deprecated.
|
||||
"""
|
||||
if strict is not _strict_sentinel:
|
||||
if strict is not _default_sentinel:
|
||||
warnings.warn("The strict argument and mode are deprecated.",
|
||||
DeprecationWarning, stacklevel=2)
|
||||
else:
|
||||
strict = False # default
|
||||
self.strict = strict
|
||||
if convert_charrefs is _default_sentinel:
|
||||
convert_charrefs = False # default
|
||||
warnings.warn("The value of convert_charrefs will become True in "
|
||||
"3.5. You are encouraged to set the value explicitly.",
|
||||
DeprecationWarning, stacklevel=2)
|
||||
self.convert_charrefs = convert_charrefs
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
|
@ -184,14 +195,25 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
i = 0
|
||||
n = len(rawdata)
|
||||
while i < n:
|
||||
match = self.interesting.search(rawdata, i) # < or &
|
||||
if match:
|
||||
j = match.start()
|
||||
if self.convert_charrefs and not self.cdata_elem:
|
||||
j = rawdata.find('<', i)
|
||||
if j < 0:
|
||||
if not end:
|
||||
break # wait till we get all the text
|
||||
j = n
|
||||
else:
|
||||
if self.cdata_elem:
|
||||
break
|
||||
j = n
|
||||
if i < j: self.handle_data(rawdata[i:j])
|
||||
match = self.interesting.search(rawdata, i) # < or &
|
||||
if match:
|
||||
j = match.start()
|
||||
else:
|
||||
if self.cdata_elem:
|
||||
break
|
||||
j = n
|
||||
if i < j:
|
||||
if self.convert_charrefs and not self.cdata_elem:
|
||||
self.handle_data(unescape(rawdata[i:j]))
|
||||
else:
|
||||
self.handle_data(rawdata[i:j])
|
||||
i = self.updatepos(i, j)
|
||||
if i == n: break
|
||||
startswith = rawdata.startswith
|
||||
|
@ -226,7 +248,10 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
k = i + 1
|
||||
else:
|
||||
k += 1
|
||||
self.handle_data(rawdata[i:k])
|
||||
if self.convert_charrefs and not self.cdata_elem:
|
||||
self.handle_data(unescape(rawdata[i:k]))
|
||||
else:
|
||||
self.handle_data(rawdata[i:k])
|
||||
i = self.updatepos(i, k)
|
||||
elif startswith("&#", i):
|
||||
match = charref.match(rawdata, i)
|
||||
|
@ -277,7 +302,10 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
assert 0, "interesting.search() lied"
|
||||
# end while
|
||||
if end and i < n and not self.cdata_elem:
|
||||
self.handle_data(rawdata[i:n])
|
||||
if self.convert_charrefs and not self.cdata_elem:
|
||||
self.handle_data(unescape(rawdata[i:n]))
|
||||
else:
|
||||
self.handle_data(rawdata[i:n])
|
||||
i = self.updatepos(i, n)
|
||||
self.rawdata = rawdata[i:]
|
||||
|
||||
|
|
|
@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector):
|
|||
self.append(("starttag_text", self.get_starttag_text()))
|
||||
|
||||
|
||||
class EventCollectorCharrefs(EventCollector):
|
||||
|
||||
def get_events(self):
|
||||
return self.events
|
||||
|
||||
def handle_charref(self, data):
|
||||
self.fail('This should never be called with convert_charrefs=True')
|
||||
|
||||
def handle_entityref(self, data):
|
||||
self.fail('This should never be called with convert_charrefs=True')
|
||||
|
||||
|
||||
class TestCaseBase(unittest.TestCase):
|
||||
|
||||
def get_collector(self):
|
||||
|
@ -84,12 +96,14 @@ class TestCaseBase(unittest.TestCase):
|
|||
parser.close()
|
||||
events = parser.get_events()
|
||||
if events != expected_events:
|
||||
self.fail("received events did not match expected events\n"
|
||||
"Expected:\n" + pprint.pformat(expected_events) +
|
||||
self.fail("received events did not match expected events" +
|
||||
"\nSource:\n" + repr(source) +
|
||||
"\nExpected:\n" + pprint.pformat(expected_events) +
|
||||
"\nReceived:\n" + pprint.pformat(events))
|
||||
|
||||
def _run_check_extra(self, source, events):
|
||||
self._run_check(source, events, EventCollectorExtra())
|
||||
self._run_check(source, events,
|
||||
EventCollectorExtra(convert_charrefs=False))
|
||||
|
||||
def _parse_error(self, source):
|
||||
def parse(source=source):
|
||||
|
@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseBase):
|
|||
|
||||
def get_collector(self):
|
||||
with support.check_warnings(("", DeprecationWarning), quite=False):
|
||||
return EventCollector(strict=True)
|
||||
return EventCollector(strict=True, convert_charrefs=False)
|
||||
|
||||
def test_processing_instruction_only(self):
|
||||
self._run_check("<?processing instruction>", [
|
||||
|
@ -335,7 +349,7 @@ text
|
|||
self._run_check(s, [("starttag", element_lower, []),
|
||||
("data", content),
|
||||
("endtag", element_lower)],
|
||||
collector=Collector())
|
||||
collector=Collector(convert_charrefs=False))
|
||||
|
||||
def test_comments(self):
|
||||
html = ("<!-- I'm a valid comment -->"
|
||||
|
@ -363,13 +377,53 @@ text
|
|||
('comment', '[if lte IE 7]>pretty?<![endif]')]
|
||||
self._run_check(html, expected)
|
||||
|
||||
def test_convert_charrefs(self):
|
||||
collector = lambda: EventCollectorCharrefs(convert_charrefs=True)
|
||||
self.assertTrue(collector().convert_charrefs)
|
||||
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
|
||||
# check charrefs in the middle of the text/attributes
|
||||
expected = [('starttag', 'a', [('href', 'foo"zar')]),
|
||||
('data', 'a"z'), ('endtag', 'a')]
|
||||
for charref in charrefs:
|
||||
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
|
||||
expected, collector=collector())
|
||||
# check charrefs at the beginning/end of the text/attributes
|
||||
expected = [('data', '"'),
|
||||
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
|
||||
('data', '"'), ('endtag', 'a'), ('data', '"')]
|
||||
for charref in charrefs:
|
||||
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
|
||||
'{0}</a>{0}'.format(charref),
|
||||
expected, collector=collector())
|
||||
# check charrefs in <script>/<style> elements
|
||||
for charref in charrefs:
|
||||
text = 'X'.join([charref]*3)
|
||||
expected = [('data', '"'),
|
||||
('starttag', 'script', []), ('data', text),
|
||||
('endtag', 'script'), ('data', '"'),
|
||||
('starttag', 'style', []), ('data', text),
|
||||
('endtag', 'style'), ('data', '"')]
|
||||
self._run_check('{1}<script>{0}</script>{1}'
|
||||
'<style>{0}</style>{1}'.format(text, charref),
|
||||
expected, collector=collector())
|
||||
# check truncated charrefs at the end of the file
|
||||
html = '&quo &# &#x'
|
||||
for x in range(1, len(html)):
|
||||
self._run_check(html[:x], [('data', html[:x])],
|
||||
collector=collector())
|
||||
# check a string with no charrefs
|
||||
self._run_check('no charrefs here', [('data', 'no charrefs here')],
|
||||
collector=collector())
|
||||
|
||||
|
||||
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
|
||||
|
||||
def get_collector(self):
|
||||
return EventCollector()
|
||||
return EventCollector(convert_charrefs=False)
|
||||
|
||||
def test_deprecation_warnings(self):
|
||||
with self.assertWarns(DeprecationWarning):
|
||||
EventCollector() # convert_charrefs not passed explicitly
|
||||
with self.assertWarns(DeprecationWarning):
|
||||
EventCollector(strict=True)
|
||||
with self.assertWarns(DeprecationWarning):
|
||||
|
@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseBase):
|
|||
|
||||
def get_collector(self):
|
||||
with support.check_warnings(("", DeprecationWarning), quite=False):
|
||||
return EventCollector(strict=True)
|
||||
return EventCollector(strict=True, convert_charrefs=False)
|
||||
|
||||
def test_attr_syntax(self):
|
||||
output = [
|
||||
|
@ -691,7 +745,7 @@ class AttributesStrictTestCase(TestCaseBase):
|
|||
class AttributesTolerantTestCase(AttributesStrictTestCase):
|
||||
|
||||
def get_collector(self):
|
||||
return EventCollector()
|
||||
return EventCollector(convert_charrefs=False)
|
||||
|
||||
def test_attr_funky_names2(self):
|
||||
self._run_check(
|
||||
|
|
|
@ -132,6 +132,9 @@ Library
|
|||
- Issue #19449: in csv's writerow, handle non-string keys when generating the
|
||||
error message that certain keys are not in the 'fieldnames' list.
|
||||
|
||||
- Issue #13633: Added a new convert_charrefs keyword arg to HTMLParser that,
|
||||
when True, automatically converts all character references.
|
||||
|
||||
- Issue #2927: Added the unescape() function to the html module.
|
||||
|
||||
- Issue #8402: Added the escape() function to the glob module.
|
||||
|
|
Loading…
Reference in New Issue