mirror of https://github.com/python/cpython
Patch #912410: Replace HTML entity references for attribute values
in HTMLParser.
This commit is contained in:
parent
ff432e6f4a
commit
ab8a6bba25
|
@ -75,14 +75,18 @@ This method is called to handle the start of a tag. It is intended to
|
||||||
be overridden by a derived class; the base class implementation does
|
be overridden by a derived class; the base class implementation does
|
||||||
nothing.
|
nothing.
|
||||||
|
|
||||||
The \var{tag} argument is the name of the tag converted to
|
The \var{tag} argument is the name of the tag converted to lower case.
|
||||||
lower case. The \var{attrs} argument is a list of \code{(\var{name},
|
The \var{attrs} argument is a list of \code{(\var{name}, \var{value})}
|
||||||
\var{value})} pairs containing the attributes found inside the tag's
|
pairs containing the attributes found inside the tag's \code{<>}
|
||||||
\code{<>} brackets. The \var{name} will be translated to lower case
|
brackets. The \var{name} will be translated to lower case, and quotes
|
||||||
and double quotes and backslashes in the \var{value} have been
|
in the \var{value} have been removed, and character and entity
|
||||||
interpreted. For instance, for the tag \code{<A
|
references have been replaced. For instance, for the tag \code{<A
|
||||||
HREF="http://www.cwi.nl/">}, this method would be called as
|
HREF="http://www.cwi.nl/">}, this method would be called as
|
||||||
\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
|
\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
|
||||||
|
|
||||||
|
\versionchanged[All entity references from htmlentitydefs are now
|
||||||
|
replaced in the attribute values]{2.6}
|
||||||
|
|
||||||
\end{methoddesc}
|
\end{methoddesc}
|
||||||
|
|
||||||
\begin{methoddesc}{handle_startendtag}{tag, attrs}
|
\begin{methoddesc}{handle_startendtag}{tag, attrs}
|
||||||
|
|
|
@ -358,12 +358,30 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
self.error("unknown declaration: %r" % (data,))
|
self.error("unknown declaration: %r" % (data,))
|
||||||
|
|
||||||
# Internal -- helper to remove special character quoting
|
# Internal -- helper to remove special character quoting
|
||||||
|
entitydefs = None
|
||||||
def unescape(self, s):
|
def unescape(self, s):
|
||||||
if '&' not in s:
|
if '&' not in s:
|
||||||
return s
|
return s
|
||||||
s = s.replace("&lt;", "<")
|
def replaceEntities(s):
|
||||||
s = s.replace("&gt;", ">")
|
s = s.groups()[0]
|
||||||
s = s.replace("&apos;", "'")
|
if s[0] == "#":
|
||||||
s = s.replace("&quot;", '"')
|
s = s[1:]
|
||||||
s = s.replace("&amp;", "&") # Must be last
|
if s[0] in ['x','X']:
|
||||||
return s
|
c = int(s[1:], 16)
|
||||||
|
else:
|
||||||
|
c = int(s)
|
||||||
|
return unichr(c)
|
||||||
|
else:
|
||||||
|
# Cannot use name2codepoint directly, because HTMLParser supports apos,
|
||||||
|
# which is not part of HTML 4
|
||||||
|
import htmlentitydefs
|
||||||
|
if HTMLParser.entitydefs is None:
|
||||||
|
entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
|
||||||
|
for k, v in htmlentitydefs.name2codepoint.iteritems():
|
||||||
|
entitydefs[k] = unichr(v)
|
||||||
|
try:
|
||||||
|
return self.entitydefs[s]
|
||||||
|
except KeyError:
|
||||||
|
return '&'+s+';'
|
||||||
|
|
||||||
|
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
|
||||||
|
|
|
@ -309,6 +309,11 @@ DOCTYPE html [
|
||||||
("endtag", "script"),
|
("endtag", "script"),
|
||||||
])
|
])
|
||||||
|
|
||||||
|
def test_entityrefs_in_attributes(self):
|
||||||
|
self._run_check("<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>", [
|
||||||
|
("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
def test_main():
|
def test_main():
|
||||||
test_support.run_unittest(HTMLParserTestCase)
|
test_support.run_unittest(HTMLParserTestCase)
|
||||||
|
|
|
@ -141,6 +141,9 @@ Core and builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Patch #912410: Replace HTML entity references for attribute values
|
||||||
|
in HTMLParser.
|
||||||
|
|
||||||
- Patch #1663234: you can now run doctest on test files and modules
|
- Patch #1663234: you can now run doctest on test files and modules
|
||||||
using "python -m doctest [-v] filename ...".
|
using "python -m doctest [-v] filename ...".
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue