mirror of https://github.com/python/cpython
Patch #912410: Replace HTML entity references for attribute values
in HTMLParser.
This commit is contained in:
parent
ff432e6f4a
commit
ab8a6bba25
|
@ -75,14 +75,18 @@ This method is called to handle the start of a tag. It is intended to
|
|||
be overridden by a derived class; the base class implementation does
|
||||
nothing.
|
||||
|
||||
The \var{tag} argument is the name of the tag converted to
|
||||
lower case. The \var{attrs} argument is a list of \code{(\var{name},
|
||||
\var{value})} pairs containing the attributes found inside the tag's
|
||||
\code{<>} brackets. The \var{name} will be translated to lower case
|
||||
and double quotes and backslashes in the \var{value} have been
|
||||
interpreted. For instance, for the tag \code{<A
|
||||
HREF="http://www.cwi.nl/">}, this method would be called as
|
||||
The \var{tag} argument is the name of the tag converted to lower case.
|
||||
The \var{attrs} argument is a list of \code{(\var{name}, \var{value})}
|
||||
pairs containing the attributes found inside the tag's \code{<>}
|
||||
brackets. The \var{name} will be translated to lower case, and quotes
|
||||
in the \var{value} have been removed, and character and entity
|
||||
references have been replaced. For instance, for the tag \code{<A
|
||||
HREF="http://www.cwi.nl/">}, this method would be called as
|
||||
\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
|
||||
|
||||
\versionchanged[All entity references from htmlentitydefs are now
|
||||
replaced in the attribute values]{2.6}
|
||||
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{handle_startendtag}{tag, attrs}
|
||||
|
|
|
@ -358,12 +358,30 @@ class HTMLParser(markupbase.ParserBase):
|
|||
self.error("unknown declaration: %r" % (data,))
|
||||
|
||||
# Internal -- helper to remove special character quoting
|
||||
entitydefs = None
|
||||
def unescape(self, s):
|
||||
if '&' not in s:
|
||||
return s
|
||||
s = s.replace("&lt;", "<")
|
||||
s = s.replace("&gt;", ">")
|
||||
s = s.replace("&apos;", "'")
|
||||
s = s.replace("&quot;", '"')
|
||||
s = s.replace("&amp;", "&") # Must be last
|
||||
return s
|
||||
def replaceEntities(s):
|
||||
s = s.groups()[0]
|
||||
if s[0] == "#":
|
||||
s = s[1:]
|
||||
if s[0] in ['x','X']:
|
||||
c = int(s[1:], 16)
|
||||
else:
|
||||
c = int(s)
|
||||
return unichr(c)
|
||||
else:
|
||||
# Cannot use name2codepoint directly, because HTMLParser supports apos,
|
||||
# which is not part of HTML 4
|
||||
import htmlentitydefs
|
||||
if HTMLParser.entitydefs is None:
|
||||
entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
|
||||
for k, v in htmlentitydefs.name2codepoint.iteritems():
|
||||
entitydefs[k] = unichr(v)
|
||||
try:
|
||||
return self.entitydefs[s]
|
||||
except KeyError:
|
||||
return '&'+s+';'
|
||||
|
||||
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
|
||||
|
|
|
@ -309,6 +309,11 @@ DOCTYPE html [
|
|||
("endtag", "script"),
|
||||
])
|
||||
|
||||
def test_entityrefs_in_attributes(self):
|
||||
self._run_check("<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>", [
|
||||
("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])
|
||||
])
|
||||
|
||||
|
||||
def test_main():
|
||||
test_support.run_unittest(HTMLParserTestCase)
|
||||
|
|
Loading…
Reference in New Issue