Patch #912410: Replace HTML entity references for attribute values

in HTMLParser.
This commit is contained in:
Martin v. Löwis 2007-03-06 14:43:00 +00:00
parent ff432e6f4a
commit ab8a6bba25
4 changed files with 43 additions and 13 deletions

View File

@ -75,14 +75,18 @@ This method is called to handle the start of a tag. It is intended to
be overridden by a derived class; the base class implementation does
nothing.
The \var{tag} argument is the name of the tag converted to
lower case. The \var{attrs} argument is a list of \code{(\var{name},
\var{value})} pairs containing the attributes found inside the tag's
\code{<>} brackets. The \var{name} will be translated to lower case
and double quotes and backslashes in the \var{value} have been
interpreted. For instance, for the tag \code{<A
The \var{tag} argument is the name of the tag converted to lower case.
The \var{attrs} argument is a list of \code{(\var{name}, \var{value})}
pairs containing the attributes found inside the tag's \code{<>}
brackets. The \var{name} will have been translated to lower case, quotes
around the \var{value} will have been removed, and character and entity
references will have been replaced. For instance, for the tag \code{<A
HREF="http://www.cwi.nl/">}, this method would be called as
\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
\versionchanged[All entity references from htmlentitydefs are now
replaced in the attribute values]{2.6}
\end{methoddesc}
\begin{methoddesc}{handle_startendtag}{tag, attrs}

View File

@ -358,12 +358,30 @@ class HTMLParser(markupbase.ParserBase):
self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting
# Cache mapping entity name -> unicode character; built on first use by
# unescape() and stored on the HTMLParser class.
entitydefs = None
def unescape(self, s):
    """Return s with character and entity references replaced.

    Decimal (&#97;) and hexadecimal (&#x61;) character references and
    named entities (those in htmlentitydefs.name2codepoint, plus "apos")
    are replaced by the corresponding unicode character.  References that
    cannot be resolved -- unknown names, malformed numbers, or code points
    unichr() rejects -- are left in the text unchanged.
    """
    if '&' not in s:
        return s
    def replaceEntities(match):
        ref = match.group(1)
        if ref[0] == "#":
            digits = ref[1:]
            try:
                if digits[0] in ('x', 'X'):
                    return unichr(int(digits[1:], 16))
                return unichr(int(digits))
            except (ValueError, OverflowError):
                # The pattern below also matches malformed references
                # such as "&#xyz;", and unichr() rejects out-of-range
                # code points; keep such text verbatim instead of raising.
                return match.group(0)
        # Cannot use name2codepoint directly, because HTMLParser supports
        # apos, which is not part of HTML 4
        if HTMLParser.entitydefs is None:
            import htmlentitydefs
            table = {'apos': u"'"}
            for name, codepoint in htmlentitydefs.name2codepoint.iteritems():
                table[name] = unichr(codepoint)
            HTMLParser.entitydefs = table
        try:
            return self.entitydefs[ref]
        except KeyError:
            return '&' + ref + ';'
    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)

View File

@ -309,6 +309,11 @@ DOCTYPE html [
("endtag", "script"),
])
def test_entityrefs_in_attributes(self):
    # One attribute value carrying named (&euro;, &amp;), decimal (&#97;)
    # and hexadecimal (&#x61;) references plus an unknown entity, which
    # is expected to come through unchanged.
    source = "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>"
    expected_events = [
        ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])
    ]
    self._run_check(source, expected_events)
def test_main():
    """Run HTMLParserTestCase through test_support.run_unittest."""
    test_support.run_unittest(HTMLParserTestCase)

View File

@ -141,6 +141,9 @@ Core and builtins
Library
-------
- Patch #912410: Replace HTML entity references for attribute values
in HTMLParser.
- Patch #1663234: you can now run doctest on test files and modules
using "python -m doctest [-v] filename ...".