Patch #912410: Replace HTML entity references for attribute values in HTMLParser.
This commit is contained in:
Martin v. Löwis 2007-03-06 14:43:00 +00:00
parent ff432e6f4a
commit ab8a6bba25
4 changed files with 43 additions and 13 deletions

View File

@ -75,14 +75,18 @@ This method is called to handle the start of a tag. It is intended to
be overridden by a derived class; the base class implementation does be overridden by a derived class; the base class implementation does
nothing. nothing.
The \var{tag} argument is the name of the tag converted to The \var{tag} argument is the name of the tag converted to lower case.
lower case. The \var{attrs} argument is a list of \code{(\var{name}, The \var{attrs} argument is a list of \code{(\var{name}, \var{value})}
\var{value})} pairs containing the attributes found inside the tag's pairs containing the attributes found inside the tag's \code{<>}
\code{<>} brackets. The \var{name} will be translated to lower case brackets. The \var{name} will be translated to lower case, and quotes
and double quotes and backslashes in the \var{value} have been in the \var{value} have been removed, and character and entity
interpreted. For instance, for the tag \code{<A references have been replaced. For instance, for the tag \code{<A
HREF="http://www.cwi.nl/">}, this method would be called as HREF="http://www.cwi.nl/">}, this method would be called as
\samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}. \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
\versionchanged[All entity references from htmlentitydefs are now
replaced in the attribute values]{2.6}
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{handle_startendtag}{tag, attrs} \begin{methoddesc}{handle_startendtag}{tag, attrs}

View File

@ -358,12 +358,30 @@ class HTMLParser(markupbase.ParserBase):
self.error("unknown declaration: %r" % (data,)) self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting # Internal -- helper to remove special character quoting
entitydefs = None
def unescape(self, s): def unescape(self, s):
if '&' not in s: if '&' not in s:
return s return s
s = s.replace("&lt;", "<") def replaceEntities(s):
s = s.replace("&gt;", ">") s = s.groups()[0]
s = s.replace("&apos;", "'") if s[0] == "#":
s = s.replace("&quot;", '"') s = s[1:]
s = s.replace("&amp;", "&") # Must be last if s[0] in ['x','X']:
return s c = int(s[1:], 16)
else:
c = int(s)
return unichr(c)
else:
# Cannot use name2codepoint directly, because HTMLParser supports apos,
# which is not part of HTML 4
import htmlentitydefs
if HTMLParser.entitydefs is None:
entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
for k, v in htmlentitydefs.name2codepoint.iteritems():
entitydefs[k] = unichr(v)
try:
return self.entitydefs[s]
except KeyError:
return '&'+s+';'
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)

View File

@ -309,6 +309,11 @@ DOCTYPE html [
("endtag", "script"), ("endtag", "script"),
]) ])
def test_entityrefs_in_attributes(self):
self._run_check("<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>", [
("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])
])
def test_main(): def test_main():
test_support.run_unittest(HTMLParserTestCase) test_support.run_unittest(HTMLParserTestCase)

View File

@ -141,6 +141,9 @@ Core and builtins
Library Library
------- -------
- Patch #912410: Replace HTML entity references for attribute values
in HTMLParser.
- Patch #1663234: you can now run doctest on test files and modules - Patch #1663234: you can now run doctest on test files and modules
using "python -m doctest [-v] filename ...". using "python -m doctest [-v] filename ...".