patch #1462498: handle entityrefs in attribute values.
This commit is contained in:
parent
48d5e508eb
commit
7f6b67c235
|
@ -95,12 +95,15 @@ lower case, and the \var{method} argument is the bound method which
|
|||
should be used to support semantic interpretation of the start tag.
|
||||
The \var{attributes} argument is a list of \code{(\var{name},
|
||||
\var{value})} pairs containing the attributes found inside the tag's
|
||||
\code{<>} brackets. The \var{name} has been translated to lower case
|
||||
and double quotes and backslashes in the \var{value} have been interpreted.
|
||||
\code{<>} brackets. The \var{name} has been translated to lower case.
|
||||
Double quotes and backslashes in the \var{value} have been interpreted,
|
||||
as well as known entity and character references.
|
||||
For instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this
|
||||
method would be called as \samp{unknown_starttag('a', [('href',
|
||||
'http://www.cwi.nl/')])}. The base implementation simply calls
|
||||
\var{method} with \var{attributes} as the only argument.
|
||||
\versionadded[Handling of entity and character references within
|
||||
attribute values]{2.5}
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{handle_endtag}{tag, method}
|
||||
|
|
|
@ -269,9 +269,37 @@ class SGMLParser(markupbase.ParserBase):
|
|||
attrname, rest, attrvalue = match.group(1, 2, 3)
|
||||
if not rest:
|
||||
attrvalue = attrname
|
||||
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
|
||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||
attrvalue = attrvalue[1:-1]
|
||||
else:
|
||||
if (attrvalue[:1] == "'" == attrvalue[-1:] or
|
||||
attrvalue[:1] == '"' == attrvalue[-1:]):
|
||||
# strip quotes
|
||||
attrvalue = attrvalue[1:-1]
|
||||
l = 0
|
||||
new_attrvalue = ''
|
||||
while l < len(attrvalue):
|
||||
av_match = entityref.match(attrvalue, l)
|
||||
if (av_match and av_match.group(1) in self.entitydefs and
|
||||
attrvalue[av_match.end(1)] == ';'):
|
||||
# only substitute entityrefs ending in ';' since
|
||||
# otherwise we may break <a href='?p=x&q=y'>
|
||||
# which is very common
|
||||
new_attrvalue += self.entitydefs[av_match.group(1)]
|
||||
l = av_match.end(0)
|
||||
continue
|
||||
ch_match = charref.match(attrvalue, l)
|
||||
if ch_match:
|
||||
try:
|
||||
char = chr(int(ch_match.group(1)))
|
||||
new_attrvalue += char
|
||||
l = ch_match.end(0)
|
||||
continue
|
||||
except ValueError:
|
||||
# invalid character reference, don't substitute
|
||||
pass
|
||||
# all other cases
|
||||
new_attrvalue += attrvalue[l]
|
||||
l += 1
|
||||
attrvalue = new_attrvalue
|
||||
attrs.append((attrname.lower(), attrvalue))
|
||||
k = match.end(0)
|
||||
if rawdata[j] == '>':
|
||||
|
|
|
@ -214,6 +214,20 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
|
|||
("starttag", "e", [("a", "rgb(1,2,3)")]),
|
||||
])
|
||||
|
||||
def test_attr_values_entities(self):
|
||||
"""Substitution of entities and charrefs in attribute values"""
|
||||
# SF bug #1452246
|
||||
self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
|
||||
f="&xxx;" g='&#32;&#33;' h='&#500;' i='x?a=b&c=d;'>""",
|
||||
[("starttag", "a", [("b", "<"),
|
||||
("c", "<>"),
|
||||
("d", "&lt->"),
|
||||
("e", "< "),
|
||||
("f", "&xxx;"),
|
||||
("g", " !"),
|
||||
("h", "&#500;"),
|
||||
("i", "x?a=b&c=d;"), ])])
|
||||
|
||||
def test_attr_funky_names(self):
|
||||
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
|
||||
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
|
||||
|
|
|
@ -489,6 +489,9 @@ Extension Modules
|
|||
Library
|
||||
-------
|
||||
|
||||
- Patch #1462498: sgmllib now handles entity and character references
|
||||
in attribute values.
|
||||
|
||||
- Added the sqlite3 package. This is based on pysqlite2.1.3, and provides
|
||||
a DB-API interface in the standard library. You'll need sqlite 3.2.2 or
|
||||
later to build this - if you have an earlier version, the C extension
|
||||
|
|
Loading…
Reference in New Issue