From 7f6b67c2359d9b52b2eabc1e2ccff4cedb5c78b7 Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Sat, 1 Apr 2006 08:35:18 +0000 Subject: [PATCH] patch #1462498: handle entityrefs in attribute values. --- Doc/lib/libsgmllib.tex | 7 +++++-- Lib/sgmllib.py | 34 +++++++++++++++++++++++++++++++--- Lib/test/test_sgmllib.py | 14 ++++++++++++++ Misc/NEWS | 3 +++ 4 files changed, 53 insertions(+), 5 deletions(-) diff --git a/Doc/lib/libsgmllib.tex b/Doc/lib/libsgmllib.tex index 27bf0b0ff3d..592c1916318 100644 --- a/Doc/lib/libsgmllib.tex +++ b/Doc/lib/libsgmllib.tex @@ -95,12 +95,15 @@ lower case, and the \var{method} argument is the bound method which should be used to support semantic interpretation of the start tag. The \var{attributes} argument is a list of \code{(\var{name}, \var{value})} pairs containing the attributes found inside the tag's -\code{<>} brackets. The \var{name} has been translated to lower case -and double quotes and backslashes in the \var{value} have been interpreted. +\code{<>} brackets. The \var{name} has been translated to lower case. +Double quotes and backslashes in the \var{value} have been interpreted, +as well as known entity and character references. For instance, for the tag \code{}, this method would be called as \samp{unknown_starttag('a', [('href', 'http://www.cwi.nl/')])}. The base implementation simply calls \var{method} with \var{attributes} as the only argument. +\versionadded[Handling of entity and character references within + attribute values]{2.5} \end{methoddesc} \begin{methoddesc}{handle_endtag}{tag, method} diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 08e365bdef9..784dbe1b8a4 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -269,9 +269,37 @@ class SGMLParser(markupbase.ParserBase): attrname, rest, attrvalue = match.group(1, 2, 3) if not rest: attrvalue = attrname - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] + else: + if (attrvalue[:1] == "'" == attrvalue[-1:] or + attrvalue[:1] == '"' == attrvalue[-1:]): + # strip quotes + attrvalue = attrvalue[1:-1] + l = 0 + new_attrvalue = '' + while l < len(attrvalue): + av_match = entityref.match(attrvalue, l) + if (av_match and av_match.group(1) in self.entitydefs and + attrvalue[av_match.end(1)] == ';'): + # only substitute entityrefs ending in ';' since + # otherwise we may break + # which is very common + new_attrvalue += self.entitydefs[av_match.group(1)] + l = av_match.end(0) + continue + ch_match = charref.match(attrvalue, l) + if ch_match: + try: + char = chr(int(ch_match.group(1))) + new_attrvalue += char + l = ch_match.end(0) + continue + except ValueError: + # invalid character reference, don't substitute + pass + # all other cases + new_attrvalue += attrvalue[l] + l += 1 + attrvalue = new_attrvalue attrs.append((attrname.lower(), attrvalue)) k = match.end(0) if rawdata[j] == '>': diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index bc25bd0195c..8e8b02f8770 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -214,6 +214,20 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ("starttag", "e", [("a", "rgb(1,2,3)")]), ]) + def test_attr_values_entities(self): + """Substitution of entities and charrefs in attribute values""" + # SF bug #1452246 + self.check_events("""""", + [("starttag", "a", [("b", "<"), + ("c", "<>"), + ("d", "<->"), + ("e", "< "), + ("f", "&xxx;"), + ("g", " !"), + ("h", "Ǵ"), + ("i", "x?a=b&c=d;"), ])]) + def test_attr_funky_names(self): self.check_events("""""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), diff --git a/Misc/NEWS b/Misc/NEWS index 0ecea2e4d28..260d38f18b2 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -489,6 +489,9 @@ Extension Modules Library ------- +- Patch #1462498: sgmllib now handles entity and character references + in attribute values. + - Added the sqlite3 package. This is based on pysqlite2.1.3, and provides a DB-API interface in the standard library. You'll need sqlite 3.2.2 or later to build this - if you have an earlier version, the C extension