diff --git a/Doc/lib/libsgmllib.tex b/Doc/lib/libsgmllib.tex index 3ec10181006..1fe0d6309b6 100644 --- a/Doc/lib/libsgmllib.tex +++ b/Doc/lib/libsgmllib.tex @@ -132,27 +132,59 @@ nothing. \begin{methoddesc}{handle_charref}{ref} This method is called to process a character reference of the form -\samp{\&\#\var{ref};}. In the base implementation, \var{ref} must -be a decimal number in the -range 0-255. It translates the character to \ASCII{} and calls the -method \method{handle_data()} with the character as argument. If -\var{ref} is invalid or out of range, the method -\code{unknown_charref(\var{ref})} is called to handle the error. A -subclass must override this method to provide support for named -character entities. +\samp{\&\#\var{ref};}. The base implementation uses +\method{convert_charref()} to convert the reference to a string. If +that method returns a string, it is passed to \method{handle_data()}, +otherwise \method{unknown_charref(\var{ref})} is called to handle the +error. +\versionchanged[Use \method{convert_charref()} instead of hard-coding +the conversion]{2.5} +\end{methoddesc} + +\begin{methoddesc}{convert_charref}{ref} +Convert a character reference to a string, or \code{None}. \var{ref} +is the reference passed in as a string. In the base implementation, +\var{ref} must be a decimal number in the range 0-255. It converts +the code point found using the \method{convert_codepoint()} method. +If \var{ref} is invalid or out of range, this method returns +\code{None}. This method is called by the default +\method{handle_charref()} implementation and by the attribute value +parser. +\versionadded{2.5} +\end{methoddesc} + +\begin{methoddesc}{convert_codepoint}{codepoint} +Convert a codepoint to a \class{str} value. Encodings can be handled +here if appropriate, though the rest of \module{sgmllib} is oblivious +on this matter. 
+\versionadded{2.5} \end{methoddesc} \begin{methoddesc}{handle_entityref}{ref} This method is called to process a general entity reference of the form \samp{\&\var{ref};} where \var{ref} is an general entity -reference. It looks for \var{ref} in the instance (or class) -variable \member{entitydefs} which should be a mapping from entity -names to corresponding translations. If a translation is found, it +reference. It converts \var{ref} by passing it to +\method{convert_entityref()}. If a translation is returned, it calls the method \method{handle_data()} with the translation; otherwise, it calls the method \code{unknown_entityref(\var{ref})}. The default \member{entitydefs} defines translations for \code{\&amp}, \code{\&apos}, \code{\&gt}, \code{\&lt}, and \code{\&quot}. +\versionchanged[Use \method{convert_entityref()} instead of hard-coding +the conversion]{2.5} +\end{methoddesc} + +\begin{methoddesc}{convert_entityref}{ref} +Convert a named entity reference to a \class{str} value, or +\code{None}. The resulting value will not be parsed. \var{ref} will +be only the name of the entity. The default implementation looks for +\var{ref} in the instance (or class) variable \member{entitydefs} +which should be a mapping from entity names to corresponding +translations. If no translation is available for \var{ref}, this +method returns \code{None}. This method is called by the default +\method{handle_entityref()} implementation and by the attribute value +parser. +\versionadded{2.5} \end{methoddesc} \begin{methoddesc}{handle_comment}{comment} diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 27352a1adf4..5c59a5c24fc 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -53,6 +53,10 @@ class SGMLParseError(RuntimeError): # self.handle_entityref() with the entity reference as argument. 
class SGMLParser(markupbase.ParserBase): + # Pattern matching entity and character references -- derived classes may override + entity_or_charref = re.compile('&(?:' + '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' + ')(;?)') def __init__(self, verbose=0): """Initialize and reset this instance.""" @@ -277,32 +281,8 @@ class SGMLParser(markupbase.ParserBase): attrvalue[:1] == '"' == attrvalue[-1:]): # strip quotes attrvalue = attrvalue[1:-1] - l = 0 - new_attrvalue = '' - while l < len(attrvalue): - av_match = entityref.match(attrvalue, l) - if (av_match and av_match.group(1) in self.entitydefs and - attrvalue[av_match.end(1)] == ';'): - # only substitute entityrefs ending in ';' since - # otherwise we may break - # which is very common - new_attrvalue += self.entitydefs[av_match.group(1)] - l = av_match.end(0) - continue - ch_match = charref.match(attrvalue, l) - if ch_match: - try: - char = chr(int(ch_match.group(1))) - new_attrvalue += char - l = ch_match.end(0) - continue - except ValueError: - # invalid character reference, don't substitute - pass - # all other cases - new_attrvalue += attrvalue[l] - l += 1 - attrvalue = new_attrvalue + attrvalue = self.entity_or_charref.sub( + self._convert_ref, attrvalue) attrs.append((attrname.lower(), attrvalue)) k = match.end(0) if rawdata[j] == '>': @@ -311,6 +291,17 @@ class SGMLParser(markupbase.ParserBase): self.finish_starttag(tag, attrs) return j + # Internal -- convert entity or character reference + def _convert_ref(self, match): + if match.group(2): + return self.convert_charref(match.group(2)) or \ + '&#%s%s' % match.groups()[1:] + elif match.group(3): + return self.convert_entityref(match.group(1)) or \ + '&%s;' % match.group(1) + else: + return '&%s' % match.group(1) + # Internal -- parse endtag def parse_endtag(self, i): rawdata = self.rawdata @@ -394,35 +385,51 @@ class SGMLParser(markupbase.ParserBase): print '*** Unbalanced ' print '*** Stack:', self.stack - def handle_charref(self, name): - """Handle character reference, no need to 
override.""" + def convert_charref(self, name): + """Convert character reference, may be overridden.""" try: n = int(name) except ValueError: - self.unknown_charref(name) return if not 0 <= n <= 255: - self.unknown_charref(name) return - self.handle_data(chr(n)) + return self.convert_codepoint(n) + + def convert_codepoint(self, codepoint): + return chr(codepoint) + + def handle_charref(self, name): + """Handle character reference, no need to override.""" + replacement = self.convert_charref(name) + if replacement is None: + self.unknown_charref(name) + else: + self.handle_data(replacement) # Definition of entities -- derived classes may override entitydefs = \ {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} - def handle_entityref(self, name): - """Handle entity references. + def convert_entityref(self, name): + """Convert entity references. - There should be no need to override this method; it can be - tailored by setting up the self.entitydefs mapping appropriately. + As an alternative to overriding this method, one can tailor the + results by setting up the self.entitydefs mapping appropriately. 
""" table = self.entitydefs if name in table: - self.handle_data(table[name]) + return table[name] else: - self.unknown_entityref(name) return + def handle_entityref(self, name): + """Handle entity references, no need to override.""" + replacement = self.convert_entityref(name) + if replacement is None: + self.unknown_entityref(name) + else: + self.handle_data(replacement) + # Example -- handle data, should be overridden def handle_data(self, data): pass diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py index ec417d032ad..31b54de3845 100644 --- a/Lib/test/test_sgmllib.py +++ b/Lib/test/test_sgmllib.py @@ -64,6 +64,23 @@ class CDATAEventCollector(EventCollector): self.setliteral() +class HTMLEntityCollector(EventCollector): + import re, htmlentitydefs + entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)' + '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)') + + def convert_charref(self, name): + self.append(("charref", "convert", name)) + if name.startswith('x'): + return unichr(int(name[1:],16)) + else: + return unichr(int(name)) + + def convert_entityref(self, name): + self.append(("entityref", "convert", name)) + return unichr(self.htmlentitydefs.name2codepoint[name]) + + class SGMLParserTestCase(unittest.TestCase): collector = EventCollector @@ -233,6 +250,16 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ("k", "*"), ])]) + def test_convert_overrides(self): + self.collector = HTMLEntityCollector + self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [ + ('entityref', 'convert', 'ldquo'), + ('charref', 'convert', 'x201d'), + ('starttag', 'a', [('title', u'\u201ctest\u201d')]), + ('data', 'foo'), + ('endtag', 'a'), + ]) + def test_attr_funky_names(self): self.check_events("""""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), diff --git a/Misc/ACKS b/Misc/ACKS index 4e29c7a2ca3..c51303f707a 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -528,6 +528,7 @@ Hugo van Rossum Saskia van Rossum Donald Wallace Rouse II Liam Routt +Sam Ruby Paul Rubin Audun S. 
Runde Jeff Rush