diff --git a/Doc/lib/libsgmllib.tex b/Doc/lib/libsgmllib.tex
index 3ec10181006..1fe0d6309b6 100644
--- a/Doc/lib/libsgmllib.tex
+++ b/Doc/lib/libsgmllib.tex
@@ -132,27 +132,59 @@ nothing.
\begin{methoddesc}{handle_charref}{ref}
This method is called to process a character reference of the form
-\samp{\&\#\var{ref};}. In the base implementation, \var{ref} must
-be a decimal number in the
-range 0-255. It translates the character to \ASCII{} and calls the
-method \method{handle_data()} with the character as argument. If
-\var{ref} is invalid or out of range, the method
-\code{unknown_charref(\var{ref})} is called to handle the error. A
-subclass must override this method to provide support for named
-character entities.
+\samp{\&\#\var{ref};}. The base implementation uses
+\method{convert_charref()} to convert the reference to a string. If
+that method returns a string, it is passed to \method{handle_data()},
+otherwise \method{unknown_charref(\var{ref})} is called to handle the
+error.
+\versionchanged[Use \method{convert_charref()} instead of hard-coding
+the conversion]{2.5}
+\end{methoddesc}
+
+\begin{methoddesc}{convert_charref}{ref}
+Convert a character reference to a string, or \code{None}. \var{ref}
+is the reference passed in as a string. In the base implementation,
+\var{ref} must be a decimal number in the range 0-255. It converts
+the code point found using the \method{convert_codepoint()} method.
+If \var{ref} is invalid or out of range, this method returns
+\code{None}. This method is called by the default
+\method{handle_charref()} implementation and by the attribute value
+parser.
+\versionadded{2.5}
+\end{methoddesc}
+
+\begin{methoddesc}{convert_codepoint}{codepoint}
+Convert a codepoint to a \class{str} value. Encodings can be handled
+here if appropriate, though the rest of \module{sgmllib} is oblivious
+to this matter.
+\versionadded{2.5}
\end{methoddesc}
\begin{methoddesc}{handle_entityref}{ref}
This method is called to process a general entity reference of the
form \samp{\&\var{ref};} where \var{ref} is an general entity
-reference. It looks for \var{ref} in the instance (or class)
-variable \member{entitydefs} which should be a mapping from entity
-names to corresponding translations. If a translation is found, it
+reference. It converts \var{ref} by passing it to
+\method{convert_entityref()}. If a translation is returned, it
calls the method \method{handle_data()} with the translation;
otherwise, it calls the method \code{unknown_entityref(\var{ref})}.
The default \member{entitydefs} defines translations for
\code{\&amp;}, \code{\&apos}, \code{\&gt;}, \code{\&lt;}, and
\code{\&quot;}.
+\versionchanged[Use \method{convert_entityref()} instead of hard-coding
+the conversion]{2.5}
+\end{methoddesc}
+
+\begin{methoddesc}{convert_entityref}{ref}
+Convert a named entity reference to a \class{str} value, or
+\code{None}. The resulting value will not be parsed. \var{ref} will
+be only the name of the entity. The default implementation looks for
+\var{ref} in the instance (or class) variable \member{entitydefs}
+which should be a mapping from entity names to corresponding
+translations. If no translation is available for \var{ref}, this
+method returns \code{None}. This method is called by the default
+\method{handle_entityref()} implementation and by the attribute value
+parser.
+\versionadded{2.5}
\end{methoddesc}
\begin{methoddesc}{handle_comment}{comment}
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 27352a1adf4..5c59a5c24fc 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -53,6 +53,10 @@ class SGMLParseError(RuntimeError):
# self.handle_entityref() with the entity reference as argument.
class SGMLParser(markupbase.ParserBase):
+ # Pattern matching an entity or character reference inside an attribute
+ # value -- derived classes may override.  Group 1 is the entity name,
+ # group 2 the decimal digits of a character reference, and group 3 the
+ # optional trailing ';'.
+ entity_or_charref = re.compile('&(?:'
+ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
+ ')(;?)')
def __init__(self, verbose=0):
"""Initialize and reset this instance."""
@@ -277,32 +281,8 @@ class SGMLParser(markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]):
# strip quotes
attrvalue = attrvalue[1:-1]
- l = 0
- new_attrvalue = ''
- while l < len(attrvalue):
- av_match = entityref.match(attrvalue, l)
- if (av_match and av_match.group(1) in self.entitydefs and
- attrvalue[av_match.end(1)] == ';'):
- # only substitute entityrefs ending in ';' since
- # otherwise we may break
- # which is very common
- new_attrvalue += self.entitydefs[av_match.group(1)]
- l = av_match.end(0)
- continue
- ch_match = charref.match(attrvalue, l)
- if ch_match:
- try:
- char = chr(int(ch_match.group(1)))
- new_attrvalue += char
- l = ch_match.end(0)
- continue
- except ValueError:
- # invalid character reference, don't substitute
- pass
- # all other cases
- new_attrvalue += attrvalue[l]
- l += 1
- attrvalue = new_attrvalue
+ attrvalue = self.entity_or_charref.sub(
+ self._convert_ref, attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = match.end(0)
if rawdata[j] == '>':
@@ -311,6 +291,17 @@ class SGMLParser(markupbase.ParserBase):
self.finish_starttag(tag, attrs)
return j
+    # Internal -- convert entity or character reference
+    def _convert_ref(self, match):
+        """Return the replacement text for one entity_or_charref match.
+
+        Used as the callback for entity_or_charref.sub() on attribute
+        values.  When a reference cannot be converted, the text that was
+        matched is reproduced so the attribute value is left intact.
+        """
+        if match.group(2):
+            # Character reference: fall back to '&#' + digits + optional ';'.
+            return self.convert_charref(match.group(2)) or \
+                '&#%s%s' % match.groups()[1:]
+        elif match.group(3):
+            # Named entity terminated by ';'.
+            return self.convert_entityref(match.group(1)) or \
+                '&%s;' % match.group(1)
+        else:
+            # Named entity without ';' is never substituted.
+            return '&%s' % match.group(1)
+
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
@@ -394,35 +385,51 @@ class SGMLParser(markupbase.ParserBase):
print '*** Unbalanced ' + tag + '>'
print '*** Stack:', self.stack
- def handle_charref(self, name):
- """Handle character reference, no need to override."""
+ def convert_charref(self, name):
+ """Convert character reference, may be overridden."""
try:
n = int(name)
except ValueError:
- self.unknown_charref(name)
return
if not 0 <= n <= 255:
- self.unknown_charref(name)
return
- self.handle_data(chr(n))
+ return self.convert_codepoint(n)
+
+ def convert_codepoint(self, codepoint):
+ """Convert an integer code point to a string with chr(); may be overridden."""
+ return chr(codepoint)
+
+    def handle_charref(self, name):
+        """Handle character reference, no need to override."""
+        # Convert once through the overridable hook; None means the
+        # reference is invalid or out of range.
+        replacement = self.convert_charref(name)
+        if replacement is None:
+            self.unknown_charref(name)
+        else:
+            self.handle_data(replacement)
# Definition of entities -- derived classes may override
entitydefs = \
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
- def handle_entityref(self, name):
- """Handle entity references.
+ def convert_entityref(self, name):
+ """Convert entity references.
- There should be no need to override this method; it can be
- tailored by setting up the self.entitydefs mapping appropriately.
+ As an alternative to overriding this method, one can tailor the
+ results by setting up the self.entitydefs mapping appropriately.
+ Returns None when name has no entry in self.entitydefs.
"""
table = self.entitydefs
if name in table:
- self.handle_data(table[name])
+ return table[name]
else:
- self.unknown_entityref(name)
return
+    def handle_entityref(self, name):
+        """Handle entity references, no need to override."""
+        # Convert once through the overridable hook; None means no
+        # translation is available for this entity name.
+        replacement = self.convert_entityref(name)
+        if replacement is None:
+            self.unknown_entityref(name)
+        else:
+            self.handle_data(replacement)
+
# Example -- handle data, should be overridden
def handle_data(self, data):
pass
diff --git a/Lib/test/test_sgmllib.py b/Lib/test/test_sgmllib.py
index ec417d032ad..31b54de3845 100644
--- a/Lib/test/test_sgmllib.py
+++ b/Lib/test/test_sgmllib.py
@@ -64,6 +64,23 @@ class CDATAEventCollector(EventCollector):
self.setliteral()
+class HTMLEntityCollector(EventCollector):
+    # Collector whose convert_* overrides record each call and return
+    # unicode text.  The modules are imported in the class body so that
+    # they are reachable as attributes (self.htmlentitydefs).
+    import re, htmlentitydefs
+    # Same shape as the pattern it overrides (name | '#' number, then an
+    # optional ';'), extended to accept hexadecimal references such as
+    # '&#x201d;'.
+    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
+        '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
+
+    def convert_charref(self, name):
+        self.append(("charref", "convert", name))
+        if name.startswith('x'):
+            return unichr(int(name[1:],16))
+        else:
+            return unichr(int(name))
+
+    def convert_entityref(self, name):
+        self.append(("entityref", "convert", name))
+        return unichr(self.htmlentitydefs.name2codepoint[name])
+
+
class SGMLParserTestCase(unittest.TestCase):
collector = EventCollector
@@ -233,6 +250,16 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
("k", "*"),
])])
+    def test_convert_overrides(self):
+        # The attribute value mixes a named entity and a hexadecimal
+        # character reference; both must be routed through the
+        # collector's convert_* overrides before the start tag is reported.
+        self.collector = HTMLEntityCollector
+        self.check_events('<a title="&ldquo;test&#x201d;">foo</a>', [
+            ('entityref', 'convert', 'ldquo'),
+            ('charref', 'convert', 'x201d'),
+            ('starttag', 'a', [('title', u'\u201ctest\u201d')]),
+            ('data', 'foo'),
+            ('endtag', 'a'),
+            ])
+
def test_attr_funky_names(self):
self.check_events("""""", [
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
diff --git a/Misc/ACKS b/Misc/ACKS
index 4e29c7a2ca3..c51303f707a 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -528,6 +528,7 @@ Hugo van Rossum
Saskia van Rossum
Donald Wallace Rouse II
Liam Routt
+Sam Ruby
Paul Rubin
Audun S. Runde
Jeff Rush