#1065986: Make pydoc handle unicode strings.

Patch by Akira Kitada.
2014-01-05 12:35:59 -05:00 · 2014-01-05 12:35:59 -05:00 · 984f630f0a
parent 1d2ef64df6
commit 984f630f0a
3 changed files with 137 additions and 14 deletions
--- a/Lib/pydoc.py
+++ b/Lib/pydoc.py
@ -81,6 +81,7 @@ def pathdirs():
 def getdoc(object):
    """Get the doc string or comments for an object."""
    result = inspect.getdoc(object) or inspect.getcomments(object)
+    result = _encode(result)
    return result and re.sub('^ *\n', '', rstrip(result)) or ''

 def splitdoc(doc):
@ -182,6 +183,34 @@ def classify_class_attrs(object):
        return name, kind, cls, value
    return map(fixup, inspect.classify_class_attrs(object))

+# ----------------------------------------------------- Unicode support helpers
+
+try:
+    _unicode = unicode
+except NameError:
+    # If Python is built without Unicode support, the unicode type
+    # will not exist. Fake one that nothing will match, and make
+    # the _encode function do nothing.
+    class _unicode(object):
+        pass
+    _encoding = 'ascii'
+    def _encode(text, encoding='ascii'):
+        return text
+else:
+    import locale
+    _encoding = locale.getpreferredencoding()
+
+    def _encode(text, encoding=None):
+        if isinstance(text, unicode):
+            return text.encode(encoding or _encoding, 'xmlcharrefreplace')
+        else:
+            return text
+
+def _binstr(obj):
+    # Return a byte-string form of obj. Unicode is encoded like _encode does,
+    # with character references, so unencodable characters cannot raise.
+    return obj.encode(_encoding, 'xmlcharrefreplace') if isinstance(obj, _unicode) else str(obj)
+
 # ----------------------------------------------------- module manipulation

 def ispackage(path):
@ -424,12 +453,13 @@ class HTMLDoc(Doc):

    def page(self, title, contents):
        """Format an HTML page."""
-        return '''
+        return _encode('''
 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
 <html><head><title>Python: %s</title>
+<meta charset="utf-8">
 </head><body bgcolor="#f0f0f8">
 %s
-</body></html>''' % (title, contents)
+</body></html>''' % (title, contents), 'ascii')

    def heading(self, title, fgcol, bgcol, extras=''):
        """Format a page heading."""
@ -606,12 +636,12 @@ class HTMLDoc(Doc):
            filelink = '(built-in)'
        info = []
        if hasattr(object, '__version__'):
-            version = str(object.__version__)
+            version = _binstr(object.__version__)
            if version[:11] == '$' + 'Revision: ' and version[-1:] == '$':
                version = strip(version[11:-1])
            info.append('version %s' % self.escape(version))
        if hasattr(object, '__date__'):
-            info.append(self.escape(str(object.__date__)))
+            info.append(self.escape(_binstr(object.__date__)))
        if info:
            head = head + ' (%s)' % join(info, ', ')
        docloc = self.getdocloc(object)
@ -694,11 +724,11 @@ class HTMLDoc(Doc):
            result = result + self.bigsection(
                'Data', '#ffffff', '#55aa55', join(contents, '<br>\n'))
        if hasattr(object, '__author__'):
-            contents = self.markup(str(object.__author__), self.preformat)
+            contents = self.markup(_binstr(object.__author__), self.preformat)
            result = result + self.bigsection(
                'Author', '#ffffff', '#7799ee', contents)
        if hasattr(object, '__credits__'):
-            contents = self.markup(str(object.__credits__), self.preformat)
+            contents = self.markup(_binstr(object.__credits__), self.preformat)
            result = result + self.bigsection(
                'Credits', '#ffffff', '#7799ee', contents)

@ -1116,16 +1146,16 @@ class TextDoc(Doc):
            result = result + self.section('DATA', join(contents, '\n'))

        if hasattr(object, '__version__'):
-            version = str(object.__version__)
+            version = _binstr(object.__version__)
            if version[:11] == '$' + 'Revision: ' and version[-1:] == '$':
                version = strip(version[11:-1])
            result = result + self.section('VERSION', version)
        if hasattr(object, '__date__'):
-            result = result + self.section('DATE', str(object.__date__))
+            result = result + self.section('DATE', _binstr(object.__date__))
        if hasattr(object, '__author__'):
-            result = result + self.section('AUTHOR', str(object.__author__))
+            result = result + self.section('AUTHOR', _binstr(object.__author__))
        if hasattr(object, '__credits__'):
-            result = result + self.section('CREDITS', str(object.__credits__))
+            result = result + self.section('CREDITS', _binstr(object.__credits__))
        return result

    def docclass(self, object, name=None, mod=None, *ignored):
@ -1375,7 +1405,7 @@ def pipepager(text, cmd):
    """Page through text by feeding it to another program."""
    pipe = os.popen(cmd, 'w')
    try:
-        pipe.write(text)
+        pipe.write(_encode(text))
        pipe.close()
    except IOError:
        pass # Ignore broken pipes caused by quitting the pager program.
@ -1385,7 +1415,7 @@ def tempfilepager(text, cmd):
    import tempfile
    filename = tempfile.mktemp()
    file = open(filename, 'w')
-    file.write(text)
+    file.write(_encode(text))
    file.close()
    try:
        os.system(cmd + ' "' + filename + '"')
@ -1394,7 +1424,7 @@ def tempfilepager(text, cmd):

 def ttypager(text):
    """Page through text on a text terminal."""
-    lines = split(plain(text), '\n')
+    lines = plain(_encode(plain(text), getattr(sys.stdout, 'encoding', _encoding))).split('\n')
    try:
        import tty
        fd = sys.stdin.fileno()
@ -1432,7 +1462,7 @@ def ttypager(text):

 def plainpager(text):
    """Simply print unformatted text.  This is the ultimate fallback."""
-    sys.stdout.write(plain(text))
+    sys.stdout.write(_encode(plain(text), getattr(sys.stdout, 'encoding', _encoding)))

 def describe(thing):
    """Produce a short description of the given thing."""
--- a/Lib/test/test_pydoc.py
+++ b/Lib/test/test_pydoc.py
@ -10,6 +10,7 @@ import keyword
 import pkgutil
 import unittest
 import xml.etree
+import types
 import test.test_support
 from collections import namedtuple
 from test.script_helper import assert_python_ok
@ -428,6 +429,95 @@ class TestDescriptions(unittest.TestCase):
        self.assertIn('_asdict', helptext)


+@unittest.skipUnless(test.test_support.have_unicode,
+                     "test requires unicode support")
+class TestUnicode(unittest.TestCase):
+
+    def setUp(self):
+        # Better not to use unicode escapes in literals, lest the
+        # parser choke on them if Python has been built without
+        # unicode support.
+        self.Q  = types.ModuleType(
+            'Q', 'Rational numbers: \xe2\x84\x9a'.decode('utf8'))
+        self.Q.__version__ = '\xe2\x84\x9a'.decode('utf8')
+        self.Q.__date__ = '\xe2\x84\x9a'.decode('utf8')
+        self.Q.__author__ = '\xe2\x84\x9a'.decode('utf8')
+        self.Q.__credits__ = '\xe2\x84\x9a'.decode('utf8')
+
+        self.assertIsInstance(self.Q.__doc__, unicode)
+
+    def test_render_doc(self):
+        # render_doc is robust against unicode in docstrings
+        doc = pydoc.render_doc(self.Q)
+        self.assertIsInstance(doc, str)
+
+    def test_encode(self):
+        # _encode is robust against characters outside the specified encoding
+        self.assertEqual(pydoc._encode(self.Q.__doc__, 'ascii'), 'Rational numbers: &#8474;')
+
+    def test_pipepager(self):
+        # pipepager does not choke on unicode
+        doc = pydoc.render_doc(self.Q)
+
+        saved, os.popen = os.popen, open
+        try:
+            with test.test_support.temp_cwd():
+                pydoc.pipepager(doc, 'pipe')
+                self.assertEqual(open('pipe').read(), pydoc._encode(doc))
+        finally:
+            os.popen = saved
+
+    def test_tempfilepager(self):
+        # tempfilepager does not choke on unicode
+        doc = pydoc.render_doc(self.Q)
+
+        output = {}
+        def mock_system(cmd):
+            import ast
+            output['content'] = open(ast.literal_eval(cmd.strip())).read()
+        saved, os.system = os.system, mock_system
+        try:
+            pydoc.tempfilepager(doc, '')
+            self.assertEqual(output['content'], pydoc._encode(doc))
+        finally:
+            os.system = saved
+
+    def test_plainpager(self):
+        # plainpager does not choke on unicode
+        doc = pydoc.render_doc(self.Q)
+
+        # Note: captured_stdout is too permissive when it comes to
+        # unicode, and using it here would make the test always
+        # pass.
+        with test.test_support.temp_cwd():
+            with open('output', 'w') as f:
+                saved, sys.stdout = sys.stdout, f
+                try:
+                    pydoc.plainpager(doc)
+                finally:
+                    sys.stdout = saved
+            self.assertIn('Rational numbers:', open('output').read())
+
+    def test_ttypager(self):
+        # ttypager does not choke on unicode
+        doc = pydoc.render_doc(self.Q)
+        # Test ttypager
+        with test.test_support.temp_cwd(), test.test_support.captured_stdin():
+            with open('output', 'w') as f:
+                saved, sys.stdout = sys.stdout, f
+                try:
+                    pydoc.ttypager(doc)
+                finally:
+                    sys.stdout = saved
+            self.assertIn('Rational numbers:', open('output').read())
+
+    def test_htmlpage(self):
+        # html.page does not choke on unicode
+        with test.test_support.temp_cwd():
+            with captured_stdout() as output:
+                pydoc.writedoc(self.Q)
+        self.assertEqual(output.getvalue(), 'wrote Q.html\n')
+
 class TestHelper(unittest.TestCase):
    def test_keywords(self):
        self.assertEqual(sorted(pydoc.Helper.keywords),
@ -456,6 +546,7 @@ def test_main():
        test.test_support.run_unittest(PydocDocTest,
                                       PydocImportTest,
                                       TestDescriptions,
+                                       TestUnicode,
                                       TestHelper)
    finally:
        reap_children()
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -30,6 +30,8 @@ Core and Builtins
 Library
 -------

+- Issue #1065986: pydoc can now handle unicode strings.
+
 - Issue #16039: CVE-2013-1752: Change use of readline in imaplib module to
  limit line length.  Patch by Emil Lind.