From 8673ab97cc1930f5f2c5d96667386e09d22d60ec Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 2 Feb 2013 10:28:30 +0200 Subject: [PATCH] =?UTF-8?q?Issue=20#11159:=20SAX=C2=A0parser=20now=20suppo?= =?UTF-8?q?rts=20unicode=20file=20names.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Lib/test/test_sax.py | 50 ++++++++++++++++++++++++++++++++++++++ Lib/xml/sax/expatreader.py | 5 +++- Lib/xml/sax/saxutils.py | 28 +++++++++++++++++---- Misc/NEWS | 2 ++ 4 files changed, 79 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_sax.py b/Lib/test/test_sax.py index c3b44f82dc4..c7604a1ad61 100644 --- a/Lib/test/test_sax.py +++ b/Lib/test/test_sax.py @@ -14,6 +14,8 @@ from xml.sax.expatreader import create_parser from xml.sax.handler import feature_namespaces from xml.sax.xmlreader import InputSource, AttributesImpl, AttributesNSImpl from cStringIO import StringIO +import shutil +import test.test_support as support from test.test_support import findfile, run_unittest import unittest @@ -384,6 +386,22 @@ class ExpatReaderTest(XmlTestBase): self.assertEqual(result.getvalue(), xml_test_out) + @unittest.skipUnless(hasattr(support, 'TESTFN_UNICODE'), + 'Requires unicode filenames support') + def test_expat_file_unicode(self): + fname = support.TESTFN_UNICODE + shutil.copyfile(TEST_XMLFILE, fname) + self.addCleanup(support.unlink, fname) + + parser = create_parser() + result = StringIO() + xmlgen = XMLGenerator(result) + + parser.setContentHandler(xmlgen) + parser.parse(open(fname)) + + self.assertEqual(result.getvalue(), xml_test_out) + # ===== DTDHandler support class TestDTDHandler: @@ -523,6 +541,22 @@ class ExpatReaderTest(XmlTestBase): self.assertEqual(result.getvalue(), xml_test_out) + @unittest.skipUnless(hasattr(support, 'TESTFN_UNICODE'), + 'Requires unicode filenames support') + def test_expat_inpsource_sysid_unicode(self): + fname = support.TESTFN_UNICODE + shutil.copyfile(TEST_XMLFILE, fname) + self.addCleanup(support.unlink, fname) + + parser = create_parser() + result = StringIO() + xmlgen = XMLGenerator(result) + + parser.setContentHandler(xmlgen) + parser.parse(InputSource(fname)) + + self.assertEqual(result.getvalue(), xml_test_out) + def test_expat_inpsource_stream(self): parser = create_parser() result = StringIO() @@ -596,6 +630,22 @@ class ExpatReaderTest(XmlTestBase): self.assertEqual(parser.getSystemId(), TEST_XMLFILE) self.assertEqual(parser.getPublicId(), None) + @unittest.skipUnless(hasattr(support, 'TESTFN_UNICODE'), + 'Requires unicode filenames support') + def test_expat_locator_withinfo_unicode(self): + fname = support.TESTFN_UNICODE + shutil.copyfile(TEST_XMLFILE, fname) + self.addCleanup(support.unlink, fname) + + result = StringIO() + xmlgen = XMLGenerator(result) + parser = create_parser() + parser.setContentHandler(xmlgen) + parser.parse(fname) + + self.assertEqual(parser.getSystemId(), fname) + self.assertEqual(parser.getPublicId(), None) + # =========================================================================== # diff --git a/Lib/xml/sax/expatreader.py b/Lib/xml/sax/expatreader.py index 92a79c1c74d..9de3e72307d 100644 --- a/Lib/xml/sax/expatreader.py +++ b/Lib/xml/sax/expatreader.py @@ -108,7 +108,10 @@ class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator): def prepareParser(self, source): if source.getSystemId() is not None: - self._parser.SetBase(source.getSystemId()) + base = source.getSystemId() + if isinstance(base, unicode): + base = base.encode('utf-8') + self._parser.SetBase(base) # Redefined setContentHandler to allow changing handlers during parsing diff --git a/Lib/xml/sax/saxutils.py b/Lib/xml/sax/saxutils.py index 97d65d8fd33..7989713f588 100644 --- a/Lib/xml/sax/saxutils.py +++ b/Lib/xml/sax/saxutils.py @@ -4,6 +4,7 @@ convenience of application and driver writers. """ import os, urlparse, urllib, types +import sys import handler import xmlreader @@ -293,14 +294,31 @@ def prepare_input_source(source, base = ""): source.setSystemId(f.name) if source.getByteStream() is None: - sysid = source.getSystemId() - basehead = os.path.dirname(os.path.normpath(base)) - sysidfilename = os.path.join(basehead, sysid) - if os.path.isfile(sysidfilename): + try: + sysid = source.getSystemId() + basehead = os.path.dirname(os.path.normpath(base)) + encoding = sys.getfilesystemencoding() + if isinstance(sysid, unicode): + if not isinstance(basehead, unicode): + try: + basehead = basehead.decode(encoding) + except UnicodeDecodeError: + sysid = sysid.encode(encoding) + else: + if isinstance(basehead, unicode): + try: + sysid = sysid.decode(encoding) + except UnicodeDecodeError: + basehead = basehead.encode(encoding) + sysidfilename = os.path.join(basehead, sysid) + isfile = os.path.isfile(sysidfilename) + except UnicodeError: + isfile = False + if isfile: source.setSystemId(sysidfilename) f = open(sysidfilename, "rb") else: - source.setSystemId(urlparse.urljoin(base, sysid)) + source.setSystemId(urlparse.urljoin(base, source.getSystemId())) f = urllib.urlopen(source.getSystemId()) source.setByteStream(f) diff --git a/Misc/NEWS b/Misc/NEWS index fb68ddebb28..cfe99c98abe 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -202,6 +202,8 @@ Core and Builtins Library ------- +- Issue #11159: SAX parser now supports unicode file names. + - Issue #6972: The zipfile module no longer overwrites files outside of its destination path when extracting malicious zip files.