Backport from trunk r56727:

Fix a bug in the gb18030 codec: when encoding, it failed to map the
two-byte characters of the GB18030 extension. (Bug reported by Bjorn Stabell.)
This commit is contained in:
Hye-Shik Chang 2007-08-04 04:15:04 +00:00
parent 979f5cd3e0
commit 36fe3c0a84
4 changed files with 27 additions and 1 deletions

View File

@ -19,10 +19,18 @@ class TestGBKMap(test_multibytecodec_support.TestBase_Mapping,
mapfileurl = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/' \
'MICSFT/WINDOWS/CP936.TXT'
class TestGB18030Map(test_multibytecodec_support.TestBase_Mapping,
                     unittest.TestCase):
    # Mapping test case for the 'gb18030' codec.  The reference table is
    # an XML document (the URL ends in '.xml'), unlike the plain-text
    # CP936.TXT table used for GBK.
    encoding = 'gb18030'
    mapfileurl = ('http://source.icu-project.org/repos/icu/data/'
                  'trunk/charset/data/xml/gb-18030-2000.xml')
def test_main():
    # Gather the mapping test cases for every codec covered here into a
    # single suite, in a fixed order, and hand it to the regrtest runner.
    suite = unittest.TestSuite()
    for case in (TestGB2312Map, TestGBKMap, TestGB18030Map):
        suite.addTest(unittest.makeSuite(case))
    test_support.run_suite(suite)
if __name__ == "__main__":

View File

@ -5,7 +5,7 @@
#
import sys, codecs, os.path
import unittest
import unittest, re
from test import test_support
from StringIO import StringIO
@ -272,6 +272,12 @@ class TestBase_Mapping(unittest.TestCase):
return test_support.open_urlresource(self.mapfileurl)
def test_mapping_file(self):
    # Pick the parser from the mapping file's URL: anything that does
    # not end in '.xml' is treated as a plain-text table; '.xml' files
    # go through the XML (UCM) parser.
    # (Kept as comments on purpose: a docstring on a test method would
    # change how unittest labels it in verbose output.)
    if not self.mapfileurl.endswith('.xml'):
        self._test_mapping_file_plain()
        return
    self._test_mapping_file_ucm()
def _test_mapping_file_plain(self):
unichrs = lambda s: u''.join(map(unichr, map(eval, s.split('+'))))
urt_wa = {}
@ -303,6 +309,14 @@ class TestBase_Mapping(unittest.TestCase):
self._testpoint(csetch, unich)
def _test_mapping_file_ucm(self):
    """Check the codec against an XML mapping document.

    Reads the whole document returned by self.open_mapping_file(),
    extracts every <a u="XXXX" b="HH HH ..."/> assignment and passes the
    resulting (encoded string, unicode character) pair to
    self._testpoint().  Only code points written as exactly four hex
    digits are matched by the pattern.
    """
    mapfile = self.open_mapping_file()
    try:
        ucmdata = mapfile.read()
    finally:
        # Always release the handle, even if the read fails.  try/finally
        # is used so the block does not depend on the "with" statement.
        mapfile.close()
    uc = re.findall('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>', ucmdata)
    for uni, coded in uc:
        unich = unichr(int(uni, 16))
        # The b="..." attribute holds space-separated hex byte values.
        codech = ''.join(chr(int(c, 16)) for c in coded.split())
        self._testpoint(codech, unich)
def test_mapping_supplemental(self):
for mapping in self.supmaps:
self._testpoint(*mapping)

View File

@ -26,6 +26,9 @@ Core and builtins
Library
-------
- The GB18030 codec can now encode the additional two-byte characters
that are missing from GBK.
- Bug #1704793: Raise KeyError if unicodedata.lookup cannot
represent the result in a single character.

View File

@ -197,6 +197,7 @@ ENCODER(gb18030)
REQUIRE_OUTBUF(2)
GBK_ENCODE(c, code)
else TRYMAP_ENC(gb18030ext, code, c);
else {
const struct _gb18030_to_unibmp_ranges *utrrange;