Fix a bug in the gb18030 codec where two-byte characters in the GB18030
extension were not mapped when encoding. (Bug reported by Bjorn Stabell.)
This commit is contained in:
parent
766d880a2f
commit
f3e93a0268
|
@ -19,6 +19,13 @@ class TestGBKMap(test_multibytecodec_support.TestBase_Mapping,
|
||||||
mapfileurl = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/' \
|
mapfileurl = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/' \
|
||||||
'MICSFT/WINDOWS/CP936.TXT'
|
'MICSFT/WINDOWS/CP936.TXT'
|
||||||
|
|
||||||
|
class TestGB18030Map(test_multibytecodec_support.TestBase_Mapping,
|
||||||
|
unittest.TestCase):
|
||||||
|
encoding = 'gb18030'
|
||||||
|
mapfileurl = 'http://source.icu-project.org/repos/icu/data/' \
|
||||||
|
'trunk/charset/data/xml/gb-18030-2000.xml'
|
||||||
|
|
||||||
|
|
||||||
def test_main():
|
def test_main():
|
||||||
test_support.run_unittest(__name__)
|
test_support.run_unittest(__name__)
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
import sys, codecs, os.path
|
import sys, codecs, os.path
|
||||||
import unittest
|
import unittest, re
|
||||||
from test import test_support
|
from test import test_support
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
|
|
||||||
|
@ -272,6 +272,12 @@ class TestBase_Mapping(unittest.TestCase):
|
||||||
return test_support.open_urlresource(self.mapfileurl)
|
return test_support.open_urlresource(self.mapfileurl)
|
||||||
|
|
||||||
def test_mapping_file(self):
|
def test_mapping_file(self):
|
||||||
|
if self.mapfileurl.endswith('.xml'):
|
||||||
|
self._test_mapping_file_ucm()
|
||||||
|
else:
|
||||||
|
self._test_mapping_file_plain()
|
||||||
|
|
||||||
|
def _test_mapping_file_plain(self):
|
||||||
unichrs = lambda s: u''.join(map(unichr, map(eval, s.split('+'))))
|
unichrs = lambda s: u''.join(map(unichr, map(eval, s.split('+'))))
|
||||||
urt_wa = {}
|
urt_wa = {}
|
||||||
|
|
||||||
|
@ -303,6 +309,14 @@ class TestBase_Mapping(unittest.TestCase):
|
||||||
|
|
||||||
self._testpoint(csetch, unich)
|
self._testpoint(csetch, unich)
|
||||||
|
|
||||||
|
def _test_mapping_file_ucm(self):
|
||||||
|
ucmdata = self.open_mapping_file().read()
|
||||||
|
uc = re.findall('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>', ucmdata)
|
||||||
|
for uni, coded in uc:
|
||||||
|
unich = unichr(int(uni, 16))
|
||||||
|
codech = ''.join(chr(int(c, 16)) for c in coded.split())
|
||||||
|
self._testpoint(codech, unich)
|
||||||
|
|
||||||
def test_mapping_supplemental(self):
|
def test_mapping_supplemental(self):
|
||||||
for mapping in self.supmaps:
|
for mapping in self.supmaps:
|
||||||
self._testpoint(*mapping)
|
self._testpoint(*mapping)
|
||||||
|
|
|
@ -240,6 +240,9 @@ Core and builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- The GB18030 codec can now encode additional two-byte characters that
|
||||||
|
are missing in GBK.
|
||||||
|
|
||||||
- Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
|
- Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
|
||||||
represent the result in a single character.
|
represent the result in a single character.
|
||||||
|
|
||||||
|
|
|
@ -197,6 +197,7 @@ ENCODER(gb18030)
|
||||||
REQUIRE_OUTBUF(2)
|
REQUIRE_OUTBUF(2)
|
||||||
|
|
||||||
GBK_ENCODE(c, code)
|
GBK_ENCODE(c, code)
|
||||||
|
else TRYMAP_ENC(gb18030ext, code, c);
|
||||||
else {
|
else {
|
||||||
const struct _gb18030_to_unibmp_ranges *utrrange;
|
const struct _gb18030_to_unibmp_ranges *utrrange;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue