From 199f1db1fa75e1f1b7cfe73259d6a811aa494a36 Mon Sep 17 00:00:00 2001 From: Hye-Shik Chang Date: Tue, 5 Sep 2006 12:07:09 +0000 Subject: [PATCH] Fix a few bugs on cjkcodecs found by Oren Tirosh: - gbk and gb18030 codec now handle U+30FB KATAKANA MIDDLE DOT correctly. - iso2022_jp_2 codec now encodes into G0 for KS X 1001, GB2312 codepoints to conform the standard. - iso2022_jp_3 and iso2022_jp_2004 codec can encode JIS X 2013:2 codepoints now. --- Lib/test/test_codecencodings_cn.py | 2 ++ Lib/test/test_multibytecodec.py | 6 +++++ Misc/NEWS | 6 +++++ Modules/cjkcodecs/_codecs_cn.c | 35 ++++++++++++++++------------- Modules/cjkcodecs/_codecs_iso2022.c | 15 ++++++++----- Modules/cjkcodecs/cjkcodecs.h | 17 ++++++++------ 6 files changed, 53 insertions(+), 28 deletions(-) diff --git a/Lib/test/test_codecencodings_cn.py b/Lib/test/test_codecencodings_cn.py index 1bf85834bc8..c558f1bb99b 100644 --- a/Lib/test/test_codecencodings_cn.py +++ b/Lib/test/test_codecencodings_cn.py @@ -32,6 +32,7 @@ class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase): ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"), ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"), ("\x83\x34\x83\x31", "strict", None), + (u"\u30fb", "strict", None), ) class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase): @@ -45,6 +46,7 @@ class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase): ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"), ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"), ("abc\x84\x39\x84\x39\xc1\xc4", "replace", u"abc\ufffd\u804a"), + (u"\u30fb", "strict", "\x819\xa79"), ) has_iso10646 = True diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py index 397ebeb97a6..800456e23b8 100644 --- a/Lib/test/test_multibytecodec.py +++ b/Lib/test/test_multibytecodec.py @@ -202,6 +202,12 @@ class Test_ISO2022(unittest.TestCase): uni = u':hu4:unit\xe9 de famille' self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni) + def test_iso2022_jp_g0(self): + self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2')) + for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'): + e = u'\u3406'.encode(encoding) + self.failIf(filter(lambda x: x >= '\x80', e)) + def test_main(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(Test_MultibyteCodec)) diff --git a/Misc/NEWS b/Misc/NEWS index dc29237929b..06cc1b4fb6a 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -45,6 +45,12 @@ Extension Modules - Bug #1550714: fix SystemError from itertools.tee on negative value for n. +- Fixed a few bugs on cjkcodecs: + - gbk and gb18030 codec now handle U+30FB KATAKANA MIDDLE DOT correctly. + - iso2022_jp_2 codec now encodes into G0 for KS X 1001, GB2312 + codepoints to conform the standard. + - iso2022_jp_3 and iso2022_jp_2004 codec can encode JIS X 2013:2 + codepoints now. Tests ----- diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c index fb51297ae06..c811a67eda0 100644 --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -15,14 +15,26 @@ #undef hz #endif -#define GBK_PREDECODE(dc1, dc2, assi) \ +/* GBK and GB2312 map differently in few codepoints that are listed below: + * + * gb2312 gbk + * A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT + * A1AA U+2015 HORIZONTAL BAR U+2014 EM DASH + * A844 undefined U+2015 HORIZONTAL BAR + */ + +#define GBK_DECODE(dc1, dc2, assi) \ if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \ else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \ - else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; -#define GBK_PREENCODE(code, assi) \ + else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \ + else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \ + else TRYMAP_DEC(gbkext, assi, dc1, dc2); + +#define GBK_ENCODE(code, assi) \ if ((code) == 0x2014) (assi) = 0xa1aa; \ else if ((code) == 0x2015) (assi) = 0xa844; \ - else if ((code) == 0x00b7) (assi) = 0xa1a4; + else if ((code) == 0x00b7) (assi) = 0xa1a4; \ + else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code)); /* * GB2312 codec @@ -99,8 +111,7 @@ ENCODER(gbk) REQUIRE_OUTBUF(2) - GBK_PREENCODE(c, code) - else TRYMAP_ENC(gbcommon, code, c); + GBK_ENCODE(c, code) else return 1; OUT1((code >> 8) | 0x80) @@ -129,9 +140,7 @@ DECODER(gbk) REQUIRE_INBUF(2) - GBK_PREDECODE(c, IN2, **outbuf) - else TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80); - else TRYMAP_DEC(gbkext, **outbuf, c, IN2); + GBK_DECODE(c, IN2, **outbuf) else return 2; NEXT(2, 1) @@ -187,9 +196,7 @@ ENCODER(gb18030) REQUIRE_OUTBUF(2) - GBK_PREENCODE(c, code) - else TRYMAP_ENC(gbcommon, code, c); - else TRYMAP_ENC(gb18030ext, code, c); + GBK_ENCODE(c, code) else { const struct _gb18030_to_unibmp_ranges *utrrange; @@ -287,9 +294,7 @@ DECODER(gb18030) return 4; } - GBK_PREDECODE(c, c2, **outbuf) - else TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, c2 ^ 0x80); - else TRYMAP_DEC(gbkext, **outbuf, c, c2); + GBK_DECODE(c, c2, **outbuf) else TRYMAP_DEC(gb18030ext, **outbuf, c, c2); else return 2; diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c index 8a2ab7eb2f4..2a11e9a2747 100644 --- a/Modules/cjkcodecs/_codecs_iso2022.c +++ b/Modules/cjkcodecs/_codecs_iso2022.c @@ -854,7 +854,7 @@ jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length) if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) return coded; else if (coded & 0x8000) - return coded; + return coded & 0x7fff; else return MAP_UNMAPPABLE; } @@ -901,7 +901,7 @@ jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length) if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) return coded; else if (coded & 0x8000) - return coded; + return coded & 0x7fff; else return MAP_UNMAPPABLE; } @@ -992,7 +992,10 @@ dummy_encoder(const ucs4_t *data, Py_ssize_t *length) /*-*- registry tables -*-*/ -#define REGISTRY_KSX1001 { CHARSET_KSX1001, 1, 2, \ +#define REGISTRY_KSX1001_G0 { CHARSET_KSX1001, 0, 2, \ + ksx1001_init, \ + ksx1001_decoder, ksx1001_encoder } +#define REGISTRY_KSX1001_G1 { CHARSET_KSX1001, 1, 2, \ ksx1001_init, \ ksx1001_decoder, ksx1001_encoder } #define REGISTRY_JISX0201_R { CHARSET_JISX0201_R, 0, 1, \ @@ -1034,7 +1037,7 @@ dummy_encoder(const ucs4_t *data, Py_ssize_t *length) jisx0213_init, \ jisx0213_2004_2_decoder, \ jisx0213_2004_2_encoder } -#define REGISTRY_GB2312 { CHARSET_GB2312, 1, 2, \ +#define REGISTRY_GB2312 { CHARSET_GB2312, 0, 2, \ gb2312_init, \ gb2312_decoder, gb2312_encoder } #define REGISTRY_CNS11643_1 { CHARSET_CNS11643_1, 1, 2, \ @@ -1054,7 +1057,7 @@ dummy_encoder(const ucs4_t *data, Py_ssize_t *length) }; static const struct iso2022_designation iso2022_kr_designations[] = { - REGISTRY_KSX1001, REGISTRY_SENTINEL + REGISTRY_KSX1001_G1, REGISTRY_SENTINEL }; CONFIGDEF(kr, 0) @@ -1071,7 +1074,7 @@ static const struct iso2022_designation iso2022_jp_1_designations[] = { CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT) static const struct iso2022_designation iso2022_jp_2_designations[] = { - REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001, + REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0, REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O, REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL }; diff --git a/Modules/cjkcodecs/cjkcodecs.h b/Modules/cjkcodecs/cjkcodecs.h index b266c8f9b47..71c54f093fc 100644 --- a/Modules/cjkcodecs/cjkcodecs.h +++ b/Modules/cjkcodecs/cjkcodecs.h @@ -159,29 +159,32 @@ static const struct dbcs_map *mapping_list; #endif #define _TRYMAP_ENC(m, assi, val) \ - if ((m)->map != NULL && (val) >= (m)->bottom && \ + ((m)->map != NULL && (val) >= (m)->bottom && \ (val)<= (m)->top && ((assi) = (m)->map[(val) - \ (m)->bottom]) != NOCHAR) -#define TRYMAP_ENC(charset, assi, uni) \ +#define TRYMAP_ENC_COND(charset, assi, uni) \ _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff) +#define TRYMAP_ENC(charset, assi, uni) \ + if TRYMAP_ENC_COND(charset, assi, uni) + #define _TRYMAP_DEC(m, assi, val) \ - if ((m)->map != NULL && (val) >= (m)->bottom && \ + ((m)->map != NULL && (val) >= (m)->bottom && \ (val)<= (m)->top && ((assi) = (m)->map[(val) - \ (m)->bottom]) != UNIINV) #define TRYMAP_DEC(charset, assi, c1, c2) \ - _TRYMAP_DEC(&charset##_decmap[c1], assi, c2) + if _TRYMAP_DEC(&charset##_decmap[c1], assi, c2) #define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \ - if ((m)->map != NULL && (val) >= (m)->bottom && \ + ((m)->map != NULL && (val) >= (m)->bottom && \ (val)<= (m)->top && \ ((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \ (((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \ (((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1)) #define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni) \ - _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \ + if _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \ assplane, asshi, asslo, (uni) & 0xff) #define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2) \ - _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2) + if _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2) #if Py_UNICODE_SIZE == 2 #define DECODE_SURROGATE(c) \