Fix a few bugs in cjkcodecs found by Oren Tirosh:

- gbk and gb18030 codecs now handle U+30FB KATAKANA MIDDLE DOT correctly.
- iso2022_jp_2 codec now encodes into G0 for KS X 1001 and GB2312
  codepoints to conform to the standard.
- iso2022_jp_3 and iso2022_jp_2004 codecs can now encode JIS X 0213:2
  codepoints.
This commit is contained in:
Hye-Shik Chang 2006-09-05 12:07:09 +00:00
parent d042132268
commit 199f1db1fa
6 changed files with 53 additions and 28 deletions

View File

@ -32,6 +32,7 @@ class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase):
("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"), ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"), ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"),
("\x83\x34\x83\x31", "strict", None), ("\x83\x34\x83\x31", "strict", None),
(u"\u30fb", "strict", None),
) )
class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase): class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
@ -45,6 +46,7 @@ class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"), ("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\u804a\ufffd"),
("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"), ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"),
("abc\x84\x39\x84\x39\xc1\xc4", "replace", u"abc\ufffd\u804a"), ("abc\x84\x39\x84\x39\xc1\xc4", "replace", u"abc\ufffd\u804a"),
(u"\u30fb", "strict", "\x819\xa79"),
) )
has_iso10646 = True has_iso10646 = True

View File

@ -202,6 +202,12 @@ class Test_ISO2022(unittest.TestCase):
uni = u':hu4:unit\xe9 de famille' uni = u':hu4:unit\xe9 de famille'
self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni) self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
def test_iso2022_jp_g0(self):
# The iso-2022-jp-2 encoding of SOFT HYPHEN must not contain the SO
# control byte (0x0E).
# NOTE(review): presumably this confirms the codec stays in G0 instead of
# shifting to G1 -- confirm against the codec's designation table.
self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
e = u'\u3406'.encode(encoding)
# No character of the encoded result may be >= '\x80', i.e. the output
# has to be 7-bit clean.
self.failIf(filter(lambda x: x >= '\x80', e))
def test_main(): def test_main():
suite = unittest.TestSuite() suite = unittest.TestSuite()
suite.addTest(unittest.makeSuite(Test_MultibyteCodec)) suite.addTest(unittest.makeSuite(Test_MultibyteCodec))

View File

@ -45,6 +45,12 @@ Extension Modules
- Bug #1550714: fix SystemError from itertools.tee on negative value for n. - Bug #1550714: fix SystemError from itertools.tee on negative value for n.
- Fixed a few bugs in cjkcodecs:
- gbk and gb18030 codecs now handle U+30FB KATAKANA MIDDLE DOT correctly.
- iso2022_jp_2 codec now encodes into G0 for KS X 1001 and GB2312
codepoints to conform to the standard.
- iso2022_jp_3 and iso2022_jp_2004 codecs can now encode JIS X 0213:2
codepoints.
Tests Tests
----- -----

View File

@ -15,14 +15,26 @@
#undef hz #undef hz
#endif #endif
#define GBK_PREDECODE(dc1, dc2, assi) \ /* GBK and GB2312 map differently in few codepoints that are listed below:
*
* gb2312 gbk
* A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT
* A1AA U+2015 HORIZONTAL BAR U+2014 EM DASH
* A844 undefined U+2015 HORIZONTAL BAR
*/
#define GBK_DECODE(dc1, dc2, assi) \
if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \ if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \ else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
#define GBK_PREENCODE(code, assi) \ else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \
else TRYMAP_DEC(gbkext, assi, dc1, dc2);
#define GBK_ENCODE(code, assi) \
if ((code) == 0x2014) (assi) = 0xa1aa; \ if ((code) == 0x2014) (assi) = 0xa1aa; \
else if ((code) == 0x2015) (assi) = 0xa844; \ else if ((code) == 0x2015) (assi) = 0xa844; \
else if ((code) == 0x00b7) (assi) = 0xa1a4; else if ((code) == 0x00b7) (assi) = 0xa1a4; \
else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
/* /*
* GB2312 codec * GB2312 codec
@ -99,8 +111,7 @@ ENCODER(gbk)
REQUIRE_OUTBUF(2) REQUIRE_OUTBUF(2)
GBK_PREENCODE(c, code) GBK_ENCODE(c, code)
else TRYMAP_ENC(gbcommon, code, c);
else return 1; else return 1;
OUT1((code >> 8) | 0x80) OUT1((code >> 8) | 0x80)
@ -129,9 +140,7 @@ DECODER(gbk)
REQUIRE_INBUF(2) REQUIRE_INBUF(2)
GBK_PREDECODE(c, IN2, **outbuf) GBK_DECODE(c, IN2, **outbuf)
else TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80);
else TRYMAP_DEC(gbkext, **outbuf, c, IN2);
else return 2; else return 2;
NEXT(2, 1) NEXT(2, 1)
@ -187,9 +196,7 @@ ENCODER(gb18030)
REQUIRE_OUTBUF(2) REQUIRE_OUTBUF(2)
GBK_PREENCODE(c, code) GBK_ENCODE(c, code)
else TRYMAP_ENC(gbcommon, code, c);
else TRYMAP_ENC(gb18030ext, code, c);
else { else {
const struct _gb18030_to_unibmp_ranges *utrrange; const struct _gb18030_to_unibmp_ranges *utrrange;
@ -287,9 +294,7 @@ DECODER(gb18030)
return 4; return 4;
} }
GBK_PREDECODE(c, c2, **outbuf) GBK_DECODE(c, c2, **outbuf)
else TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, c2 ^ 0x80);
else TRYMAP_DEC(gbkext, **outbuf, c, c2);
else TRYMAP_DEC(gb18030ext, **outbuf, c, c2); else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
else return 2; else return 2;

View File

@ -854,7 +854,7 @@ jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
return coded; return coded;
else if (coded & 0x8000) else if (coded & 0x8000)
return coded; return coded & 0x7fff;
else else
return MAP_UNMAPPABLE; return MAP_UNMAPPABLE;
} }
@ -901,7 +901,7 @@ jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
return coded; return coded;
else if (coded & 0x8000) else if (coded & 0x8000)
return coded; return coded & 0x7fff;
else else
return MAP_UNMAPPABLE; return MAP_UNMAPPABLE;
} }
@ -992,7 +992,10 @@ dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
/*-*- registry tables -*-*/ /*-*- registry tables -*-*/
#define REGISTRY_KSX1001 { CHARSET_KSX1001, 1, 2, \ #define REGISTRY_KSX1001_G0 { CHARSET_KSX1001, 0, 2, \
ksx1001_init, \
ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1 { CHARSET_KSX1001, 1, 2, \
ksx1001_init, \ ksx1001_init, \
ksx1001_decoder, ksx1001_encoder } ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_JISX0201_R { CHARSET_JISX0201_R, 0, 1, \ #define REGISTRY_JISX0201_R { CHARSET_JISX0201_R, 0, 1, \
@ -1034,7 +1037,7 @@ dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0213_init, \ jisx0213_init, \
jisx0213_2004_2_decoder, \ jisx0213_2004_2_decoder, \
jisx0213_2004_2_encoder } jisx0213_2004_2_encoder }
#define REGISTRY_GB2312 { CHARSET_GB2312, 1, 2, \ #define REGISTRY_GB2312 { CHARSET_GB2312, 0, 2, \
gb2312_init, \ gb2312_init, \
gb2312_decoder, gb2312_encoder } gb2312_decoder, gb2312_encoder }
#define REGISTRY_CNS11643_1 { CHARSET_CNS11643_1, 1, 2, \ #define REGISTRY_CNS11643_1 { CHARSET_CNS11643_1, 1, 2, \
@ -1054,7 +1057,7 @@ dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
}; };
static const struct iso2022_designation iso2022_kr_designations[] = { static const struct iso2022_designation iso2022_kr_designations[] = {
REGISTRY_KSX1001, REGISTRY_SENTINEL REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
}; };
CONFIGDEF(kr, 0) CONFIGDEF(kr, 0)
@ -1071,7 +1074,7 @@ static const struct iso2022_designation iso2022_jp_1_designations[] = {
CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT) CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
static const struct iso2022_designation iso2022_jp_2_designations[] = { static const struct iso2022_designation iso2022_jp_2_designations[] = {
REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001, REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O, REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
}; };

View File

@ -159,29 +159,32 @@ static const struct dbcs_map *mapping_list;
#endif #endif
#define _TRYMAP_ENC(m, assi, val) \ #define _TRYMAP_ENC(m, assi, val) \
if ((m)->map != NULL && (val) >= (m)->bottom && \ ((m)->map != NULL && (val) >= (m)->bottom && \
(val)<= (m)->top && ((assi) = (m)->map[(val) - \ (val)<= (m)->top && ((assi) = (m)->map[(val) - \
(m)->bottom]) != NOCHAR) (m)->bottom]) != NOCHAR)
#define TRYMAP_ENC(charset, assi, uni) \ #define TRYMAP_ENC_COND(charset, assi, uni) \
_TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff) _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)
#define TRYMAP_ENC(charset, assi, uni) \
if TRYMAP_ENC_COND(charset, assi, uni)
#define _TRYMAP_DEC(m, assi, val) \ #define _TRYMAP_DEC(m, assi, val) \
if ((m)->map != NULL && (val) >= (m)->bottom && \ ((m)->map != NULL && (val) >= (m)->bottom && \
(val)<= (m)->top && ((assi) = (m)->map[(val) - \ (val)<= (m)->top && ((assi) = (m)->map[(val) - \
(m)->bottom]) != UNIINV) (m)->bottom]) != UNIINV)
#define TRYMAP_DEC(charset, assi, c1, c2) \ #define TRYMAP_DEC(charset, assi, c1, c2) \
_TRYMAP_DEC(&charset##_decmap[c1], assi, c2) if _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
#define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \ #define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \
if ((m)->map != NULL && (val) >= (m)->bottom && \ ((m)->map != NULL && (val) >= (m)->bottom && \
(val)<= (m)->top && \ (val)<= (m)->top && \
((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \ ((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \
(((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \ (((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \
(((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1)) (((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1))
#define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni) \ #define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni) \
_TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \ if _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \
assplane, asshi, asslo, (uni) & 0xff) assplane, asshi, asslo, (uni) & 0xff)
#define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2) \ #define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2) \
_TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2) if _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2)
#if Py_UNICODE_SIZE == 2 #if Py_UNICODE_SIZE == 2
#define DECODE_SURROGATE(c) \ #define DECODE_SURROGATE(c) \