mirror of https://github.com/python/cpython
Add cheot-ga-keut composed make-up sequence support in EUC-KR codec.
This commit is contained in:
parent
c553f42907
commit
2390104d81
|
@ -376,13 +376,20 @@ teststring = {
|
||||||
"\xcc\xc7\xce\x2c\x20\xb1\xd7\xb8\xae\xb0\xed\x20\xc0\xce\xc5\xcd"
|
"\xcc\xc7\xce\x2c\x20\xb1\xd7\xb8\xae\xb0\xed\x20\xc0\xce\xc5\xcd"
|
||||||
"\xc7\xc1\xb8\xae\xc6\xc3\x0a\xc8\xaf\xb0\xe6\xc0\xba\x20\xc6\xc4"
|
"\xc7\xc1\xb8\xae\xc6\xc3\x0a\xc8\xaf\xb0\xe6\xc0\xba\x20\xc6\xc4"
|
||||||
"\xc0\xcc\xbd\xe3\xc0\xbb\x20\xbd\xba\xc5\xa9\xb8\xb3\xc6\xc3\xb0"
|
"\xc0\xcc\xbd\xe3\xc0\xbb\x20\xbd\xba\xc5\xa9\xb8\xb3\xc6\xc3\xb0"
|
||||||
"\xfa\x20\xbf\xa9\xb7\xc1\x20\xba\xd0\xbe\xdf\xbf\xa1\xbc\xad\xbf"
|
"\xfa\x20\xbf\xa9\xb7\xaf\x20\xba\xd0\xbe\xdf\xbf\xa1\xbc\xad\xbf"
|
||||||
"\xcd\x20\xb4\xeb\xba\xce\xba\xd0\xc0\xc7\x20\xc7\xc3\xb7\xa7\xc6"
|
"\xcd\x20\xb4\xeb\xba\xce\xba\xd0\xc0\xc7\x20\xc7\xc3\xb7\xa7\xc6"
|
||||||
"\xfb\xbf\xa1\xbc\xad\xc0\xc7\x20\xba\xfc\xb8\xa5\x0a\xbe\xd6\xc7"
|
"\xfb\xbf\xa1\xbc\xad\xc0\xc7\x20\xba\xfc\xb8\xa5\x0a\xbe\xd6\xc7"
|
||||||
"\xc3\xb8\xae\xc4\xc9\xc0\xcc\xbc\xc7\x20\xb0\xb3\xb9\xdf\xc0\xbb"
|
"\xc3\xb8\xae\xc4\xc9\xc0\xcc\xbc\xc7\x20\xb0\xb3\xb9\xdf\xc0\xbb"
|
||||||
"\x20\xc7\xd2\x20\xbc\xf6\x20\xc0\xd6\xb4\xc2\x20\xc0\xcc\xbb\xf3"
|
"\x20\xc7\xd2\x20\xbc\xf6\x20\xc0\xd6\xb4\xc2\x20\xc0\xcc\xbb\xf3"
|
||||||
"\xc0\xfb\xc0\xce\x20\xbe\xf0\xbe\xee\xb7\xce\x20\xb8\xb8\xb5\xe9"
|
"\xc0\xfb\xc0\xce\x20\xbe\xf0\xbe\xee\xb7\xce\x20\xb8\xb8\xb5\xe9"
|
||||||
"\xbe\xee\xc1\xdd\xb4\xcf\xb4\xd9\x2e\x0a\x0a",
|
"\xbe\xee\xc1\xdd\xb4\xcf\xb4\xd9\x2e\x0a\x0a\xa1\xd9\xc3\xb9\xb0"
|
||||||
|
"\xa1\xb3\xa1\x3a\x20\xb3\xaf\xbe\xc6\xb6\xf3\x20\xa4\xd4\xa4\xb6"
|
||||||
|
"\xa4\xd0\xa4\xd4\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4\xbe\xb1\x7e\x20"
|
||||||
|
"\xa4\xd4\xa4\xa4\xa4\xd2\xa4\xb7\xc5\xad\x21\x20\xa4\xd4\xa4\xa8"
|
||||||
|
"\xa4\xd1\xa4\xb7\xb1\xdd\xbe\xf8\xc0\xcc\x20\xc0\xfc\xa4\xd4\xa4"
|
||||||
|
"\xbe\xa4\xc8\xa4\xb2\xb4\xcf\xb4\xd9\x2e\x20\xa4\xd4\xa4\xb2\xa4"
|
||||||
|
"\xce\xa4\xaa\x2e\x20\xb1\xd7\xb7\xb1\xb0\xc5\x20\xa4\xd4\xa4\xb7"
|
||||||
|
"\xa4\xd1\xa4\xb4\xb4\xd9\x2e\x0a",
|
||||||
"\xe2\x97\x8e\x20\xed\x8c\x8c\xec\x9d\xb4\xec\x8d\xac\x28\x50\x79"
|
"\xe2\x97\x8e\x20\xed\x8c\x8c\xec\x9d\xb4\xec\x8d\xac\x28\x50\x79"
|
||||||
"\x74\x68\x6f\x6e\x29\xec\x9d\x80\x20\xeb\xb0\xb0\xec\x9a\xb0\xea"
|
"\x74\x68\x6f\x6e\x29\xec\x9d\x80\x20\xeb\xb0\xb0\xec\x9a\xb0\xea"
|
||||||
"\xb8\xb0\x20\xec\x89\xbd\xea\xb3\xa0\x2c\x20\xea\xb0\x95\xeb\xa0"
|
"\xb8\xb0\x20\xec\x89\xbd\xea\xb3\xa0\x2c\x20\xea\xb0\x95\xeb\xa0"
|
||||||
|
@ -404,7 +411,7 @@ teststring = {
|
||||||
"\xec\x9d\xb8\xed\x84\xb0\xed\x94\x84\xeb\xa6\xac\xed\x8c\x85\x0a"
|
"\xec\x9d\xb8\xed\x84\xb0\xed\x94\x84\xeb\xa6\xac\xed\x8c\x85\x0a"
|
||||||
"\xed\x99\x98\xea\xb2\xbd\xec\x9d\x80\x20\xed\x8c\x8c\xec\x9d\xb4"
|
"\xed\x99\x98\xea\xb2\xbd\xec\x9d\x80\x20\xed\x8c\x8c\xec\x9d\xb4"
|
||||||
"\xec\x8d\xac\xec\x9d\x84\x20\xec\x8a\xa4\xed\x81\xac\xeb\xa6\xbd"
|
"\xec\x8d\xac\xec\x9d\x84\x20\xec\x8a\xa4\xed\x81\xac\xeb\xa6\xbd"
|
||||||
"\xed\x8c\x85\xea\xb3\xbc\x20\xec\x97\xac\xeb\xa0\xa4\x20\xeb\xb6"
|
"\xed\x8c\x85\xea\xb3\xbc\x20\xec\x97\xac\xeb\x9f\xac\x20\xeb\xb6"
|
||||||
"\x84\xec\x95\xbc\xec\x97\x90\xec\x84\x9c\xec\x99\x80\x20\xeb\x8c"
|
"\x84\xec\x95\xbc\xec\x97\x90\xec\x84\x9c\xec\x99\x80\x20\xeb\x8c"
|
||||||
"\x80\xeb\xb6\x80\xeb\xb6\x84\xec\x9d\x98\x20\xed\x94\x8c\xeb\x9e"
|
"\x80\xeb\xb6\x80\xeb\xb6\x84\xec\x9d\x98\x20\xed\x94\x8c\xeb\x9e"
|
||||||
"\xab\xed\x8f\xbc\xec\x97\x90\xec\x84\x9c\xec\x9d\x98\x20\xeb\xb9"
|
"\xab\xed\x8f\xbc\xec\x97\x90\xec\x84\x9c\xec\x9d\x98\x20\xeb\xb9"
|
||||||
|
@ -413,7 +420,13 @@ teststring = {
|
||||||
"\x84\x20\xed\x95\xa0\x20\xec\x88\x98\x20\xec\x9e\x88\xeb\x8a\x94"
|
"\x84\x20\xed\x95\xa0\x20\xec\x88\x98\x20\xec\x9e\x88\xeb\x8a\x94"
|
||||||
"\x20\xec\x9d\xb4\xec\x83\x81\xec\xa0\x81\xec\x9d\xb8\x20\xec\x96"
|
"\x20\xec\x9d\xb4\xec\x83\x81\xec\xa0\x81\xec\x9d\xb8\x20\xec\x96"
|
||||||
"\xb8\xec\x96\xb4\xeb\xa1\x9c\x20\xeb\xa7\x8c\xeb\x93\xa4\xec\x96"
|
"\xb8\xec\x96\xb4\xeb\xa1\x9c\x20\xeb\xa7\x8c\xeb\x93\xa4\xec\x96"
|
||||||
"\xb4\xec\xa4\x8d\xeb\x8b\x88\xeb\x8b\xa4\x2e\x0a\x0a"),
|
"\xb4\xec\xa4\x8d\xeb\x8b\x88\xeb\x8b\xa4\x2e\x0a\x0a\xe2\x98\x86"
|
||||||
|
"\xec\xb2\xab\xea\xb0\x80\xeb\x81\x9d\x3a\x20\xeb\x82\xa0\xec\x95"
|
||||||
|
"\x84\xeb\x9d\xbc\x20\xec\x93\x94\xec\x93\x94\xec\x93\xa9\x7e\x20"
|
||||||
|
"\xeb\x8b\x81\xed\x81\xbc\x21\x20\xeb\x9c\xbd\xea\xb8\x88\xec\x97"
|
||||||
|
"\x86\xec\x9d\xb4\x20\xec\xa0\x84\xed\x99\xa5\xeb\x8b\x88\xeb\x8b"
|
||||||
|
"\xa4\x2e\x20\xeb\xb7\x81\x2e\x20\xea\xb7\xb8\xeb\x9f\xb0\xea\xb1"
|
||||||
|
"\xb0\x20\xec\x9d\x8e\xeb\x8b\xa4\x2e\x0a"),
|
||||||
'gb18030': (
|
'gb18030': (
|
||||||
"\x50\x79\x74\x68\x6f\x6e\xa3\xa8\xc5\xc9\xc9\xad\xa3\xa9\xd3\xef"
|
"\x50\x79\x74\x68\x6f\x6e\xa3\xa8\xc5\xc9\xc9\xad\xa3\xa9\xd3\xef"
|
||||||
"\xd1\xd4\xca\xc7\xd2\xbb\xd6\xd6\xb9\xa6\xc4\xdc\xc7\xbf\xb4\xf3"
|
"\xd1\xd4\xca\xc7\xd2\xbb\xd6\xd6\xb9\xa6\xc4\xdc\xc7\xbf\xb4\xf3"
|
||||||
|
|
|
@ -30,6 +30,24 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
|
||||||
("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"),
|
("abc\x80\x80\xc1\xc4", "replace", u"abc\ufffd\uc894"),
|
||||||
("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"),
|
("abc\x80\x80\xc1\xc4\xc8", "replace", u"abc\ufffd\uc894\ufffd"),
|
||||||
("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"),
|
("abc\x80\x80\xc1\xc4", "ignore", u"abc\uc894"),
|
||||||
|
|
||||||
|
# composed make-up sequence errors
|
||||||
|
("\xa4\xd4", "strict", None),
|
||||||
|
("\xa4\xd4\xa4", "strict", None),
|
||||||
|
("\xa4\xd4\xa4\xb6", "strict", None),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa4", "strict", None),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa4\xd0", "strict", None),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa4\xd0\xa4", "strict", None),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "strict", u"\uc4d4"),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4x", "strict", u"\uc4d4x"),
|
||||||
|
("a\xa4\xd4\xa4\xb6\xa4", "replace", u"a\ufffd"),
|
||||||
|
("\xa4\xd4\xa3\xb6\xa4\xd0\xa4\xd4", "strict", None),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa3\xd0\xa4\xd4", "strict", None),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa4\xd0\xa3\xd4", "strict", None),
|
||||||
|
("\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", u"\ufffd"),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", u"\ufffd"),
|
||||||
|
("\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", u"\ufffd"),
|
||||||
|
("\xc1\xc4", "strict", u"\uc894"),
|
||||||
)
|
)
|
||||||
|
|
||||||
class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase):
|
class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase):
|
||||||
|
|
|
@ -20,6 +20,10 @@ class TestEUCKRMap(test_multibytecodec_support.TestBase_Mapping,
|
||||||
encoding = 'euc_kr'
|
encoding = 'euc_kr'
|
||||||
mapfileurl = 'http://people.freebsd.org/~perky/i18n/EUC-KR.TXT'
|
mapfileurl = 'http://people.freebsd.org/~perky/i18n/EUC-KR.TXT'
|
||||||
|
|
||||||
|
# A4D4 HANGUL FILLER indicates the beginning of an 8-byte make-up sequence.
|
||||||
|
pass_enctest = [('\xa4\xd4', u'\u3164')]
|
||||||
|
pass_dectest = [('\xa4\xd4', u'\u3164')]
|
||||||
|
|
||||||
|
|
||||||
class TestJOHABMap(test_multibytecodec_support.TestBase_Mapping,
|
class TestJOHABMap(test_multibytecodec_support.TestBase_Mapping,
|
||||||
unittest.TestCase):
|
unittest.TestCase):
|
||||||
|
|
|
@ -240,6 +240,9 @@ Core and builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- EUC-KR codec now handles the cheot-ga-keut composed make-up hangul
|
||||||
|
syllables.
|
||||||
|
|
||||||
- GB18030 codec now can encode additional two-byte characters that
|
- GB18030 codec now can encode additional two-byte characters that
|
||||||
are missing in GBK.
|
are missing in GBK.
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,26 @@
|
||||||
* EUC-KR codec
|
* EUC-KR codec
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#define EUCKR_JAMO_FIRSTBYTE 0xA4
|
||||||
|
#define EUCKR_JAMO_FILLER 0xD4
|
||||||
|
|
||||||
|
/* Encoder table for the initial consonant of a make-up sequence.
 * ENCODER(euc_kr) indexes it with c / 588, where c is the syllable's
 * offset from U+AC00, and emits the entry as the byte that follows
 * EUCKR_JAMO_FIRSTBYTE. */
static const unsigned char u2cgk_choseong[19] = {
|
||||||
|
0xa1, 0xa2, 0xa4, 0xa7, 0xa8, 0xa9, 0xb1, 0xb2,
|
||||||
|
0xb3, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
|
||||||
|
0xbc, 0xbd, 0xbe
|
||||||
|
};
|
||||||
|
/* Encoder table for the vowel of a make-up sequence.
 * ENCODER(euc_kr) indexes it with (c / 28) % 21, where c is the
 * syllable's offset from U+AC00, and emits the entry as the byte that
 * follows EUCKR_JAMO_FIRSTBYTE.  The entries are the consecutive bytes
 * 0xbf..0xd3, which is why the decoder recovers the index with a plain
 * subtraction of 0xbf. */
static const unsigned char u2cgk_jungseong[21] = {
|
||||||
|
0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6,
|
||||||
|
0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce,
|
||||||
|
0xcf, 0xd0, 0xd1, 0xd2, 0xd3
|
||||||
|
};
|
||||||
|
/* Encoder table for the final consonant of a make-up sequence.
 * ENCODER(euc_kr) indexes it with c % 28, where c is the syllable's
 * offset from U+AC00, and emits the entry as the byte that follows
 * EUCKR_JAMO_FIRSTBYTE.  Entry 0 is 0xd4 (EUCKR_JAMO_FILLER); the
 * decoder maps that byte back to final index 0. */
static const unsigned char u2cgk_jongseong[28] = {
|
||||||
|
0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
|
||||||
|
0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
|
||||||
|
0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xba,
|
||||||
|
0xbb, 0xbc, 0xbd, 0xbe
|
||||||
|
};
|
||||||
|
|
||||||
ENCODER(euc_kr)
|
ENCODER(euc_kr)
|
||||||
{
|
{
|
||||||
while (inleft > 0) {
|
while (inleft > 0) {
|
||||||
|
@ -28,17 +48,57 @@ ENCODER(euc_kr)
|
||||||
TRYMAP_ENC(cp949, code, c);
|
TRYMAP_ENC(cp949, code, c);
|
||||||
else return 1;
|
else return 1;
|
||||||
|
|
||||||
if (code & 0x8000) /* MSB set: CP949 */
|
if ((code & 0x8000) == 0) {
|
||||||
return 1;
|
/* KS X 1001 coded character */
|
||||||
|
OUT1((code >> 8) | 0x80)
|
||||||
|
OUT2((code & 0xFF) | 0x80)
|
||||||
|
NEXT(1, 2)
|
||||||
|
}
|
||||||
|
else { /* Mapping is found in CP949 extension,
|
||||||
|
* but we encode it in KS X 1001:1998 Annex 3,
|
||||||
|
* make-up sequence for EUC-KR. */
|
||||||
|
|
||||||
OUT1((code >> 8) | 0x80)
|
REQUIRE_OUTBUF(8)
|
||||||
OUT2((code & 0xFF) | 0x80)
|
|
||||||
NEXT(1, 2)
|
/* syllable composition precedence */
|
||||||
|
OUT1(EUCKR_JAMO_FIRSTBYTE)
|
||||||
|
OUT2(EUCKR_JAMO_FILLER)
|
||||||
|
|
||||||
|
/* All code points in the CP949 extension are in the Unicode
|
||||||
|
* Hangul Syllable area. */
|
||||||
|
assert(0xac00 <= c && c <= 0xd7a3);
|
||||||
|
c -= 0xac00;
|
||||||
|
|
||||||
|
OUT3(EUCKR_JAMO_FIRSTBYTE)
|
||||||
|
OUT4(u2cgk_choseong[c / 588])
|
||||||
|
NEXT_OUT(4)
|
||||||
|
|
||||||
|
OUT1(EUCKR_JAMO_FIRSTBYTE)
|
||||||
|
OUT2(u2cgk_jungseong[(c / 28) % 21])
|
||||||
|
OUT3(EUCKR_JAMO_FIRSTBYTE)
|
||||||
|
OUT4(u2cgk_jongseong[c % 28])
|
||||||
|
NEXT(1, 4)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define NONE 127
|
||||||
|
|
||||||
|
/* Decoder table for the initial consonant of a make-up sequence.
 * DECODER(euc_kr) indexes it with (byte - 0xa1) for bytes in
 * [0xa1, 0xbe]; the result is multiplied by 588 when the syllable is
 * rebuilt.  NONE marks bytes that are not a valid initial consonant and
 * makes the decoder reject the sequence. */
static const unsigned char cgk2u_choseong[] = { /* [A1, BE] */
|
||||||
|
0, 1, NONE, 2, NONE, NONE, 3, 4,
|
||||||
|
5, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
|
||||||
|
6, 7, 8, NONE, 9, 10, 11, 12,
|
||||||
|
13, 14, 15, 16, 17, 18
|
||||||
|
};
|
||||||
|
/* Decoder table for the final consonant of a make-up sequence.
 * DECODER(euc_kr) indexes it with (byte - 0xa1) for bytes in
 * [0xa1, 0xbe]; the result is added directly when the syllable is
 * rebuilt.  The filler byte (no final consonant) is handled separately
 * by the decoder and yields 0, so it has no entry here.  NONE marks
 * bytes that are not a valid final consonant and makes the decoder
 * reject the sequence. */
static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */
|
||||||
|
1, 2, 3, 4, 5, 6, 7, NONE,
|
||||||
|
8, 9, 10, 11, 12, 13, 14, 15,
|
||||||
|
16, 17, NONE, 18, 19, 20, 21, 22,
|
||||||
|
NONE, 23, 24, 25, 26, 27
|
||||||
|
};
|
||||||
|
|
||||||
DECODER(euc_kr)
|
DECODER(euc_kr)
|
||||||
{
|
{
|
||||||
while (inleft > 0) {
|
while (inleft > 0) {
|
||||||
|
@ -54,13 +114,50 @@ DECODER(euc_kr)
|
||||||
|
|
||||||
REQUIRE_INBUF(2)
|
REQUIRE_INBUF(2)
|
||||||
|
|
||||||
TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
|
if (c == EUCKR_JAMO_FIRSTBYTE &&
|
||||||
|
IN2 == EUCKR_JAMO_FILLER) {
|
||||||
|
/* KS X 1001:1998 Annex 3 make-up sequence */
|
||||||
|
DBCHAR cho, jung, jong;
|
||||||
|
|
||||||
|
REQUIRE_INBUF(8)
|
||||||
|
if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
|
||||||
|
(*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
|
||||||
|
(*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
|
||||||
|
return 8;
|
||||||
|
|
||||||
|
c = (*inbuf)[3];
|
||||||
|
if (0xa1 <= c && c <= 0xbe)
|
||||||
|
cho = cgk2u_choseong[c - 0xa1];
|
||||||
|
else
|
||||||
|
cho = NONE;
|
||||||
|
|
||||||
|
c = (*inbuf)[5];
|
||||||
|
jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
|
||||||
|
|
||||||
|
c = (*inbuf)[7];
|
||||||
|
if (c == EUCKR_JAMO_FILLER)
|
||||||
|
jong = 0;
|
||||||
|
else if (0xa1 <= c && c <= 0xbe)
|
||||||
|
jong = cgk2u_jongseong[c - 0xa1];
|
||||||
|
else
|
||||||
|
jong = NONE;
|
||||||
|
|
||||||
|
if (cho == NONE || jung == NONE || jong == NONE)
|
||||||
|
return 8;
|
||||||
|
|
||||||
|
OUT1(0xac00 + cho*588 + jung*28 + jong);
|
||||||
|
NEXT(8, 1)
|
||||||
|
}
|
||||||
|
else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
|
||||||
NEXT(2, 1)
|
NEXT(2, 1)
|
||||||
} else return 2;
|
}
|
||||||
|
else
|
||||||
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
#undef NONE
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in New Issue