Close #17693: Rewrite CJK decoders to use the _PyUnicodeWriter API instead of
the legacy Py_UNICODE API.

Also add a new _PyUnicodeWriter_WriteChar() function.
This commit is contained in:
Victor Stinner 2013-04-11 22:09:04 +02:00
parent d8a5cc91e6
commit a0dd0213cc
15 changed files with 401 additions and 440 deletions

View File

@ -933,6 +933,13 @@ PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Py_ssize_t length, Py_UCS4 maxchar);
/* Append a Unicode character.
Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
Py_UCS4 ch
);
/* Append a Unicode string.
Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)

View File

@ -23,12 +23,12 @@
* A844 undefined U+2015 HORIZONTAL BAR
*/
#define GBK_DECODE(dc1, dc2, assi) \
if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \
else TRYMAP_DEC(gbkext, assi, dc1, dc2);
/* Decode one GBK double-byte sequence (dc1, dc2).
 * Three byte pairs are special-cased (0xA1AA -> U+2014, 0xA844 -> U+2015,
 * 0xA1A4 -> U+00B7); anything else is looked up in the gb2312 table with
 * both bytes XORed with 0x80, and then in the gbkext table.
 * The expansion is an open if/else-if chain, so the call site is expected
 * to continue it with its own `else` branch for undecodable input.
 * NOTE(review): the OUTCHAR() branches do not mention this macro's `writer`
 * parameter; they rely on a variable named `writer` at the call site. */
#define GBK_DECODE(dc1, dc2, writer) \
if ((dc1) == 0xa1 && (dc2) == 0xaa) OUTCHAR(0x2014); \
else if ((dc1) == 0xa8 && (dc2) == 0x44) OUTCHAR(0x2015); \
else if ((dc1) == 0xa1 && (dc2) == 0xa4) OUTCHAR(0x00b7); \
else TRYMAP_DEC(gb2312, writer, dc1 ^ 0x80, dc2 ^ 0x80); \
else TRYMAP_DEC(gbkext, writer, dc1, dc2);
#define GBK_ENCODE(code, assi) \
if ((code) == 0x2014) (assi) = 0xa1aa; \
@ -43,7 +43,7 @@
ENCODER(gb2312)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -73,17 +73,15 @@ DECODER(gb2312)
while (inleft > 0) {
unsigned char c = **inbuf;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
REQUIRE_INBUF(2)
TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
NEXT(2, 1)
TRYMAP_DEC(gb2312, writer, c ^ 0x80, IN2 ^ 0x80) {
NEXT_IN(2);
}
else return 1;
}
@ -99,7 +97,7 @@ DECODER(gb2312)
ENCODER(gbk)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -130,20 +128,18 @@ DECODER(gbk)
while (inleft > 0) {
unsigned char c = IN1;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
REQUIRE_INBUF(2)
GBK_DECODE(c, IN2, **outbuf)
GBK_DECODE(c, IN2, writer)
else return 1;
NEXT(2, 1)
NEXT_IN(2);
}
return 0;
@ -157,7 +153,7 @@ DECODER(gbk)
ENCODER(gb18030)
{
while (inleft > 0) {
ucs4_t c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -174,7 +170,7 @@ ENCODER(gb18030)
return 1;
#endif
else if (c >= 0x10000) {
ucs4_t tc = c - 0x10000;
Py_UCS4 tc = c - 0x10000;
REQUIRE_OUTBUF(4)
@ -208,7 +204,7 @@ ENCODER(gb18030)
utrrange++)
if (utrrange->first <= c &&
c <= utrrange->last) {
Py_UNICODE tc;
Py_UCS4 tc;
tc = c - utrrange->first +
utrrange->base;
@ -247,11 +243,9 @@ DECODER(gb18030)
while (inleft > 0) {
unsigned char c = IN1, c2;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
@ -261,7 +255,7 @@ DECODER(gb18030)
if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
const struct _gb18030_to_unibmp_ranges *utr;
unsigned char c3, c4;
ucs4_t lseq;
Py_UCS4 lseq;
REQUIRE_INBUF(4)
c3 = IN3;
@ -272,34 +266,34 @@ DECODER(gb18030)
c3 -= 0x81; c4 -= 0x30;
if (c < 4) { /* U+0080 - U+FFFF */
lseq = ((ucs4_t)c * 10 + c2) * 1260 +
(ucs4_t)c3 * 10 + c4;
lseq = ((Py_UCS4)c * 10 + c2) * 1260 +
(Py_UCS4)c3 * 10 + c4;
if (lseq < 39420) {
for (utr = gb18030_to_unibmp_ranges;
lseq >= (utr + 1)->base;
utr++) ;
OUT1(utr->first - utr->base + lseq)
NEXT(4, 1)
OUTCHAR(utr->first - utr->base + lseq);
NEXT_IN(4);
continue;
}
}
else if (c >= 15) { /* U+10000 - U+10FFFF */
lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
* 1260 + (ucs4_t)c3 * 10 + c4;
lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2)
* 1260 + (Py_UCS4)c3 * 10 + c4;
if (lseq <= 0x10FFFF) {
WRITEUCS4(lseq);
NEXT_IN(4)
OUTCHAR(lseq);
NEXT_IN(4);
continue;
}
}
return 1;
}
GBK_DECODE(c, c2, **outbuf)
else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
GBK_DECODE(c, c2, writer)
else TRYMAP_DEC(gb18030ext, writer, c, c2);
else return 1;
NEXT(2, 1)
NEXT_IN(2);
}
return 0;
@ -329,7 +323,7 @@ ENCODER_RESET(hz)
ENCODER(hz)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -389,8 +383,8 @@ DECODER(hz)
REQUIRE_INBUF(2)
if (c2 == '~') {
WRITE1('~')
NEXT(2, 1)
OUTCHAR('~');
NEXT_IN(2);
continue;
}
else if (c2 == '{' && state->i == 0)
@ -401,7 +395,7 @@ DECODER(hz)
; /* line-continuation */
else
return 1;
NEXT(2, 0);
NEXT_IN(2);
continue;
}
@ -409,14 +403,13 @@ DECODER(hz)
return 1;
if (state->i == 0) { /* ASCII mode */
WRITE1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
}
else { /* GB mode */
REQUIRE_INBUF(2)
REQUIRE_OUTBUF(1)
TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
NEXT(2, 1)
TRYMAP_DEC(gb2312, writer, c, IN2) {
NEXT_IN(2);
}
else
return 1;

View File

@ -39,7 +39,7 @@ static const DBCHAR big5hkscs_pairenc_table[4] = {0x8862, 0x8864, 0x88a3, 0x88a5
ENCODER(big5hkscs)
{
while (inleft > 0) {
ucs4_t c = **inbuf;
Py_UCS4 c = **inbuf;
DBCHAR code;
Py_ssize_t insize;
@ -103,26 +103,24 @@ DECODER(big5hkscs)
{
while (inleft > 0) {
unsigned char c = IN1;
ucs4_t decoded;
REQUIRE_OUTBUF(1)
Py_UCS4 decoded;
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
REQUIRE_INBUF(2)
if (0xc6 > c || c > 0xc8 || (c < 0xc7 && IN2 < 0xa1)) {
TRYMAP_DEC(big5, **outbuf, c, IN2) {
NEXT(2, 1)
TRYMAP_DEC(big5, writer, c, IN2) {
NEXT_IN(2);
continue;
}
}
TRYMAP_DEC(big5hkscs, decoded, c, IN2)
TRYMAP_DEC_CHAR(big5hkscs, decoded, c, IN2)
{
int s = BH2S(c, IN2);
const unsigned char *hintbase;
@ -146,25 +144,25 @@ DECODER(big5hkscs)
return MBERR_INTERNAL;
if (hintbase[s >> 3] & (1 << (s & 7))) {
WRITEUCS4(decoded | 0x20000)
NEXT_IN(2)
OUTCHAR(decoded | 0x20000);
NEXT_IN(2);
}
else {
OUT1(decoded)
NEXT(2, 1)
OUTCHAR(decoded);
NEXT_IN(2);
}
continue;
}
switch ((c << 8) | IN2) {
case 0x8862: WRITE2(0x00ca, 0x0304); break;
case 0x8864: WRITE2(0x00ca, 0x030c); break;
case 0x88a3: WRITE2(0x00ea, 0x0304); break;
case 0x88a5: WRITE2(0x00ea, 0x030c); break;
case 0x8862: OUTCHAR2(0x00ca, 0x0304); break;
case 0x8864: OUTCHAR2(0x00ca, 0x030c); break;
case 0x88a3: OUTCHAR2(0x00ea, 0x0304); break;
case 0x88a5: OUTCHAR2(0x00ea, 0x030c); break;
default: return 1;
}
NEXT(2, 2) /* all decoded codepoints are pairs, above. */
NEXT_IN(2); /* all decoded codepoints are pairs, above. */
}
return 0;

View File

@ -102,8 +102,8 @@
/*-*- internal data structures -*-*/
typedef int (*iso2022_init_func)(void);
typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data);
typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length);
typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data);
typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length);
struct iso2022_designation {
unsigned char mark;
@ -158,7 +158,7 @@ ENCODER(iso2022)
while (inleft > 0) {
const struct iso2022_designation *dsg;
DBCHAR encoded;
ucs4_t c = **inbuf;
Py_UCS4 c = **inbuf;
Py_ssize_t insize;
if (c < 0x80) {
@ -196,9 +196,9 @@ ENCODER(iso2022)
length = 2;
#if Py_UNICODE_SIZE == 2
if (length == 2) {
ucs4_t u4in[2];
u4in[0] = (ucs4_t)IN1;
u4in[1] = (ucs4_t)IN2;
Py_UCS4 u4in[2];
u4in[0] = (Py_UCS4)IN1;
u4in[1] = (Py_UCS4)IN2;
encoded = dsg->encoder(u4in, &length);
} else
encoded = dsg->encoder(&c, &length);
@ -277,7 +277,7 @@ ENCODER(iso2022)
WRITE2(encoded >> 8, encoded & 0xff)
NEXT_OUT(2)
}
NEXT_IN(insize)
NEXT_IN(insize);
}
return 0;
@ -376,45 +376,43 @@ iso2022processesc(const void *config, MultibyteCodec_State *state,
return 0;
}
#define ISO8859_7_DECODE(c, assi) \
if ((c) < 0xa0) (assi) = (c); \
else if ((c) < 0xc0 && (0x288f3bc9L & (1L << ((c)-0xa0)))) \
(assi) = (c); \
else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \
(0xbffffd77L & (1L << ((c)-0xb4))))) \
(assi) = 0x02d0 + (c); \
else if ((c) == 0xa1) (assi) = 0x2018; \
else if ((c) == 0xa2) (assi) = 0x2019; \
else if ((c) == 0xaf) (assi) = 0x2015;
/* Decode one ISO-8859-7 byte `c` and emit the result with OUTCHAR().
 *
 *   - c < 0xA0 is emitted unchanged.
 *   - 0xA0..0xBF: bytes whose bit is set in the first mask are emitted
 *     unchanged.
 *   - 0xB4..0xFE: bytes >= 0xD4, or whose bit is set in the second mask,
 *     are emitted as 0x02D0 + c.
 *   - 0xA1, 0xA2 and 0xAF map to U+2018, U+2019 and U+2015.
 *
 * The expansion is an open if/else-if chain: the call site must continue
 * it with an `else` branch that handles unmapped bytes.  OUTCHAR() uses
 * the `writer` variable of the enclosing function.
 *
 * The masks and the shifted constant are unsigned: the shift count can
 * reach 31, and shifting a signed 1L that far is undefined behaviour on
 * platforms where long is 32 bits wide. */
#define ISO8859_7_DECODE(c, writer) \
    if ((c) < 0xa0) OUTCHAR(c); \
    else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0)))) \
        OUTCHAR(c); \
    else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \
             (0xbffffd77UL & (1UL << ((c)-0xb4))))) \
        OUTCHAR(0x02d0 + (c)); \
    else if ((c) == 0xa1) OUTCHAR(0x2018); \
    else if ((c) == 0xa2) OUTCHAR(0x2019); \
    else if ((c) == 0xaf) OUTCHAR(0x2015);
static Py_ssize_t
iso2022processg2(const void *config, MultibyteCodec_State *state,
const unsigned char **inbuf, Py_ssize_t *inleft,
Py_UNICODE **outbuf, Py_ssize_t *outleft)
_PyUnicodeWriter *writer)
{
/* not written to use encoder, decoder functions because only few
* encodings use G2 designations in CJKCodecs */
if (STATE_G2 == CHARSET_ISO8859_1) {
if (IN3 < 0x80)
OUT1(IN3 + 0x80)
OUTCHAR(IN3 + 0x80);
else
return 3;
}
else if (STATE_G2 == CHARSET_ISO8859_7) {
ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)
ISO8859_7_DECODE(IN3 ^ 0x80, writer)
else return 3;
}
else if (STATE_G2 == CHARSET_ASCII) {
if (IN3 & 0x80) return 3;
else **outbuf = IN3;
else OUTCHAR(IN3);
}
else
return MBERR_INTERNAL;
(*inbuf) += 3;
*inleft -= 3;
(*outbuf) += 1;
*outleft -= 1;
return 0;
}
@ -429,8 +427,8 @@ DECODER(iso2022)
if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
/* ESC throughout mode:
* for non-iso2022 escape sequences */
WRITE1(c) /* assume as ISO-8859-1 */
NEXT(1, 1)
OUTCHAR(c); /* assume as ISO-8859-1 */
NEXT_IN(1);
if (IS_ESCEND(c)) {
STATE_CLEARFLAG(F_ESCTHROUGHOUT)
}
@ -449,32 +447,32 @@ DECODER(iso2022)
else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
REQUIRE_INBUF(3)
err = iso2022processg2(config, state,
inbuf, &inleft, outbuf, &outleft);
inbuf, &inleft, writer);
if (err != 0)
return err;
}
else {
WRITE1(ESC)
OUTCHAR(ESC);
STATE_SETFLAG(F_ESCTHROUGHOUT)
NEXT(1, 1)
NEXT_IN(1);
}
break;
case SI:
if (CONFIG_ISSET(NO_SHIFT))
goto bypass;
STATE_CLEARFLAG(F_SHIFTED)
NEXT_IN(1)
NEXT_IN(1);
break;
case SO:
if (CONFIG_ISSET(NO_SHIFT))
goto bypass;
STATE_SETFLAG(F_SHIFTED)
NEXT_IN(1)
NEXT_IN(1);
break;
case LF:
STATE_CLEARFLAG(F_SHIFTED)
WRITE1(LF)
NEXT(1, 1)
OUTCHAR(LF);
NEXT_IN(1);
break;
default:
if (c < 0x20) /* C0 */
@ -484,7 +482,7 @@ DECODER(iso2022)
else {
const struct iso2022_designation *dsg;
unsigned char charset;
ucs4_t decoded;
Py_UCS4 decoded;
if (STATE_GETFLAG(F_SHIFTED))
charset = STATE_G1;
@ -492,8 +490,8 @@ DECODER(iso2022)
charset = STATE_G0;
if (charset == CHARSET_ASCII) {
bypass: WRITE1(c)
NEXT(1, 1)
bypass: OUTCHAR(c);
NEXT_IN(1);
break;
}
@ -518,17 +516,15 @@ bypass: WRITE1(c)
return dsg->width;
if (decoded < 0x10000) {
WRITE1(decoded)
NEXT_OUT(1)
OUTCHAR(decoded);
}
else if (decoded < 0x30000) {
WRITEUCS4(decoded)
OUTCHAR(decoded);
}
else { /* JIS X 0213 pairs */
WRITE2(decoded >> 16, decoded & 0xffff)
NEXT_OUT(2)
OUTCHAR2(decoded >> 16, decoded & 0xffff);
}
NEXT_IN(dsg->width)
NEXT_IN(dsg->width);
}
break;
}
@ -577,18 +573,18 @@ ksx1001_init(void)
return 0;
}
static ucs4_t
static Py_UCS4
ksx1001_decoder(const unsigned char *data)
{
ucs4_t u;
TRYMAP_DEC(ksx1001, u, data[0], data[1])
Py_UCS4 u;
TRYMAP_DEC_CHAR(ksx1001, u, data[0], data[1])
return u;
else
return MAP_UNMAPPABLE;
}
static DBCHAR
ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length)
ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded;
assert(*length == 1);
@ -613,20 +609,20 @@ jisx0208_init(void)
return 0;
}
static ucs4_t
static Py_UCS4
jisx0208_decoder(const unsigned char *data)
{
ucs4_t u;
Py_UCS4 u;
if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
return 0xff3c;
else TRYMAP_DEC(jisx0208, u, data[0], data[1])
else TRYMAP_DEC_CHAR(jisx0208, u, data[0], data[1])
return u;
else
return MAP_UNMAPPABLE;
}
static DBCHAR
jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded;
assert(*length == 1);
@ -654,18 +650,18 @@ jisx0212_init(void)
return 0;
}
static ucs4_t
static Py_UCS4
jisx0212_decoder(const unsigned char *data)
{
ucs4_t u;
TRYMAP_DEC(jisx0212, u, data[0], data[1])
Py_UCS4 u;
TRYMAP_DEC_CHAR(jisx0212, u, data[0], data[1])
return u;
else
return MAP_UNMAPPABLE;
}
static DBCHAR
jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded;
assert(*length == 1);
@ -705,30 +701,30 @@ jisx0213_init(void)
}
#define config ((void *)2000)
static ucs4_t
static Py_UCS4
jisx0213_2000_1_decoder(const unsigned char *data)
{
ucs4_t u;
Py_UCS4 u;
EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
return 0xff3c;
else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
else TRYMAP_DEC_CHAR(jisx0208, u, data[0], data[1]);
else TRYMAP_DEC_CHAR(jisx0213_1_bmp, u, data[0], data[1]);
else TRYMAP_DEC_CHAR(jisx0213_1_emp, u, data[0], data[1])
u |= 0x20000;
else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
else TRYMAP_DEC_CHAR(jisx0213_pair, u, data[0], data[1]);
else
return MAP_UNMAPPABLE;
return u;
}
static ucs4_t
static Py_UCS4
jisx0213_2000_2_decoder(const unsigned char *data)
{
ucs4_t u;
EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1])
TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
Py_UCS4 u;
EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1])
TRYMAP_DEC_CHAR(jisx0213_2_bmp, u, data[0], data[1]);
else TRYMAP_DEC_CHAR(jisx0213_2_emp, u, data[0], data[1])
u |= 0x20000;
else
return MAP_UNMAPPABLE;
@ -736,28 +732,28 @@ jisx0213_2000_2_decoder(const unsigned char *data)
}
#undef config
static ucs4_t
static Py_UCS4
jisx0213_2004_1_decoder(const unsigned char *data)
{
ucs4_t u;
Py_UCS4 u;
if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
return 0xff3c;
else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
else TRYMAP_DEC_CHAR(jisx0208, u, data[0], data[1]);
else TRYMAP_DEC_CHAR(jisx0213_1_bmp, u, data[0], data[1]);
else TRYMAP_DEC_CHAR(jisx0213_1_emp, u, data[0], data[1])
u |= 0x20000;
else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
else TRYMAP_DEC_CHAR(jisx0213_pair, u, data[0], data[1]);
else
return MAP_UNMAPPABLE;
return u;
}
static ucs4_t
static Py_UCS4
jisx0213_2004_2_decoder(const unsigned char *data)
{
ucs4_t u;
TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
Py_UCS4 u;
TRYMAP_DEC_CHAR(jisx0213_2_bmp, u, data[0], data[1]);
else TRYMAP_DEC_CHAR(jisx0213_2_emp, u, data[0], data[1])
u |= 0x20000;
else
return MAP_UNMAPPABLE;
@ -765,7 +761,7 @@ jisx0213_2004_2_decoder(const unsigned char *data)
}
static DBCHAR
jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config)
{
DBCHAR coded;
@ -819,7 +815,7 @@ jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
}
static DBCHAR
jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
@ -831,7 +827,7 @@ jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
}
static DBCHAR
jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded;
Py_ssize_t ilength = *length;
@ -854,7 +850,7 @@ jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
}
static DBCHAR
jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
@ -866,7 +862,7 @@ jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
}
static DBCHAR
jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded = jisx0213_encoder(data, length, NULL);
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
@ -878,7 +874,7 @@ jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
}
static DBCHAR
jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded;
Py_ssize_t ilength = *length;
@ -901,7 +897,7 @@ jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
}
static DBCHAR
jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded = jisx0213_encoder(data, length, NULL);
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
@ -912,17 +908,17 @@ jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
return MAP_UNMAPPABLE;
}
static ucs4_t
static Py_UCS4
jisx0201_r_decoder(const unsigned char *data)
{
ucs4_t u;
JISX0201_R_DECODE(*data, u)
Py_UCS4 u;
JISX0201_R_DECODE_CHAR(*data, u)
else return MAP_UNMAPPABLE;
return u;
}
static DBCHAR
jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded;
JISX0201_R_ENCODE(*data, coded)
@ -930,17 +926,17 @@ jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
return coded;
}
static ucs4_t
static Py_UCS4
jisx0201_k_decoder(const unsigned char *data)
{
ucs4_t u;
JISX0201_K_DECODE(*data ^ 0x80, u)
Py_UCS4 u;
JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
else return MAP_UNMAPPABLE;
return u;
}
static DBCHAR
jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length)
jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded;
JISX0201_K_ENCODE(*data, coded)
@ -961,18 +957,18 @@ gb2312_init(void)
return 0;
}
static ucs4_t
static Py_UCS4
gb2312_decoder(const unsigned char *data)
{
ucs4_t u;
TRYMAP_DEC(gb2312, u, data[0], data[1])
Py_UCS4 u;
TRYMAP_DEC_CHAR(gb2312, u, data[0], data[1])
return u;
else
return MAP_UNMAPPABLE;
}
static DBCHAR
gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
DBCHAR coded;
assert(*length == 1);
@ -986,14 +982,14 @@ gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
}
static ucs4_t
static Py_UCS4
dummy_decoder(const unsigned char *data)
{
return MAP_UNMAPPABLE;
}
static DBCHAR
dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
return MAP_UNMAPPABLE;
}

View File

@ -20,7 +20,7 @@
ENCODER(cp932)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
unsigned char c1, c2;
@ -66,8 +66,8 @@ ENCODER(cp932)
}
else if (c >= 0xe000 && c < 0xe758) {
/* User-defined area */
c1 = (Py_UNICODE)(c - 0xe000) / 188;
c2 = (Py_UNICODE)(c - 0xe000) % 188;
c1 = (Py_UCS4)(c - 0xe000) / 188;
c2 = (Py_UCS4)(c - 0xe000) % 188;
OUT1(c1 + 0xf0)
OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
}
@ -85,31 +85,30 @@ DECODER(cp932)
while (inleft > 0) {
unsigned char c = IN1, c2;
REQUIRE_OUTBUF(1)
if (c <= 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
else if (c >= 0xa0 && c <= 0xdf) {
if (c == 0xa0)
OUT1(0xf8f0) /* half-width katakana */
OUTCHAR(0xf8f0); /* half-width katakana */
else
OUT1(0xfec0 + c)
NEXT(1, 1)
OUTCHAR(0xfec0 + c);
NEXT_IN(1);
continue;
}
else if (c >= 0xfd/* && c <= 0xff*/) {
/* Windows compatibility */
OUT1(0xf8f1 - 0xfd + c)
NEXT(1, 1)
OUTCHAR(0xf8f1 - 0xfd + c);
NEXT_IN(1);
continue;
}
REQUIRE_INBUF(2)
c2 = IN2;
TRYMAP_DEC(cp932ext, **outbuf, c, c2);
TRYMAP_DEC(cp932ext, writer, c, c2);
else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
return 1;
@ -119,21 +118,21 @@ DECODER(cp932)
c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21);
c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
TRYMAP_DEC(jisx0208, **outbuf, c, c2);
TRYMAP_DEC(jisx0208, writer, c, c2);
else return 1;
}
else if (c >= 0xf0 && c <= 0xf9) {
if ((c2 >= 0x40 && c2 <= 0x7e) ||
(c2 >= 0x80 && c2 <= 0xfc))
OUT1(0xe000 + 188 * (c - 0xf0) +
(c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
OUTCHAR(0xe000 + 188 * (c - 0xf0) +
(c2 < 0x80 ? c2 - 0x40 : c2 - 0x41));
else
return 1;
}
else
return 1;
NEXT(2, 1)
NEXT_IN(2);
}
return 0;
@ -147,7 +146,7 @@ DECODER(cp932)
ENCODER(euc_jis_2004)
{
while (inleft > 0) {
ucs4_t c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
Py_ssize_t insize;
@ -235,13 +234,11 @@ DECODER(euc_jis_2004)
{
while (inleft > 0) {
unsigned char c = IN1;
ucs4_t code;
REQUIRE_OUTBUF(1)
Py_UCS4 code;
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
@ -252,8 +249,8 @@ DECODER(euc_jis_2004)
REQUIRE_INBUF(2)
c2 = IN2;
if (c2 >= 0xa1 && c2 <= 0xdf) {
OUT1(0xfec0 + c2)
NEXT(2, 1)
OUTCHAR(0xfec0 + c2);
NEXT_IN(2);
}
else
return 1;
@ -266,16 +263,16 @@ DECODER(euc_jis_2004)
c3 = IN3 ^ 0x80;
/* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */
EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, c2, c3)
else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ;
else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) {
WRITEUCS4(EMPBASE | code)
NEXT_IN(3)
EMULATE_JISX0213_2000_DECODE_PLANE2(writer, c2, c3)
else TRYMAP_DEC(jisx0213_2_bmp, writer, c2, c3) ;
else TRYMAP_DEC_CHAR(jisx0213_2_emp, code, c2, c3) {
OUTCHAR(EMPBASE | code);
NEXT_IN(3);
continue;
}
else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
else TRYMAP_DEC(jisx0212, writer, c2, c3) ;
else return 1;
NEXT(3, 1)
NEXT_IN(3);
}
else {
unsigned char c2;
@ -285,23 +282,23 @@ DECODER(euc_jis_2004)
c2 = IN2 ^ 0x80;
/* JIS X 0213 Plane 1 */
EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, c, c2)
else if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c;
else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e;
else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2);
else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) {
WRITEUCS4(EMPBASE | code)
NEXT_IN(2)
EMULATE_JISX0213_2000_DECODE_PLANE1(writer, c, c2)
else if (c == 0x21 && c2 == 0x40) OUTCHAR(0xff3c);
else if (c == 0x22 && c2 == 0x32) OUTCHAR(0xff5e);
else TRYMAP_DEC(jisx0208, writer, c, c2);
else TRYMAP_DEC(jisx0213_1_bmp, writer, c, c2);
else TRYMAP_DEC_CHAR(jisx0213_1_emp, code, c, c2) {
OUTCHAR(EMPBASE | code);
NEXT_IN(2);
continue;
}
else TRYMAP_DEC(jisx0213_pair, code, c, c2) {
WRITE2(code >> 16, code & 0xffff)
NEXT(2, 2)
else TRYMAP_DEC_CHAR(jisx0213_pair, code, c, c2) {
OUTCHAR2(code >> 16, code & 0xffff);
NEXT_IN(2);
continue;
}
else return 1;
NEXT(2, 1)
NEXT_IN(2);
}
}
@ -316,7 +313,7 @@ DECODER(euc_jis_2004)
ENCODER(euc_jp)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -369,11 +366,9 @@ DECODER(euc_jp)
while (inleft > 0) {
unsigned char c = IN1;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
@ -384,8 +379,8 @@ DECODER(euc_jp)
REQUIRE_INBUF(2)
c2 = IN2;
if (c2 >= 0xa1 && c2 <= 0xdf) {
OUT1(0xfec0 + c2)
NEXT(2, 1)
OUTCHAR(0xfec0 + c2);
NEXT_IN(2);
}
else
return 1;
@ -397,8 +392,8 @@ DECODER(euc_jp)
c2 = IN2;
c3 = IN3;
/* JIS X 0212 */
TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) {
NEXT(3, 1)
TRYMAP_DEC(jisx0212, writer, c2 ^ 0x80, c3 ^ 0x80) {
NEXT_IN(3);
}
else
return 1;
@ -412,13 +407,13 @@ DECODER(euc_jp)
#ifndef STRICT_BUILD
if (c == 0xa1 && c2 == 0xc0)
/* FULL-WIDTH REVERSE SOLIDUS */
**outbuf = 0xff3c;
OUTCHAR(0xff3c);
else
#endif
TRYMAP_DEC(jisx0208, **outbuf,
TRYMAP_DEC(jisx0208, writer,
c ^ 0x80, c2 ^ 0x80) ;
else return 1;
NEXT(2, 1)
NEXT_IN(2);
}
}
@ -433,7 +428,7 @@ DECODER(euc_jp)
ENCODER(shift_jis)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
unsigned char c1, c2;
@ -488,14 +483,12 @@ DECODER(shift_jis)
while (inleft > 0) {
unsigned char c = IN1;
REQUIRE_OUTBUF(1)
#ifdef STRICT_BUILD
JISX0201_R_DECODE(c, **outbuf)
JISX0201_R_DECODE(c, writer)
#else
if (c < 0x80) **outbuf = c;
if (c < 0x80) OUTCHAR(c);
#endif
else JISX0201_K_DECODE(c, **outbuf)
else JISX0201_K_DECODE(c, writer)
else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
unsigned char c1, c2;
@ -512,13 +505,13 @@ DECODER(shift_jis)
#ifndef STRICT_BUILD
if (c1 == 0x21 && c2 == 0x40) {
/* FULL-WIDTH REVERSE SOLIDUS */
OUT1(0xff3c)
NEXT(2, 1)
OUTCHAR(0xff3c);
NEXT_IN(2);
continue;
}
#endif
TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
NEXT(2, 1)
TRYMAP_DEC(jisx0208, writer, c1, c2) {
NEXT_IN(2);
continue;
}
else
@ -527,7 +520,7 @@ DECODER(shift_jis)
else
return 1;
NEXT(1, 1) /* JIS X 0201 */
NEXT_IN(1); /* JIS X 0201 */
}
return 0;
@ -541,7 +534,7 @@ DECODER(shift_jis)
ENCODER(shift_jis_2004)
{
while (inleft > 0) {
ucs4_t c = IN1;
Py_UCS4 c = IN1;
DBCHAR code = NOCHAR;
int c1, c2;
Py_ssize_t insize;
@ -636,11 +629,10 @@ DECODER(shift_jis_2004)
while (inleft > 0) {
unsigned char c = IN1;
REQUIRE_OUTBUF(1)
JISX0201_DECODE(c, **outbuf)
JISX0201_DECODE(c, writer)
else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){
unsigned char c1, c2;
ucs4_t code;
Py_UCS4 code;
REQUIRE_INBUF(2)
c2 = IN2;
@ -654,50 +646,47 @@ DECODER(shift_jis_2004)
if (c1 < 0x5e) { /* Plane 1 */
c1 += 0x21;
EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf,
EMULATE_JISX0213_2000_DECODE_PLANE1(writer,
c1, c2)
else TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
NEXT_OUT(1)
else TRYMAP_DEC(jisx0208, writer, c1, c2) {
}
else TRYMAP_DEC(jisx0213_1_bmp, **outbuf,
else TRYMAP_DEC(jisx0213_1_bmp, writer,
c1, c2) {
NEXT_OUT(1)
}
else TRYMAP_DEC(jisx0213_1_emp, code, c1, c2) {
WRITEUCS4(EMPBASE | code)
else TRYMAP_DEC_CHAR(jisx0213_1_emp, code, c1, c2) {
OUTCHAR(EMPBASE | code);
}
else TRYMAP_DEC(jisx0213_pair, code, c1, c2) {
WRITE2(code >> 16, code & 0xffff)
NEXT_OUT(2)
else TRYMAP_DEC_CHAR(jisx0213_pair, code, c1, c2) {
OUTCHAR2(code >> 16, code & 0xffff);
}
else
return 1;
NEXT_IN(2)
NEXT_IN(2);
}
else { /* Plane 2 */
if (c1 >= 0x67) c1 += 0x07;
else if (c1 >= 0x63 || c1 == 0x5f) c1 -= 0x37;
else c1 -= 0x3d;
EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf,
EMULATE_JISX0213_2000_DECODE_PLANE2(writer,
c1, c2)
else TRYMAP_DEC(jisx0213_2_bmp, **outbuf,
c1, c2) ;
else TRYMAP_DEC(jisx0213_2_emp, code, c1, c2) {
WRITEUCS4(EMPBASE | code)
NEXT_IN(2)
else TRYMAP_DEC(jisx0213_2_bmp, writer,
c1, c2) {
} else TRYMAP_DEC_CHAR(jisx0213_2_emp, code, c1, c2) {
OUTCHAR(EMPBASE | code);
NEXT_IN(2);
continue;
}
else
return 1;
NEXT(2, 1)
NEXT_IN(2);
}
continue;
}
else
return 1;
NEXT(1, 1) /* JIS X 0201 */
NEXT_IN(1); /* JIS X 0201 */
}
return 0;

View File

@ -34,7 +34,7 @@ static const unsigned char u2cgk_jongseong[28] = {
ENCODER(euc_kr)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -104,11 +104,9 @@ DECODER(euc_kr)
while (inleft > 0) {
unsigned char c = IN1;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
@ -145,11 +143,11 @@ DECODER(euc_kr)
if (cho == NONE || jung == NONE || jong == NONE)
return 1;
OUT1(0xac00 + cho*588 + jung*28 + jong);
NEXT(8, 1)
OUTCHAR(0xac00 + cho*588 + jung*28 + jong);
NEXT_IN(8);
}
else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
NEXT(2, 1)
else TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80) {
NEXT_IN(2);
}
else
return 1;
@ -167,7 +165,7 @@ DECODER(euc_kr)
ENCODER(cp949)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -197,20 +195,18 @@ DECODER(cp949)
while (inleft > 0) {
unsigned char c = IN1;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
REQUIRE_INBUF(2)
TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80);
else TRYMAP_DEC(cp949ext, writer, c, IN2);
else return 1;
NEXT(2, 1)
NEXT_IN(2);
}
return 0;
@ -251,7 +247,7 @@ static const DBCHAR u2johabjamo[] = {
ENCODER(johab)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -350,11 +346,9 @@ DECODER(johab)
while (inleft > 0) {
unsigned char c = IN1, c2;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
@ -381,33 +375,33 @@ DECODER(johab)
if (i_cho == FILL) {
if (i_jung == FILL) {
if (i_jong == FILL)
OUT1(0x3000)
OUTCHAR(0x3000);
else
OUT1(0x3100 |
johabjamo_jongseong[c_jong])
OUTCHAR(0x3100 |
johabjamo_jongseong[c_jong]);
}
else {
if (i_jong == FILL)
OUT1(0x3100 |
johabjamo_jungseong[c_jung])
OUTCHAR(0x3100 |
johabjamo_jungseong[c_jung]);
else
return 1;
}
} else {
if (i_jung == FILL) {
if (i_jong == FILL)
OUT1(0x3100 |
johabjamo_choseong[c_cho])
OUTCHAR(0x3100 |
johabjamo_choseong[c_cho]);
else
return 1;
}
else
OUT1(0xac00 +
i_cho * 588 +
i_jung * 28 +
(i_jong == FILL ? 0 : i_jong))
OUTCHAR(0xac00 +
i_cho * 588 +
i_jung * 28 +
(i_jong == FILL ? 0 : i_jong));
}
NEXT(2, 1)
NEXT_IN(2);
} else {
/* KS X 1001 except hangul jamos and syllables */
if (c == 0xdf || c > 0xf9 ||
@ -424,9 +418,9 @@ DECODER(johab)
t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
TRYMAP_DEC(ksx1001, writer, t1, t2);
else return 1;
NEXT(2, 1)
NEXT_IN(2);
}
}
}

View File

@ -14,7 +14,7 @@
ENCODER(big5)
{
while (inleft > 0) {
Py_UNICODE c = **inbuf;
Py_UCS4 c = **inbuf;
DBCHAR code;
if (c < 0x80) {
@ -43,17 +43,15 @@ DECODER(big5)
while (inleft > 0) {
unsigned char c = IN1;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
REQUIRE_INBUF(2)
TRYMAP_DEC(big5, **outbuf, c, IN2) {
NEXT(2, 1)
TRYMAP_DEC(big5, writer, c, IN2) {
NEXT_IN(2);
}
else return 1;
}
@ -69,7 +67,7 @@ DECODER(big5)
ENCODER(cp950)
{
while (inleft > 0) {
Py_UNICODE c = IN1;
Py_UCS4 c = IN1;
DBCHAR code;
if (c < 0x80) {
@ -97,21 +95,19 @@ DECODER(cp950)
while (inleft > 0) {
unsigned char c = IN1;
REQUIRE_OUTBUF(1)
if (c < 0x80) {
OUT1(c)
NEXT(1, 1)
OUTCHAR(c);
NEXT_IN(1);
continue;
}
REQUIRE_INBUF(2)
TRYMAP_DEC(cp950ext, **outbuf, c, IN2);
else TRYMAP_DEC(big5, **outbuf, c, IN2);
TRYMAP_DEC(cp950ext, writer, c, IN2);
else TRYMAP_DEC(big5, writer, c, IN2);
else return 1;
NEXT(2, 1)
NEXT_IN(2);
}
return 0;

View File

@ -10,15 +10,24 @@
JISX0201_R_ENCODE(c, assi) \
else JISX0201_K_ENCODE(c, assi)
#define JISX0201_R_DECODE(c, assi) \
#define JISX0201_R_DECODE_CHAR(c, assi) \
if ((c) < 0x5c) (assi) = (c); \
else if ((c) == 0x5c) (assi) = 0x00a5; \
else if ((c) < 0x7e) (assi) = (c); \
else if ((c) == 0x7e) (assi) = 0x203e; \
else if ((c) == 0x7f) (assi) = 0x7f;
#define JISX0201_K_DECODE(c, assi) \
#define JISX0201_R_DECODE(c, writer) \
if ((c) < 0x5c) OUTCHAR(c); \
else if ((c) == 0x5c) OUTCHAR(0x00a5); \
else if ((c) < 0x7e) OUTCHAR(c); \
else if ((c) == 0x7e) OUTCHAR(0x203e); \
else if ((c) == 0x7f) OUTCHAR(0x7f);
#define JISX0201_K_DECODE(c, writer) \
if ((c) >= 0xa1 && (c) <= 0xdf) \
(assi) = 0xfec0 + (c);
#define JISX0201_DECODE(c, assi) \
JISX0201_R_DECODE(c, assi) \
else JISX0201_K_DECODE(c, assi)
OUTCHAR(0xfec0 + (c));
#define JISX0201_K_DECODE_CHAR(c, assi) \
if ((c) >= 0xa1 && (c) <= 0xdf) \
(assi) = 0xfec0 + (c);
#define JISX0201_DECODE(c, writer) \
JISX0201_R_DECODE(c, writer) \
else JISX0201_K_DECODE(c, writer)

View File

@ -33,7 +33,7 @@ struct dbcs_index {
typedef struct dbcs_index decode_map;
struct widedbcs_index {
const ucs4_t *map;
const Py_UCS4 *map;
unsigned char bottom, top;
};
typedef struct widedbcs_index widedecode_map;
@ -56,7 +56,7 @@ struct dbcs_map {
};
struct pair_encodemap {
ucs4_t uniseq;
Py_UCS4 uniseq;
DBCHAR code;
};
@ -86,7 +86,7 @@ static const struct dbcs_map *mapping_list;
static Py_ssize_t encoding##_decode( \
MultibyteCodec_State *state, const void *config, \
const unsigned char **inbuf, Py_ssize_t inleft, \
Py_UNICODE **outbuf, Py_ssize_t outleft)
_PyUnicodeWriter *writer)
#define DECODER_RESET(encoding) \
static Py_ssize_t encoding##_decode_reset( \
MultibyteCodec_State *state, const void *config)
@ -101,13 +101,15 @@ static const struct dbcs_map *mapping_list;
#endif
#define NEXT_IN(i) \
(*inbuf) += (i); \
(inleft) -= (i);
do { \
(*inbuf) += (i); \
(inleft) -= (i); \
} while (0)
#define NEXT_OUT(o) \
(*outbuf) += (o); \
(outleft) -= (o);
#define NEXT(i, o) \
NEXT_IN(i) NEXT_OUT(o)
NEXT_IN(i); NEXT_OUT(o)
#define REQUIRE_INBUF(n) \
if (inleft < (n)) \
@ -121,6 +123,23 @@ static const struct dbcs_map *mapping_list;
#define IN3 ((*inbuf)[2])
#define IN4 ((*inbuf)[3])
/* Append the code point `c` to the output.
 *
 * Uses the identifier `writer` from the scope of the expansion site (it is
 * not a macro parameter).  If _PyUnicodeWriter_WriteChar() fails, the
 * enclosing function returns MBERR_TOOSMALL.
 *
 * NOTE(review): a failed write presumably leaves a Python exception set,
 * while the MBERR_TOOSMALL case in multibytecodec_decerror() returns 0 to
 * "retry" -- confirm that this cannot loop or swallow the exception.
 */
#define OUTCHAR(c) \
do { \
if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0) \
return MBERR_TOOSMALL; \
} while (0)
/* Append the two code points `c1` and `c2` to the output in one step.
 *
 * Uses the identifier `writer` from the scope of the expansion site (it is
 * not a macro parameter).  Each argument is evaluated exactly once: both
 * are captured in temporaries, and only the temporaries are used afterwards
 * (including for the maximum-character computation passed to
 * _PyUnicodeWriter_Prepare()).  If the writer cannot be prepared for two
 * more characters, the enclosing function returns MBERR_TOOSMALL.
 */
#define OUTCHAR2(c1, c2)                                                   \
    do {                                                                   \
        Py_UCS4 _c1 = (c1);                                                \
        Py_UCS4 _c2 = (c2);                                                \
        if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, _c2)) < 0)     \
            return MBERR_TOOSMALL;                                         \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1);     \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2); \
        writer->pos += 2;                                                  \
    } while (0)
#define OUT1(c) ((*outbuf)[0]) = (c);
#define OUT2(c) ((*outbuf)[1]) = (c);
#define OUT3(c) ((*outbuf)[2]) = (c);
@ -145,19 +164,6 @@ static const struct dbcs_map *mapping_list;
(*outbuf)[2] = (c3); \
(*outbuf)[3] = (c4);
#if Py_UNICODE_SIZE == 2
# define WRITEUCS4(c) \
REQUIRE_OUTBUF(2) \
(*outbuf)[0] = Py_UNICODE_HIGH_SURROGATE(c); \
(*outbuf)[1] = Py_UNICODE_LOW_SURROGATE(c); \
NEXT_OUT(2)
#else
# define WRITEUCS4(c) \
REQUIRE_OUTBUF(1) \
**outbuf = (Py_UNICODE)(c); \
NEXT_OUT(1)
#endif
#define _TRYMAP_ENC(m, assi, val) \
((m)->map != NULL && (val) >= (m)->bottom && \
(val)<= (m)->top && ((assi) = (m)->map[(val) - \
@ -167,24 +173,41 @@ static const struct dbcs_map *mapping_list;
#define TRYMAP_ENC(charset, assi, uni) \
if TRYMAP_ENC_COND(charset, assi, uni)
#define _TRYMAP_DEC(m, assi, val) \
((m)->map != NULL && (val) >= (m)->bottom && \
(val)<= (m)->top && ((assi) = (m)->map[(val) - \
(m)->bottom]) != UNIINV)
#define TRYMAP_DEC(charset, assi, c1, c2) \
if _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
/* Helper for _TRYMAP_DEC(): append the looked-up character `c` to `writer`.
 *
 * Returns `c` when it is a valid mapping and was written successfully.
 * Returns UNIINV when `c` is the "no mapping" marker (nothing is written)
 * or when _PyUnicodeWriter_WriteChar() fails.
 *
 * NOTE(review): a failed write is reported exactly like a missing mapping,
 * so callers cannot tell the two apart from the return value alone.
 */
Py_LOCAL_INLINE(int)
_TRYMAP_DEC_WRITE(_PyUnicodeWriter *writer, Py_UCS4 c)
{
    if (c != UNIINV && _PyUnicodeWriter_WriteChar(writer, c) >= 0)
        return c;
    return UNIINV;
}
#define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \
((m)->map != NULL && (val) >= (m)->bottom && \
(val)<= (m)->top && \
((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \
/* Condition: look up `val` in the decode map entry `m` and, on a hit,
 * append the mapped character to `writer`.
 * True only if the entry has a map, `val` lies within [bottom, top], and
 * _TRYMAP_DEC_WRITE() returns something other than UNIINV.  The write is
 * a side effect of evaluating the condition. */
#define _TRYMAP_DEC(m, writer, val) \
((m)->map != NULL && \
(val) >= (m)->bottom && \
(val)<= (m)->top && \
_TRYMAP_DEC_WRITE(writer, (m)->map[(val) - (m)->bottom]) != UNIINV)
/* Condition: same lookup as _TRYMAP_DEC, but the mapped character is
 * assigned to the lvalue `assi` instead of being written.  `assi` is
 * assigned (possibly with UNIINV) whenever the range checks pass. */
#define _TRYMAP_DEC_CHAR(m, assi, val) \
((m)->map != NULL && \
(val) >= (m)->bottom && \
(val)<= (m)->top && \
((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV)
/* Opens an `if` statement (the caller supplies the body) that succeeds
 * when <charset>_decmap[c1] maps `c2`; the character is written to
 * `writer` as part of the test. */
#define TRYMAP_DEC(charset, writer, c1, c2) \
if _TRYMAP_DEC(&charset##_decmap[c1], writer, c2)
/* Opens an `if` statement (the caller supplies the body) that succeeds
 * when <charset>_decmap[c1] maps `c2`; the character is stored in `assi`
 * and nothing is written. */
#define TRYMAP_DEC_CHAR(charset, assi, c1, c2) \
if _TRYMAP_DEC_CHAR(&charset##_decmap[c1], assi, c2)
/* Condition: look up `val` in the multi-plane encode map entry `m`.
 * The map holds three consecutive items per value: plane, high, low.
 * True only if the entry has a map, `val` lies within [bottom, top] and
 * the plane item is non-zero.  `asshi` and `asslo` are assigned only after
 * the plane check passes; the `, 1` keeps the condition true even when
 * the assigned high/low item is zero. */
#define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \
((m)->map != NULL && (val) >= (m)->bottom && \
(val)<= (m)->top && \
((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \
(((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \
(((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1))
/* Opens an `if` statement (the caller supplies the body) that succeeds
 * when <charset>_encmap has a multi-plane mapping for `uni`.  The map
 * entry is selected by `uni >> 8` and indexed by `uni & 0xff`. */
#define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni) \
if _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \
assplane, asshi, asslo, (uni) & 0xff)
#define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2) \
if _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2)
/* Opens an `if` statement (the caller supplies the body) that succeeds
 * when <charset>_decmap[plane][c1] maps `c2`; on success the mapped
 * character has already been appended to `writer` by _TRYMAP_DEC. */
#define TRYMAP_DEC_MPLANE(charset, writer, plane, c1, c2) \
if _TRYMAP_DEC(&charset##_decmap[plane][c1], writer, c2)
#if Py_UNICODE_SIZE == 2
#define DECODE_SURROGATE(c) \
@ -323,7 +346,7 @@ find_pairencmap(ucs2_t body, ucs2_t modifier,
const struct pair_encodemap *haystack, int haystacksize)
{
int pos, min, max;
ucs4_t value = body << 16 | modifier;
Py_UCS4 value = body << 16 | modifier;
min = 0;
max = haystacksize;

View File

@ -38,6 +38,9 @@
((c1) == 0x7E && (c2) == 0x7E))) \
return EMULATE_JISX0213_2000_DECODE_INVALID;
#define EMULATE_JISX0213_2000_DECODE_PLANE2(assi, c1, c2) \
/* Special case for plane 2 decoding: when `config` equals (void *)2000 and
 * the byte pair is (0x7D, 0x3B), emit U+9B1D through OUTCHAR().
 * The expansion is an `if` without an `else`, so an `else` clause may
 * follow it.  The `writer` parameter is not referenced in the expansion;
 * OUTCHAR() uses the `writer` and `config` identifiers of the enclosing
 * scope.
 * NOTE(review): presumably config == 2000 selects the JIS X 0213:2000
 * variant of the codec -- confirm against the codec registration. */
#define EMULATE_JISX0213_2000_DECODE_PLANE2(writer, c1, c2) \
if (config == (void *)2000 && (c1) == 0x7D && (c2) == 0x3B) \
OUTCHAR(0x9B1D);
/* Same special case as the writer-based plane 2 emulation, but the result
 * is stored in the lvalue `assi` instead of being written: when `config`
 * equals (void *)2000 and the byte pair is (0x7D, 0x3B), `assi` becomes
 * 0x9B1D.  Otherwise `assi` is left untouched.  Reads `config` from the
 * enclosing scope. */
#define EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(assi, c1, c2) \
if (config == (void *)2000 && (c1) == 0x7D && (c2) == 0x3B) \
(assi) = 0x9B1D;

View File

@ -4049,7 +4049,7 @@ __gb18030ext_encmap+3126,0,100},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0
static const struct _gb18030_to_unibmp_ranges {
Py_UNICODE first, last;
Py_UCS4 first, last;
DBCHAR base;
} gb18030_to_unibmp_ranges[] = {
{128,163,0},{165,166,36},{169,175,38},{178,182,45},{184,214,50},{216,223,81},{

View File

@ -3,7 +3,7 @@
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
static const ucs4_t __jisx0213_pair_decmap[49] = {
static const Py_UCS4 __jisx0213_pair_decmap[49] = {
810234010,810365082,810496154,810627226,810758298,816525466,816656538,
816787610,816918682,817049754,817574042,818163866,818426010,838283418,
15074048,U,U,U,39060224,39060225,42730240,42730241,39387904,39387905,39453440,

View File

@ -17,8 +17,8 @@ typedef struct {
typedef struct {
const unsigned char *inbuf, *inbuf_top, *inbuf_end;
Py_UNICODE *outbuf, *outbuf_end;
PyObject *excobj, *outobj;
PyObject *excobj;
_PyUnicodeWriter writer;
} MultibyteDecodeBuffer;
PyDoc_STRVAR(MultibyteCodec_Encode__doc__,
@ -197,29 +197,6 @@ expand_encodebuffer(MultibyteEncodeBuffer *buf, Py_ssize_t esize)
goto errorexit; \
}
static int
expand_decodebuffer(MultibyteDecodeBuffer *buf, Py_ssize_t esize)
{
Py_ssize_t orgpos, orgsize;
orgpos = (Py_ssize_t)(buf->outbuf - PyUnicode_AS_UNICODE(buf->outobj));
orgsize = PyUnicode_GET_SIZE(buf->outobj);
if (PyUnicode_Resize(&buf->outobj, orgsize + (
esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1)
return -1;
buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj) + orgpos;
buf->outbuf_end = PyUnicode_AS_UNICODE(buf->outobj)
+ PyUnicode_GET_SIZE(buf->outobj);
return 0;
}
#define REQUIRE_DECODEBUFFER(buf, s) { \
if ((s) < 1 || (buf)->outbuf + (s) > (buf)->outbuf_end) \
if (expand_decodebuffer(buf, s) == -1) \
goto errorexit; \
}
/**
* MultibyteCodec object
@ -374,7 +351,7 @@ multibytecodec_decerror(MultibyteCodec *codec,
PyObject *errors, Py_ssize_t e)
{
PyObject *retobj = NULL, *retuni = NULL;
Py_ssize_t retunisize, newpos;
Py_ssize_t newpos;
const char *reason;
Py_ssize_t esize, start, end;
@ -385,7 +362,6 @@ multibytecodec_decerror(MultibyteCodec *codec,
else {
switch (e) {
case MBERR_TOOSMALL:
REQUIRE_DECODEBUFFER(buf, -1);
return 0; /* retry it */
case MBERR_TOOFEW:
reason = "incomplete multibyte sequence";
@ -403,8 +379,9 @@ multibytecodec_decerror(MultibyteCodec *codec,
}
if (errors == ERROR_REPLACE) {
REQUIRE_DECODEBUFFER(buf, 1);
*buf->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER;
if (_PyUnicodeWriter_WriteChar(&buf->writer,
Py_UNICODE_REPLACEMENT_CHARACTER) < 0)
goto errorexit;
}
if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) {
buf->inbuf += esize;
@ -447,15 +424,8 @@ multibytecodec_decerror(MultibyteCodec *codec,
goto errorexit;
}
if (PyUnicode_AsUnicode(retuni) == NULL)
if (_PyUnicodeWriter_WriteStr(&buf->writer, retuni) < 0)
goto errorexit;
retunisize = PyUnicode_GET_SIZE(retuni);
if (retunisize > 0) {
REQUIRE_DECODEBUFFER(buf, retunisize);
memcpy((char *)buf->outbuf, PyUnicode_AS_UNICODE(retuni),
retunisize * Py_UNICODE_SIZE);
buf->outbuf += retunisize;
}
newpos = PyLong_AsSsize_t(PyTuple_GET_ITEM(retobj, 1));
if (newpos < 0 && !PyErr_Occurred())
@ -617,10 +587,10 @@ MultibyteCodec_Decode(MultibyteCodecObject *self,
{
MultibyteCodec_State state;
MultibyteDecodeBuffer buf;
PyObject *errorcb;
PyObject *errorcb, *res;
Py_buffer pdata;
const char *data, *errors = NULL;
Py_ssize_t datalen, finalsize;
Py_ssize_t datalen;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|z:decode",
codeckwarglist, &pdata, &errors))
@ -640,29 +610,22 @@ MultibyteCodec_Decode(MultibyteCodecObject *self,
return make_tuple(PyUnicode_New(0, 0), 0);
}
_PyUnicodeWriter_Init(&buf.writer, datalen);
buf.excobj = NULL;
buf.inbuf = buf.inbuf_top = (unsigned char *)data;
buf.inbuf_end = buf.inbuf_top + datalen;
buf.outobj = PyUnicode_FromUnicode(NULL, datalen);
if (buf.outobj == NULL)
goto errorexit;
buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj);
if (buf.outbuf == NULL)
goto errorexit;
buf.outbuf_end = buf.outbuf + PyUnicode_GET_SIZE(buf.outobj);
if (self->codec->decinit != NULL &&
self->codec->decinit(&state, self->codec->config) != 0)
goto errorexit;
while (buf.inbuf < buf.inbuf_end) {
Py_ssize_t inleft, outleft, r;
Py_ssize_t inleft, r;
inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf);
outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf);
r = self->codec->decode(&state, self->codec->config,
&buf.inbuf, inleft, &buf.outbuf, outleft);
&buf.inbuf, inleft, &buf.writer);
if (r == 0)
break;
else if (multibytecodec_decerror(self->codec, &state,
@ -670,23 +633,20 @@ MultibyteCodec_Decode(MultibyteCodecObject *self,
goto errorexit;
}
finalsize = (Py_ssize_t)(buf.outbuf -
PyUnicode_AS_UNICODE(buf.outobj));
if (finalsize != PyUnicode_GET_SIZE(buf.outobj))
if (PyUnicode_Resize(&buf.outobj, finalsize) == -1)
goto errorexit;
res = _PyUnicodeWriter_Finish(&buf.writer);
if (res == NULL)
goto errorexit;
PyBuffer_Release(&pdata);
Py_XDECREF(buf.excobj);
ERROR_DECREF(errorcb);
return make_tuple(buf.outobj, datalen);
return make_tuple(res, datalen);
errorexit:
PyBuffer_Release(&pdata);
ERROR_DECREF(errorcb);
Py_XDECREF(buf.excobj);
Py_XDECREF(buf.outobj);
_PyUnicodeWriter_Dealloc(&buf.writer);
return NULL;
}
@ -859,17 +819,7 @@ decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data,
{
buf->inbuf = buf->inbuf_top = (const unsigned char *)data;
buf->inbuf_end = buf->inbuf_top + size;
if (buf->outobj == NULL) { /* only if outobj is not allocated yet */
buf->outobj = PyUnicode_FromUnicode(NULL, size);
if (buf->outobj == NULL)
return -1;
buf->outbuf = PyUnicode_AsUnicode(buf->outobj);
if (buf->outbuf == NULL)
return -1;
buf->outbuf_end = buf->outbuf +
PyUnicode_GET_SIZE(buf->outobj);
}
_PyUnicodeWriter_Init(&buf->writer, size);
return 0;
}
@ -878,14 +828,13 @@ decoder_feed_buffer(MultibyteStatefulDecoderContext *ctx,
MultibyteDecodeBuffer *buf)
{
while (buf->inbuf < buf->inbuf_end) {
Py_ssize_t inleft, outleft;
Py_ssize_t inleft;
Py_ssize_t r;
inleft = (Py_ssize_t)(buf->inbuf_end - buf->inbuf);
outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf);
r = ctx->codec->decode(&ctx->state, ctx->codec->config,
&buf->inbuf, inleft, &buf->outbuf, outleft);
&buf->inbuf, inleft, &buf->writer);
if (r == 0 || r == MBERR_TOOFEW)
break;
else if (multibytecodec_decerror(ctx->codec, &ctx->state,
@ -1058,8 +1007,9 @@ mbidecoder_decode(MultibyteIncrementalDecoderObject *self,
MultibyteDecodeBuffer buf;
char *data, *wdata = NULL;
Py_buffer pdata;
Py_ssize_t wsize, finalsize = 0, size, origpending;
Py_ssize_t wsize, size, origpending;
int final = 0;
PyObject *res;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|i:decode",
incrementalkwarglist, &pdata, &final))
@ -1067,7 +1017,8 @@ mbidecoder_decode(MultibyteIncrementalDecoderObject *self,
data = pdata.buf;
size = pdata.len;
buf.outobj = buf.excobj = NULL;
_PyUnicodeWriter_Init(&buf.writer, 1);
buf.excobj = NULL;
origpending = self->pendingsize;
if (self->pendingsize == 0) {
@ -1109,23 +1060,22 @@ mbidecoder_decode(MultibyteIncrementalDecoderObject *self,
goto errorexit;
}
finalsize = (Py_ssize_t)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj));
if (finalsize != PyUnicode_GET_SIZE(buf.outobj))
if (PyUnicode_Resize(&buf.outobj, finalsize) == -1)
goto errorexit;
res = _PyUnicodeWriter_Finish(&buf.writer);
if (res == NULL)
goto errorexit;
PyBuffer_Release(&pdata);
if (wdata != data)
PyMem_Del(wdata);
Py_XDECREF(buf.excobj);
return buf.outobj;
return res;
errorexit:
PyBuffer_Release(&pdata);
if (wdata != NULL && wdata != data)
PyMem_Del(wdata);
Py_XDECREF(buf.excobj);
Py_XDECREF(buf.outobj);
_PyUnicodeWriter_Dealloc(&buf.writer);
return NULL;
}
@ -1265,13 +1215,14 @@ mbstreamreader_iread(MultibyteStreamReaderObject *self,
const char *method, Py_ssize_t sizehint)
{
MultibyteDecodeBuffer buf;
PyObject *cres;
Py_ssize_t rsize, finalsize = 0;
PyObject *cres, *res;
Py_ssize_t rsize;
if (sizehint == 0)
return PyUnicode_New(0, 0);
buf.outobj = buf.excobj = NULL;
_PyUnicodeWriter_Init(&buf.writer, 1);
buf.excobj = NULL;
cres = NULL;
for (;;) {
@ -1340,29 +1291,27 @@ mbstreamreader_iread(MultibyteStreamReaderObject *self,
goto errorexit;
}
finalsize = (Py_ssize_t)(buf.outbuf -
PyUnicode_AS_UNICODE(buf.outobj));
Py_DECREF(cres);
cres = NULL;
if (sizehint < 0 || finalsize != 0 || rsize == 0)
if (sizehint < 0 || buf.writer.pos != 0 || rsize == 0)
break;
sizehint = 1; /* read 1 more byte and retry */
}
if (finalsize != PyUnicode_GET_SIZE(buf.outobj))
if (PyUnicode_Resize(&buf.outobj, finalsize) == -1)
goto errorexit;
res = _PyUnicodeWriter_Finish(&buf.writer);
if (res == NULL)
goto errorexit;
Py_XDECREF(cres);
Py_XDECREF(buf.excobj);
return buf.outobj;
return res;
errorexit:
Py_XDECREF(cres);
Py_XDECREF(buf.excobj);
Py_XDECREF(buf.outobj);
_PyUnicodeWriter_Dealloc(&buf.writer);
return NULL;
}

View File

@ -10,12 +10,6 @@
extern "C" {
#endif
#ifdef uint32_t
typedef uint32_t ucs4_t;
#else
typedef unsigned int ucs4_t;
#endif
#ifdef uint16_t
typedef uint16_t ucs2_t, DBCHAR;
#else
@ -27,7 +21,7 @@ typedef union {
int i;
unsigned char c[8];
ucs2_t u2[4];
ucs4_t u4[2];
Py_UCS4 u4[2];
} MultibyteCodec_State;
typedef int (*mbcodec_init)(const void *config);
@ -44,7 +38,7 @@ typedef Py_ssize_t (*mbencodereset_func)(MultibyteCodec_State *state,
typedef Py_ssize_t (*mbdecode_func)(MultibyteCodec_State *state,
const void *config,
const unsigned char **inbuf, Py_ssize_t inleft,
Py_UNICODE **outbuf, Py_ssize_t outleft);
_PyUnicodeWriter *writer);
typedef int (*mbdecodeinit_func)(MultibyteCodec_State *state,
const void *config);
typedef Py_ssize_t (*mbdecodereset_func)(MultibyteCodec_State *state,

View File

@ -12947,6 +12947,16 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
return 0;
}
/* Append the single character `ch` to `writer`.
 *
 * The writer is first prepared for one more character whose value may be
 * as large as `ch`; the character is then stored at the current position
 * and the position is advanced by one.
 *
 * Returns 0 on success, or -1 if preparing the writer failed (in which
 * case nothing is written and the position is unchanged).
 */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) >= 0) {
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
        writer->pos += 1;
        return 0;
    }
    return -1;
}
int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
{