Added new codecs and aliases for ISO_8859-11, ISO_8859-16 and TIS-620.

Closes SF bug #1001895: Adding missing ISO 8859 codecs, especially Thai.
This commit is contained in:
Marc-André Lemburg 2004-08-05 12:43:30 +00:00
parent 1884dda233
commit c759f070ef
5 changed files with 288 additions and 1 deletions

View File

@ -17,7 +17,7 @@
"""
aliases = {
# Please keep this list sorted alphabetically !
# Please keep this list sorted alphabetically by value !
# ascii codec
'646' : 'ascii',
@ -373,6 +373,18 @@ aliases = {
'l5' : 'iso8859_9',
'latin5' : 'iso8859_9',
# iso8859_11 codec
'thai' : 'iso8859_11',
'iso_8859_11' : 'iso8859_11',
'iso_8859_11_2001' : 'iso8859_11',
# iso8859_16 codec
'iso_8859_16' : 'iso8859_16',
'iso_8859_16_2001' : 'iso8859_16',
'iso_ir_226' : 'iso8859_16',
'l10' : 'iso8859_16',
'latin10' : 'iso8859_16',
# johab codec
'cp1361' : 'johab',
'ms1361' : 'johab',
@ -448,6 +460,13 @@ aliases = {
# tactis codec
'tis260' : 'tactis',
# tis_620 codec
'tis620' : 'tis_620',
'tis_620_0' : 'tis_620',
'tis_620_2529_0' : 'tis_620',
'tis_620_2529_1' : 'tis_620',
'iso_ir_166' : 'tis_620',
# utf_16 codec
'u16' : 'utf_16',
'utf16' : 'utf_16',

137
Lib/encodings/iso8859_11.py Normal file
View File

@ -0,0 +1,137 @@
""" Python Character Mapping Codec generated from '8859-11.TXT' with gencodec.py.
Generated from mapping found in
ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT
"""#"
import codecs
### Codec APIs
class Codec(codecs.Codec):
    """Charmap codec for ISO 8859-11 built on this module's mapping tables."""

    def decode(self, input, errors='strict'):
        """Decode *input* via ``decoding_map``.

        The work is delegated to ``codecs.charmap_decode`` and its result
        is returned unchanged.  *errors* names the error handling scheme.
        """
        return codecs.charmap_decode(input, errors, decoding_map)

    def encode(self, input, errors='strict'):
        """Encode *input* via ``encoding_map``.

        The work is delegated to ``codecs.charmap_encode`` and its result
        is returned unchanged.  *errors* names the error handling scheme.
        """
        return codecs.charmap_encode(input, errors, encoding_map)
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer: ``codecs.StreamWriter`` using this module's Codec."""
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader: ``codecs.StreamReader`` using this module's Codec."""
### encodings module API
def getregentry():
    """Return the codec entry for the encodings module API.

    The result is the 4-tuple (encoder, decoder, stream reader class,
    stream writer class).  The encoder and decoder are bound methods of
    two separate Codec instances.
    """
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)
### Decoding Map
# Start from the identity mapping for every byte value and then override
# the upper half, where ISO 8859-11 places the Thai characters.
decoding_map = codecs.make_identity_dict(range(256))

# The Thai characters sit at a constant distance from their byte values:
#   0xA1..0xDA -> U+0E01..U+0E3A
#       (THAI CHARACTER KO KAI .. THAI CHARACTER PHINTHU)
#   0xDF..0xFB -> U+0E3F..U+0E5B
#       (THAI CURRENCY SYMBOL BAHT .. THAI CHARACTER KHOMUT)
_THAI_OFFSET = 0x0e01 - 0x00a1
for _first, _last in ((0x00a1, 0x00da), (0x00df, 0x00fb)):
    for _byte in range(_first, _last + 1):
        decoding_map[_byte] = _byte + _THAI_OFFSET

# The remaining bytes of the upper half have no character assigned;
# None marks the mapping as undefined.
for _byte in (0x00db, 0x00dc, 0x00dd, 0x00de,
              0x00fc, 0x00fd, 0x00fe, 0x00ff):
    decoding_map[_byte] = None

# Keep the module namespace limited to the two public tables.
del _THAI_OFFSET, _first, _last, _byte

### Encoding Map

encoding_map = codecs.make_encoding_map(decoding_map)

View File

@ -0,0 +1,82 @@
""" Python Character Mapping Codec generated from '8859-16.TXT' with gencodec.py.
Generated from mapping found in
ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT
"""#"
import codecs
### Codec APIs
class Codec(codecs.Codec):
    """Charmap codec for ISO 8859-16 built on this module's mapping tables."""

    def decode(self, input, errors='strict'):
        """Map *input* through ``decoding_map``.

        Returns the result of ``codecs.charmap_decode`` as is; *errors*
        is passed along as the error handling scheme.
        """
        decoded = codecs.charmap_decode(input, errors, decoding_map)
        return decoded

    def encode(self, input, errors='strict'):
        """Map *input* through ``encoding_map``.

        Returns the result of ``codecs.charmap_encode`` as is; *errors*
        is passed along as the error handling scheme.
        """
        encoded = codecs.charmap_encode(input, errors, encoding_map)
        return encoded
class StreamWriter(Codec, codecs.StreamWriter):
    """ISO 8859-16 stream writer; all behaviour comes from the base classes."""
class StreamReader(Codec, codecs.StreamReader):
    """ISO 8859-16 stream reader; all behaviour comes from the base classes."""
### encodings module API
def getregentry():
    """Return (encoder, decoder, StreamReader, StreamWriter) for this codec.

    This is the hook of the encodings module API.  The encoder and the
    decoder are bound methods taken from two separate Codec instances.
    """
    entry = (
        Codec().encode,
        Codec().decode,
        StreamReader,
        StreamWriter,
    )
    return entry
### Decoding Map
# Every byte decodes to the code point of the same value unless it is
# listed in the override table below.
decoding_map = codecs.make_identity_dict(range(256))

# (byte, code point) pairs where ISO 8859-16 departs from the identity map.
for _byte, _code_point in (
    (0xa1, 0x0104),  # LATIN CAPITAL LETTER A WITH OGONEK
    (0xa2, 0x0105),  # LATIN SMALL LETTER A WITH OGONEK
    (0xa3, 0x0141),  # LATIN CAPITAL LETTER L WITH STROKE
    (0xa4, 0x20ac),  # EURO SIGN
    (0xa5, 0x201e),  # DOUBLE LOW-9 QUOTATION MARK
    (0xa6, 0x0160),  # LATIN CAPITAL LETTER S WITH CARON
    (0xa8, 0x0161),  # LATIN SMALL LETTER S WITH CARON
    (0xaa, 0x0218),  # LATIN CAPITAL LETTER S WITH COMMA BELOW
    (0xac, 0x0179),  # LATIN CAPITAL LETTER Z WITH ACUTE
    (0xae, 0x017a),  # LATIN SMALL LETTER Z WITH ACUTE
    (0xaf, 0x017b),  # LATIN CAPITAL LETTER Z WITH DOT ABOVE
    (0xb2, 0x010c),  # LATIN CAPITAL LETTER C WITH CARON
    (0xb3, 0x0142),  # LATIN SMALL LETTER L WITH STROKE
    (0xb4, 0x017d),  # LATIN CAPITAL LETTER Z WITH CARON
    (0xb5, 0x201d),  # RIGHT DOUBLE QUOTATION MARK
    (0xb8, 0x017e),  # LATIN SMALL LETTER Z WITH CARON
    (0xb9, 0x010d),  # LATIN SMALL LETTER C WITH CARON
    (0xba, 0x0219),  # LATIN SMALL LETTER S WITH COMMA BELOW
    (0xbc, 0x0152),  # LATIN CAPITAL LIGATURE OE
    (0xbd, 0x0153),  # LATIN SMALL LIGATURE OE
    (0xbe, 0x0178),  # LATIN CAPITAL LETTER Y WITH DIAERESIS
    (0xbf, 0x017c),  # LATIN SMALL LETTER Z WITH DOT ABOVE
    (0xc3, 0x0102),  # LATIN CAPITAL LETTER A WITH BREVE
    (0xc5, 0x0106),  # LATIN CAPITAL LETTER C WITH ACUTE
    (0xd0, 0x0110),  # LATIN CAPITAL LETTER D WITH STROKE
    (0xd1, 0x0143),  # LATIN CAPITAL LETTER N WITH ACUTE
    (0xd5, 0x0150),  # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
    (0xd7, 0x015a),  # LATIN CAPITAL LETTER S WITH ACUTE
    (0xd8, 0x0170),  # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
    (0xdd, 0x0118),  # LATIN CAPITAL LETTER E WITH OGONEK
    (0xde, 0x021a),  # LATIN CAPITAL LETTER T WITH COMMA BELOW
    (0xe3, 0x0103),  # LATIN SMALL LETTER A WITH BREVE
    (0xe5, 0x0107),  # LATIN SMALL LETTER C WITH ACUTE
    (0xf0, 0x0111),  # LATIN SMALL LETTER D WITH STROKE
    (0xf1, 0x0144),  # LATIN SMALL LETTER N WITH ACUTE
    (0xf5, 0x0151),  # LATIN SMALL LETTER O WITH DOUBLE ACUTE
    (0xf7, 0x015b),  # LATIN SMALL LETTER S WITH ACUTE
    (0xf8, 0x0171),  # LATIN SMALL LETTER U WITH DOUBLE ACUTE
    (0xfd, 0x0119),  # LATIN SMALL LETTER E WITH OGONEK
    (0xfe, 0x021b),  # LATIN SMALL LETTER T WITH COMMA BELOW
):
    decoding_map[_byte] = _code_point

# Keep the module namespace limited to the two public tables.
del _byte, _code_point

### Encoding Map

encoding_map = codecs.make_encoding_map(decoding_map)

46
Lib/encodings/tis_620.py Normal file
View File

@ -0,0 +1,46 @@
""" Python Character Mapping Codec for TIS-620.
According to
ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT the
TIS-620 is identical to ISO_8859-11 with the 0xA0 (no-break
space) mapping removed.
"""#"
import codecs
from encodings.iso8859_11 import decoding_map
### Codec APIs
class Codec(codecs.Codec):
    """Charmap codec for TIS-620 built on this module's mapping tables."""

    def decode(self, input, errors='strict'):
        """Decode *input* with ``decoding_map``.

        Hands the call to ``codecs.charmap_decode`` and returns what it
        returns.  *errors* is the error handling scheme to apply.
        """
        return codecs.charmap_decode(input, errors, decoding_map)

    def encode(self, input, errors='strict'):
        """Encode *input* with ``encoding_map``.

        Hands the call to ``codecs.charmap_encode`` and returns what it
        returns.  *errors* is the error handling scheme to apply.
        """
        return codecs.charmap_encode(input, errors, encoding_map)
class StreamWriter(Codec, codecs.StreamWriter):
    """TIS-620 stream writer assembled from Codec and codecs.StreamWriter."""
class StreamReader(Codec, codecs.StreamReader):
    """TIS-620 stream reader assembled from Codec and codecs.StreamReader."""
### encodings module API
def getregentry():
    """Return this codec's entry for the encodings module API.

    The tuple holds, in order: the encode function, the decode function,
    the StreamReader class and the StreamWriter class.  The two functions
    are bound methods of separately created Codec instances.
    """
    encode = Codec().encode
    decode = Codec().decode
    return (encode, decode, StreamReader, StreamWriter)
### Decoding Map
# Work on a private copy of the ISO 8859-11 table imported above so that
# the change below does not leak back into that module.
decoding_map = decoding_map.copy()

# TIS-620 has no mapping for 0xA0 (no-break space): mark it undefined.
decoding_map[0x00a0] = None

### Encoding Map

encoding_map = codecs.make_encoding_map(decoding_map)

View File

@ -83,6 +83,9 @@ Extension modules
Library
-------
- Added new codecs and aliases for ISO_8859-11, ISO_8859-16 and
TIS-620.
- Thanks to Edward Loper, doctest has been massively refactored, and
many new features were added. Full docs will appear later. For now
the doctest module comments and new test cases give good coverage.