cpython/Lib/test/test_codecencodings_cn.py

#!/usr/bin/env python3
#
# test_codecencodings_cn.py
#   Codec encoding tests for PRC encodings.
#

from test import support
from test import test_multibytecodec_support
import unittest

class Test_GB2312(test_multibytecodec_support.TestBase, unittest.TestCase):
    # Error-handling vectors for the "gb2312" codec.
    #
    # Every entry of ``codectests`` is (input, error handler, expected).
    # The vectors are consumed by TestBase, which lives outside this file;
    # presumably ``None`` as the expected value marks input the codec must
    # reject under that handler -- TODO confirm against TestBase.
    encoding = "gb2312"
    tstring = test_multibytecodec_support.load_teststring("gb2312")
    codectests = (
        # Invalid bytes: \x81\x81 in front of the pair \xc1\xc4, which the
        # "replace"/"ignore" vectors below expect to decode to U+804A.
        (b"abc\x81\x81\xc1\xc4", "strict", None),
        # A lone \xc8 at the very end of the input.
        (b"abc\xc8", "strict", None),
        # "replace": the expected text carries one U+FFFD per bad byte.
        (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
        (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
        # "ignore": the bad bytes leave no trace in the expected text.
        (b"abc\x81\x81\xc1\xc4", "ignore", "abc\u804a"),
        # \xc1 followed by the ASCII-range byte \x64.
        (b"\xc1\x64", "strict", None),
    )

class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase):
    # Error-handling vectors for the "gbk" codec.
    #
    # Every entry of ``codectests`` is (input, error handler, expected).
    # The vectors are consumed by TestBase, which lives outside this file;
    # presumably ``None`` as the expected value marks input the codec must
    # reject under that handler -- TODO confirm against TestBase.
    encoding = "gbk"
    tstring = test_multibytecodec_support.load_teststring("gbk")
    codectests = (
        # Invalid bytes: \x80\x80 in front of the pair \xc1\xc4, which the
        # "replace"/"ignore" vectors below expect to decode to U+804A.
        (b"abc\x80\x80\xc1\xc4", "strict", None),
        # A lone \xc8 at the very end of the input.
        (b"abc\xc8", "strict", None),
        # "replace": the expected text carries one U+FFFD per bad byte.
        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
        # "ignore": the bad bytes leave no trace in the expected text.
        (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"),
        # NOTE(review): looks like a GB18030-style four-byte sequence that
        # plain GBK is not expected to accept -- confirm.
        (b"\x83\x34\x83\x31", "strict", None),
        # str input (the other vectors are bytes); presumably an encode case
        # for a character GBK cannot represent -- confirm against TestBase.
        ("\u30fb", "strict", None),
    )

class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
    # Error-handling vectors for the "gb18030" codec.
    #
    # Every entry of ``codectests`` is (input, error handler, expected).
    # The vectors are consumed by TestBase, which lives outside this file;
    # presumably ``None`` as the expected value marks input the codec must
    # reject under that handler -- TODO confirm against TestBase.
    encoding = "gb18030"
    # Flag read by the shared test machinery; its exact effect is defined
    # outside this file.
    has_iso10646 = True
    tstring = test_multibytecodec_support.load_teststring("gb18030")
    codectests = (
        # Invalid bytes: \x80\x80 in front of the pair \xc1\xc4, which the
        # "replace"/"ignore" vectors below expect to decode to U+804A.
        (b"abc\x80\x80\xc1\xc4", "strict", None),
        # A lone \xc8 at the very end of the input.
        (b"abc\xc8", "strict", None),
        # "replace": the expected text carries one U+FFFD per bad byte.
        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
        # "ignore": the bad bytes leave no trace in the expected text.
        (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"),
        # Each \x84 becomes U+FFFD while the following \x39 ("9") survives.
        (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"),
        # str input with a bytes expectation: U+30FB is expected to come out
        # as the four bytes 0x81 0x39 0xA7 0x39.
        ("\u30fb", "strict", b"\x81\x39\xa7\x39"),
        # \x84 replaced, "2" kept, then one U+FFFD for each \x80.
        (b"abc\x84\x32\x80\x80def", "replace", "abc\ufffd2\ufffd\ufffddef"),
        # Four-byte sequence that is expected to decode cleanly to U+0080.
        (b"abc\x81\x30\x81\x30def", "strict", "abc\x80def"),
        # \x86 and \x81 replaced, each following "0" kept.
        (b"abc\x86\x30\x81\x30def", "replace", "abc\ufffd0\ufffd0def"),
    )

class Test_HZ(test_multibytecodec_support.TestBase, unittest.TestCase):
    # Vectors for the "hz" codec.
    #
    # Every entry of ``codectests`` is (input, error handler, expected).
    # The vectors are consumed by TestBase, which lives outside this file.
    encoding = "hz"
    tstring = test_multibytecodec_support.load_teststring("hz")
    codectests = (
        # "~\n" in a three-line message: the "~\n" after the first "~{...~}"
        # run does not show up in the expected text, so both runs join up.
        (
            b"This sentence is in ASCII.\n"
            b"The next sentence is in GB.~{<:Ky2;S{#,~}~\n"
            b"~{NpJ)l6HK!#~}Bye.\n",
            "strict",
            "This sentence is in ASCII.\n"
            "The next sentence is in GB."
            "\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002"
            "Bye.\n",
        ),
        # "~\n" in a four-line message: same expected text as above, with
        # "~\n" both before and after a single "~{...~}" run.
        (
            b"This sentence is in ASCII.\n"
            b"The next sentence is in GB.~\n"
            b"~{<:Ky2;S{#,NpJ)l6HK!#~}~\n"
            b"Bye.\n",
            "strict",
            "This sentence is in ASCII.\n"
            "The next sentence is in GB."
            "\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002"
            "Bye.\n",
        ),
        # Invalid bytes under "replace".
        # A "~" followed by "c": the "~" is replaced, "cd" is kept.
        (b"ab~cd", "replace", "ab\ufffdcd"),
        # A stray \xff between ASCII characters.
        (b"ab\xffcd", "replace", "ab\ufffdcd"),
        # Inside "~{...~}": one U+FFFD per \x81, then \x41\x44 -> U+804A.
        (b"ab~{\x81\x81\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"),
        # Control case: a well-formed run containing only \x41\x44.
        (b"ab~{\x41\x44~}cd", "replace", "ab\u804acd"),
        # Inside "~{...~}": one U+FFFD per \x79, then \x41\x44 -> U+804A.
        (b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"),
    )

def test_main():
    """Pass this module's name to ``support.run_unittest``."""
    this_module = __name__
    support.run_unittest(this_module)

# Run the tests only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_main()
convert shebang lines: python -> python3 2010-03-11 18:53:45 -04:00			`#!/usr/bin/env python3`
Add CJK codecs support as discussed on python-dev. (SF #873597) Several style fixes are suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks! 2004-01-17 10:29:29 -04:00			`#`
			`# test_codecencodings_cn.py`
			`# Codec encoding tests for PRC encodings.`
			`#`

#2621 rename test.test_support to test.support 2008-05-20 18:35:26 -03:00			`from test import support`
Add CJK codecs support as discussed on python-dev. (SF #873597) Several style fixes are suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks! 2004-01-17 10:29:29 -04:00			`from test import test_multibytecodec_support`
			`import unittest`

			`class Test_GB2312(test_multibytecodec_support.TestBase, unittest.TestCase):`
			`encoding = 'gb2312'`
			`tstring = test_multibytecodec_support.load_teststring('gb2312')`
			`codectests = (`
			`# invalid bytes`
Make all the multibyte codec tests pass. Changes to io.py, necessary to make this work: - Redid io.StringIO as a TextIOWrapper on top of a BytesIO instance. - Got rid of _MemoryIOMixin, folding it into BytesIO instead. - The read() functions that take -1 to mean "everything" now also take None. - Added readline() support to BufferedIOBase. :-( 2007-05-17 20:59:11 -03:00			`(b"abc\x81\x81\xc1\xc4", "strict", None),`
			`(b"abc\xc8", "strict", None),`
Issue #12016: Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. 2011-07-07 20:45:13 -03:00			`(b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),`
			`(b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),`
Make all the multibyte codec tests pass. Changes to io.py, necessary to make this work: - Redid io.StringIO as a TextIOWrapper on top of a BytesIO instance. - Got rid of _MemoryIOMixin, folding it into BytesIO instead. - The read() functions that take -1 to mean "everything" now also take None. - Added readline() support to BufferedIOBase. :-( 2007-05-17 20:59:11 -03:00			`(b"abc\x81\x81\xc1\xc4", "ignore", "abc\u804a"),`
			`(b"\xc1\x64", "strict", None),`
Add CJK codecs support as discussed on python-dev. (SF #873597) Several style fixes are suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks! 2004-01-17 10:29:29 -04:00			`)`

			`class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase):`
			`encoding = 'gbk'`
			`tstring = test_multibytecodec_support.load_teststring('gbk')`
			`codectests = (`
			`# invalid bytes`
Make all the multibyte codec tests pass. Changes to io.py, necessary to make this work: - Redid io.StringIO as a TextIOWrapper on top of a BytesIO instance. - Got rid of _MemoryIOMixin, folding it into BytesIO instead. - The read() functions that take -1 to mean "everything" now also take None. - Added readline() support to BufferedIOBase. :-( 2007-05-17 20:59:11 -03:00			`(b"abc\x80\x80\xc1\xc4", "strict", None),`
			`(b"abc\xc8", "strict", None),`
Issue #12016: Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. 2011-07-07 20:45:13 -03:00			`(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),`
			`(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),`
Make all the multibyte codec tests pass. Changes to io.py, necessary to make this work: - Redid io.StringIO as a TextIOWrapper on top of a BytesIO instance. - Got rid of _MemoryIOMixin, folding it into BytesIO instead. - The read() functions that take -1 to mean "everything" now also take None. - Added readline() support to BufferedIOBase. :-( 2007-05-17 20:59:11 -03:00			`(b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"),`
			`(b"\x83\x34\x83\x31", "strict", None),`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 16:09:54 -03:00			`("\u30fb", "strict", None),`
Add CJK codecs support as discussed on python-dev. (SF #873597) Several style fixes are suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks! 2004-01-17 10:29:29 -04:00			`)`

			`class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):`
			`encoding = 'gb18030'`
			`tstring = test_multibytecodec_support.load_teststring('gb18030')`
			`codectests = (`
			`# invalid bytes`
Make all the multibyte codec tests pass. Changes to io.py, necessary to make this work: - Redid io.StringIO as a TextIOWrapper on top of a BytesIO instance. - Got rid of _MemoryIOMixin, folding it into BytesIO instead. - The read() functions that take -1 to mean "everything" now also take None. - Added readline() support to BufferedIOBase. :-( 2007-05-17 20:59:11 -03:00			`(b"abc\x80\x80\xc1\xc4", "strict", None),`
			`(b"abc\xc8", "strict", None),`
Issue #12016: Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. 2011-07-07 20:45:13 -03:00			`(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),`
			`(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),`
Make all the multibyte codec tests pass. Changes to io.py, necessary to make this work: - Redid io.StringIO as a TextIOWrapper on top of a BytesIO instance. - Got rid of _MemoryIOMixin, folding it into BytesIO instead. - The read() functions that take -1 to mean "everything" now also take None. - Added readline() support to BufferedIOBase. :-( 2007-05-17 20:59:11 -03:00			`(b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"),`
Issue #12016: Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. 2011-07-07 20:45:13 -03:00			`(b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"),`
Make all the multibyte codec tests pass. Changes to io.py, necessary to make this work: - Redid io.StringIO as a TextIOWrapper on top of a BytesIO instance. - Got rid of _MemoryIOMixin, folding it into BytesIO instead. - The read() functions that take -1 to mean "everything" now also take None. - Added readline() support to BufferedIOBase. :-( 2007-05-17 20:59:11 -03:00			`("\u30fb", "strict", b"\x819\xa79"),`
Issue #12016: Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. 2011-07-07 20:45:13 -03:00			`(b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'),`
			`(b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'),`
			`(b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'),`
Add CJK codecs support as discussed on python-dev. (SF #873597) Several style fixes are suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks! 2004-01-17 10:29:29 -04:00			`)`
			`has_iso10646 = True`

Issue #12057: Add tests for the HZ encoding 2011-05-24 19:06:51 -03:00			`class Test_HZ(test_multibytecodec_support.TestBase, unittest.TestCase):`
			`encoding = 'hz'`
			`tstring = test_multibytecodec_support.load_teststring('hz')`
			`codectests = (`
			`# test '~\n' (3 lines)`
			`(b'This sentence is in ASCII.\n'`
			`b'The next sentence is in GB.~{<:Ky2;S{#,~}~\n'`
			`b'~{NpJ)l6HK!#~}Bye.\n',`
			`'strict',`
			`'This sentence is in ASCII.\n'`
			`'The next sentence is in GB.'`
			`'\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002'`
			`'Bye.\n'),`
			`# test '~\n' (4 lines)`
			`(b'This sentence is in ASCII.\n'`
			`b'The next sentence is in GB.~\n'`
			`b'~{<:Ky2;S{#,NpJ)l6HK!#~}~\n'`
			`b'Bye.\n',`
			`'strict',`
			`'This sentence is in ASCII.\n'`
			`'The next sentence is in GB.'`
			`'\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002'`
			`'Bye.\n'),`
			`# invalid bytes`
Issue #12016: Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. 2011-07-07 20:45:13 -03:00			`(b'ab~cd', 'replace', 'ab\uFFFDcd'),`
Issue #12057: Add tests for the HZ encoding 2011-05-24 19:06:51 -03:00			`(b'ab\xffcd', 'replace', 'ab\uFFFDcd'),`
			`(b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'),`
Issue #12016: Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'. 2011-07-07 20:45:13 -03:00			`(b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'),`
			`(b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"),`
Issue #12057: Add tests for the HZ encoding 2011-05-24 19:06:51 -03:00			`)`

Add CJK codecs support as discussed on python-dev. (SF #873597) Several style fixes are suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks! 2004-01-17 10:29:29 -04:00			`def test_main():`
#2621 rename test.test_support to test.support 2008-05-20 18:35:26 -03:00			`support.run_unittest(__name__)`
Add CJK codecs support as discussed on python-dev. (SF #873597) Several style fixes are suggested by Martin v. Loewis and Marc-Andre Lemburg. Thanks! 2004-01-17 10:29:29 -04:00
			`if __name__ == "__main__":`
			`test_main()`