cpython/Lib/test/test_ucn.py

""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import unittest
import sys

from test import test_support

# _testcapi is only needed for the C integer limits consumed by the
# decorators on test_issue16335.  When it cannot be imported, all three
# names share one placeholder value, which makes the
# "INT_MAX < PY_SSIZE_T_MAX" skip condition on that test false.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1

class UnicodeNamesTest(unittest.TestCase):

    def checkletter(self, name, code):
        # Helper that put all \N escapes inside eval'd raw strings,
        # to make sure this script runs even if the compiler
        # chokes on \N escapes
        res = eval(ur'u"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP"
        ]
        string = u"The rEd fOx ate the sheep."

        self.assertEqual(
            u"".join([self.checkletter(*args) for args in zip(chars, string)]),
            string
        )

    def test_ascii_letters(self):
        import unicodedata

        for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
        self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")

        import unicodedata
        self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")

    def test_cjk_unified_ideographs(self):
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")

    def test_bmp_characters(self):
        import unicodedata
        count = 0
        for code in xrange(0x10000):
            char = unichr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)
                count += 1

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", u"\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")

    def test_errors(self):
        import unicodedata
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, u'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, u'unknown')

    def test_strict_eror_handling(self):
        # bogus character name
        self.assertRaises(
            UnicodeError,
            unicode, "\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            unicode, "\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            unicode, "\\NSPACE", 'unicode-escape', 'strict'
        )

    @test_support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @unittest.skipUnless(UINT_MAX < sys.maxint, "needs UINT_MAX < sys.maxint")
    @test_support.bigmemtest(minsize=UINT_MAX + 1,
                             memuse=2 + 4 // len(u'\U00010000'))
    def test_issue16335(self, size):
        func = self.test_issue16335
        if size < func.minsize:
            raise unittest.SkipTest("not enough memory: %.1fG minimum needed" %
                    (func.minsize * func.memuse / float(1024**3),))
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * int(UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegexp(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )


def test_main():
    """Run the test case classes of this module through test_support."""
    test_classes = (UnicodeNamesTest,)
    test_support.run_unittest(*test_classes)

# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
New test for the ucnhash module. 2000-06-30 06:45:20 -03:00			`""" Test script for the Unicode implementation.`

			`Written by Bill Tutt.`
Move uchhash functionality into unicodedata (after the recent crop of changes, the files are small enough to do this). Also adds "name" and "lookup" functions to unicodedata. 2001-01-24 03:59:11 -04:00			`Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)`
New test for the ucnhash module. 2000-06-30 06:45:20 -03:00
			`(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.`

			`"""#"`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 10:49:41 -04:00
			`import unittest`
Add bigmemtest decorator to test of issue #16335. 2013-01-21 14:23:01 -04:00			`import sys`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 10:49:41 -04:00
			`from test import test_support`

Issue #20532: Tests which use _testcapi now are marked as CPython only. 2014-02-07 04:06:05 -04:00			`try:`
			`from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX`
			`except ImportError:`
			`INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1`

Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 10:49:41 -04:00			`class UnicodeNamesTest(unittest.TestCase):`

			`def checkletter(self, name, code):`
			`# Helper that put all \N escapes inside eval'd raw strings,`
Whitespace normalization. 2003-03-07 13:30:48 -04:00			`# to make sure this script runs even if the compiler`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 10:49:41 -04:00			`# chokes on \N escapes`
			`res = eval(ur'u"\N{%s}"' % name)`
			`self.assertEqual(res, code)`
			`return res`

			`def test_general(self):`
			`# General and case insensitivity test:`
			`chars = [`
			`"LATIN CAPITAL LETTER T",`
			`"LATIN SMALL LETTER H",`
			`"LATIN SMALL LETTER E",`
			`"SPACE",`
			`"LATIN SMALL LETTER R",`
			`"LATIN CAPITAL LETTER E",`
			`"LATIN SMALL LETTER D",`
			`"SPACE",`
			`"LATIN SMALL LETTER f",`
			`"LATIN CAPITAL LeTtEr o",`
			`"LATIN SMaLl LETTER x",`
			`"SPACE",`
			`"LATIN SMALL LETTER A",`
			`"LATIN SMALL LETTER T",`
			`"LATIN SMALL LETTER E",`
			`"SPACE",`
			`"LATIN SMALL LETTER T",`
			`"LATIN SMALL LETTER H",`
			`"LATIN SMALL LETTER E",`
			`"SpAcE",`
			`"LATIN SMALL LETTER S",`
			`"LATIN SMALL LETTER H",`
			`"LATIN small LETTER e",`
			`"LATIN small LETTER e",`
			`"LATIN SMALL LETTER P",`
			`"FULL STOP"`
			`]`
			`string = u"The rEd fOx ate the sheep."`

			`self.assertEqual(`
			`u"".join([self.checkletter(*args) for args in zip(chars, string)]),`
			`string`
			`)`

			`def test_ascii_letters(self):`
			`import unicodedata`

			`for char in "".join(map(chr, xrange(ord("a"), ord("z")))):`
			`name = "LATIN SMALL LETTER %s" % char.upper()`
			`code = unicodedata.lookup(name)`
			`self.assertEqual(unicodedata.name(code), name)`

			`def test_hangul_syllables(self):`
			`self.checkletter("HANGUL SYLLABLE GA", u"\uac00")`
			`self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")`
			`self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")`
			`self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")`
			`self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")`
			`self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")`
			`self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")`
			`self.checkletter("HANGUL SYLLABLE YI", u"\uc758")`
			`self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")`
			`self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")`
			`self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")`
			`self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")`
			`self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")`

			`import unicodedata`
			`self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")`

			`def test_cjk_unified_ideographs(self):`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")`

			`def test_bmp_characters(self):`
			`import unicodedata`
			`count = 0`
			`for code in xrange(0x10000):`
			`char = unichr(code)`
			`name = unicodedata.name(char, None)`
			`if name is not None:`
			`self.assertEqual(unicodedata.lookup(name), char)`
			`count += 1`

			`def test_misc_symbols(self):`
			`self.checkletter("PILCROW SIGN", u"\u00b6")`
			`self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")`
			`self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")`
			`self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")`

			`def test_errors(self):`
			`import unicodedata`
			`self.assertRaises(TypeError, unicodedata.name)`
			`self.assertRaises(TypeError, unicodedata.name, u'xx')`
			`self.assertRaises(TypeError, unicodedata.lookup)`
			`self.assertRaises(KeyError, unicodedata.lookup, u'unknown')`

			`def test_strict_eror_handling(self):`
			`# bogus character name`
			`self.assertRaises(`
			`UnicodeError,`
			`unicode, "\\N{blah}", 'unicode-escape', 'strict'`
			`)`
			`# long bogus character name`
			`self.assertRaises(`
			`UnicodeError,`
			`unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'`
			`)`
			`# missing closing brace`
			`self.assertRaises(`
			`UnicodeError,`
			`unicode, "\\N{SPACE", 'unicode-escape', 'strict'`
			`)`
			`# missing opening brace`
			`self.assertRaises(`
			`UnicodeError,`
			`unicode, "\\NSPACE", 'unicode-escape', 'strict'`
			`)`

Issue #20532: Tests which use _testcapi now are marked as CPython only. 2014-02-07 04:06:05 -04:00			`@test_support.cpython_only`
			`@unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")`
			`@unittest.skipUnless(UINT_MAX < sys.maxint, "needs UINT_MAX < sys.maxint")`
			`@test_support.bigmemtest(minsize=UINT_MAX + 1,`
Don't run the test for issue #16335 when -M is not specified. Increase memory limit in this test. 2013-01-25 04:03:12 -04:00			`memuse=2 + 4 // len(u'\U00010000'))`
Add bigmemtest decorator to test of issue #16335. 2013-01-21 14:23:01 -04:00			`def test_issue16335(self, size):`
Don't run the test for issue #16335 when -M is not specified. Increase memory limit in this test. 2013-01-25 04:03:12 -04:00			`func = self.test_issue16335`
			`if size < func.minsize:`
			`raise unittest.SkipTest("not enough memory: %.1fG minimum needed" %`
			`(func.minsize * func.memuse / float(1024**3),))`
Issue #16335: Fix integer overflow in unicode-escape decoder. 2013-01-21 05:48:24 -04:00			`# very very long bogus character name`
Issue #20532: Tests which use _testcapi now are marked as CPython only. 2014-02-07 04:06:05 -04:00			`x = b'\\N{SPACE' + b'x' * int(UINT_MAX + 1) + b'}'`
			`self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))`
Add bigmemtest decorator to test of issue #16335. 2013-01-21 14:23:01 -04:00			`self.assertRaisesRegexp(UnicodeError,`
			`'unknown Unicode character name',`
			`x.decode, 'unicode-escape'`
			`)`
Issue #16335: Fix integer overflow in unicode-escape decoder. 2013-01-21 05:48:24 -04:00

Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 10:49:41 -04:00			`def test_main():`
Combine the functionality of test_support.run_unittest() and test_support.run_classtests() into run_unittest() and use it wherever possible. Also don't use "from test.test_support import ...", but "from test import test_support" in a few spots. From SF patch #662807. 2003-05-01 14:45:56 -03:00			`test_support.run_unittest(UnicodeNamesTest)`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 10:49:41 -04:00
			`if __name__ == "__main__":`
			`test_main()`