cpython/Lib/test/test_unicode.py

# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
from test.test_support import verify, verbose, TestFailed
import sys, string

if not sys.platform.startswith('java'):
    # Test basic sanity of repr()
    verify(repr(u'abc') == "u'abc'")
    verify(repr(u'ab\\c') == "u'ab\\\\c'")
    verify(repr(u'ab\\') == "u'ab\\\\'")
    verify(repr(u'\\c') == "u'\\\\c'")
    verify(repr(u'\\') == "u'\\\\'")
    verify(repr(u'\n') == "u'\\n'")
    verify(repr(u'\r') == "u'\\r'")
    verify(repr(u'\t') == "u'\\t'")
    verify(repr(u'\b') == "u'\\x08'")
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'") == '''u"'"''')
    verify(repr(u'"') == """u'"'""")
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    testrepr = repr(u''.join(map(unichr, range(256))))
    verify(testrepr == latin1repr)

def test(method, input, output, *args):
    if verbose:
        print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
    try:
        f = getattr(input, method)
        value = apply(f, args)
    except:
        value = sys.exc_type
        exc = sys.exc_info()[:2]
    else:
        exc = None
    if value == output and type(value) is type(output):
        # if the original is returned make sure that
        # this doesn't happen with subclasses
        if value is input:
            class usub(unicode):
                def __repr__(self):
                    return 'usub(%r)' % unicode.__repr__(self)
            input = usub(input)
            try:
                f = getattr(input, method)
                value = apply(f, args)
            except:
                value = sys.exc_type
                exc = sys.exc_info()[:2]
            if value is input:
                if verbose:
                    print 'no'
                print '*',f, `input`, `output`, `value`
                return
    if value != output or type(value) is not type(output):
        if verbose:
            print 'no'
        print '*',f, `input`, `output`, `value`
        if exc:
            print '  value == %s: %s' % (exc)
    else:
        if verbose:
            print 'yes'

test('capitalize', u' hello ', u' hello ')
test('capitalize', u'Hello ', u'Hello ')
test('capitalize', u'hello ', u'Hello ')
test('capitalize', u'aaaa', u'Aaaa')
test('capitalize', u'AaAa', u'Aaaa')

test('count', u'aaa', 3, u'a')
test('count', u'aaa', 0, u'b')
test('count', 'aaa', 3, u'a')
test('count', 'aaa', 0, u'b')
test('count', u'aaa', 3, 'a')
test('count', u'aaa', 0, 'b')

test('title', u' hello ', u' Hello ')
test('title', u'Hello ', u'Hello ')
test('title', u'hello ', u'Hello ')
test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
test('title', u"getInt", u'Getint')

test('find', u'abcdefghiabc', 0, u'abc')
test('find', u'abcdefghiabc', 9, u'abc', 1)
test('find', u'abcdefghiabc', -1, u'def', 4)

test('rfind', u'abcdefghiabc', 9, u'abc')

test('lower', u'HeLLo', u'hello')
test('lower', u'hello', u'hello')

test('upper', u'HeLLo', u'HELLO')
test('upper', u'HELLO', u'HELLO')

if 0:
    transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'

    test('maketrans', u'abc', transtable, u'xyz')
    test('maketrans', u'abc', ValueError, u'xyzq')

test('split', u'this is the split function',
     [u'this', u'is', u'the', u'split', u'function'])
test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
test('split', u'a b c d', [u'a', u'b c d'], None, 1)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
test('split', u'a b c d', [u'a b c d'], None, 0)
test('split', u'a  b  c  d', [u'a', u'b', u'c  d'], None, 2)
test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'endcase test', [u'endcase ', u''], u'test')
test('split', u'endcase test', [u'endcase ', u''], 'test')
test('split', 'endcase test', [u'endcase ', u''], u'test')


# join now works with any sequence type
class Sequence:
    """Minimal wrapper that exposes only len() and indexing of ``seq``."""

    def __init__(self, seq):
        self.seq = seq

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, i):
        return self.seq[i]

test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
test('join', u' ', u'w x y z', Sequence('wxyz'))
test('join', u' ', TypeError, 7)
test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
test('join', ' ', u'w x y z', Sequence(u'wxyz'))
test('join', ' ', TypeError, 7)

result = u''
for i in range(10):
    if i > 0:
        result = result + u':'
    result = result + u'x'*10
test('join', u':', result, [u'x' * 10] * 10)
test('join', u':', result, (u'x' * 10,) * 10)

test('strip', u'   hello   ', u'hello')
test('lstrip', u'   hello   ', u'hello   ')
test('rstrip', u'   hello   ', u'   hello')
test('strip', u'hello', u'hello')

# strip/lstrip/rstrip with None arg
test('strip', u'   hello   ', u'hello', None)
test('lstrip', u'   hello   ', u'hello   ', None)
test('rstrip', u'   hello   ', u'   hello', None)
test('strip', u'hello', u'hello', None)

# strip/lstrip/rstrip with unicode arg
test('strip', u'xyzzyhelloxyzzy', u'hello', u'xyz')
test('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', u'xyz')
test('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', u'xyz')
test('strip', u'hello', u'hello', u'xyz')

# strip/lstrip/rstrip with str arg
test('strip', u'xyzzyhelloxyzzy', u'hello', 'xyz')
test('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', 'xyz')
test('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', 'xyz')
test('strip', u'hello', u'hello', 'xyz')

test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')

if 0:
    test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')

    table = string.maketrans('a', u'A')
    test('translate', u'abc', u'Abc', table)
    test('translate', u'xyz', u'xyz', table)

test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)

test('startswith', u'hello', True, u'he')
test('startswith', u'hello', True, u'hello')
test('startswith', u'hello', False, u'hello world')
test('startswith', u'hello', True, u'')
test('startswith', u'hello', False, u'ello')
test('startswith', u'hello', True, u'ello', 1)
test('startswith', u'hello', True, u'o', 4)
test('startswith', u'hello', False, u'o', 5)
test('startswith', u'hello', True, u'', 5)
test('startswith', u'hello', False, u'lo', 6)
test('startswith', u'helloworld', True, u'lowo', 3)
test('startswith', u'helloworld', True, u'lowo', 3, 7)
test('startswith', u'helloworld', False, u'lowo', 3, 6)

test('endswith', u'hello', True, u'lo')
test('endswith', u'hello', False, u'he')
test('endswith', u'hello', True, u'')
test('endswith', u'hello', False, u'hello world')
test('endswith', u'helloworld', False, u'worl')
test('endswith', u'helloworld', True, u'worl', 3, 9)
test('endswith', u'helloworld', True, u'world', 3, 12)
test('endswith', u'helloworld', True, u'lowo', 1, 7)
test('endswith', u'helloworld', True, u'lowo', 2, 7)
test('endswith', u'helloworld', True, u'lowo', 3, 7)
test('endswith', u'helloworld', False, u'lowo', 4, 7)
test('endswith', u'helloworld', False, u'lowo', 3, 8)
test('endswith', u'ab', False, u'ab', 0, 1)
test('endswith', u'ab', False, u'ab', 0, 0)

test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi')
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi', 8)
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab  def\ng   hi', 4)
test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab  def\ng   hi', 4)
test('expandtabs', u'abc\r\nab\r\ndef\ng\r\nhi', u'abc\r\nab\r\ndef\ng\r\nhi', 4)

if 0:
    test('capwords', u'abc def ghi', u'Abc Def Ghi')
    test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
    test('capwords', u'abc\t   def  \nghi', u'Abc Def Ghi')

test('zfill', u'123', u'123', 2)
test('zfill', u'123', u'123', 3)
test('zfill', u'123', u'0123', 4)
test('zfill', u'+123', u'+123', 3)
test('zfill', u'+123', u'+123', 4)
test('zfill', u'+123', u'+0123', 5)
test('zfill', u'-123', u'-123', 3)
test('zfill', u'-123', u'-123', 4)
test('zfill', u'-123', u'-0123', 5)
test('zfill', u'', u'000', 3)
test('zfill', u'34', u'34', 1)
test('zfill', u'34', u'00034', 5)

# Comparisons:
print 'Testing Unicode comparisons...',
verify(u'abc' == 'abc')
verify('abc' == u'abc')
verify(u'abc' == u'abc')
verify(u'abcd' > 'abc')
verify('abcd' > u'abc')
verify(u'abcd' > u'abc')
verify(u'abc' < 'abcd')
verify('abc' < u'abcd')
verify(u'abc' < u'abcd')
print 'done.'

if 0:
    # Move these tests to a Unicode collation module test...

    print 'Testing UTF-16 code point order comparisons...',
    #No surrogates, no fixup required.
    verify(u'\u0061' < u'\u20ac')
    # Non surrogate below surrogate value, no fixup required
    verify(u'\u0061' < u'\ud800\udc02')

    # Non surrogate above surrogate value, fixup required
    def test_lecmp(s, s2):
        """Verify that s compares strictly less than s2."""
        is_less = s < s2
        reason = "comparison failed on %s < %s" % (s, s2)
        verify(is_less, reason)

    def test_fixup(s):
        """Verify that s compares less than each of 16 surrogate pairs.

        Every pair is one of the high surrogates D800, D900, DA00, DB00
        followed by one of the low surrogates DC01, DD01, DE01, DFFF.
        """
        surrogate_pairs = (
            u'\ud800\udc01', u'\ud900\udc01', u'\uda00\udc01', u'\udb00\udc01',
            u'\ud800\udd01', u'\ud900\udd01', u'\uda00\udd01', u'\udb00\udd01',
            u'\ud800\ude01', u'\ud900\ude01', u'\uda00\ude01', u'\udb00\ude01',
            u'\ud800\udfff', u'\ud900\udfff', u'\uda00\udfff', u'\udb00\udfff',
        )
        for s2 in surrogate_pairs:
            test_lecmp(s, s2)

    test_fixup(u'\ue000')
    test_fixup(u'\uff61')

    # Surrogates on both sides, no fixup required
    verify(u'\ud800\udc02' < u'\ud84d\udc56')
    print 'done.'

test('ljust', u'abc',  u'abc       ', 10)
test('rjust', u'abc',  u'       abc', 10)
test('center', u'abc', u'   abc    ', 10)
test('ljust', u'abc',  u'abc   ', 6)
test('rjust', u'abc',  u'   abc', 6)
test('center', u'abc', u' abc  ', 6)
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)

test('islower', u'a', True)
test('islower', u'A', False)
test('islower', u'\n', False)
test('islower', u'\u1FFc', False)
test('islower', u'abc', True)
test('islower', u'aBc', False)
test('islower', u'abc\n', True)

test('isupper', u'a', False)
test('isupper', u'A', True)
test('isupper', u'\n', False)
if sys.platform[:4] != 'java':
    test('isupper', u'\u1FFc', False)
test('isupper', u'ABC', True)
test('isupper', u'AbC', False)
test('isupper', u'ABC\n', True)

test('istitle', u'a', False)
test('istitle', u'A', True)
test('istitle', u'\n', False)
test('istitle', u'\u1FFc', True)
test('istitle', u'A Titlecased Line', True)
test('istitle', u'A\nTitlecased Line', True)
test('istitle', u'A Titlecased, Line', True)
test('istitle', u'Greek \u1FFcitlecases ...', True)
test('istitle', u'Not a capitalized String', False)
test('istitle', u'Not\ta Titlecase String', False)
test('istitle', u'Not--a Titlecase String', False)

test('isalpha', u'a', True)
test('isalpha', u'A', True)
test('isalpha', u'\n', False)
test('isalpha', u'\u1FFc', True)
test('isalpha', u'abc', True)
test('isalpha', u'aBc123', False)
test('isalpha', u'abc\n', False)

test('isalnum', u'a', True)
test('isalnum', u'A', True)
test('isalnum', u'\n', False)
test('isalnum', u'123abc456', True)
test('isalnum', u'a1b3c', True)
test('isalnum', u'aBc000 ', False)
test('isalnum', u'abc\n', False)

test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], True)

test('translate', u"abababc", u'bbbc', {ord('a'):None})
test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})

# Contains:
print 'Testing Unicode contains method...',
verify(('a' in u'abdb') == 1)
verify(('a' in u'bdab') == 1)
verify(('a' in u'bdaba') == 1)
verify(('a' in u'bdba') == 1)
verify(('a' in u'bdba') == 1)
verify((u'a' in u'bdba') == 1)
verify((u'a' in u'bdb') == 0)
verify((u'a' in 'bdb') == 0)
verify((u'a' in 'bdba') == 1)
verify((u'a' in ('a',1,None)) == 1)
verify((u'a' in (1,None,'a')) == 1)
verify((u'a' in (1,None,u'a')) == 1)
verify(('a' in ('a',1,None)) == 1)
verify(('a' in (1,None,'a')) == 1)
verify(('a' in (1,None,u'a')) == 1)
verify(('a' in ('x',1,u'y')) == 0)
verify(('a' in ('x',1,None)) == 0)
print 'done.'

# Formatting:
print 'Testing Unicode formatting strings...',
verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000,  3.00')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000,  3.00')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000,  3.50')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000,  3.57')
verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
verify(u"%c" % (u"a",) == u'a')
verify(u"%c" % ("a",) == u'a')
verify(u"%c" % (34,) == u'"')
verify(u"%c" % (36,) == u'$')
if sys.platform[:4] != 'java':
    value = u"%r, %r" % (u"abc", "abc")
    if value != u"u'abc', 'abc'":
        print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'

verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
try:
    value = u"%(x)s, %(<28>)s" % {'x':u"abc", u'<EFBFBD>':"def"}
except KeyError:
    print '*** formatting failed for "%s"' % "u'abc, def'"
else:
    verify(value == u'abc, def')

# formatting jobs delegated from the string implementation:
verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
verify('...%(foo)s...' % {u'foo':u"abc",'def':123} ==  u'...abc...')
verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
verify('...%s...' % u"abc" == u'...abc...')
verify('%*s' % (5,u'abc',) == u'  abc')
verify('%*s' % (-5,u'abc',) == u'abc  ')
verify('%*.*s' % (5,2,u'abc',) == u'   ab')
verify('%*.*s' % (5,3,u'abc',) == u'  abc')
verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10   abc')
verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103   abc')
print 'done.'

print 'Testing builtin unicode()...',

# unicode(obj) tests (this maps to PyObject_Unicode() at C level)

verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')

class UnicodeSubclass(unicode):
    """Trivial unicode subclass that adds no behaviour of its own."""

verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
       == u'unicode subclass becomes unicode')

verify(unicode('strings are converted to unicode')
       == u'strings are converted to unicode')

class UnicodeCompat:
    """Wrap a value and hand it back from the __unicode__() hook.

    No __str__() is defined, so __unicode__() is the only conversion
    hook this class provides.
    """
    def __init__(self, x):
        # x is stored as-is and returned unchanged by __unicode__().
        self.x = x
    def __unicode__(self):
        return self.x

verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
       == u'__unicode__ compatible objects are recognized')

class StringCompat:
    """Wrap a value and hand it back from the __str__() hook.

    No __unicode__() is defined, so __str__() is the only conversion
    hook this class provides.
    """
    def __init__(self, x):
        # x is stored as-is and returned unchanged by __str__().
        self.x = x
    def __str__(self):
        return self.x

verify(unicode(StringCompat('__str__ compatible objects are recognized'))
       == u'__str__ compatible objects are recognized')

# unicode(obj) is compatible to str():

o = StringCompat('unicode(obj) is compatible to str()')
verify(unicode(o) == u'unicode(obj) is compatible to str()')
verify(str(o) == 'unicode(obj) is compatible to str()')

for obj in (123, 123.45, 123L):
    verify(unicode(obj) == unicode(str(obj)))

# unicode(obj, encoding, error) tests (this maps to
# PyUnicode_FromEncodedObject() at C level)

if not sys.platform.startswith('java'):
    try:
        unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
    except TypeError:
        pass
    else:
        raise TestFailed, "decoding unicode should NOT be supported"

verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
       == u'strings are decoded to unicode')

if not sys.platform.startswith('java'):
    verify(unicode(buffer('character buffers are decoded to unicode'),
                   'utf-8', 'strict')
           == u'character buffers are decoded to unicode')

print 'done.'

# Test builtin codecs
print 'Testing builtin codecs...',

# UTF-7 specific encoding tests:
utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'),  # RFC2152 example
 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
 (u'+', '+-'),
 (u'+-', '+--'),
 (u'+?', '+-?'),
 (u'\?', '+AFw?'),
 (u'+?', '+-?'),
 (ur'\\?', '+AFwAXA?'),
 (ur'\\\?', '+AFwAXABc?'),
 (ur'++--', '+-+---')]

for x,y in utfTests:
    verify( x.encode('utf-7') == y )

try:
    unicode('+3ADYAA-', 'utf-7') # surrogates not supported
except UnicodeError:
    pass
else:
    raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"

verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')

# UTF-8 specific encoding tests:
verify(u''.encode('utf-8') == '')
verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80')
verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80')
verify((u'\ud800\udc02'*1000).encode('utf-8') ==
       '\xf0\x90\x80\x82'*1000)
verify(u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
       u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
       u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
       u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
       u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
       u' Nunstuck git und'.encode('utf-8') ==
       '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
       '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
       '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
       '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
       '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
       '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
       '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
       '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
       '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
       '\xe3\x80\x8cWenn ist das Nunstuck git und')

# UTF-8 specific decoding tests
verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' )
verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' )
verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' )

# Other possible utf-8 test cases:
# * strict decoding testing for all of the
#   UTF8_ERROR cases in PyUnicode_DecodeUTF8

verify(unicode('hello','ascii') == u'hello')
verify(unicode('hello','utf-8') == u'hello')
verify(unicode('hello','utf8') == u'hello')
verify(unicode('hello','latin-1') == u'hello')

# Error handling
try:
    u'Andr\202 x'.encode('ascii')
    u'Andr\202 x'.encode('ascii','strict')
except ValueError:
    pass
else:
    raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")

try:
    unicode('Andr\202 x','ascii')
    unicode('Andr\202 x','ascii','strict')
except ValueError:
    pass
else:
    raise TestFailed, "unicode('Andr\202') failed to raise an exception"
verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')

verify("\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx")
try:
    "\\".decode("unicode-escape")
except ValueError:
    pass
else:
    raise TestFailed, '"\\".decode("unicode-escape") should fail'

verify(u'hello'.encode('ascii') == 'hello')
verify(u'hello'.encode('utf-7') == 'hello')
verify(u'hello'.encode('utf-8') == 'hello')
verify(u'hello'.encode('utf8') == 'hello')
verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
verify(u'hello'.encode('latin-1') == 'hello')

# Roundtrip safety for BMP (just the first 1024 chars)
u = u''.join(map(unichr, range(1024)))
for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)

# Roundtrip safety for BMP (just the first 256 chars)
u = u''.join(map(unichr, range(256)))
for encoding in (
    'latin-1',
    ):
    try:
        verify(unicode(u.encode(encoding),encoding) == u)
    except TestFailed:
        print '*** codec "%s" failed round-trip' % encoding
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

# Roundtrip safety for BMP (just the first 128 chars)
u = u''.join(map(unichr, range(128)))
for encoding in (
    'ascii',
    ):
    try:
        verify(unicode(u.encode(encoding),encoding) == u)
    except TestFailed:
        print '*** codec "%s" failed round-trip' % encoding
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

# Roundtrip safety for non-BMP (just a few chars)
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
for encoding in ('utf-8',
                 'utf-16', 'utf-16-le', 'utf-16-be',
                 #'raw_unicode_escape',
                 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)

# UTF-8 must be roundtrip safe for all UCS-2 code points
u = u''.join(map(unichr, range(0x10000)))
for encoding in ('utf-8',):
    verify(unicode(u.encode(encoding),encoding) == u)

print 'done.'

print 'Testing standard mapping codecs...',

print '0-127...',
s = ''.join(map(chr, range(128)))
for encoding in (
    'cp037', 'cp1026',
    'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
    'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
    'cp863', 'cp865', 'cp866',
    'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
    'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
    'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
    'mac_cyrillic', 'mac_latin2',

    'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
    'cp1256', 'cp1257', 'cp1258',
    'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

    'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
    'cp1006', 'iso8859_8',

    ### These have undefined mappings:
    #'cp424',

    ### These fail the round-trip:
    #'cp875'

    ):
    try:
        verify(unicode(s,encoding).encode(encoding) == s)
    except TestFailed:
        print '*** codec "%s" failed round-trip' % encoding
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

print '128-255...',
s = ''.join(map(chr, range(128,256)))
for encoding in (
    'cp037', 'cp1026',
    'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
    'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
    'cp863', 'cp865', 'cp866',
    'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
    'iso8859_2', 'iso8859_4', 'iso8859_5',
    'iso8859_9', 'koi8_r', 'latin_1',
    'mac_cyrillic', 'mac_latin2',

    ### These have undefined mappings:
    #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
    #'cp1256', 'cp1257', 'cp1258',
    #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
    #'iso8859_3', 'iso8859_6', 'iso8859_7',
    #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

    ### These fail the round-trip:
    #'cp1006', 'cp875', 'iso8859_8',

    ):
    try:
        verify(unicode(s,encoding).encode(encoding) == s)
    except TestFailed:
        print '*** codec "%s" failed round-trip' % encoding
    except ValueError,why:
        print '*** codec for "%s" failed: %s' % (encoding, why)

print 'done.'

print 'Testing Unicode string concatenation...',
verify((u"abc" u"def") == u"abcdef")
verify(("abc" u"def") == u"abcdef")
verify((u"abc" "def") == u"abcdef")
verify((u"abc" u"def" "ghi") == u"abcdefghi")
verify(("abc" "def" u"ghi") == u"abcdefghi")
print 'done.'

print 'Testing Unicode printing...',
print u'abc'
print u'abc', u'def'
print u'abc', 'def'
print 'abc', u'def'
print u'abc\n'
print u'abc\n',
print u'abc\n',
print u'def\n'
print u'def\n'
print 'done.'
-												Add encoding declaration.

											
										
										
											2002-08-04 14:28:33 -03:00
+								# -*- coding: iso-8859-1 -*-
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								""" Test script for the Unicode implementation.
 								Written by Marc-Andre Lemburg (mal@lemburg.com).
 								(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								"""#"
-												Get rid of relative imports in all unittests.  Now anything that
imports e.g. test_support must do so using an absolute package name
such as "import test.test_support" or "from test import test_support".

This also updates the README in Lib/test, and gets rid of the
duplicate data directory in Lib/test/data (replaced by
Lib/email/test/data).

Now Tim and Jack can have at it. :)

											
										
										
											2002-07-23 16:04:11 -03:00
+								from test.test_support import verify, verbose, TestFailed
-												As part of fixing bug #536241, add a test case for string.zfill() with Unicode

											
										
										
											2002-03-29 12:21:44 -04:00
+								import sys, string
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
-												Skipping some tests by adding the usual jython conditional test around:

- the repr of unicode. Jython only adds the u'' if the string contains char
  values > 255.
- A unicode arg to unicode() is perfectly valid in jython.
- A test buffer() test. No buffer() on Jython

This closes patch "[ #490920 ] Jython and test_unicode".

											
										
										
											2001-12-10 16:57:34 -04:00
+								if not sys.platform.startswith('java'):
 								    # Test basic sanity of repr()
 								    verify(repr(u'abc') == "u'abc'")
 								    verify(repr(u'ab\\c') == "u'ab\\\\c'")
 								    verify(repr(u'ab\\') == "u'ab\\\\'")
 								    verify(repr(u'\\c') == "u'\\\\c'")
 								    verify(repr(u'\\') == "u'\\\\'")
 								    verify(repr(u'\n') == "u'\\n'")
 								    verify(repr(u'\r') == "u'\\r'")
 								    verify(repr(u'\t') == "u'\\t'")
 								    verify(repr(u'\b') == "u'\\x08'")
 								    verify(repr(u"'\"") == """u'\\'"'""")
 								    verify(repr(u"'\"") == """u'\\'"'""")
 								    verify(repr(u"'") == '''u"'"''')
 								    verify(repr(u'"') == """u'"'""")
-												Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception which causes round-trips to
fail).

Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we better make sure it works for
all Unicode code points, including isolated surrogates).

Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates which now raise an exception.

											
										
										
											2002-02-07 07:33:49 -04:00
+								    latin1repr = (
 								        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
 								        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
 								        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
 								        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
 								        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
 								        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
 								        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
 								        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
 								        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
 								        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
 								        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
 								        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
 								        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
 								        "\\xfe\\xff'")
 								    testrepr = repr(u''.join(map(unichr, range(256))))
 								    verify(testrepr == latin1repr)
-												Test basic functioning of unicode repr().  (If this breaks Jython,
please let me know and we'll figure out how to fix the test.)

											
										
										
											2001-09-21 12:36:41 -03:00
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								def test(method, input, output, *args):
 								    if verbose:
-												Slight improvement to Unicode test suite, inspired by patch #102563:
also test join method of 8-bit strings.

Also changed the test() function to (1) compare the types of the
expected and actual result, and (2) in verbose mode, print the repr()
of the output.

											
										
										
											2000-11-29 08:13:59 -04:00
+								        print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								    try:
 								        f = getattr(input, method)
 								        value = apply(f, args)
 								    except:
 								        value = sys.exc_type
-												Get rid of memory leak caused by assingning sys.exc_info() to a local.
Store sys.exc_info()[:2] instead.

											
										
										
											2000-04-28 17:39:58 -03:00
+								        exc = sys.exc_info()[:2]
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								    else:
 								        exc = None
-												Apply diff3.txt from SF patch http://www.python.org/sf/536241

If a str or unicode method returns the original object,
make sure that for str and unicode subclasses the original
will not be returned.

This should prevent SF bug http://www.python.org/sf/460020
from reappearing.

											
										
										
											2002-04-17 18:34:05 -03:00
+								    if value == output and type(value) is type(output):
 								        # if the original is returned make sure that
 								        # this doesn't happen with subclasses
 								        if value is input:
 								            class usub(unicode):
 								                def __repr__(self):
 								                    return 'usub(%r)' % unicode.__repr__(self)
 								            input = usub(input)
 								            try:
 								                f = getattr(input, method)
 								                value = apply(f, args)
 								            except:
 								                value = sys.exc_type
 								                exc = sys.exc_info()[:2]
 								            if value is input:
 								                if verbose:
-												Whitespace normalization.

											
										
										
											2002-05-23 12:15:30 -03:00
+								                    print 'no'
-												Apply diff3.txt from SF patch http://www.python.org/sf/536241

If a str or unicode method returns the original object,
make sure that for str and unicode subclasses the original
will not be returned.

This should prevent SF bug http://www.python.org/sf/460020
from reappearing.

											
										
										
											2002-04-17 18:34:05 -03:00
+								                print '*',f, `input`, `output`, `value`
 								                return
-												Slight improvement to Unicode test suite, inspired by patch #102563:
also test join method of 8-bit strings.

Also changed the test() function to (1) compare the types of the
expected and actual result, and (2) in verbose mode, print the repr()
of the output.

											
										
										
											2000-11-29 08:13:59 -04:00
+								    if value != output or type(value) is not type(output):
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								        if verbose:
 								            print 'no'
 								        print '*',f, `input`, `output`, `value`
 								        if exc:
-												Get rid of memory leak caused by assigning sys.exc_info() to a local.
Store sys.exc_info()[:2] instead.

											
										
										
											2000-04-28 17:39:58 -03:00
+								            print '  value == %s: %s' % (exc)
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								    else:
 								        if verbose:
 								            print 'yes'
 								test('capitalize', u' hello ', u' hello ')
-												Apply diff3.txt from SF patch http://www.python.org/sf/536241

If a str or unicode method returns the original object,
make sure that for str and unicode subclasses the original
will not be returned.

This should prevent SF bug http://www.python.org/sf/460020
from reappearing.

											
										
										
											2002-04-17 18:34:05 -03:00
+								test('capitalize', u'Hello ', u'Hello ')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('capitalize', u'hello ', u'Hello ')
-												Fixed .capitalize() method of Unicode objects to work like the
corresponding string method. Added tests for this too.

Patch written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-29 07:14:16 -04:00
+								test('capitalize', u'aaaa', u'Aaaa')
 								test('capitalize', u'AaAa', u'Aaaa')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
-												Added checks to prevent PyUnicode_Count() from dumping core
in case the parameters are out of bounds and fixes error handling
for .count(), .startswith() and .endswith() for the case of
mixed string/Unicode objects.

This patch adds Python style index semantics to PyUnicode_Count()
indices (including the special handling of negative indices).

The patch is an extended version of patch #103249 submitted
by Michael Hudson (mwh) on SF. It also includes new test cases.

											
										
										
											2001-01-16 07:54:12 -04:00
+								test('count', u'aaa', 3, u'a')
 								test('count', u'aaa', 0, u'b')
 								test('count', 'aaa', 3, u'a')
 								test('count', 'aaa', 0, u'b')
 								test('count', u'aaa', 3, 'a')
 								test('count', u'aaa', 0, 'b')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('title', u' hello ', u' Hello ')
-												Apply diff3.txt from SF patch http://www.python.org/sf/536241

If a str or unicode method returns the original object,
make sure that for str and unicode subclasses the original
will not be returned.

This should prevent SF bug http://www.python.org/sf/460020
from reappearing.

											
										
										
											2002-04-17 18:34:05 -03:00
+								test('title', u'Hello ', u'Hello ')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('title', u'hello ', u'Hello ')
 								test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
 								test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
 								test('title', u"getInt", u'Getint')
 								test('find', u'abcdefghiabc', 0, u'abc')
 								test('find', u'abcdefghiabc', 9, u'abc', 1)
 								test('find', u'abcdefghiabc', -1, u'def', 4)
 								test('rfind', u'abcdefghiabc', 9, u'abc')
 								test('lower', u'HeLLo', u'hello')
 								test('lower', u'hello', u'hello')
 								test('upper', u'HeLLo', u'HELLO')
 								test('upper', u'HELLO', u'HELLO')
 								if 0:
 								    transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
 								    test('maketrans', u'abc', transtable, u'xyz')
 								    test('maketrans', u'abc', ValueError, u'xyzq')
 								test('split', u'this is the split function',
 								     [u'this', u'is', u'the', u'split', u'function'])
 								test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
 								test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
 								test('split', u'a b c d', [u'a', u'b c d'], None, 1)
 								test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
 								test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
 								test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
 								test('split', u'a b c d', [u'a b c d'], None, 0)
 								test('split', u'a  b  c  d', [u'a', u'b', u'c  d'], None, 2)
 								test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
-												Test more split argument combinations:
1) multi-char separator
2) multi-char separator that only occurs at last position
3) all of the above with mixed Unicode and 8-bit-string arguments

											
										
										
											2000-12-18 22:22:31 -04:00
+								test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
 								test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
 								test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
 								test('split', u'endcase test', [u'endcase ', u''], u'test')
 								test('split', u'endcase test', [u'endcase ', u''], 'test')
 								test('split', 'endcase test', [u'endcase ', u''], u'test')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
 								# join now works with any sequence type
 								class Sequence:
-												Slight improvement to Unicode test suite, inspired by patch #102563:
also test join method of 8-bit strings.

Also changed the test() function to (1) compare the types of the
expected and actual result, and (2) in verbose mode, print the repr()
of the output.

											
										
										
											2000-11-29 08:13:59 -04:00
+								    def __init__(self, seq): self.seq = seq
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								    def __len__(self): return len(self.seq)
 								    def __getitem__(self, i): return self.seq[i]
 								test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
-												Slight improvement to Unicode test suite, inspired by patch #102563:
also test join method of 8-bit strings.

Also changed the test() function to (1) compare the types of the
expected and actual result, and (2) in verbose mode, print the repr()
of the output.

											
										
										
											2000-11-29 08:13:59 -04:00
+								test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
-												Slight improvement to Unicode test suite, inspired by patch #102563:
also test join method of 8-bit strings.

Also changed the test() function to (1) compare the types of the
expected and actual result, and (2) in verbose mode, print the repr()
of the output.

											
										
										
											2000-11-29 08:13:59 -04:00
+								test('join', u' ', u'w x y z', Sequence('wxyz'))
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('join', u' ', TypeError, 7)
-												Slight improvement to Unicode test suite, inspired by patch #102563:
also test join method of 8-bit strings.

Also changed the test() function to (1) compare the types of the
expected and actual result, and (2) in verbose mode, print the repr()
of the output.

											
										
										
											2000-11-29 08:13:59 -04:00
+								test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
 								test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
 								test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
 								test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
 								test('join', ' ', u'w x y z', Sequence(u'wxyz'))
 								test('join', ' ', TypeError, 7)
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
 								# Build the expected join output by hand (without calling join itself):
 								# ten runs of ten u'x' characters, separated by u':'.
 								result = u''
 								for i in range(10):
 								    # No separator before the first run.
 								    if i > 0:
 								        result = result + u':'
 								    result = result + u'x'*10
 								# The same pieces are joined once from a list and once from a tuple.
 								test('join', u':', result, [u'x' * 10] * 10)
 								test('join', u':', result, (u'x' * 10,) * 10)
 								test('strip', u'   hello   ', u'hello')
 								test('lstrip', u'   hello   ', u'hello   ')
 								test('rstrip', u'   hello   ', u'   hello')
 								test('strip', u'hello', u'hello')
-												Apply patch diff.txt from SF feature request
http://www.python.org/sf/444708

This adds the optional argument for str.strip
to unicode.strip too and makes it possible
to call str.strip with a unicode argument
and unicode.strip with a str argument.

											
										
										
											2002-04-22 14:42:37 -03:00
+								# strip/lstrip/rstrip with None arg
 								test('strip', u'   hello   ', u'hello', None)
 								test('lstrip', u'   hello   ', u'hello   ', None)
 								test('rstrip', u'   hello   ', u'   hello', None)
 								test('strip', u'hello', u'hello', None)
 								# strip/lstrip/rstrip with unicode arg
 								test('strip', u'xyzzyhelloxyzzy', u'hello', u'xyz')
 								test('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', u'xyz')
 								test('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', u'xyz')
 								test('strip', u'hello', u'hello', u'xyz')
 								# strip/lstrip/rstrip with str arg
 								test('strip', u'xyzzyhelloxyzzy', u'hello', 'xyz')
 								test('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', 'xyz')
 								test('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', 'xyz')
 								test('strip', u'hello', u'hello', 'xyz')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
 								if 0:
 								    test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')
 								    table = string.maketrans('a', u'A')
 								    test('translate', u'abc', u'Abc', table)
 								    test('translate', u'xyz', u'xyz', table)
 								test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
-												On 17-Mar-2000, Marc-Andre Lemburg said:

    Attached you find an update of the Unicode implementation.

    The patch is against the current CVS version. I would appreciate
    if someone with CVS checkin permissions could check the changes
    in.

    The patch contains all bugs and patches sent this week and also
    fixes a leak in the codecs code and a bug in the free list code
    for Unicode objects (which only shows up when compiling Python
    with Py_DEBUG; thanks to MarkH for spotting this one).

											
										
										
											2000-03-20 12:36:48 -04:00
+								test('replace', u'one!two!three!', u'onetwothree', '!', '')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
 								test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
 								test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
 								test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
 								test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
 								test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
 								test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)
-												Add the 'bool' type and its values 'False' and 'True', as described in
PEP 285.  Everything described in the PEP is here, and there is even
some documentation.  I had to fix 12 unit tests; all but one of these
were printing Boolean outcomes that changed from 0/1 to False/True.
(The exception is test_unicode.py, which did a type(x) == type(y)
style comparison.  I could've fixed that with a single line using
issubtype(x, type(y)), but instead chose to be explicit about those
places where a bool is expected.)

Still to do: perhaps more documentation; change standard library
modules to return False/True from predicates.

											
										
										
											2002-04-03 18:41:51 -04:00
+								test('startswith', u'hello', True, u'he')
 								test('startswith', u'hello', True, u'hello')
 								test('startswith', u'hello', False, u'hello world')
 								test('startswith', u'hello', True, u'')
 								test('startswith', u'hello', False, u'ello')
 								test('startswith', u'hello', True, u'ello', 1)
 								test('startswith', u'hello', True, u'o', 4)
 								test('startswith', u'hello', False, u'o', 5)
 								test('startswith', u'hello', True, u'', 5)
 								test('startswith', u'hello', False, u'lo', 6)
 								test('startswith', u'helloworld', True, u'lowo', 3)
 								test('startswith', u'helloworld', True, u'lowo', 3, 7)
 								test('startswith', u'helloworld', False, u'lowo', 3, 6)
 								test('endswith', u'hello', True, u'lo')
 								test('endswith', u'hello', False, u'he')
 								test('endswith', u'hello', True, u'')
 								test('endswith', u'hello', False, u'hello world')
 								test('endswith', u'helloworld', False, u'worl')
 								test('endswith', u'helloworld', True, u'worl', 3, 9)
 								test('endswith', u'helloworld', True, u'world', 3, 12)
 								test('endswith', u'helloworld', True, u'lowo', 1, 7)
 								test('endswith', u'helloworld', True, u'lowo', 2, 7)
 								test('endswith', u'helloworld', True, u'lowo', 3, 7)
 								test('endswith', u'helloworld', False, u'lowo', 4, 7)
 								test('endswith', u'helloworld', False, u'lowo', 3, 8)
 								test('endswith', u'ab', False, u'ab', 0, 1)
 								test('endswith', u'ab', False, u'ab', 0, 0)
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
 								test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi')
 								test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi', 8)
 								test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab  def\ng   hi', 4)
 								test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab  def\ng   hi', 4)
-												Apply diff3.txt from SF patch http://www.python.org/sf/536241

If a str or unicode method returns the original object,
make sure that for str and unicode subclasses the original
will not be returned.

This should prevent SF bug http://www.python.org/sf/460020
from reappearing.

											
										
										
											2002-04-17 18:34:05 -03:00
+								test('expandtabs', u'abc\r\nab\r\ndef\ng\r\nhi', u'abc\r\nab\r\ndef\ng\r\nhi', 4)
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
 								if 0:
 								    test('capwords', u'abc def ghi', u'Abc Def Ghi')
 								    test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
 								    test('capwords', u'abc\t   def  \nghi', u'Abc Def Ghi')
-												Apply the second version of SF patch http://www.python.org/sf/536241

Add a method zfill to str, unicode and UserString and change
Lib/string.py accordingly.

This activates the zfill version in unicodeobject.c that was
commented out and implements the same in stringobject.c. It also
adds the test for unicode support in Lib/string.py back in and
uses repr() instead of str() (as it was before Lib/string.py 1.62)

											
										
										
											2002-04-15 10:36:47 -03:00
+								test('zfill', u'123', u'123', 2)
 								test('zfill', u'123', u'123', 3)
 								test('zfill', u'123', u'0123', 4)
 								test('zfill', u'+123', u'+123', 3)
 								test('zfill', u'+123', u'+123', 4)
 								test('zfill', u'+123', u'+0123', 5)
 								test('zfill', u'-123', u'-123', 3)
 								test('zfill', u'-123', u'-123', 4)
 								test('zfill', u'-123', u'-0123', 5)
 								test('zfill', u'', u'000', 3)
 								test('zfill', u'34', u'34', 1)
 								test('zfill', u'34', u'00034', 5)
-												As part of fixing bug #536241, add a test case for string.zfill() with Unicode

											
										
										
											2002-03-29 12:21:44 -04:00
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								# Comparisons:
 								print 'Testing Unicode comparisons...',
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(u'abc' == 'abc')
 								verify('abc' == u'abc')
 								verify(u'abc' == u'abc')
 								verify(u'abcd' > 'abc')
 								verify('abcd' > u'abc')
 								verify(u'abcd' > u'abc')
 								verify(u'abc' < 'abcd')
 								verify('abc' < u'abcd')
 								verify(u'abc' < u'abcd')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								print 'done.'
-												Removing UTF-16 aware Unicode comparison code. This kind of compare
function (together with other locale aware ones) should go into a new collation
support module. See python-dev for a discussion of this removal.

Note: This patch should also be applied to the 1.6 branch.

											
										
										
											2000-08-08 05:04:29 -03:00
+								if 0:
 								    # Move these tests to a Unicode collation module test...
 								    print 'Testing UTF-16 code point order comparisons...',
 								    #No surrogates, no fixup required.
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								    verify(u'\u0061' < u'\u20ac')
-												Removing UTF-16 aware Unicode comparison code. This kind of compare
function (together with other locale aware ones) should go into a new collation
support module. See python-dev for a discussion of this removal.

Note: This patch should also be applied to the 1.6 branch.

											
										
										
											2000-08-08 05:04:29 -03:00
+								    # Non surrogate below surrogate value, no fixup required
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								    verify(u'\u0061' < u'\ud800\udc02')
-												Removing UTF-16 aware Unicode comparison code. This kind of compare
function (together with other locale aware ones) should go into a new collation
support module. See python-dev for a discussion of this removal.

Note: This patch should also be applied to the 1.6 branch.

											
										
										
											2000-08-08 05:04:29 -03:00
 								    # Non surrogate above surrogate value, fixup required
 								    def test_lecmp(s, s2):
-												Whitespace normalization.  Leaving tokenize_tests.py alone for now.

											
										
										
											2001-01-17 22:22:22 -04:00
+								        verify(s <  s2 , "comparison failed on %s < %s" % (s, s2))
-												Removing UTF-16 aware Unicode comparison code. This kind of compare
function (together with other locale aware ones) should go into a new collation
support module. See python-dev for a discussion of this removal.

Note: This patch should also be applied to the 1.6 branch.

											
										
										
											2000-08-08 05:04:29 -03:00
 								    def test_fixup(s):
-												Make reindent.py happy (convert everything to 4-space indents!).

											
										
										
											2000-10-23 14:22:08 -03:00
+								        s2 = u'\ud800\udc01'
 								        test_lecmp(s, s2)
 								        s2 = u'\ud900\udc01'
 								        test_lecmp(s, s2)
 								        s2 = u'\uda00\udc01'
 								        test_lecmp(s, s2)
 								        s2 = u'\udb00\udc01'
 								        test_lecmp(s, s2)
 								        s2 = u'\ud800\udd01'
 								        test_lecmp(s, s2)
 								        s2 = u'\ud900\udd01'
 								        test_lecmp(s, s2)
 								        s2 = u'\uda00\udd01'
 								        test_lecmp(s, s2)
 								        s2 = u'\udb00\udd01'
 								        test_lecmp(s, s2)
 								        s2 = u'\ud800\ude01'
 								        test_lecmp(s, s2)
 								        s2 = u'\ud900\ude01'
 								        test_lecmp(s, s2)
 								        s2 = u'\uda00\ude01'
 								        test_lecmp(s, s2)
 								        s2 = u'\udb00\ude01'
 								        test_lecmp(s, s2)
 								        s2 = u'\ud800\udfff'
 								        test_lecmp(s, s2)
 								        s2 = u'\ud900\udfff'
 								        test_lecmp(s, s2)
 								        s2 = u'\uda00\udfff'
 								        test_lecmp(s, s2)
 								        s2 = u'\udb00\udfff'
 								        test_lecmp(s, s2)
-												Removing UTF-16 aware Unicode comparison code. This kind of compare
function (together with other locale aware ones) should go into a new collation
support module. See python-dev for a discussion of this removal.

Note: This patch should also be applied to the 1.6 branch.

											
										
										
											2000-08-08 05:04:29 -03:00
 								    test_fixup(u'\ue000')
 								    test_fixup(u'\uff61')
 								    # Surrogates on both sides, no fixup required
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								    verify(u'\ud800\udc02' < u'\ud84d\udc56')
-												Removing UTF-16 aware Unicode comparison code. This kind of compare
function (together with other locale aware ones) should go into a new collation
support module. See python-dev for a discussion of this removal.

Note: This patch should also be applied to the 1.6 branch.

											
										
										
											2000-08-08 05:04:29 -03:00
+								    print 'done.'
-												Tests for new surrogate support in the UTF-8 codec. By Bill Tutt.

											
										
										
											2000-07-07 14:48:52 -03:00
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('ljust', u'abc',  u'abc       ', 10)
 								test('rjust', u'abc',  u'       abc', 10)
 								test('center', u'abc', u'   abc    ', 10)
 								test('ljust', u'abc',  u'abc   ', 6)
 								test('rjust', u'abc',  u'   abc', 6)
 								test('center', u'abc', u' abc  ', 6)
 								test('ljust', u'abc', u'abc', 2)
 								test('rjust', u'abc', u'abc', 2)
 								test('center', u'abc', u'abc', 2)
-												Add the 'bool' type and its values 'False' and 'True', as described in
PEP 285.  Everything described in the PEP is here, and there is even
some documentation.  I had to fix 12 unit tests; all but one of these
were printing Boolean outcomes that changed from 0/1 to False/True.
(The exception is test_unicode.py, which did a type(x) == type(y)
style comparison.  I could've fixed that with a single line using
issubtype(x, type(y)), but instead chose to be explicit about those
places where a bool is expected.)

Still to do: perhaps more documentation; change standard library
modules to return False/True from predicates.

											
										
										
											2002-04-03 18:41:51 -04:00
+								test('islower', u'a', True)
 								test('islower', u'A', False)
 								test('islower', u'\n', False)
 								test('islower', u'\u1FFc', False)
 								test('islower', u'abc', True)
 								test('islower', u'aBc', False)
 								test('islower', u'abc\n', True)
 								test('isupper', u'a', False)
 								test('isupper', u'A', True)
 								test('isupper', u'\n', False)
-												Patch by Finn Bock to make test_unicode.py work for Jython.

											
										
										
											2001-02-10 10:09:31 -04:00
+								if sys.platform[:4] != 'java':
-												Add the 'bool' type and its values 'False' and 'True', as described in
PEP 285.  Everything described in the PEP is here, and there is even
some documentation.  I had to fix 12 unit tests; all but one of these
were printing Boolean outcomes that changed from 0/1 to False/True.
(The exception is test_unicode.py, which did a type(x) == type(y)
style comparison.  I could've fixed that with a single line using
issubtype(x, type(y)), but instead chose to be explicit about those
places where a bool is expected.)

Still to do: perhaps more documentation; change standard library
modules to return False/True from predicates.

											
										
										
											2002-04-03 18:41:51 -04:00
+								    test('isupper', u'\u1FFc', False)
 								test('isupper', u'ABC', True)
 								test('isupper', u'AbC', False)
 								test('isupper', u'ABC\n', True)
 								test('istitle', u'a', False)
 								test('istitle', u'A', True)
 								test('istitle', u'\n', False)
 								test('istitle', u'\u1FFc', True)
 								test('istitle', u'A Titlecased Line', True)
 								test('istitle', u'A\nTitlecased Line', True)
 								test('istitle', u'A Titlecased, Line', True)
 								test('istitle', u'Greek \u1FFcitlecases ...', True)
 								test('istitle', u'Not a capitalized String', False)
 								test('istitle', u'Not\ta Titlecase String', False)
 								test('istitle', u'Not--a Titlecase String', False)
 								test('isalpha', u'a', True)
 								test('isalpha', u'A', True)
 								test('isalpha', u'\n', False)
 								test('isalpha', u'\u1FFc', True)
 								test('isalpha', u'abc', True)
 								test('isalpha', u'aBc123', False)
 								test('isalpha', u'abc\n', False)
 								test('isalnum', u'a', True)
 								test('isalnum', u'A', True)
 								test('isalnum', u'\n', False)
 								test('isalnum', u'123abc456', True)
 								test('isalnum', u'a1b3c', True)
 								test('isalnum', u'aBc000 ', False)
 								test('isalnum', u'abc\n', False)
-												Added tests for the new .isalpha() and .isalnum() methods.

											
										
										
											2000-07-05 06:46:40 -03:00
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
 								test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
 								test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
 								test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
 								test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
 								test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
-												Add the 'bool' type and its values 'False' and 'True', as described in
PEP 285.  Everything described in the PEP is here, and there is even
some documentation.  I had to fix 12 unit tests; all but one of these
were printing Boolean outcomes that changed from 0/1 to False/True.
(The exception is test_unicode.py, which did a type(x) == type(y)
style comparison.  I could've fixed that with a single line using
issubtype(x, type(y)), but instead chose to be explicit about those
places where a bool is expected.)

Still to do: perhaps more documentation; change standard library
modules to return False/True from predicates.

											
										
										
											2002-04-03 18:41:51 -04:00
+								test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], True)
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
 								test('translate', u"abababc", u'bbbc', {ord('a'):None})
 								test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
 								test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
-												Marc-Andre Lemburg: Add tests for mixed use of char in string.

											
										
										
											2000-03-13 19:21:48 -04:00
+								# Contains:
 								print 'Testing Unicode contains method...',
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(('a' in u'abdb') == 1)
 								verify(('a' in u'bdab') == 1)
 								verify(('a' in u'bdaba') == 1)
 								verify(('a' in u'bdba') == 1)
 								verify(('a' in u'bdba') == 1)
 								verify((u'a' in u'bdba') == 1)
 								verify((u'a' in u'bdb') == 0)
 								verify((u'a' in 'bdb') == 0)
 								verify((u'a' in 'bdba') == 1)
 								verify((u'a' in ('a',1,None)) == 1)
 								verify((u'a' in (1,None,'a')) == 1)
 								verify((u'a' in (1,None,u'a')) == 1)
 								verify(('a' in ('a',1,None)) == 1)
 								verify(('a' in (1,None,'a')) == 1)
 								verify(('a' in (1,None,u'a')) == 1)
 								verify(('a' in ('x',1,u'y')) == 0)
 								verify(('a' in ('x',1,None)) == 0)
-												Marc-Andre Lemburg: Add tests for mixed use of char in string.

											
										
										
											2000-03-13 19:21:48 -04:00
+								print 'done.'
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								# Formatting:
 								print 'Testing Unicode formatting strings...',
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
 								verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000,  3.00')
 								verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000,  3.00')
 								verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000,  3.50')
 								verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000,  3.57')
 								verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
 								verify(u"%c" % (u"a",) == u'a')
 								verify(u"%c" % ("a",) == u'a')
 								verify(u"%c" % (34,) == u'"')
 								verify(u"%c" % (36,) == u'$')
-												Patch by Finn Bock to make test_unicode.py work for Jython.

											
										
										
											2001-02-10 10:09:31 -04:00
+								if sys.platform[:4] != 'java':
 								    value = u"%r, %r" % (u"abc", "abc")
 								    if value != u"u'abc', 'abc'":
 								        print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
-												Marc-Andre Lemburg <mal@lemburg.com>:
Fixed some tests to not cause the script to fail, but rather
output a warning (which then is caught by regrtest.py as wrong
output). This is needed to make test_unicode.py run through
on JPython.
Thanks to Finn Bock.

											
										
										
											2000-06-13 09:05:36 -03:00
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
-												Marc-Andre Lemburg <mal@lemburg.com>:
Fixed some tests to not cause the script to fail, but rather
output a warning (which then is caught by regrtest.py as wrong
output). This is needed to make test_unicode.py run through
on JPython.
Thanks to Finn Bock.

											
										
										
											2000-06-13 09:05:36 -03:00
+								try:
-												Fix for bug #438164: %-formatting using Unicode objects.

This patch also does away with an incompatibility between Jython
and CPython.

											
										
										
											2001-11-20 11:18:49 -04:00
+								    value = u"%(x)s, %(<28>)s" % {'x':u"abc", u'<EFBFBD>':"def"}
-												Marc-Andre Lemburg <mal@lemburg.com>:
Fixed some tests to not cause the script to fail, but rather
output a warning (which then is caught by regrtest.py as wrong
output). This is needed to make test_unicode.py run through
on JPython.
Thanks to Finn Bock.

											
										
										
											2000-06-13 09:05:36 -03:00
+								except KeyError:
 								    print '*** formatting failed for "%s"' % "u'abc, def'"
 								else:
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								    verify(value == u'abc, def')
-												Marc-Andre Lemburg <mal@lemburg.com>:
Fixed some tests to not cause the script to fail, but rather
output a warning (which then is caught by regrtest.py as wrong
output). This is needed to make test_unicode.py run through
on JPython.
Thanks to Finn Bock.

											
										
										
											2000-06-13 09:05:36 -03:00
-												Marc-Andre Lemburg:

* '...%s...' % u"abc" now coerces to Unicode just like
  string methods. Care is taken not to reevaluate already formatted
  arguments -- only the first Unicode object appearing in the
  argument mapping is looked up twice. Added test cases for
  this to test_unicode.py.

											
										
										
											2000-04-10 10:52:48 -03:00
+								# formatting jobs delegated from the string implementation:
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
 								verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
 								verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
 								verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
 								verify('...%(foo)s...' % {u'foo':u"abc",'def':123} ==  u'...abc...')
 								verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
 								verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
 								verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
 								verify('...%s...' % u"abc" == u'...abc...')
-												Fix for bug #417030: "print '%*s' fails for unicode string"

											
										
										
											2001-05-02 11:21:53 -03:00
+								verify('%*s' % (5,u'abc',) == u'  abc')
 								verify('%*s' % (-5,u'abc',) == u'abc  ')
 								verify('%*.*s' % (5,2,u'abc',) == u'   ab')
 								verify('%*.*s' % (5,3,u'abc',) == u'  abc')
 								verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10   abc')
 								verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103   abc')
-												Marc-Andre Lemburg: test script for Unicode implementation.

											
										
										
											2000-03-10 19:23:21 -04:00
+								print 'done.'
-												Additional test and documentation for the unicode() changes.

This patch should also be applied to the 2.2b1 trunk.

											
										
										
											2001-10-19 09:02:29 -03:00
+								print 'Testing builtin unicode()...',
 								# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
 								verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
 								# Trivial subclass of the builtin unicode type; it adds no behaviour and
 								# exists only so the test below can pass a subclass instance to unicode().
 								class UnicodeSubclass(unicode):
 								    pass
 								verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
 								       == u'unicode subclass becomes unicode')
 								verify(unicode('strings are converted to unicode')
 								       == u'strings are converted to unicode')
 								# Helper object that stores the constructor argument and hands it back
 								# from __unicode__(), so unicode(obj) can be checked against that value.
 								class UnicodeCompat:
 								    def __init__(self, x):
 								        # Value later returned unchanged by __unicode__().
 								        self.x = x
 								    def __unicode__(self):
 								        return self.x
 								verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
 								       == u'__unicode__ compatible objects are recognized')
 								# Helper object that defines only __str__() (no __unicode__()); it stores
 								# the constructor argument and returns it from __str__().  The tests that
 								# follow pass instances to both unicode() and str().
 								class StringCompat:
 								    def __init__(self, x):
 								        # Value later returned unchanged by __str__().
 								        self.x = x
 								    def __str__(self):
 								        return self.x
 								verify(unicode(StringCompat('__str__ compatible objects are recognized'))
 								       == u'__str__ compatible objects are recognized')
 								# unicode(obj) is compatible to str():
 								o = StringCompat('unicode(obj) is compatible to str()')
 								verify(unicode(o) == u'unicode(obj) is compatible to str()')
 								verify(str(o) == 'unicode(obj) is compatible to str()')
 								for obj in (123, 123.45, 123L):
 								    verify(unicode(obj) == unicode(str(obj)))
 								# unicode(obj, encoding, error) tests (this maps to
 								# PyUnicode_FromEncodedObject() at C level)
-												Skipping some tests by adding the usual jython conditional test around:

- the repr of unicode. Jython only adds the u'' prefix if the string contains
  char values > 255.
- A unicode arg to unicode() is perfectly valid in jython.
- A buffer() test. There is no buffer() on Jython.

This closes patch "[ #490920 ] Jython and test_unicode".

											
										
										
											2001-12-10 16:57:34 -04:00
+								if not sys.platform.startswith('java'):
 								    try:
 								        unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
 								    except TypeError:
 								        pass
 								    else:
 								        raise TestFailed, "decoding unicode should NOT be supported"
-												Additional test and documentation for the unicode() changes.

This patch should also be applied to the 2.2b1 trunk.

											
										
										
											2001-10-19 09:02:29 -03:00
 								verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
 								       == u'strings are decoded to unicode')
-												Skipping some tests by adding the usual jython conditional test around:

- the repr of unicode. Jython only adds the u'' prefix if the string contains
  char values > 255.
- A unicode arg to unicode() is perfectly valid in jython.
- A buffer() test. There is no buffer() on Jython.

This closes patch "[ #490920 ] Jython and test_unicode".

											
										
										
											2001-12-10 16:57:34 -04:00
+								if not sys.platform.startswith('java'):
 								    verify(unicode(buffer('character buffers are decoded to unicode'),
 								                   'utf-8', 'strict')
 								           == u'character buffers are decoded to unicode')
-												Additional test and documentation for the unicode() changes.

This patch should also be applied to the 2.2b1 trunk.

											
										
										
											2001-10-19 09:02:29 -03:00
 								print 'done.'
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
+								# Test builtin codecs
 								print 'Testing builtin codecs...',
-												Patch #435971: UTF-7 codec by Brian Quinlan.

											
										
										
											2001-09-20 07:35:46 -03:00
+								# UTF-7 specific encoding tests:
 								# Table of (unicode input, expected UTF-7 encoded byte string) pairs.
 								# The loop below encodes each input with the 'utf-7' codec and compares
 								# the result against the expected string.
 								# NOTE(review): the pair (u'+?', '+-?') appears twice in the table; the
 								# second occurrence looks redundant -- confirm before removing.
 								# NOTE(review): u'\?' is a non-raw literal, unlike the ur'...' entries
 								# that follow; presumably '\?' is left as backslash + '?' because it is
 								# not a recognised escape -- confirm.
 								utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'),  # RFC2152 example
 								 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
 								 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
 								 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
 								 (u'+', '+-'),
 								 (u'+-', '+--'),
 								 (u'+?', '+-?'),
 								 (u'\?', '+AFw?'),
 								 (u'+?', '+-?'),
 								 (ur'\\?', '+AFwAXA?'),
 								 (ur'\\\?', '+AFwAXABc?'),
 								 (ur'++--', '+-+---')]
 								for x,y in utfTests:
 								    verify( x.encode('utf-7') == y )
-												Whitespace normalization.

											
										
										
											2001-10-04 02:36:56 -03:00
+								try:
-												Patch #435971: UTF-7 codec by Brian Quinlan.

											
										
										
											2001-09-20 07:35:46 -03:00
+								    unicode('+3ADYAA-', 'utf-7') # surrogates not supported
 								except UnicodeError:
 								    pass
 								else:
 								    raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
 								verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
-												Tests for new surrogate support in the UTF-8 codec. By Bill Tutt.

											
										
										
											2000-07-07 14:48:52 -03:00
+								# UTF-8 specific encoding tests:
-												Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception which causes round-trips to
fail).

Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we better make sure it works for
all Unicode code points, including isolated surrogates).

Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates which now raise an exception.

											
										
										
											2002-02-07 07:33:49 -04:00
+								verify(u''.encode('utf-8') == '')
-												Fix for the UTF-8 memory allocation bug and the UTF-8 encoding
bug related to lone high surrogates.

											
										
										
											2002-02-06 14:09:02 -04:00
+								verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
 								verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
 								verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
 								verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80')
 								verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80')
 								verify((u'\ud800\udc02'*1000).encode('utf-8') ==
 								       '\xf0\x90\x80\x82'*1000)
-												Added test case for UTF-8 encoding bug #541828.

											
										
										
											2002-04-10 14:18:02 -03:00
+								verify(u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
 								       u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
 								       u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
 								       u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
 								       u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
-												Whitespace normalization.

											
										
										
											2002-04-15 22:38:40 -03:00
+								       u' Nunstuck git und'.encode('utf-8') ==
-												Added test case for UTF-8 encoding bug #541828.

											
										
										
											2002-04-10 14:18:02 -03:00
+								       '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
 								       '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
 								       '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
 								       '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
 								       '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
 								       '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
 								       '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
 								       '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
 								       '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
 								       '\xe3\x80\x8cWenn ist das Nunstuck git und')
-												Fix for the UTF-8 memory allocation bug and the UTF-8 encoding
bug related to lone high surrogates.

											
										
										
											2002-02-06 14:09:02 -04:00
-												Tests for new surrogate support in the UTF-8 codec. By Bill Tutt.

											
										
										
											2000-07-07 14:48:52 -03:00
+								# UTF-8 specific decoding tests
-												Fix for the UTF-8 memory allocation bug and the UTF-8 encoding
bug related to lone high surrogates.

											
										
										
											2002-02-06 14:09:02 -04:00
+								verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' )
 								verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' )
 								verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' )
-												Tests for new surrogate support in the UTF-8 codec. By Bill Tutt.

											
										
										
											2000-07-07 14:48:52 -03:00
 								# Other possible utf-8 test cases:
 								# * strict decoding testing for all of the
 								#   UTF8_ERROR cases in PyUnicode_DecodeUTF8
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(unicode('hello','ascii') == u'hello')
 								verify(unicode('hello','utf-8') == u'hello')
 								verify(unicode('hello','utf8') == u'hello')
 								verify(unicode('hello','latin-1') == u'hello')
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
-												Implement the changes proposed in patch #413333. unicode(obj) now
works just like str(obj) in that it tries __str__/tp_str on the object
in case it finds that the object is not a string or buffer.

											
										
										
											2001-09-20 09:53:16 -03:00
+								# Error handling
-												Marc-Andre Lemburg:

* '...%s...' % u"abc" now coerces to Unicode just like
  string methods. Care is taken not to reevaluate already formatted
  arguments -- only the first Unicode object appearing in the
  argument mapping is looked up twice. Added test cases for
  this to test_unicode.py.

											
										
										
											2000-04-10 10:52:48 -03:00
+								try:
 								    u'Andr\202 x'.encode('ascii')
 								    u'Andr\202 x'.encode('ascii','strict')
 								except ValueError:
 								    pass
 								else:
-												Change verify() function to raise TestFailed, not AssertionError.

(I realize that I didn't really test this, because all the tests
succeed, so verify() never raised an AssertionError -- but the test
suite still succeeds, so I'm not too worried.)

											
										
										
											2001-01-19 15:01:56 -04:00
+								    raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
 								verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
-												Marc-Andre Lemburg:

* '...%s...' % u"abc" now coerces to Unicode just like
  string methods. Care is taken not to reevaluate already formatted
  arguments -- only the first Unicode object appearing in the
  argument mapping is looked up twice. Added test cases for
  this to test_unicode.py.

											
										
										
											2000-04-10 10:52:48 -03:00
 								try:
 								    unicode('Andr\202 x','ascii')
 								    unicode('Andr\202 x','ascii','strict')
 								except ValueError:
 								    pass
 								else:
-												Change verify() function to raise TestFailed, not AssertionError.

(I realize that I didn't really test this, because all the tests
succeed, so verify() never raised an AssertionError -- but the test
suite still succeeds, so I'm not too worried.)

											
										
										
											2001-01-19 15:01:56 -04:00
+								    raise TestFailed, "unicode('Andr\202') failed to raise an exception"
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
 								verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
-												Marc-Andre Lemburg:

* '...%s...' % u"abc" now coerces to Unicode just like
  string methods. Care is taken not to reevaluate already formatted
  arguments -- only the first Unicode object appearing in the
  argument mapping is looked up twice. Added test cases for
  this to test_unicode.py.

											
										
										
											2000-04-10 10:52:48 -03:00
-												Do not insert characters for unicode-escape decoders if the error mode
is "ignore". Fixes #529104.

											
										
										
											2002-03-21 04:55:28 -04:00
+								verify("\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx")
 								# The input is a single backslash with nothing after it.  Decoding it
 								# with the unicode-escape codec is expected to raise ValueError; if no
 								# exception is raised the test fails explicitly via TestFailed.
 								try:
 								    "\\".decode("unicode-escape")
 								except ValueError:
 								    pass
 								else:
 								    raise TestFailed, '"\\".decode("unicode-escape") should fail'
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(u'hello'.encode('ascii') == 'hello')
-												Patch #435971: UTF-7 codec by Brian Quinlan.

											
										
										
											2001-09-20 07:35:46 -03:00
+								verify(u'hello'.encode('utf-7') == 'hello')
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify(u'hello'.encode('utf-8') == 'hello')
 								verify(u'hello'.encode('utf8') == 'hello')
 								verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
 								verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
 								verify(u'hello'.encode('latin-1') == 'hello')
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
-												Make the unicode-escape and the UTF-16 codecs handle surrogates
correctly and thus roundtrip-safe.

Some minor cleanups of the code.

Added tests for the roundtrip-safety.

											
										
										
											2001-07-20 14:39:11 -03:00
+								# Roundtrip safety for BMP (just the first 1024 chars)
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
+								u = u''.join(map(unichr, range(1024)))
-												Patch #435971: UTF-7 codec by Brian Quinlan.

											
										
										
											2001-09-20 07:35:46 -03:00
+								for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
+								                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								    verify(unicode(u.encode(encoding),encoding) == u)
-												Make the unicode-escape and the UTF-16 codecs handle surrogates
correctly and thus roundtrip-safe.

Some minor cleanups of the code.

Added tests for the roundtrip-safety.

											
										
										
											2001-07-20 14:39:11 -03:00
-												Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception which causes round-trips to
fail).

Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we better make sure it works for
all Unicode code points, including isolated surrogates).

Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates which now raise an exception.

											
										
										
											2002-02-07 07:33:49 -04:00
+								# Roundtrip safety for BMP (just the first 256 chars)
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
+								u = u''.join(map(unichr, range(256)))
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								for encoding in (
 								    'latin-1',
 								    ):
 								    try:
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								        verify(unicode(u.encode(encoding),encoding) == u)
-												Change verify() function to raise TestFailed, not AssertionError.

(I realize that I didn't really test this, because all the tests
succeed, so verify() never raised an AssertionError -- but the test
suite still succeeds, so I'm not too worried.)

											
										
										
											2001-01-19 15:01:56 -04:00
+								    except TestFailed:
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								        print '*** codec "%s" failed round-trip' % encoding
 								    except ValueError,why:
 								        print '*** codec for "%s" failed: %s' % (encoding, why)
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
-												Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception which causes round-trips to
fail).

Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we better make sure it works for
all Unicode code points, including isolated surrogates).

Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates which now raise an exception.

											
										
										
											2002-02-07 07:33:49 -04:00
+								# Roundtrip safety for BMP (just the first 128 chars)
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
+								u = u''.join(map(unichr, range(128)))
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								for encoding in (
 								    'ascii',
 								    ):
 								    try:
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								        verify(unicode(u.encode(encoding),encoding) == u)
-												Change verify() function to raise TestFailed, not AssertionError.

(I realize that I didn't really test this, because all the tests
succeed, so verify() never raised an AssertionError -- but the test
suite still succeeds, so I'm not too worried.)

											
										
										
											2001-01-19 15:01:56 -04:00
+								    except TestFailed:
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								        print '*** codec "%s" failed round-trip' % encoding
 								    except ValueError,why:
 								        print '*** codec for "%s" failed: %s' % (encoding, why)
-												Fix to the UTF-8 encoder: it failed on 0-length input strings.

Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception which causes round-trips to
fail).

Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we better make sure it works for
all Unicode code points, including isolated surrogates).

Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates which now raise an exception.

											
										
										
											2002-02-07 07:33:49 -04:00
+								# Roundtrip safety for non-BMP (just a few chars)
 								# Five code points above U+FFFF, one from each of planes 1 through 5.
 								u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
 								# Encode with each codec and decode again; the result must equal
 								# the original string.
 								for encoding in ('utf-8',
 								                 'utf-16', 'utf-16-le', 'utf-16-be',
 								                 # NOTE(review): raw_unicode_escape is left disabled for the
 								                 # non-BMP string; the reason is not shown here -- confirm.
 								                 #'raw_unicode_escape',
 								                 'unicode_escape', 'unicode_internal'):
 								    verify(unicode(u.encode(encoding),encoding) == u)
 								# UTF-8 must be roundtrip safe for all UCS-2 code points
 								# (range(0x10000) spans U+0000..U+FFFF, surrogate code points included).
 								u = u''.join(map(unichr, range(0x10000)))
 								for encoding in ('utf-8',):
 								    verify(unicode(u.encode(encoding),encoding) == u)
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								print 'done.'
 								print 'Testing standard mapping codecs...',
 								print '0-127...',
 								s = ''.join(map(chr, range(128)))
 								for encoding in (
 								    'cp037', 'cp1026',
 								    'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
 								    'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
-												Make reindent.py happy (convert everything to 4-space indents!).

											
										
										
											2000-10-23 14:22:08 -03:00
+								    'cp863', 'cp865', 'cp866',
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
 								    'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
 								    'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
 								    'mac_cyrillic', 'mac_latin2',
 								    'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
 								    'cp1256', 'cp1257', 'cp1258',
 								    'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
 								    'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
-												Get rid of the superstitious "~" in dict hashing's "i = (~hash) & mask".
The comment following used to say:
	/* We use ~hash instead of hash, as degenerate hash functions, such
	   as for ints <sigh>, can have lots of leading zeros. It's not
	   really a performance risk, but better safe than sorry.
	   12-Dec-00 tim:  so ~hash produces lots of leading ones instead --
	   what's the gain? */
That is, there was never a good reason for doing it.  And to the contrary,
as explained on Python-Dev last December, it tended to make the *sum*
(i + incr) & mask (which is the first table index examined in case of
collision) the same "too often" across distinct hashes.

Changing to the simpler "i = hash & mask" reduced the number of string-dict
collisions (== # number of times we go around the lookup for-loop) from about
6 million to 5 million during a full run of the test suite (these are
approximate because the test suite does some random stuff from run to run).
The number of collisions in non-string dicts also decreased, but not as
dramatically.

Note that this may, for a given dict, change the order (wrt previous
releases) of entries exposed by .keys(), .values() and .items().  A number
of std tests suffered bogus failures as a result.  For dicts keyed by
small ints, or (less so) by characters, the order is much more likely to be
in increasing order of key now; e.g.,

>>> d = {}
>>> for i in range(10):
...    d[i] = i
...
>>> d
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}
>>>

Unfortunately, people may latch on to that in small examples and draw a
bogus conclusion.

test_support.py
    Moved test_extcall's sortdict() into test_support, made it stronger,
    and imported sortdict into other std tests that needed it.
test_unicode.py
    Excluded cp875 from the "roundtrip over range(128)" test, because
    cp875 doesn't have a well-defined inverse for unicode("?", "cp875").
    See Python-Dev for excruciating details.
Cookie.py
    Changed various output functions to sort dicts before building
    strings from them.
test_extcall
    Fiddled the expected-result file.  This remains sensitive to native
    dict ordering, because, e.g., if there are multiple errors in a
    keyword-arg dict (and test_extcall sets up many cases like that), the
    specific error Python complains about first depends on native dict
    ordering.

											
										
										
											2001-05-12 21:19:31 -03:00
+								    'cp1006', 'iso8859_8',
-												Make reindent.py happy (convert everything to 4-space indents!).

											
										
										
											2000-10-23 14:22:08 -03:00
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    ### These have undefined mappings:
 								    #'cp424',
-												Make reindent.py happy (convert everything to 4-space indents!).

											
										
										
											2000-10-23 14:22:08 -03:00
-												Get rid of the superstitious "~" in dict hashing's "i = (~hash) & mask".
The comment following used to say:
	/* We use ~hash instead of hash, as degenerate hash functions, such
	   as for ints <sigh>, can have lots of leading zeros. It's not
	   really a performance risk, but better safe than sorry.
	   12-Dec-00 tim:  so ~hash produces lots of leading ones instead --
	   what's the gain? */
That is, there was never a good reason for doing it.  And to the contrary,
as explained on Python-Dev last December, it tended to make the *sum*
(i + incr) & mask (which is the first table index examined in case of
collision) the same "too often" across distinct hashes.

Changing to the simpler "i = hash & mask" reduced the number of string-dict
collisions (== # number of times we go around the lookup for-loop) from about
6 million to 5 million during a full run of the test suite (these are
approximate because the test suite does some random stuff from run to run).
The number of collisions in non-string dicts also decreased, but not as
dramatically.

Note that this may, for a given dict, change the order (wrt previous
releases) of entries exposed by .keys(), .values() and .items().  A number
of std tests suffered bogus failures as a result.  For dicts keyed by
small ints, or (less so) by characters, the order is much more likely to be
in increasing order of key now; e.g.,

>>> d = {}
>>> for i in range(10):
...    d[i] = i
...
>>> d
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}
>>>

Unfortunately, people may latch on to that in small examples and draw a
bogus conclusion.

test_support.py
    Moved test_extcall's sortdict() into test_support, made it stronger,
    and imported sortdict into other std tests that needed it.
test_unicode.py
    Excluded cp875 from the "roundtrip over range(128)" test, because
    cp875 doesn't have a well-defined inverse for unicode("?", "cp875").
    See Python-Dev for excruciating details.
Cookie.py
    Changed various output functions to sort dicts before building
    strings from them.
test_extcall
    Fiddled the expected-result file.  This remains sensitive to native
    dict ordering, because, e.g., if there are multiple errors in a
    keyword-arg dict (and test_extcall sets up many cases like that), the
    specific error Python complains about first depends on native dict
    ordering.

											
										
										
											2001-05-12 21:19:31 -03:00
+								    ### These fail the round-trip:
 								    #'cp875'
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    ):
 								    try:
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								        verify(unicode(s,encoding).encode(encoding) == s)
-												Change verify() function to raise TestFailed, not AssertionError.

(I realize that I didn't really test this, because all the tests
succeed, so verify() never raised an AssertionError -- but the test
suite still succeeds, so I'm not too worried.)

											
										
										
											2001-01-19 15:01:56 -04:00
+								    except TestFailed:
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								        print '*** codec "%s" failed round-trip' % encoding
 								    except ValueError,why:
 								        print '*** codec for "%s" failed: %s' % (encoding, why)
 								print '128-255...',
 								s = ''.join(map(chr, range(128,256)))
 								for encoding in (
 								    'cp037', 'cp1026',
 								    'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
 								    'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
-												Make reindent.py happy (convert everything to 4-space indents!).

											
										
										
											2000-10-23 14:22:08 -03:00
+								    'cp863', 'cp865', 'cp866',
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
-												Whitespace normalization.  Leaving tokenize_tests.py alone for now.

											
										
										
											2001-01-17 22:22:22 -04:00
+								    'iso8859_2', 'iso8859_4', 'iso8859_5',
-												This patch changes the default behaviour of the builtin charmap
codec to not apply Latin-1 mappings for keys which are not found
in the mapping dictionaries, but instead treat them as undefined
mappings.

The patch was originally written by Martin v. Loewis with some
additional (cosmetic) changes and an updated test script
by Marc-Andre Lemburg.

The standard codecs were recreated from the most current files
available at the Unicode.org site using the Tools/scripts/gencodec.py
tool.

This patch closes the bugs #116285 and #119960.

											
										
										
											2001-01-03 17:29:14 -04:00
+								    'iso8859_9', 'koi8_r', 'latin_1',
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    'mac_cyrillic', 'mac_latin2',
-												Make reindent.py happy (convert everything to 4-space indents!).

											
										
										
											2000-10-23 14:22:08 -03:00
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    ### These have undefined mappings:
 								    #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
 								    #'cp1256', 'cp1257', 'cp1258',
 								    #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
-												Whitespace normalization.  Leaving tokenize_tests.py alone for now.

											
										
										
											2001-01-17 22:22:22 -04:00
+								    #'iso8859_3', 'iso8859_6', 'iso8859_7',
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
-												Make reindent.py happy (convert everything to 4-space indents!).

											
										
										
											2000-10-23 14:22:08 -03:00
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    ### These fail the round-trip:
 								    #'cp1006', 'cp875', 'iso8859_8',
-												Make reindent.py happy (convert everything to 4-space indents!).

											
										
										
											2000-10-23 14:22:08 -03:00
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								    ):
 								    try:
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								        verify(unicode(s,encoding).encode(encoding) == s)
-												Change verify() function to raise TestFailed, not AssertionError.

(I realize that I didn't really test this, because all the tests
succeed, so verify() never raised an AssertionError -- but the test
suite still succeeds, so I'm not too worried.)

											
										
										
											2001-01-19 15:01:56 -04:00
+								    except TestFailed:
-												Marc-Andre's third try at this bulk patch seems to work (except that
his copy of test_contains.py seems to be broken -- the lines he
deleted were already absent).  Checkin messages:


New Unicode support for int(), float(), complex() and long().

- new APIs PyInt_FromUnicode() and PyLong_FromUnicode()
- added support for Unicode to PyFloat_FromString()
- new encoding API PyUnicode_EncodeDecimal() which converts
  Unicode to a decimal char* string (used in the above new
  APIs)
- shortcuts for calls like int(<int object>) and float(<float obj>)
- tests for all of the above

Unicode compares and contains checks:
- comparing Unicode and non-string types now works; TypeErrors
  are masked, all other errors such as ValueError during
  Unicode coercion are passed through (note that PyUnicode_Compare
  does not implement the masking -- PyObject_Compare does this)
- contains now works for non-string types too; TypeErrors are
  masked and 0 returned; all other errors are passed through

Better testing support for the standard codecs.

Misc minor enhancements, such as an alias dbcs for the mbcs codec.

Changes:
- PyLong_FromString() now applies the same error checks as
  does PyInt_FromString(): trailing garbage is reported
  as error and no longer silently ignored. The only characters
  which may be trailing the digits are 'L' and 'l' -- these
  are still silently ignored.
- string.ato?() now directly interface to int(), long() and
  float(). The error strings are now a little different, but
  the type still remains the same. These functions are now
  ready to get declared obsolete ;-)
- PyNumber_Int() now also does a check for embedded NULL chars
  in the input string; PyNumber_Long() already did this (and
  still does)

Followed by:

Looks like I've gone a step too far there... (and test_contains.py
seems to have a bug too).

I've changed back to reporting all errors in PyUnicode_Contains()
and added a few more test cases to test_contains.py (plus corrected
the join() NameError).

											
										
										
											2000-04-05 17:11:21 -03:00
+								        print '*** codec "%s" failed round-trip' % encoding
 								    except ValueError,why:
 								        print '*** codec for "%s" failed: %s' % (encoding, why)
-												Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

											
										
										
											2000-03-24 18:14:19 -04:00
 								print 'done.'
-												M.-A. Lemburg <mal@lemburg.com>:

Added test for Unicode string concatenation.

											
										
										
											2000-04-13 11:11:56 -03:00
 								print 'Testing Unicode string concatenation...',
-												This patch removes all uses of "assert" in the regression test suite
and replaces them with a new API verify(). As a result the regression
suite will also perform its tests in optimization mode.

Written by Marc-Andre Lemburg. Copyright assigned to Guido van Rossum.

											
										
										
											2001-01-17 15:11:13 -04:00
+								verify((u"abc" u"def") == u"abcdef")
 								verify(("abc" u"def") == u"abcdef")
 								verify((u"abc" "def") == u"abcdef")
 								verify((u"abc" u"def" "ghi") == u"abcdefghi")
 								verify(("abc" "def" u"ghi") == u"abcdefghi")
-												M.-A. Lemburg <mal@lemburg.com>:

Added test for Unicode string concatenation.

											
										
										
											2000-04-13 11:11:56 -03:00
+								print 'done.'
-												Fix for bug #480188: printing unicode objects

											
										
										
											2001-11-20 11:17:25 -04:00
 								print 'Testing Unicode printing...',
 								print u'abc'
 								print u'abc', u'def'
 								print u'abc', 'def'
 								print 'abc', u'def'
 								print u'abc\n'
 								print u'abc\n',
 								print u'abc\n',
 								print u'def\n'
 								print u'def\n'
 								print 'done.'