cpython/Lib/test/test_regex.py

from test_support import verbose, sortdict
import warnings
warnings.filterwarnings("ignore", "the regex module is deprecated",
                        DeprecationWarning, __name__)
import regex
from regex_syntax import *

re = 'a+b+c+'
print 'no match:', regex.match(re, 'hello aaaabcccc world')
print 'successful search:', regex.search(re, 'hello aaaabcccc world')
try:
    cre = regex.compile('\(' + re)
except regex.error:
    print 'caught expected exception'
else:
    print 'expected regex.error not raised'

print 'failed awk syntax:', regex.search('(a+)|(b+)', 'cdb')
prev = regex.set_syntax(RE_SYNTAX_AWK)
print 'successful awk syntax:', regex.search('(a+)|(b+)', 'cdb')
regex.set_syntax(prev)
print 'failed awk syntax:', regex.search('(a+)|(b+)', 'cdb')

re = '\(<one>[0-9]+\) *\(<two>[0-9]+\)'
print 'matching with group names and compile()'
cre = regex.compile(re)
print cre.match('801 999')
try:
    print cre.group('one')
except regex.error:
    print 'caught expected exception'
else:
    print 'expected regex.error not raised'

print 'matching with group names and symcomp()'
cre = regex.symcomp(re)
print cre.match('801 999')
print cre.group(0)
print cre.group('one')
print cre.group(1, 2)
print cre.group('one', 'two')
print 'realpat:', cre.realpat
print 'groupindex:', sortdict(cre.groupindex)

re = 'world'
cre = regex.compile(re)
print 'not case folded search:', cre.search('HELLO WORLD')
cre = regex.compile(re, regex.casefold)
print 'case folded search:', cre.search('HELLO WORLD')

print '__members__:', cre.__members__
print 'regs:', cre.regs
print 'last:', cre.last
print 'translate:', len(cre.translate)
print 'givenpat:', cre.givenpat

print 'match with pos:', cre.match('hello world', 7)
print 'search with pos:', cre.search('hello world there world', 7)
print 'bogus group:', cre.group(0, 1, 3)
try:
    print 'no name:', cre.group('one')
except regex.error:
    print 'caught expected exception'
else:
    print 'expected regex.error not raised'

from regex_tests import *
if verbose: print 'Running regex_tests test suite'

for t in tests:
    pattern=s=outcome=repl=expected=None
    if len(t)==5:
        pattern, s, outcome, repl, expected = t
    elif len(t)==3:
        pattern, s, outcome = t
    else:
        raise ValueError, ('Test tuples should have 3 or 5 fields',t)

    try:
        obj=regex.compile(pattern)
    except regex.error:
        if outcome==SYNTAX_ERROR: pass    # Expected a syntax error
        else:
            # Regex syntax errors aren't yet reported, so for
            # the official test suite they'll be quietly ignored.
            pass
            #print '=== Syntax error:', t
    else:
        try:
            result=obj.search(s)
        except regex.error, msg:
            print '=== Unexpected exception', t, repr(msg)
        if outcome==SYNTAX_ERROR:
            # This should have been a syntax error; forget it.
            pass
        elif outcome==FAIL:
            if result==-1: pass   # No match, as expected
            else: print '=== Succeeded incorrectly', t
        elif outcome==SUCCEED:
            if result!=-1:
                # Matched, as expected, so now we compute the
                # result string and compare it to our expected result.
                start, end = obj.regs[0]
                found=s[start:end]
                groups=obj.group(1,2,3,4,5,6,7,8,9,10)
                vardict=vars()
                for i in range(len(groups)):
                    vardict['g'+str(i+1)]=str(groups[i])
                repl=eval(repl)
                if repl!=expected:
                    print '=== grouping error', t, repr(repl)+' should be '+repr(expected)
            else:
                print '=== Failed incorrectly', t
Get rid of the superstitious "~" in dict hashing's "i = (~hash) & mask". The comment following used to say: /* We use ~hash instead of hash, as degenerate hash functions, such as for ints <sigh>, can have lots of leading zeros. It's not really a performance risk, but better safe than sorry. 12-Dec-00 tim: so ~hash produces lots of leading ones instead -- what's the gain? */ That is, there was never a good reason for doing it. And to the contrary, as explained on Python-Dev last December, it tended to make the *sum* (i + incr) & mask (which is the first table index examined in case of collision) the same "too often" across distinct hashes. Changing to the simpler "i = hash & mask" reduced the number of string-dict collisions (== # number of times we go around the lookup for-loop) from about 6 million to 5 million during a full run of the test suite (these are approximate because the test suite does some random stuff from run to run). The number of collisions in non-string dicts also decreased, but not as dramatically. Note that this may, for a given dict, change the order (wrt previous releases) of entries exposed by .keys(), .values() and .items(). A number of std tests suffered bogus failures as a result. For dicts keyed by small ints, or (less so) by characters, the order is much more likely to be in increasing order of key now; e.g., >>> d = {} >>> for i in range(10): ... d[i] = i ... >>> d {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9} >>> Unfortunately, people may latch on to that in small examples and draw a bogus conclusion. test_support.py Moved test_extcall's sortdict() into test_support, made it stronger, and imported sortdict into other std tests that needed it. test_unicode.py Excluded cp875 from the "roundtrip over range(128)" test, because cp875 doesn't have a well-defined inverse for unicode("?", "cp875"). See Python-Dev for excruciating details. Cookie.py Changed various output functions to sort dicts before building strings from them. 
test_extcall Fiddled the expected-result file. This remains sensitive to native dict ordering, because, e.g., if there are multiple errors in a keyword-arg dict (and test_extcall sets up many cases like that), the specific error Python complains about first depends on native dict ordering. 2001-05-12 21:19:31 -03:00			`from test_support import verbose, sortdict`
The regression test for the regex module should not trip the deprecation warning for that module, so suppress just that one warning. 2000-12-23 18:08:27 -04:00			`import warnings`
			`warnings.filterwarnings("ignore", "the regex module is deprecated",`
Use __name__ instead of "test_regex" as the module name in the warnings.filterwarnings() call. This suppresses the warning when the module is imported with its full name (test.test_regex) too. 2001-01-16 23:12:01 -04:00			`DeprecationWarning, __name__)`
added test of the regex module [NOTE: testall.py and autotest.py might could go away soon, I've played with Guido's new regrtest.py script and it seems to work well. I'll wait until Guido gives the word to completely switch over -- and change the Makefile too!] 1996-12-20 18:00:21 -04:00			`import regex`
			`from regex_syntax import *`

			`re = 'a+b+c+'`
			`print 'no match:', regex.match(re, 'hello aaaabcccc world')`
			`print 'successful search:', regex.search(re, 'hello aaaabcccc world')`
			`try:`
			`cre = regex.compile('\(' + re)`
			`except regex.error:`
			`print 'caught expected exception'`
			`else:`
			`print 'expected regex.error not raised'`

			`print 'failed awk syntax:', regex.search('(a+)\|(b+)', 'cdb')`
			`prev = regex.set_syntax(RE_SYNTAX_AWK)`
			`print 'successful awk syntax:', regex.search('(a+)\|(b+)', 'cdb')`
			`regex.set_syntax(prev)`
			`print 'failed awk syntax:', regex.search('(a+)\|(b+)', 'cdb')`

			`re = '\(<one>[0-9]+\) *\(<two>[0-9]+\)'`
			`print 'matching with group names and compile()'`
			`cre = regex.compile(re)`
			`print cre.match('801 999')`
			`try:`
			`print cre.group('one')`
			`except regex.error:`
			`print 'caught expected exception'`
			`else:`
			`print 'expected regex.error not raised'`

			`print 'matching with group names and symcomp()'`
			`cre = regex.symcomp(re)`
			`print cre.match('801 999')`
			`print cre.group(0)`
			`print cre.group('one')`
			`print cre.group(1, 2)`
			`print cre.group('one', 'two')`
			`print 'realpat:', cre.realpat`
Get rid of the superstitious "~" in dict hashing's "i = (~hash) & mask". The comment following used to say: /* We use ~hash instead of hash, as degenerate hash functions, such as for ints <sigh>, can have lots of leading zeros. It's not really a performance risk, but better safe than sorry. 12-Dec-00 tim: so ~hash produces lots of leading ones instead -- what's the gain? */ That is, there was never a good reason for doing it. And to the contrary, as explained on Python-Dev last December, it tended to make the *sum* (i + incr) & mask (which is the first table index examined in case of collision) the same "too often" across distinct hashes. Changing to the simpler "i = hash & mask" reduced the number of string-dict collisions (== # number of times we go around the lookup for-loop) from about 6 million to 5 million during a full run of the test suite (these are approximate because the test suite does some random stuff from run to run). The number of collisions in non-string dicts also decreased, but not as dramatically. Note that this may, for a given dict, change the order (wrt previous releases) of entries exposed by .keys(), .values() and .items(). A number of std tests suffered bogus failures as a result. For dicts keyed by small ints, or (less so) by characters, the order is much more likely to be in increasing order of key now; e.g., >>> d = {} >>> for i in range(10): ... d[i] = i ... >>> d {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9} >>> Unfortunately, people may latch on to that in small examples and draw a bogus conclusion. test_support.py Moved test_extcall's sortdict() into test_support, made it stronger, and imported sortdict into other std tests that needed it. test_unicode.py Excluded cp875 from the "roundtrip over range(128)" test, because cp875 doesn't have a well-defined inverse for unicode("?", "cp875"). See Python-Dev for excruciating details. Cookie.py Changed various output functions to sort dicts before building strings from them. 
test_extcall Fiddled the expected-result file. This remains sensitive to native dict ordering, because, e.g., if there are multiple errors in a keyword-arg dict (and test_extcall sets up many cases like that), the specific error Python complains about first depends on native dict ordering. 2001-05-12 21:19:31 -03:00			`print 'groupindex:', sortdict(cre.groupindex)`
added test of the regex module [NOTE: testall.py and autotest.py might could go away soon, I've played with Guido's new regrtest.py script and it seems to work well. I'll wait until Guido gives the word to completely switch over -- and change the Makefile too!] 1996-12-20 18:00:21 -04:00
			`re = 'world'`
			`cre = regex.compile(re)`
			`print 'not case folded search:', cre.search('HELLO WORLD')`
			`cre = regex.compile(re, regex.casefold)`
			`print 'case folded search:', cre.search('HELLO WORLD')`

			`print '__members__:', cre.__members__`
			`print 'regs:', cre.regs`
			`print 'last:', cre.last`
Changes to make these tests work on the Mac. 1997-05-16 10:51:48 -03:00			`print 'translate:', len(cre.translate)`
added test of the regex module [NOTE: testall.py and autotest.py might could go away soon, I've played with Guido's new regrtest.py script and it seems to work well. I'll wait until Guido gives the word to completely switch over -- and change the Makefile too!] 1996-12-20 18:00:21 -04:00			`print 'givenpat:', cre.givenpat`

			`print 'match with pos:', cre.match('hello world', 7)`
			`print 'search with pos:', cre.search('hello world there world', 7)`
			`print 'bogus group:', cre.group(0, 1, 3)`
			`try:`
			`print 'no name:', cre.group('one')`
			`except regex.error:`
			`print 'caught expected exception'`
			`else:`
			`print 'expected regex.error not raised'`
AMK's regex test suite 1997-06-03 15:07:49 -03:00
			`from regex_tests import *`
			`if verbose: print 'Running regex_tests test suite'`

			`for t in tests:`
			`pattern=s=outcome=repl=expected=None`
			`if len(t)==5:`
Mass check-in after untabifying all files that need it. 1998-03-26 15:42:58 -04:00			`pattern, s, outcome, repl, expected = t`
AMK's regex test suite 1997-06-03 15:07:49 -03:00			`elif len(t)==3:`
Make reindent.py happy (convert everything to 4-space indents!). 2000-10-23 14:22:08 -03:00			`pattern, s, outcome = t`
AMK's regex test suite 1997-06-03 15:07:49 -03:00			`else:`
Mass check-in after untabifying all files that need it. 1998-03-26 15:42:58 -04:00			`raise ValueError, ('Test tuples should have 3 or 5 fields',t)`
AMK's regex test suite 1997-06-03 15:07:49 -03:00
			`try:`
Mass check-in after untabifying all files that need it. 1998-03-26 15:42:58 -04:00			`obj=regex.compile(pattern)`
AMK's regex test suite 1997-06-03 15:07:49 -03:00			`except regex.error:`
Mass check-in after untabifying all files that need it. 1998-03-26 15:42:58 -04:00			`if outcome==SYNTAX_ERROR: pass # Expected a syntax error`
Make reindent.py happy (convert everything to 4-space indents!). 2000-10-23 14:22:08 -03:00			`else:`
			`# Regex syntax errors aren't yet reported, so for`
Mass check-in after untabifying all files that need it. 1998-03-26 15:42:58 -04:00			`# the official test suite they'll be quietly ignored.`
			`pass`
			`#print '=== Syntax error:', t`
AMK's regex test suite 1997-06-03 15:07:49 -03:00			`else:`
Mass check-in after untabifying all files that need it. 1998-03-26 15:42:58 -04:00			`try:`
			`result=obj.search(s)`
			`except regex.error, msg:`
			`print '=== Unexpected exception', t, repr(msg)`
			`if outcome==SYNTAX_ERROR:`
			`# This should have been a syntax error; forget it.`
			`pass`
			`elif outcome==FAIL:`
			`if result==-1: pass # No match, as expected`
			`else: print '=== Succeeded incorrectly', t`
			`elif outcome==SUCCEED:`
			`if result!=-1:`
			`# Matched, as expected, so now we compute the`
			`# result string and compare it to our expected result.`
			`start, end = obj.regs[0]`
			`found=s[start:end]`
			`groups=obj.group(1,2,3,4,5,6,7,8,9,10)`
			`vardict=vars()`
			`for i in range(len(groups)):`
			`vardict['g'+str(i+1)]=str(groups[i])`
			`repl=eval(repl)`
			`if repl!=expected:`
			`print '=== grouping error', t, repr(repl)+' should be '+repr(expected)`
			`else:`
			`print '=== Failed incorrectly', t`