cpython/Lib/regsub.py

188 lines
5.0 KiB
Python

# Regular expression subroutines:
# sub(pat, repl, str): replace first occurrence of pattern in string
# gsub(pat, repl, str): replace all occurrences of pattern in string
# split(str, pat, maxsplit): split string using pattern as delimiter
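# splitx(str, pat, maxsplit): split string using pattern as delimiter and
# also return the delimiters
# capwords(str, pat): capitalize the words of a string, using pattern as
# the word delimiter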
import regex
# Replace first occurrence of pattern pat in string str by replacement
# repl. If the pattern isn't found, the string is returned unchanged.
# The replacement may contain references \digit to subpatterns and
# escaped backslashes. The pattern may be a string or an already
# compiled pattern.
def sub(pat, repl, str):
    """Return str with the first match of pat replaced by repl.

    pat is passed through compile(), so it may be a pattern string or
    an already compiled pattern.  repl is expanded by expand() against
    the match registers.  When the search reports no match (a negative
    result) str is returned unchanged.
    """
    prog = compile(pat)
    if prog.search(str) < 0:
        # Nothing matched: hand the input back untouched.
        return str
    regs = prog.regs
    match_start, match_end = regs[0]
    head = str[:match_start]
    tail = str[match_end:]
    return head + expand(repl, regs, str) + tail
# Replace all (non-overlapping) occurrences of pattern pat in string
# str by replacement repl. The same rules as for sub() apply.
# Empty matches for the pattern are replaced only when not adjacent to
# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
def gsub(pat, repl, str):
    """Return str with every non-overlapping match of pat replaced.

    pat is passed through compile(); each replacement text is produced
    by expand(repl, regs, str).  An empty match found exactly where the
    previous match ended is not replaced; the search is retried one
    character further on instead.

    The output is collected in a list and joined once at the end so the
    cost stays linear in the size of the result.
    """
    prog = compile(pat)
    pieces = []
    start = 0
    first = 1
    while prog.search(str, start) >= 0:
        regs = prog.regs
        a, b = regs[0]
        if a == b == start and not first:
            # Empty match adjacent to the previous match: look again,
            # starting one character later.  Stop when the string is
            # exhausted or nothing else matches.
            if start >= len(str) or prog.search(str, start + 1) < 0:
                break
            regs = prog.regs
            a, b = regs[0]
        # Copy the unmatched text before the match, then the replacement.
        pieces.append(str[start:a])
        pieces.append(expand(repl, regs, str))
        start = b
        first = 0
    # Whatever follows the last match is copied verbatim.
    pieces.append(str[start:])
    return ''.join(pieces)
# Split string str in fields separated by delimiters matching pattern
# pat. Only non-empty matches for the pattern are considered, so e.g.
# split('abc', '') returns ['abc'].
# The optional 3rd argument sets the number of splits that are performed.
def split(str, pat, maxsplit = 0):
    """Split str on matches of pat and return the list of fields.

    Delegates to intsplit() with retain=0, so the delimiters themselves
    are not included in the result.  maxsplit is passed on unchanged.
    """
    fields = intsplit(str, pat, maxsplit, 0)
    return fields
# Split string str in fields separated by delimiters matching pattern
# pat. Only non-empty matches for the pattern are considered, so e.g.
# split('abc', '') returns ['abc']. The delimiters are also included
# in the list.
# The optional 3rd argument sets the number of splits that are performed.
def splitx(str, pat, maxsplit = 0):
    """Split str on matches of pat, keeping the delimiters in the list.

    Delegates to intsplit() with retain=1, so each matched delimiter is
    appended after the field that precedes it.  maxsplit is passed on
    unchanged.
    """
    fields_and_delimiters = intsplit(str, pat, maxsplit, 1)
    return fields_and_delimiters
# Internal function used to implement split() and splitx().
def intsplit(str, pat, maxsplit, retain):
    """Shared implementation of split() and splitx().

    str      -- the string to split
    pat      -- pattern string or compiled pattern (see compile())
    maxsplit -- stop after this many splits; a false value means no limit
    retain   -- when true, matched delimiters are added to the result

    Returns the list of fields; the text after the last split is always
    appended as the final element.
    """
    prog = compile(pat)
    fields = []
    field_start = 0     # where the field currently being built begins
    scan_from = 0       # where the next search starts
    nsplits = 0
    while prog.search(str, scan_from) >= 0:
        match_start, match_end = prog.regs[0]
        if match_start != match_end:
            fields.append(str[field_start:match_start])
            if retain:
                fields.append(str[match_start:match_end])
            field_start = scan_from = match_end
            nsplits = nsplits + 1
            if maxsplit and nsplits >= maxsplit:
                break
        else:
            # Empty matches never split; advance the scan position by
            # one character and give up at the end of the string.
            scan_from = scan_from + 1
            if scan_from >= len(str):
                break
    fields.append(str[field_start:])
    return fields
# Capitalize words split using a pattern
def capwords(str, pat='[^a-zA-Z0-9_]+'):
    """Capitalize each word of str, where pat matches the text between words.

    The string is split with splitx(), so the result alternates between
    words (even indexes) and the delimiters that were matched (odd
    indexes).  Only the words are capitalized; the delimiters are put
    back unchanged when the pieces are joined again.
    """
    words = splitx(str, pat)
    # Step by two to touch the words and skip the retained delimiters.
    for i in range(0, len(words), 2):
        words[i] = words[i].capitalize()
    return "".join(words)
# Internal subroutines:
# compile(pat): compile a pattern, caching already compiled patterns
# expand(repl, regs, str): expand \digit escapes in replacement string
# Manage a cache of compiled regular expressions.
#
# If the pattern is a string a compiled version of it is returned. If
# the pattern has been used before we return an already compiled
# version from the cache; otherwise we compile it now and save the
# compiled version in the cache, along with the syntax it was compiled
# with. Instead of a string, a compiled regular expression can also
# be passed.
# Compiled patterns, keyed by (pattern string, regex syntax in effect
# when the pattern was compiled).
cache = {}

def compile(pat):
    """Return a compiled pattern for pat, reusing cached results.

    Anything that is not a string is returned as-is, on the assumption
    that it already is a compiled pattern.  Strings are looked up in the
    module-level cache under (pat, regex.get_syntax()) and compiled with
    regex.compile() on a miss.
    """
    if not isinstance(pat, type('')):
        return pat              # Assume it is a compiled regex
    key = (pat, regex.get_syntax())
    try:
        return cache[key]       # Get it from the cache
    except KeyError:
        prog = cache[key] = regex.compile(pat)
        return prog

def clear_cache():
    """Discard every cached compiled pattern.

    The dictionary is emptied in place rather than replaced, so any
    other reference to it sees the cleared cache as well.
    """
    cache.clear()
# Expand \digit in the replacement.
# Each occurrence of \digit is replaced by the substring of str
# indicated by regs[digit]. To include a literal \ in the
# replacement, double it; other \ escapes are left unchanged (i.e.
# the \ and the following character are both copied).
def expand(repl, regs, str):
    """Expand the backslash escapes in the replacement string repl.

    repl -- replacement text, possibly containing backslash escapes
    regs -- sequence of (start, end) index pairs, indexed by group number
    str  -- the string the indexes in regs refer to

    A backslash followed by a digit is replaced by str[start:end] for
    that group.  A doubled backslash yields one literal backslash.  Any
    other backslash escape, and a lone backslash at the very end, is
    copied through unchanged.  Returns the expanded string.
    """
    if '\\' not in repl:
        return repl             # Nothing to expand
    pieces = []
    i = 0
    n = len(repl)
    while i < n:
        c = repl[i]
        i = i + 1
        if c != '\\' or i >= n:
            # Ordinary character, or a backslash with nothing after it.
            pieces.append(c)
            continue
        c = repl[i]
        i = i + 1
        if '0' <= c <= '9':
            a, b = regs[int(c)]
            pieces.append(str[a:b])
        elif c == '\\':
            pieces.append(c)
        else:
            # Unknown escape: keep both the backslash and the character.
            pieces.append('\\' + c)
    return ''.join(pieces)
# Test program, reads sequences "pat repl str" from stdin.
# Optional argument specifies pattern used to split lines.
def test():
    """Interactive test driver.

    Reads lines from standard input until end of file.  Each line is
    split into fields using the pattern given as the first command line
    argument, or runs of blanks, tabs and newlines when none is given.
    A line must yield exactly three fields: pattern, replacement and
    subject string; the results of sub() and gsub() are then written to
    standard output.  A '--> ' prompt is written to standard error when
    standard input is a terminal.
    """
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    while 1:
        if sys.stdin.isatty():
            sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line:
            break               # End of input
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            sys.stdout.write('Sorry, not three fields\n')
            sys.stdout.write('split: ' + repr(fields) + '\n')
            continue
        # Reuse the fields that were already split off above.
        pat, repl, text = fields
        sys.stdout.write('sub : ' + repr(sub(pat, repl, text)) + '\n')
        sys.stdout.write('gsub: ' + repr(gsub(pat, repl, text)) + '\n')