1996-11-27 15:52:01 -04:00
|
|
|
#! /usr/bin/env python
|
1993-12-14 06:08:02 -04:00
|
|
|
|
|
|
|
class Markov:
    """Markov chain over sliceable sequences such as strings or tuples.

    Training sequences are fed in with put(); get() then generates a new
    sequence by repeatedly picking a recorded follower of the current state.

    Attributes:
        histsize: maximum number of trailing items that form a state key.
        choice: callable that picks one element from a non-empty list
            (for example random.choice).
        trans: dict mapping a state to the list of values seen after it.
            The key None holds the possible starting values (empty
            sequences); a value of None marks the end of a sequence.
    """

    def __init__(self, histsize, choice):
        """Create an empty chain with the given history size and chooser."""
        self.histsize = histsize
        self.choice = choice
        self.trans = {}

    def add(self, state, next):
        """Record that `next` was observed directly after `state`.

        `state` is used as a dictionary key and must therefore be hashable.
        The parameter name `next` shadows the builtin but is kept because it
        is part of the method's signature.
        """
        self.trans.setdefault(state, []).append(next)

    def put(self, seq):
        """Feed one training sequence into the chain.

        `seq` must support len() and slicing, and its slices must be
        hashable (strings and tuples qualify; lists do not).
        """
        n = self.histsize
        add = self.add
        # The start state None leads to an empty sequence of seq's own type.
        add(None, seq[:0])
        for i in range(len(seq)):
            # State: up to n items before position i; follower: item i,
            # kept as a one-element slice so it can be concatenated later.
            add(seq[max(0, i - n):i], seq[i:i + 1])
        # End marker.  max(0, ...) matters when seq is shorter than histsize:
        # a negative start index would slice from the wrong end and file the
        # marker under a key that get() never looks up.
        add(seq[max(0, len(seq) - n):], None)

    def get(self):
        """Generate and return one sequence from the recorded transitions.

        Raises KeyError if nothing has been fed in with put() yet, because
        the start state None is then missing from `trans`.
        """
        choice = self.choice
        trans = self.trans
        n = self.histsize
        seq = choice(trans[None])
        while True:
            state = seq[max(0, len(seq) - n):]
            following = choice(trans[state])
            if not following:
                # None is the end-of-sequence marker recorded by put().
                break
            seq = seq + following
        return seq
|
|
|
|
|
|
|
|
def _wrap_words(words, limit):
    """Greedily pack `words` into lines and return the list of lines.

    A new line is started whenever adding the next word would push the
    running width (word lengths plus one separator each) past `limit`.
    """
    lines = []
    current = []
    width = 0
    for w in words:
        if width + len(w) > limit:
            lines.append(' '.join(current))
            current = []
            width = 0
        current.append(w)
        width = width + len(w) + 1
    lines.append(' '.join(current))
    return lines


def test():
    """Command-line driver: train a Markov chain on text and print output.

    Options and file names are read from sys.argv.  Each input file is
    split into paragraphs at blank lines; every paragraph is normalised to
    single-space-separated words and fed to the chain either as a string
    (character mode) or as a tuple of words (word mode).  Generated
    paragraphs are then printed forever, until the process is interrupted.

    Exits with status 2 after printing the usage text when the options
    cannot be parsed.
    """
    import getopt
    import random
    import sys

    args = sys.argv[1:]
    try:
        # 'q' has to be listed, otherwise the documented -q is rejected.
        opts, args = getopt.getopt(args, '0123456789cdqw')
    except getopt.error:
        print('Usage: markov [-#] [-cddqw] [file] ...')
        print('Options:')
        print('-#: 1-digit history size (default 2)')
        print('-c: characters (default)')
        print('-w: words')
        print('-d: more debugging output')
        print('-q: no debugging output')
        print('Input files (default stdin) are split in paragraphs')
        print('separated blank lines and each paragraph is split')
        print('in words by whitespace, then reconcatenated with')
        print('exactly one space separating words.')
        print('Output consists of paragraphs separated by blank')
        print('lines, where lines are no longer than 72 characters.')
        # Without this exit, execution would continue with `opts` unbound.
        sys.exit(2)

    histsize = 2
    do_words = 0
    debug = 1
    for o, a in opts:
        if o[1:].isdigit():
            # int() instead of eval(): never evaluate command-line text.
            histsize = int(o[1:])
        if o == '-c':
            do_words = 0
        if o == '-d':
            debug = debug + 1
        if o == '-q':
            debug = 0
        if o == '-w':
            do_words = 1
    if not args:
        args = ['-']

    m = Markov(histsize, random.choice)
    try:
        for filename in args:
            if filename == '-':
                f = sys.stdin
                if f.isatty():
                    print('Sorry, need stdin from file')
                    continue
            else:
                f = open(filename, 'r')
            if debug:
                print('processing %s ...' % filename)
            try:
                text = f.read()
            finally:
                f.close()
            for para in text.split('\n\n'):
                if debug > 1:
                    print('feeding ...')
                words = para.split()
                if words:
                    if do_words:
                        # `tuple` is this module's own helper of that name.
                        data = tuple(words)
                    else:
                        data = ' '.join(words)
                    m.put(data)
    except KeyboardInterrupt:
        print('Interrupted -- continue with data read so far')

    if not m.trans:
        print('No valid input files')
        return
    if debug:
        print('done.')
    if debug > 1:
        # Show the start state and every state shorter than the history
        # size.  max(..., 1) keeps the empty state visible when histsize is
        # 0, whether it is '' (character mode) or () (word mode).
        for key in m.trans.keys():
            if key is None or len(key) < max(histsize, 1):
                print('%r %s' % (key, m.trans[key]))
        print('')

    limit = 72
    while True:
        data = m.get()
        if do_words:
            words = data
        else:
            words = data.split()
        # One generated paragraph, followed by a blank separator line.
        sys.stdout.write('\n'.join(_wrap_words(words, limit)) + '\n\n')
|
|
|
|
|
|
|
|
def tuple(list):
    """Return a tuple containing the items of `list`, in order.

    This module-level function shadows the builtin of the same name; the
    name and the parameter name `list` are kept so existing callers keep
    working.
    """
    # The def above rebinds `tuple` in this module, so the builtin
    # constructor is reached through the type of an empty tuple literal.
    # This avoids the former recursive halving, whose `len(list)/2` yields
    # a float (and an invalid slice index) under true division.
    return type(())(list)
|
|
|
|
|
|
|
|
# Run the command-line driver only when executed as a script, so that
# importing this module (for instance to reuse Markov) has no side effects.
if __name__ == '__main__':
    test()
|