#
# partparse.py: parse a by-Guido-written-and-by-Jan-Hein-edited LaTeX file,
#     and generate texinfo source.
#
# This is *not* a good example of good programming practices.  In fact, this
#     file could use a complete rewrite, in order to become faster, more
#     easily extensible and maintainable.
#
# However, I added some comments in a few places for the pitiful person who
#     would ever need to take a look into this file.
#
# Have I been clear enough??
#
# -jh
#
# Yup.  I made some performance improvements and hope this lasts a while;
#     I don't want to be the schmuck who ends up rewriting it!
#
# -fld

import sys, string, regex, getopt, os

from types import IntType, ListType, StringType, TupleType

# Different parse modes for phase 1
MODE_REGULAR = 0
MODE_VERBATIM = 1
MODE_CS_SCAN = 2
MODE_COMMENT = 3
MODE_MATH = 4
MODE_DMATH = 5
MODE_GOBBLEWHITE = 6

the_modes = (MODE_REGULAR, MODE_VERBATIM, MODE_CS_SCAN, MODE_COMMENT,
             MODE_MATH, MODE_DMATH, MODE_GOBBLEWHITE)


# Show the neighbourhood of the scanned buffer: the (at most) ten characters
# before position `where' and the (at most) ten characters starting at it,
# each rendered with repr().
def epsilon(buf, where):
    lo = max(where - 10, 0)
    hi = min(where + 10, len(buf))
    before = repr(buf[lo:where])
    after = repr(buf[where:hi])
    return ' Context %s.%s.' % (before, after)


# Should return the line number (reads the module-level `lineno').
# Never worked.
def lin():
    return ' Line %s.' % repr(lineno)


# Displays the recursion level.
def lv(lvl):
    return ' Level %s.' % repr(lvl)


# Combine the three previous functions into one diagnostic suffix
# (level, line, context).  Used often.
def lle(lvl, buf, where):
    return '%s%s%s' % (lv(lvl), lin(), epsilon(buf, where))

# This class is only needed for _symbolic_ representation of the parse mode.
class Mode:
    # Symbolic name of every parse mode; only used for display.
    _names = {
        MODE_REGULAR: 'MODE_REGULAR',
        MODE_VERBATIM: 'MODE_VERBATIM',
        MODE_CS_SCAN: 'MODE_CS_SCAN',
        MODE_COMMENT: 'MODE_COMMENT',
        MODE_MATH: 'MODE_MATH',
        MODE_DMATH: 'MODE_DMATH',
        MODE_GOBBLEWHITE: 'MODE_GOBBLEWHITE',
    }

    def __init__(self, arg):
        # Only the values listed in the_modes are accepted.
        if arg not in the_modes:
            raise ValueError('mode not in the_modes')
        self.mode = arg

    def __cmp__(self, other):
        # When the types differ, `other' is taken to be a raw mode number
        # and is looked up in the `mode' table before comparing.
        if type(self) != type(other):
            other = mode[other]
        return cmp(self.mode, other.mode)

    def __repr__(self):
        try:
            return self._names[self.mode]
        except KeyError:
            raise ValueError('mode not in the_modes')

# just a wrapper around a class initialisation: one Mode object per mode
mode = {}
for t in the_modes:
    mode[t] = Mode(t)


# After phase 1, the text consists of chunks, with a certain type
# this type will be assigned to the chtype member of the chunk
# the where-field contains the file position where this is found
# and the data field contains (1): a tuple describing start- and end
# positions of the substring (can be used as slice for the buf-variable),
# (2) just a string, mostly generated by the changeit routine,
# or (3) a list, describing a (recursive) subgroup of chunks
PLAIN = 0                       # ASSUME PLAINTEXT, data = the text
GROUP = 1                       # GROUP ({}), data = [chunk, chunk,..]
CSNAME = 2                      # CONTROL SEQ TOKEN, data = the command
COMMENT = 3                     # data is the actual comment
DMATH = 4                       # DISPLAYMATH, data = [chunk, chunk,..]
MATH = 5                        # MATH, see DISPLAYMATH
OTHER = 6                       # CHAR WITH CATCODE OTHER, data = char
ACTIVE = 7                      # ACTIVE CHAR
GOBBLEDWHITE = 8                # Gobbled LWSP, after CSNAME
ENDLINE = 9                     # END-OF-LINE, data = '\n'
DENDLINE = 10                   # DOUBLE EOL, data='\n', indicates \par
ENV = 11                        # LaTeX-environment
                                # data =(envname,[ch,ch,ch,.])
CSLINE = 12                     # for texi: next chunk will be one group
                                # of args.
Will be set all on 1 line IGNORE = 13 # IGNORE this data ENDENV = 14 # TEMP END OF GROUP INDICATOR IF = 15 # IF-directive # data = (flag,negate,[ch, ch, ch,...]) the_types = (PLAIN, GROUP, CSNAME, COMMENT, DMATH, MATH, OTHER, ACTIVE, GOBBLEDWHITE, ENDLINE, DENDLINE, ENV, CSLINE, IGNORE, ENDENV, IF) # class, just to display symbolic name class ChunkType: def __init__(self, chunk_type): if chunk_type not in the_types: raise ValueError, 'chunk_type not in the_types' self.chunk_type = chunk_type def __cmp__(self, other): if type(self) != type(other): other = chunk_type[other] return cmp(self.chunk_type, other.chunk_type) def __repr__(self): if self.chunk_type == PLAIN: return 'PLAIN' elif self.chunk_type == GROUP: return 'GROUP' elif self.chunk_type == CSNAME: return 'CSNAME' elif self.chunk_type == COMMENT: return 'COMMENT' elif self.chunk_type == DMATH: return 'DMATH' elif self.chunk_type == MATH: return 'MATH' elif self.chunk_type == OTHER: return 'OTHER' elif self.chunk_type == ACTIVE: return 'ACTIVE' elif self.chunk_type == GOBBLEDWHITE: return 'GOBBLEDWHITE' elif self.chunk_type == DENDLINE: return 'DENDLINE' elif self.chunk_type == ENDLINE: return 'ENDLINE' elif self.chunk_type == ENV: return 'ENV' elif self.chunk_type == CSLINE: return 'CSLINE' elif self.chunk_type == IGNORE: return 'IGNORE' elif self.chunk_type == ENDENV: return 'ENDENV' elif self.chunk_type == IF: return 'IF' else: raise ValueError, 'chunk_type not in the_types' # ...and the wrapper chunk_type = {} for t in the_types: chunk_type[t] = ChunkType(t) # store a type object of the ChunkType-class-instance... 
chunk_type_type = type(chunk_type[PLAIN])


# this class contains a part of the parsed buffer
class Chunk:
    def __init__(self, chtype, where, data):
        # Accept either a ChunkType object or a raw type number; a raw
        # number is replaced by its entry in the chunk_type table.
        if type(chtype) != chunk_type_type:
            chtype = chunk_type[chtype]
        self.chtype = chtype
        self.where = where
        self.data = data

    def __repr__(self):
        fields = (self.chtype, self.where, self.data)
        return 'chunk' + repr(fields)

# and the wrapper
chunk = Chunk


error = 'partparse.error'

#
# TeX's catcodes...
#
CC_ESCAPE = 0
CC_LBRACE = 1
CC_RBRACE = 2
CC_MATHSHIFT = 3
CC_ALIGNMENT = 4
CC_ENDLINE = 5
CC_PARAMETER = 6
CC_SUPERSCRIPT = 7
CC_SUBSCRIPT = 8
CC_IGNORE = 9
CC_WHITE = 10
CC_LETTER = 11
CC_OTHER = 12
CC_ACTIVE = 13
CC_COMMENT = 14
CC_INVALID = 15

# and the names, indexed by catcode
cc_names = [
    'CC_ESCAPE',
    'CC_LBRACE',
    'CC_RBRACE',
    'CC_MATHSHIFT',
    'CC_ALIGNMENT',
    'CC_ENDLINE',
    'CC_PARAMETER',
    'CC_SUPERSCRIPT',
    'CC_SUBSCRIPT',
    'CC_IGNORE',
    'CC_WHITE',
    'CC_LETTER',
    'CC_OTHER',
    'CC_ACTIVE',
    'CC_COMMENT',
    'CC_INVALID',
]


# Show a list of catcode-name-symbols, e.g. '[CC_ESCAPE, CC_LBRACE]'
def pcl(codelist):
    names = []
    for code in codelist:
        names.append(cc_names[code])
    return '[' + string.join(names, ', ') + ']'

# the name of the catcode (ACTIVE, OTHER, etc.)
def pc(code):
    name = cc_names[code]
    return name


# Which catcodes make the parser stop parsing regular plaintext
regular_stopcodes = [
    CC_ESCAPE, CC_LBRACE, CC_RBRACE, CC_MATHSHIFT,
    CC_ALIGNMENT, CC_PARAMETER, CC_SUPERSCRIPT, CC_SUBSCRIPT,
    CC_IGNORE, CC_ACTIVE, CC_COMMENT, CC_INVALID, CC_ENDLINE,
]

# same for scanning a control sequence name
csname_scancodes = [CC_LETTER]

# same for gobbling LWSP
white_scancodes = [CC_WHITE]
##white_scancodes = [CC_WHITE, CC_ENDLINE]

# make a list of all catcode id's, except for catcode ``other''
all_but_other_codes = list(range(16))
del all_but_other_codes[CC_OTHER]

# when does a comment end
comment_stopcodes = [CC_ENDLINE]


# gather all characters together, specified by a list of catcodes:
# concatenates cc[category] for every category in codelist, skipping
# categories whose entry is empty or None
def code2string(cc, codelist):
    pieces = []
    for category in codelist:
        chars = cc[category]
        if chars:
            pieces.append(chars)
    return string.join(pieces, '')


# automatically generate all characters of catcode other, being the
# complement set: every character code 0..255 that is not listed under
# one of the categories in all_but_other_codes
def make_other_codes(cc):
    taken = [0] * 256
    for category in all_but_other_codes:
        chars = cc[category]
        if chars:
            for c in chars:
                taken[ord(c)] = 1
    others = []
    for code in range(256):
        if not taken[code]:
            others.append(chr(code))
    return string.join(others, '')


# catcode dump (which characters have which catcodes).
# The printing itself is disabled; only the sanity check on the table
# size is still performed.
def dump_cc(name, cc):
    if len(cc) != 16:
        raise TypeError('cc not good cat class')
##    for i in range(16):
##        print pc(i) + '\t' + `cc[i]`

# In the beginning,....
epoch_cc = [None] * 16
##dump_cc('epoch_cc', epoch_cc)


# INITEX
initex_cc = epoch_cc[:]
initex_cc[CC_ESCAPE] = '\\'
initex_cc[CC_ENDLINE] = '\n'
initex_cc[CC_IGNORE] = '\0'
initex_cc[CC_WHITE] = ' '
initex_cc[CC_LETTER] = string.uppercase + string.lowercase
initex_cc[CC_COMMENT] = '%'
initex_cc[CC_INVALID] = '\x7F'
#initex_cc[CC_OTHER] = make_other_codes(initex_cc) I don't need them, anyway
##dump_cc('initex_cc', initex_cc)


# LPLAIN: LaTeX catcode setting (see lplain.tex)
lplain_cc = initex_cc[:]
lplain_cc[CC_LBRACE] = '{'
lplain_cc[CC_RBRACE] = '}'
lplain_cc[CC_MATHSHIFT] = '$'
lplain_cc[CC_ALIGNMENT] = '&'
lplain_cc[CC_PARAMETER] = '#'
lplain_cc[CC_SUPERSCRIPT] = '^\x0B'     # '^' and C-k
lplain_cc[CC_SUBSCRIPT] = '_\x01'       # '_' and C-a
lplain_cc[CC_WHITE] = lplain_cc[CC_WHITE] + '\t'
lplain_cc[CC_ACTIVE] = '~\x0C'          # '~' and C-l
lplain_cc[CC_OTHER] = make_other_codes(lplain_cc)
##dump_cc('lplain_cc', lplain_cc)


# Guido's LaTeX environment catcoded '_' as ``other''
# my own purpose catlist
my_cc = lplain_cc[:]
my_cc[CC_SUBSCRIPT] = my_cc[CC_SUBSCRIPT][1:]   # remove '_' here
my_cc[CC_OTHER] = my_cc[CC_OTHER] + '_'         # add it to OTHER list
dump_cc('my_cc', my_cc)


# needed for un_re, my equivalent for regexp-quote in Emacs
re_meaning = '\\[]^$'


# Return `str' with a backslash put in front of every character that is
# listed in re_meaning.
def un_re(str):
    quoted = []
    for ch in str:
        if ch in re_meaning:
            quoted.append('\\')
        quoted.append(ch)
    return string.join(quoted, '')


# NOTE the negate ('^') operator in *some* of the regexps below
def make_rc_regular(cc):
    # problems here if '[]' are included!!
    stoppers = code2string(cc, regular_stopcodes)
    return regex.compile('[' + stoppers + ']')


def make_rc_cs_scan(cc):
    letters = code2string(cc, csname_scancodes)
    return regex.compile('[^' + letters + ']')


def make_rc_comment(cc):
    stoppers = code2string(cc, comment_stopcodes)
    return regex.compile('[' + stoppers + ']')


def make_rc_endwhite(cc):
    white = code2string(cc, white_scancodes)
    return regex.compile('[^' + white + ']')


# regular: normal mode:
rc_regular = make_rc_regular(my_cc)

# scan: scan a command sequence e.g.
`newlength' or `mbox' or `;', `,' or `$' rc_cs_scan = make_rc_cs_scan(my_cc) rc_comment = make_rc_comment(my_cc) rc_endwhite = make_rc_endwhite(my_cc) # parseit (BUF, PARSEMODE=mode[MODE_REGULAR], START=0, RECURSION-LEVEL=0) # RECURSION-LEVEL will is incremented on entry. # result contains the list of chunks returned # together with this list, the buffer position is returned # RECURSION-LEVEL will be set to zero *again*, when recursively a # {,D}MATH-mode scan has been enetered. # This has been done in order to better check for environment-mismatches def parseit(buf, parsemode=mode[MODE_REGULAR], start=0, lvl=0): global lineno result = [] end = len(buf) if lvl == 0 and parsemode == mode[MODE_REGULAR]: lineno = 1 lvl = lvl + 1 ##print 'parseit(' + epsilon(buf, start) + ', ' + `parsemode` + ', ' + `start` + ', ' + `lvl` + ')' # # some of the more regular modes... # if parsemode in (mode[MODE_REGULAR], mode[MODE_DMATH], mode[MODE_MATH]): cstate = [] newpos = start curpmode = parsemode while 1: where = newpos #print '\tnew round: ' + epsilon(buf, where) if where == end: if lvl > 1 or curpmode != mode[MODE_REGULAR]: # not the way we started... raise EOFError, 'premature end of file.' + lle(lvl, buf, where) # the real ending of lvl-1 parse return end, result pos = rc_regular.search(buf, where) if pos < 0: pos = end if pos != where: newpos, c = pos, chunk(PLAIN, where, (where, pos)) result.append(c) continue # # ok, pos == where and pos != end # foundchar = buf[where] if foundchar in my_cc[CC_LBRACE]: # recursive subgroup parse... newpos, data = parseit(buf, curpmode, where+1, lvl) result.append(chunk(GROUP, where, data)) elif foundchar in my_cc[CC_RBRACE]: if lvl <= 1: raise error, 'ENDGROUP while in base level.' + lle(lvl, buf, where) if lvl == 1 and mode != mode[MODE_REGULAR]: raise error, 'endgroup while in math mode. +lin() + epsilon(buf, where)' return where + 1, result elif foundchar in my_cc[CC_ESCAPE]: # # call the routine that actually deals with # this problem. 
If do_ret is None, than # return the value of do_ret # # Note that handle_cs might call this routine # recursively again... # do_ret, newpos = handlecs(buf, where, curpmode, lvl, result, end) if do_ret != None: return do_ret elif foundchar in my_cc[CC_COMMENT]: newpos, data = parseit(buf, mode[MODE_COMMENT], where+1, lvl) result.append(chunk(COMMENT, where, data)) elif foundchar in my_cc[CC_MATHSHIFT]: # note that recursive calls to math-mode # scanning are called with recursion-level 0 # again, in order to check for bad mathend # if where + 1 != end and buf[where + 1] in my_cc[CC_MATHSHIFT]: # # double mathshift, e.g. '$$' # if curpmode == mode[MODE_REGULAR]: newpos, data = parseit(buf, mode[MODE_DMATH], where + 2, 0) result.append(chunk(DMATH, where, data)) elif curpmode == mode[MODE_MATH]: raise error, 'wrong math delimiiter' + lin() + epsilon(buf, where) elif lvl != 1: raise error, 'bad mathend.' + lle(lvl, buf, where) else: return where + 2, result else: # # single math shift, e.g. '$' # if curpmode == mode[MODE_REGULAR]: newpos, data = parseit(buf, mode[MODE_MATH], where + 1, 0) result.append(chunk(MATH, where, data)) elif curpmode == mode[MODE_DMATH]: raise error, 'wrong math delimiiter' + lin() + epsilon(buf, where) elif lvl != 1: raise error, 'bad mathend.' 
+ lv(lvl, buf, where) else: return where + 1, result elif foundchar in my_cc[CC_IGNORE]: print 'warning: ignored char', `foundchar` newpos = where + 1 elif foundchar in my_cc[CC_ACTIVE]: result.append(chunk(ACTIVE, where, foundchar)) newpos = where + 1 elif foundchar in my_cc[CC_INVALID]: raise error, 'invalid char ' + `foundchar` newpos = where + 1 elif foundchar in my_cc[CC_ENDLINE]: # # after an end of line, eat the rest of # whitespace on the beginning of the next line # this is what LaTeX more or less does # # also, try to indicate double newlines (\par) # lineno = lineno + 1 savedwhere = where newpos, dummy = parseit(buf, mode[MODE_GOBBLEWHITE], where + 1, lvl) if newpos != end and buf[newpos] in my_cc[CC_ENDLINE]: result.append(chunk(DENDLINE, savedwhere, foundchar)) else: result.append(chunk(ENDLINE, savedwhere, foundchar)) else: result.append(chunk(OTHER, where, foundchar)) newpos = where + 1 elif parsemode == mode[MODE_CS_SCAN]: # # scan for a control sequence token. `\ape', `\nut' or `\%' # if start == end: raise EOFError, 'can\'t find end of csname' pos = rc_cs_scan.search(buf, start) if pos < 0: pos = end if pos == start: # first non-letter right where we started the search # ---> the control sequence name consists of one single # character. Also: don't eat white space... if buf[pos] in my_cc[CC_ENDLINE]: lineno = lineno + 1 pos = pos + 1 return pos, (start, pos) else: spos = pos if buf[pos] == '\n': lineno = lineno + 1 spos = pos + 1 pos2, dummy = parseit(buf, mode[MODE_GOBBLEWHITE], spos, lvl) return pos2, (start, pos) elif parsemode == mode[MODE_GOBBLEWHITE]: if start == end: return start, '' pos = rc_endwhite.search(buf, start) if pos < 0: pos = start return pos, (start, pos) elif parsemode == mode[MODE_COMMENT]: pos = rc_comment.search(buf, start) lineno = lineno + 1 if pos < 0: print 'no newline perhaps?' 
raise EOFError, 'can\'t find end of comment' pos = pos + 1 pos2, dummy = parseit(buf, mode[MODE_GOBBLEWHITE], pos, lvl) return pos2, (start, pos) else: raise error, 'Unknown mode (' + `parsemode` + ')' #moreresult = cswitch(buf[x1:x2], buf, newpos, parsemode, lvl) #boxcommands = 'mbox', 'fbox' #defcommands = 'def', 'newcommand' endverbstr = '\\end{verbatim}' re_endverb = regex.compile(un_re(endverbstr)) # # handlecs: helper function for parseit, for the special thing we might # wanna do after certain command control sequences # returns: None or return_data, newpos # # in the latter case, the calling function is instructed to immediately # return with the data in return_data # def handlecs(buf, where, curpmode, lvl, result, end): global lineno # get the control sequence name... newpos, data = parseit(buf, mode[MODE_CS_SCAN], where+1, lvl) saveddata = data s_buf_data = s(buf, data) if s_buf_data in ('begin', 'end'): # skip the expected '{' and get the LaTeX-envname '}' newpos, data = parseit(buf, mode[MODE_REGULAR], newpos+1, lvl) if len(data) != 1: raise error, 'expected 1 chunk of data.' + lle(lvl, buf, where) # yucky, we've got an environment envname = s(buf, data[0].data) s_buf_saveddata = s(buf, saveddata) ##print 'FOUND ' + s(buf, saveddata) + '. Name ' + `envname` + '.' + lv(lvl) if s_buf_saveddata == 'begin' and envname == 'verbatim': # verbatim deserves special treatment pos = re_endverb.search(buf, newpos) if pos < 0: raise error, "%s not found.%s" \ % (`endverbstr`, lle(lvl, buf, where)) result.append(chunk(ENV, where, (envname, [chunk(PLAIN, newpos, (newpos, pos))]))) newpos = pos + len(endverbstr) elif s_buf_saveddata == 'begin': # start parsing recursively... 
If that parse returns # from an '\end{...}', then should the last item of # the returned data be a string containing the ended # environment newpos, data = parseit(buf, curpmode, newpos, lvl) if not data or type(data[-1]) is not StringType: raise error, "missing 'end'" + lle(lvl, buf, where) \ + epsilon(buf, newpos) retenv = data[-1] del data[-1] if retenv != envname: #[`retenv`, `envname`] raise error, 'environments do not match.%s%s' \ % (lle(lvl, buf, where), epsilon(buf, newpos)) result.append(chunk(ENV, where, (retenv, data))) else: # 'end'... append the environment name, as just # pointed out, and order parsit to return... result.append(envname) ##print 'POINT of return: ' + epsilon(buf, newpos) # the tuple will be returned by parseit return (newpos, result), newpos # end of \begin ... \end handling elif s_buf_data[0:2] == 'if': # another scary monster: the 'if' directive flag = s_buf_data[2:] # recursively call parseit, just like environment above.. # the last item of data should contain the if-termination # e.g., 'else' of 'fi' newpos, data = parseit(buf, curpmode, newpos, lvl) if not data or data[-1] not in ('else', 'fi'): raise error, 'wrong if... termination' + \ lle(lvl, buf, where) + epsilon(buf, newpos) ifterm = data[-1] del data[-1] # 0 means dont_negate flag result.append(chunk(IF, where, (flag, 0, data))) if ifterm == 'else': # do the whole thing again, there is only one way # to end this one, by 'fi' newpos, data = parseit(buf, curpmode, newpos, lvl) if not data or data[-1] not in ('fi', ): raise error, 'wrong if...else... termination' \ + lle(lvl, buf, where) \ + epsilon(buf, newpos) ifterm = data[-1] del data[-1] result.append(chunk(IF, where, (flag, 1, data))) #done implicitely: return None, newpos elif s_buf_data in ('else', 'fi'): result.append(s(buf, data)) # order calling party to return tuple return (newpos, result), newpos # end of \if, \else, ... 
\fi handling elif s(buf, saveddata) == 'verb': x2 = saveddata[1] result.append(chunk(CSNAME, where, data)) if x2 == end: raise error, 'premature end of command.' + lle(lvl, buf, where) delimchar = buf[x2] ##print 'VERB: delimchar ' + `delimchar` pos = regex.compile(un_re(delimchar)).search(buf, x2 + 1) if pos < 0: raise error, 'end of \'verb\' argument (' + \ `delimchar` + ') not found.' + \ lle(lvl, buf, where) result.append(chunk(GROUP, x2, [chunk(PLAIN, x2+1, (x2+1, pos))])) newpos = pos + 1 else: result.append(chunk(CSNAME, where, data)) return None, newpos # this is just a function to get the string value if the possible data-tuple def s(buf, data): if type(data) is StringType: return data if len(data) != 2 or not (type(data[0]) is type(data[1]) is IntType): raise TypeError, 'expected tuple of 2 integers' x1, x2 = data return buf[x1:x2] ##length, data1, i = getnextarg(length, buf, pp, i + 1) # make a deep-copy of some chunks def crcopy(r): return map(chunkcopy, r) # copy a chunk, would better be a method of class Chunk... def chunkcopy(ch): if ch.chtype == chunk_type[GROUP]: return chunk(GROUP, ch.where, map(chunkcopy, ch.data)) else: return chunk(ch.chtype, ch.where, ch.data) # get next argument for TeX-macro, flatten a group (insert between) # or return Command Sequence token, or give back one character def getnextarg(length, buf, pp, item): ##wobj = Wobj() ##dumpit(buf, wobj.write, pp[item:min(length, item + 5)]) ##print 'GETNEXTARG, (len, item) =', `length, item` + ' ---> ' + wobj.data + ' <---' while item < length and pp[item].chtype == chunk_type[ENDLINE]: del pp[item] length = length - 1 if item >= length: raise error, 'no next arg.' 
+ epsilon(buf, pp[-1].where) if pp[item].chtype == chunk_type[GROUP]: newpp = pp[item].data del pp[item] length = length - 1 changeit(buf, newpp) length = length + len(newpp) pp[item:item] = newpp item = item + len(newpp) if len(newpp) < 10: wobj = Wobj() dumpit(buf, wobj.write, newpp) ##print 'GETNEXTARG: inserted ' + `wobj.data` return length, item elif pp[item].chtype == chunk_type[PLAIN]: #grab one char print 'WARNING: grabbing one char' if len(s(buf, pp[item].data)) > 1: pp.insert(item, chunk(PLAIN, pp[item].where, s(buf, pp[item].data)[:1])) item, length = item+1, length+1 pp[item].data = s(buf, pp[item].data)[1:] else: item = item+1 return length, item else: ch = pp[item] try: str = `s(buf, ch.data)` except TypeError: str = `ch.data` if len(str) > 400: str = str[:400] + '...' print 'GETNEXTARG:', ch.chtype, 'not handled, data ' + str return length, item # this one is needed to find the end of LaTeX's optional argument, like # item[...] re_endopt = regex.compile(']') # get a LaTeX-optional argument, you know, the square braces '[' and ']' def getoptarg(length, buf, pp, item): wobj = Wobj() dumpit(buf, wobj.write, pp[item:min(length, item + 5)]) ##print 'GETOPTARG, (len, item) =', `length, item` + ' ---> ' + wobj.data + ' <---' if item >= length or \ pp[item].chtype != chunk_type[PLAIN] or \ s(buf, pp[item].data)[0] != '[': return length, item pp[item].data = s(buf, pp[item].data)[1:] if len(pp[item].data) == 0: del pp[item] length = length-1 while 1: if item == length: raise error, 'No end of optional arg found' if pp[item].chtype == chunk_type[PLAIN]: text = s(buf, pp[item].data) pos = re_endopt.search(text) if pos >= 0: pp[item].data = text[:pos] if pos == 0: del pp[item] length = length-1 else: item=item+1 text = text[pos+1:] while text and text[0] in ' \t': text = text[1:] if text: pp.insert(item, chunk(PLAIN, 0, text)) length = length + 1 return length, item item = item+1 # Wobj just add write-requests to the ``data'' attribute class Wobj: data = '' def 
write(self, data): self.data = self.data + data # ignore these commands ignoredcommands = ('bcode', 'ecode') # map commands like these to themselves as plaintext wordsselves = ('UNIX', 'ABC', 'C', 'ASCII', 'EOF', 'LaTeX') # \{ --> {, \} --> }, etc themselves = ('{', '}', ',', '.', '@', ' ', '\n') + wordsselves # these ones also themselves (see argargs macro in myformat.sty) inargsselves = (',', '[', ']', '(', ')') # this is how *I* would show the difference between emph and strong # code 1 means: fold to uppercase markcmds = {'code': ('', ''), 'var': 1, 'emph': ('_', '_'), 'strong': ('*', '*')} # recognise patter {\FONTCHANGE-CMD TEXT} to \MAPPED-FC-CMD{TEXT} fontchanges = {'rm': 'r', 'it': 'i', 'em': 'emph', 'bf': 'b', 'tt': 't'} # transparent for these commands for_texi = ('emph', 'var', 'strong', 'code', 'kbd', 'key', 'dfn', 'samp', 'file', 'r', 'i', 't') # try to remove macros and return flat text def flattext(buf, pp): pp = crcopy(pp) ##print '---> FLATTEXT ' + `pp` wobj = Wobj() i, length = 0, len(pp) while 1: if len(pp) != length: raise 'FATAL', 'inconsistent length' if i >= length: break ch = pp[i] i = i+1 if ch.chtype == chunk_type[PLAIN]: pass elif ch.chtype == chunk_type[CSNAME]: s_buf_data = s(buf, ch.data) if s_buf_data in themselves or hist.inargs and s_buf_data in inargsselves: ch.chtype = chunk_type[PLAIN] elif s_buf_data == 'e': ch.chtype = chunk_type[PLAIN] ch.data = '\\' elif len(s_buf_data) == 1 \ and s_buf_data in onlylatexspecial: ch.chtype = chunk_type[PLAIN] # if it is followed by an empty group, # remove that group, it was needed for # a true space if i < length \ and pp[i].chtype==chunk_type[GROUP] \ and len(pp[i].data) == 0: del pp[i] length = length-1 elif s_buf_data in markcmds.keys(): length, newi = getnextarg(length, buf, pp, i) str = flattext(buf, pp[i:newi]) del pp[i:newi] length = length - (newi - i) ch.chtype = chunk_type[PLAIN] markcmd = s_buf_data x = markcmds[markcmd] if type(x) == TupleType: pre, after = x str = pre+str+after 
elif x == 1: str = string.upper(str) else: raise 'FATAL', 'corrupt markcmds' ch.data = str else: if s_buf_data not in ignoredcommands: print 'WARNING: deleting command ' + s_buf_data print 'PP' + `pp[i-1]` del pp[i-1] i, length = i-1, length-1 elif ch.chtype == chunk_type[GROUP]: length, newi = getnextarg(length, buf, pp, i-1) i = i-1 ## str = flattext(buf, crcopy(pp[i-1:newi])) ## del pp[i:newi] ## length = length - (newi - i) ## ch.chtype = chunk_type[PLAIN] ## ch.data = str else: pass dumpit(buf, wobj.write, pp) ##print 'FLATTEXT: RETURNING ' + `wobj.data` return wobj.data # try to generate node names (a bit shorter than the chapter title) # note that the \nodename command (see elsewhere) overules these efforts def invent_node_names(text): words = string.split(text) ##print 'WORDS ' + `words` if len(words) == 2 \ and string.lower(words[0]) == 'built-in' \ and string.lower(words[1]) not in ('modules', 'functions'): return words[1] if len(words) == 3 and string.lower(words[1]) == 'module': return words[2] if len(words) == 3 and string.lower(words[1]) == 'object': return string.join(words[0:2]) if len(words) > 4 \ and (string.lower(string.join(words[-4:])) \ == 'methods and data attributes'): return string.join(words[:2]) return text re_commas_etc = regex.compile('[,`\'@{}]') re_whitespace = regex.compile('[ \t]*') ##nodenamecmd = next_command_p(length, buf, pp, newi, 'nodename') # look if the next non-white stuff is also a command, resulting in skipping # double endlines (DENDLINE) too, and thus omitting \par's # Sometimes this is too much, maybe consider DENDLINE's as stop def next_command_p(length, buf, pp, i, cmdname): while 1: if i >= len(pp): break ch = pp[i] i = i+1 if ch.chtype == chunk_type[ENDLINE]: continue if ch.chtype == chunk_type[DENDLINE]: continue if ch.chtype == chunk_type[PLAIN]: if re_whitespace.search(s(buf, ch.data)) == 0 and \ re_whitespace.match(s(buf, ch.data)) == len(s(buf, ch.data)): continue return -1 if ch.chtype == chunk_type[CSNAME]: 
if s(buf, ch.data) == cmdname: return i # _after_ the command return -1 return -1 # things that are special to LaTeX, but not to texi.. onlylatexspecial = '_~^$#&%' class Struct: pass hist = Struct() out = Struct() def startchange(): global hist, out hist.inenv = [] hist.nodenames = [] hist.cindex = [] hist.inargs = 0 hist.enumeratenesting, hist.itemizenesting = 0, 0 out.doublenodes = [] out.doublecindeces = [] spacech = [chunk(PLAIN, 0, ' ')] commach = [chunk(PLAIN, 0, ', ')] cindexch = [chunk(CSLINE, 0, 'cindex')] # the standard variation in symbols for itemize itemizesymbols = ['bullet', 'minus', 'dots'] # same for enumerate enumeratesymbols = ['1', 'A', 'a'] ## ## \begin{ {func,data,exc}desc }{name}... ## the resulting texi-code is dependent on the contents of indexsubitem ## # indexsubitem: `['XXX', 'function'] # funcdesc: # deffn {`idxsi`} NAME (FUNCARGS) # indexsubitem: `['XXX', 'method']` # funcdesc: # defmethod {`idxsi[0]`} NAME (FUNCARGS) # indexsubitem: `['in', 'module', 'MODNAME']' # datadesc: # defcv data {`idxsi[1:]`} NAME # excdesc: # defcv exception {`idxsi[1:]`} NAME # funcdesc: # deffn {function of `idxsi[1:]`} NAME (FUNCARGS) # indexsubitem: `['OBJECT', 'attribute']' # datadesc # defcv attribute {`OBJECT`} NAME ## this routine will be called on \begin{funcdesc}{NAME}{ARGS} ## or \funcline{NAME}{ARGS} ## def do_funcdesc(length, buf, pp, i): startpoint = i-1 ch = pp[startpoint] wh = ch.where length, newi = getnextarg(length, buf, pp, i) funcname = chunk(GROUP, wh, pp[i:newi]) del pp[i:newi] length = length - (newi-i) save = hist.inargs hist.inargs = 1 length, newi = getnextarg(length, buf, pp, i) hist.inargs = save del save the_args = [chunk(PLAIN, wh, '()'[0])] + pp[i:newi] + \ [chunk(PLAIN, wh, '()'[1])] del pp[i:newi] length = length - (newi-i) idxsi = hist.indexsubitem # words command = '' cat_class = '' if idxsi and idxsi[-1] in ('method', 'protocol'): command = 'defmethod' cat_class = string.join(idxsi[:-1]) elif len(idxsi) == 2 and idxsi[1] 
== 'function': command = 'deffn' cat_class = string.join(idxsi) elif len(idxsi) == 3 and idxsi[:2] == ['in', 'module']: command = 'deffn' cat_class = 'function of ' + string.join(idxsi[1:]) if not command: raise error, 'don\'t know what to do with indexsubitem ' + `idxsi` ch.chtype = chunk_type[CSLINE] ch.data = command cslinearg = [chunk(GROUP, wh, [chunk(PLAIN, wh, cat_class)])] cslinearg.append(chunk(PLAIN, wh, ' ')) cslinearg.append(funcname) cslinearg.append(chunk(PLAIN, wh, ' ')) l = len(cslinearg) cslinearg[l:l] = the_args pp.insert(i, chunk(GROUP, wh, cslinearg)) i, length = i+1, length+1 hist.command = command return length, i ## this routine will be called on \begin{excdesc}{NAME} ## or \excline{NAME} ## def do_excdesc(length, buf, pp, i): startpoint = i-1 ch = pp[startpoint] wh = ch.where length, newi = getnextarg(length, buf, pp, i) excname = chunk(GROUP, wh, pp[i:newi]) del pp[i:newi] length = length - (newi-i) idxsi = hist.indexsubitem # words command = '' cat_class = '' class_class = '' if len(idxsi) == 2 and idxsi[1] == 'exception': command = 'defvr' cat_class = string.join(idxsi) elif len(idxsi) == 3 and idxsi[:2] == ['in', 'module']: command = 'defcv' cat_class = 'exception' class_class = string.join(idxsi[1:]) elif len(idxsi) == 4 and idxsi[:3] == ['exception', 'in', 'module']: command = 'defcv' cat_class = 'exception' class_class = string.join(idxsi[2:]) if not command: raise error, 'don\'t know what to do with indexsubitem ' + `idxsi` ch.chtype = chunk_type[CSLINE] ch.data = command cslinearg = [chunk(GROUP, wh, [chunk(PLAIN, wh, cat_class)])] cslinearg.append(chunk(PLAIN, wh, ' ')) if class_class: cslinearg.append(chunk(GROUP, wh, [chunk(PLAIN, wh, class_class)])) cslinearg.append(chunk(PLAIN, wh, ' ')) cslinearg.append(excname) pp.insert(i, chunk(GROUP, wh, cslinearg)) i, length = i+1, length+1 hist.command = command return length, i ## same for datadesc or dataline... 
def do_datadesc(length, buf, pp, i): startpoint = i-1 ch = pp[startpoint] wh = ch.where length, newi = getnextarg(length, buf, pp, i) dataname = chunk(GROUP, wh, pp[i:newi]) del pp[i:newi] length = length - (newi-i) idxsi = hist.indexsubitem # words command = '' cat_class = '' class_class = '' if idxsi[-1] in ('attribute', 'option'): command = 'defcv' cat_class = idxsi[-1] class_class = string.join(idxsi[:-1]) elif len(idxsi) == 3 and idxsi[:2] == ['in', 'module']: command = 'defcv' cat_class = 'data' class_class = string.join(idxsi[1:]) elif len(idxsi) == 4 and idxsi[:3] == ['data', 'in', 'module']: command = 'defcv' cat_class = 'data' class_class = string.join(idxsi[2:]) else: command = 'defcv' cat_class = 'data' class_class = string.join(idxsi) ch.chtype = chunk_type[CSLINE] ch.data = command cslinearg = [chunk(GROUP, wh, [chunk(PLAIN, wh, cat_class)])] cslinearg.append(chunk(PLAIN, wh, ' ')) if class_class: cslinearg.append(chunk(GROUP, wh, [chunk(PLAIN, wh, class_class)])) cslinearg.append(chunk(PLAIN, wh, ' ')) cslinearg.append(dataname) pp.insert(i, chunk(GROUP, wh, cslinearg)) i, length = i+1, length+1 hist.command = command return length, i # regular indices: those that are not set in tt font by default.... regindices = ('cindex', ) # remove illegal characters from node names def rm_commas_etc(text): result = '' changed = 0 while 1: pos = re_commas_etc.search(text) if pos >= 0: changed = 1 result = result + text[:pos] text = text[pos+1:] else: result = result + text break if changed: print 'Warning: nodename changhed to ' + `result` return result # boolean flags flags = {'texi': 1} ## ## changeit: the actual routine, that changes the contents of the parsed ## chunks ## def changeit(buf, pp): global onlylatexspecial, hist, out i, length = 0, len(pp) while 1: # sanity check: length should always equal len(pp) if len(pp) != length: raise 'FATAL', 'inconsistent length. 
thought ' + `length` + ', but should really be ' + `len(pp)` if i >= length: break ch = pp[i] i = i + 1 if type(ch) is StringType: #normally, only chunks are present in pp, # but in some cases, some extra info # has been inserted, e.g., the \end{...} clauses raise 'FATAL', 'got string, probably too many ' + `end` if ch.chtype == chunk_type[GROUP]: # check for {\em ...} constructs if ch.data and \ ch.data[0].chtype == chunk_type[CSNAME] and \ s(buf, ch.data[0].data) in fontchanges.keys(): k = s(buf, ch.data[0].data) del ch.data[0] pp.insert(i-1, chunk(CSNAME, ch.where, fontchanges[k])) length, i = length+1, i+1 # recursively parse the contents of the group changeit(buf, ch.data) elif ch.chtype == chunk_type[IF]: # \if... flag, negate, data = ch.data ##print 'IF: flag, negate = ' + `flag, negate` if flag not in flags.keys(): raise error, 'unknown flag ' + `flag` value = flags[flag] if negate: value = (not value) del pp[i-1] length, i = length-1, i-1 if value: pp[i:i] = data length = length + len(data) elif ch.chtype == chunk_type[ENV]: # \begin{...} .... 
envname, data = ch.data #push this environment name on stack hist.inenv.insert(0, envname) #append an endenv chunk after grouped data data.append(chunk(ENDENV, ch.where, envname)) ##[`data`] #delete this object del pp[i-1] i, length = i-1, length-1 #insert found data pp[i:i] = data length = length + len(data) if envname == 'verbatim': pp[i:i] = [chunk(CSLINE, ch.where, 'example'), chunk(GROUP, ch.where, [])] length, i = length+2, i+2 elif envname == 'itemize': if hist.itemizenesting > len(itemizesymbols): raise error, 'too deep itemize nesting' ingroupch = [chunk(CSNAME, ch.where, itemizesymbols[hist.itemizenesting])] hist.itemizenesting = hist.itemizenesting + 1 pp[i:i] = [chunk(CSLINE, ch.where, 'itemize'), chunk(GROUP, ch.where, ingroupch)] length, i = length+2, i+2 elif envname == 'enumerate': if hist.enumeratenesting > len(enumeratesymbols): raise error, 'too deep enumerate nesting' ingroupch = [chunk(PLAIN, ch.where, enumeratesymbols[hist.enumeratenesting])] hist.enumeratenesting = hist.enumeratenesting + 1 pp[i:i] = [chunk(CSLINE, ch.where, 'enumerate'), chunk(GROUP, ch.where, ingroupch)] length, i = length+2, i+2 elif envname == 'description': ingroupch = [chunk(CSNAME, ch.where, 'b')] pp[i:i] = [chunk(CSLINE, ch.where, 'table'), chunk(GROUP, ch.where, ingroupch)] length, i = length+2, i+2 elif (envname == 'tableiii') or (envname == 'tableii'): if (envname == 'tableii'): ltable = 2 else: ltable = 3 wh = ch.where newcode = [] #delete tabular format description # e.g., {|l|c|l|} length, newi = getnextarg(length, buf, pp, i) del pp[i:newi] length = length - (newi-i) newcode.append(chunk(CSLINE, wh, 'table')) ingroupch = [chunk(CSNAME, wh, 'asis')] newcode.append(chunk(GROUP, wh, ingroupch)) newcode.append(chunk(CSLINE, wh, 'item')) #get the name of macro for @item # e.g., {code} length, newi = getnextarg(length, buf, pp, i) if newi-i != 1: raise error, 'Sorry, expected 1 chunk argument' if pp[i].chtype != chunk_type[PLAIN]: raise error, 'Sorry, expected plain 
text argument' hist.itemargmacro = s(buf, pp[i].data) del pp[i:newi] length = length - (newi-i) itembody = [] for count in range(ltable): length, newi = getnextarg(length, buf, pp, i) emphgroup = [ chunk(CSNAME, wh, 'emph'), chunk(GROUP, 0, pp[i:newi])] del pp[i:newi] length = length - (newi-i) if count == 0: itemarg = emphgroup elif count == ltable-1: itembody = itembody + \ [chunk(PLAIN, wh, ' --- ')] + emphgroup else: itembody = emphgroup newcode.append(chunk(GROUP, wh, itemarg)) newcode = newcode + itembody + [chunk(DENDLINE, wh, '\n')] pp[i:i] = newcode l = len(newcode) length, i = length+l, i+l del newcode, l if length != len(pp): raise 'STILL, SOMETHING wrong', `i` elif envname == 'funcdesc': pp.insert(i, chunk(PLAIN, ch.where, '')) i, length = i+1, length+1 length, i = do_funcdesc(length, buf, pp, i) elif envname == 'excdesc': pp.insert(i, chunk(PLAIN, ch.where, '')) i, length = i+1, length+1 length, i = do_excdesc(length, buf, pp, i) elif envname == 'datadesc': pp.insert(i, chunk(PLAIN, ch.where, '')) i, length = i+1, length+1 length, i = do_datadesc(length, buf, pp, i) else: print 'WARNING: don\'t know what to do with env ' + `envname` elif ch.chtype == chunk_type[ENDENV]: envname = ch.data if envname != hist.inenv[0]: raise error, '\'end\' does not match. 
Name ' + `envname` + ', expected ' + `hist.inenv[0]` del hist.inenv[0] del pp[i-1] i, length = i-1, length-1 if envname == 'verbatim': pp[i:i] = [ chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, 'example')])] i, length = i+2, length+2 elif envname == 'itemize': hist.itemizenesting = hist.itemizenesting - 1 pp[i:i] = [ chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, 'itemize')])] i, length = i+2, length+2 elif envname == 'enumerate': hist.enumeratenesting = hist.enumeratenesting-1 pp[i:i] = [ chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, 'enumerate')])] i, length = i+2, length+2 elif envname == 'description': pp[i:i] = [ chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, 'table')])] i, length = i+2, length+2 elif (envname == 'tableiii') or (envname == 'tableii'): pp[i:i] = [ chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, 'table')])] i, length = i+2, length + 2 pp.insert(i, chunk(DENDLINE, ch.where, '\n')) i, length = i+1, length+1 elif envname in ('funcdesc', 'excdesc', 'datadesc'): pp[i:i] = [ chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, hist.command)])] i, length = i+2, length+2 else: print 'WARNING: ending env ' + `envname` + 'has no actions' elif ch.chtype == chunk_type[CSNAME]: # control name transformations s_buf_data = s(buf, ch.data) if s_buf_data == 'optional': pp[i-1].chtype = chunk_type[PLAIN] pp[i-1].data = '[' if (i < length) and \ (pp[i].chtype == chunk_type[GROUP]): cp=pp[i].data pp[i:i+1]=cp + [ chunk(PLAIN, ch.where, ']')] length = length+len(cp) elif s_buf_data in ignoredcommands: del pp[i-1] i, length = i-1, length-1 elif s_buf_data == '@' and \ i != length and \ pp[i].chtype == chunk_type[PLAIN] and \ s(buf, pp[i].data)[0] == '.': # \@. --> \. --> @. ch.data = '.' 
del pp[i] length = length-1 elif s_buf_data == '\\': # \\ --> \* --> @* ch.data = '*' elif len(s_buf_data) == 1 and \ s_buf_data in onlylatexspecial: ch.chtype = chunk_type[PLAIN] # check if such a command is followed by # an empty group: e.g., `\%{}'. If so, remove # this empty group too if i < length and \ pp[i].chtype == chunk_type[GROUP] \ and len(pp[i].data) == 0: del pp[i] length = length-1 elif hist.inargs and s_buf_data in inargsselves: # This is the special processing of the # arguments of the \begin{funcdesc}... or # \funcline... arguments # \, --> , \[ --> [, \] --> ] ch.chtype = chunk_type[PLAIN] elif s_buf_data == 'renewcommand': # \renewcommand{\indexsubitem}.... i, length = i-1, length-1 del pp[i] length, newi = getnextarg(length, buf, pp, i) if newi-i == 1 \ and i < length \ and pp[i].chtype == chunk_type[CSNAME] \ and s(buf, pp[i].data) == 'indexsubitem': del pp[i:newi] length = length - (newi-i) length, newi = getnextarg(length, buf, pp, i) text = flattext(buf, pp[i:newi]) if text[:1] != '(' or text[-1:] != ')': raise error, \ 'expected indexsubitem enclosed in parenteses' words = string.split(text[1:-1]) hist.indexsubitem = words ## print 'set hist.indexsubitem =', words del text, words else: print 'WARNING: renewcommand with unsupported arg removed' del pp[i:newi] length = length - (newi-i) elif s_buf_data == 'item': ch.chtype = chunk_type[CSLINE] length, newi = getoptarg(length, buf, pp, i) ingroupch = pp[i:newi] del pp[i:newi] length = length - (newi-i) pp.insert(i, chunk(GROUP, ch.where, ingroupch)) i, length = i+1, length+1 elif s_buf_data == 'ttindex': idxsi = hist.indexsubitem cat_class = '' if len(idxsi) >= 2 and idxsi[1] in \ ('method', 'function', 'protocol'): command = 'findex' elif len(idxsi) >= 2 and idxsi[1] in \ ('exception', 'object'): command = 'vindex' elif len(idxsi) == 3 and idxsi[:2] == ['in', 'module']: command = 'cindex' else: print 'WARNING: can\'t categorize ' + `idxsi` \ + ' for \'ttindex\' command' command = 'cindex' if 
not cat_class: cat_class = '('+string.join(idxsi)+')' ch.chtype = chunk_type[CSLINE] ch.data = command length, newi = getnextarg(length, buf, pp, i) arg = pp[i:newi] del pp[i:newi] length = length - (newi-i) cat_arg = [chunk(PLAIN, ch.where, cat_class)] # determine what should be set in roman, and # what in tt-font if command in regindices: arg = [chunk(CSNAME, ch.where, 't'), chunk(GROUP, ch.where, arg)] else: cat_arg = [chunk(CSNAME, ch.where, 'r'), chunk(GROUP, ch.where, cat_arg)] ingroupch = arg + \ [chunk(PLAIN, ch.where, ' ')] + \ cat_arg pp.insert(i, chunk(GROUP, ch.where, ingroupch)) length, i = length+1, i+1 elif s_buf_data == 'ldots': # \ldots --> \dots{} --> @dots{} ch.data = 'dots' if i == length \ or pp[i].chtype != chunk_type[GROUP] \ or pp[i].data != []: pp.insert(i, chunk(GROUP, ch.where, [])) i, length = i+1, length+1 elif s_buf_data in themselves: # \UNIX --> UNIX ch.chtype = chunk_type[PLAIN] if i != length \ and pp[i].chtype == chunk_type[GROUP] \ and pp[i].data == []: del pp[i] length = length-1 elif s_buf_data in for_texi: pass elif s_buf_data == 'e': # "\e" --> "\" ch.data = '\\' ch.chtype = chunk_type[PLAIN] elif s_buf_data in ('lineiii', 'lineii'): # This is the most tricky one # \lineiii{a1}{a2}[{a3}] --> # @item @{a1} # a2 [ -- a3] # ##print 'LINEIIIIII!!!!!!!' 
## wobj = Wobj() ## dumpit(buf, wobj.write, pp[i-1:i+5]) ## print '--->' + wobj.data + '<----' if not hist.inenv: raise error, 'no environment for lineiii' if (hist.inenv[0] != 'tableiii') and \ (hist.inenv[0] != 'tableii'): raise error, \ 'wrong command (%s) in wrong environment (%s)' \ % (s_buf_data, `hist.inenv[0]`) ch.chtype = chunk_type[CSLINE] ch.data = 'item' length, newi = getnextarg(length, buf, pp, i) ingroupch = [chunk(CSNAME, 0, hist.itemargmacro), chunk(GROUP, 0, pp[i:newi])] del pp[i:newi] length = length - (newi-i) ## print 'ITEM ARG: --->', ## wobj = Wobj() ## dumpit(buf, wobj.write, ingroupch) ## print wobj.data, '<---' pp.insert(i, chunk(GROUP, ch.where, ingroupch)) grouppos = i i, length = i+1, length+1 length, i = getnextarg(length, buf, pp, i) length, newi = getnextarg(length, buf, pp, i) if newi > i: # we have a 3rd arg pp.insert(i, chunk(PLAIN, ch.where, ' --- ')) i = newi + 1 length = length + 1 ## pp[grouppos].data = pp[grouppos].data \ ## + [chunk(PLAIN, ch.where, ' ')] \ ## + pp[i:newi] ## del pp[i:newi] ## length = length - (newi-i) if length != len(pp): raise 'IN LINEIII IS THE ERR', `i` elif s_buf_data in ('chapter', 'section', 'subsection', 'subsubsection'): #\xxxsection{A} ----> # @node A, , , # @xxxsection A ## also: remove commas and quotes ch.chtype = chunk_type[CSLINE] length, newi = getnextarg(length, buf, pp, i) afternodenamecmd = next_command_p(length, buf, pp, newi, 'nodename') if afternodenamecmd < 0: cp1 = crcopy(pp[i:newi]) pp[i:newi] = [chunk(GROUP, ch.where, pp[i:newi])] length, newi = length - (newi-i) + 1, i+1 text = flattext(buf, cp1) text = invent_node_names(text) else: length, endarg = getnextarg(length, buf, pp, afternodenamecmd) cp1 = crcopy(pp[afternodenamecmd:endarg]) del pp[newi:endarg] length = length - (endarg-newi) pp[i:newi] = [chunk(GROUP, ch.where, pp[i:newi])] length, newi = length - (newi-i) + 1, i + 1 text = flattext(buf, cp1) if text[-1] == '.': text = text[:-1] ## print 'FLATTEXT:', `text` if text in 
hist.nodenames: print 'WARNING: node name ' + `text` + ' already used' out.doublenodes.append(text) else: hist.nodenames.append(text) text = rm_commas_etc(text) pp[i-1:i-1] = [chunk(CSLINE, ch.where, 'node'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, text+', , ,') ])] i, length = newi+2, length+2 elif s_buf_data == 'funcline': # fold it to a very short environment pp[i-1:i-1] = [chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, hist.command)])] i, length = i+2, length+2 length, i = do_funcdesc(length, buf, pp, i) elif s_buf_data == 'dataline': pp[i-1:i-1] = [chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, hist.command)])] i, length = i+2, length+2 length, i = do_datadesc(length, buf, pp, i) elif s_buf_data == 'excline': pp[i-1:i-1] = [chunk(CSLINE, ch.where, 'end'), chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, hist.command)])] i, length = i+2, length+2 length, i = do_excdesc(length, buf, pp, i) elif s_buf_data == 'index': #\index{A} ---> # @cindex A ch.chtype = chunk_type[CSLINE] ch.data = 'cindex' length, newi = getnextarg(length, buf, pp, i) ingroupch = pp[i:newi] del pp[i:newi] length = length - (newi-i) pp.insert(i, chunk(GROUP, ch.where, ingroupch)) length, i = length+1, i+1 elif s_buf_data == 'bifuncindex': ch.chtype = chunk_type[CSLINE] ch.data = 'findex' length, newi = getnextarg(length, buf, pp, i) ingroupch = pp[i:newi] del pp[i:newi] length = length - (newi-i) ingroupch.append(chunk(PLAIN, ch.where, ' ')) ingroupch.append(chunk(CSNAME, ch.where, 'r')) ingroupch.append(chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, '(built-in function)')])) pp.insert(i, chunk(GROUP, ch.where, ingroupch)) length, i = length+1, i+1 elif s_buf_data == 'obindex': ch.chtype = chunk_type[CSLINE] ch.data = 'findex' length, newi = getnextarg(length, buf, pp, i) ingroupch = pp[i:newi] del pp[i:newi] length = length - (newi-i) ingroupch.append(chunk(PLAIN, ch.where, ' ')) ingroupch.append(chunk(CSNAME, 
ch.where, 'r')) ingroupch.append(chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, '(object)')])) pp.insert(i, chunk(GROUP, ch.where, ingroupch)) length, i = length+1, i+1 elif s_buf_data == 'opindex': ch.chtype = chunk_type[CSLINE] ch.data = 'findex' length, newi = getnextarg(length, buf, pp, i) ingroupch = pp[i:newi] del pp[i:newi] length = length - (newi-i) ingroupch.append(chunk(PLAIN, ch.where, ' ')) ingroupch.append(chunk(CSNAME, ch.where, 'r')) ingroupch.append(chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, '(operator)')])) pp.insert(i, chunk(GROUP, ch.where, ingroupch)) length, i = length+1, i+1 elif s_buf_data == 'bimodindex': ch.chtype = chunk_type[CSLINE] ch.data = 'pindex' length, newi = getnextarg(length, buf, pp, i) ingroupch = pp[i:newi] del pp[i:newi] length = length - (newi-i) ingroupch.append(chunk(PLAIN, ch.where, ' ')) ingroupch.append(chunk(CSNAME, ch.where, 'r')) ingroupch.append(chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, '(built-in)')])) pp.insert(i, chunk(GROUP, ch.where, ingroupch)) length, i = length+1, i+1 elif s_buf_data == 'sectcode': ch.data = 'code' elif s_buf_data == 'stmodindex': ch.chtype = chunk_type[CSLINE] # use the program index as module index ch.data = 'pindex' length, newi = getnextarg(length, buf, pp, i) ingroupch = pp[i:newi] del pp[i:newi] length = length - (newi-i) ingroupch.append(chunk(PLAIN, ch.where, ' ')) ingroupch.append(chunk(CSNAME, ch.where, 'r')) ingroupch.append(chunk(GROUP, ch.where, [ chunk(PLAIN, ch.where, '(standard)')])) pp.insert(i, chunk(GROUP, ch.where, ingroupch)) length, i = length+1, i+1 elif s_buf_data == 'stindex': # XXX must actually go to newindex st wh = ch.where ch.chtype = chunk_type[CSLINE] ch.data = 'cindex' length, newi = getnextarg(length, buf, pp, i) ingroupch = [chunk(CSNAME, wh, 'code'), chunk(GROUP, wh, pp[i:newi])] del pp[i:newi] length = length - (newi-i) t = ingroupch[:] t.append(chunk(PLAIN, wh, ' statement')) pp.insert(i, chunk(GROUP, wh, t)) i, length = i+1, length+1 
pp.insert(i, chunk(CSLINE, wh, 'cindex')) i, length = i+1, length+1 t = ingroupch[:] t.insert(0, chunk(PLAIN, wh, 'statement, ')) pp.insert(i, chunk(GROUP, wh, t)) i, length = i+1, length+1 elif s_buf_data == 'indexii': #\indexii{A}{B} ---> # @cindex A B # @cindex B, A length, newi = getnextarg(length, buf, pp, i) cp11 = pp[i:newi] cp21 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) length, newi = getnextarg(length, buf, pp, i) cp12 = pp[i:newi] cp22 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) ch.chtype = chunk_type[CSLINE] ch.data = 'cindex' pp.insert(i, chunk(GROUP, ch.where, cp11 + [ chunk(PLAIN, ch.where, ' ')] + cp12)) i, length = i+1, length+1 pp[i:i] = [chunk(CSLINE, ch.where, 'cindex'), chunk(GROUP, ch.where, cp22 + [ chunk(PLAIN, ch.where, ', ')]+ cp21)] i, length = i+2, length+2 elif s_buf_data == 'indexiii': length, newi = getnextarg(length, buf, pp, i) cp11 = pp[i:newi] cp21 = crcopy(pp[i:newi]) cp31 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) length, newi = getnextarg(length, buf, pp, i) cp12 = pp[i:newi] cp22 = crcopy(pp[i:newi]) cp32 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) length, newi = getnextarg(length, buf, pp, i) cp13 = pp[i:newi] cp23 = crcopy(pp[i:newi]) cp33 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) ch.chtype = chunk_type[CSLINE] ch.data = 'cindex' pp.insert(i, chunk(GROUP, ch.where, cp11 + [ chunk(PLAIN, ch.where, ' ')] + cp12 + [chunk(PLAIN, ch.where, ' ')] + cp13)) i, length = i+1, length+1 pp[i:i] = [chunk(CSLINE, ch.where, 'cindex'), chunk(GROUP, ch.where, cp22 + [ chunk(PLAIN, ch.where, ' ')]+ cp23 + [chunk(PLAIN, ch.where, ', ')] + cp21)] i, length = i+2, length+2 pp[i:i] = [chunk(CSLINE, ch.where, 'cindex'), chunk(GROUP, ch.where, cp33 + [ chunk(PLAIN, ch.where, ', ')]+ cp31 + [chunk(PLAIN, ch.where, ' ')] + cp32)] i, length = i+2, length+2 elif s_buf_data == 'indexiv': length, newi = getnextarg(length, buf, pp, i) cp11 = pp[i:newi] cp21 
= crcopy(pp[i:newi]) cp31 = crcopy(pp[i:newi]) cp41 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) length, newi = getnextarg(length, buf, pp, i) cp12 = pp[i:newi] cp22 = crcopy(pp[i:newi]) cp32 = crcopy(pp[i:newi]) cp42 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) length, newi = getnextarg(length, buf, pp, i) cp13 = pp[i:newi] cp23 = crcopy(pp[i:newi]) cp33 = crcopy(pp[i:newi]) cp43 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) length, newi = getnextarg(length, buf, pp, i) cp14 = pp[i:newi] cp24 = crcopy(pp[i:newi]) cp34 = crcopy(pp[i:newi]) cp44 = crcopy(pp[i:newi]) del pp[i:newi] length = length - (newi-i) ch.chtype = chunk_type[CSLINE] ch.data = 'cindex' ingroupch = cp11 + \ spacech + cp12 + \ spacech + cp13 + \ spacech + cp14 pp.insert(i, chunk(GROUP, ch.where, ingroupch)) i, length = i+1, length+1 ingroupch = cp22 + \ spacech + cp23 + \ spacech + cp24 + \ commach + cp21 pp[i:i] = cindexch + [ chunk(GROUP, ch.where, ingroupch)] i, length = i+2, length+2 ingroupch = cp33 + \ spacech + cp34 + \ commach + cp31 + \ spacech + cp32 pp[i:i] = cindexch + [ chunk(GROUP, ch.where, ingroupch)] i, length = i+2, length+2 ingroupch = cp44 + \ commach + cp41 + \ spacech + cp42 + \ spacech + cp43 pp[i:i] = cindexch + [ chunk(GROUP, ch.where, ingroupch)] i, length = i+2, length+2 ## elif s_buf_data == 'indexsubitem': ## ch.data = flattext(buf, [ch]) ## ch.chtype = chunk_type[PLAIN] elif s_buf_data in ('noindent', 'indexsubitem'): pass else: print "don't know what to do with keyword " + s_buf_data re_atsign = regex.compile('[@{}]') re_newline = regex.compile('\n') def dumpit(buf, wm, pp): global out i, length = 0, len(pp) addspace = 0 while 1: if len(pp) != length: raise 'FATAL', 'inconsistent length' if i == length: break ch = pp[i] i = i + 1 dospace = addspace addspace = 0 if ch.chtype == chunk_type[CSNAME]: s_buf_data = s(buf, ch.data) wm('@' + s_buf_data) if s_buf_data == 'node' and \ pp[i].chtype == chunk_type[PLAIN] and \ 
s(buf, pp[i].data) in out.doublenodes: ##XXX doesnt work yet?? wm(' ZZZ-' + zfill(`i`, 4)) if s_buf_data[0] in string.letters: addspace = 1 elif ch.chtype == chunk_type[PLAIN]: if dospace and s(buf, ch.data) not in (' ', '\t'): wm(' ') text = s(buf, ch.data) while 1: pos = re_atsign.search(text) if pos < 0: break wm(text[:pos] + '@' + text[pos]) text = text[pos+1:] wm(text) elif ch.chtype == chunk_type[GROUP]: wm('{') dumpit(buf, wm, ch.data) wm('}') elif ch.chtype == chunk_type[DENDLINE]: wm('\n\n') while i != length and pp[i].chtype in \ (chunk_type[DENDLINE], chunk_type[ENDLINE]): i = i + 1 elif ch.chtype == chunk_type[OTHER]: wm(s(buf, ch.data)) elif ch.chtype == chunk_type[ACTIVE]: wm(s(buf, ch.data)) elif ch.chtype == chunk_type[ENDLINE]: wm('\n') elif ch.chtype == chunk_type[CSLINE]: if i >= 2 and pp[i-2].chtype not in \ (chunk_type[ENDLINE], chunk_type[DENDLINE]) \ and (pp[i-2].chtype != chunk_type[PLAIN] or s(buf, pp[i-2].data)[-1] != '\n'): wm('\n') wm('@' + s(buf, ch.data)) if i == length: raise error, 'CSLINE expected another chunk' if pp[i].chtype != chunk_type[GROUP]: raise error, 'CSLINE expected GROUP' if type(pp[i].data) != ListType: raise error, 'GROUP chould contain []-data' wobj = Wobj() dumpit(buf, wobj.write, pp[i].data) i = i + 1 text = wobj.data del wobj if text: wm(' ') while 1: pos = re_newline.search(text) if pos < 0: break print 'WARNING: found newline in csline arg' wm(text[:pos] + ' ') text = text[pos+1:] wm(text) if i >= length or \ pp[i].chtype not in (chunk_type[CSLINE], chunk_type[ENDLINE], chunk_type[DENDLINE]) \ and (pp[i].chtype != chunk_type[PLAIN] or s(buf, pp[i].data)[0] != '\n'): wm('\n') elif ch.chtype == chunk_type[COMMENT]: ## print 'COMMENT: previous chunk =', pp[i-2] ## if pp[i-2].chtype == chunk_type[PLAIN]: ## print 'PLAINTEXT =', `s(buf, pp[i-2].data)` if s(buf, ch.data) and \ regex.match('^[ \t]*$', s(buf, ch.data)) < 0: if i >= 2 \ and pp[i-2].chtype not in (chunk_type[ENDLINE], chunk_type[DENDLINE]) \ and not 
(pp[i-2].chtype == chunk_type[PLAIN] and regex.match('\\(.\\|\n\\)*[ \t]*\n$', s(buf, pp[i-2].data)) >= 0): wm('\n') wm('@c ' + s(buf, ch.data)) elif ch.chtype == chunk_type[IGNORE]: pass else: try: str = `s(buf, ch.data)` except TypeError: str = `ch.data` if len(str) > 400: str = str[:400] + '...' print 'warning:', ch.chtype, 'not handled, data ' + str def main(): outfile = None headerfile = 'texipre.dat' trailerfile = 'texipost.dat' try: opts, args = getopt.getopt(sys.argv[1:], 'o:h:t:') except getopt.error: args = [] if not args: print 'usage: partparse [-o outfile] [-h headerfile]', print '[-t trailerfile] file ...' sys.exit(2) for opt, arg in opts: if opt == '-o': outfile = arg if opt == '-h': headerfile = arg if opt == '-t': trailerfile = arg if not outfile: root, ext = os.path.splitext(args[0]) outfile = root + '.texi' if outfile in args: print 'will not overwrite input file', outfile sys.exit(2) outf = open(outfile, 'w') outf.write(open(headerfile, 'r').read()) for file in args: if len(args) > 1: print '='*20, file, '='*20 buf = open(file, 'r').read() w, pp = parseit(buf) startchange() changeit(buf, pp) dumpit(buf, outf.write, pp) outf.write(open(trailerfile, 'r').read()) outf.close() if __name__ == "__main__": main()