#! /usr/bin/env perl # html2texi.pl -- Convert HTML documentation to Texinfo format # Michael Ernst # Time-stamp: <1999-01-12 21:34:27 mernst> # This program converts HTML documentation trees into Texinfo format. # Given the name of a main (or contents) HTML file, it processes that file, # and other files (transitively) referenced by it, into a Texinfo file # (whose name is chosen from the file or directory name of the argument). # For instance: # html2texi.pl api/index.html # produces file "api.texi". # Texinfo format can be easily converted to Info format (for browsing in # Emacs or the standalone Info browser), to a printed manual, or to HTML. # Thus, html2texi.pl permits conversion of HTML files to Info format, and # secondarily enables producing printed versions of Web page hierarchies. # Unlike HTML, Info format is searchable. Since Info is integrated into # Emacs, one can read documentation without starting a separate Web # browser. Additionally, Info browsers (including Emacs) contain # convenient features missing from Web browsers, such as easy index lookup # and mouse-free browsing. # Limitations: # html2texi.pl is currently tuned to latex2html output (and it corrects # several latex2html bugs), but should be extensible to arbitrary HTML # documents. It will be most useful for HTML with a hierarchical structure # and an index, and it recognizes those features as created by latex2html # (and possibly by some other tools). The HTML tree to be traversed must # be on local disk, rather than being accessed via HTTP. # This script requires the use of "checkargs.pm". To eliminate that # dependence, replace calls to check_args* by @_ (which is always the last # argument to those functions). # Also see the "to do" section, below. # Comments, suggestions, bug fixes, and enhancements are welcome. # Troubleshooting: # Malformed HTML can cause this program to abort, so # you should check your HTML files to make sure they are legal. ### ### Typical usage for the Python documentation: ### # (Actually, most of this is in a Makefile instead.) # The resulting Info format Python documentation is currently available at # ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz # Fix up HTML problems, eg
should be
. # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html # Edit the generated .texi files: # * change @setfilename to prefix "python-" # * fix up any sectioning, such as for Abstract # * make Texinfo menus # * perhaps remove the @detailmenu ... @end detailmenu # In Emacs, to do all this: # (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer)) # makeinfo api.texi # makeinfo ext.texi # makeinfo lib.texi # makeinfo mac.texi # makeinfo ref.texi # makeinfo tut.texi ### ### Structure of the code ### # To be written... ### ### Design decisions ### # Source and destination languages # -------------------------------- # # The goal is Info files; I create Texinfo, so I don't have to worry about # the finer details of Info file creation. (I'm not even sure of its exact # format.) # # Why not start from LaTeX rather than HTML? # I could hack latex2html itself to produce Texinfo instead, or fix up # partparse.py (which already translates LaTeX to Teinfo). # Pros: # * has high-level information such as index entries, original formatting # Cons: # * those programs are complicated to read and understand # * those programs try to handle arbitrary LaTeX input, track catcodes, # and more: I don't want to go to that effort. HTML isn't as powerful # as LaTeX, so there are fewer subtleties. # * the result wouldn't work for arbitrary HTML documents; it would be # nice to eventually extend this program to HTML produced from Docbook, # Frame, and more. # Parsing # ------- # # I don't want to view the text as a linear stream; I'd rather parse the # whole thing and then do pattern matching over the parsed representation (to # find idioms such as indices, lists of child nodes, etc.). # * Perl provides HTML::TreeBuilder, which does just what I want. # * libwww-perl: http://www.linpro.no/lwp/ # * TreeBuilder: HTML-Tree-0.51.tar.gz # * Python Parsers, Formatters, and Writers don't really provide the right # interface (and the version in Grail doesn't correspond to another # distributed version, so I'm confused about which to be using). I could # write something in Python that creates a parse tree, but why bother? # Other implementation language issues: # * Python lacks variable declarations, reasonable scoping, and static # checking tools. I've written some of the latter for myself that make # my Perl programming a lot safer than my Python programming will be until # I have a similar suite for that language. ########################################################################### ### To do ### # Section names: # Fix the problem with multiple sections in a single file (eg, Abstract in # Front Matter section). # Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310 # Index: # Perhaps double-check that every tag mentioned in the index is found # in the text. # Python: email to docs@python.org, to get their feedback. # Compare to existing lib/ Info manual # Write the hooks into info-look; replace pyliblookup1-1.tar.gz. # Postpass to remove extra quotation marks around typography already in # a different font (to avoid double delimiters as in "`code'"); or # perhaps consider using only font-based markup so that we don't get # the extra *bold* and `code' markup in Info. ## Perhaps don't rely on automatic means for adding up, next, prev; I have ## all that info available to me already, so it's not so much trouble to ## add it. (Right?) But it is *so* easy to use Emacs instead... ########################################################################### ### Strictures ### # man HTML::TreeBuilder # man HTML::Parser # man HTML::Element # require HTML::ParserWComment; require HTML::Parser; require HTML::TreeBuilder; require HTML::Element; use File::Basename; use strict; # use Carp; use checkargs; ########################################################################### ### Variables ### my @section_stack = (); # elements are chapter/section/subsec nodetitles (I think) my $current_ref_tdf; # for the file currently being processed; # used in error messages my $html_directory; my %footnotes; # First element should not be used. my @sectionmarker = ("manual", "chapter", "section", "subsection", "subsubsection"); my %inline_markup = ("b" => "strong", "code" => "code", "i" => "emph", "kbd" => "kbd", "samp" => "samp", "strong" => "strong", "tt" => "code", "var" => "var"); my @deferred_index_entries = (); my @index_titles = (); # list of (filename, type) lists my %index_info = ("Index" => ["\@blindex", "bl"], "Concept Index" => ["\@cindex", "cp"], "Module Index" => ["\@mdindex", "md"]); ########################################################################### ### Main/contents page ### # Process first-level page on its own, or just a contents page? Well, I do # want the title, author, etc., and the front matter... For now, just add # that by hand at the end. # data structure possibilities: # * tree-like (need some kind of stack when processing (or parent pointers)) # * list of name and depth; remember old and new depths. # Each element is a reference to a list of (nodetitle, depth, filename). my @contents_list = (); # The problem with doing fixups on the fly is that some sections may have # already been processed (and no longer available) by the time we notice # others with the same name. It's probably better to fully construct the # contents list (reading in all files of interest) upfront; that will also # let me do a better job with cross-references, because again, all files # will already be read in. my %contents_hash = (); my %contents_fixups = (); my @current_contents_list = (); # Merge @current_contents_list into @contents_list, # and set @current_contents_list to be empty. sub merge_contents_lists ( ) { check_args(0, @_); # Three possibilities: # * @contents_list is empty: replace it by @current_contents_list. # * prefixes of the two lists are identical: do nothing # * @current_contents_list is all at lower level than $contents_list[0]; # prefix @contents_list by @current_contents_list if (scalar(@current_contents_list) == 0) { die "empty current_contents_list"; } # if (scalar(@contents_list) == 0) # { @contents_list = @current_contents_list; # @current_contents_list = (); # return; } # if (($ {$contents_list[0]}[1]) < ($ {$current_contents_list[0]}[1])) # { unshift @contents_list, @current_contents_list; # @current_contents_list = (); # return; } for (my $i=0; $i= scalar(@contents_list)) { push @contents_list, $ref_c_tdf; my $title = $ {$ref_c_tdf}[0]; if (defined $contents_hash{$title}) { $contents_fixups{$title} = 1; } else { $contents_hash{$title} = 1; } next; } my $ref_tdf = $contents_list[$i]; my ($title, $depth, $file) = @{$ref_tdf}; my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf}; if (($title ne $c_title) && ($depth < $c_depth) && ($file ne $c_file)) { splice @contents_list, $i, 0, $ref_c_tdf; if (defined $contents_hash{$c_title}) { $contents_fixups{$c_title} = 1; } else { $contents_hash{$c_title} = 1; } next; } if (($title ne $c_title) || ($depth != $c_depth) || ($file ne $c_file)) { die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:", "\n main: <<<$title>>> $depth $file", "\n curr: <<<$c_title>>> $c_depth $c_file"); } } @current_contents_list = (); } # Set @current_contents_list to a list of (title, href, sectionlevel); # then merge that list into @contents_list. # Maybe this function should also produce a map # from title (or href) to sectionlevel (eg "chapter"?). sub process_child_links ( $ ) { my ($he) = check_args(1, @_); # $he->dump(); if (scalar(@current_contents_list) != 0) { die "current_contents_list nonempty: @current_contents_list"; } $he->traverse(\&increment_current_contents_list, 'ignore text'); # Normalize the depths; for instance, convert 1,3,5 into 0,1,2. my %depths = (); for my $ref_tdf (@current_contents_list) { $depths{$ {$ref_tdf}[1]} = 1; } my @sorted_depths = sort keys %depths; my $current_depth = scalar(@section_stack)-1; my $current_depth_2 = $ {$current_ref_tdf}[1]; if ($current_depth != $current_depth_2) { die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); } for (my $i=0; $i 1) && ($ {$current_contents_list[1]}[0] eq "Contents")) { my $ref_first_tdf = shift @current_contents_list; $current_contents_list[0] = $ref_first_tdf; } for (my $i=0; $itag eq "li") { my @li_content = @{$he->content}; if ($li_content[0]->tag ne "a") { die "first element of
  • should be "; } my ($name, $href, @content) = anchor_info($li_content[0]); # unused $name my $title = join("", collect_texts($li_content[0])); $title = texi_remove_punctuation($title); # The problem with these is that they are formatted differently in # @menu and @node! $title =~ s/``/\"/g; $title =~ s/''/\"/g; $title =~ s/ -- / /g; push @current_contents_list, [ $title, $depth, $href ]; } return 1; } # Simple version for section titles sub html_to_texi ( $ ) { my ($he) = check_args(1, @_); if (!ref $he) { return $he; } my $tag = $he->tag; if (exists $inline_markup{$tag}) { my $result = "\@$inline_markup{$tag}\{"; for my $elt (@{$he->content}) { $result .= html_to_texi($elt); } $result .= "\}"; return $result; } else { $he->dump(); die "html_to_texi confused by <$tag>"; } } sub print_contents_list () { check_args(0, @_); print STDERR "Contents list:\n"; for my $ref_tdf (@contents_list) { my ($title, $depth, $file) = @{$ref_tdf}; print STDERR "$title $depth $file\n"; } } ########################################################################### ### Index ### my $l2h_broken_link_name = "l2h-"; # map from file to (map from anchor name to (list of index texts)) # (The list is needed when a single LaTeX command like \envvar # expands to multiple \index commands.) my %file_index_entries = (); my %this_index_entries; # map from anchor name to (list of index texts) my %file_index_entries_broken = (); # map from file to (list of index texts) my @this_index_entries_broken; my $index_prefix = ""; my @index_prefixes = (); my $this_indexing_command; sub print_index_info () { check_args(0, @_); my ($key, $val); for my $file (sort keys %file_index_entries) { my %index_entries = %{$file_index_entries{$file}}; print STDERR "file: $file\n"; for my $aname (sort keys %index_entries) { my @entries = @{$index_entries{$aname}}; if (scalar(@entries) == 1) { print STDERR " $aname : $entries[0]\n"; } else { print STDERR " $aname : ", join("\n " . (" " x length($aname)), @entries), "\n"; } } } for my $file (sort keys %file_index_entries_broken) { my @entries = @{$file_index_entries_broken{$file}}; print STDERR "file: $file\n"; for my $entry (@entries) { print STDERR " $entry\n"; } } } sub process_index_file ( $$ ) { my ($file, $indexing_command) = check_args(2, @_); # print "process_index_file $file $indexing_command\n"; my $he = file_to_tree($html_directory . $file); # $he->dump(); $this_indexing_command = $indexing_command; $he->traverse(\&process_if_index_dl_compact, 'ignore text'); undef $this_indexing_command; # print "process_index_file done\n"; } sub process_if_index_dl_compact ( $$$ ) { my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument if (!$startflag) { return; } if (($he->tag() eq "dl") && (defined $he->attr('compact'))) { process_index_dl_compact($he); return 0; } else { return 1; } } # The elements of a
    list from a LaTeX2HTML index: # * a single space: text to be ignored # *
    elements with an optional
    element following each one # Two types of
    elements: # * Followed by a
    element: the
    contains a single # string, and the
    contains a whitespace string to be ignored, a #
    to be recursively processed (with the
    string as a # prefix), and a whitespace string to be ignored. # * Not followed by a
    element: contains a list of anchors # and texts (ignore the texts, which are only whitespace and commas). # Optionally contains a
    to be recursively processed (with # the
    string as a prefix) sub process_index_dl_compact ( $ ) { my ($h) = check_args(1, @_); my @content = @{$h->content()}; for (my $i = 0; $i < scalar(@content); $i++) { my $this_he = $content[$i]; if ($this_he->tag ne "dt") { $this_he->dump(); die "Expected
    tag: " . $this_he->tag; } if (($i < scalar(@content) - 1) && ($content[$i+1]->tag eq "dd")) { process_index_dt_and_dd($this_he, $content[$i+1]); $i++; } else { process_index_lone_dt($this_he); } } } # Argument is a
    element. If it contains more than one anchor, then # the texts of all subsequent ones are "[Link]". Example: #
    # # "$PATH" # ", " # # "[Link]" # Optionally contains a
    as well. Example: #
    # # "attribute" #
    #
    # # "assignment" # ", " # # "[Link]" #
    # # "assignment, class" sub process_index_lone_dt ( $ ) { my ($dt) = check_args(1, @_); my @dtcontent = @{$dt->content()}; my $acontent; my $acontent_suffix; for my $a (@dtcontent) { if ($a eq ", ") { next; } if (!ref $a) { $dt->dump; die "Unexpected
    string element: $a"; } if ($a->tag eq "dl") { push @index_prefixes, $index_prefix; if (!defined $acontent_suffix) { die "acontent_suffix not yet defined"; } $index_prefix .= $acontent_suffix . ", "; process_index_dl_compact($a); $index_prefix = pop(@index_prefixes); return; } if ($a->tag ne "a") { $dt->dump; $a->dump; die "Expected anchor in lone
    "; } my ($aname, $ahref, @acontent) = anchor_info($a); # unused $aname if (scalar(@acontent) != 1) { die "Expected just one content of in
    : @acontent"; } if (ref $acontent[0]) { $acontent[0]->dump; die "Expected string content of in
    : $acontent[0]"; } if (!defined($acontent)) { $acontent = $index_prefix . $acontent[0]; $acontent_suffix = $acontent[0]; } elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0]))) { die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; } if (!defined $ahref) { $dt->dump; die "no HREF in nachor in
    "; } my ($ahref_file, $ahref_name) = split(/\#/, $ahref); if (!defined $ahref_name) { # Reference to entire file $ahref_name = ""; } if ($ahref_name eq $l2h_broken_link_name) { if (!exists $file_index_entries_broken{$ahref_file}) { $file_index_entries_broken{$ahref_file} = []; } push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent"; next; } if (!exists $file_index_entries{$ahref_file}) { $file_index_entries{$ahref_file} = {}; } # Don't do this! It appears to make a copy, which is not desired. # my %index_entries = %{$file_index_entries{$ahref_file}}; if (!exists $ {$file_index_entries{$ahref_file}}{$ahref_name}) { $ {$file_index_entries{$ahref_file}}{$ahref_name} = []; } # { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name}; # if ($acontent eq $oldcontent) # { die "Multiple identical index entries?"; } # die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; } push @{$ {$file_index_entries{$ahref_file}}{$ahref_name}}, "$this_indexing_command $acontent"; # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n"; } } sub process_index_dt_and_dd ( $$ ) { my ($dt, $dd) = check_args(2, @_); my $dtcontent; { my @dtcontent = @{$dt->content()}; if ((scalar(@dtcontent) != 1) || (ref $dtcontent[0])) { $dd->dump; $dt->dump; die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of
    : @dtcontent"; } $dtcontent = $dtcontent[0]; $dtcontent =~ s/ +$//; } my $ddcontent; { my @ddcontent = @{$dd->content()}; if (scalar(@ddcontent) != 1) { die "Expected single
    content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n "; } $ddcontent = $ddcontent[0]; } if ($ddcontent->tag ne "dl") { die "Expected
    as content of
    , but saw: $ddcontent"; } push @index_prefixes, $index_prefix; $index_prefix .= $dtcontent . ", "; process_index_dl_compact($ddcontent); $index_prefix = pop(@index_prefixes); } ########################################################################### ### Ordinary sections ### sub process_section_file ( $$$ ) { my ($file, $depth, $nodetitle) = check_args(3, @_); my $he = file_to_tree(($file =~ /^\//) ? $file : $html_directory . $file); # print STDERR "process_section_file: $file $depth $nodetitle\n"; # Equivalently: # while ($depth >= scalar(@section_stack)) { pop(@section_stack); } @section_stack = @section_stack[0..$depth-1]; # Not a great nodename fixup scheme; need a more global view if ((defined $contents_fixups{$nodetitle}) && (scalar(@section_stack) > 0)) { my $up_title = $section_stack[$#section_stack]; # hack for Python Standard Library $up_title =~ s/^(Built-in|Standard) Module //g; my ($up_first_word) = split(/ /, $up_title); $nodetitle = "$up_first_word $nodetitle"; } push @section_stack, $nodetitle; # print STDERR "new section_stack: ", join(", ", @section_stack), "\n"; $he->traverse(\&process_if_child_links, 'ignore text'); %footnotes = (); # $he->dump; $he->traverse(\&process_if_footnotes, 'ignore text'); # $he->dump; if (exists $file_index_entries{$file}) { %this_index_entries = %{$file_index_entries{$file}}; # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n"; } else { # print STDERR "Warning: no index entries for file $file\n"; %this_index_entries = (); } if (exists $file_index_entries_broken{$file}) { @this_index_entries_broken = @{$file_index_entries_broken{$file}}; } else { # print STDERR "Warning: no index entries for file $file\n"; @this_index_entries_broken = (); } if ($he->tag() ne "html") { die "Expected at top level"; } my @content = @{$he->content()}; if ((!ref $content[0]) or ($content[0]->tag ne "head")) { $he->dump; die " not first element of "; } if ((!ref $content[1]) or ($content[1]->tag ne "body")) { $he->dump; die " not second element of "; } $content[1]->traverse(\&output_body); } # stack of things we're inside that are preventing indexing from occurring now. # These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?) my @index_deferrers = (); sub push_or_pop_index_deferrers ( $$ ) { my ($tag, $startflag) = check_args(2, @_); if ($startflag) { push @index_deferrers, $tag; } else { my $old_deferrer = pop @index_deferrers; if ($tag ne $old_deferrer) { die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = ", join(" ", @index_deferrers); } do_deferred_index_entries(); } } sub label_add_index_entries ( $;$ ) { my ($label, $he) = check_args_range(1, 2, @_); # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n"; # $he is the anchor element if (exists $this_index_entries{$label}) { push @deferred_index_entries, @{$this_index_entries{$label}}; return; } if ($label eq $l2h_broken_link_name) { # Try to find some text to use in guessing which links should point here # I should probably only look at the previous element, or if that is # all punctuation, the one before it; collecting all the previous texts # is a bit of overkill. my @anchor_texts = collect_texts($he); my @previous_texts = collect_texts($he->parent, $he); # 4 elements is arbitrary; ought to filter out punctuation and small words # first, then perhaps keep fewer. Perhaps also filter out formatting so # that we can see a larger chunk of text? (Probably not.) # Also perhaps should do further chunking into words, in case the # index term isn't a chunk of its own (eg, was in .... my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]); my $guessed = 0; for my $text (@candidate_texts) { # my $orig_text = $text; if ($text =~ /^[\"\`\'().?! ]*$/) { next; } if (length($text) <= 2) { next; } # hack for Python manual; maybe defer until failure first time around? $text =~ s/^sys\.//g; for my $iterm (@this_index_entries_broken) { # I could test for zero: LaTeX2HTML's failures in the Python # documentation are only for items of the form "... (built-in...)" if (index($iterm, $text) != -1) { push @deferred_index_entries, $iterm; # print STDERR "Guessing index term `$iterm' for text `$orig_text'\n"; $guessed = 1; } } } if (!$guessed) { # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n"; } } } # Need to add calls to this at various places. # Perhaps add HTML::Element argument and do the check for appropriateness # here (ie, no action if inside

    , etc.). sub do_deferred_index_entries () { check_args(0, @_); if ((scalar(@deferred_index_entries) > 0) && (scalar(@index_deferrers) == 0)) { print TEXI "\n", join("\n", @deferred_index_entries), "\n"; @deferred_index_entries = (); } } my $table_columns; # undefined if not in a table my $table_first_column; # boolean sub output_body ( $$$ ) { my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument if (!ref $he) { my $space_index = index($he, " "); if ($space_index != -1) { # Why does # print TEXI texi_quote(substr($he, 0, $space_index+1)); # give: Can't locate object method "TEXI" via package "texi_quote" # (Because the definition texi_quote hasn't been seen yet.) print TEXI &texi_quote(substr($he, 0, $space_index+1)); do_deferred_index_entries(); print TEXI &texi_quote(substr($he, $space_index+1)); } else { print TEXI &texi_quote($he); } return; } my $tag = $he->tag(); # Ordinary text markup first if (exists $inline_markup{$tag}) { if ($startflag) { print TEXI "\@$inline_markup{$tag}\{"; } else { print TEXI "\}"; } } elsif ($tag eq "a") { my ($name, $href, @content) = anchor_info($he); if (!$href) { # This anchor is only here for indexing/cross referencing purposes. if ($startflag) { label_add_index_entries($name, $he); } } elsif ($href =~ "^(ftp|http|news):") { if ($startflag) { # Should avoid second argument if it's identical to the URL. print TEXI "\@uref\{$href, "; } else { print TEXI "\}"; } } elsif ($href =~ /^\#(foot[0-9]+)$/) { # Footnote if ($startflag) { # Could double-check name and content, but I'm not # currently storing that information. print TEXI "\@footnote\{"; $footnotes{$1}->traverse(\&output_body); print TEXI "\}"; return 0; } } else { if ($startflag) { # cross-references are not active Info links, but no text is lost print STDERR "Can't deal with internal HREF anchors yet:\n"; $he->dump; } } } elsif ($tag eq "br") { print TEXI "\@\n"; } elsif ($tag eq "body") { } elsif ($tag eq "center") { if (has_single_content_string($he) && ($ {$he->content}[0] =~ /^ *$/)) { return 0; } if ($startflag) { print TEXI "\n\@center\n"; } else { print TEXI "\n\@end center\n"; } } elsif ($tag eq "div") { my $align = $he->attr('align'); if (defined($align) && ($align eq "center")) { if (has_single_content_string($he) && ($ {$he->content}[0] =~ /^ *$/)) { return 0; } if ($startflag) { print TEXI "\n\@center\n"; } else { print TEXI "\n\@end center\n"; } } } elsif ($tag eq "dl") { # Recognize "
     ... 
    " paradigm for "@example" if (has_single_content_with_tag($he, "dd")) { my $he_dd = $ {$he->content}[0]; if (has_single_content_with_tag($he_dd, "pre")) { my $he_pre = $ {$he_dd->content}[0]; print_pre($he_pre); return 0; } } if ($startflag) { # Could examine the elements, to be cleverer about formatting. # (Also to use ftable, vtable...) print TEXI "\n\@table \@asis\n"; } else { print TEXI "\n\@end table\n"; } } elsif ($tag eq "dt") { push_or_pop_index_deferrers($tag, $startflag); if ($startflag) { print TEXI "\n\@item "; } else { } } elsif ($tag eq "dd") { if ($startflag) { print TEXI "\n"; } else { } if (scalar(@index_deferrers) != 0) { $he->dump; die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; } do_deferred_index_entries(); } elsif ($tag =~ /^(font|big|small)$/) { # Do nothing for now. } elsif ($tag =~ /^h[1-6]$/) { # We don't need this because we never recursively enter the heading content. # push_or_pop_index_deferrers($tag, $startflag); my $secname = ""; my @seclabels = (); for my $elt (@{$he->content}) { if (!ref $elt) { $secname .= $elt; } elsif ($elt->tag eq "br") { } elsif ($elt->tag eq "a") { my ($name, $href, @acontent) = anchor_info($elt); if ($href) { $he->dump; $elt->dump; die "Nonsimple anchor in <$tag>"; } if (!defined $name) { die "No NAME for anchor in $tag"; } push @seclabels, $name; for my $subelt (@acontent) { $secname .= html_to_texi($subelt); } } else { $secname .= html_to_texi($elt); } } if ($secname eq "") { die "No section name in <$tag>"; } if (scalar(@section_stack) == 1) { if ($section_stack[-1] ne "Top") { die "Not top? $section_stack[-1]"; } print TEXI "\@settitle $secname\n"; print TEXI "\@c %**end of header\n"; print TEXI "\n"; print TEXI "\@node Top\n"; print TEXI "\n"; } else { print TEXI "\n\@node $section_stack[-1]\n"; print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; } for my $seclabel (@seclabels) { label_add_index_entries($seclabel); } # This should only happen once per file. label_add_index_entries(""); if (scalar(@index_deferrers) != 0) { $he->dump; die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; } do_deferred_index_entries(); return 0; } elsif ($tag eq "hr") { } elsif ($tag eq "ignore") { # Hack for ignored elements return 0; } elsif ($tag eq "li") { if ($startflag) { print TEXI "\n\n\@item\n"; do_deferred_index_entries(); } } elsif ($tag eq "ol") { if ($startflag) { print TEXI "\n\@enumerate \@bullet\n"; } else { print TEXI "\n\@end enumerate\n"; } } elsif ($tag eq "p") { if ($startflag) { print TEXI "\n\n"; } if (scalar(@index_deferrers) != 0) { $he->dump; die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; } do_deferred_index_entries(); } elsif ($tag eq "pre") { print_pre($he); return 0; } elsif ($tag eq "table") { # Could also indicate common formatting for first column, or # determine relative widths for columns (or determine a prototype row) if ($startflag) { if (defined $table_columns) { $he->dump; die "Can't deal with table nested inside $table_columns-column table"; } $table_columns = table_columns($he); if ($table_columns < 2) { $he->dump; die "Column with $table_columns columns?"; } elsif ($table_columns == 2) { print TEXI "\n\@table \@asis\n"; } else { print TEXI "\n\@multitable \@columnfractions"; for (my $i=0; $i<$table_columns; $i++) { print TEXI " ", 1.0/$table_columns; } print TEXI "\n"; } } else { if ($table_columns == 2) { print TEXI "\n\@end table\n"; } else { print TEXI "\n\@end multitable\n"; } undef $table_columns; } } elsif (($tag eq "td") || ($tag eq "th")) { if ($startflag) { if ($table_first_column) { print TEXI "\n\@item "; $table_first_column = 0; } elsif ($table_columns > 2) { print TEXI "\n\@tab "; } } else { print TEXI "\n"; } } elsif ($tag eq "tr") { if ($startflag) { $table_first_column = 1; } } elsif ($tag eq "ul") { if ($startflag) { print TEXI "\n\@itemize \@bullet\n"; } else { print TEXI "\n\@end itemize\n"; } } else { # I used to have a newline before "output_body" here. print STDERR "output_body: ignoring <$tag> tag\n"; $he->dump; return 0; } return 1; } sub print_pre ( $ ) { my ($he_pre) = check_args(1, @_); if (!has_single_content_string($he_pre)) { die "Multiple or non-string content for
    : ", @{$he_pre->content}; }
      my $pre_content = $ {$he_pre->content}[0];
      print TEXI "\n\@example";
      print TEXI &texi_quote($pre_content);
      print TEXI "\@end example\n";
    }
    
    sub table_columns ( $ )
    { my ($table) = check_args(1, @_);
      my $result = 0;
      for my $row (@{$table->content})
        { if ($row->tag ne "tr")
    	{ $table->dump;
    	  $row->dump;
    	  die "Expected  as table row."; }
          $result = max($result, scalar(@{$row->content})); }
      return $result;
    }
    
    
    ###########################################################################
    ### Utilities
    ###
    
    sub min ( $$ )
    { my ($x, $y) = check_args(2, @_);
      return ($x < $y) ? $x : $y;
    }
    
    sub max ( $$ )
    { my ($x, $y) = check_args(2, @_);
      return ($x > $y) ? $x : $y;
    }
    
    sub file_to_tree ( $ )
    { my ($file) = check_args(1, @_);
    
      my $tree = new HTML::TreeBuilder;
      $tree->ignore_unknown(1);
      # $tree->warn(1);
      $tree->parse_file($file);
      cleanup_parse_tree($tree);
      return $tree
    }
    
    
    sub has_single_content ( $ )
    { my ($he) = check_args(1, @_);
      if (!ref $he)
        { # return 0;
          die "Non-reference argument: $he"; }
      my $ref_content = $he->content;
      if (!defined $ref_content)
        { return 0; }
      my @content = @{$ref_content};
      if (scalar(@content) != 1)
        { return 0; }
      return 1;
    }
    
    
    # Return true if the content of the element contains only one element itself,
    # and that inner element has the specified tag.
    sub has_single_content_with_tag ( $$ )
    { my ($he, $tag) = check_args(2, @_);
      if (!has_single_content($he))
        { return 0; }
      my $content = $ {$he->content}[0];
      if (!ref $content)
        { return 0; }
      my $content_tag = $content->tag;
      if (!defined $content_tag)
        { return 0; }
      return $content_tag eq $tag;
    }
    
    sub has_single_content_string ( $ )
    { my ($he) = check_args(1, @_);
      if (!has_single_content($he))
        { return 0; }
      my $content = $ {$he->content}[0];
      if (ref $content)
        { return 0; }
      return 1;
    }
    
    
    # Return name, href, content.  First two may be undefined; third is an array.
    # I don't see how to determine if there are more attributes.
    sub anchor_info ( $ )
    { my ($he) = check_args(1, @_);
      if ($he->tag ne "a")
        { $he->dump;
          die "passed non-anchor to anchor_info"; }
      my $name = $he->attr('name');
      my $href = $he->attr('href');
      my @content = ();
      { my $ref_content = $he->content;
        if (defined $ref_content)
          { @content = @{$ref_content}; } }
      return ($name, $href, @content);
    }
    
    
    sub texi_quote ( $ )
    { my ($text) = check_args(1, @_);
      $text =~ s/([\@\{\}])/\@$1/g;
      $text =~ s/ -- / --- /g;
      return $text;
    }
    
    # Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles.
    sub texi_remove_punctuation ( $ )
    { my ($text) = check_args(1, @_);
    
      $text =~ s/^ +//g;
      $text =~ s/[ :]+$//g;
      $text =~ s/^[1-9][0-9.]* +//g;
      $text =~ s/,//g;
      # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
      # gets converted into " - ", just as "---" would be converted into " -- ",
      # so the names end up differing.)
      # $text =~ s/:/ -- /g;
      $text =~ s/://g;
      return $text;
    }
    
    
    ## Do not use this inside `traverse':  it throws off the traversal.  Use
    ## html_replace_by_ignore or html_replace_by_meta instead.
    # Returns 1 if success, 0 if failure.
    sub html_remove ( $;$ )
    { my ($he, $parent) = check_args_range(1, 2, @_);
      if (!defined $parent)
        { $parent = $he->parent; }
      my $ref_pcontent = $parent->content;
      my @pcontent = @{$ref_pcontent};
      for (my $i=0; $iparent(undef);
    	  return 1; } }
      die "Didn't find $he in $parent";
    }
    
    
    sub html_replace ( $$;$ )
    { my ($orig, $new, $parent) = check_args_range(2, 3, @_);
      if (!defined $parent)
        { $parent = $orig->parent; }
      my $ref_pcontent = $parent->content;
      my @pcontent = @{$ref_pcontent};
      for (my $i=0; $iparent($parent);
    	  $orig->parent(undef);
    	  return 1; } }
      die "Didn't find $orig in $parent";
    }
    
    sub html_replace_by_meta ( $;$ )
    { my ($orig, $parent) = check_args_range(1, 2, @_);
      my $meta = new HTML::Element "meta";
      if (!defined $parent)
        { $parent = $orig->parent; }
      return html_replace($orig, $meta, $parent);
    }
    
    sub html_replace_by_ignore ( $;$ )
    { my ($orig, $parent) = check_args_range(1, 2, @_);
      my $ignore = new HTML::Element "ignore";
      if (!defined $parent)
        { $parent = $orig->parent; }
      return html_replace($orig, $ignore, $parent);
    }
    
    
    
    ###
    ### Collect text elements
    ###
    
    my @collected_texts;
    my $collect_texts_stoppoint;
    my $done_collecting;
    
    sub collect_texts ( $;$ )
    { my ($root, $stop) = check_args_range(1, 2, @_);
      # print STDERR "collect_texts: $root $stop\n";
      $collect_texts_stoppoint = $stop;
      $done_collecting = 0;
      @collected_texts = ();
      $root->traverse(\&collect_if_text); # process texts
      # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
      return @collected_texts;
    }
    
    sub collect_if_text ( $$$ )
    { my $he = (check_args(3, @_))[0]; #  ignore depth and startflag arguments
      if ($done_collecting)
        { return 0; }
      if (!defined $he)
        { return 0; }
      if (!ref $he)
        { push @collected_texts, $he;
          return 0; }
      if ((defined $collect_texts_stoppoint) && ($he eq $collect_texts_stoppoint))
        { $done_collecting = 1;
          return 0; }
      return 1;
    }
    
    
    ###########################################################################
    ### Clean up parse tree
    ###
    
    sub cleanup_parse_tree ( $ )
    { my ($he) = check_args(1, @_);
      $he->traverse(\&delete_if_navigation, 'ignore text');
      $he->traverse(\&delete_extra_spaces, 'ignore text');
      $he->traverse(\&merge_dl, 'ignore text');
      $he->traverse(\&reorder_dt_and_dl, 'ignore text');
      return $he;
    }
    
    
    ## Simpler version that deletes contents but not the element itself.
    # sub delete_if_navigation ( $$$ )
    # { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
    #   if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
    #     { $he->delete();
    #       return 0; }
    #   else
    #     { return 1; }
    # }
    
    sub delete_if_navigation ( $$$ )
    { my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
      if (!$startflag)
        { return; }
    
      if (($he->tag() eq "div") && (defined $he->attr('class')) && ($he->attr('class') eq 'navigation'))
        { my $ref_pcontent = $he->parent()->content();
          # Don't try to modify @pcontent, which appears to be a COPY.
          # my @pcontent = @{$ref_pcontent};
          for (my $i = 0; $idelete();
          return 0; }
      else
        { return 1; }
    }
    
    sub delete_extra_spaces ( $$$ )
    { my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
      if (!$startflag)
        { return; }
    
      my $tag = $he->tag;
      if ($tag =~ /^(head|html|table|tr|ul)$/)
        { delete_child_spaces($he); }
      delete_trailing_spaces($he);
      return 1;
    }
    
    
    sub delete_child_spaces ( $ )
    { my ($he) = check_args(1, @_);
      my $ref_content = $he->content();
      for (my $i = 0; $icontent();
      if (! defined $ref_content)
        { return; }
      # Could also check for previous element = /^h[1-6]$/.
      for (my $i = 0; $itag =~ /^(br|dd|dl|dt|hr|p|ul)$/))
    	    { splice(@{$ref_content}, $i, 1);
    	      $i--; } } }
      if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/)
        { my $last_elt = $ {$ref_content}[$#{$ref_content}];
          if ((defined $last_elt) && ($last_elt =~ /^ *$/))
    	{ pop @{$ref_content}; } }
    }
    
    
    # LaTeX2HTML sometimes creates
    #   
    text #
    text # which should actually be: #
    #
    text #
    text # Since a
    gets added, this ends up looking like #

    #

    #
    # text1... #
    #
    # text2... # dt_or_dd1... # dt_or_dd2... # which should become #

    #

    #
    # text1... #
    # text2... # dt_or_dd1... # dt_or_dd2... sub reorder_dt_and_dl ( $$$ ) { my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument if (!$startflag) { return; } if ($he->tag() eq "p") { my $ref_pcontent = $he->content(); if (defined $ref_pcontent) { my @pcontent = @{$ref_pcontent}; # print "reorder_dt_and_dl found a

    \n"; $he->dump(); if ((scalar(@pcontent) >= 1) && (ref $pcontent[0]) && ($pcontent[0]->tag() eq "dl") && $pcontent[0]->implicit()) { my $ref_dlcontent = $pcontent[0]->content(); # print "reorder_dt_and_dl found a

    and implicit

    \n"; if (defined $ref_dlcontent) { my @dlcontent = @{$ref_dlcontent}; if ((scalar(@dlcontent) >= 1) && (ref $dlcontent[0]) && ($dlcontent[0]->tag() eq "dt")) { my $ref_dtcontent = $dlcontent[0]->content(); # print "reorder_dt_and_dl found a

    , implicit

    , and
    \n"; if (defined $ref_dtcontent) { my @dtcontent = @{$ref_dtcontent}; if ((scalar(@dtcontent) > 0) && (ref $dtcontent[$#dtcontent]) && ($dtcontent[$#dtcontent]->tag() eq "dl")) { my $ref_dl2content = $dtcontent[$#dtcontent]->content(); # print "reorder_dt_and_dl found a

    , implicit

    ,
    , and
    \n"; if (defined $ref_dl2content) { my @dl2content = @{$ref_dl2content}; if ((scalar(@dl2content) > 0) && (ref ($dl2content[0])) && ($dl2content[0]->tag() eq "dd")) { # print "reorder_dt_and_dl found a

    , implicit

    ,
    ,
    , and
    \n"; # print STDERR "CHANGING\n"; $he->dump(); html_replace_by_ignore($dtcontent[$#dtcontent]); splice(@{$ref_dlcontent}, 1, 0, @dl2content); # print STDERR "CHANGED TO:\n"; $he->dump(); return 0; # don't traverse children } } } } } } } } } return 1; } # If we find a paragraph that looks like #

    #


    #