...
: ", @{$he_pre->content}; } my $pre_content = $ {$he_pre->content}[0]; print TEXI "\n\@example"; print TEXI &texi_quote($pre_content); print TEXI "\@end example\n"; }

# Return the largest number of children found in any row of $table.
# Dies (after dumping the offending nodes) if a child of the table is not
# a <tr> element.
sub table_columns ( $ )
{ my ($table) = check_args(1, @_);
  my $result = 0;
  for my $row (@{$table->content || []})
    { # Text nodes are plain strings, so guard before calling ->tag on them.
      if (!(ref $row) || ($row->tag ne "tr"))
        { $table->dump;
          if (ref $row)
            { $row->dump; }
          die "Expected <tr> as table row."; }
      # An empty row may have no content reference at all.
      $result = max($result, scalar(@{$row->content || []})); }
  return $result;
}


###########################################################################
### Utilities
###

# Return the numerically smaller of the two arguments.
sub min ( $$ )
{ my ($x, $y) = check_args(2, @_);
  return ($x < $y) ? $x : $y;
}

# Return the numerically larger of the two arguments.
sub max ( $$ )
{ my ($x, $y) = check_args(2, @_);
  return ($x > $y) ? $x : $y;
}

# Parse the HTML file $file and return the parse tree after passing it
# through cleanup_parse_tree.  Dies if the file cannot be parsed.
sub file_to_tree ( $ )
{ my ($file) = check_args(1, @_);

  my $tree = HTML::TreeBuilder->new;
  $tree->ignore_unknown(1);
  # $tree->warn(1);
  # parse_file returns undef when the file cannot be opened; without this
  # check the caller would silently continue with an empty tree.
  defined($tree->parse_file($file))
    or die "Cannot parse $file: $!";
  cleanup_parse_tree($tree);
  return $tree;
}

# Return 1 if the element $he has exactly one child (element or text),
# 0 otherwise.  Dies if $he is not a reference.
sub has_single_content ( $ )
{ my ($he) = check_args(1, @_);
  if (!ref $he)
    { # return 0;
      die "Non-reference argument: $he"; }
  my $ref_content = $he->content;
  if (!defined $ref_content)
    { return 0; }
  return (scalar(@{$ref_content}) == 1) ? 1 : 0;
}

# Return true if the content of the element contains only one element itself,
# and that inner element has the specified tag.
sub has_single_content_with_tag ( $$ )
{ my ($he, $tag) = check_args(2, @_);
  if (!has_single_content($he))
    { return 0; }
  my $content = $ {$he->content}[0];
  if (!ref $content)
    { return 0; }
  my $content_tag = $content->tag;
  if (!defined $content_tag)
    { return 0; }
  return $content_tag eq $tag;
}

# Return 1 if the element's only child is a text node (a non-reference),
# 0 otherwise.
sub has_single_content_string ( $ )
{ my ($he) = check_args(1, @_);
  if (!has_single_content($he))
    { return 0; }
  my $content = $ {$he->content}[0];
  if (ref $content)
    { return 0; }
  return 1;
}


# Return name, href, content.  First two may be undefined; third is an array.
# I don't see how to determine if there are more attributes.
sub anchor_info ( $ )
{ my ($he) = check_args(1, @_);
  # Refuse anything that is not an <a> element, dumping it first for diagnosis.
  unless ($he->tag eq "a")
    { $he->dump;
      die "passed non-anchor to anchor_info"; }
  my $name = $he->attr('name');
  my $href = $he->attr('href');
  # An undefined content reference is reported as an empty list of children.
  my $children = $he->content;
  my @content = (defined $children) ? @{$children} : ();
  return ($name, $href, @content);
}


# Escape the characters @, { and } by prefixing each with @, and widen
# " -- " to " --- ".  Returns the modified copy of $text.
sub texi_quote ( $ )
{ my ($text) = check_args(1, @_);
  for ($text)
    { s/([\@\{\}])/\@$1/g;
      s/ -- / --- /g; }
  return $text;
}

# Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles.
# Strips leading blanks, trailing blanks/colons, a leading section number,
# and every comma and colon; returns the cleaned copy of $text.
sub texi_remove_punctuation ( $ )
{ my ($text) = check_args(1, @_);

  for ($text)
    { s/^ +//g;
      s/[ :]+$//g;
      s/^[1-9][0-9.]* +//g;
      s/,//g;
      # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
      # gets converted into " - ", just as "---" would be converted into " -- ",
      # so the names end up differing.)
      # s/:/ -- /g;
      s/://g; }
  return $text;
}


## Do not use this inside `traverse':  it throws off the traversal.  Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Returns 1 if success, 0 if failure.
# Remove $he from the content list of $parent (default: $he's own parent)
# and clear $he's parent pointer.  Returns 1 on success; dies if $he is not
# among the parent's children.
# NOTE(review): the loop header was truncated in the source; it has been
# rebuilt from the surviving statements -- confirm against the original.
sub html_remove ( $;$ )
{ my ($he, $parent) = check_args_range(1, 2, @_);
  if (!defined $parent)
    { $parent = $he->parent; }
  # Work on the live content array, not a copy, so the removal sticks.
  my $ref_pcontent = $parent->content || [];
  for my $i (0 .. $#{$ref_pcontent})
    { if ($ {$ref_pcontent}[$i] eq $he)
        { splice(@{$ref_pcontent}, $i, 1);
          $he->parent(undef);
          return 1; } }
  die "Didn't find $he in $parent";
}

# Replace $orig by $new in the content list of $parent (default: $orig's
# own parent), fixing both parent pointers.  Returns 1 on success; dies if
# $orig is not among the parent's children.
# NOTE(review): the loop header was truncated in the source; it has been
# rebuilt from the surviving statements -- confirm against the original.
sub html_replace ( $$;$ )
{ my ($orig, $new, $parent) = check_args_range(2, 3, @_);
  if (!defined $parent)
    { $parent = $orig->parent; }
  my $ref_pcontent = $parent->content || [];
  for my $i (0 .. $#{$ref_pcontent})
    { if ($ {$ref_pcontent}[$i] eq $orig)
        { $ {$ref_pcontent}[$i] = $new;
          $new->parent($parent);
          $orig->parent(undef);
          return 1; } }
  die "Didn't find $orig in $parent";
}

# Replace $orig by a fresh, empty "meta" element (see html_replace).
sub html_replace_by_meta ( $;$ )
{ my ($orig, $parent) = check_args_range(1, 2, @_);
  my $meta = HTML::Element->new("meta");
  if (!defined $parent)
    { $parent = $orig->parent; }
  return html_replace($orig, $meta, $parent);
}

# Replace $orig by a fresh, empty "ignore" element (see html_replace).
sub html_replace_by_ignore ( $;$ )
{ my ($orig, $parent) = check_args_range(1, 2, @_);
  my $ignore = HTML::Element->new("ignore");
  if (!defined $parent)
    { $parent = $orig->parent; }
  return html_replace($orig, $ignore, $parent);
}


###
### Collect text elements
###

# State shared between collect_texts and its traversal callback.
my @collected_texts;
my $collect_texts_stoppoint;
my $done_collecting;

# Return the text nodes found while traversing $root, in traversal order.
# If $stop is given, collection ends when that element is reached.
sub collect_texts ( $;$ )
{ my ($root, $stop) = check_args_range(1, 2, @_);
  # print STDERR "collect_texts: $root $stop\n";
  $collect_texts_stoppoint = $stop;
  $done_collecting = 0;
  @collected_texts = ();
  $root->traverse(\&collect_if_text); # process texts
  # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
  return @collected_texts;
}

# Traversal callback for collect_texts: records text nodes and returns 0
# (do not descend) once the stop point has been seen.
sub collect_if_text ( $$$ )
{ my $he = (check_args(3, @_))[0]; # ignore depth and startflag arguments
  if ($done_collecting)
    { return 0; }
  if (!defined $he)
    { return 0; }
  if (!ref $he)
    { push @collected_texts, $he;
      return 0; }
  if ((defined $collect_texts_stoppoint) && ($he eq $collect_texts_stoppoint))
    { $done_collecting = 1;
      return 0; }
  return 1;
}


###########################################################################
### Clean up parse tree
###

# Run the clean-up passes over the tree rooted at $he, in this order:
# navigation removal, space removal, <dl> merging, <dt>/<dl> reordering.
# Returns $he.
sub cleanup_parse_tree ( $ )
{ my ($he) = check_args(1, @_);
  $he->traverse(\&delete_if_navigation, 'ignore text');
  $he->traverse(\&delete_extra_spaces, 'ignore text');
  $he->traverse(\&merge_dl, 'ignore text');
  $he->traverse(\&reorder_dt_and_dl, 'ignore text');
  return $he;
}


## Simpler version that deletes contents but not the element itself.
# sub delete_if_navigation ( $$$ )
# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
#   if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
#     { $he->delete();
#       return 0; }
#   else
#     { return 1; }
# }

# Traversal callback: unlink and delete every <div class="navigation">.
# Returns 0 for a deleted element (nothing left to descend into), else 1.
# NOTE(review): the loop was truncated in the source; it has been rebuilt
# from the surviving comment and statements -- confirm against the original.
sub delete_if_navigation ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  if (($he->tag() eq "div") && (defined $he->attr('class')) && ($he->attr('class') eq 'navigation'))
    { my $ref_pcontent = $he->parent()->content();
      # Don't try to modify @pcontent, which appears to be a COPY.
      # my @pcontent = @{$ref_pcontent};
      for my $i (0 .. $#{$ref_pcontent})
        { if ($ {$ref_pcontent}[$i] eq $he)
            { splice(@{$ref_pcontent}, $i, 1);
              last; } }
      $he->delete();
      return 0; }
  else
    { return 1; }
}

# Traversal callback: drop whitespace-only text children of structural
# elements (head, html, table, tr, ul), then trim spaces via
# delete_trailing_spaces.
sub delete_extra_spaces ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  my $tag = $he->tag;
  if ($tag =~ /^(head|html|table|tr|ul)$/)
    { delete_child_spaces($he); }
  delete_trailing_spaces($he);
  return 1;
}

# Remove every child of $he that consists only of spaces.
# NOTE(review): the body was truncated in the source; it has been rebuilt
# from the name and from the matching test in delete_trailing_spaces --
# confirm against the original.
sub delete_child_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $ref_content = $he->content();
  if (! defined $ref_content)
    { return; }
  for (my $i = 0; $i < scalar(@{$ref_content}); $i++)
    { if ($ {$ref_content}[$i] =~ /^ *$/)
        { splice(@{$ref_content}, $i, 1);
          $i--; } }
}

# Remove space-only text that immediately precedes a block-level child
# (br, dd, dl, dt, hr, p, ul), and a space-only last child of
# dd/dt/h1-h6/li/p elements.
# NOTE(review): the sub header and first loop were truncated in the source
# and have been rebuilt -- confirm against the original.
sub delete_trailing_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $ref_content = $he->content();
  if (! defined $ref_content)
    { return; }
  # Could also check for previous element = /^h[1-6]$/.
  for (my $i = 0; $i < scalar(@{$ref_content})-1; $i++)
    { if ($ {$ref_content}[$i] =~ /^ *$/)
        { my $next_elt = $ {$ref_content}[$i+1];
          if ((ref $next_elt) && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/))
            { splice(@{$ref_content}, $i, 1);
              $i--; } } }
  if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/)
    { my $last_elt = $ {$ref_content}[$#{$ref_content}];
      if ((defined $last_elt) && ($last_elt =~ /^ *$/))
        { pop @{$ref_content}; } }
}


# LaTeX2HTML sometimes creates
#   <dt>text
#   <dl><dd>text
# which should actually be:
#   <dl>
#   <dt>text
#   <dd>text
# Since a <dl> gets added, this ends up looking like
# <p>
#   <dl>
#     <dt>
#       text1...
#       <dl>
#         <dd>
#           text2...
#         dt_or_dd1...
#         dt_or_dd2...
# which should become
# <p>
#   <dl>
#     <dt>
#       text1...
#     <dd>
#       text2...
#     dt_or_dd1...
#     dt_or_dd2...
#
# Traversal callback.  Looks for a <p> whose first child is an implicit
# <dl>, whose first child is a <dt>, whose last child is a <dl> that starts
# with a <dd>.  When found, the inner <dl> is replaced by an "ignore"
# element and its children are spliced into the outer <dl> right after the
# <dt>.  Returns 0 (do not descend) after a rewrite and 1 otherwise.
sub reorder_dt_and_dl ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  # Each guard below bails out with 1 ("keep traversing") as soon as the
  # expected shape is not present.
  if ($he->tag() ne "p")
    { return 1; }
  my $p_kids = $he->content();
  if (!defined $p_kids)
    { return 1; }

  # <p> must start with an implicit <dl>.
  my @p_children = @{$p_kids};
  if (!((scalar(@p_children) >= 1)
        && (ref $p_children[0])
        && ($p_children[0]->tag() eq "dl")
        && $p_children[0]->implicit()))
    { return 1; }
  my $dl_kids = $p_children[0]->content();
  if (!defined $dl_kids)
    { return 1; }

  # That <dl> must start with a <dt>.
  my @dl_children = @{$dl_kids};
  if (!((scalar(@dl_children) >= 1)
        && (ref $dl_children[0])
        && ($dl_children[0]->tag() eq "dt")))
    { return 1; }
  my $dt_kids = $dl_children[0]->content();
  if (!defined $dt_kids)
    { return 1; }

  # The <dt> must end with a nested <dl>.
  my @dt_children = @{$dt_kids};
  if (!((scalar(@dt_children) > 0)
        && (ref $dt_children[$#dt_children])
        && ($dt_children[$#dt_children]->tag() eq "dl")))
    { return 1; }
  my $inner_dl = $dt_children[$#dt_children];
  my $inner_kids = $inner_dl->content();
  if (!defined $inner_kids)
    { return 1; }

  # The nested <dl> must start with a <dd>.
  my @promoted = @{$inner_kids};
  if (!((scalar(@promoted) > 0)
        && (ref ($promoted[0]))
        && ($promoted[0]->tag() eq "dd")))
    { return 1; }

  # print STDERR "CHANGING\n"; $he->dump();
  html_replace_by_ignore($inner_dl);
  # Insert into the live content array of the outer <dl>, after the <dt>.
  splice(@{$dl_kids}, 1, 0, @promoted);
  # print STDERR "CHANGED TO:\n"; $he->dump();
  return 0; # don't traverse children
}


# If we find a paragraph that looks like
#   <p>
#     <hr>
#     <ul>
# then accumulate its links into a contents_list and delete the paragraph.
# Traversal callback: returns 0 after handling such a paragraph, else 1.
sub process_if_child_links ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  if ($he->tag() eq "p")
    { my $ref_content = $he->content();
      if (defined $ref_content)
        { my @content = @{$ref_content};
          if ((scalar(@content) == 2)
              && (ref $content[0]) && $content[0]->tag() eq "hr"
              && (ref $content[1]) && $content[1]->tag() eq "ul")
            { process_child_links($he);
              $he->delete();
              return 0; } } }
  return 1;
}

# If we find
#   <h4>
#     "Footnotes"
#   <dl>
#     <dt>
#       <a name=...>
#         "...borrow"
#       <a name=... href=...>
#         "1.2"
#     <dd>
#       "The metaphor of ``borrowing'' a reference is not completely correct:  the owner still has a copy of the reference. "
#     ...
# then record the footnote information and delete the section and list.
# (Anchor attributes above are schematic; the originals were lost.)

# Set once the "Footnotes" heading has been seen; the next element visited
# must then be the <dl> holding the footnotes.
my $process_if_footnotes_expect_dl_next = 0;

# Traversal callback.  Replaces the "Footnotes" <h4> and the following <dl>
# by "ignore" elements and stores each <dd> in %footnotes under the anchor
# name found in the matching <dt>.  Returns 0 for elements it consumed and
# 1 otherwise; dies if the expected structure is not found.
sub process_if_footnotes ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  if (($he->tag() eq "h4")
      && has_single_content_string($he)
      && ($ {$he->content}[0] eq "Footnotes"))
    { html_replace_by_ignore($he);
      $process_if_footnotes_expect_dl_next = 1;
      return 0; }

  if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl"))
    { my $ref_content = $he->content();
      if (defined $ref_content)
        { $process_if_footnotes_expect_dl_next = 0;
          my @content = @{$ref_content};
          # Children are consumed as (<dt>, <dd>) pairs.
          for (my $i=0; $i<$#content; $i+=2)
            { my $he_dt = $content[$i];
              my $he_dd = $content[$i+1];
              # Text nodes are plain strings; check ref before calling ->tag.
              if (!(ref $he_dt) || !(ref $he_dd)
                  || ($he_dt->tag ne "dt") || ($he_dd->tag ne "dd"))
                { $he->dump;
                  die "expected <dt> and <dd> at positions $i and ", $i+1; }
              my @dt_content = @{$he_dt->content() || []};
              if ((scalar(@dt_content) != 2)
                  || !(ref $dt_content[0]) || ($dt_content[0]->tag ne "a")
                  || !(ref $dt_content[1]) || ($dt_content[1]->tag ne "a"))
                { $he_dt->dump;
                  die "Expected 2 anchors as content of <dt>"; }
              # Only the names are needed; href and content are unused.
              my ($dt1_name) = anchor_info($dt_content[0]);
              # The second call must inspect the SECOND anchor; reading
              # index 0 twice made the comparison below always succeed.
              my ($dt2_name) = anchor_info($dt_content[1]);
              my $name1 = (defined $dt1_name) ? $dt1_name : "";
              my $name2 = (defined $dt2_name) ? $dt2_name : "";
              if ($name1 ne $name2)
                { $he_dt->dump;
                  die "Expected identical names for anchors"; }
              html_replace_by_ignore($he_dd);
              $he_dd->tag("div"); # has no effect
              $footnotes{$dt1_name} = $he_dd; }
          html_replace_by_ignore($he);
          return 0; } }

  if ($process_if_footnotes_expect_dl_next)
    { $he->dump;
      die "Expected <dl> for footnotes next"; }

  return 1;
}


## Merge two adjacent paragraphs containing
##   <dl> items, such as:
#     <p>
#       <dl>
#         <dt>
#           ...
#         <dd>
#           ...
#     <p>
#       <dl>
#         <dt>
#           ...
#         <dd>
#           ...
#
# Traversal callback.  Within the children of $he, every run of adjacent
# <p> elements that each contain exactly one <dl> is collapsed: the later
# paragraphs are removed and their <dl> children are appended to the first
# paragraph's <dl>.
sub merge_dl ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  my $ref_content = $he->content;
  if (!defined $ref_content)
    { return; }
  my $i = 0;
  while ($i < scalar(@{$ref_content})-1)
    { my $p1 = $ {$ref_content}[$i];
      if ((ref $p1) && ($p1->tag eq "p")
          && has_single_content_with_tag($p1, "dl"))
        { my $dl1 = $ {$p1->content}[0];
          # In this loop, rhs, not lhs, of < comparison changes,
          # because we are removing elements from the content of $he.
          while ($i < scalar(@{$ref_content})-1)
            { my $p2 = $ {$ref_content}[$i+1];
              if (!((ref $p2) && ($p2->tag eq "p")
                    && has_single_content_with_tag($p2, "dl")))
                { last; }
              # Merge these two elements.
              splice(@{$ref_content}, $i+1, 1); # remove $p2
              my $dl2 = $ {$p2->content}[0];
              # An empty <dl> may have no content reference at all.
              $dl1->push_content(@{$dl2->content || []}); # put $dl2's content in $dl1
            }
          # extra increment because next element isn't a candidate for $p1
          $i++; }
      $i++; }
  return 1;
}



###########################################################################
### Testing
###

# Developer test driver.  $action selects what to do with $file:
#   "view" or ""  parse, clean up and dump the tree
#   "raw"         parse and dump the tree without clean-up
#   "section"     run process_section_file on it
#   "index"       run process_index_file and print_index_info
# Any other action dies.
sub test ( $$ )
{ my ($action, $file) = check_args(2, @_);

  # General testing
  if (($action eq "view") || ($action eq ""))
    { # # $file = "/homes/gws/mernst/www/links.html";
      # # $file = "/homes/gws/mernst/www/index.html";
      # # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
      # # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
      # # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
      my $tree = file_to_tree($file);

      ## Testing
      # print STDERR $tree->as_HTML;
      $tree->dump();

      # print STDERR $tree->tag(), "\n";
      # print STDERR @{$tree->content()}, "\n";
      #
      # for (@{ $tree->extract_links(qw(a img)) }) {
      #   my ($link, $linkelem) = @$_;
      #   print STDERR "$link ", $linkelem->as_HTML;
      #   }
      #
      # print STDERR @{$tree->extract_links()}, "\n";

      # my @top_level_elts = @{$tree->content()};

      # if scalar(@{$tree->content()})
      return;
    }

  elsif ($action eq "raw")
    { my $tree = HTML::TreeBuilder->new;
      $tree->ignore_unknown(1);
      # $tree->warn(1);
      defined($tree->parse_file($file))
        or die "Cannot parse $file: $!";

      $tree->dump();

      # cleanup_parse_tree($tree);
      # $tree->dump();
      return;
    }

  # Test dealing with a section.
  elsif ($action eq "section")
    { # my $file;
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
      process_section_file($file, 0, "Title");
    }

  # Test dealing with many sections (deliberately disabled).
  elsif (0)
    { my $api_dir = "/homes/fish/mernst/tmp/python-doc/html/api";
      # "contents" and "genindex" are intentionally left out of this list.
      my @files = map { "$api_dir/$_.html" }
        qw(about abstract api cObjects complexObjects concrete
           countingRefs debugging dictObjects embedding exceptionHandling
           exceptions fileObjects floatObjects front fundamental
           importing includes index initialization intObjects intro
           listObjects longObjects mapObjects mapping newTypes node24
           noneObject number numericObjects object objects os
           otherObjects processControl refcountDetails refcounts
           sequence sequenceObjects standardExceptions stringObjects
           threads tupleObjects typeObjects types utilities veryhigh);
      for my $section_file (@files)
        { print STDERR "\n", "=" x 75, "\n", "$section_file:\n";
          process_section_file($section_file, 0, "Title");
        }
    }

  # Test dealing with index.
  elsif ($action eq "index")
    { # my $file;
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";

      process_index_file($file, "\@cindex");
      print_index_info();
    }

  else
    { die "Unrecognized action `$action'"; }
}


###########################################################################
### Main loop
###

# Convert the manual whose main/contents page is $file.  The output name is
# derived from $file (or from the current directory for a bare index.html)
# and written as NAME.texi in the current directory.  Sets $html_directory,
# then processes the top page and every entry queued in @contents_list, and
# finally emits one index node per entry of @index_titles.
# Dies if the output file cannot be opened or closed.
sub process_contents_file ( $ )
{ my ($file) = check_args(1, @_);

  # could also use File::Basename
  my $info_file = $file;
  $info_file =~ s/(\/?index)?\.html$//;
  if ($info_file eq "")
    { chomp($info_file = `pwd`); }
  $info_file =~ s/^.*\///;	# not the most efficient way to remove dirs

  $html_directory = $file;
  $html_directory =~ s/(\/|^)[^\/]+$/$1/;

  my $texi_file = "$info_file.texi";
  # TEXI stays a bareword handle because other subs in this file print to it.
  # Three-argument open keeps odd characters in the name from being read as
  # a mode, and the check stops us from silently writing nowhere.
  open(TEXI, '>', $texi_file)
    or die "Cannot open $texi_file for writing: $!";

  print TEXI "\\input texinfo   \@c -*-texinfo-*-\n";
  print TEXI "\@c %**start of header\n";
  print TEXI "\@setfilename $info_file\n";

  # 2. Summary Description and Copyright
  #      The "Summary Description and Copyright" segment describes the
  #      document and contains the copyright notice and copying permissions
  #      for the Info file.  The segment must be enclosed between `@ifinfo'
  #      and `@end ifinfo' commands so that the formatters place it only in
  #      the Info file.
  #
  # The summary description and copyright segment does not appear in the
  # printed document.
  #
  #      @ifinfo
  #      This is a short example of a complete Texinfo file.
  #
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end ifinfo


  # 3. Title and Copyright
  #      The "Title and Copyright" segment contains the title and copyright
  #      pages and copying permissions for the printed manual.  The segment
  #      must be enclosed between `@titlepage' and `@end titlepage'
  #      commands.  The title and copyright page appear only in the printed
  #      manual.
  #
  # The titlepage segment does not appear in the Info file.
  #
  #      @titlepage
  #      @sp 10
  #      @comment The title is printed in a large font.
  #      @center @titlefont{Sample Title}
  #
  #      @c The following two commands start the copyright page.
  #      @page
  #      @vskip 0pt plus 1filll
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end titlepage


  # 4. `Top' Node and Master Menu
  #      The "Master Menu" contains a complete menu of all the nodes in the
  #      whole Info file.  It appears only in the Info file, in the `Top'
  #      node.
  #
  # The `Top' node contains the master menu for the Info file.  Since a
  # printed manual uses a table of contents rather than a menu, the master
  # menu appears only in the Info file.
  #
  #      @node    Top,       First Chapter, ,         (dir)
  #      @comment node-name, next,          previous, up
  #
  #      @menu
  #      * First Chapter::    The first chapter is the
  #                           only chapter in this sample.
  #      * Concept Index::    This index has two entries.
  #      @end menu



  $current_ref_tdf = [ "Top", 0, $ARGV[0] ];
  process_section_file($file, 0, "Top");
  # Each queued entry is [ title, depth, file ].
  while (scalar(@contents_list))
  { $current_ref_tdf = shift @contents_list;
    process_section_file($ {$current_ref_tdf}[2], $ {$current_ref_tdf}[1],
			 $ {$current_ref_tdf}[0]);
  }

  print TEXI "\n";
  for my $indextitle (@index_titles)
    { print TEXI "\@node $indextitle\n";
      print TEXI "\@unnumbered $indextitle\n";
      print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n";
      print TEXI "\n"; }

  print TEXI "\@contents\n";
  print TEXI "\@bye\n";
  # Buffered write errors only surface at close time.
  close(TEXI)
    or die "Cannot close $texi_file: $!";
}

# This needs to be last so global variable initializations are reached.

if (scalar(@ARGV) == 0)
{ die "No arguments supplied to html2texi.pl"; }

# "-test [ACTION] FILE" runs the developer test driver instead.
if ($ARGV[0] eq "-test")
{ my @test_args = @ARGV[1..$#ARGV];
  if (scalar(@test_args) == 0)
    { test("", "index.html"); }
  elsif (scalar(@test_args) == 1)
    { test("", $test_args[0]); }
  elsif (scalar(@test_args) == 2)
    { test($test_args[0], $test_args[1]); }
  else
    { die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); }
  exit();
}

if (scalar(@ARGV) != 1)
{ die "Pass one argument, the main/contents page"; }

process_contents_file($ARGV[0]);

# end of html2texi.pl