1999-01-08 11:25:29 -04:00
#! /usr/bin/env perl -w
# html2texi.pl -- Convert HTML documentation to Texinfo format
# Michael Ernst <mernst@cs.washington.edu>
# Time-stamp: <1998-09-10 12:52:38 mernst>
# This program converts HTML documentation trees into Texinfo format.
# Given the name of a main (or contents) HTML file, it processes that file,
# and other files (transitively) referenced by it, into a Texinfo file
# (whose name is chosen from the file or directory name of the argument).
# For instance:
# html2texi.pl api/index.pl
# produces file "api.texi".
# Texinfo format can be easily converted to Info format (for browsing in
# Emacs or the standalone Info browser), to a printed manual, or to HTML.
# Thus, html2texi.pl permits conversion of HTML files to Info format, and
# secondarily enables producing printed versions of Web page hierarchies.
# Unlike HTML, Info format is searchable. Since Info is integrated into
# Emacs, one can read documentation without starting a separate Web
# browser. Additionally, Info browsers (including Emacs) contain
# convenient features missing from Web browsers, such as easy index lookup
# and mouse-free browsing.
# Limitations:
# html2texi.pl is currently tuned to latex2html output, but should be
# extensible to arbitrary HTML documents. It will be most useful for HTML
# with a hierarchical structure and an index. The HTML tree to be
# traversed must be on local disk, rather than being accessed via HTTP.
# This script requires the use of "checkargs.pm". To eliminate that
# dependence, replace calls to check_args* by @_ (which is always the last
# argument to those functions).
# Also see the "to do" section, below.
# Comments, suggestions, bug fixes, and enhancements are welcome.
###
### Typical usage for the Python documentation:
###
# (Actually, most of this is in a Makefile instead.)
# The resulting Info format Python documentation is currently available at
# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
# Fix up HTML problems, eg <DL COMPACT><DD>
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html
# Edit the generated .texi files:
# * change @setfilename to prefix "python-"
# * fix up any sectioning, such as for Abstract
# * make Texinfo menus
# * perhaps remove the @detailmenu ... @end detailmenu
# In Emacs:
# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
# makeinfo api.texi
# makeinfo ext.texi
# makeinfo lib.texi
# makeinfo mac.texi
# makeinfo ref.texi
# makeinfo tut.texi
###
### Structure of the code
###
# To be written...
###
### Design decisions
###
# Source and destination languages
# --------------------------------
#
# The goal is Info files; I create Texinfo, so I don't have to worry about
# the finer details of Info file creation. (I'm not even sure of its exact
# format.)
#
# Why not start from LaTeX rather than HTML?
# I could hack latex2html itself to produce Texinfo instead, or fix up
# partparse.py (which already translates LaTeX to Teinfo).
# Pros:
# * has high-level information such as index entries, original formatting
# Cons:
# * those programs are complicated to read and understand
# * those programs try to handle arbitrary LaTeX input, track catcodes,
# and more: I don't want to go to that effort. HTML isn't as powerful
# as LaTeX, so there are fewer subtleties.
# * the result wouldn't work for arbitrary HTML documents; it would be
# nice to eventually extend this program to HTML produced from Docbook,
# Frame, and more.
# Parsing
# -------
#
# I don't want to view the text as a linear stream; I'd rather parse the
# whole thing and then do pattern matching over the parsed representation (to
# find idioms such as indices, lists of child nodes, etc.).
# * Perl provides HTML::TreeBuilder, which does just what I want.
# * libwww-perl: http://www.linpro.no/lwp/
# * TreeBuilder: HTML-Tree-0.51.tar.gz
# * Python Parsers, Formatters, and Writers don't really provide the right
# interface (and the version in Grail doesn't correspond to another
# distributed version, so I'm confused about which to be using). I could
# write something in Python that creates a parse tree, but why bother?
# Other implementation language issues:
# * Python lacks variable declarations, reasonable scoping, and static
# checking tools. I've written some of the latter for myself that make
# my Perl programming a lot safer than my Python programming will be until
# I have a similar suite for that language.
###########################################################################
### To do
###
# Section names:
# Fix the problem with multiple sections in a single file (eg, Abstract in
# Front Matter section).
# Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
# Index:
# Perhaps double-check that every tag mentioned in the index is found
# in the text.
# Python: email to python-docs@python.org, to get their feedback.
# Compare to existing lib/ Info manual
# Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
# Postpass to remove extra quotation marks around typography already in
# a different font (to avoid double delimiters as in "`code'"); or
# perhaps consider using only font-based markup so that we don't get
# the extra *bold* and `code' markup in Info.
## Perhaps don't rely on automatic means for adding up, next, prev; I have
## all that info available to me already, so it's not so much trouble to
## add it. (Right?) But it is *so* easy to use Emacs instead...
###########################################################################
### Strictures
###
# man HTML::TreeBuilder
# man HTML::Parser
# man HTML::Element
# require HTML::ParserWComment;
require HTML::Parser ;
require HTML::TreeBuilder ;
require HTML::Element ;
use File::Basename ;
use Cwd ;
use strict ;
# use Carp;
use checkargs ;
###########################################################################
### Variables
###
my @ section_stack = ( ) ; # elements are chapter/section/subsec nodetitles (I think)
my $ current_ref_tdf ; # for the file currently being processed;
# used in error messages
my $ html_directory ;
my % footnotes ;
# First element should not be used.
my @ sectionmarker = ( "manual" , "chapter" , "section" , "subsection" , "subsubsection" ) ;
my % inline_markup = ( "b" = > "strong" ,
"code" = > "code" ,
"i" = > "emph" ,
"kbd" = > "kbd" ,
"samp" = > "samp" ,
"strong" = > "strong" ,
"tt" = > "code" ,
"var" = > "var" ) ;
my @ deferred_index_entries = ( ) ;
my @ index_titles = ( ) ; # list of (filename, type) lists
my % index_info = ( "Index" = > [ "\@blindex" , "bl" ] ,
"Concept Index" = > [ "\@cindex" , "cp" ] ,
"Module Index" = > [ "\@mdindex" , "md" ] ) ;
###########################################################################
### Main/contents page
###
# Process first-level page on its own, or just a contents page? Well, I do
# want the title, author, etc., and the front matter... For now, just add
# that by hand at the end.
# data structure possibilities:
# * tree-like (need some kind of stack when processing (or parent pointers))
# * list of name and depth; remember old and new depths.
# Each element is a reference to a list of (nodetitle, depth, filename).
my @ contents_list = ( ) ;
# The problem with doing fixups on the fly is that some sections may have
# already been processed (and no longer available) by the time we notice
# others with the same name. It's probably better to fully construct the
# contents list (reading in all files of interest) upfront; that will also
# let me do a better job with cross-references, because again, all files
# will already be read in.
my % contents_hash = ( ) ;
my % contents_fixups = ( ) ;
my @ current_contents_list = ( ) ;
# Merge @current_contents_list into @contents_list,
# and set @current_contents_list to be empty.
sub merge_contents_lists ( )
{ check_args ( 0 , @ _ ) ;
# Three possibilities:
# * @contents_list is empty: replace it by @current_contents_list.
# * prefixes of the two lists are identical: do nothing
# * @current_contents_list is all at lower level than $contents_list[0];
# prefix @contents_list by @current_contents_list
if ( scalar ( @ current_contents_list ) == 0 )
{ die "empty current_contents_list" ; }
# if (scalar(@contents_list) == 0)
# { @contents_list = @current_contents_list;
# @current_contents_list = ();
# return; }
# if (($ {$contents_list[0]}[1]) < ($ {$current_contents_list[0]}[1]))
# { unshift @contents_list, @current_contents_list;
# @current_contents_list = ();
# return; }
for ( my $ i = 0 ; $ i < scalar ( @ current_contents_list ) ; $ i + + )
{ my $ ref_c_tdf = $ current_contents_list [ $ i ] ;
if ( $ i >= scalar ( @ contents_list ) )
{ push @ contents_list , $ ref_c_tdf ;
my $ title = $ { $ ref_c_tdf } [ 0 ] ;
if ( defined $ contents_hash { $ title } )
{ $ contents_fixups { $ title } = 1 ; }
else
{ $ contents_hash { $ title } = 1 ; }
next ; }
my $ ref_tdf = $ contents_list [ $ i ] ;
my ( $ title , $ depth , $ file ) = @ { $ ref_tdf } ;
my ( $ c_title , $ c_depth , $ c_file ) = @ { $ ref_c_tdf } ;
if ( ( $ title ne $ c_title )
&& ( $ depth < $ c_depth )
&& ( $ file ne $ c_file ) )
{ splice @ contents_list , $ i , 0 , $ ref_c_tdf ;
if ( defined $ contents_hash { $ c_title } )
{ $ contents_fixups { $ c_title } = 1 ; }
else
{ $ contents_hash { $ c_title } = 1 ; }
next ; }
if ( ( $ title ne $ c_title )
|| ( $ depth != $ c_depth )
|| ( $ file ne $ c_file ) )
{ die ( "while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:" ,
"\n main: <<<$title>>> $depth $file" ,
"\n curr: <<<$c_title>>> $c_depth $c_file" ) ; }
}
@ current_contents_list = ( ) ;
}
# Set @current_contents_list to a list of (title, href, sectionlevel);
# then merge that list into @contents_list.
# Maybe this function should also produce a map
# from title (or href) to sectionlevel (eg "chapter"?).
sub process_child_links ( $ )
{ my ( $ he ) = check_args ( 1 , @ _ ) ;
# $he->dump;
if ( scalar ( @ current_contents_list ) != 0 )
{ die "current_contents_list nonempty: @current_contents_list" ; }
$ he - > traverse ( \ & increment_current_contents_list , 'ignore text' ) ;
# Normalize the depths; for instance, convert 1,3,5 into 0,1,2.
my % depths = ( ) ;
for my $ ref_tdf ( @ current_contents_list )
{ $ depths { $ { $ ref_tdf } [ 1 ] } = 1 ; }
my @ sorted_depths = sort keys % depths ;
my $ current_depth = scalar ( @ section_stack ) - 1 ;
my $ current_depth_2 = $ { $ current_ref_tdf } [ 1 ] ;
if ( $ current_depth != $ current_depth_2 )
{ die "mismatch in current depths: $current_depth $current_depth_2; " , join ( ", " , @ section_stack ) ; }
for ( my $ i = 0 ; $ i < scalar ( @ sorted_depths ) ; $ i + + )
{ $ depths { $ sorted_depths [ $ i ] } = $ i + $ current_depth + 1 ; }
for my $ ref_tdf ( @ current_contents_list )
{ $ { $ ref_tdf } [ 1 ] = $ depths { $ { $ ref_tdf } [ 1 ] } ; }
# Eliminate uninteresting sections. Hard-coded hack for now.
if ( $ { $ current_contents_list [ - 1 ] } [ 0 ] eq "About this document ..." )
{ pop @ current_contents_list ; }
if ( ( scalar ( @ current_contents_list ) > 1 )
&& ( $ { $ current_contents_list [ 1 ] } [ 0 ] eq "Contents" ) )
{ my $ ref_first_tdf = shift @ current_contents_list ;
$ current_contents_list [ 0 ] = $ ref_first_tdf ; }
for ( my $ i = 0 ; $ i < scalar ( @ current_contents_list ) ; $ i + + )
{ my $ ref_tdf = $ current_contents_list [ $ i ] ;
my $ title = $ { $ ref_tdf } [ 0 ] ;
if ( exists $ index_info { $ title } )
{ my $ index_file = $ { $ ref_tdf } [ 2 ] ;
my ( $ indexing_command , $ suffix ) = @ { $ index_info { $ title } } ;
process_index_file ( $ index_file , $ indexing_command ) ;
print TEXI "\n\@defindex $suffix\n" ;
push @ index_titles , $ title ;
splice @ current_contents_list , $ i , 1 ;
$ i - - ; }
elsif ( $ title =~ /\bIndex$/ )
{ print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n" ; } }
merge_contents_lists ( ) ;
# print_contents_list();
# print_index_info();
}
sub increment_current_contents_list ( $ $ $ )
{ my ( $ he , $ startflag , $ depth ) = check_args ( 3 , @ _ ) ;
if ( ! $ startflag )
{ return ; }
if ( $ he - > tag eq "li" )
{ my @ li_content = @ { $ he - > content } ;
if ( $ li_content [ 0 ] - > tag ne "a" )
{ die "first element of <LI> should be <A>" ; }
my ( $ name , $ href , @ content ) = anchor_info ( $ li_content [ 0 ] ) ;
# unused $name
my $ title = join ( "" , collect_texts ( $ li_content [ 0 ] ) ) ;
$ title = texi_remove_punctuation ( $ title ) ;
# The problem with these is that they are formatted differently in
# @menu and @node!
$ title =~ s/``/\"/g ;
$ title =~ s/''/\"/g ;
$ title =~ s/ -- / /g ;
push @ current_contents_list , [ $ title , $ depth , $ href ] ; }
return 1 ;
}
# Simple version for section titles
sub html_to_texi ( $ )
{ my ( $ he ) = check_args ( 1 , @ _ ) ;
if ( ! ref $ he )
{ return $ he ; }
my $ tag = $ he - > tag ;
if ( exists $ inline_markup { $ tag } )
{ my $ result = "\@$inline_markup{$tag}\{" ;
for my $ elt ( @ { $ he - > content } )
{ $ result . = html_to_texi ( $ elt ) ; }
$ result . = "\}" ;
return $ result ; }
else
{ $ he - > dump ;
die "html_to_texi confused by <$tag>" ; }
}
sub print_contents_list ()
{ check_args ( 0 , @ _ ) ;
print STDERR "Contents list:\n" ;
for my $ ref_tdf ( @ contents_list )
{ my ( $ title , $ depth , $ file ) = @ { $ ref_tdf } ;
print STDERR "$title $depth $file\n" ; }
}
###########################################################################
### Index
###
my $ l2h_broken_link_name = "l2h-" ;
# map from file to (map from anchor name to (list of index texts))
# (The list is needed when a single LaTeX command like \envvar
# expands to multiple \index commands.)
my % file_index_entries = ( ) ;
my % this_index_entries ; # map from anchor name to (list of index texts)
my % file_index_entries_broken = ( ) ; # map from file to (list of index texts)
my @ this_index_entries_broken ;
my $ index_prefix = "" ;
my @ index_prefixes = ( ) ;
my $ this_indexing_command ;
sub print_index_info ()
{ check_args ( 0 , @ _ ) ;
my ( $ key , $ val ) ;
for my $ file ( sort keys % file_index_entries )
{ my % index_entries = % { $ file_index_entries { $ file } } ;
print STDERR "file: $file\n" ;
for my $ aname ( sort keys % index_entries )
{ my @ entries = @ { $ index_entries { $ aname } } ;
if ( scalar ( @ entries ) == 1 )
{ print STDERR " $aname : $entries[0]\n" ; }
else
{ print STDERR " $aname : " , join ( "\n " . ( " " x length ( $ aname ) ) , @ entries ) , "\n" ; } } }
for my $ file ( sort keys % file_index_entries_broken )
{ my @ entries = @ { $ file_index_entries_broken { $ file } } ;
print STDERR "file: $file\n" ;
for my $ entry ( @ entries )
{ print STDERR " $entry\n" ; }
}
}
sub process_index_file ( $ $ )
{ my ( $ file , $ indexing_command ) = check_args ( 2 , @ _ ) ;
# print "process_index_file $file $indexing_command\n";
my $ he = file_to_tree ( $ html_directory . $ file ) ;
# $he->dump();
$ this_indexing_command = $ indexing_command ;
$ he - > traverse ( \ & process_if_index_dl_compact , 'ignore text' ) ;
undef $ this_indexing_command ;
# print "process_index_file done\n";
}
sub process_if_index_dl_compact ( $ $ $ )
{ my ( $ he , $ startflag ) = ( check_args ( 3 , @ _ ) ) [ 0 , 1 ] ; # ignore depth argument
if ( ! $ startflag )
{ return ; }
if ( ( $ he - > tag ( ) eq "dl" ) && ( defined $ he - > attr ( 'compact' ) ) )
{ process_index_dl_compact ( $ he ) ;
return 0 ; }
else
{ return 1 ; }
}
# The elements of a <DL COMPACT> list from a LaTeX2HTML index:
# * a single space: text to be ignored
# * <DT> elements with an optional <DD> element following each one
# Two types of <DT> elements:
# * Followed by a <DD> element: the <DT> contains a single
# string, and the <DD> contains a whitespace string to be ignored, a
# <DL COMPACT> to be recursively processed (with the <DT> string as a
# prefix), and a whitespace string to be ignored.
# * Not followed by a <DD> element: contains a list of anchors
# and texts (ignore the texts, which are only whitespace and commas).
# Optionally contains a <DL COMPACT> to be recursively processed (with
# the <DT> string as a prefix)
sub process_index_dl_compact ( $ )
{ my ( $ h ) = check_args ( 1 , @ _ ) ;
my @ content = @ { $ h - > content ( ) } ;
for ( my $ i = 0 ; $ i < scalar ( @ content ) ; $ i + + )
{ my $ this_he = $ content [ $ i ] ;
if ( $ this_he - > tag ne "dt" )
{ $ this_he - > dump ;
die "Expected <DT> tag: " . $ this_he - > tag ; }
if ( ( $ i < scalar ( @ content ) - 1 ) && ( $ content [ $ i + 1 ] - > tag eq "dd" ) )
{ process_index_dt_and_dd ( $ this_he , $ content [ $ i + 1 ] ) ;
$ i + + ; }
else
{ process_index_lone_dt ( $ this_he ) ; } } }
# Argument is a <DT> element. If it contains more than one anchor, then
# the texts of all subsequent ones are "[Link]". Example:
# <DT>
# <A HREF="embedding.html#l2h-201">
# "$PATH"
# ", "
# <A HREF="embedding.html#l2h-205">
# "[Link]"
# Optionally contains a <DL COMPACT> as well. Example:
# <DT>
# <A HREF="types.html#l2h-616">
# "attribute"
# <DL COMPACT>
# <DT>
# <A HREF="assignment.html#l2h-3074">
# "assignment"
# ", "
# <A HREF="assignment.html#l2h-3099">
# "[Link]"
# <DT>
# <A HREF="types.html#l2h-">
# "assignment, class"
sub process_index_lone_dt ( $ )
{ my ( $ dt ) = check_args ( 1 , @ _ ) ;
my @ dtcontent = @ { $ dt - > content ( ) } ;
my $ acontent ;
my $ acontent_suffix ;
for my $ a ( @ dtcontent )
{ if ( $ a eq ", " )
{ next ; }
if ( ! ref $ a )
{ $ dt - > dump ;
die "Unexpected <DT> string element: $a" ; }
if ( $ a - > tag eq "dl" )
{ push @ index_prefixes , $ index_prefix ;
if ( ! defined $ acontent_suffix )
{ die "acontent_suffix not yet defined" ; }
$ index_prefix . = $ acontent_suffix . ", " ;
process_index_dl_compact ( $ a ) ;
$ index_prefix = pop ( @ index_prefixes ) ;
return ; }
if ( $ a - > tag ne "a" )
{ $ dt - > dump ;
$ a - > dump ;
die "Expected anchor in lone <DT>" ; }
my ( $ aname , $ ahref , @ acontent ) = anchor_info ( $ a ) ;
# unused $aname
if ( scalar ( @ acontent ) != 1 )
{ die "Expected just one content of <A> in <DT>: @acontent" ; }
if ( ref $ acontent [ 0 ] )
{ $ acontent [ 0 ] - > dump ;
die "Expected string content of <A> in <DT>: $acontent[0]" ; }
if ( ! defined ( $ acontent ) )
{ $ acontent = $ index_prefix . $ acontent [ 0 ] ;
$ acontent_suffix = $ acontent [ 0 ] ; }
elsif ( ( $ acontent [ 0 ] ne "[Link]" ) && ( $ acontent ne ( $ index_prefix . $ acontent [ 0 ] ) ) )
{ die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>" ; }
if ( ! defined $ ahref )
{ $ dt - > dump ;
die "no HREF in nachor in <DT>" ; }
my ( $ ahref_file , $ ahref_name ) = split ( /\#/ , $ ahref ) ;
if ( ! defined $ ahref_name )
{ # Reference to entire file
$ ahref_name = "" ; }
if ( $ ahref_name eq $ l2h_broken_link_name )
{ if ( ! exists $ file_index_entries_broken { $ ahref_file } )
{ $ file_index_entries_broken { $ ahref_file } = [] ; }
push @ { $ file_index_entries_broken { $ ahref_file } } , "$this_indexing_command $acontent" ;
next ; }
if ( ! exists $ file_index_entries { $ ahref_file } )
{ $ file_index_entries { $ ahref_file } = { } ; }
# Don't do this! It appears to make a copy, which is not desired.
# my %index_entries = %{$file_index_entries{$ahref_file}};
if ( ! exists $ { $ file_index_entries { $ ahref_file } } { $ ahref_name } )
{ $ { $ file_index_entries { $ ahref_file } } { $ ahref_name } = [] ; }
# { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name};
# if ($acontent eq $oldcontent)
# { die "Multiple identical index entries?"; }
# die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; }
push @ { $ { $ file_index_entries { $ ahref_file } } { $ ahref_name } } , "$this_indexing_command $acontent" ;
# print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
}
}
sub process_index_dt_and_dd ( $ $ )
{ my ( $ dt , $ dd ) = check_args ( 2 , @ _ ) ;
my $ dtcontent ;
{ my @ dtcontent = @ { $ dt - > content ( ) } ;
if ( ( scalar ( @ dtcontent ) != 1 ) || ( ref $ dtcontent [ 0 ] ) )
{ $ dd - > dump ;
$ dt - > dump ;
die "Expected single string (actual size = " . scalar ( @ dtcontent ) . ") in content of <DT>: @dtcontent" ; }
$ dtcontent = $ dtcontent [ 0 ] ;
$ dtcontent =~ s/ +$// ; }
my $ ddcontent ;
{ my @ ddcontent = @ { $ dd - > content ( ) } ;
if ( scalar ( @ ddcontent ) != 1 )
{ die "Expected single <DD> content, got " , scalar ( @ ddcontent ) , " elements:\n" , join ( "\n" , @ ddcontent ) , "\n " ; }
$ ddcontent = $ ddcontent [ 0 ] ; }
if ( $ ddcontent - > tag ne "dl" )
{ die "Expected <DL> as content of <DD>, but saw: $ddcontent" ; }
push @ index_prefixes , $ index_prefix ;
$ index_prefix . = $ dtcontent . ", " ;
process_index_dl_compact ( $ ddcontent ) ;
$ index_prefix = pop ( @ index_prefixes ) ;
}
###########################################################################
### Ordinary sections
###
sub process_section_file ( $ $ $ )
{ my ( $ file , $ depth , $ nodetitle ) = check_args ( 3 , @ _ ) ;
my $ he = file_to_tree ( ( $ file =~ /^\// ) ? $ file : $ html_directory . $ file ) ;
# print STDERR "process_section_file: $file $depth $nodetitle\n";
# Equivalently:
# while ($depth >= scalar(@section_stack)) { pop(@section_stack); }
@ section_stack = @ section_stack [ 0 .. $ depth - 1 ] ;
# Not a great nodename fixup scheme; need a more global view
if ( ( defined $ contents_fixups { $ nodetitle } )
&& ( scalar ( @ section_stack ) > 0 ) )
{ my $ up_title = $ section_stack [ $# section_stack ] ;
# hack for Python Standard Library
$ up_title =~ s/^(Built-in|Standard) Module //g ;
my ( $ up_first_word ) = split ( / / , $ up_title ) ;
$ nodetitle = "$up_first_word $nodetitle" ;
}
push @ section_stack , $ nodetitle ;
# print STDERR "new section_stack: ", join(", ", @section_stack), "\n";
$ he - > traverse ( \ & process_if_child_links , 'ignore text' ) ;
% footnotes = ( ) ;
# $he->dump;
$ he - > traverse ( \ & process_if_footnotes , 'ignore text' ) ;
# $he->dump;
if ( exists $ file_index_entries { $ file } )
{ % this_index_entries = % { $ file_index_entries { $ file } } ;
# print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
}
else
{ # print STDERR "Warning: no index entries for file $file\n";
% this_index_entries = ( ) ; }
if ( exists $ file_index_entries_broken { $ file } )
{ @ this_index_entries_broken = @ { $ file_index_entries_broken { $ file } } ; }
else
{ # print STDERR "Warning: no index entries for file $file\n";
@ this_index_entries_broken = ( ) ; }
if ( $ he - > tag ( ) ne "html" )
{ die "Expected <HTML> at top level" ; }
my @ content = @ { $ he - > content ( ) } ;
if ( ( ! ref $ content [ 0 ] ) or ( $ content [ 0 ] - > tag ne "head" ) )
{ $ he - > dump ;
die "<HEAD> not first element of <HTML>" ; }
if ( ( ! ref $ content [ 1 ] ) or ( $ content [ 1 ] - > tag ne "body" ) )
{ $ he - > dump ;
die "<BODY> not second element of <HTML>" ; }
$ content [ 1 ] - > traverse ( \ & output_body ) ;
}
# stack of things we're inside that are preventing indexing from occurring now.
# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
my @ index_deferrers = ( ) ;
sub push_or_pop_index_deferrers ( $ $ )
{ my ( $ tag , $ startflag ) = check_args ( 2 , @ _ ) ;
if ( $ startflag )
{ push @ index_deferrers , $ tag ; }
else
{ my $ old_deferrer = pop @ index_deferrers ;
if ( $ tag ne $ old_deferrer )
{ die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = " , join ( " " , @ index_deferrers ) ; }
do_deferred_index_entries ( ) ; }
}
sub label_add_index_entries ( $ ; $ )
{ my ( $ label , $ he ) = check_args_range ( 1 , 2 , @ _ ) ;
# print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";
# $he is the anchor element
if ( exists $ this_index_entries { $ label } )
{ push @ deferred_index_entries , @ { $ this_index_entries { $ label } } ;
return ; }
if ( $ label eq $ l2h_broken_link_name )
{ # Try to find some text to use in guessing which links should point here
# I should probably only look at the previous element, or if that is
# all punctuation, the one before it; collecting all the previous texts
# is a bit of overkill.
my @ anchor_texts = collect_texts ( $ he ) ;
my @ previous_texts = collect_texts ( $ he - > parent , $ he ) ;
# 4 elements is arbitrary; ought to filter out punctuation and small words
# first, then perhaps keep fewer. Perhaps also filter out formatting so
# that we can see a larger chunk of text? (Probably not.)
# Also perhaps should do further chunking into words, in case the
# index term isn't a chunk of its own (eg, was in <tt>...</tt>.
my @ candidate_texts = ( @ anchor_texts , ( reverse ( @ previous_texts ) ) [ 0 .. min ( 3 , $# previous_texts ) ] ) ;
my $ guessed = 0 ;
for my $ text ( @ candidate_texts )
{ # my $orig_text = $text;
if ( $ text =~ /^[\"\`\'().?! ]*$/ )
{ next ; }
if ( length ( $ text ) <= 2 )
{ next ; }
# hack for Python manual; maybe defer until failure first time around?
$ text =~ s/^sys\.//g ;
for my $ iterm ( @ this_index_entries_broken )
{ # I could test for zero: LaTeX2HTML's failures in the Python
# documentation are only for items of the form "... (built-in...)"
if ( index ( $ iterm , $ text ) != - 1 )
{ push @ deferred_index_entries , $ iterm ;
# print STDERR "Guessing index term `$iterm' for text `$orig_text'\n";
$ guessed = 1 ;
} } }
if ( ! $ guessed )
{ # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
}
}
}
# Need to add calls to this at various places.
# Perhaps add HTML::Element argument and do the check for appropriateness
# here (ie, no action if inside <H1>, etc.).
sub do_deferred_index_entries ()
{ check_args ( 0 , @ _ ) ;
if ( ( scalar ( @ deferred_index_entries ) > 0 )
&& ( scalar ( @ index_deferrers ) == 0 ) )
{ print TEXI "\n" , join ( "\n" , @ deferred_index_entries ) , "\n" ;
@ deferred_index_entries = ( ) ; }
}
my $ table_columns ; # undefined if not in a table
my $ table_first_column ; # boolean
sub output_body ( $ $ $ )
{ my ( $ he , $ startflag ) = ( check_args ( 3 , @ _ ) ) [ 0 , 1 ] ; # ignore depth argument
if ( ! ref $ he )
{ my $ space_index = index ( $ he , " " ) ;
if ( $ space_index != - 1 )
{ # Why does
# print TEXI texi_quote(substr($he, 0, $space_index+1));
# give: Can't locate object method "TEXI" via package "texi_quote"
# (Because the definition texi_quote hasn't been seen yet.)
print TEXI & texi_quote ( substr ( $ he , 0 , $ space_index + 1 ) ) ;
do_deferred_index_entries ( ) ;
print TEXI & texi_quote ( substr ( $ he , $ space_index + 1 ) ) ; }
else
{ print TEXI & texi_quote ( $ he ) ; }
return ; }
my $ tag = $ he - > tag ( ) ;
# Ordinary text markup first
if ( exists $ inline_markup { $ tag } )
{ if ( $ startflag )
{ print TEXI "\@$inline_markup{$tag}\{" ; }
else
{ print TEXI "\}" ; } }
elsif ( $ tag eq "a" )
{ my ( $ name , $ href , @ content ) = anchor_info ( $ he ) ;
if ( ! $ href )
{ # This anchor is only here for indexing/cross referencing purposes.
if ( $ startflag )
{ label_add_index_entries ( $ name , $ he ) ; }
}
elsif ( $ href =~ "^(ftp|http|news):" )
{ if ( $ startflag )
{ # Should avoid second argument if it's identical to the URL.
print TEXI "\@uref\{$href, " ; }
else
{ print TEXI "\}" ; }
}
elsif ( $ href =~ /^\#(foot[0-9]+)$/ )
{ # Footnote
if ( $ startflag )
{ # Could double-check name and content, but I'm not
# currently storing that information.
print TEXI "\@footnote\{" ;
$ footnotes { $ 1 } - > traverse ( \ & output_body ) ;
print TEXI "\}" ;
return 0 ; } }
else
{ if ( $ startflag )
{ $ he - > dump ;
warn "Can't deal with internal HREF anchors yet" ; }
}
}
1999-01-11 18:30:34 -04:00
elsif ( $ tag eq "address" )
{ # this is part of the page footer, ignore
return 0 ;
}
1999-01-08 11:25:29 -04:00
elsif ( $ tag eq "br" )
{ print TEXI "\@\n" ; }
elsif ( $ tag eq "body" )
{ }
elsif ( $ tag eq "center" )
{ if ( has_single_content_string ( $ he )
&& ( $ { $ he - > content } [ 0 ] =~ /^ *$/ ) )
{ return 0 ; }
if ( $ startflag )
{ print TEXI "\n\@center\n" ; }
else
{ print TEXI "\n\@end center\n" ; }
}
elsif ( $ tag eq "div" )
{ my $ align = $ he - > attr ( 'align' ) ;
if ( defined ( $ align ) && ( $ align eq "center" ) )
{ if ( has_single_content_string ( $ he )
&& ( $ { $ he - > content } [ 0 ] =~ /^ *$/ ) )
{ return 0 ; }
if ( $ startflag )
{ print TEXI "\n\@center\n" ; }
else
{ print TEXI "\n\@end center\n" ; } }
}
elsif ( $ tag eq "dl" )
{ # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
if ( has_single_content_with_tag ( $ he , "dd" ) )
{ my $ he_dd = $ { $ he - > content } [ 0 ] ;
if ( has_single_content_with_tag ( $ he_dd , "pre" ) )
{ my $ he_pre = $ { $ he_dd - > content } [ 0 ] ;
print_pre ( $ he_pre ) ;
return 0 ; } }
if ( $ startflag )
{ # Could examine the elements, to be cleverer about formatting.
# (Also to use ftable, vtable...)
print TEXI "\n\@table \@asis\n" ; }
else
{ print TEXI "\n\@end table\n" ; }
}
elsif ( $ tag eq "dt" )
{ push_or_pop_index_deferrers ( $ tag , $ startflag ) ;
if ( $ startflag )
{ print TEXI "\n\@item " ; }
else
{ } }
elsif ( $ tag eq "dd" )
{ if ( $ startflag )
{ print TEXI "\n" ; }
else
{ }
if ( scalar ( @ index_deferrers ) != 0 )
{ $ he - > dump ;
die "index deferrers: " , join ( " " , @ index_deferrers ) ; }
do_deferred_index_entries ( ) ;
}
elsif ( $ tag =~ /^(font|big|small)$/ )
{ # Do nothing for now.
}
elsif ( $ tag =~ /^h[1-6]$/ )
{ # We don't need this because we never recursively enter the heading content.
# push_or_pop_index_deferrers($tag, $startflag);
my $ secname = "" ;
my @ seclabels = ( ) ;
for my $ elt ( @ { $ he - > content } )
{ if ( ! ref $ elt )
{ $ secname . = $ elt ; }
elsif ( $ elt - > tag eq "br" )
{ }
elsif ( $ elt - > tag eq "a" )
{ my ( $ name , $ href , @ acontent ) = anchor_info ( $ elt ) ;
if ( $ href )
{ $ he - > dump ;
$ elt - > dump ;
die "Nonsimple anchor in <$tag>" ; }
if ( ! defined $ name )
{ die "No NAME for anchor in $tag" ; }
push @ seclabels , $ name ;
for my $ subelt ( @ acontent )
{ $ secname . = html_to_texi ( $ subelt ) ; } }
else
{ $ secname . = html_to_texi ( $ elt ) ; } }
if ( $ secname eq "" )
{ die "No section name in <$tag>" ; }
if ( scalar ( @ section_stack ) == 1 )
{ if ( $ section_stack [ - 1 ] ne "Top" )
{ die "Not top? $section_stack[-1]" ; }
print TEXI "\@settitle $secname\n" ;
print TEXI "\@c %**end of header\n" ;
print TEXI "\n" ;
print TEXI "\@node Top\n" ;
print TEXI "\n" ; }
else
{ print TEXI "\n\@node $section_stack[-1]\n" ;
print TEXI "\@$sectionmarker[scalar(@section_stack)-1] " , texi_remove_punctuation ( $ secname ) , "\n" ; }
for my $ seclabel ( @ seclabels )
{ label_add_index_entries ( $ seclabel ) ; }
# This should only happen once per file.
label_add_index_entries ( "" ) ;
if ( scalar ( @ index_deferrers ) != 0 )
{ die "index deferrers: " , join ( " " , @ index_deferrers ) ; }
do_deferred_index_entries ( ) ;
return 0 ;
}
elsif ( $ tag eq "hr" )
{ }
elsif ( $ tag eq "ignore" )
{ # Hack for ignored elements
return 0 ;
}
elsif ( $ tag eq "li" )
{ if ( $ startflag )
{ print TEXI "\n\n\@item\n" ;
do_deferred_index_entries ( ) ; } }
elsif ( $ tag eq "ol" )
{ if ( $ startflag )
{ print TEXI "\n\@enumerate \@bullet\n" ; }
else
{ print TEXI "\n\@end enumerate\n" ; } }
elsif ( $ tag eq "p" )
{ if ( $ startflag )
{ print TEXI "\n\n" ; }
if ( scalar ( @ index_deferrers ) != 0 )
{ die "index deferrers: " , join ( " " , @ index_deferrers ) ; }
do_deferred_index_entries ( ) ; }
elsif ( $ tag eq "pre" )
{ print_pre ( $ he ) ;
return 0 ; }
elsif ( $ tag eq "table" )
{ # Could also indicate common formatting for first column, or
# determine relative widths for columns (or determine a prototype row)
if ( $ startflag )
{ if ( defined $ table_columns )
{ $ he - > dump ;
die "Can't deal with table nested inside $table_columns-column table" ; }
$ table_columns = table_columns ( $ he ) ;
if ( $ table_columns < 2 )
{ $ he - > dump ;
die "Column with $table_columns columns?" ; }
elsif ( $ table_columns == 2 )
{ print TEXI "\n\@table \@asis\n" ; }
else
{ print TEXI "\n\@multitable \@columnfractions" ;
for ( my $ i = 0 ; $ i < $ table_columns ; $ i + + )
{ print TEXI " " , 1.0 / $ table_columns ; }
print TEXI "\n" ; } }
else
{ if ( $ table_columns == 2 )
{ print TEXI "\n\@end table\n" ; }
else
{ print TEXI "\n\@end multitable\n" ; }
undef $ table_columns ; } }
elsif ( ( $ tag eq "td" ) || ( $ tag eq "th" ) )
{ if ( $ startflag )
{ if ( $ table_first_column )
{ print TEXI "\n\@item " ;
$ table_first_column = 0 ; }
elsif ( $ table_columns > 2 )
{ print TEXI "\n\@tab " ; } }
else
{ print TEXI "\n" ; } }
elsif ( $ tag eq "tr" )
{ if ( $ startflag )
{ $ table_first_column = 1 ; } }
elsif ( $ tag eq "ul" )
{ if ( $ startflag )
{ print TEXI "\n\@itemize \@bullet\n" ; }
else
{ print TEXI "\n\@end itemize\n" ; } }
else
{ print STDERR "\nBailing out\n" ;
$ he - > dump ;
return 0 ; }
return 1 ;
}
sub print_pre ( $ )
{ my ( $ he_pre ) = check_args ( 1 , @ _ ) ;
if ( ! has_single_content_string ( $ he_pre ) )
{ die "Multiple or non-string content for <PRE>: " , @ { $ he_pre - > content } ; }
my $ pre_content = $ { $ he_pre - > content } [ 0 ] ;
print TEXI "\n\@example" ;
print TEXI & texi_quote ( $ pre_content ) ;
print TEXI "\@end example\n" ;
}
sub table_columns ( $ )
{ my ( $ table ) = check_args ( 1 , @ _ ) ;
my $ result = 0 ;
for my $ row ( @ { $ table - > content } )
{ if ( $ row - > tag ne "tr" )
{ $ table - > dump ;
$ row - > dump ;
die "Expected <TR> as table row." ; }
$ result = max ( $ result , scalar ( @ { $ row - > content } ) ) ; }
return $ result ;
}
###########################################################################
### Utilities
###
sub min ( $ $ )
{ my ( $ x , $ y ) = check_args ( 2 , @ _ ) ;
return ( $ x < $ y ) ? $ x : $ y ;
}
sub max ( $ $ )
{ my ( $ x , $ y ) = check_args ( 2 , @ _ ) ;
return ( $ x > $ y ) ? $ x : $ y ;
}
sub file_to_tree ( $ )
{ my ( $ file ) = check_args ( 1 , @ _ ) ;
my $ tree = new HTML:: TreeBuilder ;
$ tree - > ignore_unknown ( 1 ) ;
# $tree->warn(1);
$ tree - > parse_file ( $ file ) ;
cleanup_parse_tree ( $ tree ) ;
return $ tree
}
sub has_single_content ( $ )
{ my ( $ he ) = check_args ( 1 , @ _ ) ;
if ( ! ref $ he )
{ # return 0;
die "Non-reference argument: $he" ; }
my $ ref_content = $ he - > content ;
if ( ! defined $ ref_content )
{ return 0 ; }
my @ content = @ { $ ref_content } ;
if ( scalar ( @ content ) != 1 )
{ return 0 ; }
return 1 ;
}
# Return true if the content of the element contains only one element itself,
# and that inner element has the specified tag.
sub has_single_content_with_tag ( $ $ )
{ my ( $ he , $ tag ) = check_args ( 2 , @ _ ) ;
if ( ! has_single_content ( $ he ) )
{ return 0 ; }
my $ content = $ { $ he - > content } [ 0 ] ;
if ( ! ref $ content )
{ return 0 ; }
my $ content_tag = $ content - > tag ;
if ( ! defined $ content_tag )
{ return 0 ; }
return $ content_tag eq $ tag ;
}
sub has_single_content_string ( $ )
{ my ( $ he ) = check_args ( 1 , @ _ ) ;
if ( ! has_single_content ( $ he ) )
{ return 0 ; }
my $ content = $ { $ he - > content } [ 0 ] ;
if ( ref $ content )
{ return 0 ; }
return 1 ;
}
# Return name, href, content. First two may be undefined; third is an array.
# I don't see how to determine if there are more attributes.
sub anchor_info ( $ )
{ my ( $ he ) = check_args ( 1 , @ _ ) ;
if ( $ he - > tag ne "a" )
{ $ he - > dump ;
die "passed non-anchor to anchor_info" ; }
my $ name = $ he - > attr ( 'name' ) ;
my $ href = $ he - > attr ( 'href' ) ;
my @ content = ( ) ;
{ my $ ref_content = $ he - > content ;
if ( defined $ ref_content )
{ @ content = @ { $ ref_content } ; } }
return ( $ name , $ href , @ content ) ;
}
sub texi_quote ( $ )
{ my ( $ text ) = check_args ( 1 , @ _ ) ;
$ text =~ s/([\@\{\}])/\@$1/g ;
$ text =~ s/ -- / --- /g ;
return $ text ;
}
# Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles.
sub texi_remove_punctuation ( $ )
{ my ( $ text ) = check_args ( 1 , @ _ ) ;
$ text =~ s/^ +//g ;
$ text =~ s/[ :]+$//g ;
$ text =~ s/^[1-9][0-9.]* +//g ;
$ text =~ s/,//g ;
# Both embedded colons and " -- " confuse makeinfo. (Perhaps " -- "
# gets converted into " - ", just as "---" would be converted into " -- ",
# so the names end up differing.)
# $text =~ s/:/ -- /g;
$ text =~ s/://g ;
return $ text ;
}
## Do not use this inside `traverse': it throws off the traversal. Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Returns 1 if success, 0 if failure.
sub html_remove ( $ ; $ )
{ my ( $ he , $ parent ) = check_args_range ( 1 , 2 , @ _ ) ;
if ( ! defined $ parent )
{ $ parent = $ he - > parent ; }
my $ ref_pcontent = $ parent - > content ;
my @ pcontent = @ { $ ref_pcontent } ;
for ( my $ i = 0 ; $ i < scalar ( @ pcontent ) ; $ i + + )
{ if ( $ pcontent [ $ i ] eq $ he )
{ splice @ { $ ref_pcontent } , $ i , 1 ;
$ he - > parent ( undef ) ;
return 1 ; } }
die "Didn't find $he in $parent" ;
}
sub html_replace ( $ $ ; $ )
{ my ( $ orig , $ new , $ parent ) = check_args_range ( 2 , 3 , @ _ ) ;
if ( ! defined $ parent )
{ $ parent = $ orig - > parent ; }
my $ ref_pcontent = $ parent - > content ;
my @ pcontent = @ { $ ref_pcontent } ;
for ( my $ i = 0 ; $ i < scalar ( @ pcontent ) ; $ i + + )
{ if ( $ pcontent [ $ i ] eq $ orig )
{ $ { $ ref_pcontent } [ $ i ] = $ new ;
$ new - > parent ( $ parent ) ;
$ orig - > parent ( undef ) ;
return 1 ; } }
die "Didn't find $orig in $parent" ;
}
sub html_replace_by_meta ( $ ; $ )
{ my ( $ orig , $ parent ) = check_args_range ( 1 , 2 , @ _ ) ;
my $ meta = new HTML:: Element "meta" ;
if ( ! defined $ parent )
{ $ parent = $ orig - > parent ; }
return html_replace ( $ orig , $ meta , $ parent ) ;
}
sub html_replace_by_ignore ( $ ; $ )
{ my ( $ orig , $ parent ) = check_args_range ( 1 , 2 , @ _ ) ;
my $ ignore = new HTML:: Element "ignore" ;
if ( ! defined $ parent )
{ $ parent = $ orig - > parent ; }
return html_replace ( $ orig , $ ignore , $ parent ) ;
}
###
### Collect text elements
###
my @ collected_texts ;
my $ collect_texts_stoppoint ;
my $ done_collecting ;
sub collect_texts ( $ ; $ )
{ my ( $ root , $ stop ) = check_args_range ( 1 , 2 , @ _ ) ;
# print STDERR "collect_texts: $root $stop\n";
$ collect_texts_stoppoint = $ stop ;
$ done_collecting = 0 ;
@ collected_texts = ( ) ;
$ root - > traverse ( \ & collect_if_text ) ; # process texts
# print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
return @ collected_texts ;
}
sub collect_if_text ( $ $ $ )
{ my $ he = ( check_args ( 3 , @ _ ) ) [ 0 ] ; # ignore depth and startflag arguments
if ( $ done_collecting )
{ return 0 ; }
if ( ! defined $ he )
{ return 0 ; }
if ( ! ref $ he )
{ push @ collected_texts , $ he ;
return 0 ; }
if ( ( defined $ collect_texts_stoppoint ) && ( $ he eq $ collect_texts_stoppoint ) )
{ $ done_collecting = 1 ;
return 0 ; }
return 1 ;
}
###########################################################################
### Clean up parse tree
###
sub cleanup_parse_tree ( $ )
{ my ( $ he ) = check_args ( 1 , @ _ ) ;
$ he - > traverse ( \ & delete_if_navigation , 'ignore text' ) ;
$ he - > traverse ( \ & delete_extra_spaces , 'ignore text' ) ;
$ he - > traverse ( \ & merge_dl , 'ignore text' ) ;
return $ he ;
}
## Simpler version that deletes contents but not the element itself.
# sub delete_if_navigation ( $$$ )
# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
# { $he->delete();
# return 0; }
# else
# { return 1; }
# }
sub delete_if_navigation ( $ $ $ )
{ my ( $ he , $ startflag ) = ( check_args ( 3 , @ _ ) ) [ 0 , 1 ] ; # ignore depth argument
if ( ! $ startflag )
{ return ; }
if ( ( $ he - > tag ( ) eq "div" ) && ( defined $ he - > attr ( 'class' ) ) && ( $ he - > attr ( 'class' ) eq 'navigation' ) )
{ my $ ref_pcontent = $ he - > parent ( ) - > content ( ) ;
# Don't try to modify @pcontent, which appears to be a COPY.
# my @pcontent = @{$ref_pcontent};
for ( my $ i = 0 ; $ i < scalar ( @ { $ ref_pcontent } ) ; $ i + + )
{ if ( $ { $ ref_pcontent } [ $ i ] eq $ he )
{ splice ( @ { $ ref_pcontent } , $ i , 1 ) ;
last ; } }
$ he - > delete ( ) ;
return 0 ; }
else
{ return 1 ; }
}
sub delete_extra_spaces ( $ $ $ )
{ my ( $ he , $ startflag ) = ( check_args ( 3 , @ _ ) ) [ 0 , 1 ] ; # ignore depth argument
if ( ! $ startflag )
{ return ; }
my $ tag = $ he - > tag ;
if ( $ tag =~ /^(head|html|table|tr|ul)$/ )
{ delete_child_spaces ( $ he ) ; }
delete_trailing_spaces ( $ he ) ;
return 1 ;
}
sub delete_child_spaces ( $ )
{ my ( $ he ) = check_args ( 1 , @ _ ) ;
my $ ref_content = $ he - > content ( ) ;
for ( my $ i = 0 ; $ i < scalar ( @ { $ ref_content } ) ; $ i + + )
{ if ( $ { $ ref_content } [ $ i ] =~ /^ *$/ )
{ splice ( @ { $ ref_content } , $ i , 1 ) ;
$ i - - ; } }
}
sub delete_trailing_spaces ( $ )
{ my ( $ he ) = check_args ( 1 , @ _ ) ;
my $ ref_content = $ he - > content ( ) ;
if ( ! defined $ ref_content )
{ return ; }
# Could also check for previous element = /^h[1-6]$/.
for ( my $ i = 0 ; $ i < scalar ( @ { $ ref_content } ) - 1 ; $ i + + )
{ if ( $ { $ ref_content } [ $ i ] =~ /^ *$/ )
{ my $ next_elt = $ { $ ref_content } [ $ i + 1 ] ;
if ( ( ref $ next_elt ) && ( $ next_elt - > tag =~ /^(br|dd|dl|dt|hr|p|ul)$/ ) )
{ splice ( @ { $ ref_content } , $ i , 1 ) ;
$ i - - ; } } }
if ( $ he - > tag =~ /^(dd|dt|^h[1-6]|li|p)$/ )
{ my $ last_elt = $ { $ ref_content } [ $# { $ ref_content } ] ;
if ( ( defined $ last_elt ) && ( $ last_elt =~ /^ *$/ ) )
{ pop @ { $ ref_content } ; } }
}
# If we find a paragraph that looks like
# <P>
# <HR>
# <UL>
# then accumulate its links into a contents_list and delete the paragraph.
sub process_if_child_links ( $ $ $ )
{ my ( $ he , $ startflag ) = ( check_args ( 3 , @ _ ) ) [ 0 , 1 ] ; # ignore depth argument
if ( ! $ startflag )
{ return ; }
if ( $ he - > tag ( ) eq "p" )
{ my $ ref_content = $ he - > content ( ) ;
if ( defined $ ref_content )
{ my @ content = @ { $ ref_content } ;
if ( ( scalar ( @ content ) == 2 )
&& ( ref $ content [ 0 ] ) && $ content [ 0 ] - > tag ( ) eq "hr"
&& ( ref $ content [ 1 ] ) && $ content [ 1 ] - > tag ( ) eq "ul" )
{ process_child_links ( $ he ) ;
$ he - > delete ( ) ;
return 0 ; } } }
return 1 ;
}
# If we find
# <H4>
# "Footnotes"
# <DL>
# <DT>
# <A NAME="foot560">
# "...borrow"
# <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
# "1.2"
# <DD>
# "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
# ...
# then record the footnote information and delete the section and list.
my $ process_if_footnotes_expect_dl_next = 0 ;
sub process_if_footnotes ( $ $ $ )
{ my ( $ he , $ startflag ) = ( check_args ( 3 , @ _ ) ) [ 0 , 1 ] ; # ignore depth argument
if ( ! $ startflag )
{ return ; }
if ( ( $ he - > tag ( ) eq "h4" )
&& has_single_content_string ( $ he )
&& ( $ { $ he - > content } [ 0 ] eq "Footnotes" ) )
{ html_replace_by_ignore ( $ he ) ;
$ process_if_footnotes_expect_dl_next = 1 ;
return 0 ; }
if ( $ process_if_footnotes_expect_dl_next && ( $ he - > tag ( ) eq "dl" ) )
{ my $ ref_content = $ he - > content ( ) ;
if ( defined $ ref_content )
{ $ process_if_footnotes_expect_dl_next = 0 ;
my @ content = @ { $ ref_content } ;
for ( my $ i = 0 ; $ i < $# content ; $ i += 2 )
{ my $ he_dt = $ content [ $ i ] ;
my $ he_dd = $ content [ $ i + 1 ] ;
if ( ( $ he_dt - > tag ne "dt" ) || ( $ he_dd - > tag ne "dd" ) )
{ $ he - > dump ;
die "expected <DT> and <DD> at positions $i and " , $ i + 1 ; }
my @ dt_content = @ { $ he_dt - > content ( ) } ;
if ( ( scalar ( @ dt_content ) != 2 )
|| ( $ dt_content [ 0 ] - > tag ne "a" )
|| ( $ dt_content [ 1 ] - > tag ne "a" ) )
{ $ he_dt - > dump ;
die "Expected 2 anchors as content of <DT>" ; }
my ( $ dt1_name , $ dt1_href , $ dt1_content ) = anchor_info ( $ dt_content [ 0 ] ) ;
my ( $ dt2_name , $ dt2_href , $ dt2_content ) = anchor_info ( $ dt_content [ 0 ] ) ;
# unused: $dt1_href, $dt1_content, $dt2_href, $dt2_content
if ( $ dt1_name ne $ dt2_name )
{ $ he_dt - > dump ;
die "Expected identical names for anchors" ; }
html_replace_by_ignore ( $ he_dd ) ;
$ he_dd - > tag ( "div" ) ; # has no effect
$ footnotes { $ dt1_name } = $ he_dd ; }
html_replace_by_ignore ( $ he ) ;
return 0 ; } }
if ( $ process_if_footnotes_expect_dl_next )
{ $ he - > dump ;
die "Expected <DL> for footnotes next" ; }
return 1 ;
}
## Merge two adjacent paragraphs containing <DL> items, such as:
# <P>
# <DL>
# <DT>
# ...
# <DD>
# ...
# <P>
# <DL>
# <DT>
# ...
# <DD>
# ...
sub merge_dl ( $ $ $ )
{ my ( $ he , $ startflag ) = ( check_args ( 3 , @ _ ) ) [ 0 , 1 ] ; # ignore depth argument
if ( ! $ startflag )
{ return ; }
my $ ref_content = $ he - > content ;
if ( ! defined $ ref_content )
{ return ; }
my $ i = 0 ;
while ( $ i < scalar ( @ { $ ref_content } ) - 1 )
{ my $ p1 = $ { $ ref_content } [ $ i ] ;
if ( ( ref $ p1 ) && ( $ p1 - > tag eq "p" )
&& has_single_content_with_tag ( $ p1 , "dl" ) )
{ my $ dl1 = $ { $ p1 - > content } [ 0 ] ;
# In this loop, rhs, not lhs, of < comparison changes,
# because we are removing elements from the content of $he.
while ( $ i < scalar ( @ { $ ref_content } ) - 1 )
{ my $ p2 = $ { $ ref_content } [ $ i + 1 ] ;
if ( ! ( ( ref $ p2 ) && ( $ p2 - > tag eq "p" )
&& has_single_content_with_tag ( $ p2 , "dl" ) ) )
{ last ; }
# Merge these two elements.
splice ( @ { $ ref_content } , $ i + 1 , 1 ) ; # remove $p2
my $ dl2 = $ { $ p2 - > content } [ 0 ] ;
$ dl1 - > push_content ( @ { $ dl2 - > content } ) ; # put $dl2's content in $dl1
}
# extra increment because next element isn't a candidate for $p1
$ i + + ; }
$ i + + ; }
return 1 ;
}
###########################################################################
### Testing
###
sub test ( $ $ )
{ my ( $ action , $ file ) = check_args ( 2 , @ _ ) ;
# General testing
if ( ( $ action eq "view" ) || ( $ action eq "" ) )
{ # # $file = "/homes/gws/mernst/www/links.html";
# # $file = "/homes/gws/mernst/www/index.html";
# # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
# # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
# # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
my $ tree = file_to_tree ( $ file ) ;
## Testing
# print STDERR $tree->as_HTML;
$ tree - > dump ( ) ;
# print STDERR $tree->tag(), "\n";
# print STDERR @{$tree->content()}, "\n";
#
# for (@{ $tree->extract_links(qw(a img)) }) {
# my ($link, $linkelem) = @$_;
# print STDERR "$link ", $linkelem->as_HTML;
# }
#
# print STDERR @{$tree->extract_links()}, "\n";
# my @top_level_elts = @{$tree->content()};
# if scalar(@{$tree->content()})
return ;
}
elsif ( $ action eq "raw" )
{ my $ tree = new HTML:: TreeBuilder ;
$ tree - > ignore_unknown ( 1 ) ;
# $tree->warn(1);
$ tree - > parse_file ( $ file ) ;
$ tree - > dump ( ) ;
# cleanup_parse_tree($tree);
# $tree->dump();
return ;
}
# Test dealing with a section.
elsif ( $ action eq "section" )
{ # my $file;
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
process_section_file ( $ file , 0 , "Title" ) ;
}
# Test dealing with many sections
elsif ( 0 )
{ my @ files = ( "/homes/fish/mernst/tmp/python-doc/html/api/about.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/abstract.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/api.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/cObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/concrete.html" ,
# "/homes/fish/mernst/tmp/python-doc/html/api/contents.html",
"/homes/fish/mernst/tmp/python-doc/html/api/countingRefs.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/debugging.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/dictObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/embedding.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/exceptionHandling.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/exceptions.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/fileObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/floatObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/front.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/fundamental.html" ,
# "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html",
"/homes/fish/mernst/tmp/python-doc/html/api/importing.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/includes.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/index.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/initialization.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/intObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/intro.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/listObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/longObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/mapObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/mapping.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/newTypes.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/node24.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/noneObject.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/number.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/numericObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/object.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/objects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/os.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/otherObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/processControl.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/refcountDetails.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/refcounts.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/sequence.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/sequenceObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/standardExceptions.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/stringObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/threads.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/tupleObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/typeObjects.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/types.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/utilities.html" ,
"/homes/fish/mernst/tmp/python-doc/html/api/veryhigh.html" ) ;
for my $ file ( @ files )
{ print STDERR "\n" , "=" x 75 , "\n" , "$file:\n" ;
process_section_file ( $ file , 0 , "Title" ) ;
}
}
# Test dealing with index.
elsif ( $ action eq "index" )
{ # my $file;
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";
process_index_file ( $ file , "\@cindex" ) ;
print_index_info ( ) ;
}
else
{ die "Unrecognized action `$action'" ; }
}
###########################################################################
### Main loop
###
sub process_contents_file ( $ )
{ my ( $ file ) = check_args ( 1 , @ _ ) ;
# could also use File::Basename
my $ info_file = $ file ;
$ info_file =~ s/(\/?index)?\.html$// ;
if ( $ info_file eq "" )
{ chomp ( $ info_file = `pwd` ) ; }
$ info_file =~ s/^.*\/// ; # not the most efficient way to remove dirs
$ html_directory = $ file ;
$ html_directory =~ s/(\/|^)[^\/]+$/$1/ ;
my $ texi_file = "$info_file.texi" ;
open ( TEXI , ">$texi_file" ) ;
print TEXI "\\input texinfo \@c -*-texinfo-*-\n" ;
print TEXI "\@c %**start of header\n" ;
print TEXI "\@setfilename $info_file\n" ;
# 2. Summary Description and Copyright
# The "Summary Description and Copyright" segment describes the
# document and contains the copyright notice and copying permissions
# for the Info file. The segment must be enclosed between `@ifinfo'
# and `@end ifinfo' commands so that the formatters place it only in
# the Info file.
#
# The summary description and copyright segment does not appear in the
# printed document.
#
# @ifinfo
# This is a short example of a complete Texinfo file.
#
# Copyright @copyright{} 1990 Free Software Foundation, Inc.
# @end ifinfo
# 3. Title and Copyright
# The "Title and Copyright" segment contains the title and copyright
# pages and copying permissions for the printed manual. The segment
# must be enclosed between `@titlepage' and `@end titlepage'
# commands. The title and copyright page appear only in the printed
# manual.
#
# The titlepage segment does not appear in the Info file.
#
# @titlepage
# @sp 10
# @comment The title is printed in a large font.
# @center @titlefont{Sample Title}
#
# @c The following two commands start the copyright page.
# @page
# @vskip 0pt plus 1filll
# Copyright @copyright{} 1990 Free Software Foundation, Inc.
# @end titlepage
# 4. `Top' Node and Master Menu
# The "Master Menu" contains a complete menu of all the nodes in the
# whole Info file. It appears only in the Info file, in the `Top'
# node.
#
# The `Top' node contains the master menu for the Info file. Since a
# printed manual uses a table of contents rather than a menu, the master
# menu appears only in the Info file.
#
# @node Top, First Chapter, , (dir)
# @comment node-name, next, previous, up
#
# @menu
# * First Chapter:: The first chapter is the
# only chapter in this sample.
# * Concept Index:: This index has two entries.
# @end menu
$ current_ref_tdf = [ "Top" , 0 , $ ARGV [ 0 ] ] ;
process_section_file ( $ file , 0 , "Top" ) ;
while ( scalar ( @ contents_list ) )
{ $ current_ref_tdf = shift @ contents_list ;
process_section_file ( $ { $ current_ref_tdf } [ 2 ] , $ { $ current_ref_tdf } [ 1 ] , $ { $ current_ref_tdf } [ 0 ] ) ;
}
print TEXI "\n" ;
for my $ indextitle ( @ index_titles )
{ print TEXI "\@node $indextitle\n" ;
print TEXI "\@unnumbered $indextitle\n" ;
print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n" ;
print TEXI "\n" ; }
print TEXI "\@contents\n" ;
print TEXI "\@bye\n" ;
close ( TEXI ) ;
}
# This needs to be last so global variable initializations are reached.
if ( scalar ( @ ARGV ) == 0 )
{ die "No arguments supplied to html2texi.pl" ; }
if ( $ ARGV [ 0 ] eq "-test" )
{ my @ test_args = @ ARGV [ 1 .. $# ARGV ] ;
if ( scalar ( @ test_args ) == 0 )
{ test ( "" , "index.html" ) ; }
elsif ( scalar ( @ test_args ) == 1 )
{ test ( "" , $ test_args [ 0 ] ) ; }
elsif ( scalar ( @ test_args ) == 2 )
{ test ( $ test_args [ 0 ] , $ test_args [ 1 ] ) ; }
else
{ die "Too many test arguments passed to html2texi: " , join ( " " , @ ARGV ) ; }
exit ( ) ;
}
if ( scalar ( @ ARGV ) != 1 )
{ die "Pass one argument, the main/contents page" ; }
process_contents_file ( $ ARGV [ 0 ] ) ;