From: Eric W. B. <er...@sa...> - 2005-02-08 03:23:06
|
This may be useful to some (unfortunately I wasn't able to upload it to the incoming directory on the www.htdig.org server). It's an external parser for handling Newton books and is designed to work with doc2html. For those who don't know what Newton books are, they're a type of e-book (one of the oldest, in fact). There are a couple of free Newton book creation apps out there (a Google search for "Newton Press" or "Newton BookMaker" should turn up some info and download sites), the format itself is pretty well documented (see http://metcs.bu.edu/~feneric/cs331/Archives/Project2002/ for more information), and there are quite a few free books currently available in the format on the 'net. It utilizes two simple related utilities: nb2txt and nbinfo. These are both available from http://www.saugus.net/~eric/nb2txt/ and should pretty readily build on most UNIX-like systems. We've been using it on Saugus.net for about a year now and I figured it was stable enough to share.

#!/usr/bin/perl -w
use strict;
#
# Version 1.0 13-Feb-2004
# Written by Eric W. Brown, but very heavily based on the
# earlier work by David Adams <d.j...@so...>
#
# Uses the Saugus.net nb2txt & nbinfo utilities to read
# a Newton book file and produce HTML output.
#
# Can be called directly from htdig as an external converter,
# or may be called by doc2html.pl converter script.
#
# Usage: nb2html.pl filename [mime-type] [URL]
#   filename   Newton book file to convert (required)
#   mime-type  if given, must start with application/x-newton-compatible-pkg
#   URL        if given, its last path component (URL-decoded) is used
#              as a fallback title
# The HTML document is written to standard output.
#

####--- Configuration ---####
# Full paths of nb2txt and nbinfo
#### YOU MUST SET THESE ####
my $NB2TXT = "/usr/local/bin/nb2txt";
my $NBINFO = "/usr/local/bin/nbinfo";
#
# De-hyphenation option (only affects end-of-line hyphens):
my $Dehyphenate = 1;
#
# Set title to be used when none is found:
my $Default_title = "Newton Book Document";
#
# make portable to win32 platform or unix:
my $null = "/dev/null";
if ($^O eq "MSWin32") { $null = "nul"; }
####--- End of configuration ---###

# Single-byte characters (0xA1-0xFF) that HTML() rewrites as named HTML
# character entities.  Bytes in that range that are not listed here are
# passed through unchanged.
my %Entity_for = (
    "\xA9" => 'copy',   "\xA2" => 'cent',   "\xA3" => 'pound',
    "\xA5" => 'yen',    "\xBA" => 'deg',    "\xB1" => 'plusmn',
    "\xF7" => 'divide',
    "\xE6" => 'aelig',  "\xC6" => 'AElig',
    "\xE1" => 'aacute', "\xC1" => 'Aacute',
    "\xE0" => 'agrave', "\xC0" => 'Agrave',
    "\xE4" => 'auml',   "\xC4" => 'Auml',
    "\xE2" => 'acirc',  "\xC2" => 'Acirc',
    "\xE3" => 'atilde', "\xC3" => 'Atilde',
    "\xE5" => 'aring',  "\xC5" => 'Aring',
    "\xE9" => 'eacute', "\xC9" => 'Eacute',
    "\xE8" => 'egrave', "\xC8" => 'Egrave',
    "\xEB" => 'euml',   "\xCB" => 'Euml',
    "\xEA" => 'ecirc',  "\xCA" => 'Ecirc',
    "\xED" => 'iacute', "\xCD" => 'Iacute',
    "\xEC" => 'igrave', "\xCC" => 'Igrave',
    "\xEF" => 'iuml',   "\xCF" => 'Iuml',
    "\xEE" => 'icirc',  "\xCE" => 'Icirc',
    "\xF3" => 'oacute', "\xD3" => 'Oacute',
    "\xF2" => 'ograve', "\xD2" => 'Ograve',
    "\xF6" => 'ouml',   "\xD6" => 'Ouml',
    "\xF4" => 'ocirc',  "\xD4" => 'Ocirc',
    "\xF5" => 'otilde', "\xD5" => 'Otilde',
    "\xF8" => 'oslash', "\xD8" => 'Oslash',
    "\xFA" => 'uacute', "\xDA" => 'Uacute',
    "\xF9" => 'ugrave', "\xD9" => 'Ugrave',
    "\xFC" => 'uuml',   "\xDC" => 'Uuml',
    "\xFB" => 'ucirc',  "\xDB" => 'Ucirc',
    "\xF1" => 'ntilde', "\xD1" => 'Ntilde',
    "\xE7" => 'ccedil', "\xC7" => 'Ccedil',
    "\xFF" => 'yuml',   "\xDF" => 'szlig',
    "\xBF" => 'iquest', "\xA1" => 'iexcl',
    "\xAB" => 'laquo',  "\xBB" => 'raquo',
);

if (! -x $NB2TXT) { die "Unable to execute nb2txt" }

# Test for defined/non-empty rather than truth so that a file literally
# named "0" is still accepted.
my $Input = $ARGV[0];
die "Usage: nb2html.pl filename [mime-type] [URL]"
    unless defined $Input and length $Input;

my $MIME_type = $ARGV[1] || '';
if ($MIME_type and ($MIME_type !~ m#^application/x-newton-compatible-pkg#i)) {
    die "MIME/type $MIME_type wrong";
}

# Keep only the last path component of the URL and undo %XX escapes.
my $Name = $ARGV[2] || '';
$Name =~ s#^.*/##;
$Name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;

nb_head();
nb_body();
exit;

#------------------------------------------------------------------------------
sub shell_quote {
    #
    # Returns its argument quoted so that the command shell passes it
    # to the child program as a single word.
    #
    my $arg = shift;
    if ($^O eq "MSWin32") {
        # Double quotes are the quoting character here; a '"' cannot be
        # part of a Windows file name, so any that appear are dropped.
        $arg =~ s/"//g;
        return '"' . $arg . '"';
    }
    # Bourne-style shells: close the quote, emit an escaped quote, reopen.
    $arg =~ s/'/'\\''/g;
    return "'" . $arg . "'";
}

#------------------------------------------------------------------------------
sub meta_tag {
    #
    # Prints one <meta name="..." content="..." /> line.  $content is
    # expected to have been through HTML() already; double quotes are
    # escaped here so they cannot terminate the attribute value.
    #
    my ($name, $content) = @_;
    $content =~ s/"/&quot;/g;
    print '<meta name="' . $name . '" content="' . $content . "\" />\n";
    return;
}

#------------------------------------------------------------------------------
sub nb_head {
    #
    # by Eric W. Brown, based pretty heavily on the work
    # contributed by Greg Holmes and Michael Fuller and
    # modified by David Adams. Uses proper Dublin Core
    # notation as appropriate.
    #
    # Runs nbinfo on $Input, collects the "Label: value" lines it
    # recognises, and prints the <html><head>...</head> section.
    # If nbinfo cannot be run a warning is issued and the head is
    # built from $Name / $Default_title alone.
    #
    my %info = map { $_ => '' } (
        'short title', 'title', 'author', 'copyright', 'isbn',
        'software', 'publication date', 'creation date',
    );

    # A shell pipe open succeeds even when the command is missing, so
    # test for the executable explicitly to make the warning meaningful.
    my $info_fh;
    if (-x $NBINFO
        and open($info_fh, '-|', "$NBINFO " . shell_quote($Input) . " 2>$null"))
    {
        while (defined(my $line = <$info_fh>)) {
            if ($line =~ m/^(Short Title|Title|Author|Copyright|ISBN|Software|Publication Date|Creation Date):\s*(.*)/i) {
                # Every value, dates included, is escaped and trimmed
                # because it ends up inside HTML markup.
                $info{lc $1} = HTML($2);
            }
        }
        close $info_fh;
    }
    else {
        warn "cannot execute nbinfo";
    }

    my $shortTitle = $info{'short title'};
    if (not length $shortTitle) {
        if ($Name) {
            # $Name comes from the URL argument, so escape it too.
            $shortTitle = '[' . HTML($Name) . ']';
        }
        else {
            $shortTitle = $Default_title;
        }
    }
    my $title = length $info{'title'} ? $info{'title'} : $shortTitle;

    print "<html>\n<head>\n";
    print "<title>$shortTitle</title>\n";
    if (length $title) {
        meta_tag('description', 'Newton book version of ' . $title);
        meta_tag('DC.Title', $title);
    }
    meta_tag('DC.Creator', $info{'author'})    if length $info{'author'};
    meta_tag('DC.Rights',  $info{'copyright'}) if length $info{'copyright'};
    meta_tag('DC.Source',  $info{'isbn'})      if length $info{'isbn'};
    # Prefer the publication date; fall back to the creation date.
    if (length $info{'publication date'}) {
        meta_tag('DC.Date', $info{'publication date'});
    }
    elsif (length $info{'creation date'}) {
        meta_tag('DC.Date', $info{'creation date'});
    }
    print "<meta name=\"DC.Type\" content=\"EBook\" />\n";
    print "<meta name=\"DC.Format\" content=\"application/x-newton-compatible-pkg\" />\n";
    meta_tag('DC.Contributor', $info{'software'}) if length $info{'software'};
    print "</head>\n";
    return;
}

#------------------------------------------------------------------------------
sub nb_body {
    #
    # Runs nb2txt on $Input and prints the <body> section followed by
    # the closing </html>.  The first four output lines are marked up
    # as <h1>, <strong>, <h2> and <em> respectively; a later line that
    # reads exactly "Contents" becomes a big/strong heading; everything
    # else is body text, with blank lines acting as paragraph breaks.
    # Dies if the pipe to nb2txt cannot be opened.
    #
    my $open_p   = 0;    # true once a <p> has actually been printed
    my $line_num = 0;
    my $bline    = '';   # markup to emit before the next text line

    open(my $cat, '-|', "$NB2TXT " . shell_quote($Input))
        or die "$NB2TXT doesn't want to be opened using pipe\n";

    print "<body>\n";
    while (defined(my $line = <$cat>)) {
        # Join a word split by an end-of-line hyphen with its
        # continuation on the following line(s).  The join is done
        # before the next read so a word ending on the last line of
        # the input is still joined; an undefined read means EOF.
        while ($Dehyphenate && $line =~ m/[A-Za-z\300-\377]-\s*$/) {
            my $next = <$cat>;
            last unless defined $next;
            $line .= $next;
            $line =~ s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s;
        }
        $line =~ s/\255/-/g;    # replace character 0xAD (octal 255) with a hyphen
        # replace bell, backspace, tab. etc. with single space:
        $line =~ s/[\000-\040]+/ /g;
        $line = HTML($line);

        if ($line_num == 0) {
            print "<h1>", $line, "</h1>", "\n";
        }
        elsif ($line_num == 1) {
            print "<strong>", $line, "</strong>", "\n";
        }
        elsif ($line_num == 2) {
            print "<h2>", $line, "</h2>", "\n";
        }
        elsif ($line_num == 3) {
            print "<em>", $line, "</em>", "\n";
        }
        elsif ($line =~ m/^Contents$/) {
            print "<big><strong>", $line, "</strong></big>", "\n";
        }
        elsif (length $line) {
            print $bline, $line, "\n";
            # Both paragraph prefixes leave a <p> open once printed.
            $open_p = 1 if $bline =~ m/<p>/;
            $bline = "<br />\n";
        }
        else {
            # Blank line: the next text line starts a new paragraph,
            # closing the current one only if one is really open.
            $bline = $open_p ? "</p>\n<p>\n" : "<p>\n";
        }
        $line_num++;
    }
    close $cat
        or warn "$NB2TXT did not finish cleanly (status $?)\n";

    print "</p>\n" if $open_p;
    print "</body>\n</html>\n";
    return;
}

#------------------------------------------------------------------------------
sub HTML {
    #
    # Returns a copy of the argument made safe for inclusion in HTML
    # text: whitespace runs (form feeds and newlines included) are
    # collapsed to one space, leading/trailing whitespace is removed,
    # &, < and > are escaped, and the characters listed in %Entity_for
    # are replaced by their named entities.  An undefined argument is
    # treated as the empty string.
    #
    my $text = shift;
    $text = '' unless defined $text;

    $text =~ s/\s+/ /g;    # replace multiple spaces, etc. with a single space
    $text =~ s/^ //;       # remove leading whitespace
    $text =~ s/ $//;       # remove trailing whitespace

    # '&' must be escaped first so the entities added below survive.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    $text =~ s/([\xA1-\xFF])/exists $Entity_for{$1} ? "&$Entity_for{$1};" : $1/ge;

    return $text;
}
|