From: <dc...@us...> - 2010-10-04 15:14:27
|
Revision: 8917 http://docbook.svn.sourceforge.net/docbook/?rev=8917&view=rev Author: dcramer Date: 2010-10-04 15:14:20 +0000 (Mon, 04 Oct 2010) Log Message: ----------- Merged in changes from webhelp branch to address issue #3058244 regarding the xx.html temp file that was being created Modified Paths: -------------- trunk/xsl/webhelp/build.xml trunk/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java trunk/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java Property Changed: ---------------- trunk/xsl/webhelp/ Property changes on: trunk/xsl/webhelp ___________________________________________________________________ Added: svn:mergeinfo + /branches/webhelp/xsl/webhelp:8693-8881,8914-8916 Modified: trunk/xsl/webhelp/build.xml =================================================================== --- trunk/xsl/webhelp/build.xml 2010-10-04 13:50:56 UTC (rev 8916) +++ trunk/xsl/webhelp/build.xml 2010-10-04 15:14:20 UTC (rev 8917) @@ -93,8 +93,6 @@ <fileset dir="${output-dir}/content/search" includes="*.props"/> </delete> - <delete file="xx.html"/> - </target> <target name="webhelp" depends="validate,chunk,index"/> Modified: trunk/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java =================================================================== --- trunk/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java 2010-10-04 13:50:56 UTC (rev 8916) +++ trunk/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java 2010-10-04 15:14:20 UTC (rev 8917) @@ -11,14 +11,10 @@ import java.util.Map; import java.util.Properties; -/* - import org.apache.tools.ant.BuildException; import org.apache.tools.ant.Task; -*/ - import com.nexwave.nsidita.DirList; import com.nexwave.nsidita.DocFileInfo; @@ -30,8 +26,7 @@ * @author N. Quaine * @author Kasun Gajasinghe <http://kasunbg.blogspot.com> */ -public class IndexerTask{ -//public class IndexerTask extends Task { +public class IndexerTask extends Task { // messages private String txt_no_inputdir = "Input directory not found:"; @@ -61,11 +56,13 @@ // Indexing features: words to remove private ArrayList<String> cleanUpStrings = null; private ArrayList<String> cleanUpChars = null; + + //Html extension + private String htmlExtension = "html"; // Constructor public IndexerTask() { super(); - } /** The setter for the "htmldir" attribute (parameter of the task) * @param htmldir @@ -75,6 +72,18 @@ this.htmldir = htmldir; } + /** + * Set the extension in which html files are generated + * @param htmlExtension The extension in wich html files are generated + */ + public void setHtmlextension(String htmlExtension) { + this.htmlExtension = htmlExtension; + //Trim the starting "." + if(this.htmlExtension.startsWith(".")) { + this.htmlExtension = this.htmlExtension.substring(1); + } + } + /** * setter for "indexerLanguage" attribute from ANT * @param indexerLanguage language for the search indexer. Used to differerentiate which stemmer to be used. @@ -104,14 +113,11 @@ IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point. } } - - /** * Implementation of the execute function (Task interface) */ -// public void execute() throws BuildException { - public void execute(){ + public void execute() throws BuildException { try{ //Use Xerces as the parser. Does not support Saxon6.5.5 parser System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser"); @@ -184,8 +190,7 @@ // Get the list of all html files but the tocs, covers and indexes - //DirList nsiDoc = new DirList(inputDir, "^(?!(toc|index|search|frameset|ix01)).*\\.html$", 1); - DirList nsiDoc = new DirList(inputDir, "^.*\\.html?$", 1); + DirList nsiDoc = new DirList(inputDir, "^.*\\." + htmlExtension + "?$", 1); htmlFiles = nsiDoc.getListFiles(); // Check if found html files if (htmlFiles.isEmpty()) { @@ -211,7 +216,7 @@ // Retrieve the clean-up properties for indexing RetrieveCleanUpProps(); - // System.out.print("clean"+" " +cleanUpStrings); + // System.out.print("clean"+" " +cleanUpStrings); //create a default handler //SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files Modified: trunk/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java =================================================================== --- trunk/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java 2010-10-04 13:50:56 UTC (rev 8916) +++ trunk/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java 2010-10-04 15:14:20 UTC (rev 8917) @@ -1,17 +1,11 @@ package com.nexwave.nquindexer; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; +import java.io.*; import com.nexwave.nsidita.BlankRemover; import com.nexwave.nsidita.DocFileInfo; +import org.xml.sax.InputSource; import org.xml.sax.SAXParseException; /** @@ -88,8 +82,11 @@ long start = System.currentTimeMillis(); //System.out.println("about to parse " + file.getName() + " >>> " + start); - if ( RemoveValidationPI (file) == 0){ - sp.parse("xx.html", this); + String content = RemoveValidationPI (file); + if (content != null){ + InputSource is = new InputSource(new StringReader(content)); + is.setSystemId(file.toURI().toURL().toString()); + sp.parse(is, this); } long finish = System.currentTimeMillis(); @@ -182,12 +179,9 @@ //triggers when there's character data inside an element. public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { - - // dwc: Bug fix. Don't index contents of script tag. - // dwc: TODO: Add code here to conditionally index or not + // index certain elements. E.g. Use this to implement a - // "titles only" index, say if you wanted to use <span/>s to - // create space breaks in ja_JP lines to indicate word breaks. + // "titles only" index, if((addContent || addHeaderInfo) && !doNotIndex && !currentElName.equalsIgnoreCase("script")){ String text = new String(ch,start,length); @@ -245,17 +239,14 @@ * @param file * @return int: returns 0 if no IOException occurs, else 1. */ - public int RemoveValidationPI (File file) { - + public String RemoveValidationPI (File file) { + StringBuilder sb = new StringBuilder(); + //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will + // be the source to index the content of the particular html page. try { BufferedReader br = new BufferedReader( new InputStreamReader( new FileInputStream(file),"UTF-8")); - - //PrintWriter pw = new PrintWriter(new FileOutputStream(new File("xx.html"))); - PrintWriter pw = new PrintWriter(new OutputStreamWriter (new FileOutputStream(new File("xx.html")),"UTF-8")); - //writes the content to xx.html after removing validation. This temp file will be source to index the - // content of the particular html page. while(true) { @@ -278,7 +269,8 @@ if (line.contains("<?xml version")) { line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n"); } - pw.write(line + "\n"); + + sb.append(line + "\n"); } else { //dwc: What is this trying to do? Nuke the DOCTYPE? Why? @@ -296,7 +288,8 @@ line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n"); } line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E","\n"); - pw.write(line); + + sb.append(line); } } catch (IOException e) @@ -304,18 +297,15 @@ break; } } - - - pw.flush(); - pw.close(); + br.close(); } catch (IOException e) { - return 1; + return null; } - return 0; // return status + return sb.toString(); // return status } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |