From: <ka...@us...> - 2010-07-10 11:53:59
|
Revision: 8711 http://docbook.svn.sourceforge.net/docbook/?rev=8711&view=rev Author: kasunbg Date: 2010-07-10 11:53:52 +0000 (Sat, 10 Jul 2010) Log Message: ----------- Added functionality to "indexer" to not index the navigation contents. Now it indexes only contents under "content" <div> and some metadata. Modified Paths: -------------- branches/webhelp/xsl/webhelp/build.properties branches/webhelp/xsl/webhelp/build.xml branches/webhelp/xsl/webhelp/indexer/lib/nw-cms.jar branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java branches/webhelp/xsl/webhelp/xsl/webhelp.xsl Modified: branches/webhelp/xsl/webhelp/build.properties =================================================================== --- branches/webhelp/xsl/webhelp/build.properties 2010-07-08 23:42:20 UTC (rev 8710) +++ branches/webhelp/xsl/webhelp/build.properties 2010-07-10 11:53:52 UTC (rev 8711) @@ -5,6 +5,7 @@ # Download xml-commons-resolver from http://xml.apache.org/commons/dist/ # and add it to your CLASSPATH. 
Then use the following property to point # to your docbook xsl stylesheets' catalog -docbook-xsl-catalog=c:/gsoc2010/docbook-xsl-1.75.2/catalog.xml +#docbook-xsl-catalog=c:/gsoc2010/docbook-xsl-1.75.2/catalog.xml +docbook-xsl-catalog=/media/DATA/ACADEMIC/GSOC/docbook/repository/docbook/trunk/maven/docbook-xsl/target/xsltmp/docbook-xsl-1.75.2/catalog.xml docbookx.dtd=file:///media/DATA/ACADEMIC/GSOC/docbook/repository/docbook/trunk/defguide/en/schema/docbookx.dtd validate=true Modified: branches/webhelp/xsl/webhelp/build.xml =================================================================== --- branches/webhelp/xsl/webhelp/build.xml 2010-07-08 23:42:20 UTC (rev 8710) +++ branches/webhelp/xsl/webhelp/build.xml 2010-07-10 11:53:52 UTC (rev 8711) @@ -13,7 +13,7 @@ <property environment="env"/> <property name="ant.jar" value="${env.ANT_HOME}/lib/ant.jar"/> - <target name="validate" if="validate"> + <target name="validate" if="validate"> <xmlvalidate file="${input-xml}"> <xmlcatalog refid="catalog"/> </xmlvalidate> Modified: branches/webhelp/xsl/webhelp/indexer/lib/nw-cms.jar =================================================================== (Binary files differ) Modified: branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java =================================================================== --- branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java 2010-07-08 23:42:20 UTC (rev 8710) +++ branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java 2010-07-10 11:53:52 UTC (rev 8711) @@ -43,7 +43,7 @@ private String searchdir = "search"; private File inputDir = null; private String outputDir = null; - private String projectDir = null; + private String projectDir = null; // ANT parameters private String htmldir=null; @@ -187,7 +187,7 @@ } int ad = 1; if (stemp.equals(projectDir)) ad = 0; - stemp = stemp.substring(i+projectDir.length()+ad); + stemp = stemp.substring(i+projectDir.length()+ad); //i is 
redundant (i==0 always) ftemp = new File (stemp); docFileInfoTemp.setFullpath(ftemp); @@ -242,17 +242,15 @@ // Files for punctuation (only one for now) String[] punctuationFiles = new String[] {"punctuation.props"}; FileInputStream input; - String tempStr= new String(); + String tempStr; File ftemp; Collection c = new ArrayList<String>(); - //TODO punctuation files are located at webhelp/template/content/search/*.* But here, it refers to doc/content/search which does not exist. - // Get the list of the props file containing the words to remove (not the punctuation) DirList props = new DirList(inputDir, "^(?!(punctuation)).*\\.props$", 1); ArrayList<File> wordsList = props.getListFiles(); System.out.println("props files:"+wordsList); - + //TODO all properties are taken toa single arraylist. does it ok?. Properties enProps =new Properties (); String propsDir = new String (inputDir.getPath().concat(File.separator).concat(searchdir)); Modified: branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java =================================================================== --- branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java 2010-07-08 23:42:20 UTC (rev 8710) +++ branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java 2010-07-10 11:53:52 UTC (rev 8711) @@ -77,7 +77,8 @@ SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); - + addContent = false; + divCount = 0; try { //get a new instance of parser @@ -109,20 +110,23 @@ ie.printStackTrace(); } } - + //kasun: TODO remove indexing of css styles + private boolean addContent = false; + private boolean addHeaderInfo = false; + private int divCount = 0; //SAX parser Event Handlers: public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { //dwc: capture current element name - currentElName = qName; + currentElName = qName; // dwc: Adding contents of some 
meta tags to the index if((qName.equalsIgnoreCase("meta")) ) { + addHeaderInfo = true; String attrName = attributes.getValue("name"); if(attrName != null && (attrName.equalsIgnoreCase("keywords") || attrName.equalsIgnoreCase("description"))){ strbf.append(" " + attributes.getValue("content") + " "); } - // dwc: adding this to make the docbook <abstract> element // (which becomes <meta name="description".../> in html) // into the brief description that shows up in search @@ -130,31 +134,46 @@ if(attrName != null && (attrName.equalsIgnoreCase("description"))){ fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(attributes.getValue("content").replace('\n', ' '))); } - } - // dwc: End addition - - // dwc: commenting out DITA specific lines + } // dwc: End addition + + // dwc: commenting out DITA specific lines if((qName.equalsIgnoreCase("title")) || (qName.equalsIgnoreCase("shortdesc"))) { tempVal = new StringBuffer(); } - // dwc: Adding mechansim to grab <p class="summary"> etc. Powered by para.propagates.style etc. - if(qName.equalsIgnoreCase("div")||qName.equalsIgnoreCase("p")||qName.equalsIgnoreCase("span")) { - String stemp = attributes.getValue("class"); - if (stemp !=null && (stemp.equalsIgnoreCase("shortdesc")||stemp.equalsIgnoreCase("summary"))) { - shortdescBool = true; - } - tempVal = new StringBuffer(); - strbf.append(" "); - } - if (shortdescBool == true) { - shortTagCpt ++; - } - + if(qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc")){ + addHeaderInfo = true; + } else { + addHeaderInfo = false; + } + + String elementId = attributes.getValue("id"); + if("content".equals(elementId)) addContent = true; + + if(addContent) { + //counts div tags starting from "content" div(inclusive). This will be used to track the end of content "div" tag. + //see #endElement() + if(qName.equalsIgnoreCase("div")){ + divCount++; + } + + // dwc: Adding mechansim to grab <p class="summary"> etc. 
Powered by para.propagates.style etc. + if (qName.equalsIgnoreCase("div") || qName.equalsIgnoreCase("p") || qName.equalsIgnoreCase("span")) { + String stemp = attributes.getValue("class"); + if (stemp != null && (stemp.equalsIgnoreCase("shortdesc") || stemp.equalsIgnoreCase("summary"))) { + shortdescBool = true; + } + tempVal = new StringBuffer(); + strbf.append(" "); + } + if (shortdescBool) { + shortTagCpt++; + } + } strbf.append(" "); + } - } - + //triggers when there's character data inside an element. public void characters(char[] ch, int start, int length) throws SAXException { // dwc: Bug fix. Don't index contents of script tag. @@ -162,7 +181,8 @@ // index certain elements. E.g. Use this to implement a // "titles only" index, say if you wanted to use <span/>s to // create space breaks in ja_JP lines to indicate word breaks. - if(! currentElName.equalsIgnoreCase("script")){ + + if((addContent || addHeaderInfo) && !currentElName.equalsIgnoreCase("script")){ String text = new String(ch,start,length); strbf.append(text); if (tempVal != null) { tempVal.append(text);} @@ -183,7 +203,14 @@ tempVal = null; shortdescBool = false; } - } + } + + if(qName.equalsIgnoreCase("div") && addContent){ + divCount--; + if (divCount == 0) { + addContent = false; + } + } } public void processingInstruction(String target, String data) throws SAXException { @@ -220,8 +247,9 @@ //PrintWriter pw = new PrintWriter(new FileOutputStream(new File("xx.html"))); PrintWriter pw = new PrintWriter(new OutputStreamWriter (new FileOutputStream(new File("xx.html")),"UTF-8")); - - + //writes the content to xx.html after removing validation. This temp file will be source to index the + // content of the particular html page. 
+ while(true) { int i1, i2; Modified: branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java =================================================================== --- branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java 2010-07-08 23:42:20 UTC (rev 8710) +++ branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java 2010-07-10 11:53:52 UTC (rev 8711) @@ -1,10 +1,7 @@ package com.nexwave.nquindexer; import java.io.File; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; +import java.util.*; // specific dita ot @@ -72,13 +69,11 @@ String str = cleanBuffer(strbf); - //System.out.println(file.toString()+" "+ str +"\n"); + System.out.println(file.toString()+" "+ str +"\n"); String[] items = str.split("\\s"); //items: remove the duplicated strings first HashSet <String> tempSet = new HashSet<String>(); - for (String s : items) { - tempSet.add(s); - } + tempSet.addAll(Arrays.asList(items)); Iterator it = tempSet.iterator(); String s; while (it.hasNext()) { @@ -86,11 +81,11 @@ s = (String)it.next(); if (tempDico.containsKey(s)) { String temp = (String) tempDico.get(s); - temp = temp.concat(",").concat(new Integer(i).toString()); + temp = temp.concat(",").concat(Integer.toString(i)); //System.out.println("temp="+s+"="+temp); tempDico.put(s, temp); }else { - tempDico.put(s, new Integer(i).toString()); + tempDico.put(s, Integer.toString(i)); } } @@ -125,7 +120,7 @@ tempStrBuf.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"); tempStrBuf.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"); - str = str.replaceFirst("Copyright \xA9 1998-2007 NexWave Solutions.", " "); + str = str.replaceFirst("Copyright © 1998-2007 NexWave Solutions.", " "); //nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " "); Modified: branches/webhelp/xsl/webhelp/xsl/webhelp.xsl =================================================================== 
--- branches/webhelp/xsl/webhelp/xsl/webhelp.xsl 2010-07-08 23:42:20 UTC (rev 8710) +++ branches/webhelp/xsl/webhelp/xsl/webhelp.xsl 2010-07-10 11:53:52 UTC (rev 8711) @@ -369,7 +369,7 @@ </form> </div> <div id="searchResults"> - + <center> </center> </div> </div> </xsl:if> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |