SF.net SVN: docbook:[8711] branches/webhelp/xsl/webhelp

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 8711
          http://docbook.svn.sourceforge.net/docbook/?rev=8711&view=rev
Author:   kasunbg
Date:     2010-07-10 11:53:52 +0000 (Sat, 10 Jul 2010)

Log Message:
-----------
Added functionality to the "indexer" so that it no longer indexes the navigation contents. It now indexes only the contents under the "content" <div> and some metadata.

Modified Paths:
--------------
    branches/webhelp/xsl/webhelp/build.properties
    branches/webhelp/xsl/webhelp/build.xml
    branches/webhelp/xsl/webhelp/indexer/lib/nw-cms.jar
    branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
    branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
    branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
    branches/webhelp/xsl/webhelp/xsl/webhelp.xsl

Modified: branches/webhelp/xsl/webhelp/build.properties
===================================================================

--- branches/webhelp/xsl/webhelp/build.properties	2010-07-08 23:42:20 UTC (rev 8710)
+++ branches/webhelp/xsl/webhelp/build.properties	2010-07-10 11:53:52 UTC (rev 8711)
@@ -5,6 +5,7 @@
 # Download xml-commons-resolver from http://xml.apache.org/commons/dist/
 # and add it to your CLASSPATH. Then use the following property to point 
 # to your docbook xsl stylesheets' catalog
-docbook-xsl-catalog=c:/gsoc2010/docbook-xsl-1.75.2/catalog.xml
+#docbook-xsl-catalog=c:/gsoc2010/docbook-xsl-1.75.2/catalog.xml
+docbook-xsl-catalog=/media/DATA/ACADEMIC/GSOC/docbook/repository/docbook/trunk/maven/docbook-xsl/target/xsltmp/docbook-xsl-1.75.2/catalog.xml
 docbookx.dtd=file:///media/DATA/ACADEMIC/GSOC/docbook/repository/docbook/trunk/defguide/en/schema/docbookx.dtd
 validate=true

Modified: branches/webhelp/xsl/webhelp/build.xml
===================================================================
--- branches/webhelp/xsl/webhelp/build.xml	2010-07-08 23:42:20 UTC (rev 8710)
+++ branches/webhelp/xsl/webhelp/build.xml	2010-07-10 11:53:52 UTC (rev 8711)
@@ -13,7 +13,7 @@
   <property environment="env"/>
   <property name="ant.jar" value="${env.ANT_HOME}/lib/ant.jar"/>
 
-  <target name="validate" if="validate">
+  <target name="validate" if="validate">                            
 	<xmlvalidate file="${input-xml}">
 	  <xmlcatalog refid="catalog"/>
 	</xmlvalidate>

Modified: branches/webhelp/xsl/webhelp/indexer/lib/nw-cms.jar
===================================================================
(Binary files differ)

Modified: branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
===================================================================
--- branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java	2010-07-08 23:42:20 UTC (rev 8710)
+++ branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java	2010-07-10 11:53:52 UTC (rev 8711)
@@ -43,7 +43,7 @@
 	private String searchdir = "search";
 	private File inputDir = null;
 	private String outputDir = null;
-	private String projectDir = null;	
+	private String projectDir = null;
 
 	// ANT parameters
 	private String htmldir=null;
@@ -187,7 +187,7 @@
 				}
 				int ad = 1;
 				if (stemp.equals(projectDir)) ad = 0; 
-				stemp = stemp.substring(i+projectDir.length()+ad);
+				stemp = stemp.substring(i+projectDir.length()+ad);  //i is redundant (i==0 always)
 				ftemp = new File (stemp);
 				docFileInfoTemp.setFullpath(ftemp);
 				
@@ -242,17 +242,15 @@
     	// Files for punctuation (only one for now)
         String[] punctuationFiles = new String[] {"punctuation.props"};
         FileInputStream input;
-        String tempStr= new String();
+        String tempStr;
         File ftemp;
         Collection c = new ArrayList<String>();
 
-        //TODO punctuation files are located at webhelp/template/content/search/*.* But here, it refers to doc/content/search which does not exist.
-
         // Get the list of the props file containing the words to remove (not the punctuation)
         DirList props = new DirList(inputDir, "^(?!(punctuation)).*\\.props$", 1);
 		ArrayList<File> wordsList = props.getListFiles();
 		System.out.println("props files:"+wordsList);
-		
+        //TODO all properties are collected into a single ArrayList. Is that OK?
 		Properties enProps =new Properties ();
 		String propsDir = new String (inputDir.getPath().concat(File.separator).concat(searchdir));
 		

Modified: branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
===================================================================
--- branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java	2010-07-08 23:42:20 UTC (rev 8710)
+++ branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java	2010-07-10 11:53:52 UTC (rev 8711)
@@ -77,7 +77,8 @@
 		SAXParserFactory spf = SAXParserFactory.newInstance();
 		
 		spf.setValidating(false);
-		
+        addContent = false;
+		divCount = 0;
 		try {
 		
 			//get a new instance of parser
@@ -109,20 +110,23 @@
 			ie.printStackTrace();
 		}
 	}
-	
+   //kasun: TODO remove indexing of css styles
+    private boolean addContent = false;
+    private boolean addHeaderInfo = false;
+    private int divCount = 0;
 	//SAX parser Event Handlers:
 	public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
 
 		//dwc: capture current element name
-		currentElName = qName; 
+		currentElName = qName;
 
 		// dwc: Adding contents of some meta tags to the index
 		if((qName.equalsIgnoreCase("meta")) ) {
+            addHeaderInfo = true;
 			String attrName = attributes.getValue("name");
 			if(attrName != null && (attrName.equalsIgnoreCase("keywords") || attrName.equalsIgnoreCase("description"))){
 				strbf.append(" " + attributes.getValue("content") + " ");
 			}
-
 			// dwc: adding this to make the docbook <abstract> element
 			// (which becomes <meta name="description".../> in html)
 			// into the brief description that shows up in search
@@ -130,31 +134,46 @@
 			if(attrName != null && (attrName.equalsIgnoreCase("description"))){
 				fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(attributes.getValue("content").replace('\n', ' ')));
 			}
-		}
-		// dwc: End addition
-		
-		// dwc: commenting out DITA specific lines
+		} // dwc: End addition
+
+        // dwc: commenting out DITA specific lines
 		if((qName.equalsIgnoreCase("title")) || (qName.equalsIgnoreCase("shortdesc"))) {
 			tempVal = new StringBuffer();
 		}
 
-		// dwc: Adding mechansim to grab <p class="summary"> etc. Powered by para.propagates.style etc.
-		if(qName.equalsIgnoreCase("div")||qName.equalsIgnoreCase("p")||qName.equalsIgnoreCase("span")) {
-			String stemp = attributes.getValue("class");
-			if (stemp !=null && (stemp.equalsIgnoreCase("shortdesc")||stemp.equalsIgnoreCase("summary"))) {
-				shortdescBool = true;
-			}
-			tempVal = new StringBuffer();
-			strbf.append(" ");
-		}
-		if (shortdescBool == true) {
-			shortTagCpt ++;			
-		}
-		
+        if(qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc")){
+            addHeaderInfo = true;
+        } else {
+            addHeaderInfo = false;
+        }
+
+        String elementId = attributes.getValue("id"); 
+        if("content".equals(elementId)) addContent = true;
+
+        if(addContent) {
+            //counts div tags starting from "content" div(inclusive). This will be used to track the end of content "div" tag.
+            //see #endElement()
+            if(qName.equalsIgnoreCase("div")){
+                divCount++;
+            }
+
+            // dwc: Adding mechanism to grab <p class="summary"> etc. Powered by para.propagates.style etc.
+            if (qName.equalsIgnoreCase("div") || qName.equalsIgnoreCase("p") || qName.equalsIgnoreCase("span")) {
+                String stemp = attributes.getValue("class");
+                if (stemp != null && (stemp.equalsIgnoreCase("shortdesc") || stemp.equalsIgnoreCase("summary"))) {
+                    shortdescBool = true;
+                }
+                tempVal = new StringBuffer();
+                strbf.append(" ");
+            }
+            if (shortdescBool) {
+                shortTagCpt++;
+            }
+        }
 		strbf.append(" ");
+	}
 
-	}
-	
+	//triggers when there's character data inside an element.
 	public void characters(char[] ch, int start, int length) throws SAXException {
 		
 		// dwc: Bug fix. Don't index contents of script tag.
@@ -162,7 +181,8 @@
 		// index certain elements. E.g. Use this to implement a
 		// "titles only" index, say if you wanted to use <span/>s to
 		// create space breaks in ja_JP lines to indicate word breaks.
-		if(! currentElName.equalsIgnoreCase("script")){
+        
+		if((addContent || addHeaderInfo) && !currentElName.equalsIgnoreCase("script")){
 			String text = new String(ch,start,length);
 			strbf.append(text);
 			if (tempVal != null) { tempVal.append(text);}
@@ -183,7 +203,14 @@
 			tempVal = null;
 			shortdescBool = false;
 			}
-		}		
+		}
+        
+        if(qName.equalsIgnoreCase("div") && addContent){
+            divCount--;
+            if (divCount == 0) {
+                addContent = false;
+            }
+        } 
 	}
 	
 	public void processingInstruction(String target, String data) throws SAXException {
@@ -220,8 +247,9 @@
 			
 			//PrintWriter pw = new PrintWriter(new FileOutputStream(new File("xx.html")));
 			PrintWriter pw = new PrintWriter(new  OutputStreamWriter (new FileOutputStream(new File("xx.html")),"UTF-8"));
-			
-	
+			 //writes the content to xx.html after removing validation. This temp file will be source to index the
+            // content of the particular html page.
+
 			while(true)
 			{
 				int i1, i2;

Modified: branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
===================================================================
--- branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java	2010-07-08 23:42:20 UTC (rev 8710)
+++ branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java	2010-07-10 11:53:52 UTC (rev 8711)
@@ -1,10 +1,7 @@
 package com.nexwave.nquindexer;
 
 import java.io.File;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
+import java.util.*;
 
 
 // specific dita ot
@@ -72,13 +69,11 @@
 		
 		String str = cleanBuffer(strbf);
 
-		//System.out.println(file.toString()+" "+ str +"\n");
+		System.out.println(file.toString()+" "+ str +"\n");
 		String[] items = str.split("\\s");
 		//items: remove the duplicated strings first
 		HashSet <String> tempSet = new HashSet<String>();
-		for (String s : items) {
-			tempSet.add(s);
-		}
+        tempSet.addAll(Arrays.asList(items));
 		Iterator it = tempSet.iterator();
 		String s;
         while (it.hasNext()) {
@@ -86,11 +81,11 @@
         	s = (String)it.next();
         	if (tempDico.containsKey(s)) {
         		String temp = (String) tempDico.get(s);
-        		temp = temp.concat(",").concat(new Integer(i).toString());
+        		temp = temp.concat(",").concat(Integer.toString(i));
         		//System.out.println("temp="+s+"="+temp);
         		tempDico.put(s, temp);
         	}else {
-        		tempDico.put(s, new Integer(i).toString());
+        		tempDico.put(s, Integer.toString(i));
         	}
         }
         
@@ -125,7 +120,7 @@
 			tempStrBuf.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");
 			tempStrBuf.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");
 
-			str = str.replaceFirst("Copyright \xA9 1998-2007 NexWave Solutions.", " ");
+			str = str.replaceFirst("Copyright � 1998-2007 NexWave Solutions.", " ");
 			
 
 			//nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " ");

Modified: branches/webhelp/xsl/webhelp/xsl/webhelp.xsl
===================================================================
--- branches/webhelp/xsl/webhelp/xsl/webhelp.xsl	2010-07-08 23:42:20 UTC (rev 8710)
+++ branches/webhelp/xsl/webhelp/xsl/webhelp.xsl	2010-07-10 11:53:52 UTC (rev 8711)
@@ -369,7 +369,7 @@
                                         </form>
                                     </div>
                                     <div id="searchResults">
-
+                                           <center> </center>
                                     </div>
                                 </div>
                             </xsl:if>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.