From: <ka...@us...> - 2010-08-08 16:43:20
|
Revision: 8799 http://docbook.svn.sourceforge.net/docbook/?rev=8799&view=rev Author: kasunbg Date: 2010-08-08 16:43:13 +0000 (Sun, 08 Aug 2010) Log Message: ----------- Added client-side support for CJK searching. By default, for CJK, 2-gram tokenizing is used both at client run time and at indexer build time, e.g. クに接続 will be tokenized to "クに", "に接", "接続". Better tokenizers to consider are IKAnalyzer and Paoding Analyzer. These are dictionary-based, so their results are more accurate. Modified Paths: -------------- branches/webhelp/xsl/webhelp/indexer/lib/nw-cms.jar branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java branches/webhelp/xsl/webhelp/template/content/search/nwSearchFnt.js Modified: branches/webhelp/xsl/webhelp/indexer/lib/nw-cms.jar =================================================================== (Binary files differ) Modified: branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java =================================================================== --- branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java 2010-08-07 19:12:48 UTC (rev 8798) +++ branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java 2010-08-08 16:43:13 UTC (rev 8799) @@ -51,7 +51,7 @@ // ANT parameters private String htmldir=null; - private String indexerLanguage="en"; + public static String indexerLanguage="en"; //supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK, // as stemmers doesn't find a difference between them. 
@@ -90,7 +90,7 @@ int i=0; for (;i<supportedLanguages.length;i++) { if(indexerLanguage.equals(supportedLanguages[i])){ - this.indexerLanguage = supportedLanguages[i]; + IndexerTask.indexerLanguage = supportedLanguages[i]; break; } } @@ -100,10 +100,10 @@ System.out.println("The given language, \""+indexerLanguage+"\", does not have extensive support for " + "searching or language code is specified in a bad format. Check documentation for details. " + "Language now defaults to english."); - this.indexerLanguage = "en"; + IndexerTask.indexerLanguage = "en"; } } else { - this.indexerLanguage = "en"; + IndexerTask.indexerLanguage = "en"; } } Modified: branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java =================================================================== --- branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java 2010-08-07 19:12:48 UTC (rev 8798) +++ branches/webhelp/xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java 2010-08-08 16:43:13 UTC (rev 8799) @@ -182,6 +182,8 @@ The value is the numbers of the files in which the word exists. 
Example: w["key"]="file1,file2,file3";*/ int count = 0; + if(i==1) + out.write("var indexerLanguage=\""+IndexerTask.indexerLanguage+"\";\n"); out.write("//Auto generated index for searching.\n"); while (keyIt.hasNext()) { //&& (tempLetter == tstr.charAt(0)) out.write("w[\"" + tstr + "\"]" + "=\"" + indexMap.get(tstr) + "\";\n"); Modified: branches/webhelp/xsl/webhelp/template/content/search/nwSearchFnt.js =================================================================== --- branches/webhelp/xsl/webhelp/template/content/search/nwSearchFnt.js 2010-08-07 19:12:48 UTC (rev 8798) +++ branches/webhelp/xsl/webhelp/template/content/search/nwSearchFnt.js 2010-08-08 16:43:13 UTC (rev 8799) @@ -8,12 +8,10 @@ */ //string initialization +var htmlfileList = "htmlFileList.js"; +var htmlfileinfoList = "htmlFileInfoList.js"; +var useCJKTokenizing = false; - -htmlfileList = "htmlFileList.js"; -htmlfileinfoList = "htmlFileInfoList.js"; - - /* Cette fonction verifie la validite de la recherche entrre par l utilisateur */ function Verifie(ditaSearch_Form) { @@ -56,13 +54,12 @@ //DisplayWaitingMessage(); /*data initialisation*/ - searchFor = ""; // expression en lowercase et sans les caracteres speciaux + searchFor = ""; // expression en lowercase et sans les caracte res speciaux //w = new Object(); // hashtable, key=word, value = list of the index of the html files scriptLetterTab = new scriptfirstchar(); // Array containing the first letter of each word to look for var scriptsarray = new Array(); // Array with the name of the scripts to load var wordsList = new Array(); // Array with the words to look for - var cleanwordsList = new Array(); // Array with the words to look for - var stemmedWordsList = new Array(); // Array with the words to look for after removing spaces + var finalWordsList = new Array(); // Array with the words to look for after removing spaces var listNumerosDesFicStr = ""; var ou_recherche = true; var linkTab = new Array(); @@ -80,34 +77,20 @@ wordsList = 
searchFor.split(" "); wordsList.sort(); - for(var j in wordsList){ - var word = wordsList[j]; - if(typeof stemmer != "undefined" ){ - stemQueryMap[stemmer(word)] = word; - } else { - stemQueryMap[word] = word; - } + //set the tokenizing method + if(typeof indexerLanguage != "undefined" && (indexerLanguage=="cn" || indexerLanguage=="ja" ||indexerLanguage=="ko")){ + useCJKTokenizing=true; + } else { + useCJKTokenizing=false; } - - //stemmedWordsList is the stemmed list of words separated by spaces. - for (t in wordsList) { - wordsList[t] = wordsList[t].replace(/(%22)|^-/g, "") - if (wordsList[t] != "%20") { - scriptLetterTab.add(wordsList[t].charAt(0)); - cleanwordsList.push(wordsList[t]); - } + //If Lucene CJKTokenizer was used as the indexer, then useCJKTokenizing will be true. Else, do normal tokenizing. + // 2-gram tokenizinghappens in CJKTokenizing, + if(useCJKTokenizing){ + finalWordsList = cjkTokenize(wordsList); + } else { + finalWordsList = tokenize(wordsList); } - if(typeof stemmer != "undefined" ){ - //Do the stemming using Porter's stemming algorithm - for (var i = 0; i < cleanwordsList.length; i++) { - var stemWord = stemmer(cleanwordsList[i]); - stemmedWordsList.push(stemWord); - } - } else { - stemmedWordsList = cleanwordsList; - } - //load the scripts with the indices: the following lines do not work on the server. To be corrected /*if (IEBrowser) { scriptsarray = loadTheIndexScripts (scriptLetterTab); @@ -117,32 +100,29 @@ * Compare with the indexed words (in the w[] array), and push words that are in it to tempTab. 
*/ var tempTab = new Array(); - for (t in stemmedWordsList) { - if (w[stemmedWordsList[t].toString()] == undefined) { - txt_wordsnotfound += stemmedWordsList[t] + " "; + for (t in finalWordsList) { + if (w[finalWordsList[t].toString()] == undefined) { + txt_wordsnotfound += finalWordsList[t] + " "; } else { - tempTab.push(stemmedWordsList[t]); + tempTab.push(finalWordsList[t]); } } - stemmedWordsList = tempTab; + finalWordsList = tempTab; - if (stemmedWordsList.length) { + if (finalWordsList.length) { - // recherche 'et' et 'ou' en une fois - fileAndWordList = SortResults(stemmedWordsList); + //search 'and' and 'or' one time + fileAndWordList = SortResults(finalWordsList); cpt = fileAndWordList.length; - for (i = cpt - 1; i >= 0; i--) { + for (var i = cpt - 1; i >= 0; i--) { if (fileAndWordList[i] != undefined) { - linkTab.push("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">" + fileAndWordList[i][0].motslisteDisplay + "</span>" + "</p>"); linkTab.push("<ul class='searchresult'>"); for (t in fileAndWordList[i]) { //DEBUG: alert(": "+ fileAndWordList[i][t].filenb+" " +fileAndWordList[i][t].motsliste); //linkTab.push("<li><a href=\"../"+fl[fileAndWordList[i][t].filenb]+"\">"+fl[fileAndWordList[i][t].filenb]+"</a></li>"); - - tempInfo = fil[fileAndWordList[i][t].filenb]; pos1 = tempInfo.indexOf("@@@"); pos2 = tempInfo.lastIndexOf("@@@"); @@ -151,24 +131,21 @@ tempShortdesc = tempInfo.substring(pos2 + 3, tempInfo.length); //file:///home/kasun/docbook/WEBHELP/webhelp-draft-output-format-idea/src/main/resources/web/webhelp/installation.html - var linkString = "<li><a href=" + tempPath + ">" + tempTitle + "</a>"; - // var linkString = "<li><a href=\"installation.html\">" + tempTitle + "</a>"; + var linkString = "<li><a href=" + tempPath + ">" + tempTitle + "</a>"; + // var linkString = "<li><a href=\"installation.html\">" + tempTitle + "</a>"; if ((tempShortdesc != "null")) { linkString += "\n<div class=\"shortdesclink\">" + tempShortdesc + "</div>"; } 
linkString += "</li>"; - linkTab.push(linkString); - } linkTab.push("</ul>"); } } } - var results=""; - if (linkTab.length > 0) { - + var results = ""; + if (linkTab.length > 0) { /*writeln ("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">" + cleanwordsList + "</span>" + "<br/>"+"</p>");*/ results = "<p>"; //write("<ul class='searchresult'>"); @@ -176,47 +153,133 @@ results += linkTab[t].toString(); } results += "</p>"; - } else{ - results = "<p>"+"Your search returned no results for "+ "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" +"</p>"; + } else { + results = "<p>" + "Your search returned no results for " + "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" + "</p>"; } //alert(results); + document.getElementById('searchResults').innerHTML = results; +} - document.getElementById('searchResults').innerHTML = results; +function tokenize(wordsList){ + var stemmedWordsList = new Array(); // Array with the words to look for after removing spaces + var cleanwordsList = new Array(); // Array with the words to look for + for(var j in wordsList){ + var word = wordsList[j]; + if(typeof stemmer != "undefined" ){ + stemQueryMap[stemmer(word)] = word; + } else { + stemQueryMap[word] = word; + } + } + //stemmedWordsList is the stemmed list of words separated by spaces. 
+ for (t in wordsList) { + wordsList[t] = wordsList[t].replace(/(%22)|^-/g, "") + if (wordsList[t] != "%20") { + scriptLetterTab.add(wordsList[t].charAt(0)); + cleanwordsList.push(wordsList[t]); + } + } - /* Display results * / - with (parent.frames['searchresults'].document) { - writeln("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\n<html><head>"); - writeln("<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">"); - //writeln("<link href=\"css/commonltr.css\" type=\"text/css\" rel=\"stylesheet\">" ); - //writeln("<link rel=\"stylesheet\" type=\"text/css\" href=\"css/search.css\">") ; - writeln("<style>body{\ - font-family: verdana, sans-serif;\ - font-size: .7em;\ - background: #f3f3f3; }\ - .searchExpression{ font-weight: bold;}</style>") ; - writeln("<title>"+txt_filesfound+"</title></head>"); - writeln("<body onload = \"self.focus()\">"); - //writeln("<h2>" + txt_search_result + " " + "<i>" + wordsList + "</i>" + "</h2>"); + if(typeof stemmer != "undefined" ){ + //Do the stemming using Porter's stemming algorithm + for (var i = 0; i < cleanwordsList.length; i++) { + var stemWord = stemmer(cleanwordsList[i]); + stemmedWordsList.push(stemWord); + } + } else { + stemmedWordsList = cleanwordsList; + } + return stemmedWordsList; +} - // If no results, display a message - if ( txt_wordsnotfound != "" ) {writeln("<p>"+"Your search returned no results for "+ "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" +"</p>")} +function cjkTokenize(wordsList){ + var allTokens= new Array(); + var notCJKTokens= new Array(); + var j=0; + for(j=0;j<wordsList.length;j++){ + var word = wordsList[j]; + if(getAvgAsciiValue(word) < 127){ + notCJKTokens.push(word); + } else { + var tokenizer = new CJKTokenizer(word); + var tokensTmp = tokenizer.getAllTokens(); + allTokens = allTokens.concat(tokensTmp); + } + } + allTokens = allTokens.concat(tokenize(notCJKTokens)); + return allTokens; +} - // If results: display them - if 
(linkTab.length > 0 ) { +//A simple way to determine whether the query is in english or not. +function getAvgAsciiValue(word){ + var tmp = 0; + var num = word.length < 5 ? word.length:5; + for(i=0;i<num;i++){ + if(i==5) break; + tmp += word.charCodeAt(i); + } + return tmp/num; +} - /*writeln ("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">" + cleanwordsList + "</span>" + "<br/>"+"</p>");* / - write("<p>"); - //write("<ul class='searchresult'>"); - for (t in linkTab) { - writeln(linkTab[t].toString()) - } - writeln("</p>"); - } +//CJKTokenizer +function CJKTokenizer(input){ + this.input = input; + this.offset=-1; + this.tokens = new Array(); + this.incrementToken = incrementToken; + this.tokenize = tokenize; + this.getAllTokens = getAllTokens; + this.unique = unique; - writeln ("</body></html>"); - close() ; + function incrementToken(){ + if(this.input.length - 2 <= this.offset){ + // console.log("false "+offset); + return false; + } + else { + this.offset+=1; + return true; + } + } - } */ + function tokenize(){ + //document.getElementById("content").innerHTML += x.substring(offset,offset+2)+"<br>"; + return this.input.substring(this.offset,this.offset+2); + } + + function getAllTokens(){ + while(this.incrementToken()){ + var tmp = this.tokenize(); + this.tokens.push(tmp); + } + var sortedTokens = this.unique(this.tokens); + + return sortedTokens; +// document.getElementById("content").innerHTML += tokens+" "; +// document.getElementById("content").innerHTML += "<br>dada"+sortedTokens+" "; +// console.log(tokens.length+"dsdsds"); + /*for(i=0;i<tokens.length;i++){ + console.log(tokens[i]); + var ss = tokens[i] == sortedTokens[i]; + +// document.getElementById("content").innerHTML += "<br>dada"+un[i]+"- "+stems[i]+" "+ ss; + document.getElementById("content").innerHTML += "<br>"+sortedTokens[i]; + }*/ + } + + function unique(a) + { + var r = new Array(); + o:for(var i = 0, n = a.length; i < n; i++) + { + for(var x = 0, y = r.length; x < y; x++) + { 
+ if(r[x]==a[i]) continue o; + } + r[r.length] = a[i]; + } + return r; + } } @@ -316,9 +379,10 @@ } } +/* function onLoadComplete() { alert("loaded !!"); -} +} */ /* End of scriptloader functions */ @@ -413,7 +477,11 @@ var tempDisplay = new Array(); for (var x in tab) { - tempDisplay.push(stemQueryMap[tab[x]]); //get the original word from the stem word. + if(stemQueryMap[tab[x]] != undefined){ + tempDisplay.push(stemQueryMap[tab[x]]); //get the original word from the stem word. + } else { + tempDisplay.push(tab[x]); //no stem is available. (probably a CJK language) + } } var tempDispString = tempDisplay.join(", "); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |