From: <hei...@us...> - 2011-03-21 16:19:30
Revision: 7489 http://geonetwork.svn.sourceforge.net/geonetwork/?rev=7489&view=rev Author: heikkidoeleman Date: 2011-03-21 16:19:24 +0000 (Mon, 21 Mar 2011) Log Message: ----------- #476 : Mini-codesprint : improvements to GeoNetworkAnalyzer Modified Paths: -------------- trunk/web/src/main/java/org/fao/geonet/kernel/search/GeoNetworkAnalyzer.java trunk/web/src/main/java/org/fao/geonet/kernel/search/LuceneConfig.java trunk/web/src/main/java/org/fao/geonet/kernel/search/LuceneQueryBuilder.java trunk/web/src/test/java/org/fao/geonet/kernel/search/LuceneQueryTest.java Modified: trunk/web/src/main/java/org/fao/geonet/kernel/search/GeoNetworkAnalyzer.java =================================================================== --- trunk/web/src/main/java/org/fao/geonet/kernel/search/GeoNetworkAnalyzer.java 2011-03-19 07:34:25 UTC (rev 7488) +++ trunk/web/src/main/java/org/fao/geonet/kernel/search/GeoNetworkAnalyzer.java 2011-03-21 16:19:24 UTC (rev 7489) @@ -26,13 +26,15 @@ import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.util.Version; import java.io.Reader; import java.util.Set; /** - * Default Lucene analyzer for GeoNetwork, based on WhitespaceTokenizer but with added LowercaseFilter and + * Default Lucene analyzer for GeoNetwork, based on a modified version of WhitespaceTokenizer and with added LowercaseFilter and * ASCIIFoldingFilter, and optionally StopFilter. 
* <p/> * Reason is that with StandardAnalyzer, which GeoNetwork was using before, it tokenizes such that the character * is @@ -74,13 +76,13 @@ @Override protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { - final Tokenizer source = new WhitespaceTokenizer(reader); + final Tokenizer source = new StandardTokenizer(Version.LUCENE_30, reader); if(stopwords != null) { - return new TokenStreamComponents(source, new ASCIIFoldingFilter(new LowerCaseFilter(new StopFilter(enablePositionIncrements, source, stopwords, ignoreCase)))); + return new TokenStreamComponents(source, new StopFilter(enablePositionIncrements, new ASCIIFoldingFilter(new LowerCaseFilter(new StandardFilter(source))), stopwords, ignoreCase)); } else { - return new TokenStreamComponents(source, new ASCIIFoldingFilter(new LowerCaseFilter(source))); + return new TokenStreamComponents(source, new ASCIIFoldingFilter(new LowerCaseFilter(new StandardFilter(source)))); } } Modified: trunk/web/src/main/java/org/fao/geonet/kernel/search/LuceneConfig.java =================================================================== --- trunk/web/src/main/java/org/fao/geonet/kernel/search/LuceneConfig.java 2011-03-19 07:34:25 UTC (rev 7488) +++ trunk/web/src/main/java/org/fao/geonet/kernel/search/LuceneConfig.java 2011-03-21 16:19:24 UTC (rev 7489) @@ -126,12 +126,13 @@ private boolean trackMaxScore = false; private boolean docsScoredInOrder = false; - private Version LUCENE_VERSION = Version.LUCENE_29; + private Version LUCENE_VERSION = Version.LUCENE_30; /** - * Create a new Lucene configuration from an XML configuration file. + * Creates a new Lucene configuration from an XML configuration file. 
* - * @param configurationFilePath + * @param appPath + * @param luceneConfigXmlFile */ public LuceneConfig(String appPath, String luceneConfigXmlFile) { Log.debug(Geonet.SEARCH_ENGINE, "Loading Lucene configuration ..."); Modified: trunk/web/src/main/java/org/fao/geonet/kernel/search/LuceneQueryBuilder.java =================================================================== --- trunk/web/src/main/java/org/fao/geonet/kernel/search/LuceneQueryBuilder.java 2011-03-19 07:34:25 UTC (rev 7488) +++ trunk/web/src/main/java/org/fao/geonet/kernel/search/LuceneQueryBuilder.java 2011-03-21 16:19:24 UTC (rev 7489) @@ -84,7 +84,40 @@ throw new IllegalArgumentException("Cannot create Lucene query for null string"); } Query query = null; - String analyzedString = LuceneSearcher.analyzeQueryText(luceneIndexField, string, _analyzer, _tokenizedFieldSet); + + String analyzedString = ""; + // wildcards - preserve them by analyzing the parts of the search string around them separately + // (this is because Lucene's StandardTokenizer would remove wildcards, but that's not what we want) + if(string.indexOf('*') >= 0 || string.indexOf('?') >= 0) { + String starsPreserved = ""; + String[] starSeparatedList = string.split("\\*"); + for(String starSeparatedPart : starSeparatedList) { + String qPreserved = ""; + // ? present + if(starSeparatedPart.indexOf('?') >= 0) { + String[] qSeparatedList = starSeparatedPart.split("\\?"); + for(String qSeparatedPart : qSeparatedList) { + String analyzedPart = LuceneSearcher.analyzeQueryText(luceneIndexField, qSeparatedPart, _analyzer, _tokenizedFieldSet); + qPreserved += '?' + analyzedPart; + } + // remove leading ? + qPreserved = qPreserved.substring(1); + starsPreserved += '*' + qPreserved; + } + // no ? 
present + else { + starsPreserved += '*' + LuceneSearcher.analyzeQueryText(luceneIndexField, starSeparatedPart, _analyzer, _tokenizedFieldSet); + } + } + // remove leading * + starsPreserved = starsPreserved.substring(1); + analyzedString = starsPreserved; + } + // no wildcards + else { + analyzedString = LuceneSearcher.analyzeQueryText(luceneIndexField, string, _analyzer, _tokenizedFieldSet); + } + if(StringUtils.hasLength(analyzedString)) { // no wildcards if(string.indexOf('*') < 0 && string.indexOf('?') < 0) { @@ -106,6 +139,8 @@ return query; } + + /** * Creates a query for all tokens in the search param. The query must select only results * where none of the tokens in the search param is present. @@ -258,6 +293,12 @@ return query; } + /** + * Builds a Lucene query from LuceneQueryInput. + * + * @param luceneQueryInput + * @return query + */ public Query build(LuceneQueryInput luceneQueryInput) { Log.debug(Geonet.SEARCH_ENGINE, "\n\nLuceneQueryBuilder: luceneQueryInput is\n" + luceneQueryInput.toString() + "\n\n"); Modified: trunk/web/src/test/java/org/fao/geonet/kernel/search/LuceneQueryTest.java =================================================================== --- trunk/web/src/test/java/org/fao/geonet/kernel/search/LuceneQueryTest.java 2011-03-19 07:34:25 UTC (rev 7488) +++ trunk/web/src/test/java/org/fao/geonet/kernel/search/LuceneQueryTest.java 2011-03-21 16:19:24 UTC (rev 7489) @@ -61,11 +61,29 @@ /** * 'any' parameter with a single token value that has a wildcard. 
*/ - public void testSingleTokenWildcardAny() { + public void testSingleTokenQMarkWildcardAny() { // create request object JDOMFactory factory = new DefaultJDOMFactory(); Element request = factory.element("request"); Element any = factory.element("any"); + any.addContent("hoeper?poep"); + request.addContent(any); + // build lucene query input + LuceneQueryInput lQI = new LuceneQueryInput(request); + // build lucene query + Query query = new LuceneQueryBuilder(_tokenizedFieldSet, _numericFieldSet, _analyzer).build(lQI); + // verify query + assertEquals("+any:hoeper?poep +_isTemplate:n", query.toString()); + } + + /** + * 'any' parameter with a single token value that has a wildcard. + */ + public void testSingleTokenStarWildcardAny() { + // create request object + JDOMFactory factory = new DefaultJDOMFactory(); + Element request = factory.element("request"); + Element any = factory.element("any"); any.addContent("hoeper*poep"); request.addContent(any); // build lucene query input

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.