From: <wol...@us...> - 2009-05-30 12:53:33
|
Revision: 9084 http://exist.svn.sourceforge.net/exist/?rev=9084&view=rev Author: wolfgang_m Date: 2009-05-30 12:34:39 +0000 (Sat, 30 May 2009) Log Message: ----------- [feature] Lucene index: options <ignore> and <inline> can now be specified for each created index, not just globally: <text qname="section"> <ignore qname="title"/> </text> Modified Paths: -------------- trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/AbstractTextExtractor.java trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/DefaultTextExtractor.java trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneConfig.java trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneIndexConfig.java trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneIndexWorker.java trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneMatchListener.java trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/TextExtractor.java trunk/eXist/extensions/indexes/lucene/test/src/org/exist/indexing/lucene/LuceneIndexTest.java trunk/eXist/extensions/indexes/lucene/test/src/org/exist/indexing/lucene/LuceneMatchListenerTest.java Modified: trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/AbstractTextExtractor.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/AbstractTextExtractor.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/AbstractTextExtractor.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -27,11 +27,13 @@ public abstract class AbstractTextExtractor implements TextExtractor { protected LuceneConfig config; + protected LuceneIndexConfig idxConfig; protected XMLString buffer = new XMLString(); - public void configure(LuceneConfig config) { + public void configure(LuceneConfig config, LuceneIndexConfig idxConfig) { this.config = config; + this.idxConfig = idxConfig; } public float getBoost() { Modified: trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/DefaultTextExtractor.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/DefaultTextExtractor.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/DefaultTextExtractor.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -29,9 +29,9 @@ private int stack = 0; public int startElement(QName name) { - if (config.isIgnoredNode(name)) + if (config.isIgnoredNode(name) || (idxConfig != null && idxConfig.isIgnoredNode(name))) stack++; - else if (!config.isInlineNode(name)) { + else if (!(config.isInlineNode(name) || (idxConfig != null && idxConfig.isInlineNode(name)))) { buffer.append(' '); return 1; } @@ -39,9 +39,9 @@ } public int endElement(QName name) { - if (config.isIgnoredNode(name)) + if (config.isIgnoredNode(name) || (idxConfig != null && idxConfig.isIgnoredNode(name))) stack--; - else if (!config.isInlineNode(name)) { + else if (!(config.isInlineNode(name) || (idxConfig != null && idxConfig.isInlineNode(name)))) { buffer.append(' '); return 1; } Modified: trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneConfig.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneConfig.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneConfig.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -165,7 +165,6 @@ if (ignoreNodes == null) ignoreNodes = new TreeSet<QName>(); ignoreNodes.add(qname); - } } } Modified: trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneIndexConfig.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneIndexConfig.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneIndexConfig.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -5,16 +5,24 @@ import org.exist.storage.NodePath; import org.exist.util.DatabaseConfigurationException; import org.w3c.dom.Element; +import org.w3c.dom.Node; import java.util.Map; +import java.util.TreeMap; public class LuceneIndexConfig { + private final static String N_INLINE = "inline"; + private final static String N_IGNORE = "ignore"; + private final static String ANALYZER_ID_ATTR = "analyzer"; private static final String QNAME_ATTR = "qname"; private static final String MATCH_ATTR = "match"; private final static String BOOST_ATTRIB = "boost"; + private final static String IGNORE_ELEMENT = "ignore"; + private final static String INLINE_ELEMENT = "inline"; + private String analyzerId = null; private QName qname = null; @@ -23,6 +31,8 @@ private float boost = -1; + private Map<QName, String> specialNodes = null; + public LuceneIndexConfig(Element config, Map namespaces, AnalyzerConfig analyzers) throws DatabaseConfigurationException { if (config.hasAttribute(QNAME_ATTR)) { qname = parseQName(config, namespaces); @@ -49,8 +59,33 @@ "got: " + boostAttr); } } + parse(config, namespaces); } + private void parse(Element root, Map namespaces) throws DatabaseConfigurationException { + Node child = root.getFirstChild(); + while (child != null) { + if (child.getNodeType() == Node.ELEMENT_NODE) { + if (IGNORE_ELEMENT.equals(child.getLocalName())) { + String qnameAttr = ((Element) child).getAttribute(QNAME_ATTR); + if (qnameAttr == null || qnameAttr.length() == 0) + throw new DatabaseConfigurationException("Lucene configuration element 'ignore' needs an attribute 'qname'"); + if (specialNodes == null) + specialNodes = new TreeMap<QName, String>(); + specialNodes.put(parseQName(qnameAttr, namespaces), N_IGNORE); + } else if (INLINE_ELEMENT.equals(child.getLocalName())) { + String qnameAttr = ((Element) child).getAttribute(QNAME_ATTR); + if (qnameAttr == null || qnameAttr.length() == 0) + throw new DatabaseConfigurationException("Lucene configuration element 'inline' needs an attribute 'qname'"); + if (specialNodes == null) + specialNodes = new TreeMap<QName, String>(); + specialNodes.put(parseQName(qnameAttr, namespaces), N_INLINE); + } + } + child = child.getNextSibling(); + } + } + public String getAnalyzerId() { return analyzerId; } @@ -66,7 +101,15 @@ public float getBoost() { return boost; } - + + public boolean isIgnoredNode(QName qname) { + return specialNodes != null && specialNodes.get(qname) == N_IGNORE; + } + + public boolean isInlineNode(QName qname) { + return specialNodes != null && specialNodes.get(qname) == N_INLINE; + } + protected static QName parseQName(Element config, Map namespaces) throws DatabaseConfigurationException { String name = config.getAttribute(QNAME_ATTR); if (name == null || name.length() == 0) Modified: trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneIndexWorker.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneIndexWorker.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneIndexWorker.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -681,10 +681,11 @@ extractor.startElement(element.getQName()); } } - if (config.matches(path)) { + LuceneIndexConfig idxConf = config.getConfig(path); + if (idxConf != null) { if (contentStack == null) contentStack = new Stack(); TextExtractor extractor = new DefaultTextExtractor(); - extractor.configure(config); + extractor.configure(config, idxConf); contentStack.push(extractor); } } Modified: trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneMatchListener.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneMatchListener.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/LuceneMatchListener.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -27,16 +27,13 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.Query; -import org.exist.dom.Match; -import org.exist.dom.NewArrayNodeSet; -import org.exist.dom.NodeProxy; -import org.exist.dom.NodeSet; -import org.exist.dom.QName; +import org.exist.dom.*; import org.exist.indexing.AbstractMatchListener; import org.exist.numbering.NodeId; import org.exist.stax.EmbeddedXMLStreamReader; import org.exist.storage.DBBroker; import org.exist.storage.IndexSpec; +import org.exist.storage.NodePath; import org.exist.util.serializer.AttrList; import org.xml.sax.SAXException; @@ -165,8 +162,10 @@ private void scanMatches(NodeProxy p) { // Collect the text content of all descendants of p. Remember the start offsets // of the text nodes for later use. + NodePath path = getPath(p); + LuceneIndexConfig idxConf = config.getConfig(path); TextExtractor extractor = new DefaultTextExtractor(); - extractor.configure(config); + extractor.configure(config, idxConf); OffsetList offsets = new OffsetList(); int level = 0; int textOffset = 0; @@ -220,6 +219,21 @@ } } + private NodePath getPath(NodeProxy proxy) { + NodePath path = new NodePath(); + StoredNode node = (StoredNode) proxy.getNode(); + walkAncestor(node, path); + return path; + } + + private void walkAncestor(StoredNode node, NodePath path) { + if (node == null) + return; + StoredNode parent = node.getParentStoredNode(); + walkAncestor(parent, path); + path.addComponent(node.getQName()); + } + /** * Get all query terms from the original queries. */ Modified: trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/TextExtractor.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/TextExtractor.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/src/org/exist/indexing/lucene/TextExtractor.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -31,7 +31,7 @@ */ public interface TextExtractor { - public void configure(LuceneConfig config); + public void configure(LuceneConfig config, LuceneIndexConfig idxConfig); public int startElement(QName name); Modified: trunk/eXist/extensions/indexes/lucene/test/src/org/exist/indexing/lucene/LuceneIndexTest.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/test/src/org/exist/indexing/lucene/LuceneIndexTest.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/test/src/org/exist/indexing/lucene/LuceneIndexTest.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -6,10 +6,7 @@ import java.io.File; import java.io.StringReader; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import org.exist.Indexer; import org.exist.collections.Collection; @@ -86,6 +83,7 @@ " <p>A simple paragraph with <hi>highlighted</hi> text <note>and a note</note> " + " in it.</p>" + " <p>Paragraphs with <s>mix</s><s>ed</s> content are <s>danger</s>ous.</p>" + + " <p><note1>ignore</note1> <s2>warn</s2>ings</p>" + "</article>"; private static String XML6 = @@ -170,11 +168,17 @@ " <fulltext default=\"none\" attributes=\"no\">" + " </fulltext>" + " <lucene>" + - " <text qname=\"article\"/>" + - " <text qname=\"p\"/>" + + " <text qname=\"article\">" + + " <ignore qname=\"note\"/>" + + " <inline qname=\"s\"/>" + + " </text>" + + " <text qname=\"p\">" + + " <ignore qname=\"note\"/>" + + " <inline qname=\"s\"/>" + + " </text>" + " <text qname=\"head\"/>" + - " <inline qname=\"s\"/>" + - " <ignore qname=\"note\"/>" + + " <ignore qname=\"note1\"/>" + + " <inline qname=\"s2\"/>" + " </lucene>" + " </index>" + "</collection>"; @@ -292,6 +296,8 @@ checkIndex(docs, broker, new QName[] { new QName("p", "") }, "mixed", 1); checkIndex(docs, broker, new QName[] { new QName("p", "") }, "dangerous", 1); checkIndex(docs, broker, new QName[] { new QName("p", "") }, "note", 0); + checkIndex(docs, broker, new QName[] { new QName("p", "") }, "ignore", 0); + checkIndex(docs, broker, new QName[] { new QName("p", "") }, "warnings", 1); XQuery xquery = broker.getXQueryService(); assertNotNull(xquery); @@ -339,6 +345,10 @@ assertNotNull(seq); assertEquals(1, seq.getItemCount()); + seq = xquery.execute("/article[ft:query(., 'warnings')]", null, AccessContext.TEST); + assertNotNull(seq); + assertEquals(1, seq.getItemCount()); + seq = xquery.execute("/article[ft:query(., 'danger')]", null, AccessContext.TEST); assertNotNull(seq); assertEquals(0, seq.getItemCount()); @@ -346,6 +356,10 @@ seq = xquery.execute("/article[ft:query(., 'note')]", null, AccessContext.TEST); assertNotNull(seq); assertEquals(0, seq.getItemCount()); + + seq = xquery.execute("/article[ft:query(., 'ignore')]", null, AccessContext.TEST); + assertNotNull(seq); + assertEquals(0, seq.getItemCount()); } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); @@ -1219,15 +1233,14 @@ if (term != null) hints.put(OrderedValuesIndex.START_VALUE, term); if (qn != null && qn.length > 0) { - List qnames = new ArrayList(); - for (int i = 0; i < qn.length; i++) { - qnames.add(qn[i]); - } - hints.put(QNamedKeysIndex.QNAMES_KEY, qnames); + List<QName> qnlist = new ArrayList<QName>(qn.length); + for (int i = 0; i < qn.length; i++) + qnlist.add(qn[i]); + hints.put(QNamedKeysIndex.QNAMES_KEY, qnlist); } XQueryContext context = new XQueryContext(broker, AccessContext.TEST); Occurrences[] occur = index.scanIndex(context, docs, docs.docsToNodeSet(), hints); - if (expected != occur.length) { + if (occur != null && expected != occur.length) { for (int i = 0; i < occur.length; i++) { System.out.println("term: " + occur[i].getTerm()); } Modified: trunk/eXist/extensions/indexes/lucene/test/src/org/exist/indexing/lucene/LuceneMatchListenerTest.java =================================================================== --- trunk/eXist/extensions/indexes/lucene/test/src/org/exist/indexing/lucene/LuceneMatchListenerTest.java 2009-05-30 12:30:30 UTC (rev 9083) +++ trunk/eXist/extensions/indexes/lucene/test/src/org/exist/indexing/lucene/LuceneMatchListenerTest.java 2009-05-30 12:34:39 UTC (rev 9084) @@ -68,8 +68,7 @@ private static String XML1 = "<article>" + " <head>The <b>title</b>of it</head>" + - " <p>A simple paragraph with <hi>highlighted</hi> text <note>and a note</note> " + - " in it.</p>" + + " <p>A simple<note>sic</note> paragraph with <hi>highlighted</hi> text <note>and a note</note> to be ignored.</p>" + " <p>Paragraphs with <s>mix</s><s>ed</s> content are <s>danger</s>ous.</p>" + "</article>"; @@ -107,10 +106,11 @@ " <fulltext default=\"none\" attributes=\"no\">" + " </fulltext>" + " <lucene>" + - " <text qname=\"p\"/>" + + " <text qname=\"p\">" + + " <ignore qname=\"note\"/>" + + " </text>" + " <text qname=\"head\"/>" + " <inline qname=\"s\"/>" + - " <ignore qname=\"note\"/>" + " </lucene>" + " </index>" + "</collection>"; @@ -266,6 +266,30 @@ XMLAssert.assertEquals("<p>Paragraphs with <s>" + MATCH_START + "mix" + MATCH_END + "</s><s>ed</s> content are <s>danger</s>ous.</p>", result); + seq = xquery.execute("//p[ft:query(., 'ignored')]", null, AccessContext.TEST); + assertNotNull(seq); + assertEquals(1, seq.getItemCount()); + result = queryResult2String(broker, seq); + System.out.println("RESULT: " + result); + XMLAssert.assertEquals("<p>A simple<note>sic</note> paragraph with <hi>highlighted</hi> text <note>and a note</note> to be " + + MATCH_START + "ignored" + MATCH_END + ".</p>", result); + + seq = xquery.execute("//p[ft:query(., 'highlighted')]", null, AccessContext.TEST); + assertNotNull(seq); + assertEquals(1, seq.getItemCount()); + result = queryResult2String(broker, seq); + System.out.println("RESULT: " + result); + XMLAssert.assertEquals("<p>A simple<note>sic</note> paragraph with <hi>" + MATCH_START + + "highlighted" + MATCH_END + "</hi> text <note>and a note</note> to be " + + "ignored.</p>", result); + + seq = xquery.execute("//p[ft:query(., 'highlighted')]/hi", null, AccessContext.TEST); + assertNotNull(seq); + assertEquals(1, seq.getItemCount()); + result = queryResult2String(broker, seq); + System.out.println("RESULT: " + result); + XMLAssert.assertEquals("<hi>" + MATCH_START + "highlighted" + MATCH_END + "</hi>", result); + seq = xquery.execute("//head[ft:query(., 'title')]", null, AccessContext.TEST); assertNotNull(seq); assertEquals(1, seq.getItemCount()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |