From: Wolfgang M. M. <wol...@us...> - 2004-07-14 19:03:37
|
Update of /cvsroot/exist/eXist-1.0/src/org/exist/storage In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14412/src/org/exist/storage Modified Files: NativeTextEngine.java TextSearchEngine.java Log Message: Added optional term frequency counts to fulltext index. Index: TextSearchEngine.java =================================================================== RCS file: /cvsroot/exist/eXist-1.0/src/org/exist/storage/TextSearchEngine.java,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** TextSearchEngine.java 21 Jun 2004 15:27:34 -0000 1.12 --- TextSearchEngine.java 14 Jul 2004 19:03:24 -0000 1.13 *************** *** 65,68 **** --- 65,69 ---- protected boolean indexNumbers = false ; protected boolean stem = false ; + protected boolean termFreq = true; protected PorterStemmer stemmer = null; *************** *** 77,81 **** this.config = conf; String stopword, tokenizerClass; ! Boolean num, stemming; if ((num = (Boolean) config.getProperty("indexer.indexNumbers")) != null) --- 78,82 ---- this.config = conf; String stopword, tokenizerClass; ! Boolean num, stemming, termFrequencies; if ((num = (Boolean) config.getProperty("indexer.indexNumbers")) != null) *************** *** 83,86 **** --- 84,89 ---- if ((stemming = (Boolean) config.getProperty("indexer.stem")) != null) stem = stemming.booleanValue(); + if((termFrequencies = (Boolean) config.getProperty("indexer.store-term-freq")) != null) + termFreq = termFrequencies.booleanValue(); if ((tokenizerClass = (String) config.getProperty("indexer.tokenizer")) != null) { Index: NativeTextEngine.java =================================================================== RCS file: /cvsroot/exist/eXist-1.0/src/org/exist/storage/NativeTextEngine.java,v retrieving revision 1.59 retrieving revision 1.60 diff -C2 -d -r1.59 -r1.60 *** NativeTextEngine.java 2 Jul 2004 18:24:45 -0000 1.59 --- NativeTextEngine.java 14 Jul 2004 19:03:24 -0000 1.60 *************** *** 75,81 **** import org.exist.util.Lock; import org.exist.util.LockException; - import org.exist.util.LongLinkedList; import org.exist.util.Occurrences; ! import org.exist.util.OrderedLongLinkedList; import org.exist.util.ProgressIndicator; import org.exist.util.ReadOnlyException; --- 75,80 ---- import org.exist.util.Lock; import org.exist.util.LockException; import org.exist.util.Occurrences; ! import org.exist.util.OrderedLinkedList; import org.exist.util.ProgressIndicator; import org.exist.util.ReadOnlyException; *************** *** 301,304 **** --- 300,304 ---- byte[] data; long gid; + int freq = 1; int docId; int len; *************** *** 312,315 **** --- 312,316 ---- NodeProxy parent, current = new NodeProxy(); NodeSet result; + Match match; if (contextSet == null) result = new TextSearchResult(trackMatches != Serializer.TAG_NONE); *************** *** 336,340 **** if ((doc = docs.getDoc(docId)) == null || (contextSet != null && !contextSet.containsDoc(doc))) { ! is.skip(len); continue; } --- 337,341 ---- if ((doc = docs.getDoc(docId)) == null || (contextSet != null && !contextSet.containsDoc(doc))) { ! is.skip(termFreq ? len * 2 : len); continue; } *************** *** 344,347 **** --- 345,350 ---- for (int j = 0; j < len; j++) { delta = is.readLong(); + if(termFreq) + freq = is.readInt(); gid = last + delta; last = gid; *************** *** 355,360 **** if (parent != null) { result.add(parent, sizeHint); ! if (trackMatches != Serializer.TAG_NONE) ! parent.addMatch(new Match(term, gid)); } } else --- 358,366 ---- if (parent != null) { result.add(parent, sizeHint); ! if (trackMatches != Serializer.TAG_NONE) { ! match = new Match(term, gid); ! match.setFrequency(freq); ! parent.addMatch(match); ! } } } else *************** *** 540,544 **** section = is.readByte(); len = is.readInt(); ! is.skip(len); oc.addOccurrences(len); } --- 546,550 ---- section = is.readByte(); len = is.readInt(); ! is.skip(termFreq ? len * 2 : len); oc.addOccurrences(len); } *************** *** 641,649 **** os.writeByte(section); os.writeInt(len); ! is.copyTo(os, len); } else { changed = true; // skip ! is.skip(len); } } --- 647,655 ---- os.writeByte(section); os.writeInt(len); ! is.copyTo(os, termFreq ? len * 2 : len); } else { changed = true; // skip ! is.skip(termFreq ? len * 2 : len); } } *************** *** 732,736 **** invIdx.setDocument(doc); String sal= text.getXMLString().transformToLower().toString() ; ! invIdx.addText(sal, gid); } else { while (null != (token = tokenizer.nextToken())) { --- 738,742 ---- invIdx.setDocument(doc); String sal= text.getXMLString().transformToLower().toString() ; ! invIdx.addText(sal, gid); } else { while (null != (token = tokenizer.nextToken())) { *************** *** 790,793 **** --- 796,800 ---- } } + /** * This inner class is responsible for actually storing the list of *************** *** 797,805 **** */ final class InvertedIndex { ! private DocumentImpl doc = null; private Map words[] = new TreeMap[2]; private VariableByteOutputStream os = new VariableByteOutputStream(7); ! public InvertedIndex() { // To distinguish between attribute values and text, we use --- 804,839 ---- */ final class InvertedIndex { ! ! private class TermOccurrence extends OrderedLinkedList.Node implements Comparable { ! long gid; ! int frequency = 1; ! ! public TermOccurrence(long gid) { ! this.gid = gid; ! } ! ! public int compareTo(OrderedLinkedList.Node o) { ! final TermOccurrence other = (TermOccurrence)o; ! if(gid == other.gid) ! return 0; ! else if(gid < other.gid) ! return -1; ! else ! return 1; ! } ! ! public int compareTo(Object o) { ! return compareTo((OrderedLinkedList.Node)o); ! } ! ! public boolean equals(org.exist.util.OrderedLinkedList.Node other) { ! return gid == ((TermOccurrence)other).gid; ! } ! } ! private DocumentImpl doc = null; private Map words[] = new TreeMap[2]; private VariableByteOutputStream os = new VariableByteOutputStream(7); ! public InvertedIndex() { // To distinguish between attribute values and text, we use *************** *** 811,833 **** public void addText(String word, long gid) { ! LongLinkedList buf = (LongLinkedList) words[0].get(word); if (buf == null) { ! buf = new OrderedLongLinkedList(); words[0].put(word, buf); ! } else if (buf.getLast() == gid) { ! return; // double entry: skip } - buf.add(gid); } public void addAttribute(String word, long gid) { ! LongLinkedList buf = (LongLinkedList) words[1].get(word); if (buf == null) { ! buf = new OrderedLongLinkedList(); words[1].put(word, buf); ! } else if (buf.getLast() == gid) { ! return; // double entry: skip } - buf.add(gid); } --- 845,883 ---- public void addText(String word, long gid) { ! OrderedLinkedList buf = (OrderedLinkedList) words[0].get(word); ! TermOccurrence o; if (buf == null) { ! buf = new OrderedLinkedList(); ! o = new TermOccurrence(gid); ! buf.add(o); words[0].put(word, buf); ! } else { ! o = (TermOccurrence)buf.getLast(); ! if(o.gid == gid) ! o.frequency++; ! else { ! o = new TermOccurrence(gid); ! buf.add(o); ! } } } public void addAttribute(String word, long gid) { ! OrderedLinkedList buf = (OrderedLinkedList) words[1].get(word); ! TermOccurrence o; if (buf == null) { ! buf = new OrderedLinkedList(); ! o = new TermOccurrence(gid); ! buf.add(o); words[1].put(word, buf); ! } else { ! o = (TermOccurrence)buf.getLast(); ! if(o.gid == gid) ! o.frequency++; ! else { ! o = new TermOccurrence(gid); ! buf.add(o); ! } } } *************** *** 840,845 **** Map.Entry entry; String word; ! LongLinkedList idList; ! long[] ids; byte[] data; long last, gid; --- 890,895 ---- Map.Entry entry; String word; ! OrderedLinkedList idList; ! TermOccurrence[] ids; byte[] data; long last, gid; *************** *** 848,852 **** NodeProxy p; WordRef ref; ! LongLinkedList newList; Value val = null; VariableByteArrayInput is; --- 898,903 ---- NodeProxy p; WordRef ref; ! TermOccurrence t; ! OrderedLinkedList newList; Value val = null; VariableByteArrayInput is; *************** *** 856,860 **** entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (LongLinkedList) entry.getValue(); ref = new WordRef(collectionId, word); try { --- 907,911 ---- entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (OrderedLinkedList) entry.getValue(); ref = new WordRef(collectionId, word); try { *************** *** 862,866 **** val = dbWords.get(ref); os.clear(); ! newList = new LongLinkedList(); if (val != null) { // add old entries to the new list --- 913,917 ---- val = dbWords.get(ref); os.clear(); ! newList = new OrderedLinkedList(); if (val != null) { // add old entries to the new list *************** *** 879,884 **** delta = is.readLong(); last = last + delta; ! if (!idList.contains(last)) ! newList.add(last); } } else { --- 930,938 ---- delta = is.readLong(); last = last + delta; ! t = new TermOccurrence(last); ! if(termFreq) ! t.frequency = is.readInt(); ! if (!idList.contains(t)) ! newList.add(t); } } else { *************** *** 888,893 **** os.writeByte(section); os.writeInt(len); ! for (int j = 0; j < len; j++) ! is.copyTo(os); } } --- 942,946 ---- os.writeByte(section); os.writeInt(len); ! is.copyTo(os, termFreq ? len * 2 : len); } } *************** *** 901,905 **** } } ! ids = newList.getData(); //i.remove(); Arrays.sort(ids); --- 954,959 ---- } } ! ids = new TermOccurrence[newList.size()]; ! newList.toArray(ids); //i.remove(); Arrays.sort(ids); *************** *** 910,914 **** last = 0; for (int j = 0; j < len; j++) { ! delta = ids[j] - last; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); --- 964,968 ---- last = 0; for (int j = 0; j < len; j++) { ! delta = ids[j].gid - last; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); *************** *** 916,920 **** } os.writeLong(delta); ! last = ids[j]; } try { --- 970,976 ---- } os.writeLong(delta); ! if(termFreq) ! os.writeInt(ids[j].frequency); ! last = ids[j].gid; } try { *************** *** 942,947 **** Map.Entry entry; String word; ! LongLinkedList idList; ! long[] ids; long last, gid; long delta; --- 998,1004 ---- Map.Entry entry; String word; ! OrderedLinkedList idList; ! TermOccurrence[] ids; ! TermOccurrence t; long last, gid; long delta; *************** *** 955,959 **** entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (LongLinkedList) entry.getValue(); ref = new WordRef(collectionId, word); try { --- 1012,1016 ---- entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (OrderedLinkedList) entry.getValue(); ref = new WordRef(collectionId, word); try { *************** *** 974,978 **** os.writeByte(section); os.writeInt(len); ! is.copyTo(os, len); } else { // copy nodes to new list --- 1031,1035 ---- os.writeByte(section); os.writeInt(len); ! is.copyTo(os, (termFreq ? len * 2 : len)); } else { // copy nodes to new list *************** *** 981,988 **** delta = is.readLong(); gid += delta; if (node == null ! && oldDoc.getTreeLevel(gid) < oldDoc ! .reindexRequired()) { ! idList.add(gid); } else if (node != null && (!XMLUtil --- 1038,1047 ---- delta = is.readLong(); gid += delta; + t = new TermOccurrence(gid); + if(termFreq) + t.frequency = is.readInt(); if (node == null ! && oldDoc.getTreeLevel(gid) < oldDoc.reindexRequired()) { ! idList.add(t); } else if (node != null && (!XMLUtil *************** *** 991,995 **** node.getGID(), gid))) { ! idList.add(gid); } } --- 1050,1054 ---- node.getGID(), gid))) { ! idList.add(t); } } *************** *** 1004,1008 **** } } ! ids = idList.getData(); Arrays.sort(ids); len = ids.length; --- 1063,1068 ---- } } ! ids = new TermOccurrence[idList.size()]; ! idList.toArray(ids); Arrays.sort(ids); len = ids.length; *************** *** 1012,1016 **** last = 0; for (int j = 0; j < len; j++) { ! delta = ids[j] - last; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); --- 1072,1076 ---- last = 0; for (int j = 0; j < len; j++) { ! delta = ids[j].gid - last; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); *************** *** 1018,1022 **** } os.writeLong(delta); ! last = ids[j]; } try { --- 1078,1084 ---- } os.writeLong(delta); ! if(termFreq) ! os.writeInt(ids[j].frequency); ! last = ids[j].gid; } try { *************** *** 1055,1070 **** Map.Entry entry; String word; ! LongLinkedList idList; ! long[] ids; byte[] data; long prevId, id; long delta; for (int k = 0; k < 2; k++) { for (Iterator i = words[k].entrySet().iterator(); i.hasNext(); count++) { entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (LongLinkedList) entry.getValue(); os.clear(); ! len = idList.getSize(); os.writeInt(doc.getDocId()); os.writeByte(k == 0 ? TEXT_SECTION : ATTRIBUTE_SECTION); --- 1117,1133 ---- Map.Entry entry; String word; ! OrderedLinkedList idList; ! Comparable[] ids; byte[] data; long prevId, id; long delta; + TermOccurrence t; for (int k = 0; k < 2; k++) { for (Iterator i = words[k].entrySet().iterator(); i.hasNext(); count++) { entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (OrderedLinkedList) entry.getValue(); os.clear(); ! len = idList.size(); os.writeInt(doc.getDocId()); os.writeByte(k == 0 ? TEXT_SECTION : ATTRIBUTE_SECTION); *************** *** 1072,1076 **** prevId = 0; for (Iterator j = idList.iterator(); j.hasNext();) { ! id = ((LongLinkedList.ListItem) j.next()).l; delta = id - prevId; if (delta < 0) { --- 1135,1140 ---- prevId = 0; for (Iterator j = idList.iterator(); j.hasNext();) { ! t = (TermOccurrence) j.next(); ! id = t.gid; delta = id - prevId; if (delta < 0) { *************** *** 1079,1082 **** --- 1143,1150 ---- } os.writeLong(delta); + if(termFreq) { + // write out term frequencies + os.writeInt(t.frequency); + } prevId = id; } |