From: Wolfgang M. M. <wol...@us...> - 2004-07-21 08:54:06
|
Update of /cvsroot/exist/eXist-1.0/src/org/exist/storage In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15944/src/org/exist/storage Modified Files: NativeTextEngine.java NativeElementIndex.java Log Message: Code to track term frequency counts in NativeTextEngine had to be rewritten as it slowed down the indexing. Index: NativeElementIndex.java =================================================================== RCS file: /cvsroot/exist/eXist-1.0/src/org/exist/storage/NativeElementIndex.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** NativeElementIndex.java 19 Jul 2004 13:06:24 -0000 1.34 --- NativeElementIndex.java 21 Jul 2004 08:53:56 -0000 1.35 *************** *** 640,646 **** last = gid; address = StorageAddress.read(is); ! if (!containsNode(idList, gid)) newList.add(new NodeProxy(doc, gid, address)); } } --- 640,647 ---- last = gid; address = StorageAddress.read(is); ! if (!containsNode(idList, gid)) { newList.add(new NodeProxy(doc, gid, address)); + } } } *************** *** 668,676 **** StorageAddress.write(p.getInternalAddress(), os); } ! if (val == null) { ! dbElement.put(ref, os.data()); ! } else { ! dbElement.update(val.getAddress(), ref, os.data()); ! } } catch (LockException e) { LOG.error("could not acquire lock on elements", e); --- 669,677 ---- StorageAddress.write(p.getInternalAddress(), os); } ! if (val == null) { ! dbElement.put(ref, os.data()); ! } else { ! dbElement.update(val.getAddress(), ref, os.data()); ! } } catch (LockException e) { LOG.error("could not acquire lock on elements", e); Index: NativeTextEngine.java =================================================================== RCS file: /cvsroot/exist/eXist-1.0/src/org/exist/storage/NativeTextEngine.java,v retrieving revision 1.62 retrieving revision 1.63 diff -C2 -d -r1.62 -r1.63 *** NativeTextEngine.java 19 Jul 2004 08:12:29 -0000 1.62 --- NativeTextEngine.java 21 Jul 2004 08:53:55 -0000 1.63 *************** *** 75,80 **** import org.exist.util.Lock; import org.exist.util.LockException; import org.exist.util.Occurrences; ! import org.exist.util.OrderedLinkedList; import org.exist.util.ProgressIndicator; import org.exist.util.ReadOnlyException; --- 75,81 ---- import org.exist.util.Lock; import org.exist.util.LockException; + import org.exist.util.LongLinkedList; import org.exist.util.Occurrences; ! import org.exist.util.OrderedLongLinkedList; import org.exist.util.ProgressIndicator; import org.exist.util.ReadOnlyException; *************** *** 300,304 **** byte[] data; long gid; - int freq = 1; int docId; int len; --- 301,304 ---- *************** *** 307,310 **** --- 307,311 ---- long last; long delta; + int freq = 1; Collection collection; short collectionId; *************** *** 344,351 **** last = 0; for (int j = 0; j < len; j++) { ! delta = is.readLong(); if(termFreq) freq = is.readInt(); - gid = last + delta; last = gid; count++; --- 345,351 ---- last = 0; for (int j = 0; j < len; j++) { ! gid = last + is.readLong(); if(termFreq) freq = is.readInt(); last = gid; count++; *************** *** 357,366 **** true, -1); if (parent != null) { result.add(parent, sizeHint); ! if (trackMatches != Serializer.TAG_NONE) { ! match = new Match(term, gid); ! match.setFrequency(freq); parent.addMatch(match); - } } } else --- 357,365 ---- true, -1); if (parent != null) { + match = new Match(term, gid); + match.setFrequency(freq); result.add(parent, sizeHint); ! if (trackMatches != Serializer.TAG_NONE) parent.addMatch(match); } } else *************** *** 546,551 **** section = is.readByte(); len = is.readInt(); ! is.skip(termFreq ? len * 2 : len); ! oc.addOccurrences(len); } } catch (EOFException e) { --- 545,552 ---- section = is.readByte(); len = is.readInt(); ! for(int k = 0; k < len; k++) { ! is.skip(1); ! oc.addOccurrences(is.readInt()); ! } } } catch (EOFException e) { *************** *** 738,742 **** invIdx.setDocument(doc); String sal= text.getXMLString().transformToLower().toString() ; ! invIdx.addText(sal, gid); } else { while (null != (token = tokenizer.nextToken())) { --- 739,743 ---- invIdx.setDocument(doc); String sal= text.getXMLString().transformToLower().toString() ; ! invIdx.addText(sal, gid); } else { while (null != (token = tokenizer.nextToken())) { *************** *** 772,776 **** } } ! private final static class WordRef extends Value { --- 773,821 ---- } } ! ! private static class TermFrequencyList extends OrderedLongLinkedList { ! ! protected static class TermFreq extends LongLinkedList.ListItem { ! ! int count = 1; ! ! public TermFreq(long l) { ! super(l); ! } ! ! public void increment() { ! ++count; ! } ! } ! ! /* (non-Javadoc) ! * @see org.exist.util.LongLinkedList#createListItem(long) ! */ ! protected ListItem createListItem(long l) { ! return new TermFreq(l); ! } ! ! public void incLastTerm() { ! if(last != null) ! ((TermFreq)last).increment(); ! } ! ! public void setLastTermFreq(int freq) { ! if(last != null) ! ((TermFreq)last).count = freq; ! } ! ! public TermFreq[] toArray() { ! TermFreq[] data = new TermFreq[count]; ! ListItem next = first; ! int i = 0; ! while( next != null ) { ! data[i++] = (TermFreq)next; ! next = next.next; ! } ! return data; ! } ! } ! private final static class WordRef extends Value { *************** *** 796,800 **** } } - /** * This inner class is responsible for actually storing the list of --- 841,844 ---- *************** *** 804,839 **** */ final class InvertedIndex { ! ! private class TermOccurrence extends OrderedLinkedList.Node implements Comparable { ! long gid; ! int frequency = 1; ! ! public TermOccurrence(long gid) { ! this.gid = gid; ! } ! ! public int compareTo(OrderedLinkedList.Node o) { ! final TermOccurrence other = (TermOccurrence)o; ! if(gid == other.gid) ! return 0; ! else if(gid < other.gid) ! return -1; ! else ! return 1; ! } ! ! public int compareTo(Object o) { ! return compareTo((OrderedLinkedList.Node)o); ! } ! ! public boolean equals(org.exist.util.OrderedLinkedList.Node other) { ! return gid == ((TermOccurrence)other).gid; ! } ! } ! private DocumentImpl doc = null; private Map words[] = new TreeMap[2]; private VariableByteOutputStream os = new VariableByteOutputStream(7); ! public InvertedIndex() { // To distinguish between attribute values and text, we use --- 848,856 ---- */ final class InvertedIndex { ! private DocumentImpl doc = null; private Map words[] = new TreeMap[2]; private VariableByteOutputStream os = new VariableByteOutputStream(7); ! public InvertedIndex() { // To distinguish between attribute values and text, we use *************** *** 845,883 **** public void addText(String word, long gid) { ! OrderedLinkedList buf = (OrderedLinkedList) words[0].get(word); ! TermOccurrence o; if (buf == null) { ! buf = new OrderedLinkedList(); ! o = new TermOccurrence(gid); ! buf.add(o); words[0].put(word, buf); ! } else { ! o = (TermOccurrence)buf.getLast(); ! if(o.gid == gid) { ! o.frequency++; ! } else { ! o = new TermOccurrence(gid); ! buf.add(o); ! } ! } } public void addAttribute(String word, long gid) { ! OrderedLinkedList buf = (OrderedLinkedList) words[1].get(word); ! TermOccurrence o; if (buf == null) { ! buf = new OrderedLinkedList(); ! o = new TermOccurrence(gid); ! buf.add(o); words[1].put(word, buf); ! } else { ! o = (TermOccurrence)buf.getLast(); ! if(o.gid == gid) ! o.frequency++; ! else { ! o = new TermOccurrence(gid); ! buf.add(o); ! } ! } } --- 862,886 ---- public void addText(String word, long gid) { ! TermFrequencyList buf = (TermFrequencyList) words[0].get(word); if (buf == null) { ! buf = new TermFrequencyList(); ! buf.add(gid); words[0].put(word, buf); ! } else if (buf.getLast() == gid) { ! buf.incLastTerm(); ! } else ! buf.add(gid); } public void addAttribute(String word, long gid) { ! TermFrequencyList buf = (TermFrequencyList) words[1].get(word); if (buf == null) { ! buf = new TermFrequencyList(); ! buf.add(gid); words[1].put(word, buf); ! } else if (buf.getLast() == gid) { ! buf.incLastTerm(); ! } else ! buf.add(gid); } *************** *** 890,895 **** Map.Entry entry; String word; ! OrderedLinkedList idList; ! TermOccurrence[] ids; byte[] data; long last, gid; --- 893,898 ---- Map.Entry entry; String word; ! TermFrequencyList idList; ! TermFrequencyList.TermFreq[] ids; byte[] data; long last, gid; *************** *** 898,903 **** NodeProxy p; WordRef ref; ! TermOccurrence t; ! OrderedLinkedList newList; Value val = null; VariableByteArrayInput is; --- 901,906 ---- NodeProxy p; WordRef ref; ! TermFrequencyList newList; ! int freq = 1; Value val = null; VariableByteArrayInput is; *************** *** 907,911 **** entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (OrderedLinkedList) entry.getValue(); ref = new WordRef(collectionId, word); try { --- 910,914 ---- entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (TermFrequencyList) entry.getValue(); ref = new WordRef(collectionId, word); try { *************** *** 913,917 **** val = dbWords.get(ref); os.clear(); ! newList = new OrderedLinkedList(); if (val != null) { // add old entries to the new list --- 916,920 ---- val = dbWords.get(ref); os.clear(); ! newList = new TermFrequencyList(); if (val != null) { // add old entries to the new list *************** *** 928,938 **** last = 0; for (int j = 0; j < len; j++) { ! delta = is.readLong(); ! last = last + delta; ! t = new TermOccurrence(last); if(termFreq) ! t.frequency = is.readInt(); ! if (!idList.contains(t)) ! newList.add(t); } } else { --- 931,941 ---- last = 0; for (int j = 0; j < len; j++) { ! last = last + is.readLong(); if(termFreq) ! freq = is.readInt(); ! if (!idList.contains(last)) { ! newList.add(last); ! newList.setLastTermFreq(freq); ! } } } else { *************** *** 954,959 **** } } ! ids = new TermOccurrence[newList.size()]; ! newList.toArray(ids); //i.remove(); Arrays.sort(ids); --- 957,961 ---- } } ! ids = newList.toArray(); //i.remove(); Arrays.sort(ids); *************** *** 964,968 **** last = 0; for (int j = 0; j < len; j++) { ! delta = ids[j].gid - last; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); --- 966,970 ---- last = 0; for (int j = 0; j < len; j++) { ! delta = ids[j].l - last; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); *************** *** 971,976 **** os.writeLong(delta); if(termFreq) ! os.writeInt(ids[j].frequency); ! last = ids[j].gid; } try { --- 973,978 ---- os.writeLong(delta); if(termFreq) ! os.writeInt(ids[j].count); ! last = ids[j].l; } try { *************** *** 998,1006 **** Map.Entry entry; String word; ! OrderedLinkedList idList; ! TermOccurrence[] ids; ! TermOccurrence t; ! long last, gid; ! long delta; byte section; NodeProxy p; --- 1000,1007 ---- Map.Entry entry; String word; ! TermFrequencyList idList; ! TermFrequencyList.TermFreq[] ids; ! long last, gid, delta; ! int freq = 1; byte section; NodeProxy p; *************** *** 1012,1016 **** entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (OrderedLinkedList) entry.getValue(); ref = new WordRef(collectionId, word); try { --- 1013,1017 ---- entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (TermFrequencyList) entry.getValue(); ref = new WordRef(collectionId, word); try { *************** *** 1031,1047 **** os.writeByte(section); os.writeInt(len); ! is.copyTo(os, (termFreq ? len * 2 : len)); } else { // copy nodes to new list gid = 0; for (int j = 0; j < len; j++) { ! delta = is.readLong(); ! gid += delta; ! t = new TermOccurrence(gid); if(termFreq) ! t.frequency = is.readInt(); if (node == null ! && oldDoc.getTreeLevel(gid) < oldDoc.reindexRequired()) { ! idList.add(t); } else if (node != null && (!XMLUtil --- 1032,1048 ---- os.writeByte(section); os.writeInt(len); ! is.copyTo(os, len); } else { // copy nodes to new list gid = 0; for (int j = 0; j < len; j++) { ! gid += is.readLong(); if(termFreq) ! freq = is.readInt(); if (node == null ! && oldDoc.getTreeLevel(gid) < oldDoc ! .reindexRequired()) { ! idList.add(gid); ! idList.setLastTermFreq(freq); } else if (node != null && (!XMLUtil *************** *** 1050,1054 **** node.getGID(), gid))) { ! idList.add(t); } } --- 1051,1056 ---- node.getGID(), gid))) { ! idList.add(gid); ! idList.setLastTermFreq(freq); } } *************** *** 1063,1068 **** } } ! ids = new TermOccurrence[idList.size()]; ! idList.toArray(ids); Arrays.sort(ids); len = ids.length; --- 1065,1069 ---- } } ! ids = idList.toArray(); Arrays.sort(ids); len = ids.length; *************** *** 1072,1076 **** last = 0; for (int j = 0; j < len; j++) { ! delta = ids[j].gid - last; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); --- 1073,1077 ---- last = 0; for (int j = 0; j < len; j++) { ! delta = ids[j].l - last; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); *************** *** 1079,1084 **** os.writeLong(delta); if(termFreq) ! os.writeInt(ids[j].frequency); ! last = ids[j].gid; } try { --- 1080,1085 ---- os.writeLong(delta); if(termFreq) ! os.writeInt(ids[j].count); ! last = ids[j].l; } try { *************** *** 1117,1133 **** Map.Entry entry; String word; ! OrderedLinkedList idList; ! Comparable[] ids; byte[] data; ! long prevId, id; long delta; - TermOccurrence t; for (int k = 0; k < 2; k++) { for (Iterator i = words[k].entrySet().iterator(); i.hasNext(); count++) { entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (OrderedLinkedList) entry.getValue(); os.clear(); ! len = idList.size(); os.writeInt(doc.getDocId()); os.writeByte(k == 0 ? TEXT_SECTION : ATTRIBUTE_SECTION); --- 1118,1134 ---- Map.Entry entry; String word; ! TermFrequencyList idList; ! TermFrequencyList.TermFreq id; ! long[] ids; byte[] data; ! long prevId; long delta; for (int k = 0; k < 2; k++) { for (Iterator i = words[k].entrySet().iterator(); i.hasNext(); count++) { entry = (Map.Entry) i.next(); word = (String) entry.getKey(); ! idList = (TermFrequencyList) entry.getValue(); os.clear(); ! len = idList.getSize(); os.writeInt(doc.getDocId()); os.writeByte(k == 0 ? TEXT_SECTION : ATTRIBUTE_SECTION); *************** *** 1135,1141 **** prevId = 0; for (Iterator j = idList.iterator(); j.hasNext();) { ! t = (TermOccurrence) j.next(); ! id = t.gid; ! delta = id - prevId; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); --- 1136,1141 ---- prevId = 0; for (Iterator j = idList.iterator(); j.hasNext();) { ! id = ((TermFrequencyList.TermFreq) j.next()); ! delta = id.l - prevId; if (delta < 0) { LOG.debug("neg. delta: " + delta + " for " + word); *************** *** 1143,1151 **** } os.writeLong(delta); ! if(termFreq) { ! // write out term frequencies ! os.writeInt(t.frequency); ! } ! prevId = id; } flushWord(collectionId, word, os.data()); --- 1143,1149 ---- } os.writeLong(delta); ! if(termFreq) ! os.writeInt(id.count); ! prevId = id.l; } flushWord(collectionId, word, os.data()); *************** *** 1266,1270 **** long last = -1; int freq = 1; - long delta; int sizeHint = -1; byte section; --- 1264,1267 ---- *************** *** 1285,1295 **** if (contextSet != null) sizeHint = contextSet.getSizeHint(doc); ! last = -1; for (int j = 0; j < len; j++) { ! delta = is.readLong(); ! gid = (last < 0 ? delta : last + delta); ! last = gid; if(termFreq) freq = is.readInt(); if (contextSet != null) { proxy = (section == TEXT_SECTION --- 1282,1291 ---- if (contextSet != null) sizeHint = contextSet.getSizeHint(doc); ! last = 0; for (int j = 0; j < len; j++) { ! gid = last + is.readLong(); if(termFreq) freq = is.readInt(); + last = gid; if (contextSet != null) { proxy = (section == TEXT_SECTION *************** *** 1302,1310 **** if (parent != null) { result.add(parent, sizeHint); ! if (trackMatches != Serializer.TAG_NONE) { ! match = new Match(word, gid); ! match.setFrequency(freq); parent.addMatch(match); - } } } else --- 1298,1305 ---- if (parent != null) { result.add(parent, sizeHint); ! match = new Match(word, gid); ! match.setFrequency(freq); ! if (trackMatches != Serializer.TAG_NONE) parent.addMatch(match); } } else *************** *** 1351,1353 **** --- 1346,1350 ---- } } + + } |