[Exist-commits] eXist-1.0/src/org/exist/storage NativeTextEngine.java,1.62,1.63 NativeElementIndex.java,1.34,1.35

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/exist/eXist-1.0/src/org/exist/storage
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15944/src/org/exist/storage

Modified Files:
	NativeTextEngine.java NativeElementIndex.java 
Log Message:
Code to track term frequency counts in NativeTextEngine had to be rewritten as it slowed down the 
indexing.

Index: NativeElementIndex.java
===================================================================
RCS file: /cvsroot/exist/eXist-1.0/src/org/exist/storage/NativeElementIndex.java,v
retrieving revision 1.34
retrieving revision 1.35
diff -C2 -d -r1.34 -r1.35
*** NativeElementIndex.java	19 Jul 2004 13:06:24 -0000	1.34
--- NativeElementIndex.java	21 Jul 2004 08:53:56 -0000	1.35
***************
*** 640,646 ****
                                          last = gid;
                                          address = StorageAddress.read(is);
!                                         if (!containsNode(idList, gid))
                                              newList.add(new NodeProxy(doc, gid,
                                                      address));
                                      }
                                  }
--- 640,647 ----
                                          last = gid;
                                          address = StorageAddress.read(is);
!                                         if (!containsNode(idList, gid)) {
                                              newList.add(new NodeProxy(doc, gid,
                                                      address));
+                                         }
                                      }
                                  }
***************
*** 668,676 ****
                          StorageAddress.write(p.getInternalAddress(), os);
                      }
!                         if (val == null) {
!                             dbElement.put(ref, os.data());
!                         } else {
!                             dbElement.update(val.getAddress(), ref, os.data());
!                         }
                  } catch (LockException e) {
                      LOG.error("could not acquire lock on elements", e);
--- 669,677 ----
                          StorageAddress.write(p.getInternalAddress(), os);
                      }
!                     if (val == null) {
!                     	dbElement.put(ref, os.data());
!                     } else {
!                     	dbElement.update(val.getAddress(), ref, os.data());
!                     }
                  } catch (LockException e) {
                      LOG.error("could not acquire lock on elements", e);

Index: NativeTextEngine.java
===================================================================
RCS file: /cvsroot/exist/eXist-1.0/src/org/exist/storage/NativeTextEngine.java,v
retrieving revision 1.62
retrieving revision 1.63
diff -C2 -d -r1.62 -r1.63
*** NativeTextEngine.java	19 Jul 2004 08:12:29 -0000	1.62
--- NativeTextEngine.java	21 Jul 2004 08:53:55 -0000	1.63
***************
*** 75,80 ****
  import org.exist.util.Lock;
  import org.exist.util.LockException;
  import org.exist.util.Occurrences;
! import org.exist.util.OrderedLinkedList;
  import org.exist.util.ProgressIndicator;
  import org.exist.util.ReadOnlyException;
--- 75,81 ----
  import org.exist.util.Lock;
  import org.exist.util.LockException;
+ import org.exist.util.LongLinkedList;
  import org.exist.util.Occurrences;
! import org.exist.util.OrderedLongLinkedList;
  import org.exist.util.ProgressIndicator;
  import org.exist.util.ReadOnlyException;
***************
*** 300,304 ****
  		byte[] data;
  		long gid;
- 		int freq = 1;
  		int docId;
  		int len;
--- 301,304 ----
***************
*** 307,310 ****
--- 307,311 ----
  		long last;
  		long delta;
+ 		int freq = 1;
  		Collection collection;
  		short collectionId;
***************
*** 344,351 ****
  					last = 0;
  					for (int j = 0; j < len; j++) {
! 						delta = is.readLong();
  						if(termFreq)
  							freq = is.readInt();
- 						gid = last + delta;
  						last = gid;
  						count++;
--- 345,351 ----
  					last = 0;
  					for (int j = 0; j < len; j++) {
! 						gid = last + is.readLong();
  						if(termFreq)
  							freq = is.readInt();
  						last = gid;
  						count++;
***************
*** 357,366 ****
  									true, -1);
  							if (parent != null) {
  								result.add(parent, sizeHint);
! 								if (trackMatches != Serializer.TAG_NONE) {
! 									match = new Match(term, gid);
! 									match.setFrequency(freq);
  									parent.addMatch(match);
- 								}
  							}
  						} else
--- 357,365 ----
  									true, -1);
  							if (parent != null) {
+ 								match = new Match(term, gid);
+ 								match.setFrequency(freq);
  								result.add(parent, sizeHint);
! 								if (trackMatches != Serializer.TAG_NONE)
  									parent.addMatch(match);
  							}
  						} else
***************
*** 546,551 ****
  							section = is.readByte();
  							len = is.readInt();
! 							is.skip(termFreq ? len * 2 : len);
! 							oc.addOccurrences(len);
  						}
  					} catch (EOFException e) {
--- 545,552 ----
  							section = is.readByte();
  							len = is.readInt();
! 							for(int k = 0; k < len; k++) {
! 								is.skip(1);
! 								oc.addOccurrences(is.readInt());
! 							}
  						}
  					} catch (EOFException e) {
***************
*** 738,742 ****
  			invIdx.setDocument(doc);
  			String sal= text.getXMLString().transformToLower().toString() ;
! 			invIdx.addText(sal, gid);		
  		} else {
  		while (null != (token = tokenizer.nextToken())) {
--- 739,743 ----
  			invIdx.setDocument(doc);
  			String sal= text.getXMLString().transformToLower().toString() ;
! 			invIdx.addText(sal, gid);			
  		} else {
  		while (null != (token = tokenizer.nextToken())) {
***************
*** 772,776 ****
  		}
  	}
! 	
  	private final static class WordRef extends Value {
  
--- 773,821 ----
  		}
  	}
! 
! 	private static class TermFrequencyList extends OrderedLongLinkedList {
! 		
! 		protected static class TermFreq extends LongLinkedList.ListItem {
! 			
! 			int count = 1;
! 			
! 			public TermFreq(long l) {
! 				super(l);
! 			}
! 			
! 			public void increment() {
! 				++count;
! 			}
! 		}
! 		
! 		/* (non-Javadoc)
! 		 * @see org.exist.util.LongLinkedList#createListItem(long)
! 		 */
! 		protected ListItem createListItem(long l) {
! 			return new TermFreq(l);
! 		}
! 		
! 		public void incLastTerm() {
! 			if(last != null)
! 				((TermFreq)last).increment();
! 		}
! 		
! 		public void setLastTermFreq(int freq) {
! 			if(last != null)
! 				((TermFreq)last).count = freq;
! 		}
! 		
! 		public TermFreq[] toArray() {
! 			TermFreq[] data = new TermFreq[count];
! 			ListItem next = first;
! 			int i = 0;
! 			while( next != null ) {
! 				data[i++] = (TermFreq)next;
! 				next = next.next;
! 			}
! 			return data;
! 		}
! 	}
! 
  	private final static class WordRef extends Value {
  
***************
*** 796,800 ****
  		}
  	}
- 
  	/**
  	 * This inner class is responsible for actually storing the list of
--- 841,844 ----
***************
*** 804,839 ****
  	 */
  	final class InvertedIndex {
!  
! 		private class TermOccurrence extends OrderedLinkedList.Node implements Comparable {
! 			long gid;
! 			int frequency = 1;
! 			
! 			public TermOccurrence(long gid) {
! 				this.gid = gid;
! 			}
! 			
! 			public int compareTo(OrderedLinkedList.Node o) {
! 				final TermOccurrence other = (TermOccurrence)o;
! 				if(gid == other.gid)
! 					return 0;
! 				else if(gid < other.gid)
! 					return -1;
! 				else
! 					return 1;
! 			}
! 			
! 			public int compareTo(Object o) {
! 				return compareTo((OrderedLinkedList.Node)o);
! 			}
! 			
! 			public boolean equals(org.exist.util.OrderedLinkedList.Node other) {
! 				return gid == ((TermOccurrence)other).gid;
! 			}
! 		}
! 		
  		private DocumentImpl doc = null;
  		private Map words[] = new TreeMap[2];
  		private VariableByteOutputStream os = new VariableByteOutputStream(7);
!  
  		public InvertedIndex() {
  			// To distinguish between attribute values and text, we use
--- 848,856 ----
  	 */
  	final class InvertedIndex {
! 
  		private DocumentImpl doc = null;
  		private Map words[] = new TreeMap[2];
  		private VariableByteOutputStream os = new VariableByteOutputStream(7);
! 
  		public InvertedIndex() {
  			// To distinguish between attribute values and text, we use
***************
*** 845,883 ****
  
  		public void addText(String word, long gid) {
! 			OrderedLinkedList buf = (OrderedLinkedList) words[0].get(word);
! 			TermOccurrence o;
  			if (buf == null) {
! 				buf = new OrderedLinkedList();
! 				o = new TermOccurrence(gid);
! 				buf.add(o);
  				words[0].put(word, buf);
! 			} else {
! 				o = (TermOccurrence)buf.getLast();
! 				if(o.gid == gid) {
! 					o.frequency++;
! 				} else {
! 					o = new TermOccurrence(gid);
! 					buf.add(o);
! 				}
! 			}
  		}
  
  		public void addAttribute(String word, long gid) {
! 			OrderedLinkedList buf = (OrderedLinkedList) words[1].get(word);
! 			TermOccurrence o;
  			if (buf == null) {
! 				buf = new OrderedLinkedList();
! 				o = new TermOccurrence(gid);
! 				buf.add(o);
  				words[1].put(word, buf);
! 			} else {
! 				o = (TermOccurrence)buf.getLast();
! 				if(o.gid == gid)
! 					o.frequency++;
! 				else {
! 					o = new TermOccurrence(gid);
! 					buf.add(o);
! 				}
! 			}
  		}
  
--- 862,886 ----
  
  		public void addText(String word, long gid) {
! 			TermFrequencyList buf = (TermFrequencyList) words[0].get(word);
  			if (buf == null) {
! 				buf = new TermFrequencyList();
! 				buf.add(gid);
  				words[0].put(word, buf);
! 			} else if (buf.getLast() == gid) {
! 				buf.incLastTerm();
! 			} else
! 				buf.add(gid);
  		}
  
  		public void addAttribute(String word, long gid) {
! 			TermFrequencyList buf = (TermFrequencyList) words[1].get(word);
  			if (buf == null) {
! 				buf = new TermFrequencyList();
! 				buf.add(gid);
  				words[1].put(word, buf);
! 			} else if (buf.getLast() == gid) {
! 				buf.incLastTerm();
! 			} else
! 				buf.add(gid);
  		}
  
***************
*** 890,895 ****
  			Map.Entry entry;
  			String word;
! 			OrderedLinkedList idList;
! 			TermOccurrence[] ids;
  			byte[] data;
  			long last, gid;
--- 893,898 ----
  			Map.Entry entry;
  			String word;
! 			TermFrequencyList idList;
! 			TermFrequencyList.TermFreq[] ids;
  			byte[] data;
  			long last, gid;
***************
*** 898,903 ****
  			NodeProxy p;
  			WordRef ref;
! 			TermOccurrence t;
! 			OrderedLinkedList newList;
  			Value val = null;
  			VariableByteArrayInput is;
--- 901,906 ----
  			NodeProxy p;
  			WordRef ref;
! 			TermFrequencyList newList;
! 			int freq = 1;
  			Value val = null;
  			VariableByteArrayInput is;
***************
*** 907,911 ****
  					entry = (Map.Entry) i.next();
  					word = (String) entry.getKey();
! 					idList = (OrderedLinkedList) entry.getValue();
  					ref = new WordRef(collectionId, word);
  					try {
--- 910,914 ----
  					entry = (Map.Entry) i.next();
  					word = (String) entry.getKey();
! 					idList = (TermFrequencyList) entry.getValue();
  					ref = new WordRef(collectionId, word);
  					try {
***************
*** 913,917 ****
  					    val = dbWords.get(ref);
  					    os.clear();
! 					    newList = new OrderedLinkedList();
  					    if (val != null) {
  					        // add old entries to the new list
--- 916,920 ----
  					    val = dbWords.get(ref);
  					    os.clear();
! 					    newList = new TermFrequencyList();
  					    if (val != null) {
  					        // add old entries to the new list
***************
*** 928,938 ****
  					                    last = 0;
  					                    for (int j = 0; j < len; j++) {
! 					                        delta = is.readLong();
! 					                        last = last + delta;
! 					                        t = new TermOccurrence(last);
  					                        if(termFreq)
! 					                        	t.frequency = is.readInt();
! 					                        if (!idList.contains(t))
! 					                            newList.add(t);
  					                    }
  					                } else {
--- 931,941 ----
  					                    last = 0;
  					                    for (int j = 0; j < len; j++) {
! 					                        last = last + is.readLong();
  					                        if(termFreq)
! 					                        	freq = is.readInt();
! 					                        if (!idList.contains(last)) {
! 					                            newList.add(last);
! 					                            newList.setLastTermFreq(freq);
! 					                        }
  					                    }
  					                } else {
***************
*** 954,959 ****
  					        }
  					    }
! 					    ids = new TermOccurrence[newList.size()];
! 					    newList.toArray(ids);
  					    //i.remove();
  					    Arrays.sort(ids);
--- 957,961 ----
  					        }
  					    }
! 					    ids = newList.toArray();
  					    //i.remove();
  					    Arrays.sort(ids);
***************
*** 964,968 ****
  					    last = 0;
  					    for (int j = 0; j < len; j++) {
! 					        delta = ids[j].gid - last;
  					        if (delta < 0) {
  					            LOG.debug("neg. delta: " + delta + " for " + word);
--- 966,970 ----
  					    last = 0;
  					    for (int j = 0; j < len; j++) {
! 					        delta = ids[j].l - last;
  					        if (delta < 0) {
  					            LOG.debug("neg. delta: " + delta + " for " + word);
***************
*** 971,976 ****
  					        os.writeLong(delta);
  					        if(termFreq)
! 					        	os.writeInt(ids[j].frequency);
! 					        last = ids[j].gid;
  					    }
  					    try {
--- 973,978 ----
  					        os.writeLong(delta);
  					        if(termFreq)
! 					        	os.writeInt(ids[j].count);
! 					        last = ids[j].l;
  					    }
  					    try {
***************
*** 998,1006 ****
  		    Map.Entry entry;
  		    String word;
! 		    OrderedLinkedList idList;
! 		    TermOccurrence[] ids;
! 		    TermOccurrence t;
! 		    long last, gid;
! 		    long delta;
  		    byte section;
  		    NodeProxy p;
--- 1000,1007 ----
  		    Map.Entry entry;
  		    String word;
! 		    TermFrequencyList idList;
! 		    TermFrequencyList.TermFreq[] ids;
! 		    long last, gid, delta;
! 		    int freq = 1;
  		    byte section;
  		    NodeProxy p;
***************
*** 1012,1016 ****
  		            entry = (Map.Entry) i.next();
  		            word = (String) entry.getKey();
! 		            idList = (OrderedLinkedList) entry.getValue();
  		            ref = new WordRef(collectionId, word);
  		            try {
--- 1013,1017 ----
  		            entry = (Map.Entry) i.next();
  		            word = (String) entry.getKey();
! 		            idList = (TermFrequencyList) entry.getValue();
  		            ref = new WordRef(collectionId, word);
  		            try {
***************
*** 1031,1047 ****
  		                                os.writeByte(section);
  		                                os.writeInt(len);
! 		                                is.copyTo(os, (termFreq ? len * 2 : len));
  		                            } else {
  		                                // copy nodes to new list
  		                                gid = 0;
  		                                for (int j = 0; j < len; j++) {
! 		                                    delta = is.readLong();
! 		                                    gid += delta;
! 		                                    t = new TermOccurrence(gid);
  		                                    if(termFreq)
! 		                                    	t.frequency = is.readInt();
  		                                    if (node == null
! 		                                            && oldDoc.getTreeLevel(gid) < oldDoc.reindexRequired()) {
! 		                                        idList.add(t);
  		                                    } else if (node != null
  		                                            && (!XMLUtil
--- 1032,1048 ----
  		                                os.writeByte(section);
  		                                os.writeInt(len);
! 		                                is.copyTo(os, len);
  		                            } else {
  		                                // copy nodes to new list
  		                                gid = 0;
  		                                for (int j = 0; j < len; j++) {
! 		                                    gid += is.readLong();
  		                                    if(termFreq)
! 		                                    	freq = is.readInt();
  		                                    if (node == null
! 		                                            && oldDoc.getTreeLevel(gid) < oldDoc
! 		                                            .reindexRequired()) {
! 		                                        idList.add(gid);
! 		                                        idList.setLastTermFreq(freq);
  		                                    } else if (node != null
  		                                            && (!XMLUtil
***************
*** 1050,1054 ****
  		                                                            node.getGID(),
  		                                                            gid))) {
! 		                                        idList.add(t);
  		                                    }
  		                                }
--- 1051,1056 ----
  		                                                            node.getGID(),
  		                                                            gid))) {
! 		                                        idList.add(gid);
! 		                                        idList.setLastTermFreq(freq);
  		                                    }
  		                                }
***************
*** 1063,1068 ****
  		                    }
  		                }
! 		                ids = new TermOccurrence[idList.size()];
! 		                idList.toArray(ids);
  		                Arrays.sort(ids);
  		                len = ids.length;
--- 1065,1069 ----
  		                    }
  		                }
! 		                ids = idList.toArray();
  		                Arrays.sort(ids);
  		                len = ids.length;
***************
*** 1072,1076 ****
  		                last = 0;
  		                for (int j = 0; j < len; j++) {
! 		                    delta = ids[j].gid - last;
  		                    if (delta < 0) {
  		                        LOG.debug("neg. delta: " + delta + " for " + word);
--- 1073,1077 ----
  		                last = 0;
  		                for (int j = 0; j < len; j++) {
! 		                    delta = ids[j].l - last;
  		                    if (delta < 0) {
  		                        LOG.debug("neg. delta: " + delta + " for " + word);
***************
*** 1079,1084 ****
  		                    os.writeLong(delta);
  		                    if(termFreq)
! 		                    	os.writeInt(ids[j].frequency);
! 		                    last = ids[j].gid;
  		                }
  		                try {
--- 1080,1085 ----
  		                    os.writeLong(delta);
  		                    if(termFreq)
! 		                    	os.writeInt(ids[j].count);
! 		                    last = ids[j].l;
  		                }
  		                try {
***************
*** 1117,1133 ****
  			Map.Entry entry;
  			String word;
! 			OrderedLinkedList idList;
! 			Comparable[] ids;
  			byte[] data;
! 			long prevId, id;
  			long delta;
- 			TermOccurrence t;
  			for (int k = 0; k < 2; k++) {
  				for (Iterator i = words[k].entrySet().iterator(); i.hasNext(); count++) {
  					entry = (Map.Entry) i.next();
  					word = (String) entry.getKey();
! 					idList = (OrderedLinkedList) entry.getValue();
  					os.clear();
! 					len = idList.size();
  					os.writeInt(doc.getDocId());
  					os.writeByte(k == 0 ? TEXT_SECTION : ATTRIBUTE_SECTION);
--- 1118,1134 ----
  			Map.Entry entry;
  			String word;
! 			TermFrequencyList idList;
! 			TermFrequencyList.TermFreq id;
! 			long[] ids;
  			byte[] data;
! 			long prevId;
  			long delta;
  			for (int k = 0; k < 2; k++) {
  				for (Iterator i = words[k].entrySet().iterator(); i.hasNext(); count++) {
  					entry = (Map.Entry) i.next();
  					word = (String) entry.getKey();
! 					idList = (TermFrequencyList) entry.getValue();
  					os.clear();
! 					len = idList.getSize();
  					os.writeInt(doc.getDocId());
  					os.writeByte(k == 0 ? TEXT_SECTION : ATTRIBUTE_SECTION);
***************
*** 1135,1141 ****
  					prevId = 0;
  					for (Iterator j = idList.iterator(); j.hasNext();) {
! 						t = (TermOccurrence) j.next();
! 						id = t.gid;
! 						delta = id - prevId;
  						if (delta < 0) {
  							LOG.debug("neg. delta: " + delta + " for " + word);
--- 1136,1141 ----
  					prevId = 0;
  					for (Iterator j = idList.iterator(); j.hasNext();) {
! 						id = ((TermFrequencyList.TermFreq) j.next());
! 						delta = id.l - prevId;
  						if (delta < 0) {
  							LOG.debug("neg. delta: " + delta + " for " + word);
***************
*** 1143,1151 ****
  						}
  						os.writeLong(delta);
! 						if(termFreq) {
! 							// write out term frequencies
! 							os.writeInt(t.frequency);
! 						}
! 						prevId = id;
  					}
  					flushWord(collectionId, word, os.data());
--- 1143,1149 ----
  						}
  						os.writeLong(delta);
! 						if(termFreq)
! 							os.writeInt(id.count);
! 						prevId = id.l;
  					}
  					flushWord(collectionId, word, os.data());
***************
*** 1266,1270 ****
  				long last = -1;
  				int freq = 1;
- 				long delta;
  				int sizeHint = -1;
  				byte section;
--- 1264,1267 ----
***************
*** 1285,1295 ****
  						if (contextSet != null)
  							sizeHint = contextSet.getSizeHint(doc);
! 						last = -1;
  						for (int j = 0; j < len; j++) {
! 							delta = is.readLong();
! 							gid = (last < 0 ? delta : last + delta);
! 							last = gid;
  							if(termFreq)
  								freq = is.readInt();
  							if (contextSet != null) {
  								proxy = (section == TEXT_SECTION
--- 1282,1291 ----
  						if (contextSet != null)
  							sizeHint = contextSet.getSizeHint(doc);
! 						last = 0;
  						for (int j = 0; j < len; j++) {
! 							gid = last + is.readLong();
  							if(termFreq)
  								freq = is.readInt();
+ 							last = gid;
  							if (contextSet != null) {
  								proxy = (section == TEXT_SECTION
***************
*** 1302,1310 ****
  								if (parent != null) {
  									result.add(parent, sizeHint);
! 									if (trackMatches != Serializer.TAG_NONE) {
! 										match = new Match(word, gid);
! 										match.setFrequency(freq);
  										parent.addMatch(match);
- 									}
  								}
  							} else
--- 1298,1305 ----
  								if (parent != null) {
  									result.add(parent, sizeHint);
! 									match = new Match(word, gid);
! 									match.setFrequency(freq);
! 									if (trackMatches != Serializer.TAG_NONE)
  										parent.addMatch(match);
  								}
  							} else
***************
*** 1351,1353 ****
--- 1346,1350 ----
  		}
  	}
+ 
+ 
  }

[Exist-commits] eXist-1.0/src/org/exist/storage NativeTextEngine.java,1.62,1.63 NativeElementIndex.java,1.34,1.35

eXist-db is a feature rich Open Source native XML database

[Exist-commits] eXist-1.0/src/org/exist/storage NativeTextEngine.java,1.62,1.63 NativeElementIndex.java,1.34,1.35