From: Eric F. <er...@us...> - 2001-12-27 04:32:40
|
Update of /cvsroot/maxent/maxent/src/java/opennlp/maxent In directory usw-pr-cvs1:/tmp/cvs-serv24656/src/java/opennlp/maxent Modified Files: Tag: no_colt ComparableEvent.java DataIndexer.java Log Message: refactored sorting/merging of events so that the events do not have to be copied into lists of lists for merging. Instead, ComparableEvent has a `seen' field (default 1) which can be used to track the number of times an event has been seen. After sorting the events, the first loop over the set increments this value when duplicates are found and simply nulls them out of the list. The second loop can then skip over null slots in the list, mining only the "unique" events. This saves a lot of memory, because it's no longer necessary to build lists of lists. I also switched optimize to "off" in build.xml because -O doesn't do anything in modern java compilers (HotSpot does all of the optimization work, at runtime), but it does cause jikes to drop the line numbers from stack traces. Index: ComparableEvent.java =================================================================== RCS file: /cvsroot/maxent/maxent/src/java/opennlp/maxent/ComparableEvent.java,v retrieving revision 1.1.1.1 retrieving revision 1.1.1.1.2.1 diff -C2 -d -r1.1.1.1 -r1.1.1.1.2.1 *** ComparableEvent.java 2001/10/23 14:06:53 1.1.1.1 --- ComparableEvent.java 2001/12/27 04:32:37 1.1.1.1.2.1 *************** *** 30,67 **** public int outcome; public int[] predIndexes; public ComparableEvent(int oc, int[] pids) { ! outcome = oc; ! Arrays.sort(pids); ! predIndexes = pids; } public int compareTo(Object o) { ! ComparableEvent ce = (ComparableEvent)o; ! if (outcome < ce.outcome) return -1; ! else if (outcome > ce.outcome) return 1; ! int smallerLength = (predIndexes.length > ce.predIndexes.length? ! ce.predIndexes.length : predIndexes.length); ! for (int i=0; i<smallerLength; i++) { ! if (predIndexes[i] < ce.predIndexes[i]) return -1; ! else if (predIndexes[i] > ce.predIndexes[i]) return 1; ! } ! if (predIndexes.length < ce.predIndexes.length) return -1; ! else if (predIndexes.length > ce.predIndexes.length) return 1; ! return 0; } public String toString() { ! String s = ""; ! for (int i=0; i<predIndexes.length; i++) s+= " "+predIndexes[i]; ! return s; } - } --- 30,68 ---- public int outcome; public int[] predIndexes; + public int seen = 1; // the number of times this event + // has been seen. public ComparableEvent(int oc, int[] pids) { ! outcome = oc; ! Arrays.sort(pids); ! predIndexes = pids; } public int compareTo(Object o) { ! ComparableEvent ce = (ComparableEvent)o; ! if (outcome < ce.outcome) return -1; ! else if (outcome > ce.outcome) return 1; ! int smallerLength = (predIndexes.length > ce.predIndexes.length? ! ce.predIndexes.length : predIndexes.length); ! for (int i=0; i<smallerLength; i++) { ! if (predIndexes[i] < ce.predIndexes[i]) return -1; ! else if (predIndexes[i] > ce.predIndexes[i]) return 1; ! } ! if (predIndexes.length < ce.predIndexes.length) return -1; ! else if (predIndexes.length > ce.predIndexes.length) return 1; ! return 0; } public String toString() { ! String s = ""; ! for (int i=0; i<predIndexes.length; i++) s+= " "+predIndexes[i]; ! return s; } } Index: DataIndexer.java =================================================================== RCS file: /cvsroot/maxent/maxent/src/java/opennlp/maxent/DataIndexer.java,v retrieving revision 1.4.2.3 retrieving revision 1.4.2.4 diff -C2 -d -r1.4.2.3 -r1.4.2.4 *** DataIndexer.java 2001/12/27 03:51:52 1.4.2.3 --- DataIndexer.java 2001/12/27 04:32:37 1.4.2.4 *************** *** 83,105 **** System.out.print("Sorting and merging events... "); Arrays.sort(eventsToCompare); ComparableEvent ce = eventsToCompare[0]; ! List uniqueEvents = new ArrayList(); ! List newGroup = new ArrayList(); ! int numEvents = eventsToCompare.length; ! for (int i=0; i<numEvents; i++) { if (ce.compareTo(eventsToCompare[i]) == 0) { ! newGroup.add(eventsToCompare[i]); ! } else { ! ce = eventsToCompare[i]; ! uniqueEvents.add(newGroup); ! newGroup = new ArrayList(); ! newGroup.add(eventsToCompare[i]); } } - uniqueEvents.add(newGroup); - - int numUniqueEvents = uniqueEvents.size(); System.out.println("done. Reduced " + eventsToCompare.length --- 83,117 ---- System.out.print("Sorting and merging events... "); + sortAndMerge(eventsToCompare); + System.out.println("Done indexing."); + } + + /** + * Sorts and uniques the array of comparable events. This method + * will alter the eventsToCompare array -- it does an in place + * sort, followed by an in place edit to remove duplicates. + * + * @param eventsToCompare a <code>ComparableEvent[]</code> value + * @since maxent 1.2.6 + */ + private void sortAndMerge(ComparableEvent[] eventsToCompare) { Arrays.sort(eventsToCompare); + int numEvents = eventsToCompare.length; + int numUniqueEvents = 1; // assertion: eventsToCompare.length >= 1 + if (eventsToCompare.length <= 1) { + return; // nothing to do; edge case (see assertion) + } + ComparableEvent ce = eventsToCompare[0]; ! for (int i=1; i<numEvents; i++) { if (ce.compareTo(eventsToCompare[i]) == 0) { ! ce.seen++; // increment the seen count ! eventsToCompare[i] = null; // kill the duplicate ! } else { ! ce = eventsToCompare[i]; // a new champion emerges... ! numUniqueEvents++; // increment the # of unique events } } System.out.println("done. Reduced " + eventsToCompare.length *************** *** 110,122 **** numTimesEventsSeen = new int[numUniqueEvents]; ! for (int i=0; i<numUniqueEvents; i++) { ! List group = (List)uniqueEvents.get(i); ! numTimesEventsSeen[i] = group.size(); ! ComparableEvent nextCE = (ComparableEvent)group.get(0); ! outcomeList[i] = nextCE.outcome; ! contexts[i] = nextCE.predIndexes; } - - System.out.println("Done indexing."); } --- 122,135 ---- numTimesEventsSeen = new int[numUniqueEvents]; ! for (int i = 0, j = 0; i<numEvents; i++) { ! ComparableEvent evt = eventsToCompare[i]; ! if (null == evt) { ! continue; // this was a dupe, skip over it. ! } ! numTimesEventsSeen[j] = evt.seen; ! outcomeList[j] = evt.outcome; ! contexts[j] = evt.predIndexes; ! ++j; } } |