From: Eric F. <er...@us...> - 2002-01-02 20:00:44
|
Update of /cvsroot/maxent/maxent/src/java/opennlp/maxent In directory usw-pr-cvs1:/tmp/cvs-serv11395/src/java/opennlp/maxent Modified Files: DataIndexer.java Log Message: [copied from CHANGES file] Upgraded trove dependency to 0.1.1 (includes TIntArrayList, with reset()) (Eric) (opennlp.maxent.DataIndexer) Refactored event count computation so that the cutoff can be applied while events are read. This obviates the need for a separate pass over the predicates between event count computation and indexing. It also saves memory by reducing the amount of temporary data needed and by avoiding creation of instances of the Counter class. The applyCutoff() method is no longer needed and has been removed. (Eric) (opennlp.maxent.DataIndexer) Made the event count computation + cutoff application also handle the assignment of unique indexes to predicates that "make the cut." This saves a fair amount of time in the indexing process. (Eric) (opennlp.maxent.DataIndexer) Refactored the indexing implementation so that TIntArrayLists are (re-)used for constructing the array of predicate references associated with each ComparableEvent. Using the TIntArrayList instead of an ArrayList of Integers dramatically reduces the amount of garbage produced during indexing; it's also smaller. (Eric) (opennlp.maxent.DataIndexer) Removed toIntArray() method, since TIntArrayList provides the same behavior without the cost of a loop over a List of Integers. (Eric) (opennlp.maxent.DataIndexer) Changed indexing Maps to TObjectIntHashMaps to save space in several places. 
(Eric) Index: DataIndexer.java =================================================================== RCS file: /cvsroot/maxent/maxent/src/java/opennlp/maxent/DataIndexer.java,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** DataIndexer.java 2001/12/27 19:20:26 1.5 --- DataIndexer.java 2002/01/02 20:00:39 1.6 *************** *** 36,40 **** public String[] predLabels; public String[] outcomeLabels; - private static final IntegerPool intPool = new IntegerPool(50); /** --- 36,39 ---- *************** *** 58,82 **** */ public DataIndexer(EventStream eventStream, int cutoff) { ! Map count; TLinkedList events; ! System.out.println("Indexing events"); System.out.print("\tComputing event counts... "); ! count = new THashMap(); ! events = computeEventCounts(eventStream,count); ! //for(int tid=0; tid<events.length; tid++) { System.out.println("done."); - System.out.print("\tPerforming cutoff of " + cutoff + "... "); - applyCutoff(count, cutoff); - System.out.println("done."); - System.out.print("\tIndexing... "); ! ComparableEvent[] eventsToCompare = index(events,count); // done with event list events = null; ! // done with predicate counts ! count = null; System.out.println("done."); --- 57,77 ---- */ public DataIndexer(EventStream eventStream, int cutoff) { ! TObjectIntHashMap predicateIndex; TLinkedList events; + ComparableEvent[] eventsToCompare; ! predicateIndex = new TObjectIntHashMap(); ! System.out.println("Indexing events using cutoff of " + cutoff + "\n"); System.out.print("\tComputing event counts... "); ! events = computeEventCounts(eventStream,predicateIndex,cutoff); System.out.println("done."); System.out.print("\tIndexing... "); ! eventsToCompare = index(events,predicateIndex); // done with event list events = null; ! // done with predicates ! predicateIndex = null; System.out.println("done."); *************** *** 135,141 **** private TLinkedList computeEventCounts(EventStream eventStream, ! 
Map count) { TLinkedList events = new TLinkedList(); while (eventStream.hasNext()) { Event ev = eventStream.nextEvent(); --- 130,151 ---- + /** + * Reads events from <tt>eventStream</tt> into a linked list. The + * predicates associated with each event are counted and any which + * occur at least <tt>cutoff</tt> times are added to the + * <tt>predicatesInOut</tt> map along with a unique integer index. + * + * @param eventStream an <code>EventStream</code> value + * @param predicatesInOut a <code>TObjectIntHashMap</code> value + * @param cutoff an <code>int</code> value + * @return a <code>TLinkedList</code> value + */ private TLinkedList computeEventCounts(EventStream eventStream, ! TObjectIntHashMap predicatesInOut, ! int cutoff) { ! TObjectIntHashMap counter = new TObjectIntHashMap(); TLinkedList events = new TLinkedList(); + int predicateIndex = 0; + while (eventStream.hasNext()) { Event ev = eventStream.nextEvent(); *************** *** 143,173 **** String[] ec = ev.getContext(); for (int j=0; j<ec.length; j++) { ! Counter counter = (Counter)count.get(ec[j]); ! if (counter!=null) { ! counter.increment(); ! } else { ! count.put(ec[j], new Counter()); } } } return events; } - private void applyCutoff(Map count, int cutoff) { - if (cutoff == 0) { - return; // nothing to do - } - - for (Iterator cit=count.keySet().iterator(); cit.hasNext();) { - String pred = (String)cit.next(); - if (! ((Counter)count.get(pred)).passesCutoff(cutoff)) { - cit.remove(); - } - } - } - private ComparableEvent[] index(TLinkedList events, ! Map count) { ! Map omap = new THashMap(), pmap = new THashMap(); int numEvents = events.size(); --- 153,174 ---- String[] ec = ev.getContext(); for (int j=0; j<ec.length; j++) { ! if (! predicatesInOut.containsKey(ec[j])) { ! int count = counter.get(ec[j]) + 1; ! if (count >= cutoff) { ! predicatesInOut.put(ec[j], predicateIndex++); ! counter.remove(ec[j]); ! } else { ! counter.put(ec[j], count); ! 
} } } } + predicatesInOut.trimToSize(); return events; } private ComparableEvent[] index(TLinkedList events, ! TObjectIntHashMap predicateIndex) { ! TObjectIntHashMap omap = new TObjectIntHashMap(); int numEvents = events.size(); *************** *** 175,178 **** --- 176,180 ---- int predCount = 0; ComparableEvent[] eventsToCompare = new ComparableEvent[numEvents]; + TIntArrayList indexedContext = new TIntArrayList(); for (int eventIndex=0; eventIndex<numEvents; eventIndex++) { *************** *** 180,212 **** String[] econtext = ev.getContext(); ! Integer predID, ocID; String oc = ev.getOutcome(); if (omap.containsKey(oc)) { ! ocID = (Integer)omap.get(oc); } else { ! ocID = intPool.get(outcomeCount++); omap.put(oc, ocID); } - List indexedContext = new ArrayList(); for (int i=0; i<econtext.length; i++) { String pred = econtext[i]; ! if (count.containsKey(pred)) { ! if (pmap.containsKey(pred)) { ! predID = (Integer)pmap.get(pred); ! } else { ! predID = intPool.get(predCount++); ! pmap.put(pred, predID); ! } ! indexedContext.add(predID); } } eventsToCompare[eventIndex] = ! new ComparableEvent(ocID.intValue(), ! toIntArray(indexedContext)); } outcomeLabels = toIndexedStringArray(omap); ! predLabels = toIndexedStringArray(pmap); return eventsToCompare; } --- 182,208 ---- String[] econtext = ev.getContext(); ! int predID, ocID; String oc = ev.getOutcome(); if (omap.containsKey(oc)) { ! ocID = omap.get(oc); } else { ! ocID = outcomeCount++; omap.put(oc, ocID); } for (int i=0; i<econtext.length; i++) { String pred = econtext[i]; ! if (predicateIndex.containsKey(pred)) { ! indexedContext.add(predicateIndex.get(pred)); } } eventsToCompare[eventIndex] = ! new ComparableEvent(ocID, indexedContext.toNativeArray()); ! // recycle the TIntArrayList ! indexedContext.resetQuick(); } outcomeLabels = toIndexedStringArray(omap); ! predLabels = toIndexedStringArray(predicateIndex); return eventsToCompare; } *************** *** 218,250 **** * labels should be inserted. * ! 
* @param labelToIndexMap a <code>Map</code> value * @return a <code>String[]</code> value * @since maxent 1.2.6 */ ! static String[] toIndexedStringArray(Map labelToIndexMap) { ! String[] array = new String[labelToIndexMap.size()]; ! for (Iterator i = labelToIndexMap.keySet().iterator(); i.hasNext();) { ! String label = (String)i.next(); ! int index = ((Integer)labelToIndexMap.get(label)).intValue(); ! array[index] = label; ! } return array; - } - - /** - * Utility method for turning a list of Integer objects into a - * native array of primitive ints. - * - * @param integers a <code>List</code> value - * @return an <code>int[]</code> value - * @since maxent 1.2.6 - */ - static final int[] toIntArray(List integers) { - int[] rv = new int[integers.size()]; - int i = 0; - for (Iterator it = integers.iterator(); it.hasNext();) { - rv[i++] = ((Integer)it.next()).intValue(); - } - return rv; } } --- 214,230 ---- * labels should be inserted. * ! * @param labelToIndexMap a <code>TObjectIntHashMap</code> value * @return a <code>String[]</code> value * @since maxent 1.2.6 */ ! static String[] toIndexedStringArray(TObjectIntHashMap labelToIndexMap) { ! final String[] array = new String[labelToIndexMap.size()]; ! labelToIndexMap.forEachEntry(new TObjectIntProcedure() { ! public boolean execute(Object str, int index) { ! array[index] = (String)str; ! return true; ! } ! }); return array; } } |