From: <bi...@us...> - 2008-09-26 20:38:37
|
Revision: 2596 http://archive-access.svn.sourceforge.net/archive-access/?rev=2596&view=rev Author: binzino Date: 2008-09-26 20:38:26 +0000 (Fri, 26 Sep 2008) Log Message: ----------- Fix WAX-25: Add new utility to dump the unique values of a field in an index. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java 2008-09-26 20:38:26 UTC (rev 2596) @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.tools; + +import java.io.File; +import java.util.Iterator; +import java.util.Set; +import java.util.HashSet; +import java.util.Collection; + +import org.apache.lucene.index.IndexReader; + +/** + * A quick-n-dirty command-line utility to get the unique values for a + * field in an index and print them to stdout. + */ +public class GetUniqFieldValues +{ + public static void main(String[] args) throws Exception + { + String fieldName = ""; + String indexDir = ""; + + if ( args.length == 2 ) + { + fieldName = args[0]; + indexDir = args[1]; + } + + if (! (new File(indexDir)).exists()) + { + usageAndExit(); + } + + dumpUniqValues( fieldName, indexDir ); + } + + private static void dumpUniqValues( String fieldName, String indexDir ) throws Exception + { + IndexReader reader = IndexReader.open(indexDir); + + Collection fieldNames = reader.getFieldNames( IndexReader.FieldOption.ALL ); + + if ( ! fieldNames.contains( fieldName ) ) + { + System.out.println( "Field not in index: " + fieldName ); + System.exit( 2 ); + } + + int numDocs = reader.numDocs(); + Set<String> values = new HashSet<String>( ); + + for ( int i = 0; i < numDocs; i++ ) + { + values.add( reader.document(i).get( fieldName ) ); + } + + for ( String v : values ) + { + System.out.println( v ); + } + + } + + private static void usageAndExit() + { + System.out.println("Usage: GetUniqFieldValues field index"); + System.exit(1); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |