Revision: 2684
http://archive-access.svn.sourceforge.net/archive-access/?rev=2684&view=rev
Author: binzino
Date: 2009-02-28 01:18:32 +0000 (Sat, 28 Feb 2009)
Log Message:
-----------
Initial revision.
Added Paths:
-----------
trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java
Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java 2009-02-28 01:18:32 UTC (rev 2684)
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of the archive-access tools project
+ * (http://sourceforge.net/projects/archive-access).
+ *
+ * The archive-access tools are free software; you can redistribute them and/or
+ * modify them under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or any
+ * later version.
+ *
+ * The archive-access tools are distributed in the hope that they will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+ * Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License along with
+ * the archive-access tools; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.nutchwax.tools;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+
+
+/**
+ * A command-line hack to generate a Lucene index of N documents,
+ * each with one field set to the same value.  This value is both
+ * stored and tokenized/indexed.
+ *
+ * Usage: <code>BuildIndex index field value count</code>
+ */
+public class BuildIndex extends Configured implements Tool
+{
+  /**
+   * Builds the index described by the command-line arguments.
+   *
+   * @param args four positional arguments, each trimmed before use:
+   *             the index directory, the field name, the field
+   *             value, and the number of documents to add.
+   * @return 0 once the index has been written and closed; -1 if
+   *         fewer than four arguments were supplied.
+   * @throws Exception a NumberFormatException if the count argument
+   *         is not an integer; anything thrown while opening,
+   *         writing or closing the index is propagated unchanged.
+   */
+  public int run( String[] args ) throws Exception
+  {
+    if ( args.length < 4 )
+    {
+      System.out.println( "BuildIndex index field value count" );
+
+      // Hand a failure status back to the caller instead of calling
+      // System.exit( 0 ) here: main() already turns the returned
+      // value into the process exit code, and a usage error should
+      // not be reported as success.
+      return -1;
+    }
+
+    String indexDir   = args[0].trim();
+    String fieldKey   = args[1].trim();
+    String fieldValue = args[2].trim();
+    int    count      = Integer.parseInt( args[3].trim() );
+
+    // NOTE(review): the 'true' argument is presumably IndexWriter's
+    // "create" flag (start a fresh index) -- confirm against the
+    // Lucene version in use.
+    IndexWriter writer = new IndexWriter( indexDir, new WhitespaceAnalyzer( ), true );
+
+    try
+    {
+      // A count of zero or less adds no documents.
+      for ( int i = 0 ; i < count ; i++ )
+      {
+        Document newDoc = new Document( );
+        newDoc.add( new Field( fieldKey, fieldValue, Field.Store.YES, Field.Index.TOKENIZED ) );
+
+        writer.addDocument( newDoc );
+      }
+    }
+    finally
+    {
+      // Always close the writer, even when adding a document fails,
+      // so that it is not left open behind a propagating exception.
+      writer.close( );
+    }
+
+    return 0;
+  }
+
+  /**
+   * Runs using the Hadoop ToolRunner, which means it accepts the
+   * standard Hadoop command-line options.  The value returned by
+   * {@link #run(String[])} becomes the process exit status.
+   */
+  public static void main( String args[] ) throws Exception
+  {
+    int result = ToolRunner.run( NutchConfiguration.create(), new BuildIndex(), args );
+
+    System.exit( result );
+  }
+
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|