From: <bi...@us...> - 2009-10-28 00:29:34
Revision: 2855 http://archive-access.svn.sourceforge.net/archive-access/?rev=2855&view=rev Author: binzino Date: 2009-10-28 00:29:23 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Ported from NW 0.12.9. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexMerger.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexMerger.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexMerger.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexMerger.java 2009-10-28 00:29:23 UTC (rev 2855) @@ -0,0 +1,211 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.fs.*; +import org.apache.hadoop.mapred.FileAlreadyExistsException; +import org.apache.hadoop.util.*; +import org.apache.hadoop.conf.*; + +import org.apache.nutch.util.HadoopFSUtil; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.indexer.FsDirectory; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.ArchiveParallelReader; + +/************************************************************************* + * IndexMerger creates an index for the output corresponding to a + * single fetcher run. 
+ * + * @author Doug Cutting + * @author Mike Cafarella + *************************************************************************/ +public class IndexMerger extends Configured implements Tool { + public static final Log LOG = LogFactory.getLog(IndexMerger.class); + + public static final String DONE_NAME = "merge.done"; + + public IndexMerger() { + + } + + public IndexMerger(Configuration conf) { + setConf(conf); + } + + /** + * Merge all input indexes to the single output index + */ + public void merge(IndexReader[] readers, Path outputIndex, Path localWorkingDir, boolean parallel) throws IOException { + LOG.info("merging indexes to: " + outputIndex); + + FileSystem localFs = FileSystem.getLocal(getConf()); + if (localFs.exists(localWorkingDir)) { + localFs.delete(localWorkingDir, true); + } + localFs.mkdirs(localWorkingDir); + + // Get local output target + // + FileSystem fs = FileSystem.get(getConf()); + if (fs.exists(outputIndex)) { + throw new FileAlreadyExistsException("Output directory " + outputIndex + " already exists!"); + } + + Path tmpLocalOutput = new Path(localWorkingDir, "merge-output"); + Path localOutput = fs.startLocalOutput(outputIndex, tmpLocalOutput); + + // + // Merge indices + // + IndexWriter writer = new IndexWriter(localOutput.toString(), null, true); + writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR)); + writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS)); + writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS)); + writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL)); + writer.setInfoStream(LogUtil.getDebugStream(LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + writer.addIndexes(readers); + writer.close(); + + // + // Put target back + // + fs.completeLocalOutput(outputIndex, tmpLocalOutput); + LOG.info("done merging"); + } + + /** + * Create an index for the input files in the named directory. + */ + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new IndexMerger(), args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + String usage = "IndexMerger [-workingdir <workingdir>] [-p] outputIndex indexesDir...\n\t-p Input directories contain parallel indexes.\n"; + if (args.length < 2) + { + System.err.println("Usage: " + usage); + return -1; + } + + // + // Parse args, read all index directories to be processed + // + FileSystem fs = FileSystem.get(getConf()); + List<Path> indexDirs = new ArrayList<Path>(); + + Path workDir = new Path("indexmerger-" + System.currentTimeMillis()); + int i = 0; + + boolean parallel=false; + + while ( args[i].startsWith( "-" ) ) + { + if ( "-workingdir".equals(args[i]) ) + { + i++; + workDir = new Path(args[i++], "indexmerger-" + System.currentTimeMillis()); + } + else if ( "-p".equals(args[i]) ) + { + i++; + parallel=true; + } + } + + Path outputIndex = new Path(args[i++]); + + List<IndexReader> readers = new ArrayList<IndexReader>( ); + + if ( ! 
parallel ) + { + for (; i < args.length; i++) + { + FileStatus[] fstats = fs.listStatus(new Path(args[i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); + + for ( Path p : HadoopFSUtil.getPaths(fstats) ) + { + LOG.info( "Adding reader for: " + p ); + readers.add( IndexReader.open( new FsDirectory( fs, p, false, getConf( ) ) ) ); + } + } + } + else + { + for (; i < args.length; i++) + { + FileStatus[] fstats = fs.listStatus(new Path(args[i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path parallelDirs[] = HadoopFSUtil.getPaths( fstats ); + + if ( parallelDirs.length < 1 ) + { + LOG.info( "No sub-directories, skipping: " + args[i] ); + + continue; + } + else + { + LOG.info( "Adding parallel reader for: " + args[i] ); + } + + ArchiveParallelReader preader = new ArchiveParallelReader( ); + + // Sort the parallelDirs so that we add them in order. Order + // matters to the ParallelReader. + Arrays.sort( parallelDirs ); + + for ( Path p : parallelDirs ) + { + LOG.info( " Adding to parallel reader: " + p.getName( ) ); + preader.add( IndexReader.open( new FsDirectory( fs, p, false, getConf( ) ) ) ); + } + + readers.add( preader ); + } + } + + // + // Merge the indices + // + + try { + merge(readers.toArray(new IndexReader[readers.size()]), outputIndex, workDir, parallel); + return 0; + } catch (Exception e) { + LOG.fatal("IndexMerger: " + StringUtils.stringifyException(e)); + return -1; + } finally { + FileSystem.getLocal(getConf()).delete(workDir, true); + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
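A minimal driver for the merger above, assuming the class is on the classpath together with Hadoop and Nutch; the index directory names are placeholders, and the invocation mirrors the usage string parsed in run() ("IndexMerger [-workingdir <workingdir>] [-p] outputIndex indexesDir..."):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.archive.nutchwax.IndexMerger;

public class MergeDriver {
  public static void main(String[] args) throws Exception {
    // Merge two directories of parallel indexes into "merged-index".
    // The -p flag makes IndexMerger build an ArchiveParallelReader for the
    // sorted sub-directories of each input, as in run() above.
    Configuration conf = NutchConfiguration.create();
    int rc = ToolRunner.run(conf, new IndexMerger(),
        new String[] { "-p", "merged-index", "indexes-a", "indexes-b" });
    System.exit(rc);
  }
}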
From: <bi...@us...> - 2009-10-28 00:25:36
Revision: 2854 http://archive-access.svn.sourceforge.net/archive-access/?rev=2854&view=rev Author: binzino Date: 2009-10-28 00:25:23 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2009-10-28 00:18:16 UTC (rev 2853) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2009-10-28 00:25:23 UTC (rev 2854) @@ -107,12 +107,12 @@ public String getExplanation(Query query, Hit hit) throws IOException { return luceneSearcher.explain(this.queryFilters.filter(query), - hit.getIndexDocNo()).toHtml(); + Integer.valueOf(hit.getUniqueKey())).toHtml(); } public HitDetails getDetails(Hit hit) throws IOException { - Document doc = luceneSearcher.doc(hit.getIndexDocNo()); + Document doc = luceneSearcher.doc(Integer.valueOf(hit.getUniqueKey())); List docFields = doc.getFields(); String[] fields = new String[docFields.size()]; @@ -173,13 +173,14 @@ { if ( "site".equals( dedupField ) ) { - String exactUrl = reader.document( doc ).get( "exacturl"); try { - java.net.URL u = new java.net.URL( exactUrl ); + String url = reader.document( doc ).get( "url"); + + java.net.URL u = new java.net.URL( url ); dedupValue = u.getHost(); - System.out.println("Dedup value hack:" + dedupValue); + System.out.println( "Dedup value hack:" + dedupValue ); } catch ( java.net.MalformedURLException e ) { @@ -192,7 +193,7 @@ } } - hits[i] = new Hit(doc, sortValue, dedupValue); + hits[i] = new Hit(Integer.toString(doc), sortValue, dedupValue); } return new Hits(topDocs.totalHits, hits); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
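The change replaces hit.getIndexDocNo() with the hit's unique key, which revision 2854 now populates with Integer.toString(doc), i.e. the Lucene document number as a String. A hedged sketch of the resulting lookup pattern (variable names are illustrative):

// The Hit carries its Lucene document number as a String unique key, so
// consumers parse it back to an int before asking the Lucene searcher.
int luceneDocId = Integer.valueOf(hit.getUniqueKey());
Document doc = luceneSearcher.doc(luceneDocId);
List docFields = doc.getFields();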
Revision: 2853 http://archive-access.svn.sourceforge.net/archive-access/?rev=2853&view=rev Author: bradtofel Date: 2009-10-28 00:18:16 +0000 (Wed, 28 Oct 2009) Log Message: ----------- FEATURE: optionally can add results that are filtered to an "annotater" which is really a QueryFilterGroup. Later the QueryFilterGroup can annotate the search results with "close matches" Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HostMatchFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HostMatchFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HostMatchFilter.java 2009-10-28 00:16:36 UTC (rev 2852) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HostMatchFilter.java 2009-10-28 00:18:16 UTC (rev 2853) @@ -25,6 +25,7 @@ package org.archive.wayback.resourceindex.filters; import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.filterfactory.QueryCaptureFilterGroup; import org.archive.wayback.util.ObjectFilter; /** @@ -37,10 +38,20 @@ public class HostMatchFilter implements ObjectFilter<CaptureSearchResult> { private String hostname = null; + private QueryCaptureFilterGroup annotationTarget = null; /** * @param hostname String of original host to match */ + public HostMatchFilter(final String hostname, + QueryCaptureFilterGroup annotationTarget) { + this.hostname = hostname; + this.annotationTarget = annotationTarget; + } + + /** + * @param hostname String of original host to match + */ public HostMatchFilter(final String hostname) { this.hostname = hostname; } @@ -50,6 +61,13 @@ */ public int filterObject(CaptureSearchResult r) { String origHost = r.getOriginalHost(); - return hostname.equals(origHost) ? FILTER_INCLUDE : FILTER_EXCLUDE; + if(hostname.equals(origHost)) { + return FILTER_INCLUDE; + } else { + if(annotationTarget != null) { + annotationTarget.addCloseMatch(origHost, r.getOriginalUrl()); + } + return FILTER_EXCLUDE; + } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
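A sketch of how the new two-argument constructor might be wired, assuming the QueryCaptureFilterGroup introduced in revision 2851 acts as the annotation target; the hostname and the result object are placeholders, and exception handling is omitted:

// Non-matching hosts are reported to the group rather than silently dropped,
// so the group can later surface them as "close matches" in the results.
QueryCaptureFilterGroup group = new QueryCaptureFilterGroup(request, canonicalizer);
HostMatchFilter filter = new HostMatchFilter("www.example.org", group);
int decision = filter.filterObject(result); // FILTER_INCLUDE or FILTER_EXCLUDE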
From: <bra...@us...> - 2009-10-28 00:17:42
Revision: 2852 http://archive-access.svn.sourceforge.net/archive-access/?rev=2852&view=rev Author: bradtofel Date: 2009-10-28 00:16:36 +0000 (Wed, 28 Oct 2009) Log Message: ----------- INITIAL REV: experimental result filters to include results based on the file field, either as a string prefix, or a regex Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FilePrefixFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FileRegexFilter.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FilePrefixFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FilePrefixFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FilePrefixFilter.java 2009-10-28 00:16:36 UTC (rev 2852) @@ -0,0 +1,26 @@ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +public class FilePrefixFilter implements ObjectFilter<CaptureSearchResult> { + + private String prefixes[] = null; + + public String[] getPrefixes() { + return prefixes; + } + public void setPrefixes(String[] prefixes) { + this.prefixes = prefixes; + } + + public int filterObject(CaptureSearchResult o) { + final String file = o.getFile(); + for(String prefix : prefixes) { + if(file.startsWith(prefix)) { + return FILTER_INCLUDE; + } + } + return FILTER_EXCLUDE; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FileRegexFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FileRegexFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FileRegexFilter.java 2009-10-28 00:16:36 UTC (rev 2852) @@ -0,0 +1,39 @@ +package org.archive.wayback.resourceindex.filters; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +public class FileRegexFilter implements ObjectFilter<CaptureSearchResult> { + + private Pattern patterns[] = null; + + public List<String> getPatterns() { + ArrayList<String> s = new ArrayList<String>(); + for(Pattern p : patterns) { + s.add(p.pattern()); + } + return s; + } + + public void setPatterns(List<String> patternStrings) { + int size = patternStrings.size(); + patterns = new Pattern[size]; + for(int i = 0; i < size; i++) { + patterns[i] = Pattern.compile(patternStrings.get(i)); + } + } + + public int filterObject(CaptureSearchResult o) { + final String file = o.getFile(); + for(Pattern pattern : patterns) { + if(pattern.matcher(file).find()) { + return FILTER_INCLUDE; + } + } + return FILTER_EXCLUDE; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
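Both filters are plain bean-style classes, so wiring them by hand looks roughly like the sketch below; the prefix and pattern strings are invented for illustration and would normally come from AccessPoint configuration (see revision 2851):

// Include only captures whose (W)ARC file name starts with a known prefix...
FilePrefixFilter prefixFilter = new FilePrefixFilter();
prefixFilter.setPrefixes(new String[] { "IA-2009-", "IA-2008-" });

// ...or matches a regular expression.
FileRegexFilter regexFilter = new FileRegexFilter();
regexFilter.setPatterns(java.util.Arrays.asList("^IA-\\d{4}-.*\\.arc\\.gz$"));

// Each filterObject(result) call returns FILTER_INCLUDE when the capture's
// file field matches, FILTER_EXCLUDE otherwise.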
From: <bra...@us...> - 2009-10-28 00:17:15
Revision: 2851 http://archive-access.svn.sourceforge.net/archive-access/?rev=2851&view=rev Author: bradtofel Date: 2009-10-28 00:14:40 +0000 (Wed, 28 Oct 2009) Log Message: ----------- REFACTOR: major overhaul of resource index query filtering, moving much of the logic out of LocalResourceIndex into ...wayback.resourceindex.filterfactory.* Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2009-10-28 00:08:00 UTC (rev 2850) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -25,7 +25,9 @@ package org.archive.wayback.resourceindex; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import org.apache.commons.httpclient.URIException; import org.archive.wayback.ResourceIndex; @@ -41,36 +43,59 @@ import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; -import org.archive.wayback.resourceindex.adapters.ConditionalGetAnnotationSearchResultAdapter; import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultAdapter; -import org.archive.wayback.resourceindex.adapters.DeduplicationSearchResultAnnotationAdapter; -import 
org.archive.wayback.resourceindex.filters.CounterFilter; -import org.archive.wayback.resourceindex.filters.DateRangeFilter; -import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; -import org.archive.wayback.resourceindex.filters.GuardRailFilter; -import org.archive.wayback.resourceindex.filters.HostMatchFilter; -import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; -import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; -import org.archive.wayback.resourceindex.filters.UrlMatchFilter; -import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter; -import org.archive.wayback.resourceindex.filters.WindowEndFilter; -import org.archive.wayback.resourceindex.filters.WindowStartFilter; -import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup; +import org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.FilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.QueryCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.WindowFilterGroup; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.ObjectFilterChain; import org.archive.wayback.util.ObjectFilterIterator; -import org.archive.wayback.util.Timestamp; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; /** + * ResourceIndex implementation which assumes a "local" SearchResultSource. + * + * Extracting SearchResults from the source involves several layered steps: + * + * 1) extraction of results based on a prefix into the index + * 2) passing each result through a series of adapters + * these adapters can create new fields based on existing fields, or can + * annotate fields as they are scanned in order + * 3) filtering results based on request filters, which may come from + * * WaybackRequest-specific parameters. + * Ex. exact host match only, exact scheme match only, ... + * * AccessPoint-specific configuration + * Ex. only return records with (ARC/WARC) filename prefixed with XXX + * Ex. block any dates not older than 6 months + * 4) filtering based on AccessControl configurations + * Ex. block any urls with prefixes in file X + * 5) windowing filters, which provide pagination of the results, allowing + * requests to specify "show results between 10 and 20" + * 6) post filter adapters, which may annotate final results with other + * information + * Ex. for each result, consult DB to see if user-contributed messages + * apply to the results + * + * After all results have been processed, we annotate the final SearchResultS + * object with summary information about the results included. As we set up the + * chain of filters, we instrument the chain with counters that observe the + * number of results that went into, and came out of the Exclusion filters. + * + * If there were results presented to the Exclusion filter, but none were + * emitted from it, an AccessControlException is thrown. 
* - * * @author brad * @version $Date$, $Revision$ */ public class LocalResourceIndex implements ResourceIndex { + public final static int TYPE_REPLAY = 0; + public final static int TYPE_CAPTURE = 1; + public final static int TYPE_URL = 2; /** * maximum number of records to return @@ -89,25 +114,18 @@ private ObjectFilter<CaptureSearchResult> filter = null; + + List<FilterGroupFactory> fgFactories = null; + public LocalResourceIndex() { canonicalizer = new AggressiveUrlCanonicalizer(); + fgFactories = new ArrayList<FilterGroupFactory>(); + fgFactories.add(new CoreCaptureFilterGroupFactory()); + fgFactories.add(new QueryCaptureFilterGroupFactory()); + fgFactories.add(new ExclusionCaptureFilterGroupFactory()); + fgFactories.add(new AccessPointCaptureFilterGroupFactory()); } - private CloseableIterator<CaptureSearchResult> getCaptureIterator(String k) - throws ResourceIndexNotAvailableException { - - CloseableIterator<CaptureSearchResult> captures = - source.getPrefixIterator(k); - if(dedupeRecords) { - // hack hack!!! - captures = new AdaptedIterator<CaptureSearchResult, CaptureSearchResult> - (captures, new ConditionalGetAnnotationSearchResultAdapter()); - captures = new AdaptedIterator<CaptureSearchResult, CaptureSearchResult> - (captures, new DeduplicationSearchResultAnnotationAdapter()); - } - return captures; - } - private void cleanupIterator(CloseableIterator<? extends SearchResult> itr) throws ResourceIndexNotAvailableException { try { @@ -119,42 +137,64 @@ } } + private List<CaptureFilterGroup> getRequestFilterGroups(WaybackRequest r) + throws BadQueryException { + + ArrayList<CaptureFilterGroup> groups = + new ArrayList<CaptureFilterGroup>(); + for(FilterGroupFactory f : fgFactories) { + groups.add(f.getGroup(r, canonicalizer, this)); + } + return groups; + } + + public CaptureSearchResults doCaptureQuery(WaybackRequest wbRequest, int type) throws ResourceIndexNotAvailableException, ResourceNotInArchiveException, BadQueryException, AccessControlException { - - CaptureSearchResults results = new CaptureSearchResults(); - CaptureQueryFilterState filterState = - new CaptureQueryFilterState(wbRequest, canonicalizer, type, - getUserFilters(wbRequest)); - String keyUrl = filterState.getKeyUrl(); + String urlKey; + try { + urlKey = canonicalizer.urlStringToKey(wbRequest.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad URL(" + + wbRequest.getRequestUrl() + ")"); + } - CloseableIterator<CaptureSearchResult> itr = getCaptureIterator(keyUrl); - // set up the common Filters: - ObjectFilter<CaptureSearchResult> filter = filterState.getFilter(); - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,filter); + // the CaptureSearchResults we are about to return: + CaptureSearchResults results = new CaptureSearchResults(); + // the various filters to apply to the results: + ObjectFilterChain<CaptureSearchResult> filters = + new ObjectFilterChain<CaptureSearchResult>(); - // Windowing: - WindowFilterState<CaptureSearchResult> window = - new WindowFilterState<CaptureSearchResult>(wbRequest); - ObjectFilter<CaptureSearchResult> windowFilter = window.getFilter(); - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,windowFilter); - - - if(annotater != null) { - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,annotater); + // Groupings of filters for... 
sanity and summary annotation of results: + // Windows: + WindowFilterGroup<CaptureSearchResult> window = + new WindowFilterGroup<CaptureSearchResult>(wbRequest,this); + List<CaptureFilterGroup> groups = getRequestFilterGroups(wbRequest); + + for(CaptureFilterGroup cfg : groups) { + filters.addFilters(cfg.getFilters()); } + filters.addFilters(window.getFilters()); + CloseableIterator<CaptureSearchResult> itr = + new ObjectFilterIterator<CaptureSearchResult>( + source.getPrefixIterator(urlKey),filters); + while(itr.hasNext()) { results.addSearchResult(itr.next()); } - filterState.annotateResults(results); + for(CaptureFilterGroup cfg : groups) { + cfg.annotateResults(results); + } window.annotateResults(results); + cleanupIterator(itr); - return results; + + return results; } public UrlSearchResults doUrlQuery(WaybackRequest wbRequest) @@ -162,38 +202,61 @@ ResourceNotInArchiveException, BadQueryException, AccessControlException { + String urlKey; + try { + urlKey = canonicalizer.urlStringToKey(wbRequest.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad URL(" + + wbRequest.getRequestUrl() + ")"); + } + UrlSearchResults results = new UrlSearchResults(); - CaptureQueryFilterState filterState = - new CaptureQueryFilterState(wbRequest,canonicalizer, - CaptureQueryFilterState.TYPE_URL, getUserFilters(wbRequest)); - String keyUrl = filterState.getKeyUrl(); + // the various CAPTURE filters to apply to the results: + ObjectFilterChain<CaptureSearchResult> cFilters = + new ObjectFilterChain<CaptureSearchResult>(); - CloseableIterator<CaptureSearchResult> citr = getCaptureIterator(keyUrl); - // set up the common Filters: - ObjectFilter<CaptureSearchResult> filter = filterState.getFilter(); - citr = new ObjectFilterIterator<CaptureSearchResult>(citr,filter); - // adapt into UrlSearchResult: + // Groupings of filters for clarity(?) 
and summary annotation of + // results: + List<CaptureFilterGroup> groups = getRequestFilterGroups(wbRequest); + for(CaptureFilterGroup cfg : groups) { + cFilters.addFilters(cfg.getFilters()); + } - CloseableIterator<UrlSearchResult> itr = - CaptureToUrlSearchResultAdapter.adaptCaptureIterator(citr); + CloseableIterator<CaptureSearchResult> itrC = + new ObjectFilterIterator<CaptureSearchResult>( + source.getPrefixIterator(urlKey),cFilters); + + // we've filtered the appropriate CaptureResult objects within the + // iterator, now we're going to convert whatever records make it past + // the filters into UrlSearchResults, and then do further window + // filtering on those results: + // Windows: + // the window URL filters to apply to the results, once they're + // UrlSearchResult objects + ObjectFilterChain<UrlSearchResult> uFilters = + new ObjectFilterChain<UrlSearchResult>(); + WindowFilterGroup<UrlSearchResult> window = + new WindowFilterGroup<UrlSearchResult>(wbRequest,this); + uFilters.addFilters(window.getFilters()); + CloseableIterator<UrlSearchResult> itrU = + new ObjectFilterIterator<UrlSearchResult>( + CaptureToUrlSearchResultAdapter.adaptCaptureIterator(itrC), + uFilters); - // Windowing: - WindowFilterState<UrlSearchResult> window = - new WindowFilterState<UrlSearchResult>(wbRequest); - ObjectFilter<UrlSearchResult> windowFilter = window.getFilter(); - itr = new ObjectFilterIterator<UrlSearchResult>(itr,windowFilter); - - while(itr.hasNext()) { - results.addSearchResult(itr.next()); + while(itrU.hasNext()) { + results.addSearchResult(itrU.next()); } - filterState.annotateResults(results); + for(CaptureFilterGroup cfg : groups) { + cfg.annotateResults(results); + } window.annotateResults(results); - cleanupIterator(itr); - - return results; + + cleanupIterator(itrU); + + return results; } /* @@ -209,15 +272,13 @@ if (wbRequest.isReplayRequest()) { - results = doCaptureQuery(wbRequest, - CaptureQueryFilterState.TYPE_REPLAY); + results = doCaptureQuery(wbRequest, TYPE_REPLAY); results.putFilter(WaybackRequest.REQUEST_TYPE, WaybackRequest.REQUEST_REPLAY_QUERY); } else if (wbRequest.isCaptureQueryRequest()) { - results = doCaptureQuery(wbRequest, - CaptureQueryFilterState.TYPE_CAPTURE); + results = doCaptureQuery(wbRequest, TYPE_CAPTURE); results.putFilter(WaybackRequest.REQUEST_TYPE, WaybackRequest.REQUEST_CAPTURE_QUERY); @@ -259,7 +320,11 @@ public void setMaxRecords(int maxRecords) { this.maxRecords = maxRecords; } + public int getMaxRecords() { + return maxRecords; + } + /** * @param source the source to set */ @@ -302,190 +367,4 @@ public void setFilter(ObjectFilter<CaptureSearchResult> filter) { this.filter = filter; } - - public ObjectFilterChain<CaptureSearchResult> getUserFilters(WaybackRequest request) { - ObjectFilterChain<CaptureSearchResult> userFilters = - new ObjectFilterChain<CaptureSearchResult>(); - - // has the user asked for only results on the exact host specified? 
- if(request.isExactHost()) { - userFilters.addFilter(new HostMatchFilter( - UrlOperations.urlToHost(request.getRequestUrl()))); - } - - if(request.isExactScheme()) { - userFilters.addFilter(new SchemeMatchFilter( - UrlOperations.urlToScheme(request.getRequestUrl()))); - } - if(filter != null) { - userFilters.addFilter(filter); - } - - return userFilters; - } - - private class CaptureQueryFilterState { - public final static int TYPE_REPLAY = 0; - public final static int TYPE_CAPTURE = 1; - public final static int TYPE_URL = 2; - - private ObjectFilterChain<CaptureSearchResult> filter = null; - private CounterFilter finalCounter = null; - private CounterFilter preExclusionCounter = null; - private String keyUrl = null; - private String startDate; - private String endDate; - private String exactDate; - - public CaptureQueryFilterState(WaybackRequest request, - UrlCanonicalizer canonicalizer, int type, - ObjectFilterChain<CaptureSearchResult> userFilter) - throws BadQueryException { - - String searchUrl = request.getRequestUrl(); - try { - keyUrl = canonicalizer.urlStringToKey(searchUrl); - } catch (URIException e) { - throw new BadQueryException("invalid " - + WaybackRequest.REQUEST_URL + " " + searchUrl); - } - - filter = new ObjectFilterChain<CaptureSearchResult>(); - startDate = request.getStartTimestamp(); - if(startDate == null) { - startDate = Timestamp.earliestTimestamp().getDateStr(); - } - endDate = request.getEndTimestamp(); - if(endDate == null) { - endDate = Timestamp.latestTimestamp().getDateStr(); - } - if(type == TYPE_REPLAY) { - exactDate = request.getReplayTimestamp(); - if(exactDate == null) { - exactDate = Timestamp.latestTimestamp().getDateStr(); - } - } - - finalCounter = new CounterFilter(); - preExclusionCounter = new CounterFilter(); - DateRangeFilter drFilter = new DateRangeFilter(startDate,endDate); - - // checks an exclusion service for every matching record - ObjectFilter<CaptureSearchResult> exclusion = - request.getExclusionFilter(); - - - // makes sure we don't inspect too many records: prevents DOS - filter.addFilter(new GuardRailFilter(maxRecords)); - filter.addFilter(new DuplicateRecordFilter()); - - if(type == TYPE_REPLAY) { - filter.addFilter(new UrlMatchFilter(keyUrl)); - filter.addFilter(drFilter); - SelfRedirectFilter selfRedirectFilter= new SelfRedirectFilter(); - selfRedirectFilter.setCanonicalizer(canonicalizer); - filter.addFilter(selfRedirectFilter); - } else if(type == TYPE_CAPTURE){ - filter.addFilter(new UrlMatchFilter(keyUrl)); - filter.addFilter(drFilter); - } else if(type == TYPE_URL) { - filter.addFilter(new UrlPrefixMatchFilter(keyUrl)); - filter.addFilter(drFilter); - } else { - throw new BadQueryException("Unknown type"); - } - - if(userFilter != null) { - filter.addFilters(userFilter.getFilters()); - } - - // count how many results got to the ExclusionFilter: - filter.addFilter(preExclusionCounter); - - if(exclusion != null) { - filter.addFilter(exclusion); - } - - // count how many results got past the ExclusionFilter, or how - // many total matched, if there was no ExclusionFilter: - filter.addFilter(finalCounter); - } - public String getKeyUrl() { - return keyUrl; - } - public ObjectFilter<CaptureSearchResult> getFilter() { - return filter; - } - public void annotateResults(SearchResults results) - throws AccessControlException, ResourceNotInArchiveException { - - int matched = finalCounter.getNumMatched(); - if (matched == 0) { - if (preExclusionCounter != null) { - if(preExclusionCounter.getNumMatched() > 0) { - throw new 
AccessControlException("All results Excluded"); - } - } - throw new ResourceNotInArchiveException("the URL " + keyUrl - + " is not in the archive."); - } - // now we need to set some filter properties on the results: - results.putFilter(WaybackRequest.REQUEST_URL, keyUrl); - results.putFilter(WaybackRequest.REQUEST_START_DATE, startDate); - results.putFilter(WaybackRequest.REQUEST_END_DATE, endDate); - if(exactDate != null) { - results.putFilter(WaybackRequest.REQUEST_EXACT_DATE, exactDate); - } - } - } - - private class WindowFilterState<T> { - int startResult; // calculated based on hits/page * pagenum - int resultsPerPage; - int pageNum; - ObjectFilterChain<T> windowFilters; - WindowStartFilter<T> startFilter; - WindowEndFilter<T> endFilter; - public WindowFilterState(WaybackRequest request) - throws BadQueryException { - - windowFilters = new ObjectFilterChain<T>(); - // first grab all the info from the WaybackRequest, and validate it: - resultsPerPage = request.getResultsPerPage(); - pageNum = request.getPageNum(); - - if (resultsPerPage < 1) { - throw new BadQueryException("resultsPerPage cannot be < 1"); - } - if (resultsPerPage > maxRecords) { - throw new BadQueryException("resultsPerPage cannot be > " - + maxRecords); - } - if (pageNum < 1) { - throw new BadQueryException("pageNum must be > 0"); - } - startResult = (pageNum - 1) * resultsPerPage; - startFilter = new WindowStartFilter<T>(startResult); - endFilter = new WindowEndFilter<T>(resultsPerPage); - windowFilters.addFilter(startFilter); - windowFilters.addFilter(endFilter); - } - public ObjectFilter<T> getFilter() { - return windowFilters; - } - public void annotateResults(SearchResults results) - throws BadQueryException { - results.setFirstReturned(startResult); - results.setNumRequested(resultsPerPage); - int numSeen = endFilter.getNumSeen(); - if(numSeen == 0) { - throw new BadQueryException("No results in requested window"); - } - // how many went by the filters: - results.setMatchingCount(startFilter.getNumSeen()); - - // how many were actually returned: - results.setReturnedCount(endFilter.getNumReturned()); - } - } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,48 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.filters.FilePrefixFilter; +import org.archive.wayback.resourceindex.filters.FileRegexFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class AccessPointCaptureFilterGroup implements CaptureFilterGroup { + private ObjectFilterChain<CaptureSearchResult> chain = null; + private final static String[] sA = 
null; + + public AccessPointCaptureFilterGroup(WaybackRequest request) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + List<String> prefixes = null; + if(request.getAccessPoint() != null) { + prefixes = request.getAccessPoint().getFilePrefixes(); + if(prefixes != null && prefixes.size() > 0) { + FilePrefixFilter f = new FilePrefixFilter(); + f.setPrefixes(prefixes.toArray(sA)); + chain.addFilter(f); + } + List<String> patterns = request.getAccessPoint().getFilePatterns(); + if(patterns != null && patterns.size() > 0) { + FileRegexFilter f = new FileRegexFilter(); + f.setPatterns(patterns); + chain.addFilter(f); + } + } + } + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException { + + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class AccessPointCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new AccessPointCaptureFilterGroup(request); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,18 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.util.ObjectFilter; + +public interface CaptureFilterGroup { + public List<ObjectFilter<CaptureSearchResult>> getFilters(); + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,35 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.resourceindex.LocalResourceIndex; +import org.archive.wayback.resourceindex.filters.ConditionalGetAnnotationFilter; +import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; +import org.archive.wayback.resourceindex.filters.GuardRailFilter; +import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class CoreCaptureFilterGroup implements CaptureFilterGroup { + private ObjectFilterChain<CaptureSearchResult> chain = null; + + public CoreCaptureFilterGroup(LocalResourceIndex index) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + chain.addFilter(new GuardRailFilter(index.getMaxRecords())); + chain.addFilter(new DuplicateRecordFilter()); + if(index.isDedupeRecords()) { + chain.addFilter(new WARCRevisitAnnotationFilter()); + chain.addFilter(new ConditionalGetAnnotationFilter()); + } + } + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) { + // TODO: ask guardRailFilter if it aborted processing (too many records) + // and annotate the results with info about how to continue the request? 
+ } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class CoreCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new CoreCaptureFilterGroup(index); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,61 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.filters.CounterFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class ExclusionCaptureFilterGroup implements CaptureFilterGroup { + + private ObjectFilterChain<CaptureSearchResult> chain = null; + private CounterFilter preCounter = null; + private CounterFilter postCounter = null; + String requestUrl = null; + + public ExclusionCaptureFilterGroup(WaybackRequest request) { + + // checks an exclusion service for every matching record + ObjectFilter<CaptureSearchResult> exclusion = + request.getExclusionFilter(); + chain = new ObjectFilterChain<CaptureSearchResult>(); + if(exclusion != null) { + preCounter = new CounterFilter(); + // count how many results got to the ExclusionFilter: + chain.addFilter(preCounter); + chain.addFilter(exclusion); + // count how many results got past the ExclusionFilter: + requestUrl = request.getRequestUrl(); + } + postCounter = new CounterFilter(); + chain.addFilter(postCounter); + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) + throws AccessControlException, ResourceNotInArchiveException { + if(postCounter.getNumMatched() == 0) { + + // nothing got to the counter after exclusions. 
If we have + // exclusions (detected by preCounter being non-null, and the + // preCounter passed any results, then they were all filtered by + // the exclusions filter. + if(preCounter != null && preCounter.getNumMatched() > 0) { + throw new AccessControlException("All results Excluded"); + } + ResourceNotInArchiveException e = + new ResourceNotInArchiveException("the URL " + requestUrl + + " is not in the archive."); + e.setCloseMatches(results.getCloseMatches()); + throw e; + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class ExclusionCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new ExclusionCaptureFilterGroup(request); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,12 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public interface FilterGroupFactory { + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,120 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.httpclient.URIException; +import org.archive.wayback.UrlCanonicalizer; +import 
org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.filters.DateRangeFilter; +import org.archive.wayback.resourceindex.filters.HostMatchFilter; +import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; +import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; +import org.archive.wayback.resourceindex.filters.UrlMatchFilter; +import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; +import org.archive.wayback.util.Timestamp; +import org.archive.wayback.util.url.UrlOperations; + +public class QueryCaptureFilterGroup implements CaptureFilterGroup { +// private ObjectFilter<CaptureSearchResult> prefixFilter = null; +// private ObjectFilter<CaptureSearchResult> dateFilter = null; +// private ObjectFilter<CaptureSearchResult> selfRedirectFilter = null; +// private ObjectFilter<CaptureSearchResult> exactHost = null; +// private ObjectFilter<CaptureSearchResult> exactScheme = null; + private ObjectFilterChain<CaptureSearchResult> chain = null; + private String requestType = null; + private String keyUrl = null; + private String startDate; + private String endDate; + private String exactDate; + /** + * List of URL Strings that are "close" to the current request, but not + * included in the current CaptureSearchResults. + */ + private Map<String,String> closeMatches = new HashMap<String,String>(); + + + public QueryCaptureFilterGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer) + throws BadQueryException { + + requestType = request.get(WaybackRequest.REQUEST_TYPE); + + // URL-Filters: + chain = new ObjectFilterChain<CaptureSearchResult>(); + try { + keyUrl = canonicalizer.urlStringToKey(request.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad request URL(" + + request.getRequestUrl() +")"); + } + if(request.isReplayRequest()) { + exactDate = request.getReplayTimestamp(); + if(exactDate == null) { + exactDate = Timestamp.latestTimestamp().getDateStr(); + } + chain.addFilter(new UrlMatchFilter(keyUrl)); + chain.addFilter(new SelfRedirectFilter(canonicalizer)); + + } else if(request.isCaptureQueryRequest()) { + chain.addFilter(new UrlMatchFilter(keyUrl)); + } else if(request.isUrlQueryRequest()) { + chain.addFilter(new UrlPrefixMatchFilter(keyUrl)); + } + + // Date-Filters: + startDate = request.getStartTimestamp(); + if(startDate == null) { + startDate = Timestamp.earliestTimestamp().getDateStr(); + } + endDate = request.getEndTimestamp(); + if(endDate == null) { + endDate = Timestamp.latestTimestamp().getDateStr(); + } + chain.addFilter(new DateRangeFilter(startDate, endDate)); + + // Other Filters: + if(request.isExactHost()) { + chain.addFilter( + new HostMatchFilter( + UrlOperations.urlToHost(request.getRequestUrl()), + this) + ); + } + + if(request.isExactScheme()) { + chain.addFilter(new SchemeMatchFilter( + UrlOperations.urlToScheme(request.getRequestUrl()),this)); + } + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) { + + // set the filter properties on the results: + results.putFilter(WaybackRequest.REQUEST_URL, keyUrl); + results.putFilter(WaybackRequest.REQUEST_START_DATE, startDate); + 
results.putFilter(WaybackRequest.REQUEST_END_DATE, endDate); + if(exactDate != null) { + results.putFilter(WaybackRequest.REQUEST_EXACT_DATE, exactDate); + } + results.putFilter(WaybackRequest.REQUEST_TYPE, requestType); + if(!closeMatches.isEmpty()) { + results.setCloseMatches(new ArrayList<String>(closeMatches.values())); + } + } + + public void addCloseMatch(String host, String closeMatch) { + closeMatches.put(host, closeMatch); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class QueryCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new QueryCaptureFilterGroup(request,canonicalizer); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,63 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; +import org.archive.wayback.resourceindex.filters.WindowEndFilter; +import org.archive.wayback.resourceindex.filters.WindowStartFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class WindowFilterGroup<T> { + int startResult; // calculated based on hits/page * pagenum + int resultsPerPage; + int pageNum; + ObjectFilterChain<T> windowFilters; + WindowStartFilter<T> startFilter; + WindowEndFilter<T> endFilter; + public WindowFilterGroup(WaybackRequest request, LocalResourceIndex index) + throws BadQueryException { + + windowFilters = new ObjectFilterChain<T>(); + // first grab all the info from the WaybackRequest, and validate it: + resultsPerPage = request.getResultsPerPage(); + pageNum = request.getPageNum(); + + if (resultsPerPage < 1) { + throw new BadQueryException("resultsPerPage cannot be < 1"); + } + if (resultsPerPage > index.getMaxRecords()) { + throw new BadQueryException("resultsPerPage cannot be > " + + index.getMaxRecords()); + } + if (pageNum < 1) { + throw new BadQueryException("pageNum must be > 
0"); + } + startResult = (pageNum - 1) * resultsPerPage; + startFilter = new WindowStartFilter<T>(startResult); + endFilter = new WindowEndFilter<T>(resultsPerPage); + windowFilters.addFilter(startFilter); + windowFilters.addFilter(endFilter); + } + public List<ObjectFilter<T>> getFilters() { + return windowFilters.getFilters(); + } + + public void annotateResults(SearchResults results) + throws BadQueryException { + results.setFirstReturned(startResult); + results.setNumRequested(resultsPerPage); + int numSeen = endFilter.getNumSeen(); + if(numSeen == 0) { + throw new BadQueryException("No results in requested window"); + } + // how many went by the filters: + results.setMatchingCount(startFilter.getNumSeen()); + + // how many were actually returned: + results.setReturnedCount(endFilter.getNumReturned()); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-28 00:08:09
|
Revision: 2850 http://archive-access.svn.sourceforge.net/archive-access/?rev=2850&view=rev Author: bradtofel Date: 2009-10-28 00:08:00 +0000 (Wed, 28 Oct 2009) Log Message: ----------- BUGFIX(unreported): test for null before attempting to decode String into long Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java 2009-10-28 00:06:30 UTC (rev 2849) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java 2009-10-28 00:08:00 UTC (rev 2850) @@ -213,7 +213,8 @@ } public long getEndOffset() { if(cachedEndOffset == -1) { - cachedEndOffset = Long.parseLong(get(CAPTURE_END_OFFSET)); + String tmp = get(CAPTURE_END_OFFSET); + cachedEndOffset = tmp == null ? -1 : Long.parseLong(tmp); } return cachedEndOffset; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
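A small illustration of the null-guard pattern the fix above introduces: the lazily parsed offset stays at its -1 sentinel when the underlying field is absent, instead of letting Long.parseLong fail. The LazyLongField class and the "endoffset" key are hypothetical stand-ins for CaptureSearchResult and CAPTURE_END_OFFSET.

// Hypothetical illustration of the lazy parse-with-null-guard pattern.
import java.util.HashMap;
import java.util.Map;

public class LazyLongField {
  private final Map<String, String> data = new HashMap<String, String>();
  private long cachedEndOffset = -1;

  public long getEndOffset() {
    if (cachedEndOffset == -1) {
      String tmp = data.get("endoffset");                        // field may legitimately be absent
      cachedEndOffset = tmp == null ? -1 : Long.parseLong(tmp);  // avoids the NumberFormatException parseLong(null) would throw
    }
    return cachedEndOffset;
  }

  public static void main(String[] args) {
    System.out.println(new LazyLongField().getEndOffset());      // prints -1 rather than throwing
  }
}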
Revision: 2849 http://archive-access.svn.sourceforge.net/archive-access/?rev=2849&view=rev Author: bradtofel Date: 2009-10-28 00:06:30 +0000 (Wed, 28 Oct 2009) Log Message: ----------- FEATURE: Added parsing of ArchivalURL requests within proxy mode, with the main benefit of being able to include a date in the incoming requests. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyArchivalRequestParser.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyArchivalRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyArchivalRequestParser.java 2009-10-28 00:04:43 UTC (rev 2848) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyArchivalRequestParser.java 2009-10-28 00:06:30 UTC (rev 2849) @@ -1,3 +1,27 @@ +/* ProxyArchivalRequestParser + * + * $Id$ + * + * Created on 4:01:04 PM Apr 6, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ package org.archive.wayback.proxy; import java.util.List; @@ -2,2 +26,4 @@ +import javax.servlet.http.HttpServletRequest; + import org.archive.wayback.RequestParser; @@ -7,10 +33,24 @@ import org.archive.wayback.archivalurl.requestparser.PathDateRangeQueryRequestParser; import org.archive.wayback.archivalurl.requestparser.PathPrefixDatePrefixQueryRequestParser; import org.archive.wayback.archivalurl.requestparser.PathPrefixDateRangeQueryRequestParser; +import org.archive.wayback.archivalurl.requestparser.ReplayRequestParser; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.BetterRequestException; +import org.archive.wayback.requestparser.CompositeRequestParser; import org.archive.wayback.requestparser.FormRequestParser; import org.archive.wayback.requestparser.OpenSearchRequestParser; +import org.archive.wayback.util.bdb.BDBMap; +import org.archive.wayback.webapp.AccessPoint; -public class ProxyArchivalRequestParser extends ProxyRequestParser { +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class ProxyArchivalRequestParser extends CompositeRequestParser { private ProxyReplayRequestParser prrp = new ProxyReplayRequestParser(this); protected RequestParser[] getRequestParsers() { prrp.init(); @@ -20,6 +60,7 @@ new PathDateRangeQueryRequestParser(this), new PathPrefixDatePrefixQueryRequestParser(this), new PathPrefixDateRangeQueryRequestParser(this), + new ReplayRequestParser(this), new OpenSearchRequestParser(this), new FormRequestParser(this) }; @@ -31,4 +72,37 @@ public void setLocalhostNames(List<String> 
localhostNames) { prrp.setLocalhostNames(localhostNames); } + + public WaybackRequest parse(HttpServletRequest httpRequest, + AccessPoint wbContext) throws BadQueryException, BetterRequestException { + + WaybackRequest wbRequest = super.parse(httpRequest, wbContext); + if (wbRequest != null) { + String id = httpRequest.getHeader("Proxy-Id"); + if (id == null) + id = httpRequest.getRemoteAddr(); + + // Get the id from the request. + // If no id, use the ip-address instead. + // Check if the parser parsed a replay request and found a + // timestamp. If so, then we need to store the timestamp and + // redirect, which is done with a BetterRequestException: + if(wbRequest.isReplayRequest()) { + String replayTimestamp = wbRequest.getReplayTimestamp(); + if(replayTimestamp != null) { + BDBMap.addTimestampForId(httpRequest.getContextPath(), + id, replayTimestamp); + } + throw new BetterRequestException(wbRequest.getRequestUrl()); + } + + // Then get the timestamp (or rather datestr) matching this id. + // TODO: This is hacky - need generic way to store session data + String replayDateStr = BDBMap.getTimestampForId( + httpRequest.getContextPath(), id); + wbRequest.setReplayTimestamp(replayDateStr); + wbRequest.setAnchorTimestamp(replayDateStr); + } + return wbRequest; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
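A rough sketch of the session idea behind the proxy change above: a replay request that carries a timestamp records it against the client's Proxy-Id (or IP address), and later requests from the same client are anchored to that date. A plain HashMap stands in for BDBMap here; the class name, ids, and timestamps are illustrative only.

// Sketch of per-client timestamp anchoring; a HashMap stands in for BDBMap.
import java.util.HashMap;
import java.util.Map;

public class ProxyTimestampAnchor {
  private final Map<String, String> timestampById = new HashMap<String, String>();

  // A replay request that carries a timestamp stores it under the client's id ...
  public void rememberTimestamp(String proxyId, String timestamp14) {
    timestampById.put(proxyId, timestamp14);
  }

  // ... and later proxied requests from the same client are anchored to that date.
  public String timestampFor(String proxyId, String fallback) {
    String ts = timestampById.get(proxyId);
    return ts != null ? ts : fallback;
  }

  public static void main(String[] args) {
    ProxyTimestampAnchor anchor = new ProxyTimestampAnchor();
    anchor.rememberTimestamp("10.0.0.7", "20091028000000");
    System.out.println(anchor.timestampFor("10.0.0.7", "latest")); // prints 20091028000000
  }
}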
From: <bi...@us...> - 2009-10-28 00:04:54
|
Revision: 2848 http://archive-access.svn.sourceforge.net/archive-access/?rev=2848&view=rev Author: binzino Date: 2009-10-28 00:04:43 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Ported fixes and updates from NW 0.12.9. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax Modified: trunk/archive-access/projects/nutchwax/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2009-10-28 00:04:14 UTC (rev 2847) +++ trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2009-10-28 00:04:43 UTC (rev 2848) @@ -40,32 +40,48 @@ case "$1" in import) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Importer $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Importer "$@" ;; pagerankdb) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.PageRankDb $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.PageRankDb "$@" ;; pagerankdbmerger) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.PageRankDbMerger $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.PageRankDbMerger "$@" ;; + pageranker) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.PageRanker "$@" + ;; + parsetextmerger) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.ParseTextCombiner "$@" + ;; add-dates) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DateAdder $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DateAdder "$@" ;; - dumpindex) + index) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpParallelIndex $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Indexer "$@" ;; - pageranker) + merge) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.PageRanker $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.IndexMerger "$@" ;; - parsetextmerger) + reboost) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.ParseTextCombiner $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.LengthNormUpdater "$@" ;; + dumpindex) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpParallelIndex "$@" + ;; + search) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.NutchWaxBean "$@" + ;; *) echo "" echo "Usage: nutchwax COMMAND" @@ -76,7 +92,11 @@ echo " pageranker Generate pagerank.txt file from 'pagerankdb's or 'linkdb's" echo " parsetextmerger Merge segement parse_text/part-nnnnn directories." echo " add-dates Add dates to a parallel index" + echo " index Build Lucene index from segment(s) without crawl & linkdbs" + echo " merge Merge indexes or parallel indexes" + echo " reboost Update document boosts based on pagerank info" echo " dumpindex Dump an index or set of parallel indices to stdout" + echo " search Query a search index" echo "" exit 1 ;; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 00:04:23
|
Revision: 2847 http://archive-access.svn.sourceforge.net/archive-access/?rev=2847&view=rev Author: binzino Date: 2009-10-28 00:04:14 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Obsoleted by DumpParallelIndex.java. Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java 2009-10-28 00:02:34 UTC (rev 2846) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java 2009-10-28 00:04:14 UTC (rev 2847) @@ -1,105 +0,0 @@ -/* - * Copyright (C) 2008 Internet Archive. - * - * This file is part of the archive-access tools project - * (http://sourceforge.net/projects/archive-access). - * - * The archive-access tools are free software; you can redistribute them and/or - * modify them under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or any - * later version. - * - * The archive-access tools are distributed in the hope that they will be - * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser - * Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License along with - * the archive-access tools; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.nutchwax.tools; - -import java.io.File; -import java.util.Iterator; - -import org.apache.lucene.index.IndexReader; - -public class DumpIndex -{ - public static void main(String[] args) throws Exception - { - String option = ""; - String indexDir = ""; - - if (args.length == 1) - { - indexDir = args[0]; - } - else if (args.length == 2) - { - option = args[0]; - indexDir = args[1]; - } - - if (! (new File(indexDir)).exists()) - { - usageAndExit(); - } - - if (option.equals("-f")) - { - listFields(indexDir); - } - else - { - dumpIndex(indexDir); - } - } - - private static void dumpIndex(String indexDir) throws Exception - { - IndexReader reader = IndexReader.open(indexDir); - - Object[] fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL).toArray(); - - for (int i = 0; i < fieldNames.length; i++) - { - System.out.print(fieldNames[i] + "\t"); - } - - System.out.println(); - - int numDocs = reader.numDocs(); - - for (int i = 0; i < numDocs; i++) - { - for (int j = 0; j < fieldNames.length; j++) - { - System.out.print(reader.document(i).get((String) fieldNames[j]) + "\t"); - } - - System.out.println(); - } - } - - private static void listFields(String indexDir) throws Exception - { - IndexReader reader = IndexReader.open(indexDir); - - Iterator it = reader.getFieldNames(IndexReader.FieldOption.ALL).iterator(); - - while (it.hasNext()) - { - System.out.println(it.next()); - } - - reader.close(); - } - - private static void usageAndExit() - { - System.out.println("Usage: DumpIndex [-f] index"); - System.exit(1); - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 00:02:45
|
Revision: 2846 http://archive-access.svn.sourceforge.net/archive-access/?rev=2846&view=rev Author: binzino Date: 2009-10-28 00:02:34 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Ported from NW 0.12.9. Factored into two classes to match the refactoring that occurred in Nutch 1.0. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Indexer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Indexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Indexer.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Indexer.java 2009-10-28 00:02:34 UTC (rev 2846) @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.indexer.lucene.LuceneWriter; +import org.apache.nutch.indexer.NutchIndexWriterFactory; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +/** Create indexes for segments. 
*/ +public class Indexer extends Configured implements Tool { + + public static final String DONE_NAME = "index.done"; + + public static final Log LOG = LogFactory.getLog(Indexer.class); + + public Indexer() { + super(null); + } + + public Indexer(Configuration conf) { + super(conf); + } + + public void index(Path luceneDir, List<Path> segments) + throws IOException { + LOG.info("Indexer: starting"); + + final JobConf job = new NutchJob(getConf()); + job.setJobName("index-lucene " + luceneDir); + + IndexerMapReduce.initMRJob(segments, job); + + FileOutputFormat.setOutputPath(job, luceneDir); + + LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job); + LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job); + + NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class); + + JobClient.runJob(job); + LOG.info("Indexer: done"); + } + + public int run(String[] args) throws Exception { + if (args.length < 2) { + System.err.println("Usage: Indexer <index> <segment> ..."); + return -1; + } + + final Path luceneDir = new Path(args[0]); + + final List<Path> segments = new ArrayList<Path>(); + for (int i = 1; i < args.length; i++) { + segments.add(new Path(args[i])); + } + + try { + index(luceneDir, segments); + return 0; + } catch (final Exception e) { + LOG.fatal("Indexer: " + StringUtils.stringifyException(e)); + return -1; + } + } + + public static void main(String[] args) throws Exception { + final int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args); + System.exit(res); + } +} Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2009-10-28 00:02:34 UTC (rev 2846) @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.indexer.IndexerOutputFormat; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilters; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseText; + +public class IndexerMapReduce extends Configured +implements Mapper<Text, Writable, Text, NutchWritable>, + Reducer<Text, NutchWritable, Text, NutchDocument> { + + public static final Log LOG = LogFactory.getLog(IndexerMapReduce.class); + + private IndexingFilters filters; + + public void configure(JobConf job) { + setConf(job); + this.filters = new IndexingFilters(getConf()); + } + + public void map(Text key, Writable value, + OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { + output.collect(key, new NutchWritable(value)); + } + + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, NutchDocument> output, Reporter reporter) + throws IOException { + ParseData parseData = null; + ParseText parseText = null; + while (values.hasNext()) { + final Writable value = values.next().get(); // unwrap + + if (value instanceof ParseData) { + parseData = (ParseData)value; + } else if (value instanceof ParseText) { + parseText = (ParseText)value; + } else if (LOG.isWarnEnabled()) { + LOG.warn("Unrecognized type: "+value.getClass()); + } + } + + if ( parseText == null || parseData == null ) { + return; + } + + NutchDocument doc = new NutchDocument(); + final Metadata metadata = parseData.getContentMeta(); + + if ( metadata.get(Nutch.SEGMENT_NAME_KEY) == null || + metadata.get(Nutch.SIGNATURE_KEY) == null ) + { + LOG.warn( "Skipping document, insufficient metadata: key=" + key + " metadata=" + metadata ); + return ; + } + + // add segment, used to map from merged index back to segment files + doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY)); + + // add digest, used by dedup + doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY)); + + final Parse parse = new ParseImpl(parseText, parseData); + try { + // run indexing filters + doc = this.filters.filter(doc, parse, key, /*fetchDatum*/ null, /*inlinks*/ null); + } catch (final IndexingException e) { + if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); } + return; + } + + // skip documents discarded by indexing filters + if (doc == null) return; + + output.collect(key, doc); + } + + public void close() throws IOException { } + + public static void initMRJob(Collection<Path> segments, + JobConf job) { + + for (final Path segment : segments) { + LOG.info("IndexerMapReduces: adding segment: " + 
segment); + FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME)); + FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME)); + } + + job.setInputFormat(SequenceFileInputFormat.class); + + job.setMapperClass(IndexerMapReduce.class); + job.setReducerClass(IndexerMapReduce.class); + + job.setOutputFormat(IndexerOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setMapOutputValueClass(NutchWritable.class); + job.setOutputValueClass(NutchWritable.class); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
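Since the new Indexer is a standard Hadoop Tool, it can also be driven programmatically instead of through the nutchwax wrapper script. A minimal sketch, with hypothetical index and segment paths; it assumes the NutchWAX and Hadoop jars are on the classpath.

// Hypothetical driver for the NutchWAX Indexer added above.
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.archive.nutchwax.Indexer;

public class RunIndexer {
  public static void main(String[] args) throws Exception {
    // Equivalent to: nutchwax index crawl/index-new crawl/segments/20091028000000
    String[] indexerArgs = { "crawl/index-new", "crawl/segments/20091028000000" };
    int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), indexerArgs);
    System.exit(res);
  }
}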
From: <bi...@us...> - 2009-10-28 00:01:20
|
Revision: 2845 http://archive-access.svn.sourceforge.net/archive-access/?rev=2845&view=rev Author: binzino Date: 2009-10-28 00:01:06 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Ported from NW 0.12.9. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java 2009-10-28 00:01:06 UTC (rev 2845) @@ -0,0 +1,333 @@ +package org.archive.nutchwax.tools; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.Collection; +import java.util.HashSet; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; + + +import org.apache.nutch.indexer.NutchSimilarity; + +/** + * This is heavily cribbed from org.apache.lucene.misc.LengthNormModifier + */ +public class LengthNormUpdater +{ + private static final String USAGE = + "Usage: LengthNormUpdater [OPTIONS] <pageranks> <index> [field1]...\n" + + "\n" + + "Update the norms of <index> with boosts based on values from <pageranks>\n" + + "\n" + + "Options:\n" + + "\t-s <classname> similarity implementation to use\n" + + "\t-v increase verbosity\n" + + "\n" + + "Reads the pagerank values from the <pageranks> file and calculates new\n" + + "norms for the documents based on the formula:\n" + + "\n" + + "\tnorm = similarity.lengthNorm * log10(pagerank)\n" + + "\n" + + "If fields are specified on the command-line, only they will be updated.\n" + + "If a specified field does not have norms, an error message is given and\n" + + "the program terminates without performing any updates.\n" + + "\n" + + "If no fields are given, all the fields in the index that have norms will\n" + + "be updated.\n" + + "\n" + + "The default similarity implementation is NutchSimilarity\n" + + "\n" + + "Examples:\n" + + "\n" + + "\tLengthNormUpdater pagerank.txt index\n" + + "\tLengthNormUpdater -v -v pagerank.txt index title content\n" + + "\n" + ; + + private static int VERBOSE = 0; + + /** + * + */ + public static void main( String[] args ) throws IOException + { + if ( args.length < 1 ) + { + System.err.print( USAGE ); + System.exit(1); + } + + 
Similarity s = new NutchSimilarity( ); + + int pos = 0; + for ( ; (pos < args.length) && args[pos].startsWith( "-" ) ; pos++ ) + { + if ( "-h".equals( args[pos] ) ) + { + System.out.println( USAGE ); + System.exit( 0 ); + } + else if ( "-v".equals( args[pos] ) ) + { + VERBOSE++; + } + else if ( "-s".equals( args[pos] ) ) + { + pos++; + + if ( pos == args.length ) + { + System.err.println( "Error: missing argument to option -s" ); + System.exit( 1 ); + } + + try + { + Class simClass = Class.forName(args[pos]); + s = (Similarity)simClass.newInstance(); + } + catch (Exception e) + { + System.err.println( "Couldn't instantiate similarity with empty constructor: " + args[pos] ); + e.printStackTrace(System.err); + System.exit( 1 ); + } + } + } + + if ( (pos + 2) > args.length ) + { + System.out.println( USAGE ); + System.exit( 1 ); + } + + String pagerankFile = args[pos++]; + + IndexReader reader = IndexReader.open( args[pos++] ); + + try + { + Set<String> fieldNames = new HashSet<String>( ); + if ( pos == args.length ) + { + // No fields specified on command-line, get a list of all + // fields in the index that have norms. + for ( String fieldName : (Collection<String>) reader.getFieldNames( IndexReader.FieldOption.ALL ) ) + { + if ( reader.hasNorms( fieldName ) ) + { + fieldNames.add( fieldName ); + } + } + } + else + { + // Verify all explicitly specified fields have norms. + for ( int i = pos ; i < args.length ; i++ ) + { + if ( ! reader.hasNorms( args[i] ) ) + { + System.err.println( "Error: No norms for field: " + args[i] ); + System.exit( 1 ); + } + + fieldNames.add( args[i] ); + } + } + + if ( fieldNames.isEmpty( ) ) + { + System.out.println( "Warning: No fields with norms to update" ); + System.exit( 0 ); + } + + Map<String,Integer> ranks = getPageRanks( pagerankFile ); + + for ( String fieldName : fieldNames ) + { + reSetNorms( reader, fieldName, ranks, s ); + } + + } + finally + { + if ( reader != null ) + { + reader.close( ); + } + + } + } + + + /** + * + */ + public static void reSetNorms( IndexReader reader, + String fieldName, + Map<String,Integer> ranks, + Similarity sim ) throws IOException + { + if ( VERBOSE > 0 ) System.out.println( "Updating field: " + fieldName ); + + int[] termCounts = new int[0]; + + TermEnum termEnum = null; + TermDocs termDocs = null; + + termCounts = new int[reader.maxDoc()]; + try + { + termEnum = reader.terms(new Term(fieldName,"")); + try + { + termDocs = reader.termDocs(); + do + { + Term term = termEnum.term(); + if (term != null && term.field().equals(fieldName)) + { + termDocs.seek(termEnum.term()); + while (termDocs.next()) + { + termCounts[termDocs.doc()] += termDocs.freq(); + } + } + } + while (termEnum.next()); + } + finally + { + if (null != termDocs) termDocs.close(); + } + } + finally + { + if (null != termEnum) termEnum.close(); + } + + for (int d = 0; d < termCounts.length; d++) + { + if ( ! 
reader.isDeleted(d) ) + { + Document doc = reader.document( d ); + + String url = doc.get( "url" ); + + if ( url != null ) + { + Integer rank = ranks.get( url ); + if ( rank == null ) continue; + + float originalNorm = sim.lengthNorm(fieldName, termCounts[d]); + byte encodedOrig = sim.encodeNorm(originalNorm); + float rankedNorm = originalNorm * (float) ( Math.log10( rank ) + 1 ); + byte encodedRank = sim.encodeNorm(rankedNorm); + + if ( VERBOSE > 1 ) System.out.println( fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t" + rankedNorm + "\t" + encodedRank ); + + reader.setNorm(d, fieldName, encodedRank); + } + } + } + } + + /** + * Utility function to read a list of page-rank records from a file + * specified in the configuration. + */ + public static Map<String,Integer> getPageRanks( String filename ) + { + if ( VERBOSE > 0 ) System.out.println( "Reading pageranks from: " + filename ); + + Map<String,Integer> pageranks = new HashMap<String,Integer>( ); + + BufferedReader reader = null; + try + { + reader = new BufferedReader( new InputStreamReader( new FileInputStream( filename), "UTF-8" ) ); + + String line; + while ( (line = reader.readLine()) != null ) + { + String fields[] = line.split( "\\s+" ); + + if ( fields.length < 2 ) + { + System.err.println( "Malformed pagerank, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + try + { + int rank = Integer.parseInt( fields[0] ); + String url = fields[1]; + + if ( rank < 0 ) + { + System.err.println( "Malformed pagerank, rank less than 0: " + line ); + } + + pageranks.put( url, rank ); + } + catch ( NumberFormatException nfe ) + { + System.err.println( "Malformed pagerank, rank not an integer: " + line ); + continue ; + } + } + } + catch ( IOException e ) + { + // Umm, what to do? + throw new RuntimeException( e ); + } + finally + { + try + { + if ( reader != null ) + { + reader.close( ); + } + } + catch ( IOException e ) + { + // Ignore it. + } + } + + return pageranks; + } + + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
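A worked illustration of the boost formula LengthNormUpdater applies above, rankedNorm = originalNorm * (log10(rank) + 1): rank 1 leaves the norm unchanged, and every additional factor of ten in the pagerank adds one more multiple of the original norm. The 0.5 starting norm is an arbitrary example value.

// Worked illustration of the pagerank boost applied to length norms above.
public class RankBoostDemo {
  public static void main(String[] args) {
    float originalNorm = 0.5f;                 // hypothetical lengthNorm for some field/document
    int[] ranks = { 1, 10, 100, 1000 };
    for (int rank : ranks) {
      double boost = Math.log10(rank) + 1;     // 1, 2, 3, 4 for the ranks above
      float rankedNorm = originalNorm * (float) boost;
      System.out.println("rank=" + rank + " boost=" + boost + " norm=" + rankedNorm);
    }
  }
}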
From: <bi...@us...> - 2009-10-27 23:17:30
|
Revision: 2844 http://archive-access.svn.sourceforge.net/archive-access/?rev=2844&view=rev Author: binzino Date: 2009-10-27 23:17:15 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Also added use of Java generics to avoid type-casts. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java 2009-10-27 23:00:46 UTC (rev 2843) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java 2009-10-27 23:17:15 UTC (rev 2844) @@ -53,7 +53,7 @@ */ public class OpenSearchServlet extends HttpServlet { - private static final Map NS_MAP = new HashMap(); + private static final Map<String,String> NS_MAP = new HashMap<String,String>(); private int MAX_HITS_PER_PAGE; static { @@ -61,7 +61,7 @@ NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/"); } - private static final Set SKIP_DETAILS = new HashSet(); + private static final Set<String> SKIP_DETAILS = new HashSet<String>(); static { SKIP_DETAILS.add("url"); // redundant with RSS link SKIP_DETAILS.add("title"); // redundant with RSS title @@ -92,9 +92,8 @@ // get parameters from request request.setCharacterEncoding("UTF-8"); String queryString = request.getParameter("query"); - if (queryString == null) - queryString = ""; - String urlQuery = URLEncoder.encode(queryString, "UTF-8"); + if (queryString == null) queryString = ""; + //String urlQuery = URLEncoder.encode(queryString, "UTF-8"); // the query language String queryLang = request.getParameter("lang"); @@ -133,12 +132,6 @@ } } - // Make up query string for use later drawing the 'rss' logo. - String params = "&hitsPerPage=" + hitsPerPage + - (queryLang == null ? "" : "&lang=" + queryLang) + - (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") + - (dedupField == null ? 
"" : "&dedupField=" + dedupField)); - Query query = Query.parse(queryString, queryLang, this.conf); if (NutchBean.LOG.isInfoEnabled()) { NutchBean.LOG.info("query: " + queryString); @@ -183,9 +176,6 @@ HitDetails[] details = bean.getDetails(show); Summary[] summaries = bean.getSummary(details, query); - String requestUrl = request.getRequestURL().toString(); - String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); - try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); @@ -194,20 +184,14 @@ Element rss = addNode(doc, doc, "rss"); addAttribute(doc, rss, "version", "2.0"); addAttribute(doc, rss, "xmlns:opensearch", - (String)NS_MAP.get("opensearch")); - addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch")); + NS_MAP.get("opensearch")); + addAttribute(doc, rss, "xmlns:nutch", NS_MAP.get("nutch")); Element channel = addNode(doc, rss, "channel"); addNode(doc, channel, "title", "Nutch: " + queryString); - addNode(doc, channel, "description", "Nutch search results for query: " - + queryString); - addNode(doc, channel, "link", - base+"/search.jsp" - +"?query="+urlQuery - +"&start="+start - +"&hitsPerDup="+hitsPerDup - +params); + addNode(doc, channel, "description", "Nutch search results for query: " + queryString); + addNode(doc, channel, "link", "" ); addNode(doc, channel, "opensearch", "totalResults", ""+totalResults); addNode(doc, channel, "opensearch", "startIndex", ""+start); @@ -217,7 +201,7 @@ addNode(doc, channel, "nutch", "responseTime", Double.toString( ((long) responseTime / 1000 / 1000 ) / 1000.0 ) ); // Add a <nutch:urlParams> element containing a list of all the URL parameters. - Element urlParams = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:urlParams" ); + Element urlParams = doc.createElementNS(NS_MAP.get("nutch"), "nutch:urlParams" ); channel.appendChild( urlParams ); for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) ) @@ -225,43 +209,19 @@ String key = e.getKey( ); for ( String value : e.getValue( ) ) { - Element urlParam = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:param" ); + Element urlParam = doc.createElementNS(NS_MAP.get("nutch"), "nutch:param" ); addAttribute( doc, urlParam, "name", key ); addAttribute( doc, urlParam, "value", value ); urlParams.appendChild(urlParam); } } - // Hmm, we should indicate whether or not the "totalResults" - // number as being exact some other way; perhaps just have a - // <nutch:totalIsExact>true</nutch:totalIsExact> element. - /* - if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show - || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){ - addNode(doc, channel, "nutch", "nextPage", requestUrl - +"?query="+urlQuery - +"&start="+end - +"&hitsPerDup="+hitsPerDup - +params); - } - */ - - // Same here, this seems odd. 
- /* - if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { - addNode(doc, channel, "nutch", "showAllHits", requestUrl - +"?query="+urlQuery - +"&hitsPerDup="+0 - +params); - } - */ - for (int i = 0; i < length; i++) { Hit hit = show[i]; HitDetails detail = details[i]; String title = detail.getValue("title"); String url = detail.getValue("url"); - String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); + String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getUniqueKey(); if (title == null || title.equals("")) { // use url for docs w/o title title = url; @@ -274,24 +234,8 @@ addNode(doc, item, "description", summaries[i].toString() ); } addNode(doc, item, "link", url); - addNode(doc, item, "nutch", "site", hit.getDedupValue()); - addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id); - addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id - +"&query="+urlQuery+"&lang="+queryLang); - - // Probably don't need this as the XML processor/front-end can - // easily do this themselves. - if (hit.moreFromDupExcluded()) { - addNode(doc, item, "nutch", "moreFromSite", requestUrl - +"?query=" - +URLEncoder.encode("site:"+hit.getDedupValue() - +" "+queryString, "UTF-8") - +"&hitsPerSite="+0 - +params); - } - for (int j = 0; j < detail.getLength(); j++) { // add all from detail String field = detail.getField(j); if (!SKIP_DETAILS.contains(field)) @@ -304,9 +248,9 @@ DOMSource source = new DOMSource(doc); TransformerFactory transFactory = TransformerFactory.newInstance(); Transformer transformer = transFactory.newTransformer(); - transformer.setOutputProperty("indent", "yes"); + transformer.setOutputProperty( javax.xml.transform.OutputKeys.ENCODING, "UTF-8" ); StreamResult result = new StreamResult(response.getOutputStream()); - response.setContentType("text/xml"); + response.setContentType("application/rss+xml"); transformer.transform(source, result); } catch (javax.xml.parsers.ParserConfigurationException e) { @@ -334,7 +278,7 @@ private static void addNode(Document doc, Node parent, String ns, String name, String text) { if ( text == null ) text = ""; - Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name); + Element child = doc.createElementNS(NS_MAP.get(ns), ns+":"+name); child.appendChild(doc.createTextNode(getLegalXml(text))); parent.appendChild(child); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-27 23:00:58
|
Revision: 2843 http://archive-access.svn.sourceforge.net/archive-access/?rev=2843&view=rev Author: binzino Date: 2009-10-27 23:00:46 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Ported changes/fixes from NW 0.12.9. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-27 22:52:46 UTC (rev 2842) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-27 23:00:46 UTC (rev 2843) @@ -10,19 +10,18 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> -<!-- The indexing filter order *must* be specified in order for - NutchWAX's ConfigurableIndexingFilter to be called *after* the - BasicIndexingFilter. This is necessary so that the - ConfigurableIndexingFilter can over-write some of the values put - into the Lucene document by the BasicIndexingFilter. - - The over-written values are the 'url' and 'digest' fields, which - NutchWAX needs to handle specially in order for de-duplication to - work properly. - --> +<!-- + When using *only* the 'index-nutchwax' in 'plugin.includes' above, + we don't need to specify an order since there is only one plugin. + + However, if you choose to use the Nutch 'index-basic', then you have + to specify the order such that the NutchWAX ConfigurableIndexingFilter + is after it. Whichever plugin comes last over-writes the values + of those that come before it. + <property> <name>indexingfilter.order</name> <value> @@ -30,29 +29,31 @@ org.archive.nutchwax.index.ConfigurableIndexingFilter </value> </property> + --> <property> <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing. 
- The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key" + The specifications here are of the form "src-key:lowercase:store:index:dest-key" Where the only required part is the "src-key", the rest will assume the following defaults: lowercase = true store = true - tokenize = false + index = tokenized exclusive = true dest-key = src-key --> <name>nutchwax.filter.index</name> <value> - url:false:true:true - url:false:true:false:true:exacturl - orig:false - digest:false - filename:false - fileoffset:false - collection - date - type - length + title:false:true:tokenized + content:false:false:tokenized + site:false:false:untokenized + + url:false:true:tokenized + digest:false:true:no + + collection:true:true:no_norms + date:true:true:no_norms + type:true:true:no_norms + length:false:true:no </value> </property> @@ -70,15 +71,10 @@ <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> <name>nutchwax.filter.query</name> <value> - raw:digest:false - raw:filename:false - raw:fileoffset:false - raw:exacturl:false group:collection + group:site:false group:type - field:anchor field:content - field:host field:title </value> </property> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
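A simplified, standalone sketch of how one nutchwax.filter.index entry is interpreted, mirroring the spec parsing in ConfigurableIndexingFilter (full form src-key:lowercase:store:index:exclusive:dest-key, with trailing parts optional). The FieldSpecDemo class is illustrative only; the example entry is taken from the configuration above.

// Simplified, hypothetical parser for one nutchwax.filter.index entry.
public class FieldSpecDemo {
  public static void main(String[] args) {
    String entry = "collection:true:true:no_norms";   // example entry from the config above
    String[] spec = entry.split(":");

    String srcKey     = spec[0];
    boolean lowerCase = spec.length > 1 ? Boolean.parseBoolean(spec[1]) : true;
    boolean store     = spec.length > 2 ? Boolean.parseBoolean(spec[2]) : true;
    String index      = spec.length > 3 ? spec[3] : "tokenized";
    String destKey    = spec.length > 5 ? spec[5] : srcKey;

    // "collection" is lower-cased, stored, and indexed without norms, keeping its field name.
    System.out.println(srcKey + " -> " + destKey + " store=" + store
        + " index=" + index + " lowercase=" + lowerCase);
  }
}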
From: <bi...@us...> - 2009-10-27 22:53:01
|
Revision: 2842 http://archive-access.svn.sourceforge.net/archive-access/?rev=2842&view=rev Author: binzino Date: 2009-10-27 22:52:46 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/ParseTextCombiner.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/ParseTextCombiner.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/ParseTextCombiner.java 2009-10-27 22:47:30 UTC (rev 2841) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/ParseTextCombiner.java 2009-10-27 22:52:46 UTC (rev 2842) @@ -128,9 +128,9 @@ WritableComparable[] keys = new WritableComparable[readers.length]; Writable[] values = new Writable [readers.length]; - WritableComparator wc = WritableComparator.get( readers[0].getKeyClass() ); + WritableComparator wc = WritableComparator.get( (Class<WritableComparable>) readers[0].getKeyClass() ); - MapFile.Writer writer = new MapFile.Writer( conf, fs, outputPath.toString(), readers[0].getKeyClass(), readers[0].getValueClass( ) ); + MapFile.Writer writer = new MapFile.Writer( conf, fs, outputPath.toString(), (Class<WritableComparable>) readers[0].getKeyClass(), readers[0].getValueClass( ) ); int readCount = 0; int writeCount = 0; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-27 22:47:40
|
Revision: 2841 http://archive-access.svn.sourceforge.net/archive-access/?rev=2841&view=rev Author: binzino Date: 2009-10-27 22:47:30 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Ported from NW 0.12.9. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/CacheSettingsFilter.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/CacheSettingsFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/CacheSettingsFilter.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/CacheSettingsFilter.java 2009-10-27 22:47:30 UTC (rev 2841) @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; + +import javax.servlet.Filter; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ServletContext; +import javax.servlet.ServletException; +import javax.servlet.ServletOutputStream; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServletResponse; +import javax.servlet.http.HttpServletResponseWrapper; + +import javax.xml.transform.Source; +import javax.xml.transform.stream.StreamSource; +import javax.xml.transform.Templates; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.stream.StreamResult; + + +public class CacheSettingsFilter implements Filter +{ + private String maxAge; + + public void init( FilterConfig config ) + throws ServletException + { + this.maxAge = config.getInitParameter( "max-age" ); + + if ( this.maxAge != null ) + { + this.maxAge = this.maxAge.trim( ); + + if ( this.maxAge.length( ) == 0 ) + { + this.maxAge = null; + } + else + { + this.maxAge = "max-age=" + this.maxAge; + } + } + } + + public void doFilter( ServletRequest request, ServletResponse response, FilterChain chain ) + throws IOException, ServletException + { + HttpServletResponse res = (HttpServletResponse) response; + + res.setDateHeader( "Date", System.currentTimeMillis( ) ); + + if ( this.maxAge != null ) + { + res.addHeader( "Cache-Control", this.maxAge ); + } + + chain.doFilter( request, res ); + } + + public void destroy() + { + + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-27 22:46:39
|
Revision: 2840 http://archive-access.svn.sourceforge.net/archive-access/?rev=2840&view=rev Author: binzino Date: 2009-10-27 22:46:25 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Minor edits to conform to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2009-10-27 21:38:28 UTC (rev 2839) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2009-10-27 22:46:25 UTC (rev 2840) @@ -30,14 +30,16 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobStatus; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.hadoop.mapred.RunningJob; -import org.apache.hadoop.mapred.JobStatus; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; @@ -46,8 +48,8 @@ import org.apache.nutch.fetcher.FetcherOutputFormat; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; -import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseImpl; @@ -323,7 +325,7 @@ // We store both the normal URL and the URL+digest key for // later retrieval by the indexing plugin(s). contentMetadata.set( NutchWax.URL_KEY, url ); - contentMetadata.set( NutchWax.ORIG_KEY, key ); + //contentMetadata.set( NutchWax.ORIG_KEY, key ); contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); @@ -650,12 +652,14 @@ job.setJobName( "Importer " + manifestPath ); job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() ); - job.setInputPath ( manifestPath); + //job.setInputPath ( manifestPath); + FileInputFormat.addInputPath( job, manifestPath ); job.setInputFormat( TextInputFormat.class ); job.setMapperClass( Importer.class ); - job.setOutputPath ( segmentPath ); + //job.setOutputPath ( segmentPath ); + FileOutputFormat.setOutputPath( job, segmentPath ); job.setOutputFormat ( FetcherOutputFormat.class ); job.setOutputKeyClass ( Text.class ); job.setOutputValueClass( NutchWritable.class ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
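The Importer change above swaps the older JobConf.setInputPath/setOutputPath calls for the static FileInputFormat/FileOutputFormat helpers used by the Hadoop API that Nutch 1.0 targets. A minimal sketch of the new-style wiring, with hypothetical manifest and segment paths:

// Minimal sketch of the Hadoop job wiring style the Importer now uses (hypothetical paths).
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class JobWiringSketch {
  public static JobConf configure(JobConf job) {
    // Old API: job.setInputPath(...) / job.setOutputPath(...)
    // New API: paths are added through the input/output format helpers instead.
    FileInputFormat.addInputPath(job, new Path("manifest.txt"));
    FileOutputFormat.setOutputPath(job, new Path("segments/20091028000000"));
    return job;
  }

  public static void main(String[] args) {
    JobConf job = configure(new JobConf());
    System.out.println(FileOutputFormat.getOutputPath(job));   // prints segments/20091028000000
  }
}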
From: <bi...@us...> - 2009-10-27 21:38:38
|
Revision: 2839 http://archive-access.svn.sourceforge.net/archive-access/?rev=2839&view=rev Author: binzino Date: 2009-10-27 21:38:28 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Port of fix for WAX-53 from NW-0.12.9 branch. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-27 21:31:15 UTC (rev 2838) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-27 21:38:28 UTC (rev 2839) @@ -472,6 +472,8 @@ private TermEnum termEnum; public ParallelTermEnum() throws IOException { + if ( fieldToReader.isEmpty( ) ) return ; + field = (String)fieldToReader.firstKey(); if (field != null) termEnum = ((IndexReader)fieldToReader.get(field)).terms(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
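A short illustration of why the one-line guard above matters: firstKey() on an empty sorted map throws NoSuchElementException, so a ParallelTermEnum built over an ArchiveParallelReader with no readers would fail without the early return. The demo class below is hypothetical.

// Illustration of the empty-map guard ported in the fix above.
import java.util.SortedMap;
import java.util.TreeMap;

public class EmptyMapGuardDemo {
  public static void main(String[] args) {
    SortedMap<String, Object> fieldToReader = new TreeMap<String, Object>();
    if (fieldToReader.isEmpty()) {
      System.out.println("no fields - nothing to enumerate");   // the ported fix takes this path
      return;
    }
    String firstField = fieldToReader.firstKey();               // would throw NoSuchElementException if empty
    System.out.println(firstField);
  }
}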
Revision: 2838 http://archive-access.svn.sourceforge.net/archive-access/?rev=2838&view=rev Author: binzino Date: 2009-10-27 21:31:15 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2009-10-27 21:29:00 UTC (rev 2837) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2009-10-27 21:31:15 UTC (rev 2838) @@ -41,6 +41,7 @@ import org.apache.lucene.document.Document; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.protocol.Content; @@ -158,7 +159,7 @@ // Not implemented } - public float indexerScore(Text key, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + public float indexerScore(Text key, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { synchronized ( this ) @@ -196,7 +197,7 @@ return initScore; } - float newScore = initScore * (float) ( Math.log( rank ) + 1 ); + float newScore = initScore * (float) ( Math.log10( rank ) + 1 ); LOG.info( "PageRankScoringFilter: initScore = " + newScore + " ; key = " + key ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
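The scoring change above also moves from the natural logarithm to log10, which softens the boost for highly linked pages and matches the formula used by LengthNormUpdater. A quick comparison of the two boosts (the ranks are example values):

// Quick comparison of the old natural-log boost and the new log10 boost used above.
public class ScoreBoostComparison {
  public static void main(String[] args) {
    int[] ranks = { 10, 100, 1000 };
    for (int rank : ranks) {
      double oldBoost = Math.log(rank) + 1;     // previous behaviour (natural log): ~3.3, ~5.6, ~7.9
      double newBoost = Math.log10(rank) + 1;   // ported behaviour: 2, 3, 4
      System.out.println("rank=" + rank + " old=" + oldBoost + " new=" + newBoost);
    }
  }
}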
From: <bi...@us...> - 2009-10-27 21:29:07
|
Revision: 2837 http://archive-access.svn.sourceforge.net/archive-access/?rev=2837&view=rev Author: binzino Date: 2009-10-27 21:29:00 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated fields to match what is expected in the code. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2009-10-27 21:18:32 UTC (rev 2836) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2009-10-27 21:29:00 UTC (rev 2837) @@ -40,8 +40,8 @@ point="org.apache.nutch.searcher.QueryFilter"> <implementation id="ConfigurableQueryFilter" class="org.archive.nutchwax.query.ConfigurableQueryFilter"> - <parameter name="raw-fields" value="collection,date,digest,exacturl,filename,fileoffset,type" /> - <parameter name="fields" value="anchor,content,host,title" /> + <parameter name="raw-fields" value="collection,site,type" /> + <parameter name="fields" value="content,title" /> </implementation> </extension> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2836 http://archive-access.svn.sourceforge.net/archive-access/?rev=2836&view=rev Author: binzino Date: 2009-10-27 21:18:32 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2009-10-27 21:14:24 UTC (rev 2835) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2009-10-27 21:18:32 UTC (rev 2836) @@ -20,19 +20,22 @@ */ package org.archive.nutchwax.index; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; import java.util.List; -import java.util.ArrayList; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.lucene.LuceneWriter; +import org.apache.nutch.indexer.lucene.LuceneWriter.INDEX; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; @@ -46,9 +49,13 @@ private Configuration conf; private List<FieldSpecification> fieldSpecs; + private int MAX_TITLE_LENGTH; + public void setConf( Configuration conf ) { this.conf = conf; + + this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); String filterSpecs = conf.get( "nutchwax.filter.index" ); @@ -65,12 +72,12 @@ { String spec[] = filterSpec.split("[:]"); - String srcKey = spec[0]; - boolean lowerCase = true; - boolean store = true; - boolean tokenize = false; - boolean exclusive = true; - String destKey = srcKey; + String srcKey = spec[0]; + boolean lowerCase = true; + boolean store = true; + INDEX index = INDEX.TOKENIZED; + boolean exclusive = true; + String destKey = srcKey; switch ( spec.length ) { default: @@ -79,7 +86,10 @@ case 5: exclusive = Boolean.parseBoolean( spec[4] ); case 4: - tokenize = Boolean.parseBoolean( spec[3] ); + index = "tokenized". equals(spec[3]) ? INDEX.TOKENIZED : + "untokenized".equals(spec[3]) ? INDEX.UNTOKENIZED : + "no_norms". equals(spec[3]) ? 
INDEX.NO_NORMS : + INDEX.NO; case 3: store = Boolean.parseBoolean( spec[2] ); case 2: @@ -89,9 +99,9 @@ ; } - LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + exclusive + ":" + destKey ); + LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + index + ":" + exclusive + ":" + destKey ); - this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, tokenize, exclusive, destKey ) ); + this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, index, exclusive, destKey ) ); } } @@ -100,16 +110,16 @@ String srcKey; boolean lowerCase; boolean store; - boolean tokenize; + INDEX index; boolean exclusive; String destKey; - public FieldSpecification( String srcKey, boolean lowerCase, boolean store, boolean tokenize, boolean exclusive, String destKey ) + public FieldSpecification( String srcKey, boolean lowerCase, boolean store, INDEX index, boolean exclusive, String destKey ) { this.srcKey = srcKey; this.lowerCase = lowerCase; this.store = store; - this.tokenize = tokenize; + this.index = index; this.exclusive = exclusive; this.destKey = destKey; } @@ -124,14 +134,47 @@ * Transfer NutchWAX field values stored in the parsed content to * the Lucene document. */ - public Document filter( Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) + public NutchDocument filter( NutchDocument doc, Parse parse, Text key, CrawlDatum datum, Inlinks inlinks ) throws IndexingException { Metadata meta = parse.getData().getContentMeta(); for ( FieldSpecification spec : this.fieldSpecs ) { - String value = meta.get( spec.srcKey ); + String value = null; + if ( "site".equals( spec.srcKey ) || "host".equals( spec.srcKey ) ) + { + try + { + value = (new URL( meta.get( "url" ) ) ).getHost( ); + } + catch ( MalformedURLException mue ) { /* Eat it */ } + } + else if ( "content".equals( spec.srcKey ) ) + { + value = parse.getText( ); + } + else if ( "title".equals( spec.srcKey ) ) + { + value = parse.getData().getTitle(); + if ( value.length() > MAX_TITLE_LENGTH ) // truncate title if needed + { + value = value.substring( 0, MAX_TITLE_LENGTH ); + } + } + else if ( "type".equals( spec.srcKey ) ) + { + value = meta.get( spec.srcKey ); + + if ( value == null ) continue ; + + int p = value.indexOf( ';' ); + if ( p >= 0 ) value = value.substring( 0, p ); + } + else + { + value = meta.get( spec.srcKey ); + } if ( value == null ) continue; @@ -142,16 +185,33 @@ if ( spec.exclusive ) { - doc.removeFields( spec.destKey ); + doc.removeField( spec.destKey ); } - - doc.add( new Field( spec.destKey, - value, - spec.store ? Field.Store.YES : Field.Store.NO, - spec.tokenize ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED ) ); + + if ( spec.store || spec.index != INDEX.NO ) + { + doc.add( spec.destKey, value ); + } + } return doc; } - + + public void addIndexBackendOptions( Configuration conf ) + { + for ( FieldSpecification spec : this.fieldSpecs ) + { + if ( ! spec.store && spec.index == INDEX.NO ) + { + continue ; + } + + LuceneWriter.addFieldOptions( spec.destKey, + spec.store ? LuceneWriter.STORE.YES : LuceneWriter.STORE.NO, + spec.index, + conf ); + } + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
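The port keeps the colon-separated field specifications read from the nutchwax.filter.index property, only swapping the old boolean tokenize flag for the four-valued INDEX mode (tokenized, untokenized, no_norms, no). A self-contained sketch of that fall-through parsing with a made-up property value; the whitespace separator between entries and the sample field names are assumptions, not taken from the diff:

public class FieldSpecParseExample {
  public static void main(String[] args) {
    // Hypothetical nutchwax.filter.index value, entries of the form
    // srcKey[:lowerCase[:store[:index[:exclusive[:destKey]]]]]
    String filterSpecs = "title:false:true:tokenized type:true:true:untokenized date:true:true:no_norms:true:date";

    for (String filterSpec : filterSpecs.split("\\s+")) {
      String[] spec = filterSpec.split("[:]");
      String  srcKey    = spec[0];
      boolean lowerCase = spec.length > 1 ? Boolean.parseBoolean(spec[1]) : true;  // default: true
      boolean store     = spec.length > 2 ? Boolean.parseBoolean(spec[2]) : true;  // default: true
      String  index     = spec.length > 3 ? spec[3] : "tokenized";                 // default: TOKENIZED
      boolean exclusive = spec.length > 4 ? Boolean.parseBoolean(spec[4]) : true;  // default: true
      String  destKey   = spec.length > 5 ? spec[5] : srcKey;                      // default: srcKey
      System.out.println(srcKey + " -> " + destKey + " lowerCase=" + lowerCase
                         + " store=" + store + " index=" + index + " exclusive=" + exclusive);
    }
  }
}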
Revision: 2835 http://archive-access.svn.sourceforge.net/archive-access/?rev=2835&view=rev Author: binzino Date: 2009-10-27 21:14:24 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java 2009-10-26 23:02:50 UTC (rev 2834) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java 2009-10-27 21:14:24 UTC (rev 2835) @@ -26,14 +26,14 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.lucene.LuceneWriter; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; @@ -152,7 +152,7 @@ * Remove field if specified value is <code>null</code>. * </p> */ - public Document filter( Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) + public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) throws IndexingException { Metadata meta = parse.getData().getContentMeta(); @@ -160,20 +160,29 @@ for ( FieldSetting setting : this.settings ) { // First, remove the existing field. - doc.removeFields( setting.key ); + doc.removeField( setting.key ); // Add the value if it is given. if ( setting.value != null ) { - doc.add( new Field( setting.key, - setting.value, - setting.store ? Field.Store.YES : Field.Store.NO, - setting.tokenize ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED ) ); + doc.add( setting.key, setting.value ); } } return doc; } + public void addIndexBackendOptions( Configuration conf ) + { + + for ( FieldSetting setting : this.settings ) + { + LuceneWriter.addFieldOptions( setting.key, + setting.store ? LuceneWriter.STORE.YES : LuceneWriter.STORE.NO, + setting.tokenize ? LuceneWriter.INDEX.TOKENIZED : LuceneWriter.INDEX.UNTOKENIZED, + conf ); + } + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
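Taken together, these two ports show the Nutch 1.0 indexing-filter shape: filter() fills a NutchDocument, which carries only field names and values, while the per-field Lucene store/tokenize options are registered separately through LuceneWriter.addFieldOptions in addIndexBackendOptions. A stripped-down filter along those lines; the class name and the "example" field are hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.parse.Parse;

public class ExampleIndexingFilter implements IndexingFilter {
  private Configuration conf;

  public void setConf(Configuration conf) { this.conf = conf; }
  public Configuration getConf() { return this.conf; }

  public NutchDocument filter(NutchDocument doc, Parse parse, Text key,
                              CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // NutchDocument holds plain name/value pairs; no Lucene Field flags here.
    String value = parse.getData().getContentMeta().get("example");
    if (value != null) {
      doc.add("example", value);
    }
    return doc;
  }

  // Store/index decisions for the Lucene backend are declared once per field.
  public void addIndexBackendOptions(Configuration conf) {
    LuceneWriter.addFieldOptions("example", LuceneWriter.STORE.YES,
                                 LuceneWriter.INDEX.UNTOKENIZED, conf);
  }
}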
From: <bi...@us...> - 2009-10-26 23:02:58
Revision: 2834 http://archive-access.svn.sourceforge.net/archive-access/?rev=2834&view=rev Author: binzino Date: 2009-10-26 23:02:50 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Added command-line driver for 'index' to call NutchWAX Indexer. Modified Paths: -------------- tags/nutchwax-0_12_9/archive/bin/nutchwax Modified: tags/nutchwax-0_12_9/archive/bin/nutchwax =================================================================== --- tags/nutchwax-0_12_9/archive/bin/nutchwax 2009-10-26 23:01:57 UTC (rev 2833) +++ tags/nutchwax-0_12_9/archive/bin/nutchwax 2009-10-26 23:02:50 UTC (rev 2834) @@ -62,6 +62,10 @@ shift ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DateAdder "$@" ;; + index) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Indexer "$@" + ;; merge) shift ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.IndexMerger "$@" @@ -88,6 +92,7 @@ echo " pageranker Generate pagerank.txt file from 'pagerankdb's or 'linkdb's" echo " parsetextmerger Merge segement parse_text/part-nnnnn directories." echo " add-dates Add dates to a parallel index" + echo " index Build Lucene index from segment(s) without crawl & linkdbs" echo " merge Merge indexes or parallel indexes" echo " reboost Update document boosts based on pagerank info" echo " dumpindex Dump an index or set of parallel indices to stdout" This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
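The new sub-command simply shifts its arguments and forwards to org.archive.nutchwax.Indexer via bin/nutch, so the equivalent call can also be made straight through Hadoop's ToolRunner, exactly as the Indexer's own main() does. The index and segment paths below are placeholders:

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.archive.nutchwax.Indexer;

public class RunNutchwaxIndex {
  public static void main(String[] args) throws Exception {
    // Same as: bin/nutchwax index crawl/index-new crawl/segments/20091026123456
    // First argument is the output index dir, remaining arguments are segments.
    int rc = ToolRunner.run(NutchConfiguration.create(), new Indexer(),
                            new String[] { "crawl/index-new", "crawl/segments/20091026123456" });
    System.exit(rc);
  }
}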
From: <bi...@us...> - 2009-10-26 23:02:14
Revision: 2833 http://archive-access.svn.sourceforge.net/archive-access/?rev=2833&view=rev Author: binzino Date: 2009-10-26 23:01:57 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Added NutchWAX version of Indexer that doesn't need/use crawldb nor linkdb. Also has a special check for non-null values of required metadata fields: segment, digest. Added Paths: ----------- tags/nutchwax-0_12_9/archive/src/java/org/archive/nutchwax/Indexer.java Added: tags/nutchwax-0_12_9/archive/src/java/org/archive/nutchwax/Indexer.java =================================================================== --- tags/nutchwax-0_12_9/archive/src/java/org/archive/nutchwax/Indexer.java (rev 0) +++ tags/nutchwax-0_12_9/archive/src/java/org/archive/nutchwax/Indexer.java 2009-10-26 23:01:57 UTC (rev 2833) @@ -0,0 +1,294 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.analysis.*; + +import org.apache.nutch.indexer.IndexingFilters; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchSimilarity; + +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.CrawlDb; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.crawl.LinkDb; +import org.apache.nutch.crawl.NutchWritable; + +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; + +/** Create indexes for segments. */ +public class Indexer extends Configured implements Tool, Reducer<Text, NutchWritable, Text, Writable>, Mapper<Text, Writable, Text, NutchWritable> { + + public static final String DONE_NAME = "index.done"; + + public static final Log LOG = LogFactory.getLog(Indexer.class); + + /** A utility class used to pass a lucene document from Indexer.reduce + * to Indexer.OutputFormat. + * Note: Despite its name, it can't properly wrap a lucene document - it + * doesn't know how to serialize/deserialize a lucene document. 
+ */ + public static class LuceneDocumentWrapper implements Writable { + private Document doc; + + public LuceneDocumentWrapper(Document doc) { + this.doc = doc; + } + + public Document get() { + return doc; + } + + public void readFields(DataInput in) throws IOException { + // intentionally left blank + } + + public void write(DataOutput out) throws IOException { + // intentionally left blank + } + + } + + /** Unwrap Lucene Documents created by reduce and add them to an index. */ + public static class OutputFormat + extends org.apache.hadoop.mapred.FileOutputFormat<WritableComparable, LuceneDocumentWrapper> { + public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job, + String name, final Progressable progress) throws IOException { + final Path perm = new Path(FileOutputFormat.getOutputPath(job), name); + final Path temp = + job.getLocalPath("index/_"+Integer.toString(new Random().nextInt())); + + int maxTokens = job.getInt("indexer.max.tokens", 10000); + if (maxTokens < 0) maxTokens = Integer.MAX_VALUE; + + fs.delete(perm, true); // delete old, if any + + final AnalyzerFactory factory = new AnalyzerFactory(job); + final IndexWriter writer = // build locally first + new IndexWriter(fs.startLocalOutput(perm, temp).toString(), + new NutchDocumentAnalyzer(job), true); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval + (job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(maxTokens); + writer.setInfoStream(LogUtil.getInfoStream(LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() { + boolean closed; + + public void write(WritableComparable key, LuceneDocumentWrapper value) + throws IOException { // unwrap & index doc + Document doc = value.get(); + NutchAnalyzer analyzer = factory.get(doc.get("lang")); + if (LOG.isInfoEnabled()) { + LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" + + " with analyzer " + analyzer + + " (" + doc.get("lang") + ")"); + } + writer.addDocument(doc, analyzer); + progress.progress(); + } + + public void close(final Reporter reporter) throws IOException { + // spawn a thread to give progress heartbeats + Thread prog = new Thread() { + public void run() { + while (!closed) { + try { + reporter.setStatus("closing"); + Thread.sleep(1000); + } catch (InterruptedException e) { continue; } + catch (Throwable e) { return; } + } + } + }; + + try { + prog.start(); + if (LOG.isInfoEnabled()) { LOG.info("Optimizing index."); } + // optimize & close index + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, DONE_NAME)); + } finally { + closed = true; + } + } + }; + } + } + + private IndexingFilters filters; + + public Indexer() { + + } + + public Indexer(Configuration conf) { + setConf(conf); + } + + public void configure(JobConf job) { + setConf(job); + this.filters = new IndexingFilters(getConf()); + } + + public void close() {} + + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, Writable> output, Reporter reporter) + throws IOException { + ParseData parseData = null; + ParseText parseText = null; + while (values.hasNext()) { + Writable value = values.next().get(); // unwrap + 
+ if (value instanceof ParseData) { + parseData = (ParseData)value; + } else if (value instanceof ParseText) { + parseText = (ParseText)value; + } else if (LOG.isWarnEnabled()) { + LOG.warn("Unrecognized type: "+value.getClass()); + } + } + + if ( parseText == null || parseData == null) { + return; // only have inlinks + } + + Document doc = new Document(); + Metadata metadata = parseData.getContentMeta(); + + if ( metadata.get(Nutch.SEGMENT_NAME_KEY) == null || + metadata.get(Nutch.SIGNATURE_KEY) == null ) + { + LOG.warn( "Skipping document, insufficient metadata: key=" + key + " metadata=" + metadata ); + return ; + } + + // add segment, used to map from merged index back to segment files + doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY), + Field.Store.YES, Field.Index.NO)); + + // add digest, used by dedup + doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY), + Field.Store.YES, Field.Index.NO)); + + Parse parse = new ParseImpl(parseText, parseData); + try { + doc = this.filters.filter(doc, parse, key, /*fetchDatum*/ null, /*inlinks*/ null); + } catch (IndexingException e) { + if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); } + return; + } + + // skip documents discarded by indexing filters + if (doc == null) return; + + output.collect(key, new LuceneDocumentWrapper(doc)); + } + + public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) + throws IOException { + + if (LOG.isInfoEnabled()) { + LOG.info("Indexer: starting"); + } + + JobConf job = new NutchJob(getConf()); + job.setJobName("index " + indexDir); + + for (int i = 0; i < segments.length; i++) { + if (LOG.isInfoEnabled()) { + LOG.info("Indexer: adding segment: " + segments[i]); + } + FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME)); + FileInputFormat.addInputPath(job, new Path(segments[i], ParseText.DIR_NAME)); + } + + job.setInputFormat(SequenceFileInputFormat.class); + + job.setMapperClass(Indexer.class); + job.setReducerClass(Indexer.class); + + FileOutputFormat.setOutputPath(job, indexDir); + job.setOutputFormat(OutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(NutchWritable.class); + + JobClient.runJob(job); + if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + + if (args.length < 2) { + System.err.println("Usage: <index> <segment> ..."); + return -1; + } + + Path[] segments = new Path[args.length-1]; + for (int i = 1; i < args.length; i++) { + segments[i-1] = new Path(args[i]); + } + + try { + index(new Path(args[0]), null, null, segments); + return 0; + } catch (Exception e) { + LOG.fatal("Indexer: " + StringUtils.stringifyException(e)); + return -1; + } + } + + public void map(Text key, Writable value, + OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { + output.collect(key, new NutchWritable(value)); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
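When a Configuration and segment Paths are already in hand, the same job can be kicked off programmatically through index(); crawlDb and linkDb are passed as null because this Indexer deliberately ignores them. The paths below are placeholders:

import org.apache.hadoop.fs.Path;
import org.apache.nutch.util.NutchConfiguration;
import org.archive.nutchwax.Indexer;

public class IndexSegmentsExample {
  public static void main(String[] args) throws Exception {
    Indexer indexer = new Indexer(NutchConfiguration.create());

    Path[] segments = { new Path("crawl/segments/20091026123456"),
                        new Path("crawl/segments/20091027093011") };

    // crawldb and linkdb arguments are unused by this Indexer, hence the nulls.
    indexer.index(new Path("crawl/index-new"), null, null, segments);
  }
}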
From: <bi...@us...> - 2009-10-26 22:57:34
Revision: 2832 http://archive-access.svn.sourceforge.net/archive-access/?rev=2832&view=rev Author: binzino Date: 2009-10-26 22:57:25 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Fix WAX-67. One-line change to include metadata passed in from Importer. Added Paths: ----------- tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java Added: tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java =================================================================== --- tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java (rev 0) +++ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java 2009-10-26 22:57:25 UTC (rev 2832) @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.oo; + +import java.io.*; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.List; +import java.util.zip.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.*; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.jaxen.*; +import org.jaxen.jdom.JDOMXPath; +import org.jdom.*; +import org.jdom.input.*; + +/** + * Parser for OpenOffice and OpenDocument formats. This should handle + * the following formats: Text, Spreadsheet, Presentation, and + * corresponding templates and "master" documents. 
+ * + * @author Andrzej Bialecki + */ +public class OOParser implements Parser { + public static final Log LOG = LogFactory.getLog(OOParser.class); + + private Configuration conf; + + public OOParser () { + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return conf; + } + + public ParseResult getParse(Content content) { + String text = null; + String title = null; + Metadata metadata = new Metadata(); + ArrayList outlinks = new ArrayList(); + + try { + byte[] raw = content.getContent(); + String contentLength = content.getMetadata().get("Content-Length"); + if (contentLength != null + && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, + "Content truncated at "+raw.length + +" bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), conf); + } + ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw)); + ZipEntry ze = null; + while ((ze = zis.getNextEntry()) != null) { + if (ze.getName().equals("content.xml")) { + text = parseContent(ze, zis, outlinks); + } else if (ze.getName().equals("meta.xml")) { + parseMeta(ze, zis, metadata); + } + } + zis.close(); + } catch (Exception e) { // run time exception + e.printStackTrace(LogUtil.getWarnStream(LOG)); + return new ParseStatus(ParseStatus.FAILED, + "Can't be handled as OO document. " + e).getEmptyParseResult(content.getUrl(), conf); + } + + title = metadata.get(Metadata.TITLE); + if (text == null) + text = ""; + + if (title == null) + title = ""; + + Outlink[] links = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, links, content.getMetadata(), metadata); + return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); + } + + // extract as much plain text as possible. + private String parseContent(ZipEntry ze, ZipInputStream zis, ArrayList outlinks) throws Exception { + StringBuffer res = new StringBuffer(); + FilterInputStream fis = new FilterInputStream(zis) { + public void close() {}; + }; + SAXBuilder builder = new SAXBuilder(); + Document doc = builder.build(fis); + Element root = doc.getRootElement(); + // XXX this is expensive for very large documents. In those cases another + // XXX method (direct processing of SAX events, or XMLPull) should be used. 
+ XPath path = new JDOMXPath("//text:span | //text:p | //text:tab | //text:tab-stop | //text:a"); + path.addNamespace("text", root.getNamespace("text").getURI()); + Namespace xlink = Namespace.getNamespace("xlink", "http://www.w3.org/1999/xlink"); + List list = path.selectNodes(doc); + boolean lastp = true; + for (int i = 0; i < list.size(); i++) { + Element el = (Element)list.get(i); + String text = el.getText(); + if (el.getName().equals("p")) { + // skip empty paragraphs + if (!text.equals("")) { + if (!lastp) res.append("\n"); + res.append(text + "\n"); + lastp = true; + } + } else if (el.getName().startsWith("tab")) { + res.append("\t"); + lastp = false; + } else if (el.getName().equals("a")) { + List nl = el.getChildren(); + String a = null; + for (int k = 0; k < nl.size(); k++) { + Element anchor = (Element)nl.get(k); + String nsName = anchor.getNamespacePrefix() + ":" + anchor.getName(); + if (!nsName.equals("text:span")) continue; + a = anchor.getText(); + break; + } + String u = el.getAttributeValue("href", xlink); + if (u == null) u = a; // often anchors are URLs + try { + Outlink o = new Outlink(u, a); + outlinks.add(o); + } catch (MalformedURLException mue) { + // skip + } + if (a != null && !a.equals("")) { + if (!lastp) res.append(' '); + res.append(a); + lastp = false; + } + } else { + if (!text.equals("")) { + if (!lastp) res.append(' '); + res.append(text); + } + lastp = false; + } + } + return res.toString(); + } + + // extract metadata and convert them to Nutch format + private void parseMeta(ZipEntry ze, ZipInputStream zis, Metadata metadata) throws Exception { + FilterInputStream fis = new FilterInputStream(zis) { + public void close() {}; + }; + SAXBuilder builder = new SAXBuilder(); + Document doc = builder.build(fis); + XPath path = new JDOMXPath("/office:document-meta/office:meta/*"); + Element root = doc.getRootElement(); + path.addNamespace("office", root.getNamespace("office").getURI()); + List list = path.selectNodes(doc); + for (int i = 0; i < list.size(); i++) { + Element n = (Element)list.get(i); + String text = n.getText(); + if (text.trim().equals("")) continue; + String name = n.getName(); + if (name.equals("title")) + metadata.add(Metadata.TITLE, text); + else if (name.equals("language")) + metadata.add(Metadata.LANGUAGE, text); + else if (name.equals("creation-date")) + metadata.add(Metadata.DATE, text); + else if (name.equals("print-date")) + metadata.add(Metadata.LAST_PRINTED, text); + else if (name.equals("generator")) + metadata.add(Metadata.APPLICATION_NAME, text); + else if (name.equals("creator")) + metadata.add(Metadata.CREATOR, text); + } + } + + public static void main(String[] args) throws Exception { + OOParser oo = new OOParser(); + Configuration conf = NutchConfiguration.create(); + oo.setConf(conf); + FileInputStream fis = new FileInputStream(args[0]); + byte[] bytes = new byte[fis.available()]; + fis.read(bytes); + fis.close(); + Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf); + Parse p = oo.getParse(c).get(c.getUrl()); + System.out.println(p.getData()); + System.out.println("Text: '" + p.getText() + "'"); + /* + // create the test output file + OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("e:\\ootest.txt"), "UTF-8"); + osw.write(p.getText()); + osw.flush(); + osw.close(); + */ + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
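The parser's own main() above already shows the intended standalone use: read the raw bytes, wrap them in a Content with an OpenDocument MIME type, and call getParse. A trimmed-down version of the same flow; the file name is a placeholder:

import java.io.FileInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.oo.OOParser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

public class OOParseExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    OOParser parser = new OOParser();
    parser.setConf(conf);

    // Slurp the whole document; "report.odt" is a placeholder path.
    FileInputStream fis = new FileInputStream("report.odt");
    byte[] bytes = new byte[fis.available()];
    fis.read(bytes);
    fis.close();

    Content content = new Content("local", "local", bytes,
        "application/vnd.oasis.opendocument.text", new Metadata(), conf);

    Parse parse = parser.getParse(content).get(content.getUrl());
    System.out.println(parse.getData());   // title, outlinks, parse metadata
    System.out.println(parse.getText());   // extracted plain text
  }
}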
From: <bi...@us...> - 2009-10-26 19:35:41
Revision: 2831 http://archive-access.svn.sourceforge.net/archive-access/?rev=2831&view=rev Author: binzino Date: 2009-10-26 19:35:33 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Creation of 0.12.9 branch from 0.12.8. Added Paths: ----------- tags/nutchwax-0_12_9/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |