Thread: [Archive-access-cvs] SF.net SVN: archive-access:[3601] tags/nutchwax-0_13-JIRA-WAX-75/archive/ src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3601
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3601&view=rev
Author:   binzino
Date:     2012-01-24 18:27:24 +0000 (Tue, 24 Jan 2012)
Log Message:
-----------
Initial revision.

Added Paths:
-----------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java	2012-01-24 18:27:24 UTC (rev 3601)
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2012 Internet Archive
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You
+ * may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package org.archive.nutchwax;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.RecordReader;
+
+/**
+ * Input format that emits the HDFS path of each input file as the one
+ * and only record "read" from that file; the contents are never opened.
+ */
+public class FilenameInputFormat extends FileInputFormat<Text,Text>
+{
+  /**
+   * Configure per Hadoop properties; there is nothing to configure.
+   */
+  public void configure( JobConf conf )
+  {
+  }
+
+  /**
+   * By definition, not splittable: one file yields exactly one record.
+   */
+  @Override
+  protected boolean isSplitable(FileSystem fs, Path file)
+  {
+    return false;
+  }
+
+  /**
+   * Return a RecordReader which returns 1 record: the file path from
+   * the InputSplit, used as both the key and the value.
+   * @param genericSplit split to read; must be a FileSplit
+   * @param job          job configuration (unused)
+   * @param reporter     receives the split description as task status
+   * @throws IOException declared by the Hadoop contract; not thrown here
+   */
+  @Override
+  public RecordReader<Text, Text> getRecordReader( InputSplit genericSplit,
+                                                   JobConf job,
+                                                   Reporter reporter)
+    throws IOException
+    {
+      reporter.setStatus(genericSplit.toString());
+      FileSplit split = (FileSplit) genericSplit;
+      final Path file  = split.getPath();
+
+      return new RecordReader<Text,Text>()
+        {
+          boolean done = false; // true once the single record was returned
+
+          public void close()
+          {
+          }
+
+          public Text createKey()
+          {
+            return new Text();
+          }
+
+          public Text createValue()
+          {
+            return new Text();
+          }
+
+          public long getPos()
+          {
+            return 0;
+          }
+
+          public float getProgress()
+          {
+            // Report completion once the only record has been consumed.
+            return done ? 1.0f : 0.0f;
+          }
+
+          public boolean next( Text key, Text value)
+          {
+            if ( done ) return false;
+            key  .set( file.toString() );
+            value.set( file.toString() );
+            done = true ;
+            return true;
+          }
+        };
+    }
+}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





Thread: [Archive-access-cvs] SF.net SVN: archive-access:[3601] tags/nutchwax-0_13-JIRA-WAX-75/archive/ src

archive-access-cvs