Thread: [Archive-access-cvs] SF.net SVN: archive-access:[3610] trunk/archive-access/projects/ archive-comm

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3610
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3610&view=rev
Author:   binzino
Date:     2012-02-02 17:09:25 +0000 (Thu, 02 Feb 2012)
Log Message:
-----------
Initial revision.

Added Paths:
-----------
    trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java

Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java
===================================================================

--- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java	                        (rev 0)
+++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java	2012-02-02 17:09:25 UTC (rev 3610)
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2012 Internet Archive
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You
+ * may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package org.archive.hadoop;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.RecordReader;
+
+
+/**
+ * Handy "input format" which maps the input filename into a "record"
+ * which just has the filename.
+ *
+ * This is very useful for map-reduce jobs where you want to pass the
+ * filenames into the map() function.  Use this as the input format,
+ * and the input filenames will be passed to the map().  The full
+ * pathname is given as both the key and the value to the map().
+ */
+public class FilenameInputFormat extends FileInputFormat<Text,Text>
+{
+  /**
+   * Configure per Hadoop properties
+   */
+  public void configure( JobConf conf )
+  {
+  }
+
+  /**
+   * By definition, not splitable.
+   */
+  @Override
+  protected boolean isSplitable(FileSystem fs, Path file) 
+  {
+    return false;
+  }
+
+  /**
+   * Return a RecordReader which returns 1 record: the file path from
+   * the InputSplit.
+   */
+  public RecordReader<Text, Text> getRecordReader( InputSplit genericSplit, 
+                                                   JobConf job,
+                                                   Reporter reporter)
+    throws IOException 
+    {
+      reporter.setStatus(genericSplit.toString());
+      
+      FileSplit split = (FileSplit) genericSplit;
+      final Path file  = split.getPath();
+      
+      return new RecordReader<Text,Text>()
+        {
+          boolean done = false; 
+
+          public void close() 
+          { 
+          }
+          
+          public Text createKey() 
+          {
+            return new Text();
+          }
+
+          public Text createValue() 
+          { 
+            return new Text();
+          }
+
+          public long getPos() 
+          { 
+            return 0;
+          }
+          
+          public float getProgress() 
+          { 
+            return 0.0f;
+          }
+          
+          public boolean next( Text key, Text value) 
+          { 
+            if ( done ) return false;
+
+            key  .set( file.toString() );
+            value.set( file.toString() );
+
+            done = true ;
+
+            return true;
+          }
+
+        };
+    }
+  
+}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





Thread: [Archive-access-cvs] SF.net SVN: archive-access:[3610] trunk/archive-access/projects/ archive-comm

archive-access-cvs