From: <bi...@us...> - 2012-02-02 17:09:31
|
Revision: 3610 http://archive-access.svn.sourceforge.net/archive-access/?rev=3610&view=rev Author: binzino Date: 2012-02-02 17:09:25 +0000 (Thu, 02 Feb 2012) Log Message: ----------- Initial revision. Added Paths: ----------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java 2012-02-02 17:09:25 UTC (rev 3610) @@ -0,0 +1,117 @@ +/* + * Copyright 2012 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package org.archive.hadoop; + +import java.io.*; +import java.util.*; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RecordReader; + + +/** + * Handy "input format" which maps the input filename into a "record" + * which just has the filename. + * + * This is very useful for map-reduce jobs where you want to pass the + * filenames into the map() function. Use this as the input format, + * and the input filenames will be passed to the map(). The full + * pathname is given as both the key and the value to the map(). + */ +public class FilenameInputFormat extends FileInputFormat<Text,Text> +{ + /** + * Configure per Hadoop properties + */ + public void configure( JobConf conf ) + { + } + + /** + * By definition, not splitable. + */ + @Override + protected boolean isSplitable(FileSystem fs, Path file) + { + return false; + } + + /** + * Return a RecordReader which returns 1 record: the file path from + * the InputSplit. + */ + public RecordReader<Text, Text> getRecordReader( InputSplit genericSplit, + JobConf job, + Reporter reporter) + throws IOException + { + reporter.setStatus(genericSplit.toString()); + + FileSplit split = (FileSplit) genericSplit; + final Path file = split.getPath(); + + return new RecordReader<Text,Text>() + { + boolean done = false; + + public void close() + { + } + + public Text createKey() + { + return new Text(); + } + + public Text createValue() + { + return new Text(); + } + + public long getPos() + { + return 0; + } + + public float getProgress() + { + return 0.0f; + } + + public boolean next( Text key, Text value) + { + if ( done ) return false; + + key .set( file.toString() ); + value.set( file.toString() ); + + done = true ; + + return true; + } + + }; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |