From: <bi...@us...> - 2012-01-24 18:27:33
|
Revision: 3601 http://archive-access.svn.sourceforge.net/archive-access/?rev=3601&view=rev Author: binzino Date: 2012-01-24 18:27:24 +0000 (Tue, 24 Jan 2012) Log Message: ----------- Initial revision. Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java 2012-01-24 18:27:24 UTC (rev 3601) @@ -0,0 +1,113 @@ +/* + * Copyright 2012 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RecordReader; + +/** + * Weird hack to take a filename of a file in HDFS and return that + * name as the 1 and only 1 record "read" from it. + */ +public class FilenameInputFormat extends FileInputFormat<Text,Text> +{ + /** + * Configure per Hadoop properties + */ + public void configure( JobConf conf ) + { + } + + /** + * By definition, not splitable. + */ + @Override + protected boolean isSplitable(FileSystem fs, Path file) + { + return false; + } + + /** + * Return a RecordReader which returns 1 record: the file path from + * the InputSplit. + */ + public RecordReader<Text, Text> getRecordReader( InputSplit genericSplit, + JobConf job, + Reporter reporter) + throws IOException + { + reporter.setStatus(genericSplit.toString()); + + FileSplit split = (FileSplit) genericSplit; + final Path file = split.getPath(); + + return new RecordReader<Text,Text>() + { + boolean done = false; + + public void close() + { + } + + public Text createKey() + { + return new Text(); + } + + public Text createValue() + { + return new Text(); + } + + public long getPos() + { + return 0; + } + + public float getProgress() + { + return 0.0f; + } + + public boolean next( Text key, Text value) + { + if ( done ) return false; + + key .set( file.toString() ); + value.set( file.toString() ); + + done = true ; + + return true; + } + + }; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |