From: <mi...@us...> - 2008-07-26 15:35:18
|
Revision: 2500
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2500&view=rev
Author:   miklosh
Date:     2008-07-26 15:35:27 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
Updated ImageProcessor to store size of original image and deduplicate thumbnails based on digest.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java

Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-26 15:33:48 UTC (rev 2499)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-26 15:35:27 UTC (rev 2500)
@@ -18,6 +18,7 @@
 package org.archive.nutchwax.imagesearch;
 import java.io.IOException;
+import java.util.Iterator;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -29,18 +30,21 @@
 import org.apache.hadoop.mapred.MapFileOutputFormat;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 public class ImageProcessor extends Configured implements Tool,
-        Mapper<Text, Content, Text, ImageWritable> {
+        Mapper<Text, Content, Text, ImageWritable>,
+        Reducer<Text, ImageWritable, Text, ImageWritable> {
     private static final Log LOG = LogFactory.getLog(ImageProcessor.class);
@@ -56,25 +60,44 @@
             OutputCollector<Text, ImageWritable> output, Reporter reporter)
             throws IOException {
-        Metadata metadata = new Metadata();
         // Check content type
         if (!content.getContentType().contains("image/")) {
             return;
         }
         // Generate thumbnail
+        Metadata metadata = new Metadata();
         byte[] data = content.getContent();
         StoredImage thumb = ThumbnailGenerator.generateThumbnail(data,
                 thumbMaxSize, thumbMaxSize, thumbQuality, metadata);
         // Create and setup an ImageWritable
         ImageWritable image = new ImageWritable(key.toString());
+        metadata.set(ImageSearch.SIZE_KEY, Integer.toString(data.length));
         image.setMetadata(metadata);
         image.setThumbnail(thumb);
-        output.collect(key, image);
+        // Get digest of image content
+        Metadata contentMeta = content.getMetadata();
+        String digest = contentMeta.get("digest");
+        if (digest == null) {
+            digest = contentMeta.get(Nutch.SIGNATURE_KEY);
+        }
+
+        output.collect(new Text(digest), image);
     }
+
+    public void reduce(Text key, Iterator<ImageWritable> values,
+            OutputCollector<Text, ImageWritable> output, Reporter reporter)
+            throws IOException {
+        if (values.hasNext()) {
+            // Save only one instance
+            output.collect(key, values.next());
+            return;
+        }
+    }
+
     public void processImageContent(Path segment) throws IOException {
@@ -88,6 +111,7 @@
         job.setInputFormat(SequenceFileInputFormat.class);
         job.setMapperClass(ImageProcessor.class);
+        job.setReducerClass(ImageProcessor.class);
         job.setOutputPath(new Path(segment, ImageWritable.IMAGE_DATA_DIR));
         job.setOutputFormat(MapFileOutputFormat.class);

Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-26 15:33:48 UTC (rev 2499)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-26 15:35:27 UTC (rev 2500)
@@ -20,9 +20,10 @@
 public class ImageSearch {
     public static final String PARENT_URL_KEY = "parent_url";
     public static final String ALT_TEXT_KEY = "alt";
-
+
     public static final String IMAGE_IDS_KEY = "image_ids";
     public static final String IMAGE_POS_KEY = "image_pos";
     public static final String IMAGE_URLS_KEY = "image_urls";
     public static final String HAS_IMAGE_KEY = "has_image";
+    public static final String SIZE_KEY = "size";
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|