From: <mi...@us...> - 2008-07-26 15:35:18
|
Revision: 2500 http://archive-access.svn.sourceforge.net/archive-access/?rev=2500&view=rev Author: miklosh Date: 2008-07-26 15:35:27 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Updated ImageProcessor to store size of original image and deduplicate thumbnails based on digest. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-26 15:33:48 UTC (rev 2499) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-26 15:35:27 UTC (rev 2500) @@ -18,6 +18,7 @@ package org.archive.nutchwax.imagesearch; import java.io.IOException; +import java.util.Iterator; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -29,18 +30,21 @@ import org.apache.hadoop.mapred.MapFileOutputFormat; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; public class ImageProcessor extends Configured implements Tool, - Mapper<Text, Content, Text, ImageWritable> { + Mapper<Text, Content, Text, ImageWritable>, + Reducer<Text, ImageWritable, Text, ImageWritable> { private static final Log LOG = LogFactory.getLog(ImageProcessor.class); @@ -56,25 +60,44 @@ OutputCollector<Text, ImageWritable> output, Reporter reporter) throws IOException { - Metadata metadata = new Metadata(); // Check content type if (!content.getContentType().contains("image/")) { return; } // Generate thumbnail + Metadata metadata = new Metadata(); byte[] data = content.getContent(); StoredImage thumb = ThumbnailGenerator.generateThumbnail(data, thumbMaxSize, thumbMaxSize, thumbQuality, metadata); // Create and setup an ImageWritable ImageWritable image = new ImageWritable(key.toString()); + metadata.set(ImageSearch.SIZE_KEY, Integer.toString(data.length)); image.setMetadata(metadata); image.setThumbnail(thumb); - output.collect(key, image); + // Get digest of image content + Metadata contentMeta = content.getMetadata(); + String digest = contentMeta.get("digest"); + if (digest == null) { + digest = contentMeta.get(Nutch.SIGNATURE_KEY); + } + + output.collect(new Text(digest), image); } + + public void reduce(Text key, Iterator<ImageWritable> values, + OutputCollector<Text, ImageWritable> output, Reporter reporter) + throws IOException { + if (values.hasNext()) { + // Save only one instance + output.collect(key, values.next()); + return; + } + } + public void processImageContent(Path segment) throws IOException { @@ -88,6 +111,7 @@ job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(ImageProcessor.class); + job.setReducerClass(ImageProcessor.class); job.setOutputPath(new Path(segment, ImageWritable.IMAGE_DATA_DIR)); job.setOutputFormat(MapFileOutputFormat.class); Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-26 15:33:48 UTC (rev 2499) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-26 15:35:27 UTC (rev 2500) @@ -20,9 +20,10 @@ public class ImageSearch { public static final String PARENT_URL_KEY = "parent_url"; public static final String ALT_TEXT_KEY = "alt"; - + public static final String IMAGE_IDS_KEY = "image_ids"; public static final String IMAGE_POS_KEY = "image_pos"; public static final String IMAGE_URLS_KEY = "image_urls"; public static final String HAS_IMAGE_KEY = "has_image"; + public static final String SIZE_KEY = "size"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mi...@us...> - 2008-07-26 15:37:06
|
Revision: 2501 http://archive-access.svn.sourceforge.net/archive-access/?rev=2501&view=rev Author: miklosh Date: 2008-07-26 15:37:14 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Added ImageDataReader for image metadata and thumbnail retrieval. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java Added Paths: ----------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java 2008-07-26 15:37:14 UTC (rev 2501) @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.nutch.util.HadoopFSUtil; + +/** Retrieves image thumbnails and metadata from segments. */ +public class ImageDataReader implements ImageLoader { + + private HashMap<String, MapFile.Reader[]> segments = new HashMap<String, MapFile.Reader[]>(); + + /** Construct given a directory containing segments. */ + ImageDataReader(FileSystem fs, String segmentsDir, Configuration conf) throws IOException { + FileStatus[] fstats = fs.listStatus(new Path(segmentsDir), + HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path[] segmentDirs = HadoopFSUtil.getPaths(fstats); + + if (segmentDirs != null) { + for (Path segmentDir : segmentDirs) { + MapFile.Reader[] readers = MapFileOutputFormat. + getReaders(fs, new Path(segmentDir, ImageWritable.IMAGE_DATA_DIR), conf); + if (readers != null) { + segments.put(segmentDir.getName(), readers); + } + } + } + } + + /** + * Loads the stored ImageWritable from disk. + * @param id identifier of the image to retrieve + */ + public ImageWritable getImage(String id) throws IOException { + // TODO: try the segment in which the parent doc resides first + Text key = new Text(id); + ImageWritable holder = new ImageWritable(); + Iterator<MapFile.Reader[]> it = segments.values().iterator(); + while (it.hasNext()) { + MapFile.Reader[] readers = it.next(); + for (MapFile.Reader reader : readers) { + ImageWritable result = (ImageWritable) reader.get(key, holder); + if (result != null) { + return result; + } + } + } + return null; + } + + public void close() throws IOException { + Iterator<MapFile.Reader[]> it = segments.values().iterator(); + while (it.hasNext()) { + MapFile.Reader[] readers = it.next(); + for (MapFile.Reader reader : readers) { + reader.close(); + } + } + } +} \ No newline at end of file Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java 2008-07-26 15:35:27 UTC (rev 2500) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java 2008-07-26 15:37:14 UTC (rev 2501) @@ -23,6 +23,7 @@ public String imageId; public String url; + public String parentUrl; public float docSim; public float proximity; Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java 2008-07-26 15:37:14 UTC (rev 2501) @@ -0,0 +1,25 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.IOException; + +/** Interface for loading image data from disk. */ +public interface ImageLoader { + public ImageWritable getImage(String id) throws IOException; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |