From: <mi...@us...> - 2008-08-16 11:48:41
|
Revision: 2553 http://archive-access.svn.sourceforge.net/archive-access/?rev=2553&view=rev Author: miklosh Date: 2008-08-16 11:48:50 +0000 (Sat, 16 Aug 2008) Log Message: ----------- Added image metadata indexing and image size filtering. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java 2008-08-14 03:26:36 UTC (rev 2552) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java 2008-08-16 11:48:50 UTC (rev 2553) @@ -22,9 +22,11 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.*; import org.apache.hadoop.util.*; import org.apache.nutch.parse.*; @@ -41,9 +43,11 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.LinkDb; import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.crawl.MapWritable; import org.apache.lucene.index.*; import org.apache.lucene.document.*; + import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.IndexingFilters; import org.apache.nutch.indexer.NutchSimilarity; @@ -57,6 +61,12 @@ Mapper<Text, Writable, Text, NutchWritable> { public static final String DONE_NAME = "index.done"; + // Keys of indexed image metadata + public static final String[] indexedImageMetadata = { + ImageSearch.IMAGE_WIDTH_KEY, + ImageSearch.IMAGE_HEIGHT_KEY, + ImageSearch.SIZE_KEY + }; public static final Log LOG = LogFactory.getLog(DocIndexer.class); /** A utility class used to pass a lucene document from Indexer.reduce @@ -192,6 +202,7 @@ ParseData parseData = null; ParseText parseText = null; Metadata imageUrlMapping = new Metadata(); + Map<String, Metadata> imageMetadata = new TreeMap<String, Metadata>(); while (values.hasNext()) { Writable value = values.next().get(); // unwrap if (value instanceof Inlinks) { @@ -215,14 +226,21 @@ parseData = (ParseData) value; } else if (value instanceof ParseText) { parseText = (ParseText) value; - } else if (value instanceof Metadata) { - // Add image URL->digest mapping - Metadata mapping = (Metadata) value; - String[] imageUrls = mapping.names(); - for (String imageUrl : imageUrls) { + } else if (value instanceof MapWritable) { + MapWritable map = (MapWritable) value; + Set<Writable> mapping = map.keySet(); + Iterator<Writable> keys = mapping.iterator(); + while (keys.hasNext()) { + Text keyValue = (Text) keys.next(); + String imageUrl = keyValue.toString(); + MapWritable metaMap = (MapWritable) map.get(keyValue); + // Add image URL->digest mapping if (imageUrlMapping.get(imageUrl) == null) { - imageUrlMapping.add(imageUrl, mapping.get(imageUrl)); + imageUrlMapping.add(imageUrl, metaMap.get(new Text(Metadata.SIGNATURE_KEY)).toString()); } + // Convert from MapWritable to Metadata + imageMetadata.put(imageUrl, + convertMapWritableToMetadata(metaMap)); } } else if (LOG.isWarnEnabled()) { LOG.warn("Unrecognized type: " + value.getClass()); @@ -256,6 +274,18 @@ continue; } parseMeta.add(ImageSearch.IMAGE_IDS_KEY, mappedTo); + if (imageMetadata.containsKey(imageUrl)) { + Metadata imageMeta = imageMetadata.get(imageUrl); + // copy metadata into parseMeta + for (String name : indexedImageMetadata) { + String value = imageMeta.get(name); + if (value != null) { + parseMeta.add(name, value); + } else { + parseMeta.add(name, "-"); + } + } + } } } @@ -308,16 +338,56 @@ output.collect(key, new LuceneDocumentWrapper(doc)); } + + /** + * Converts a MapWritable object into a Metadata object. + * @param map MapWritable to convert + * @return Metadata object + */ + private static Metadata convertMapWritableToMetadata(MapWritable map) { + Metadata result = new Metadata(); + Iterator<Writable> metaKey = map.keySet().iterator(); + while (metaKey.hasNext()) { + Writable keyWritable = metaKey.next(); + //Text keyWritable = (Text) metaKey.next(); + String keyString = keyWritable.toString(); + Writable metaValue = map.get(keyWritable); + result.add(keyString, metaValue.toString()); + } + return result; + } + /** - * Emits image URLs as keys and their URL+digest as values. + * Tries to find out the digest of an image from a Content object. + * @param content Content object to retrieve info from + * @return null if this Content does not hold a recognized image format */ + private static String getImageDigestFromContent(Content content) { + // Check MIME type + if (content.getContentType().contains("image/")) { + Metadata meta = content.getMetadata(); + // Using NutchWax.DIGEST_KEY here + String digest = meta.get("digest"); + if (digest == null) { + digest = meta.get(Metadata.SIGNATURE_KEY); + } + return digest; + } else { + return null; + } + } + + /** + * Emits parent URLs as keys and their associated image's data + * (URL+digest and metadata) as values. + */ public static class ImageUrlEmitter - implements Mapper<Text, Writable, Text, Text>, - Reducer<Text, Text, Text, Metadata> { + implements Mapper<Text, Writable, Text, NutchWritable>, + Reducer<Text, NutchWritable, Text, MapWritable> { public void map(Text key, Writable value, - OutputCollector<Text, Text> output, Reporter reporter) + OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { if (value instanceof ParseData) { @@ -326,49 +396,58 @@ String[] imageUrls = parseMeta.getValues(ImageSearch.IMAGE_URLS_KEY); if (imageUrls.length > 0) { for (String url : imageUrls) { - output.collect(new Text(url), key); + output.collect(new Text(url), new NutchWritable(key)); } } } else if (value instanceof Content) { Content content = (Content) value; - if (content.getContentType().contains("image/")) { - Metadata meta = content.getMetadata(); - // Using NutchWax.DIGEST_KEY here - String digest = meta.get("digest"); - if (digest == null) { - digest = meta.get(Metadata.SIGNATURE_KEY); - } - output.collect(new Text(content.getUrl()), new Text(digest)); + String digest = getImageDigestFromContent(content); + if (digest != null) { + output.collect(new Text(content.getUrl()), new NutchWritable(new Text(digest))); } + } else if (value instanceof MapWritable) { + output.collect(key, new NutchWritable(value)); } } - public void reduce(Text key, Iterator<Text> values, - OutputCollector<Text, Metadata> output, Reporter reporter) + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, MapWritable> output, Reporter reporter) throws IOException { - + Vector<Text> parents = new Vector<Text>(); - String imageUrl = key.toString(); - String imageDigest = null; + Text imageDigest = null; + MapWritable metaMap = null; while (values.hasNext()) { - Text data = values.next(); - String value = data.toString(); - // Determine type of value - if (value.contains("/")) { - // This value is a parent's key - parents.add(data); - } else { - // This value is a digest - imageDigest = value.toString(); + Writable value = values.next().get(); + if (value instanceof Text) { + Text data = (Text) value; + String content = data.toString(); + // Determine type of value + if (content.contains("/")) { + // This value is a parent's key + parents.add(data); + } else { + // This value is a digest + imageDigest = data; + } + } else if (value instanceof MapWritable) { + metaMap = (MapWritable) value; } } if (imageDigest != null) { - Metadata meta = new Metadata(); - meta.add(imageUrl, imageDigest); + MapWritable resultMap = null; + if (metaMap != null) { + resultMap = metaMap; + } else { + resultMap = new MapWritable(); + } + resultMap.put(new Text(Metadata.SIGNATURE_KEY), imageDigest); + MapWritable imageInfo = new MapWritable(); + imageInfo.put(key, resultMap); Iterator<Text> it = parents.iterator(); while (it.hasNext()) { Text parentKey = it.next(); - output.collect(parentKey, meta); + output.collect(parentKey, imageInfo); } } } @@ -377,6 +456,60 @@ public void close() {} } + /** + * Emits image URLs as keys and their associated metadata as values. + */ + public static class ImageMetaEmitter + implements Mapper<Text, Writable, Text, NutchWritable>, + Reducer<Text, NutchWritable, Text, MapWritable> { + + public void map(Text key, Writable value, + OutputCollector<Text, NutchWritable> output, Reporter reporter) + throws IOException { + + if (value instanceof ImageWritable) { + ImageWritable imageData = (ImageWritable) value; + output.collect(key, new NutchWritable(imageData.getMetadata())); + } else if (value instanceof Content) { + Content content = (Content) value; + String digest = getImageDigestFromContent(content); + if (digest != null) { + output.collect(new Text(digest), new NutchWritable( + new Text(content.getUrl()))); + } + } + } + + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, MapWritable> output, Reporter reporter) + throws IOException { + + Text imageUrl = null; + Metadata imageMeta = null; + while (values.hasNext()) { + Writable value = values.next().get(); + if (value instanceof Text) { + imageUrl = (Text) value; + } else if (value instanceof Metadata) { + imageMeta = (Metadata) value; + } + } + if (imageUrl != null && imageMeta != null) { + // Convert Metadata into MapWritable + MapWritable metaMap = new MapWritable(); + String[] names = imageMeta.names(); + for (String name : names) { + String value = imageMeta.get(name); + metaMap.put(new Text(name), new Text(value)); + } + output.collect(imageUrl, metaMap); + } + } + + public void configure(JobConf job) {} + public void close() {} + } + public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) throws IOException { @@ -385,28 +518,65 @@ LOG.info("DocIndexer: linkdb: " + linkDb); } + FileSystem fs = FileSystem.get(getConf()); /* + * Optional phase: adding image metadata + */ + Path metaDir = null; + JobConf job = new NutchJob(getConf()); + job.setJobName("imagemeta " + indexDir); + boolean haveImageData = false; + for (int i = 0; i < segments.length; i++) { + Path imageDataDir = new Path(segments[i], ImageWritable.IMAGE_DATA_DIR); + if (fs.exists(imageDataDir)) { + job.addInputPath(imageDataDir); + job.addInputPath(new Path(segments[i], Content.DIR_NAME)); + haveImageData = true; + } + } + if (haveImageData) { + metaDir = new Path("imgmeta-" + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + + job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(ImageMetaEmitter.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(NutchWritable.class); + job.setReducerClass(ImageMetaEmitter.class); + + job.setOutputPath(metaDir); + job.setOutputFormat(SequenceFileOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(MapWritable.class); + + JobClient.runJob(job); + } + + /* * First phase: determining image keys */ Path outDir = new Path("imgkeys-"+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - JobConf job = new NutchJob(getConf()); + job = new NutchJob(getConf()); job.setJobName("imagekeys " + indexDir); for (int i = 0; i < segments.length; i++) { job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); job.addInputPath(new Path(segments[i], Content.DIR_NAME)); } + if (metaDir != null) { + job.addInputPath(metaDir); + } job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(ImageUrlEmitter.class); job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(Text.class); + job.setMapOutputValueClass(NutchWritable.class); job.setReducerClass(ImageUrlEmitter.class); job.setOutputPath(outDir); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); - job.setOutputValueClass(Metadata.class); + job.setOutputValueClass(MapWritable.class); JobClient.runJob(job); @@ -441,8 +611,8 @@ JobClient.runJob(job); - FileSystem fs = FileSystem.get(getConf()); - fs.delete(outDir); + //fs.delete(metaDir); + //fs.delete(outDir); if (LOG.isInfoEnabled()) { LOG.info("DocIndexer: done"); Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-08-14 03:26:36 UTC (rev 2552) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-08-16 11:48:50 UTC (rev 2553) @@ -24,6 +24,9 @@ public static final String IMAGE_IDS_KEY = "image_ids"; public static final String IMAGE_POS_KEY = "image_pos"; public static final String IMAGE_URLS_KEY = "image_urls"; + public static final String IMAGE_WIDTH_KEY = "image_width"; + public static final String IMAGE_HEIGHT_KEY = "image_height"; + public static final String IMAGE_SIZE_KEY = "image_size"; // Image size category public static final String HAS_IMAGE_KEY = "has_image"; - public static final String SIZE_KEY = "size"; + public static final String SIZE_KEY = "size"; // File size of the image } Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java 2008-08-14 03:26:36 UTC (rev 2552) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java 2008-08-16 11:48:50 UTC (rev 2553) @@ -33,12 +33,15 @@ import org.apache.hadoop.conf.Configuration; import java.util.*; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.nutch.net.protocols.Response; import org.archive.nutchwax.imagesearch.ImageSearch; /** Adds image search related fields to a document. */ public class ImageIndexingFilter implements IndexingFilter { + public static final Log LOG = LogFactory.getLog(IndexingFilter.class); private Configuration conf; public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) @@ -59,8 +62,6 @@ doc.add(new Field(ImageSearch.ALT_TEXT_KEY, altText, Field.Store.NO, Field.Index.TOKENIZED)); } } - // Index image size - //... return doc; } else { String contentType = parse.getData().getMeta(Response.CONTENT_TYPE); @@ -74,23 +75,50 @@ if (imagePositions.length == 0) { // No images in this document doc.add(new Field(ImageSearch.HAS_IMAGE_KEY, "0", Field.Store.YES, Field.Index.TOKENIZED)); return doc; + } + // Filter based on image size + BitSet filteredIndexes = new BitSet(imagePositions.length); + int minWidth = conf.getInt("imagesearcher.indexer.minWidth", 10); + int minHeight = conf.getInt("imagesearcher.indexer.minHeight", 10); + String[] widths = metadata.getValues(ImageSearch.IMAGE_WIDTH_KEY); + String[] heights = metadata.getValues(ImageSearch.IMAGE_HEIGHT_KEY); + String[] imageUrls = metadata.getValues(ImageSearch.IMAGE_URLS_KEY); + + if (widths.length > 0 && widths.length == heights.length) { + for (int i = 0; i < widths.length; i++) { + if (Integer.parseInt(widths[i]) < minWidth || + Integer.parseInt(heights[i]) < minHeight) { + filteredIndexes.set(i); + if (LOG.isDebugEnabled()) { + LOG.debug("Filtered image " + imageUrls[i] + " " + + widths[i] + "x" + heights[i]); + } + } + } + } + + // Check if all images have been filtered + if (filteredIndexes.cardinality() == imagePositions.length) { + doc.add(new Field(ImageSearch.HAS_IMAGE_KEY, "0", Field.Store.YES, Field.Index.TOKENIZED)); + return doc; } else { doc.add(new Field(ImageSearch.HAS_IMAGE_KEY, "1", Field.Store.YES, Field.Index.TOKENIZED)); - for (String imagePos : imagePositions) { - doc.add(new Field(ImageSearch.IMAGE_POS_KEY, imagePos, + } + + // Add other image search related fields to the document + String[] imageIds = metadata.getValues(ImageSearch.IMAGE_IDS_KEY); + for (int i = 0; i < imagePositions.length; i++) { + if (filteredIndexes.get(i)) { + continue; + } else { + doc.add(new Field(ImageSearch.IMAGE_POS_KEY, imagePositions[i], Field.Store.YES, Field.Index.NO)); + doc.add(new Field(ImageSearch.IMAGE_IDS_KEY, imageIds[i], + Field.Store.YES, Field.Index.NO)); + doc.add(new Field(ImageSearch.IMAGE_URLS_KEY, imageUrls[i], + Field.Store.YES, Field.Index.TOKENIZED)); } } - String[] imageIds = metadata.getValues(ImageSearch.IMAGE_IDS_KEY); - for (String imageId : imageIds) { - doc.add(new Field(ImageSearch.IMAGE_IDS_KEY, imageId, - Field.Store.YES, Field.Index.NO)); - } - String[] imageUrls = metadata.getValues(ImageSearch.IMAGE_URLS_KEY); - for (String imageUrl : imageUrls) { - doc.add(new Field(ImageSearch.IMAGE_URLS_KEY, imageUrl, - Field.Store.YES, Field.Index.TOKENIZED)); - } return doc; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |