From: <mi...@us...> - 2008-07-17 10:29:28
|
Revision: 2458 http://archive-access.svn.sourceforge.net/archive-access/?rev=2458&view=rev Author: miklosh Date: 2008-07-17 10:29:29 +0000 (Thu, 17 Jul 2008) Log Message: ----------- Modified ImageIndexer to work with NutchWAX 0.12. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java 2008-07-17 10:18:46 UTC (rev 2457) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java 2008-07-17 10:29:29 UTC (rev 2458) @@ -50,7 +50,6 @@ import org.apache.nutch.indexer.NutchSimilarity; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.net.protocols.Response; /** Create indexes for segments. */ public class ImageIndexer extends Configured implements Tool, @@ -207,13 +206,14 @@ public void reduce(Text key, Iterator<WrappedWritable> values, OutputCollector<Text, Writable> output, Reporter reporter) throws IOException { + Inlinks inlinks = null; CrawlDatum dbDatum = null; CrawlDatum fetchDatum = null; ParseData parseData = null; ParseText parseText = null; - Metadata metadata = null; + Metadata metadata = new Metadata(); Metadata contentMetadata = null; String segmentName = null; String signature = null; @@ -221,12 +221,9 @@ Writable value = values.next().get(); if (value instanceof ImageWritable) { ImageWritable imgData = (ImageWritable) value; - Metadata imgMeta = imgData.getMetadata(); - if (metadata == null) { - metadata = imgMeta; - } else { - mergeMetadata(imgMeta, metadata); - } + mergeMetadata(imgData.getMetadata(), metadata); + } else if (value instanceof Text) { // Got parent's key + metadata.add(ImageSearch.PARENT_URL_KEY, value.toString()); } else if (value instanceof Inlinks) { inlinks = (Inlinks) value; } else if (value instanceof CrawlDatum) { @@ -245,27 +242,12 @@ throw new RuntimeException("Unexpected status: " + datum.getStatus()); } } else if (value instanceof ParseData) { - if (parseData != null) { - ParseData newParse = (ParseData) value; - Metadata parseMeta = newParse.getParseMeta(); - // Check if this is the parse meta from ImageParseFilter - // If so, use its parse meta, otherwise use the content meta - if (parseMeta.get(ImageSearch.PARENT_URL_KEY) != null) { - mergeMetadata(parseMeta, metadata); - } else { - contentMetadata = newParse.getContentMeta(); - } - } else { - parseData = (ParseData) value; - metadata = parseData.getParseMeta(); - contentMetadata = parseData.getContentMeta(); - } + parseData = (ParseData) value; + mergeMetadata(parseData.getParseMeta(), metadata); + contentMetadata = parseData.getContentMeta(); } else if (value instanceof ParseText) { - ParseText newParseText = (ParseText) value; - if (parseText == null || (parseText != null && - parseText.getText().length() < newParseText.getText().length())) { - parseText = (ParseText) value; - } + parseText = (ParseText) value; + metadata.add(ImageSearch.ALT_TEXT_KEY, parseText.getText()); } else if (LOG.isWarnEnabled()) { LOG.warn("Unrecognized type: " + value.getClass()); } @@ -284,10 +266,11 @@ } } - if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) { + if (fetchDatum == null || dbDatum == null || parseData == null) { return; // only have inlinks } - if (!parseData.getStatus().isSuccess() || + + if (/*!parseData.getStatus().isSuccess() ||*/ // ImageParser returns NOTPARSED fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) { return; } @@ -296,6 +279,14 @@ if (metadata.get(ImageSearch.PARENT_URL_KEY) == null) { return; } + + if (segmentName == null || signature == null) { + if (LOG.isInfoEnabled()) { + LOG.info("Skipping " + key + " with segmentName=" + segmentName + + " signature=" + signature); + } + return; + } // Make sure segment name and signature are set contentMetadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); contentMetadata.set(Nutch.SIGNATURE_KEY, signature); @@ -385,6 +376,7 @@ job.setMapperClass(ImageIndexer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(WrappedWritable.class); + job.setOutputValueGroupingComparator(MixedOutputKeyComparator.class); job.setReducerClass(ImageIndexer.class); job.setOutputPath(indexDir); @@ -424,9 +416,63 @@ return -1; } } + + /** Compares keys such that a key=URL can be matched with a key=URL+digest. */ + private static class MixedOutputKeyComparator extends WritableComparator { + private static final String DIGEST_PREFIX = " sha1:"; + MixedOutputKeyComparator() { + super(Text.class); + } + + @Override + public int compare(WritableComparable a, WritableComparable b) { + Text key1 = (Text) a; + Text key2 = (Text) b; + int pos1 = key1.find(DIGEST_PREFIX); + int pos2 = key2.find(DIGEST_PREFIX); + if ( (pos1 > 0 && pos2 > 0) || (pos1 == -1 && pos2 == -1) ) { + // Both are keys of the same type + return key1.compareTo(key2); + } else { + int len1 = key1.getLength(); + int len2 = key2.getLength(); + if (pos1 == -1) { + return WritableComparator.compareBytes(key1.getBytes(), 0, + len1, key2.getBytes(), 0, pos2); + } else { + return WritableComparator.compareBytes(key2.getBytes(), 0, + len2, key1.getBytes(), 0, pos1); + } + } + } + } + public void map(Text key, Writable value, OutputCollector<Text, WrappedWritable> output, Reporter reporter) throws IOException { - output.collect(key, new WrappedWritable(value)); + + if (value instanceof ParseData) { + ParseData parseData = (ParseData) value; + Metadata parseMeta = parseData.getParseMeta(); + String[] imageUrls = parseMeta.getValues(ImageSearch.IMAGE_URLS_KEY); + // Emit image info with the image's URL as key + if (imageUrls != null && imageUrls.length > 0) { + String[] alts = parseMeta.getValues(ImageSearch.ALT_TEXT_KEY); + for (int i = 0; i < imageUrls.length; i++) { + Text imageKey = new Text(imageUrls[i]); + if (alts[i].length() > 0) { + ParseText parseText = new ParseText(alts[i]); + // Emit alternate text as ParseText + output.collect(imageKey, new WrappedWritable(parseText)); + } + // Emit parent's key + output.collect(imageKey, new WrappedWritable(key)); + } + } else { + output.collect(key, new WrappedWritable(value)); + } + } else { + output.collect(key, new WrappedWritable(value)); + } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |