Revision: 2504 http://archive-access.svn.sourceforge.net/archive-access/?rev=2504&view=rev Author: miklosh Date: 2008-07-26 15:47:56 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Made modifications needed for using Hadoop 0.17. Display image metadata in search results. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-26 15:46:55 UTC (rev 2503) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-26 15:47:56 UTC (rev 2504) @@ -25,6 +25,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; @@ -40,13 +41,16 @@ import org.apache.lucene.store.FSDirectory; import org.apache.nutch.indexer.FsDirectory; import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; -public class ImageSearcherBean { +public class ImageSearcherBean implements ImageLoader { public static final Log LOG = LogFactory.getLog(ImageSearcherBean.class); private IndexReader reader; + private ImageDataReader imageReader; private Path baseDir; private Configuration conf; @@ -66,18 +70,9 @@ Path indexesDir = new Path(baseDir, "indexes"); if (this.fs.exists(indexesDir)) { Vector<Path> doneDirs = new Vector<Path>(); - Path[] dirs = fs.listPaths(indexesDir, new PathFilter() 
{ - - public boolean accept(Path f) { - try { - if (fs.isDirectory(f)) { - return true; - } - } catch (IOException ioe) { - } - return false; - } - }); + FileStatus[] fstats = fs.listStatus(indexesDir, + HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path[] dirs = HadoopFSUtil.getPaths(fstats); for (Path dir : dirs) { Path indexdone = new Path(dir, Indexer.DONE_NAME); if (fs.isFile(indexdone)) { @@ -95,6 +90,8 @@ Path[] indexDir = {new Path(baseDir, "index")}; init(indexDir); } + this.imageReader = new ImageDataReader(FileSystem.get(conf), + new Path(baseDir, "segments").toString(), conf); } /** Init given a set of indexes or just one index. */ @@ -124,12 +121,19 @@ if (reader != null) { reader.close(); } + if (imageReader != null) { + imageReader.close(); + } } public IndexReader getReader() { return reader; } + public ImageWritable getImage(String id) throws IOException { + return imageReader.getImage(id); + } + /** * Calculate the score for an image hit. * @param hit found hit @@ -245,6 +249,9 @@ nextDist = imgIndex < numDocImages-1 ? Math.abs(imagePositions[imgIndex+1] - pos) + (end-pos) : Integer.MAX_VALUE; } + if (imgIndex >= numDocImages) { + continue; + } // Check if this image is in the allowed proximity of the span if (dist > distThreshold) { if (LOG.isDebugEnabled()) { @@ -261,6 +268,7 @@ ImageHit newHit = new ImageHit(imageIds[imgIndex], imageUrls[imgIndex], currentDoc); newHit.docSim = docSim; newHit.docScore = docBoost; + newHit.parentUrl = doc.get("url"); newHit.proximity = Math.min(1.0f, 1.0f-((float)dist/maxDist)); newHit.score = scoreHit(newHit, doc); @@ -338,6 +346,10 @@ hits.getTotal() >= maxHits ? 
maxHits : (int)hits.getTotal()); for (ImageHit hit : top) { System.out.println(hit.score + " " + hit.url + " " + hit.imageId); + ImageWritable imageData = isb.getImage(hit.imageId); + if (imageData != null) { + System.out.println("[ " + imageData.getMetadata() + "]"); + } } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.