Revision: 2504
http://archive-access.svn.sourceforge.net/archive-access/?rev=2504&view=rev
Author: miklosh
Date: 2008-07-26 15:47:56 +0000 (Sat, 26 Jul 2008)
Log Message:
-----------
Made modifications needed for using Hadoop 0.17.
Display image metadata in search results.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java
Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-26 15:46:55 UTC (rev 2503)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-26 15:47:56 UTC (rev 2504)
@@ -25,6 +25,7 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
@@ -40,13 +41,16 @@
import org.apache.lucene.store.FSDirectory;
import org.apache.nutch.indexer.FsDirectory;
import org.apache.nutch.indexer.Indexer;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
-public class ImageSearcherBean {
+public class ImageSearcherBean implements ImageLoader {
public static final Log LOG = LogFactory.getLog(ImageSearcherBean.class);
private IndexReader reader;
+ private ImageDataReader imageReader;
private Path baseDir;
private Configuration conf;
@@ -66,18 +70,9 @@
Path indexesDir = new Path(baseDir, "indexes");
if (this.fs.exists(indexesDir)) {
Vector<Path> doneDirs = new Vector<Path>();
- Path[] dirs = fs.listPaths(indexesDir, new PathFilter() {
-
- public boolean accept(Path f) {
- try {
- if (fs.isDirectory(f)) {
- return true;
- }
- } catch (IOException ioe) {
- }
- return false;
- }
- });
+ FileStatus[] fstats = fs.listStatus(indexesDir,
+ HadoopFSUtil.getPassDirectoriesFilter(fs));
+ Path[] dirs = HadoopFSUtil.getPaths(fstats);
for (Path dir : dirs) {
Path indexdone = new Path(dir, Indexer.DONE_NAME);
if (fs.isFile(indexdone)) {
@@ -95,6 +90,8 @@
Path[] indexDir = {new Path(baseDir, "index")};
init(indexDir);
}
+ this.imageReader = new ImageDataReader(FileSystem.get(conf),
+ new Path(baseDir, "segments").toString(), conf);
}
/** Init given a set of indexes or just one index. */
@@ -124,12 +121,19 @@
if (reader != null) {
reader.close();
}
+ if (imageReader != null) {
+ imageReader.close();
+ }
}
public IndexReader getReader() {
return reader;
}
+ public ImageWritable getImage(String id) throws IOException {
+ return imageReader.getImage(id);
+ }
+
/**
* Calculate the score for an image hit.
* @param hit found hit
@@ -245,6 +249,9 @@
nextDist = imgIndex < numDocImages-1 ?
Math.abs(imagePositions[imgIndex+1] - pos) + (end-pos) : Integer.MAX_VALUE;
}
+ if (imgIndex >= numDocImages) {
+ continue;
+ }
// Check if this image is in the allowed proximity of the span
if (dist > distThreshold) {
if (LOG.isDebugEnabled()) {
@@ -261,6 +268,7 @@
ImageHit newHit = new ImageHit(imageIds[imgIndex], imageUrls[imgIndex], currentDoc);
newHit.docSim = docSim;
newHit.docScore = docBoost;
+ newHit.parentUrl = doc.get("url");
newHit.proximity = Math.min(1.0f, 1.0f-((float)dist/maxDist));
newHit.score = scoreHit(newHit, doc);
@@ -338,6 +346,10 @@
hits.getTotal() >= maxHits ? maxHits : (int)hits.getTotal());
for (ImageHit hit : top) {
System.out.println(hit.score + " " + hit.url + " " + hit.imageId);
+ ImageWritable imageData = isb.getImage(hit.imageId);
+ if (imageData != null) {
+ System.out.println("[ " + imageData.getMetadata() + "]");
+ }
}
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.