From: <mi...@us...> - 2008-07-11 14:00:47
Revision: 2427
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2427&view=rev
Author:   miklosh
Date:     2008-07-11 07:00:57 -0700 (Fri, 11 Jul 2008)

Log Message:
-----------
Initial commit of the image search contrib.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/imagesearch/
    trunk/archive-access/projects/nutchwax/imagesearch/README.txt
    trunk/archive-access/projects/nutchwax/imagesearch/bin/
    trunk/archive-access/projects/nutchwax/imagesearch/build.xml
    trunk/archive-access/projects/nutchwax/imagesearch/conf/
    trunk/archive-access/projects/nutchwax/imagesearch/lib/
    trunk/archive-access/projects/nutchwax/imagesearch/src/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ThumbnailGenerator.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/WrappedWritable.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/build-plugin.xml
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/build.xml
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/plugin.xml
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParser.java
Added: trunk/archive-access/projects/nutchwax/imagesearch/README.txt
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/README.txt	(rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/README.txt	2008-07-11 14:00:57 UTC (rev 2427)
@@ -0,0 +1,59 @@
+Nutch(WAX) Image Search Contrib
+===============================
+
+Getting the source
+------------------
+Check out Nutch-1.0-dev as usual, then check out the image search
+contrib into Nutch's "contrib" directory.
+
+  $ cd contrib
+  $ svn checkout http://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/nutchwax/imagesearch
+
+This will create a sub-directory named "imagesearch" containing the
+sources for the image search contrib.
+
+
+Configuring
+-----------
+Enable the 'image-search' plugin in Nutch's configuration by appending
+it to the 'plugin.includes' property. This registers three plugins:
+  o ImageParseFilter (HTML parse filter)
+  o ImageParser (fake JPEG and GIF parser)
+  o ImageIndexingFilter
+
+If you are using NutchWAX 0.12 or newer, make sure you add the following
+line to the 'indexingfilter.order' property:
+
+  org.archive.nutchwax.imagesearch.plugin.ImageIndexingFilter
+
+
+Build and install
+-----------------
+Build the contrib by executing the 'ant' build command in
+
+  nutch/contrib/imagesearch
+
+as you normally would.
+
+For example:
+
+  $ cd nutch/contrib/imagesearch
+  $ ant tar
+
+This command will build all of Nutch, then the image search add-ons, and
+finally package everything up into the "nutch-1.0-dev.tar.gz"
+release package.
+
+Then install the "nutch-1.0-dev.tar.gz" tarball as normal.
+
+
+Searching
+---------
+After performing the usual steps to import or fetch the files, invert
+the links and index the documents, you can search the resulting indexes
+for images by running:
+
+  bin/nutch org.archive.nutchwax.imagesearch.ImageSearcherBean product
+
+This calls the ImageSearcherBean to execute a simple keyword search for
+"product".

Added: trunk/archive-access/projects/nutchwax/imagesearch/build.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/build.xml	(rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/build.xml	2008-07-11 14:00:57 UTC (rev 2427)
@@ -0,0 +1,138 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+--> +<project name="nutchwax-imagesearch" default="job"> + + <property name="nutch.dir" value="../../" /> + + <property name="src.dir" value="src" /> + <property name="lib.dir" value="lib" /> + <property name="build.dir" value="${nutch.dir}/build" /> + <!-- HACK: Need to import default.properties like Nutch does --> + <property name="dist.dir" value="${build.dir}/nutch-1.0-dev" /> + + <target name="nutch-compile-core"> + <ant dir="${nutch.dir}" target="compile-core" inheritAll="false" /> + </target> + + <target name="nutch-compile-plugins"> + <ant dir="${nutch.dir}" target="compile-plugins" inheritAll="false" /> + </target> + + <target name="compile-core" depends="nutch-compile-core"> + <javac + destdir="${build.dir}/classes" + debug="true" + verbose="false" + source="1.5" + target="1.5" + encoding="UTF-8" + fork="true" + nowarn="true" + deprecation="false"> + <src path="${src.dir}/java" /> + <include name="**/*.java" /> + <classpath> + <pathelement location="${build.dir}/classes" /> + <fileset dir="${lib.dir}"> + <include name="*.jar"/> + </fileset> + <fileset dir="${nutch.dir}/lib"> + <include name="*.jar"/> + </fileset> + </classpath> + </javac> + </target> + + <target name="compile-plugins"> + <ant dir="src/plugin" target="deploy" inheritAll="false" /> + </target> + + <!-- + These targets all call down to the corresponding target in the + Nutch build.xml file. This way all of the 'ant' build commands + can be executed from this directory and everything should get + built as expected. + --> + <target name="compile" depends="compile-core, compile-plugins, nutch-compile-plugins"> + </target> + + <target name="jar" depends="compile-core"> + <ant dir="${nutch.dir}" target="jar" inheritAll="false" /> + </target> + + <target name="job" depends="compile"> + <ant dir="${nutch.dir}" target="job" inheritAll="false" /> + </target> + + <target name="war" depends="compile"> + <ant dir="${nutch.dir}" target="war" inheritAll="false" /> + </target> + + <target name="javadoc" depends="compile"> + <ant dir="${nutch.dir}" target="javadoc" inheritAll="false" /> + </target> + + <target name="tar" depends="package"> + <ant dir="${nutch.dir}" target="tar" inheritAll="false" /> + </target> + + <target name="clean"> + <ant dir="${nutch.dir}" target="clean" inheritAll="false" /> + </target> + + <!-- This one does a little more after calling down to the relevant + Nutch target. After Nutch has copied everything into the + distribution directory, we add our script, libraries, etc. + + Rather than over-write the standard Nutch configuration files, + we place ours in a newly created directory + + contrib/imagesearch/conf + + and let the individual user decide whether or not to + incorporate our modifications. 
+ --> + <target name="package" depends="jar, job, war, javadoc"> + <ant dir="${nutch.dir}" target="package" inheritAll="false" /> + + <copy todir="${dist.dir}/lib" includeEmptyDirs="false"> + <fileset dir="lib"/> + </copy> + + <copy todir="${dist.dir}/bin"> + <fileset dir="bin"/> + </copy> + + <chmod perm="ugo+x" type="file"> + <fileset dir="${dist.dir}/bin"/> + </chmod> + + <mkdir dir="${dist.dir}/contrib/imagesearch/conf"/> + <copy todir="${dist.dir}/contrib/imagesearch/conf"> + <fileset dir="conf" /> + </copy> + + <copy todir="${dist.dir}/contrib/imagesearch"> + <fileset dir="."> + <include name="*.txt" /> + </fileset> + </copy> + + </target> + +</project> Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +public class ImageHit { + public int doc; + public float docScore; + + public String imageId; + public String url; + + public float docSim; + public float proximity; + public float score; + + public ImageHit(String id, String url, int doc) { + this.imageId = id; + this.url = url; + this.doc = doc; + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import org.apache.lucene.util.PriorityQueue; + +final class ImageHitQueue extends PriorityQueue { + + ImageHitQueue(int size) { + initialize(size); + } + + protected final boolean lessThan(Object a, Object b) { + ImageHit hitA = (ImageHit)a; + ImageHit hitB = (ImageHit)b; + if (hitA.score == hitB.score) { + return hitA.doc > hitB.doc; + } else { + return hitA.score < hitB.score; + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,70 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +/** A set of image hits matching a query. Adapted from Nutch's Hits class. */ +public final class ImageHits { + + private long total; + private boolean totalIsExact = true; + private ImageHit[] top; + + public ImageHits() { + } + + public ImageHits(long total, ImageHit[] top) { + this.total = total; + this.top = top; + } + + /** Returns the total number of hits for this query. This may be an estimate + * when (@link #totalIsExact()} is false. */ + public long getTotal() { + return total; + } + + /** True if {@link #getTotal()} gives the exact number of hits, or false if + * it is only an estimate of the total number of hits. */ + public boolean totalIsExact() { + return totalIsExact; + } + + /** Set {@link #totalIsExact()}. */ + public void setTotalIsExact(boolean isExact) { + totalIsExact = isExact; + } + + /** Returns the number of hits included in this current listing. */ + public int getLength() { + return top.length; + } + + /** Returns the <code>i</code><sup>th</sup> hit in this list. */ + public ImageHit getHit(int i) { + return top[i]; + } + + /** Returns a subset of the hit objects. 
*/ + public ImageHit[] getHits(int start, int length) { + ImageHit[] results = new ImageHit[length]; + for (int i = 0; i < length; i++) { + results[i] = top[start + i]; + } + return results; + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,432 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.analysis.*; + +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.CrawlDb; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.crawl.LinkDb; +import org.apache.nutch.crawl.NutchWritable; + +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilters; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.protocols.Response; + +/** Create indexes for segments. */ +public class ImageIndexer extends Configured implements Tool, + Reducer<Text, WrappedWritable, Text, Writable>, Mapper<Text, Writable, Text, WrappedWritable> { + + public static final String DONE_NAME = "index.done"; + public static final Log LOG = LogFactory.getLog(ImageIndexer.class); + + /** A utility class used to pass a lucene document from Indexer.reduce + * to Indexer.OutputFormat. + * Note: Despite its name, it can't properly wrap a lucene document - it + * doesn't know how to serialize/deserialize a lucene document. 
+ */ + private static class LuceneDocumentWrapper implements Writable { + + private Document doc; + + public LuceneDocumentWrapper(Document doc) { + this.doc = doc; + } + + public Document get() { + return doc; + } + + public void readFields(DataInput in) throws IOException { + // intentionally left blank + } + + public void write(DataOutput out) throws IOException { + // intentionally left blank + } + } + + /** Unwrap Lucene Documents created by reduce and add them to an index. */ + public static class OutputFormat + extends org.apache.hadoop.mapred.OutputFormatBase<WritableComparable, LuceneDocumentWrapper> { + + public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job, + String name, final Progressable progress) throws IOException { + final Path perm = new Path(job.getOutputPath(), name); + final Path temp = + job.getLocalPath("index/_" + Integer.toString(new Random().nextInt())); + + fs.delete(perm); // delete old, if any + + final AnalyzerFactory factory = new AnalyzerFactory(job); + final IndexWriter writer = // build locally first + new IndexWriter(fs.startLocalOutput(perm, temp).toString(), + new NutchDocumentAnalyzer(job), true); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000)); + writer.setInfoStream(LogUtil.getInfoStream(LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() { + + boolean closed; + + public void write(WritableComparable key, LuceneDocumentWrapper value) + throws IOException { // unwrap & index doc + Document doc = value.get(); + NutchAnalyzer analyzer = factory.get(doc.get("lang")); + if (LOG.isInfoEnabled()) { + LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" + + " with analyzer " + analyzer + + " (" + doc.get("lang") + ")"); + } + writer.addDocument(doc, analyzer); + progress.progress(); + } + + public void close(final Reporter reporter) throws IOException { + // spawn a thread to give progress heartbeats + Thread prog = new Thread() { + + public void run() { + while (!closed) { + try { + reporter.setStatus("closing"); + Thread.sleep(1000); + } catch (InterruptedException e) { + continue; + } catch (Throwable e) { + return; + } + } + } + }; + + try { + prog.start(); + if (LOG.isInfoEnabled()) { + LOG.info("Optimizing index."); + } + // optimize & close index + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, DONE_NAME)); + } finally { + closed = true; + } + } + }; + } + } + private IndexingFilters filters; + private ScoringFilters scfilters; + + public ImageIndexer() { + } + + public ImageIndexer(Configuration conf) { + setConf(conf); + } + + public void configure(JobConf job) { + setConf(job); + this.filters = new IndexingFilters(getConf()); + this.scfilters = new ScoringFilters(getConf()); + } + + public void close() { + } + + /** + * Copies key/value pairs from one metadata container to another. + * Overwrites the destination if the source has a value with greater length. 
+ * + * @param from Metadata to copy from + * @param to target metadata container + */ + private void mergeMetadata(Metadata from, Metadata to) { + String[] names = from.names(); + for (String name : names) { + String newValue = from.get(name); + String value = to.get(name); + if (value != null) { + if (newValue.length() > value.length()) { + to.set(name, newValue); + } + } else { + to.add(name, newValue); + } + } + } + + public void reduce(Text key, Iterator<WrappedWritable> values, + OutputCollector<Text, Writable> output, Reporter reporter) + throws IOException { + Inlinks inlinks = null; + CrawlDatum dbDatum = null; + CrawlDatum fetchDatum = null; + ParseData parseData = null; + ParseText parseText = null; + + Metadata metadata = null; + Metadata contentMetadata = null; + String segmentName = null; + String signature = null; + while (values.hasNext()) { + Writable value = values.next().get(); + if (value instanceof ImageWritable) { + ImageWritable imgData = (ImageWritable) value; + Metadata imgMeta = imgData.getMetadata(); + if (metadata == null) { + metadata = imgMeta; + } else { + mergeMetadata(imgMeta, metadata); + } + } else if (value instanceof Inlinks) { + inlinks = (Inlinks) value; + } else if (value instanceof CrawlDatum) { + CrawlDatum datum = (CrawlDatum) value; + if (CrawlDatum.hasDbStatus(datum)) { + dbDatum = datum; + } else if (CrawlDatum.hasFetchStatus(datum)) { + // don't index unmodified (empty) pages + if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) { + fetchDatum = datum; + } + } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() || + CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) { + continue; + } else { + throw new RuntimeException("Unexpected status: " + datum.getStatus()); + } + } else if (value instanceof ParseData) { + if (parseData != null) { + ParseData newParse = (ParseData) value; + Metadata parseMeta = newParse.getParseMeta(); + // Check if this is the parse meta from ImageParseFilter + // If so, use its parse meta, otherwise use the content meta + if (parseMeta.get(ImageSearch.PARENT_URL_KEY) != null) { + mergeMetadata(parseMeta, metadata); + } else { + contentMetadata = newParse.getContentMeta(); + } + } else { + parseData = (ParseData) value; + metadata = parseData.getParseMeta(); + contentMetadata = parseData.getContentMeta(); + } + } else if (value instanceof ParseText) { + ParseText newParseText = (ParseText) value; + if (parseText == null || (parseText != null && + parseText.getText().length() < newParseText.getText().length())) { + parseText = (ParseText) value; + } + } else if (LOG.isWarnEnabled()) { + LOG.warn("Unrecognized type: " + value.getClass()); + } + // Save segment name and signature + if (contentMetadata != null) { + if (segmentName == null || signature == null) { + String stringValue = contentMetadata.get(Nutch.SEGMENT_NAME_KEY); + if (stringValue != null) { + segmentName = stringValue; + } + stringValue = contentMetadata.get(Nutch.SIGNATURE_KEY); + if (stringValue != null) { + signature = stringValue; + } + } + } + } + + if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) { + return; // only have inlinks + } + if (!parseData.getStatus().isSuccess() || + fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) { + return; + } + + // Skip possibly non-images + if (metadata.get(ImageSearch.PARENT_URL_KEY) == null) { + return; + } + // Make sure segment name and signature are set + contentMetadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); + contentMetadata.set(Nutch.SIGNATURE_KEY, 
signature); + + Document doc = new Document(); + + // add segment, used to map from merged index back to segment files + doc.add(new Field("segment", contentMetadata.get(Nutch.SEGMENT_NAME_KEY), + Field.Store.YES, Field.Index.NO)); + + // add digest, used by dedup + doc.add(new Field("digest", contentMetadata.get(Nutch.SIGNATURE_KEY), + Field.Store.YES, Field.Index.NO)); + + ParseData combinedParseData = new ParseData(parseData.getStatus(), + parseData.getTitle(), parseData.getOutlinks(), contentMetadata, + metadata); + + Parse parse = new ParseImpl(parseText, combinedParseData); + try { + // extract information from dbDatum and pass it to + // fetchDatum so that indexing filters can use it + Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + if (url != null) { + fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url); + } + // run indexing filters + doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks); + } catch (IndexingException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error indexing " + key + ": " + e); + } + return; + } + + // skip documents discarded by indexing filters + if (doc == null) { + return; + } + + float boost = 1.0f; + // run scoring filters + try { + boost = this.scfilters.indexerScore((Text) key, doc, dbDatum, + fetchDatum, parse, inlinks, boost); + } catch (ScoringFilterException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error calculating score " + key + ": " + e); + } + return; + } + // apply boost to all indexed fields. + doc.setBoost(boost); + // store boost for use by explain and dedup + doc.add(new Field("boost", Float.toString(boost), + Field.Store.YES, Field.Index.NO)); + + output.collect(key, new LuceneDocumentWrapper(doc)); + } + + public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) + throws IOException { + + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: starting"); + LOG.info("ImageIndexer: linkdb: " + linkDb); + } + + JobConf job = new NutchJob(getConf()); + job.setJobName("index " + indexDir); + + for (int i = 0; i < segments.length; i++) { + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: adding segment: " + segments[i]); + } + job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME)); + job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME)); + job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); + job.addInputPath(new Path(segments[i], ParseText.DIR_NAME)); + job.addInputPath(new Path(segments[i], ImageWritable.IMAGE_DATA_DIR)); + } + + job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME)); + job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME)); + job.setInputFormat(SequenceFileInputFormat.class); + + job.setMapperClass(ImageIndexer.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(WrappedWritable.class); + job.setReducerClass(ImageIndexer.class); + + job.setOutputPath(indexDir); + job.setOutputFormat(OutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(NutchWritable.class); + + JobClient.runJob(job); + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: done"); + } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new ImageIndexer(), args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + + if (args.length < 4) { + System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ..."); + return -1; + } + + Path[] segments = new Path[args.length - 3]; + for (int i = 3; i < 
args.length; i++) { + segments[i - 3] = new Path(args[i]); + } + + try { + index(new Path(args[0]), new Path(args[1]), new Path(args[2]), + segments); + return 0; + } catch (Exception e) { + LOG.fatal("ImageIndexer: " + StringUtils.stringifyException(e)); + return -1; + } + } + + public void map(Text key, Writable value, + OutputCollector<Text, WrappedWritable> output, Reporter reporter) throws IOException { + output.collect(key, new WrappedWritable(value)); + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,136 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.IOException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +public class ImageProcessor extends Configured implements Tool, + Mapper<Text, Content, Text, ImageWritable> { + + private static final Log LOG = LogFactory.getLog(ImageProcessor.class); + + private int thumbQuality; + private int thumbMaxSize; + + ImageProcessor() {} + ImageProcessor(Configuration conf) { + setConf(conf); + } + + public void map(Text key, Content content, + OutputCollector<Text, ImageWritable> output, + Reporter reporter) throws IOException { + + Metadata metadata = new Metadata(); + // Check content type + if (!content.getContentType().contains("image/")) { + return; + } + + // Generate thumbnail + byte[] data = content.getContent(); + StoredImage thumb = ThumbnailGenerator.generateThumbnail(data, + thumbMaxSize, thumbMaxSize, thumbQuality, metadata); + + // Create and setup an ImageWritable + 
ImageWritable image = new ImageWritable(key.toString()); + image.setMetadata(metadata); + image.setThumbnail(thumb); + + output.collect(key, image); + } + + public void processImageContent(Path segment) + throws IOException { + + JobConf job = new NutchJob(getConf()); + job.setJobName("ImageProcessor " + segment); + + if (LOG.isInfoEnabled()) { + LOG.info("ImageProcessor: processing " + segment); + } + job.addInputPath(new Path(segment, Content.DIR_NAME)); + + job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(ImageProcessor.class); + + job.setOutputPath(new Path(segment, ImageWritable.IMAGE_DATA_DIR)); + job.setOutputFormat(MapFileOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(ImageWritable.class); + + JobClient.runJob(job); + + if (LOG.isInfoEnabled()) { + LOG.info("ImageProcessor: done"); + } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), + new ImageProcessor(), args); + System.exit(res); + } + public int run(String[] args) throws Exception { + + if (args.length == 0) { + System.err.println("Usage: imageprocessor <segment>"); + return -1; + } + + Path segment = new Path(args[0]); + try { + processImageContent(segment); + return 0; + } catch (Exception e) { + LOG.fatal("ImageProcessor: " + StringUtils.stringifyException(e)); + return -1; + } + } + + private Configuration conf; + public void configure(JobConf conf) { + setConf(conf); + + this.thumbQuality = conf.getInt("imagesearcher.thumbnail.quality", 50); + this.thumbMaxSize = conf.getInt("imagesearcher.thumbnail.maxSize", 100); + } + + public void close() throws IOException { + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.imagesearch; + +public class ImageSearch { + public static final String PARENT_URL_KEY = "parent_url"; + public static final String ALT_TEXT_KEY = "alt"; + + public static final String IMAGE_IDS_KEY = "image_ids"; + public static final String IMAGE_POS_KEY = "image_pos"; + public static final String IMAGE_URLS_KEY = "image_urls"; + public static final String HAS_IMAGE_KEY = "has_image"; +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,346 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.File; +import java.io.IOException; +import java.lang.Math; +import java.util.Iterator; +import java.util.Vector; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.indexer.FsDirectory; +import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.util.NutchConfiguration; + +public class ImageSearcherBean { + + public static final Log LOG = LogFactory.getLog(ImageSearcherBean.class); + + private IndexReader reader; + + private Path baseDir; + private Configuration conf; + private FileSystem fs; + + private int distThreshold; // Maximum allowed distance of image from hit + // to be considered + + /** Construct given a configuration. 
*/ + public ImageSearcherBean(Configuration conf) throws IOException { + this.conf = conf; + this.fs = FileSystem.get(conf); + this.baseDir = new Path(conf.get("searcher.dir", "crawl")); + this.distThreshold = conf.getInt("imagesearch.maxDist", 300); + + // Try to load unmerged indexes + Path indexesDir = new Path(baseDir, "indexes"); + if (this.fs.exists(indexesDir)) { + Vector<Path> doneDirs = new Vector<Path>(); + Path[] dirs = fs.listPaths(indexesDir, new PathFilter() { + + public boolean accept(Path f) { + try { + if (fs.isDirectory(f)) { + return true; + } + } catch (IOException ioe) { + } + return false; + } + }); + for (Path dir : dirs) { + Path indexdone = new Path(dir, Indexer.DONE_NAME); + if (fs.isFile(indexdone)) { + doneDirs.add(dir); + } + } + dirs = new Path[doneDirs.size()]; + Iterator<Path> it = doneDirs.iterator(); + int i = 0; + while (it.hasNext()) { + dirs[i++] = it.next(); + } + init(dirs); + } else { + Path[] indexDir = {new Path(baseDir, "index")}; + init(indexDir); + } + } + + /** Init given a set of indexes or just one index. */ + public void init(Path[] indexes) throws IOException { + IndexReader[] indexReaders = new IndexReader[indexes.length]; + for (int i = 0; i < indexes.length; i++) { + indexReaders[i] = IndexReader.open(getDirectory(indexes[i])); + } + if (indexes.length > 1) { + this.reader = new MultiReader(indexReaders); + } else { + this.reader = IndexReader.open(getDirectory(indexes[0])); + } + } + + private Directory getDirectory(Path file) throws IOException { + if ("file".equals(this.fs.getUri().getScheme())) { + Path qualified = file.makeQualified(FileSystem.getLocal(conf)); + File fsLocal = new File(qualified.toUri()); + return FSDirectory.getDirectory(fsLocal.getAbsolutePath()); + } else { + return new FsDirectory(this.fs, file, false, this.conf); + } + } + + public void close() throws IOException { + if (reader != null) { + reader.close(); + } + } + + public IndexReader getReader() { + return reader; + } + + /** + * Calculate the score for an image hit. + * @param hit found hit + * @param doc parent document + * @return float score + */ + private float scoreHit(ImageHit hit, Document doc) { + float a = 0.2f; + float b = 0.1f; + return a*hit.docScore + (1.0f-a)*(b*hit.docSim + (1.0f-b)*hit.proximity); + } + + /** + * Find query-related images in the content of documents based on proximity. 
+ * + * @param queryTerms + * @param hitCollector + * @throws java.io.IOException + */ + private long getImagesFromContent(Term[] queryTerms, ImageHitQueue hitCollector, + int maxHits) + throws IOException { + + // Construct SpanQuery + SpanQuery[] clauses = new SpanTermQuery[queryTerms.length]; + for (int i=0; i<queryTerms.length; i++) { + clauses[i] = new SpanTermQuery(queryTerms[i]); + } + SpanNearQuery snq = new SpanNearQuery(clauses, queryTerms.length+1, false); + Spans spans = snq.getSpans(reader); + + // Per document info + Document doc = null; + int currentDoc = -1; + int numDocImages = 0; + int[] imagePositions = null; + String[] imageIds = null; + String[] imageUrls = null; + float docBoost = 1.0f; + float docSim = 0.0f; + float maxDist = Float.MAX_VALUE; + float minScore = 0.0f; + + long totalHits = 0; + + boolean more = spans.next(); + while (more) { + if (LOG.isDebugEnabled()) { + LOG.debug("currentDoc "+currentDoc); + } + if (currentDoc != spans.doc()) { + currentDoc = spans.doc(); + doc = reader.document(currentDoc); + // Skip document with no images + if ("0".equals(doc.getField(ImageSearch.HAS_IMAGE_KEY).stringValue())) { + while (more && spans.doc() == currentDoc) { + more = spans.next(); + } + continue; + } + + // Get document's global score + docBoost = doc.getBoost(); + + // Get image positions + String posField = doc.getField(ImageSearch.IMAGE_POS_KEY).stringValue(); + String[] positions = posField.split(":"); + imagePositions = new int[positions.length]; + numDocImages = positions.length; + for (int i = 0; i < numDocImages; i++) { + imagePositions[i] = Integer.parseInt(positions[i]); + } + maxDist = (float)imagePositions[numDocImages-1]; + + // Get image ids + String idField = doc.getField(ImageSearch.IMAGE_IDS_KEY).stringValue(); + imageIds = idField.split(":"); + + // Get image urls + String urlField = doc.getField(ImageSearch.IMAGE_URLS_KEY).stringValue(); + imageUrls = urlField.split(" "); + } + + int pos = 0; + int end = 0; + int imgIndex = 0; + int prevDist = Integer.MAX_VALUE; + while (more && spans.doc() == currentDoc) { + if (imgIndex >= numDocImages) { + more = spans.next(); + continue; + } + if (LOG.isDebugEnabled()) { + LOG.debug("sp " + spans.start() + " " + spans.end()); + } + pos = spans.start(); + end = spans.end(); + int dist = Math.abs(imagePositions[imgIndex] - pos) + (end-pos); + int nextDist = imgIndex < numDocImages-1 ? + Math.abs(imagePositions[imgIndex + 1] - pos) + (end-pos) : Integer.MAX_VALUE; + /*if (prevDist < dist) { + more = spans.next(); + prevDist = dist; + if (LOG.isDebugEnabled()) { + LOG.debug("p<d"); + } + continue; + }*/ + // Advance image pointer till a nearer image can be found + while (imgIndex < numDocImages && nextDist <= dist) { + if (LOG.isDebugEnabled()) { + LOG.debug("adv " + nextDist + " " + dist + " id " + imageUrls[imgIndex].substring(imageUrls[imgIndex].lastIndexOf("/"))); + } + dist = nextDist; + imgIndex++; + nextDist = imgIndex < numDocImages-1 ? 
+ Math.abs(imagePositions[imgIndex+1] - pos) + (end-pos) : Integer.MAX_VALUE; + } + // Check if this image is in the allowed proximity of the span + if (dist > distThreshold) { + if (LOG.isDebugEnabled()) { + LOG.debug("d>t: " + dist); + } + more = spans.next(); + continue; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("hit " + imageUrls[imgIndex].substring(imageUrls[imgIndex].lastIndexOf("/")) + " " + dist + " next " + nextDist); + } + // Found hit + ImageHit newHit = new ImageHit(imageIds[imgIndex], imageUrls[imgIndex], currentDoc); + newHit.docSim = docSim; + newHit.docScore = docBoost; + newHit.proximity = Math.min(1.0f, 1.0f-((float)dist/maxDist)); + newHit.score = scoreHit(newHit, doc); + + if (hitCollector.size() < maxHits || newHit.score >= minScore) { + hitCollector.insert(newHit); + minScore = ((ImageHit)hitCollector.top()).score; + + prevDist = dist; + imgIndex++; + } + totalHits++; + more = spans.next(); + } + } + + return totalHits; + } + + /** + * Search for images matching the query. + * + * @param query query + * @param maxHits maximum number of hits to retrieve + * @return ImageHits the matching hits + * @throws java.io.IOException + */ + public ImageHits search(String query, int maxHits) throws IOException { + String[] keywords = query.split("\\s"); + if (keywords == null) { + return new ImageHits(0, new ImageHit[0]); + } + + // Create query term array + Term[] queryTerms = new Term[keywords.length]; + for (int i=0; i<queryTerms.length; i++) { + queryTerms[i] = new Term("content", keywords[i]); + } + + ImageHitQueue hitQueue = new ImageHitQueue(maxHits); + long totalHits = getImagesFromContent(queryTerms, hitQueue, maxHits); + + // Extract top results + ImageHit[] resultSet = new ImageHit[hitQueue.size()]; + for (int i = resultSet.length - 1; i >= 0; i--) { + resultSet[i] = (ImageHit) hitQueue.pop(); + } + + return new ImageHits(totalHits, resultSet); + } + + /** For debugging purposes. */ + public static void main(String[] args) throws Exception { + if (args.length == 0) { + System.err.println("Usage: ImageSearcherBean <query>"); + System.exit(-1); + } + + Configuration conf = NutchConfiguration.create(); + ImageSearcherBean isb = new ImageSearcherBean(conf); + + // Construct query string + StringBuffer sb = new StringBuffer(); + for (String arg : args) { + if (sb.length() > 0) { + sb.append(' '); + } + sb.append(arg); + } + // Conduct search + int maxHits = 10; + ImageHits hits = isb.search(sb.toString(), maxHits); + // Show results + System.out.println("Total hits: " + hits.getTotal()); + ImageHit[] top = hits.getHits(0, + hits.getTotal() >= maxHits ? maxHits : (int)hits.getTotal()); + for (ImageHit hit : top) { + System.out.println(hit.score + " " + hit.url + " " + hit.imageId); + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,78 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import org.apache.hadoop.io.MD5Hash; +import org.apache.hadoop.io.Writable; +import org.apache.nutch.metadata.Metadata; + +public class ImageWritable implements Writable { + + public static final String IMAGE_DATA_DIR = "image_data"; + + private MD5Hash id; + private Metadata metadata; + private StoredImage thumbnail; + + public ImageWritable() {} + + public ImageWritable(String url) { + this.id = MD5Hash.digest(url); + this.metadata = new Metadata(); + } + + public Metadata getMetadata() { + return metadata; + } + + public void setMetadata(Metadata metadata) { + this.metadata = metadata; + } + + public void setThumbnail(StoredImage thumbnail) { + this.thumbnail = thumbnail; + } + + public StoredImage getThumbnail() { + return thumbnail; + } + + public void write(DataOutput out) throws IOException { + id.write(out); + metadata.write(out); + if (thumbnail != null) { + out.writeBoolean(true); + thumbnail.write(out); + } else { + out.writeBoolean(false); + } + } + + public void readFields(DataInput in) throws IOException { + id = MD5Hash.read(in); + metadata = new Metadata(); + metadata.readFields(in); + thumbnail = new StoredImage(); + if (in.readBoolean()) { + thumbnail.readFields(in); + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import org.apache.hadoop.io.Writable; + +/** Represents binary image data as a Writable. 
*/ +public class StoredImage implements Writable { + + public static final byte TYPE_JPEG = 'j'; + public static final byte T... [truncated message content] |
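
The README in this commit names the configuration properties to edit but does not
show the surrounding XML. Below is a minimal, illustrative sketch of the overrides
that could go into conf/nutch-site.xml; the plugin id 'image-search' and the
ImageIndexingFilter class name come from the README, while every other entry in
the two values is a placeholder for whatever an existing installation already
lists, not something taken from this commit.

  <!-- Sketch only: append the image-search plugin to the plugins you already
       enable. The other plugin names in this value are placeholders, not
       Nutch defaults. -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|parse-(text|html)|index-basic|query-(basic|site|url)|image-search</value>
  </property>

  <!-- For NutchWAX 0.12 or newer: list the image indexing filter in the
       filter order. The first class name is a placeholder for whichever
       indexing filters are already in use. -->
  <property>
    <name>indexingfilter.order</name>
    <value>org.apache.nutch.indexer.basic.BasicIndexingFilter org.archive.nutchwax.imagesearch.plugin.ImageIndexingFilter</value>
  </property>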