From: <mi...@us...> - 2008-07-11 14:00:47
Revision: 2427
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2427&view=rev
Author:   miklosh
Date:     2008-07-11 07:00:57 -0700 (Fri, 11 Jul 2008)

Log Message:
-----------
Initial commit of the image search contrib.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/imagesearch/
    trunk/archive-access/projects/nutchwax/imagesearch/README.txt
    trunk/archive-access/projects/nutchwax/imagesearch/bin/
    trunk/archive-access/projects/nutchwax/imagesearch/build.xml
    trunk/archive-access/projects/nutchwax/imagesearch/conf/
    trunk/archive-access/projects/nutchwax/imagesearch/lib/
    trunk/archive-access/projects/nutchwax/imagesearch/src/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ThumbnailGenerator.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/WrappedWritable.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/build-plugin.xml
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/build.xml
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/plugin.xml
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParser.java
Added: trunk/archive-access/projects/nutchwax/imagesearch/README.txt
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/README.txt	(rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/README.txt	2008-07-11 14:00:57 UTC (rev 2427)
@@ -0,0 +1,59 @@
+Nutch(WAX) Image Search Contrib
+===============================
+
+Getting the source
+------------------
+Check out Nutch-1.0-dev as usual, then check out the image search
+contrib into Nutch's "contrib" directory.
+
+  $ cd contrib
+  $ svn checkout http://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/nutchwax/imagesearch
+
+This will create a sub-directory named "imagesearch" containing the
+sources for the image search contrib.
+
+
+Configuring
+-----------
+Enable the 'image-search' plugin in Nutch's configuration by appending
+it to the 'plugin.includes' property. This registers three plugins:
+  o ImageParseFilter (HTML parse filter)
+  o ImageParser (fake JPEG and GIF parser)
+  o ImageIndexingFilter
+
+If you are using NutchWAX 0.12 or newer, make sure you add the following
+line to the 'indexingfilter.order' property:
+
+  org.archive.nutchwax.imagesearch.plugin.ImageIndexingFilter
+
+
+Build and install
+-----------------
+Build the contrib by executing the 'ant' build command in
+
+  nutch/contrib/imagesearch
+
+as you normally would.
+
+For example:
+
+  $ cd nutch/contrib/imagesearch
+  $ ant tar
+
+This command will build all of Nutch, then the image search add-ons, and
+finally package everything up into the "nutch-1.0-dev.tar.gz"
+release package.
+
+Then install the "nutch-1.0-dev.tar.gz" tarball as normal.
+
+
+Searching
+---------
+After performing the usual steps to import or fetch the files, invert
+the links and index the documents, you can search the resulting indexes
+for images by running:
+
+  bin/nutch org.archive.nutchwax.imagesearch.ImageSearcherBean product
+
+This calls the ImageSearcherBean to execute a simple keyword search for
+"product".

Added: trunk/archive-access/projects/nutchwax/imagesearch/build.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/build.xml	(rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/build.xml	2008-07-11 14:00:57 UTC (rev 2427)
@@ -0,0 +1,138 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+--> +<project name="nutchwax-imagesearch" default="job"> + + <property name="nutch.dir" value="../../" /> + + <property name="src.dir" value="src" /> + <property name="lib.dir" value="lib" /> + <property name="build.dir" value="${nutch.dir}/build" /> + <!-- HACK: Need to import default.properties like Nutch does --> + <property name="dist.dir" value="${build.dir}/nutch-1.0-dev" /> + + <target name="nutch-compile-core"> + <ant dir="${nutch.dir}" target="compile-core" inheritAll="false" /> + </target> + + <target name="nutch-compile-plugins"> + <ant dir="${nutch.dir}" target="compile-plugins" inheritAll="false" /> + </target> + + <target name="compile-core" depends="nutch-compile-core"> + <javac + destdir="${build.dir}/classes" + debug="true" + verbose="false" + source="1.5" + target="1.5" + encoding="UTF-8" + fork="true" + nowarn="true" + deprecation="false"> + <src path="${src.dir}/java" /> + <include name="**/*.java" /> + <classpath> + <pathelement location="${build.dir}/classes" /> + <fileset dir="${lib.dir}"> + <include name="*.jar"/> + </fileset> + <fileset dir="${nutch.dir}/lib"> + <include name="*.jar"/> + </fileset> + </classpath> + </javac> + </target> + + <target name="compile-plugins"> + <ant dir="src/plugin" target="deploy" inheritAll="false" /> + </target> + + <!-- + These targets all call down to the corresponding target in the + Nutch build.xml file. This way all of the 'ant' build commands + can be executed from this directory and everything should get + built as expected. + --> + <target name="compile" depends="compile-core, compile-plugins, nutch-compile-plugins"> + </target> + + <target name="jar" depends="compile-core"> + <ant dir="${nutch.dir}" target="jar" inheritAll="false" /> + </target> + + <target name="job" depends="compile"> + <ant dir="${nutch.dir}" target="job" inheritAll="false" /> + </target> + + <target name="war" depends="compile"> + <ant dir="${nutch.dir}" target="war" inheritAll="false" /> + </target> + + <target name="javadoc" depends="compile"> + <ant dir="${nutch.dir}" target="javadoc" inheritAll="false" /> + </target> + + <target name="tar" depends="package"> + <ant dir="${nutch.dir}" target="tar" inheritAll="false" /> + </target> + + <target name="clean"> + <ant dir="${nutch.dir}" target="clean" inheritAll="false" /> + </target> + + <!-- This one does a little more after calling down to the relevant + Nutch target. After Nutch has copied everything into the + distribution directory, we add our script, libraries, etc. + + Rather than over-write the standard Nutch configuration files, + we place ours in a newly created directory + + contrib/imagesearch/conf + + and let the individual user decide whether or not to + incorporate our modifications. 
+ --> + <target name="package" depends="jar, job, war, javadoc"> + <ant dir="${nutch.dir}" target="package" inheritAll="false" /> + + <copy todir="${dist.dir}/lib" includeEmptyDirs="false"> + <fileset dir="lib"/> + </copy> + + <copy todir="${dist.dir}/bin"> + <fileset dir="bin"/> + </copy> + + <chmod perm="ugo+x" type="file"> + <fileset dir="${dist.dir}/bin"/> + </chmod> + + <mkdir dir="${dist.dir}/contrib/imagesearch/conf"/> + <copy todir="${dist.dir}/contrib/imagesearch/conf"> + <fileset dir="conf" /> + </copy> + + <copy todir="${dist.dir}/contrib/imagesearch"> + <fileset dir="."> + <include name="*.txt" /> + </fileset> + </copy> + + </target> + +</project> Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +public class ImageHit { + public int doc; + public float docScore; + + public String imageId; + public String url; + + public float docSim; + public float proximity; + public float score; + + public ImageHit(String id, String url, int doc) { + this.imageId = id; + this.url = url; + this.doc = doc; + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import org.apache.lucene.util.PriorityQueue; + +final class ImageHitQueue extends PriorityQueue { + + ImageHitQueue(int size) { + initialize(size); + } + + protected final boolean lessThan(Object a, Object b) { + ImageHit hitA = (ImageHit)a; + ImageHit hitB = (ImageHit)b; + if (hitA.score == hitB.score) { + return hitA.doc > hitB.doc; + } else { + return hitA.score < hitB.score; + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,70 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +/** A set of image hits matching a query. Adapted from Nutch's Hits class. */ +public final class ImageHits { + + private long total; + private boolean totalIsExact = true; + private ImageHit[] top; + + public ImageHits() { + } + + public ImageHits(long total, ImageHit[] top) { + this.total = total; + this.top = top; + } + + /** Returns the total number of hits for this query. This may be an estimate + * when (@link #totalIsExact()} is false. */ + public long getTotal() { + return total; + } + + /** True if {@link #getTotal()} gives the exact number of hits, or false if + * it is only an estimate of the total number of hits. */ + public boolean totalIsExact() { + return totalIsExact; + } + + /** Set {@link #totalIsExact()}. */ + public void setTotalIsExact(boolean isExact) { + totalIsExact = isExact; + } + + /** Returns the number of hits included in this current listing. */ + public int getLength() { + return top.length; + } + + /** Returns the <code>i</code><sup>th</sup> hit in this list. */ + public ImageHit getHit(int i) { + return top[i]; + } + + /** Returns a subset of the hit objects. 
*/ + public ImageHit[] getHits(int start, int length) { + ImageHit[] results = new ImageHit[length]; + for (int i = 0; i < length; i++) { + results[i] = top[start + i]; + } + return results; + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,432 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.analysis.*; + +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.CrawlDb; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.crawl.LinkDb; +import org.apache.nutch.crawl.NutchWritable; + +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilters; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.protocols.Response; + +/** Create indexes for segments. */ +public class ImageIndexer extends Configured implements Tool, + Reducer<Text, WrappedWritable, Text, Writable>, Mapper<Text, Writable, Text, WrappedWritable> { + + public static final String DONE_NAME = "index.done"; + public static final Log LOG = LogFactory.getLog(ImageIndexer.class); + + /** A utility class used to pass a lucene document from Indexer.reduce + * to Indexer.OutputFormat. + * Note: Despite its name, it can't properly wrap a lucene document - it + * doesn't know how to serialize/deserialize a lucene document. 
+ */ + private static class LuceneDocumentWrapper implements Writable { + + private Document doc; + + public LuceneDocumentWrapper(Document doc) { + this.doc = doc; + } + + public Document get() { + return doc; + } + + public void readFields(DataInput in) throws IOException { + // intentionally left blank + } + + public void write(DataOutput out) throws IOException { + // intentionally left blank + } + } + + /** Unwrap Lucene Documents created by reduce and add them to an index. */ + public static class OutputFormat + extends org.apache.hadoop.mapred.OutputFormatBase<WritableComparable, LuceneDocumentWrapper> { + + public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job, + String name, final Progressable progress) throws IOException { + final Path perm = new Path(job.getOutputPath(), name); + final Path temp = + job.getLocalPath("index/_" + Integer.toString(new Random().nextInt())); + + fs.delete(perm); // delete old, if any + + final AnalyzerFactory factory = new AnalyzerFactory(job); + final IndexWriter writer = // build locally first + new IndexWriter(fs.startLocalOutput(perm, temp).toString(), + new NutchDocumentAnalyzer(job), true); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000)); + writer.setInfoStream(LogUtil.getInfoStream(LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() { + + boolean closed; + + public void write(WritableComparable key, LuceneDocumentWrapper value) + throws IOException { // unwrap & index doc + Document doc = value.get(); + NutchAnalyzer analyzer = factory.get(doc.get("lang")); + if (LOG.isInfoEnabled()) { + LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" + + " with analyzer " + analyzer + + " (" + doc.get("lang") + ")"); + } + writer.addDocument(doc, analyzer); + progress.progress(); + } + + public void close(final Reporter reporter) throws IOException { + // spawn a thread to give progress heartbeats + Thread prog = new Thread() { + + public void run() { + while (!closed) { + try { + reporter.setStatus("closing"); + Thread.sleep(1000); + } catch (InterruptedException e) { + continue; + } catch (Throwable e) { + return; + } + } + } + }; + + try { + prog.start(); + if (LOG.isInfoEnabled()) { + LOG.info("Optimizing index."); + } + // optimize & close index + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, DONE_NAME)); + } finally { + closed = true; + } + } + }; + } + } + private IndexingFilters filters; + private ScoringFilters scfilters; + + public ImageIndexer() { + } + + public ImageIndexer(Configuration conf) { + setConf(conf); + } + + public void configure(JobConf job) { + setConf(job); + this.filters = new IndexingFilters(getConf()); + this.scfilters = new ScoringFilters(getConf()); + } + + public void close() { + } + + /** + * Copies key/value pairs from one metadata container to another. + * Overwrites the destination if the source has a value with greater length. 
+ * + * @param from Metadata to copy from + * @param to target metadata container + */ + private void mergeMetadata(Metadata from, Metadata to) { + String[] names = from.names(); + for (String name : names) { + String newValue = from.get(name); + String value = to.get(name); + if (value != null) { + if (newValue.length() > value.length()) { + to.set(name, newValue); + } + } else { + to.add(name, newValue); + } + } + } + + public void reduce(Text key, Iterator<WrappedWritable> values, + OutputCollector<Text, Writable> output, Reporter reporter) + throws IOException { + Inlinks inlinks = null; + CrawlDatum dbDatum = null; + CrawlDatum fetchDatum = null; + ParseData parseData = null; + ParseText parseText = null; + + Metadata metadata = null; + Metadata contentMetadata = null; + String segmentName = null; + String signature = null; + while (values.hasNext()) { + Writable value = values.next().get(); + if (value instanceof ImageWritable) { + ImageWritable imgData = (ImageWritable) value; + Metadata imgMeta = imgData.getMetadata(); + if (metadata == null) { + metadata = imgMeta; + } else { + mergeMetadata(imgMeta, metadata); + } + } else if (value instanceof Inlinks) { + inlinks = (Inlinks) value; + } else if (value instanceof CrawlDatum) { + CrawlDatum datum = (CrawlDatum) value; + if (CrawlDatum.hasDbStatus(datum)) { + dbDatum = datum; + } else if (CrawlDatum.hasFetchStatus(datum)) { + // don't index unmodified (empty) pages + if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) { + fetchDatum = datum; + } + } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() || + CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) { + continue; + } else { + throw new RuntimeException("Unexpected status: " + datum.getStatus()); + } + } else if (value instanceof ParseData) { + if (parseData != null) { + ParseData newParse = (ParseData) value; + Metadata parseMeta = newParse.getParseMeta(); + // Check if this is the parse meta from ImageParseFilter + // If so, use its parse meta, otherwise use the content meta + if (parseMeta.get(ImageSearch.PARENT_URL_KEY) != null) { + mergeMetadata(parseMeta, metadata); + } else { + contentMetadata = newParse.getContentMeta(); + } + } else { + parseData = (ParseData) value; + metadata = parseData.getParseMeta(); + contentMetadata = parseData.getContentMeta(); + } + } else if (value instanceof ParseText) { + ParseText newParseText = (ParseText) value; + if (parseText == null || (parseText != null && + parseText.getText().length() < newParseText.getText().length())) { + parseText = (ParseText) value; + } + } else if (LOG.isWarnEnabled()) { + LOG.warn("Unrecognized type: " + value.getClass()); + } + // Save segment name and signature + if (contentMetadata != null) { + if (segmentName == null || signature == null) { + String stringValue = contentMetadata.get(Nutch.SEGMENT_NAME_KEY); + if (stringValue != null) { + segmentName = stringValue; + } + stringValue = contentMetadata.get(Nutch.SIGNATURE_KEY); + if (stringValue != null) { + signature = stringValue; + } + } + } + } + + if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) { + return; // only have inlinks + } + if (!parseData.getStatus().isSuccess() || + fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) { + return; + } + + // Skip possibly non-images + if (metadata.get(ImageSearch.PARENT_URL_KEY) == null) { + return; + } + // Make sure segment name and signature are set + contentMetadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); + contentMetadata.set(Nutch.SIGNATURE_KEY, 
signature); + + Document doc = new Document(); + + // add segment, used to map from merged index back to segment files + doc.add(new Field("segment", contentMetadata.get(Nutch.SEGMENT_NAME_KEY), + Field.Store.YES, Field.Index.NO)); + + // add digest, used by dedup + doc.add(new Field("digest", contentMetadata.get(Nutch.SIGNATURE_KEY), + Field.Store.YES, Field.Index.NO)); + + ParseData combinedParseData = new ParseData(parseData.getStatus(), + parseData.getTitle(), parseData.getOutlinks(), contentMetadata, + metadata); + + Parse parse = new ParseImpl(parseText, combinedParseData); + try { + // extract information from dbDatum and pass it to + // fetchDatum so that indexing filters can use it + Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + if (url != null) { + fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url); + } + // run indexing filters + doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks); + } catch (IndexingException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error indexing " + key + ": " + e); + } + return; + } + + // skip documents discarded by indexing filters + if (doc == null) { + return; + } + + float boost = 1.0f; + // run scoring filters + try { + boost = this.scfilters.indexerScore((Text) key, doc, dbDatum, + fetchDatum, parse, inlinks, boost); + } catch (ScoringFilterException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error calculating score " + key + ": " + e); + } + return; + } + // apply boost to all indexed fields. + doc.setBoost(boost); + // store boost for use by explain and dedup + doc.add(new Field("boost", Float.toString(boost), + Field.Store.YES, Field.Index.NO)); + + output.collect(key, new LuceneDocumentWrapper(doc)); + } + + public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) + throws IOException { + + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: starting"); + LOG.info("ImageIndexer: linkdb: " + linkDb); + } + + JobConf job = new NutchJob(getConf()); + job.setJobName("index " + indexDir); + + for (int i = 0; i < segments.length; i++) { + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: adding segment: " + segments[i]); + } + job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME)); + job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME)); + job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); + job.addInputPath(new Path(segments[i], ParseText.DIR_NAME)); + job.addInputPath(new Path(segments[i], ImageWritable.IMAGE_DATA_DIR)); + } + + job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME)); + job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME)); + job.setInputFormat(SequenceFileInputFormat.class); + + job.setMapperClass(ImageIndexer.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(WrappedWritable.class); + job.setReducerClass(ImageIndexer.class); + + job.setOutputPath(indexDir); + job.setOutputFormat(OutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(NutchWritable.class); + + JobClient.runJob(job); + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: done"); + } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new ImageIndexer(), args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + + if (args.length < 4) { + System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ..."); + return -1; + } + + Path[] segments = new Path[args.length - 3]; + for (int i = 3; i < 
args.length; i++) { + segments[i - 3] = new Path(args[i]); + } + + try { + index(new Path(args[0]), new Path(args[1]), new Path(args[2]), + segments); + return 0; + } catch (Exception e) { + LOG.fatal("ImageIndexer: " + StringUtils.stringifyException(e)); + return -1; + } + } + + public void map(Text key, Writable value, + OutputCollector<Text, WrappedWritable> output, Reporter reporter) throws IOException { + output.collect(key, new WrappedWritable(value)); + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,136 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.IOException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +public class ImageProcessor extends Configured implements Tool, + Mapper<Text, Content, Text, ImageWritable> { + + private static final Log LOG = LogFactory.getLog(ImageProcessor.class); + + private int thumbQuality; + private int thumbMaxSize; + + ImageProcessor() {} + ImageProcessor(Configuration conf) { + setConf(conf); + } + + public void map(Text key, Content content, + OutputCollector<Text, ImageWritable> output, + Reporter reporter) throws IOException { + + Metadata metadata = new Metadata(); + // Check content type + if (!content.getContentType().contains("image/")) { + return; + } + + // Generate thumbnail + byte[] data = content.getContent(); + StoredImage thumb = ThumbnailGenerator.generateThumbnail(data, + thumbMaxSize, thumbMaxSize, thumbQuality, metadata); + + // Create and setup an ImageWritable + 
ImageWritable image = new ImageWritable(key.toString()); + image.setMetadata(metadata); + image.setThumbnail(thumb); + + output.collect(key, image); + } + + public void processImageContent(Path segment) + throws IOException { + + JobConf job = new NutchJob(getConf()); + job.setJobName("ImageProcessor " + segment); + + if (LOG.isInfoEnabled()) { + LOG.info("ImageProcessor: processing " + segment); + } + job.addInputPath(new Path(segment, Content.DIR_NAME)); + + job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(ImageProcessor.class); + + job.setOutputPath(new Path(segment, ImageWritable.IMAGE_DATA_DIR)); + job.setOutputFormat(MapFileOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(ImageWritable.class); + + JobClient.runJob(job); + + if (LOG.isInfoEnabled()) { + LOG.info("ImageProcessor: done"); + } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), + new ImageProcessor(), args); + System.exit(res); + } + public int run(String[] args) throws Exception { + + if (args.length == 0) { + System.err.println("Usage: imageprocessor <segment>"); + return -1; + } + + Path segment = new Path(args[0]); + try { + processImageContent(segment); + return 0; + } catch (Exception e) { + LOG.fatal("ImageProcessor: " + StringUtils.stringifyException(e)); + return -1; + } + } + + private Configuration conf; + public void configure(JobConf conf) { + setConf(conf); + + this.thumbQuality = conf.getInt("imagesearcher.thumbnail.quality", 50); + this.thumbMaxSize = conf.getInt("imagesearcher.thumbnail.maxSize", 100); + } + + public void close() throws IOException { + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.imagesearch; + +public class ImageSearch { + public static final String PARENT_URL_KEY = "parent_url"; + public static final String ALT_TEXT_KEY = "alt"; + + public static final String IMAGE_IDS_KEY = "image_ids"; + public static final String IMAGE_POS_KEY = "image_pos"; + public static final String IMAGE_URLS_KEY = "image_urls"; + public static final String HAS_IMAGE_KEY = "has_image"; +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,346 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.File; +import java.io.IOException; +import java.lang.Math; +import java.util.Iterator; +import java.util.Vector; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.indexer.FsDirectory; +import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.util.NutchConfiguration; + +public class ImageSearcherBean { + + public static final Log LOG = LogFactory.getLog(ImageSearcherBean.class); + + private IndexReader reader; + + private Path baseDir; + private Configuration conf; + private FileSystem fs; + + private int distThreshold; // Maximum allowed distance of image from hit + // to be considered + + /** Construct given a configuration. 
*/ + public ImageSearcherBean(Configuration conf) throws IOException { + this.conf = conf; + this.fs = FileSystem.get(conf); + this.baseDir = new Path(conf.get("searcher.dir", "crawl")); + this.distThreshold = conf.getInt("imagesearch.maxDist", 300); + + // Try to load unmerged indexes + Path indexesDir = new Path(baseDir, "indexes"); + if (this.fs.exists(indexesDir)) { + Vector<Path> doneDirs = new Vector<Path>(); + Path[] dirs = fs.listPaths(indexesDir, new PathFilter() { + + public boolean accept(Path f) { + try { + if (fs.isDirectory(f)) { + return true; + } + } catch (IOException ioe) { + } + return false; + } + }); + for (Path dir : dirs) { + Path indexdone = new Path(dir, Indexer.DONE_NAME); + if (fs.isFile(indexdone)) { + doneDirs.add(dir); + } + } + dirs = new Path[doneDirs.size()]; + Iterator<Path> it = doneDirs.iterator(); + int i = 0; + while (it.hasNext()) { + dirs[i++] = it.next(); + } + init(dirs); + } else { + Path[] indexDir = {new Path(baseDir, "index")}; + init(indexDir); + } + } + + /** Init given a set of indexes or just one index. */ + public void init(Path[] indexes) throws IOException { + IndexReader[] indexReaders = new IndexReader[indexes.length]; + for (int i = 0; i < indexes.length; i++) { + indexReaders[i] = IndexReader.open(getDirectory(indexes[i])); + } + if (indexes.length > 1) { + this.reader = new MultiReader(indexReaders); + } else { + this.reader = IndexReader.open(getDirectory(indexes[0])); + } + } + + private Directory getDirectory(Path file) throws IOException { + if ("file".equals(this.fs.getUri().getScheme())) { + Path qualified = file.makeQualified(FileSystem.getLocal(conf)); + File fsLocal = new File(qualified.toUri()); + return FSDirectory.getDirectory(fsLocal.getAbsolutePath()); + } else { + return new FsDirectory(this.fs, file, false, this.conf); + } + } + + public void close() throws IOException { + if (reader != null) { + reader.close(); + } + } + + public IndexReader getReader() { + return reader; + } + + /** + * Calculate the score for an image hit. + * @param hit found hit + * @param doc parent document + * @return float score + */ + private float scoreHit(ImageHit hit, Document doc) { + float a = 0.2f; + float b = 0.1f; + return a*hit.docScore + (1.0f-a)*(b*hit.docSim + (1.0f-b)*hit.proximity); + } + + /** + * Find query-related images in the content of documents based on proximity. 
+ * + * @param queryTerms + * @param hitCollector + * @throws java.io.IOException + */ + private long getImagesFromContent(Term[] queryTerms, ImageHitQueue hitCollector, + int maxHits) + throws IOException { + + // Construct SpanQuery + SpanQuery[] clauses = new SpanTermQuery[queryTerms.length]; + for (int i=0; i<queryTerms.length; i++) { + clauses[i] = new SpanTermQuery(queryTerms[i]); + } + SpanNearQuery snq = new SpanNearQuery(clauses, queryTerms.length+1, false); + Spans spans = snq.getSpans(reader); + + // Per document info + Document doc = null; + int currentDoc = -1; + int numDocImages = 0; + int[] imagePositions = null; + String[] imageIds = null; + String[] imageUrls = null; + float docBoost = 1.0f; + float docSim = 0.0f; + float maxDist = Float.MAX_VALUE; + float minScore = 0.0f; + + long totalHits = 0; + + boolean more = spans.next(); + while (more) { + if (LOG.isDebugEnabled()) { + LOG.debug("currentDoc "+currentDoc); + } + if (currentDoc != spans.doc()) { + currentDoc = spans.doc(); + doc = reader.document(currentDoc); + // Skip document with no images + if ("0".equals(doc.getField(ImageSearch.HAS_IMAGE_KEY).stringValue())) { + while (more && spans.doc() == currentDoc) { + more = spans.next(); + } + continue; + } + + // Get document's global score + docBoost = doc.getBoost(); + + // Get image positions + String posField = doc.getField(ImageSearch.IMAGE_POS_KEY).stringValue(); + String[] positions = posField.split(":"); + imagePositions = new int[positions.length]; + numDocImages = positions.length; + for (int i = 0; i < numDocImages; i++) { + imagePositions[i] = Integer.parseInt(positions[i]); + } + maxDist = (float)imagePositions[numDocImages-1]; + + // Get image ids + String idField = doc.getField(ImageSearch.IMAGE_IDS_KEY).stringValue(); + imageIds = idField.split(":"); + + // Get image urls + String urlField = doc.getField(ImageSearch.IMAGE_URLS_KEY).stringValue(); + imageUrls = urlField.split(" "); + } + + int pos = 0; + int end = 0; + int imgIndex = 0; + int prevDist = Integer.MAX_VALUE; + while (more && spans.doc() == currentDoc) { + if (imgIndex >= numDocImages) { + more = spans.next(); + continue; + } + if (LOG.isDebugEnabled()) { + LOG.debug("sp " + spans.start() + " " + spans.end()); + } + pos = spans.start(); + end = spans.end(); + int dist = Math.abs(imagePositions[imgIndex] - pos) + (end-pos); + int nextDist = imgIndex < numDocImages-1 ? + Math.abs(imagePositions[imgIndex + 1] - pos) + (end-pos) : Integer.MAX_VALUE; + /*if (prevDist < dist) { + more = spans.next(); + prevDist = dist; + if (LOG.isDebugEnabled()) { + LOG.debug("p<d"); + } + continue; + }*/ + // Advance image pointer till a nearer image can be found + while (imgIndex < numDocImages && nextDist <= dist) { + if (LOG.isDebugEnabled()) { + LOG.debug("adv " + nextDist + " " + dist + " id " + imageUrls[imgIndex].substring(imageUrls[imgIndex].lastIndexOf("/"))); + } + dist = nextDist; + imgIndex++; + nextDist = imgIndex < numDocImages-1 ? 
+ Math.abs(imagePositions[imgIndex+1] - pos) + (end-pos) : Integer.MAX_VALUE; + } + // Check if this image is in the allowed proximity of the span + if (dist > distThreshold) { + if (LOG.isDebugEnabled()) { + LOG.debug("d>t: " + dist); + } + more = spans.next(); + continue; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("hit " + imageUrls[imgIndex].substring(imageUrls[imgIndex].lastIndexOf("/")) + " " + dist + " next " + nextDist); + } + // Found hit + ImageHit newHit = new ImageHit(imageIds[imgIndex], imageUrls[imgIndex], currentDoc); + newHit.docSim = docSim; + newHit.docScore = docBoost; + newHit.proximity = Math.min(1.0f, 1.0f-((float)dist/maxDist)); + newHit.score = scoreHit(newHit, doc); + + if (hitCollector.size() < maxHits || newHit.score >= minScore) { + hitCollector.insert(newHit); + minScore = ((ImageHit)hitCollector.top()).score; + + prevDist = dist; + imgIndex++; + } + totalHits++; + more = spans.next(); + } + } + + return totalHits; + } + + /** + * Search for images matching the query. + * + * @param query query + * @param maxHits maximum number of hits to retrieve + * @return ImageHits the matching hits + * @throws java.io.IOException + */ + public ImageHits search(String query, int maxHits) throws IOException { + String[] keywords = query.split("\\s"); + if (keywords == null) { + return new ImageHits(0, new ImageHit[0]); + } + + // Create query term array + Term[] queryTerms = new Term[keywords.length]; + for (int i=0; i<queryTerms.length; i++) { + queryTerms[i] = new Term("content", keywords[i]); + } + + ImageHitQueue hitQueue = new ImageHitQueue(maxHits); + long totalHits = getImagesFromContent(queryTerms, hitQueue, maxHits); + + // Extract top results + ImageHit[] resultSet = new ImageHit[hitQueue.size()]; + for (int i = resultSet.length - 1; i >= 0; i--) { + resultSet[i] = (ImageHit) hitQueue.pop(); + } + + return new ImageHits(totalHits, resultSet); + } + + /** For debugging purposes. */ + public static void main(String[] args) throws Exception { + if (args.length == 0) { + System.err.println("Usage: ImageSearcherBean <query>"); + System.exit(-1); + } + + Configuration conf = NutchConfiguration.create(); + ImageSearcherBean isb = new ImageSearcherBean(conf); + + // Construct query string + StringBuffer sb = new StringBuffer(); + for (String arg : args) { + if (sb.length() > 0) { + sb.append(' '); + } + sb.append(arg); + } + // Conduct search + int maxHits = 10; + ImageHits hits = isb.search(sb.toString(), maxHits); + // Show results + System.out.println("Total hits: " + hits.getTotal()); + ImageHit[] top = hits.getHits(0, + hits.getTotal() >= maxHits ? maxHits : (int)hits.getTotal()); + for (ImageHit hit : top) { + System.out.println(hit.score + " " + hit.url + " " + hit.imageId); + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,78 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import org.apache.hadoop.io.MD5Hash; +import org.apache.hadoop.io.Writable; +import org.apache.nutch.metadata.Metadata; + +public class ImageWritable implements Writable { + + public static final String IMAGE_DATA_DIR = "image_data"; + + private MD5Hash id; + private Metadata metadata; + private StoredImage thumbnail; + + public ImageWritable() {} + + public ImageWritable(String url) { + this.id = MD5Hash.digest(url); + this.metadata = new Metadata(); + } + + public Metadata getMetadata() { + return metadata; + } + + public void setMetadata(Metadata metadata) { + this.metadata = metadata; + } + + public void setThumbnail(StoredImage thumbnail) { + this.thumbnail = thumbnail; + } + + public StoredImage getThumbnail() { + return thumbnail; + } + + public void write(DataOutput out) throws IOException { + id.write(out); + metadata.write(out); + if (thumbnail != null) { + out.writeBoolean(true); + thumbnail.write(out); + } else { + out.writeBoolean(false); + } + } + + public void readFields(DataInput in) throws IOException { + id = MD5Hash.read(in); + metadata = new Metadata(); + metadata.readFields(in); + thumbnail = new StoredImage(); + if (in.readBoolean()) { + thumbnail.readFields(in); + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import org.apache.hadoop.io.Writable; + +/** Represents binary image data as a Writable. 
*/ +public class StoredImage implements Writable { + + public static final byte TYPE_JPEG = 'j'; + public static final byte T... [truncated message content] |
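
The README in this commit names the configuration properties to edit but does not
show the surrounding XML. Below is a minimal, illustrative sketch of the overrides
that could go into conf/nutch-site.xml; the plugin id 'image-search' and the
ImageIndexingFilter class name come from the README, while every other entry in
the two values is a placeholder for whatever an existing installation already
lists, not something taken from this commit.

  <!-- Sketch only: append the image-search plugin to the plugins you already
       enable. The other plugin names in this value are placeholders, not
       Nutch defaults. -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|parse-(text|html)|index-basic|query-(basic|site|url)|image-search</value>
  </property>

  <!-- For NutchWAX 0.12 or newer: list the image indexing filter in the
       filter order. The first class name is a placeholder for whichever
       indexing filters are already in use. -->
  <property>
    <name>indexingfilter.order</name>
    <value>org.apache.nutch.indexer.basic.BasicIndexingFilter org.archive.nutchwax.imagesearch.plugin.ImageIndexingFilter</value>
  </property>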