From: <sta...@us...> - 2007-03-22 00:07:39
|
Revision: 1626 http://archive-access.svn.sourceforge.net/archive-access/?rev=1626&view=rev Author: stack-sf Date: 2007-03-21 17:07:39 -0700 (Wed, 21 Mar 2007) Log Message: ----------- Moved third-party checkout under nutchwax-thirdparty module. Let go of maven1. * nutchwax-core/pom.xml * src/java/overview.html * src/plugin/build-plugin.xml * nutchwax-thirdparty/pom.xml * nutchwax-webapp/src/main/assembly/assemble-war.xml * .classpath * nutchwax-job/src/main/assembly/assemble-job.xml * nutchwax-webapp/pom.xml Reference new third-party location. * project.properties * maven.xml * project.xml Let go of maven1 Modified Paths: -------------- trunk/archive-access/projects/nutchwax/.classpath trunk/archive-access/projects/nutchwax/nutchwax-core/pom.xml trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml trunk/archive-access/projects/nutchwax/nutchwax-thirdparty/pom.xml trunk/archive-access/projects/nutchwax/nutchwax-webapp/pom.xml trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/assembly/assemble-war.xml trunk/archive-access/projects/nutchwax/src/java/overview.html trunk/archive-access/projects/nutchwax/src/plugin/build-plugin.xml Removed Paths: ------------- trunk/archive-access/projects/nutchwax/maven.xml trunk/archive-access/projects/nutchwax/project.properties trunk/archive-access/projects/nutchwax/project.xml Property Changed: ---------------- trunk/archive-access/projects/nutchwax/ trunk/archive-access/projects/nutchwax/nutchwax-thirdparty/ Property changes on: trunk/archive-access/projects/nutchwax ___________________________________________________________________ Name: svn:externals - third-party/nutch -r 508238 http://svn.apache.org/repos/asf/lucene/nutch/trunk Modified: trunk/archive-access/projects/nutchwax/.classpath =================================================================== --- trunk/archive-access/projects/nutchwax/.classpath 2007-03-21 23:27:40 UTC (rev 1625) +++ 
trunk/archive-access/projects/nutchwax/.classpath 2007-03-22 00:07:39 UTC (rev 1626) @@ -9,234 +9,234 @@ <classpathentry kind="src" path="src/plugin/query-host/src/java"/> <classpathentry kind="src" path="src/plugin/query-title/src/java"/> <classpathentry kind="src" path="src/plugin/query-wax/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/index-basic/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/index-more/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/languageidentifier/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/languageidentifier/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/lib-regex-filter/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/lib-regex-filter/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/parse-ext/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/parse-ext/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/parse-html/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/parse-html/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/parse-js/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/parse-pdf/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/parse-pdf/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/parse-text/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/query-basic/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/query-more/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/query-site/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/query-url/src/java"/> - <classpathentry kind="src" 
path="third-party/nutch/src/plugin/query-url/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/scoring-opic/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/summary-basic/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/summary-lucene/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlfilter-automaton/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlfilter-automaton/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlfilter-prefix/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlfilter-regex/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlfilter-regex/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlfilter-suffix/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlfilter-suffix/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlnormalizer-basic/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlnormalizer-basic/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlnormalizer-pass/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlnormalizer-pass/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlnormalizer-regex/src/java"/> - <classpathentry kind="src" path="third-party/nutch/src/plugin/urlnormalizer-regex/src/test"/> - <classpathentry kind="src" path="third-party/nutch/src/test"/> - <classpathentry kind="lib" path="third-party/nutch/build/clustering-carrot2/clustering-carrot2.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/creativecommons/creativecommons.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/index-basic/index-basic.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/build/index-more/index-more.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/language-identifier/language-identifier.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-http/lib-http.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-jakarta-poi/poi-3.0-alpha1-20050704.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-jakarta-poi/poi-scratchpad-3.0-alpha1-20050704.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-log4j/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-lucene-analyzers/lucene-analyzers-2.0.0.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-nekohtml/nekohtml-0.9.4.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-parsems/lib-parsems.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-regex-filter/lib-regex-filter.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-xml/jaxen-core.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-xml/jaxen-jdom.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-xml/jdom.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-xml/saxpath.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/lib-xml/xercesImpl.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/microformats-reltag/microformats-reltag.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/nutch-0.9-dev.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/nutch-extensionpoints/nutch-extensionpoints.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/ontology/ontology.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-ext/parse-ext.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-html/parse-html.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/build/parse-js/parse-js.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-msexcel/parse-msexcel.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-mspowerpoint/parse-mspowerpoint.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-msword/parse-msword.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-oo/parse-oo.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-pdf/parse-pdf.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-rss/parse-rss.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-swf/parse-swf.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-text/parse-text.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/parse-zip/parse-zip.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/carrot2-filter-lingo.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/carrot2-local-core.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/carrot2-snowball-stemmers.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/carrot2-util-common.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/carrot2-util-tokenizer.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/clustering-carrot2.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/commons-collections-3.1-patched.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/commons-pool-1.1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/clustering-carrot2/Jama-1.0.1-patched.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/build/plugins/clustering-carrot2/violinstrings-1.0.2.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/creativecommons/creativecommons.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/index-basic/index-basic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/index-more/index-more.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/language-identifier/language-identifier.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-http/lib-http.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-jakarta-poi/poi-3.0-alpha1-20050704.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-jakarta-poi/poi-scratchpad-3.0-alpha1-20050704.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-log4j/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-lucene-analyzers/lucene-analyzers-2.0.0.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-nekohtml/nekohtml-0.9.4.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-parsems/lib-parsems.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-regex-filter/lib-regex-filter.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-xml/jaxen-core.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-xml/jaxen-jdom.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-xml/jdom.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-xml/saxpath.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/lib-xml/xercesImpl.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/microformats-reltag/microformats-reltag.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/build/plugins/nutch-extensionpoints/nutch-extensionpoints.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/ontology/commons-logging-1.0.3.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/ontology/icu4j_2_6_1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/ontology/jena-2.1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/ontology/ontology.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-ext/parse-ext.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-html/parse-html.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-html/tagsoup-1.0rc3.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-js/parse-js.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-msexcel/parse-msexcel.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-mspowerpoint/parse-mspowerpoint.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-msword/parse-msword.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-oo/parse-oo.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-pdf/parse-pdf.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-pdf/PDFBox-0.7.2-log4j.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-rss/commons-feedparser-0.6-fork.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-rss/parse-rss.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-rss/xmlrpc-1.2.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-swf/javaswf.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-swf/parse-swf.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/build/plugins/parse-text/parse-text.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/parse-zip/parse-zip.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/protocol-file/protocol-file.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/protocol-ftp/commons-net-1.2.0-dev.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/protocol-ftp/protocol-ftp.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/protocol-http/protocol-http.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/protocol-httpclient/protocol-httpclient.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/query-basic/query-basic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/query-more/query-more.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/query-site/query-site.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/query-url/query-url.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/scoring-opic/scoring-opic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/subcollection/subcollection.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/summary-basic/summary-basic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/summary-lucene/lucene-highlighter-2.0.0.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/summary-lucene/summary-lucene.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/urlfilter-automaton/automaton.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/urlfilter-automaton/urlfilter-automaton.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/urlfilter-prefix/urlfilter-prefix.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/build/plugins/urlfilter-regex/urlfilter-regex.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/urlfilter-suffix/urlfilter-suffix.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/urlnormalizer-basic/urlnormalizer-basic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/urlnormalizer-pass/urlnormalizer-pass.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/plugins/urlnormalizer-regex/urlnormalizer-regex.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/protocol-file/protocol-file.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/protocol-ftp/protocol-ftp.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/protocol-http/protocol-http.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/protocol-httpclient/protocol-httpclient.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/query-basic/query-basic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/query-more/query-more.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/query-site/query-site.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/query-url/query-url.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/scoring-opic/scoring-opic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/subcollection/subcollection.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/summary-basic/summary-basic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/summary-lucene/summary-lucene.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/urlfilter-automaton/urlfilter-automaton.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/urlfilter-prefix/urlfilter-prefix.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/urlfilter-regex/urlfilter-regex.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/build/urlfilter-suffix/urlfilter-suffix.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/urlnormalizer-basic/urlnormalizer-basic.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/urlnormalizer-pass/urlnormalizer-pass.jar"/> - <classpathentry kind="lib" path="third-party/nutch/build/urlnormalizer-regex/urlnormalizer-regex.jar"/> - <classpathentry kind="lib" path="third-party/nutch/contrib/web2/lib/commons-beanutils.jar"/> - <classpathentry kind="lib" path="third-party/nutch/contrib/web2/lib/commons-collections-3.0.jar"/> - <classpathentry kind="lib" path="third-party/nutch/contrib/web2/lib/commons-digester.jar"/> - <classpathentry kind="lib" path="third-party/nutch/contrib/web2/lib/jstl.jar"/> - <classpathentry kind="lib" path="third-party/nutch/contrib/web2/lib/standard.jar"/> - <classpathentry kind="lib" path="third-party/nutch/contrib/web2/lib/struts.jar"/> - <classpathentry kind="lib" path="third-party/nutch/contrib/web2/plugins/web-caching-oscache/lib/oscache-2.1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/commons-cli-2.0-SNAPSHOT.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/commons-codec-1.3.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/commons-httpclient-3.0.1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/commons-lang-2.1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/commons-logging-1.0.4.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/commons-logging-api-1.0.4.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/hadoop-0.10.1-core.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/jakarta-oro-2.0.7.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/jets3t.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/jetty-5.1.4.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/jetty-ext/ant.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/lib/jetty-ext/commons-el.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/jetty-ext/jasper-compiler.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/jetty-ext/jasper-runtime.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/jetty-ext/jsp-api.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/junit-3.8.1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/log4j-1.2.13.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/lucene-core-2.0.0.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/lucene-misc-2.0.0.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/pmd-ext/jakarta-oro-2.0.8.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/pmd-ext/jaxen-1.1-beta-7.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/pmd-ext/pmd-3.6.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/servlet-api.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/taglibs-i18n.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/xerces-2_6_2-apis.jar"/> - <classpathentry kind="lib" path="third-party/nutch/lib/xerces-2_6_2.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/clustering-carrot2/lib/commons-collections-3.1-patched.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/src/plugin/clustering-carrot2/lib/commons-pool-1.1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/clustering-carrot2/lib/Jama-1.0.1-patched.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/clustering-carrot2/lib/violinstrings-1.0.2.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-log4j/lib/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-xml/lib/jaxen-core.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-xml/lib/jaxen-jdom.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-xml/lib/jdom.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-xml/lib/saxpath.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/lib-xml/lib/xercesImpl.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/ontology/lib/commons-logging-1.0.3.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/ontology/lib/icu4j_2_6_1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/ontology/lib/jena-2.1.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/parse-html/lib/tagsoup-1.0rc3.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/parse-pdf/lib/PDFBox-0.7.2-log4j.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar"/> - <classpathentry kind="lib" 
path="third-party/nutch/src/plugin/parse-rss/lib/xmlrpc-1.2.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/parse-swf/lib/javaswf.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar"/> - <classpathentry kind="lib" path="third-party/nutch/src/plugin/urlfilter-automaton/lib/automaton.jar"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/index-basic/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/index-more/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/languageidentifier/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/languageidentifier/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/lib-regex-filter/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/lib-regex-filter/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/parse-ext/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/parse-ext/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/parse-html/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/parse-html/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/parse-js/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/parse-pdf/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/parse-pdf/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/parse-text/src/java"/> + <classpathentry kind="src" 
path="nutchwax-thirdparty/nutch/src/plugin/query-basic/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/query-more/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/query-site/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/query-url/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/query-url/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/scoring-opic/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/summary-basic/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/summary-lucene/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-automaton/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-automaton/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-prefix/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-regex/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-regex/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-suffix/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-suffix/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-basic/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-basic/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-pass/src/java"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-pass/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-regex/src/java"/> + <classpathentry kind="src" 
path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-regex/src/test"/> + <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/test"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/clustering-carrot2/clustering-carrot2.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/creativecommons/creativecommons.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/index-basic/index-basic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/index-more/index-more.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/language-identifier/language-identifier.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-http/lib-http.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-jakarta-poi/poi-3.0-alpha1-20050704.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-jakarta-poi/poi-scratchpad-3.0-alpha1-20050704.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-log4j/log4j-1.2.11.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-lucene-analyzers/lucene-analyzers-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-nekohtml/nekohtml-0.9.4.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-parsems/lib-parsems.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-regex-filter/lib-regex-filter.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-xml/jaxen-core.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-xml/jaxen-jdom.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-xml/jdom.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-xml/saxpath.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-xml/xercesImpl.jar"/> + 
<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/microformats-reltag/microformats-reltag.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/nutch-0.9-dev.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/nutch-extensionpoints/nutch-extensionpoints.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/ontology/ontology.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-ext/parse-ext.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-html/parse-html.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-js/parse-js.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-msexcel/parse-msexcel.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-mspowerpoint/parse-mspowerpoint.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-msword/parse-msword.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-oo/parse-oo.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-pdf/parse-pdf.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-rss/parse-rss.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-swf/parse-swf.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-text/parse-text.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-zip/parse-zip.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/carrot2-filter-lingo.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/carrot2-local-core.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/carrot2-snowball-stemmers.jar"/> + <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/carrot2-util-common.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/carrot2-util-tokenizer.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/clustering-carrot2.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/commons-collections-3.1-patched.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/commons-pool-1.1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/Jama-1.0.1-patched.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/clustering-carrot2/violinstrings-1.0.2.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/creativecommons/creativecommons.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/index-basic/index-basic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/index-more/index-more.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/language-identifier/language-identifier.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-http/lib-http.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-jakarta-poi/poi-3.0-alpha1-20050704.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-jakarta-poi/poi-scratchpad-3.0-alpha1-20050704.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-log4j/log4j-1.2.11.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-lucene-analyzers/lucene-analyzers-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-nekohtml/nekohtml-0.9.4.jar"/> + <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/build/plugins/lib-parsems/lib-parsems.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-regex-filter/lib-regex-filter.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-xml/jaxen-core.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-xml/jaxen-jdom.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-xml/jdom.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-xml/saxpath.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-xml/xercesImpl.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/microformats-reltag/microformats-reltag.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/nutch-extensionpoints/nutch-extensionpoints.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/ontology/commons-logging-1.0.3.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/ontology/icu4j_2_6_1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/ontology/jena-2.1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/ontology/ontology.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-ext/parse-ext.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-html/parse-html.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-html/tagsoup-1.0rc3.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-js/parse-js.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-msexcel/parse-msexcel.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-mspowerpoint/parse-mspowerpoint.jar"/> + 
<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-msword/parse-msword.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-oo/parse-oo.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-pdf/parse-pdf.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-pdf/PDFBox-0.7.2-log4j.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-rss/commons-feedparser-0.6-fork.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-rss/parse-rss.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-rss/xmlrpc-1.2.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-swf/javaswf.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-swf/parse-swf.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-text/parse-text.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/parse-zip/parse-zip.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/protocol-file/protocol-file.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/protocol-ftp/commons-net-1.2.0-dev.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/protocol-ftp/protocol-ftp.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/protocol-http/protocol-http.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/protocol-httpclient/protocol-httpclient.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/query-basic/query-basic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/query-more/query-more.jar"/> + <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/build/plugins/query-site/query-site.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/query-url/query-url.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/scoring-opic/scoring-opic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/subcollection/subcollection.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-basic/summary-basic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-lucene/lucene-highlighter-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-lucene/summary-lucene.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-automaton/automaton.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-automaton/urlfilter-automaton.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-prefix/urlfilter-prefix.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-regex/urlfilter-regex.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-suffix/urlfilter-suffix.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlnormalizer-basic/urlnormalizer-basic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlnormalizer-pass/urlnormalizer-pass.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlnormalizer-regex/urlnormalizer-regex.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/protocol-file/protocol-file.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/protocol-ftp/protocol-ftp.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/protocol-http/protocol-http.jar"/> + 
<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/protocol-httpclient/protocol-httpclient.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/query-basic/query-basic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/query-more/query-more.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/query-site/query-site.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/query-url/query-url.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/scoring-opic/scoring-opic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/subcollection/subcollection.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/summary-basic/summary-basic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/summary-lucene/summary-lucene.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/urlfilter-automaton/urlfilter-automaton.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/urlfilter-prefix/urlfilter-prefix.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/urlfilter-regex/urlfilter-regex.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/urlfilter-suffix/urlfilter-suffix.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/urlnormalizer-basic/urlnormalizer-basic.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/urlnormalizer-pass/urlnormalizer-pass.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/urlnormalizer-regex/urlnormalizer-regex.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/contrib/web2/lib/commons-beanutils.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/contrib/web2/lib/commons-collections-3.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/contrib/web2/lib/commons-digester.jar"/> + 
<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/contrib/web2/lib/jstl.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/contrib/web2/lib/standard.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/contrib/web2/lib/struts.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/contrib/web2/plugins/web-caching-oscache/lib/oscache-2.1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-cli-2.0-SNAPSHOT.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-codec-1.3.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-httpclient-3.0.1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.10.1-core.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/ant.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/commons-el.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/jasper-compiler.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/jasper-runtime.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/jsp-api.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/junit-3.8.1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/log4j-1.2.13.jar"/> + <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/lib/lucene-core-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/lucene-misc-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/jakarta-oro-2.0.8.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/jaxen-1.1-beta-7.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/pmd-3.6.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/servlet-api.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/taglibs-i18n.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/xerces-2_6_2-apis.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/xerces-2_6_2.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/commons-collections-3.1-patched.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/commons-pool-1.1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/Jama-1.0.1-patched.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/clustering-carrot2/lib/violinstrings-1.0.2.jar"/> + <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-log4j/lib/log4j-1.2.11.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/jaxen-core.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/jaxen-jdom.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/jdom.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/saxpath.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/xercesImpl.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/ontology/lib/commons-logging-1.0.3.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/ontology/lib/icu4j_2_6_1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/ontology/lib/jena-2.1.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-html/lib/tagsoup-1.0rc3.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-pdf/lib/PDFBox-0.7.2-log4j.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-rss/lib/xmlrpc-1.2.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-swf/lib/javaswf.jar"/> + <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-automaton/lib/automaton.jar"/> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/> - <classpathentry kind="lib" path="third-party/nutch/build"/> - <classpathentry kind="lib" path="third-party/nutch/conf"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/conf"/> <classpathentry kind="output" path="target"/> </classpath> Deleted: trunk/archive-access/projects/nutchwax/maven.xml =================================================================== --- trunk/archive-access/projects/nutchwax/maven.xml 2007-03-21 23:27:40 UTC (rev 1625) +++ trunk/archive-access/projects/nutchwax/maven.xml 2007-03-22 00:07:39 UTC (rev 1626) @@ -1,181 +0,0 @@ -<?xml version="1.0"?> -<project - xmlns:j="jelly:core" - xmlns:define="jelly:define" - xmlns:doc="doc" - xmlns:artifact="artifact" - xmlns:util="jelly:util" - xmlns:maven="jelly:maven" - xmlns:ant="jelly:ant"> - - <goal name="site:update-sourceforge" description="Update sf."> - <exec executable="rsync" > - <arg value="--quiet" /> - <arg value="--archive" /> - <arg value="--rsh=ssh" /> - <arg - value="${maven.build.dir}/docs/"/> - <arg value="${maven.username}@archive-access.sf.net:/home/groups/a/ar/archive-access/htdocs/projects/nutchwax/" /> - </exec> - </goal> - - <preGoal name="xdoc:jelly-transform"> - <attainGoal name="faq" /> - <attainGoal name="docbook"/> - </preGoal> - - <postGoal name="xdoc:jelly-transform"> - <!--Overwrite the maven stylesheets because the do that - different greys on odd and even rows in tables and its messing up - my home page--> - 
<echo message="Copying stylesheet ${basedir}/xdocs/style/maven-theme.css to ${maven.build.dir}/docs/style/" /> - <copy todir="${maven.build.dir}/docs/style/" - file="${basedir}/xdocs/style/maven-theme.css" overwrite="true" /> - </postGoal> - - <postGoal name="site:generate" > - <copy todir="${maven.build.dir}/docs/images"> - <fileset dir="${basedir}/src/images" > - <include name="**/*" /> - </fileset> - </copy> - </postGoal> - - <!--Call out to ant goals.--> - <goal name="ant:war"><ant:ant target="war" /></goal> - <goal name="ant:clean" ><ant:ant target="clean" /></goal> - <goal name="ant:jar"><ant:ant target="jar" /></goal> - <goal name="ant:job" prereqs="ant:war"><ant:ant target="job" /></goal> - <goal name="ant:compile"><ant:ant target="compile" /></goal> - <goal name="ant:compile-plugins"><ant:ant target="compile-plugins" /></goal> - <goal name="ant:init"><ant:ant target="init" /></goal> - <preGoal name="clean" > - <!--Maven in 1.0.2, the maven on build box, doesn't support - the 'available' argument used in the ant clean target. Uncomment until - we update the build box to use a later maven. - - Note: The build box scripts are doing this build of the nutchwax jar. - Remove this special handling when we move to new maven. - - I just tired replacing the ant jar with a new one but that fails; - maven has hardcoded dependencies on old 1.5.x ant. - - <attainGoal name="ant:clean" /> - --> - </preGoal> - - <goal name="jar:jar"><!--Block building of jar--></goal> - - <postGoal name="dist:build-setup"> - <ant:available file="${basedir}/target/nutchwax.jar" - property="job.jar.exists"/> - <ant:fail - message="Must run ant 'jar' and 'war' targets before maven dist" - unless="job.jar.exists" /> - <!--Maven in 1.0.2, the maven on build box, doesn't support - the 'available' argument used in the ant clean target. Uncomment until - we update the build box to use a later maven. 
- <attainGoal name="ant:war" /> - <attainGoal name="ant:job" /> - --> - </postGoal> - - <postGoal name="dist:prepare-bin-filesystem"> - <echo>[nutchwax] dist:prepare-bin-filesystem postGoal</echo> - <attainGoal name="docbook" /> - <copy todir="${maven.dist.bin.assembly.dir}/bin" - filtering="true" overwrite="true" > - <fileset dir="${basedir}/bin" /> - </copy> - <copy todir="${maven.dist.bin.assembly.dir}"> - <fileset dir="${basedir}/target/"> - <include name="nutchwax.war"/> - <include name="nutchwax.jar"/> - </fileset> - </copy> - <!--Rename the nutchwax.jar to include a version--> - <ant:rename src="${maven.dist.bin.assembly.dir}/nutchwax.jar" - dest="${maven.dist.bin.assembly.dir}/nutchwax-${pom.currentVersion}.jar" /> - <attainGoal name="copy_docbook" /> - <!--Copy over the wax-default.xml and hadoop-site.xml. - They're referred to in documentation. - --> - <copy todir="${maven.dist.bin.assembly.dir}/docs"> - <fileset dir="${basedir}/conf" > - <include name="hadoop-site.xml.template"/> - <include name="wax-default.xml"/> - </fileset> - </copy> - <!--Copy over the parse-pdf.sh so available outside of - nutchwax.jar bundle for those running standalone mode - --> - <copy todir="${maven.dist.bin.assembly.dir}/bin"> - <fileset dir="${basedir}/src/plugin/parse-waxext/bin" > - <include name="parse-pdf.sh"/> - </fileset> - </copy> - </postGoal> - - <postGoal name="dist:prepare-src-filesystem"> - <echo>[nutchwax] dist:prepare-src-filesystem postGoal</echo> - <copy todir="${maven.dist.src.assembly.dir}/bin" - filtering="true" overwrite="true" > - <fileset dir="${basedir}/bin" /> - </copy> - <copy todir="${maven.dist.src.assembly.dir}/" overwrite="true" > - <fileset dir="${basedir}" > - <include name="README.txt" /> - </fileset> - </copy> - <copy todir="${maven.dist.src.assembly.dir}/conf" overwrite="true" > - <fileset dir="${basedir}/conf" /> - </copy> - <copy todir="${maven.dist.src.assembly.dir}/xdocs" overwrite="true" > - <fileset dir="${basedir}/xdocs" /> - </copy> 
- <copy todir="${maven.dist.src.assembly.dir}/lib" overwrite="true" > - <fileset dir="${basedir}/lib" /> - </copy> - </postGoal> - - <goal name="docbook"> - <!--Check that the jimi jars are present. If not, skip docbook. - --> - <j:set var="sDocbookSupportPresent" value="false" /> - <util:available file="${maven.repo.local}/jimi/jars" > - <j:set var="sDocbookSupportPresent" value="true"/> - </util:available> - <j:if test="${sDocbookSupportPresent == 'true'}"> - <attainGoal name="sdocbook:generate-html"/> - <!--<attainGoal name="sdocbook:generate-pdf"/> - --> - </j:if> - <!-- else --> - <j:if test="${sDocbookSupportPresent != 'true'}"> - <echo - message="sdocbook SUN jimi jar not present. Skipping docbook build." /> - </j:if> - </goal> - - <goal name="copy_docbook"> - <!--Copies docbooks under dist docs and under webapps. This goal runs - after these dirs have been setup. - --> - <property name="docbook.assembled" - value="${maven.build.dir}/docs/articles/" /> - <j:set var="docbookPresent" value="false" /> - <util:available file="${docbook.assembled}" > - <j:set var="docbookPresent" value="true"/> - </util:available> - <j:if test="${docbookPresent == 'true'}"> - <echo message="Copying over docbook" /> - <!--Copy under docs in binary build--> - <mkdir dir="${maven.dist.bin.assembly.dir}/docs/articles/" /> - <copy todir="${maven.dist.bin.assembly.dir}/docs/articles/" - verbose="true"> - <fileset dir="${maven.build.dir}/docs/articles/" /> - </copy> - </j:if> - </goal> - -</project> Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/pom.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/pom.xml 2007-03-21 23:27:40 UTC (rev 1625) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/pom.xml 2007-03-22 00:07:39 UTC (rev 1626) @@ -27,8 +27,8 @@ <source>1.5</source> <target>1.5</target> <!-- - <compilerArgument> -verbose -cp ../third-party/nutch/build/classes</compilerArgument> - 
<compilerArgument> -verbose -classpath ../third-party/nutch/build/classes</compilerArgument> + <compilerArgument> -verbose -cp ../nutchwax-thirdparty/nutch/build/classes</compilerArgument> + <compilerArgument> -verbose -classpath ../nutchwax-thirdparty/nutch/build/classes</compilerArgument> --> </configuration> </plugin> Modified: trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml 2007-03-21 23:27:40 UTC (rev 1625) +++ trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml 2007-03-22 00:07:39 UTC (rev 1626) @@ -34,7 +34,7 @@ </includes> </fileSet> <fileSet> - <directory>../third-party/nutch/build/plugins</directory> + <directory>../nutchwax-thirdparty/nutch/build/plugins</directory> <outputDirectory>/plugins</outputDirectory> <includes> <include>analysis-*/**</include> @@ -54,7 +54,7 @@ </excludes> </fileSet> <fileSet> - <directory>../third-party/nutch/conf</directory> + <directory>../nutchwax-thirdparty/nutch/conf</directory> <outputDirectory>/</outputDirectory> <includes> <include>mime-types.xml</include> @@ -64,7 +64,7 @@ </includes> </fileSet> <fileSet> - <directory>../third-party/nutch/lib</directory> + <directory>../nutchwax-thirdparty/nutch/lib</directory> <outputDirectory>/lib</outputDirectory> <includes> <include>commons-lang*</include> @@ -81,7 +81,7 @@ class can be inside of a jar. 
<fileSet> - <directory>../third-party/nutch/build</directory> + <directory>../nutchwax-thirdparty/nutch/build</directory> <outputDirectory>/lib</outputDirectory> <includes> <include>nutch*jar</include> @@ -89,7 +89,7 @@ </fileSet> --> <fileSet> - <directory>../third-party/nutch/build/classes</directory> + <directory>../nutchwax-thirdparty/nutch/build/classes</directory> <outputDirectory>/</outputDirectory> </fileSet> </fileSets> Property changes on: trunk/archive-access/projects/nutchwax/nutchwax-thirdparty ___________________________________________________________________ Name: svn:externals + nutch -r 508238 http://svn.apache.org/repos/asf/lucene/nutch/trunk Modified: trunk/archive-access/projects/nutchwax/nutchwax-thirdparty/pom.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-thirdparty/pom.xml 2007-03-21 23:27:40 UTC (rev 1625) +++ trunk/archive-access/projects/nutchwax/nutchwax-thirdparty/pom.xml 2007-03-22 00:07:39 UTC (rev 1626) @@ -38,7 +38,7 @@ we can invoke it from eclipse. --> <echo>Building nutch third-party dependency (jar)</echo> - <ant dir="../third-party/nutch" target="jar" inheritAll="false" > + <ant dir="nutch" target="jar" inheritAll="false" > <property name="build.compiler" value="extJavac" /> </ant> <!--Copy over the nutch classes to target/classes so they @@ -46,10 +46,10 @@ has on its classpath when it goes to build subsequent modules). 
--> <copy todir="target/classes" overwrite="true"> - <fileset dir="../third-party/nutch/build/classes" /> + <fileset dir="nutch/build/classes" /> </copy> <echo>Building nutch third-party dependency (plugins)</echo> - <ant dir="../third-party/nutch" target="compile-plugins" inheritAll="false" > + <ant dir="nutch" target="compile-plugins" inheritAll="false" > <property name="build.compiler" value="extJavac" /> </ant> </tasks> @@ -64,7 +64,7 @@ <configuration> <tasks> <echo>Cleaning nutch third-party dependency</echo> - <ant dir="../third-party/nutch" target="clean" inheritAll="false" > + <ant dir="nutch" target="clean" inheritAll="false" > ... [truncated message content] |
From: <sta...@us...> - 2007-03-22 22:56:35
|
Revision: 1629 http://archive-access.svn.sourceforge.net/archive-access/?rev=1629&view=rev Author: stack-sf Date: 2007-03-22 15:50:34 -0700 (Thu, 22 Mar 2007) Log Message: ----------- M projects/nutchwax/pom.xml Remove duplicated reporting section. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/pom.xml trunk/archive-access/projects/nutchwax/src/site/site.xml Modified: trunk/archive-access/projects/nutchwax/pom.xml =================================================================== --- trunk/archive-access/projects/nutchwax/pom.xml 2007-03-22 21:15:50 UTC (rev 1628) +++ trunk/archive-access/projects/nutchwax/pom.xml 2007-03-22 22:50:34 UTC (rev 1629) @@ -249,30 +249,12 @@ </plugin> </plugins> </build> + <reporting> + <outputDirectory>target/site</outputDirectory> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-javadoc-plugin</artifactId> - <configuration> - <source> - ${basedir}/src/java - </source> - <overview> - ${basedir}/src/java/overview.html - </overview> - <aggregate> - true - </aggregate> - <!-- - <additionalparam> - -classpath nutchwax-thirdparty/build/classes - </additionalparam> - --> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-project-info-reports-plugin</artifactId> <reportSets> <reportSet> @@ -288,8 +270,24 @@ </reportSet> </reportSets> </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + <configuration> + <source> + ${basedir}/src/java + </source> + <overview> + ${basedir}/src/java/overview.html + </overview> + <aggregate> + true + </aggregate> + </configuration> + </plugin> </plugins> </reporting> + <repositories> <repository> <releases> @@ -309,37 +307,6 @@ </repository> </repositories> - <reporting> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-javadoc-plugin</artifactId> - <configuration> - <javadocDirectory> - 
${basedir}/src/java - </javadocDirectory> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-project-info-reports-plugin</artifactId> - <reportSets> - <reportSet> - <reports> - <report>dependencies</report> - <report>project-team</report> - <report>mailing-list</report> - <report>cim</report> - <report>issue-tracking</report> - <report>license</report> - <report>scm</report> - </reports> - </reportSet> - </reportSets> - </plugin> - </plugins> - </reporting> - <!--Needed for docbkx plugin and dependencies. --> <pluginRepositories> Modified: trunk/archive-access/projects/nutchwax/src/site/site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/src/site/site.xml 2007-03-22 21:15:50 UTC (rev 1628) +++ trunk/archive-access/projects/nutchwax/src/site/site.xml 2007-03-22 22:50:34 UTC (rev 1629) @@ -44,6 +44,5 @@ <!--I want to get the sourceforge image in here but not sure how. --> - </body> </project> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-03-23 05:00:14
|
Revision: 1631 http://archive-access.svn.sourceforge.net/archive-access/?rev=1631&view=rev Author: stack-sf Date: 2007-03-22 22:00:15 -0700 (Thu, 22 Mar 2007) Log Message: ----------- * src/images/nutchwax.jpg * src/images/nwa.jpg * src/images/iipc.gif Deleted. Were moved under src/site/resources. * src/plugin/build-plugin.xml * nutchwax-webapp/src/main/assembly/assemble-war.xml * nutchwax-job/src/main/assembly/assemble-job.xml * nutchwax-webapp/pom.xml Build nutchwax plugins into nutchwax-plugins target. Then we can add this dir to nutchwax classpath in eclipse and our plugins can be found (Otherwise they were in a blind spot -- in the eclipse target dir). * .classpath Add in nutchwax conf and plugins target dirs. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/.classpath trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml trunk/archive-access/projects/nutchwax/nutchwax-webapp/pom.xml trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/assembly/assemble-war.xml trunk/archive-access/projects/nutchwax/src/plugin/build-plugin.xml Removed Paths: ------------- trunk/archive-access/projects/nutchwax/src/images/ Modified: trunk/archive-access/projects/nutchwax/.classpath =================================================================== --- trunk/archive-access/projects/nutchwax/.classpath 2007-03-23 00:30:25 UTC (rev 1630) +++ trunk/archive-access/projects/nutchwax/.classpath 2007-03-23 05:00:15 UTC (rev 1631) @@ -184,7 +184,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.10.1-core.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.10.1-core.jar" /> <classpathentry 
kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/> @@ -236,7 +236,9 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-automaton/lib/automaton.jar"/> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/> + <classpathentry kind="lib" path="conf"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/conf"/> + <classpathentry kind="lib" path="nutchwax-plugins/target"/> <classpathentry kind="output" path="target"/> </classpath> Modified: trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml 2007-03-23 00:30:25 UTC (rev 1630) +++ trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml 2007-03-23 05:00:15 UTC (rev 1631) @@ -6,7 +6,7 @@ <includeBaseDirectory>false</includeBaseDirectory> <fileSets> <fileSet> - <directory>../target/wax-plugins</directory> + <directory>../nutchwax-plugins/target/wax-plugins</directory> <outputDirectory>/wax-plugins</outputDirectory> </fileSet> <fileSet> Modified: trunk/archive-access/projects/nutchwax/nutchwax-webapp/pom.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-webapp/pom.xml 2007-03-23 00:30:25 UTC (rev 1630) +++ trunk/archive-access/projects/nutchwax/nutchwax-webapp/pom.xml 2007-03-23 05:00:15 UTC (rev 1631) @@ -84,7 +84,7 @@ </copy> <copy todir="target/${artifactId}-${project.version}/WEB-INF/classes/wax-plugins" overwrite="true"> - <fileset 
dir="../target/wax-plugins" /> + <fileset dir="../nutchwax-plugins/target/wax-plugins" /> </copy> <copy todir="target/${artifactId}-${project.version}/WEB-INF/classes" overwrite="true"> Modified: trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/assembly/assemble-war.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/assembly/assemble-war.xml 2007-03-23 00:30:25 UTC (rev 1630) +++ trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/assembly/assemble-war.xml 2007-03-23 05:00:15 UTC (rev 1631) @@ -25,7 +25,7 @@ </fileSet> <fileSet> - <directory>../target/wax-plugins</directory> + <directory>../nutchwax-plugins/target/wax-plugins</directory> <outputDirectory>/wax-plugins</outputDirectory> </fileSet> <fileSet> Modified: trunk/archive-access/projects/nutchwax/src/plugin/build-plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/src/plugin/build-plugin.xml 2007-03-23 00:30:25 UTC (rev 1630) +++ trunk/archive-access/projects/nutchwax/src/plugin/build-plugin.xml 2007-03-23 05:00:15 UTC (rev 1631) @@ -37,7 +37,7 @@ <property name="build.classes" location="${build.dir}/classes"/> <property name="build.test" location="${build.dir}/test"/> - <property name="deploy.dir" location="${nutch.root}/target//wax-plugins/${name}"/> + <property name="deploy.dir" location="${nutch.root}/nutchwax-plugins/target/wax-plugins/${name}"/> <property name="javac.deprecation" value="off"/> <property name="javac.debug" value="on"/> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-03-26 20:57:25
|
Revision: 1634 http://archive-access.svn.sourceforge.net/archive-access/?rev=1634&view=rev Author: stack-sf Date: 2007-03-26 13:57:26 -0700 (Mon, 26 Mar 2007) Log Message: ----------- Updating underlying nutch and hadoop. Nutch is about to release 0.9 so trying TRUNK. Hadoop in nutch is 0.12.2 so updating to that too. * conf/wax-default.xml Set default for new parser.caching.forbidden.policy to 'all' rather than 'noarchive'. * src/java/org/archive/access/nutch/jobs/ImportArcs.java Signature for MapFile.Writer changed. * nutchwax-thirdparty Update from r508238 to r521933 * .classpath Update nutch lib references to point at newer jars. * nutchwax-job/src/main/assembly/assemble-job.xml In distributed mode, seems like the nutchwax classes CANNOT be inside a jar -- else they won't be found. * nutchwax-webapp/src/main/webapp/search.jsp Add in handling for no caching. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/.classpath trunk/archive-access/projects/nutchwax/conf/wax-default.xml trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/webapp/search.jsp trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportArcs.java Property Changed: ---------------- trunk/archive-access/projects/nutchwax/nutchwax-thirdparty/ Modified: trunk/archive-access/projects/nutchwax/.classpath =================================================================== --- trunk/archive-access/projects/nutchwax/.classpath 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/.classpath 2007-03-26 20:57:26 UTC (rev 1634) @@ -55,7 +55,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-jakarta-poi/poi-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-jakarta-poi/poi-scratchpad-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/build/lib-log4j/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-lucene-analyzers/lucene-analyzers-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-lucene-analyzers/lucene-analyzers-2.1.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-nekohtml/nekohtml-0.9.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-parsems/lib-parsems.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-regex-filter/lib-regex-filter.jar"/> @@ -98,7 +98,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-jakarta-poi/poi-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-jakarta-poi/poi-scratchpad-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-log4j/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-lucene-analyzers/lucene-analyzers-2.0.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-nekohtml/nekohtml-0.9.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-parsems/lib-parsems.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-regex-filter/lib-regex-filter.jar"/> @@ -142,7 +141,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/scoring-opic/scoring-opic.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/subcollection/subcollection.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-basic/summary-basic.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-lucene/lucene-highlighter-2.0.0.jar"/> + <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/build/plugins/summary-lucene/lucene-highlighter-2.1.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-lucene/summary-lucene.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-automaton/automaton.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-automaton/urlfilter-automaton.jar"/> @@ -184,9 +183,9 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.10.1-core.jar" /> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar" /> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t-0.5.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/ant.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/commons-el.jar"/> @@ -195,8 +194,8 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/jsp-api.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/junit-3.8.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/log4j-1.2.13.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/lucene-core-2.0.0.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/lucene-misc-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/lucene-core-2.1.0.jar"/> + <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/lib/lucene-misc-2.1.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/jakarta-oro-2.0.8.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/jaxen-1.1-beta-7.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/pmd-3.6.jar"/> @@ -216,7 +215,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-log4j/lib/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.1.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/jaxen-core.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/jaxen-jdom.jar"/> @@ -232,7 +231,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-rss/lib/xmlrpc-1.2.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-swf/lib/javaswf.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-automaton/lib/automaton.jar"/> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="con" 
path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/> Modified: trunk/archive-access/projects/nutchwax/conf/wax-default.xml =================================================================== --- trunk/archive-access/projects/nutchwax/conf/wax-default.xml 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/conf/wax-default.xml 2007-03-26 20:57:26 UTC (rev 1634) @@ -198,6 +198,16 @@ </property> <property> +<name>parser.caching.forbidden.policy</name> + <value>all</value> + <description>If a site (or a page) requests through its robot metatags + that it should not be shown as cached content, apply this policy. Currently + three keywords are recognized: "none" ignores any "noarchive" directives. + "content" doesn't show the content, but shows summaries (snippets). + "all" doesn't show either content or summaries.</description> +</property> + +<property> <name>wax.index.all</name> <value>true</value> <description>If set to true, all content types are indexed. Otherwise Modified: trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml 2007-03-26 20:57:26 UTC (rev 1634) @@ -74,12 +74,6 @@ <include>concurrent*</include> </includes> </fileSet> - <!-- - For some reason, adding the nutch jar does not - work. You must add the classes at root level - of the job jar. Strange given the main nutchwax - class can be inside of a jar. - <fileSet> <directory>../nutchwax-thirdparty/nutch/build</directory> <outputDirectory>/lib</outputDirectory> @@ -87,11 +81,24 @@ <include>nutch*jar</include> </includes> </fileSet> - --> + <!-- + I used to add in the nutchwax-core jar but when I do that, + running in distributed mode, ClassNotFound issues. 
It + starts with not being able to find nutch classes and then + even if I put the nutch jar in $HADOOP_HOME/lib, then it + cannot find the content of archive-mapred jar. Including + the nutchwax-core classes seems to do the trick (Its how + it was done before move to m2). + <fileSet> <directory>../nutchwax-thirdparty/nutch/build/classes</directory> <outputDirectory>/</outputDirectory> </fileSet> + --> + <fileSet> + <directory>../nutchwax-core/target/classes</directory> + <outputDirectory>/</outputDirectory> + </fileSet> </fileSets> <dependencySets> <dependencySet> @@ -107,8 +114,10 @@ <exclude>com.sleepycat:je</exclude> <exclude>junit:junit</exclude> <exclude>javax.servlet:servlet-api</exclude> + <exclude>it.unimi.dsi:mg4j</exclude> <exclude>org.archive.nutchwax:nutchwax-thirdparty</exclude> <exclude>org.archive.nutchwax:nutchwax-plugins</exclude> + <exclude>org.archive.nutchwax:nutchwax-core</exclude> </excludes> </dependencySet> </dependencySets> Property changes on: trunk/archive-access/projects/nutchwax/nutchwax-thirdparty ___________________________________________________________________ Name: svn:externals - nutch -r 508238 http://svn.apache.org/repos/asf/lucene/nutch/trunk + nutch -r 521933 http://svn.apache.org/repos/asf/lucene/nutch/trunk Modified: trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/webapp/search.jsp =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/webapp/search.jsp 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/webapp/search.jsp 2007-03-26 20:57:26 UTC (rev 1634) @@ -10,6 +10,7 @@ import="java.util.regex.Pattern" import="org.apache.nutch.html.Entities" + import="org.apache.nutch.metadata.Nutch" import="org.apache.nutch.searcher.*" import="org.apache.nutch.searcher.Summary.Fragment" import="org.apache.nutch.plugin.*" @@ -189,7 +190,15 @@ Hit[] show = hits.getHits(start, realEnd-start); 
HitDetails[] details = bean.getDetails(show); Summary[] summaries = bean.getSummary(details, query); + String caching = detail.getValue("cache"); + boolean showSummary = true; + boolean showCached = true; + if (caching != null) { + showSummary = !caching.equals(Nutch.CACHING_FORBIDDEN_ALL); + showCached = !caching.equals(Nutch.CACHING_FORBIDDEN_NONE); + } + bean.LOG.info("total hits: " + hits.getTotal()); String collectionsHost = nutchConf.get("wax.host", "examples.com"); @@ -245,7 +254,7 @@ %> <b><a href="<%=target%>"><%=Entities.encode(title)%></a></b> - <% if (!"".equals(summary)) { %> + <% if (!"".equals(summary) && showSummary) { %> <br><%=summary%> <% } %> <br> Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportArcs.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-03-26 20:57:26 UTC (rev 1634) @@ -646,7 +646,7 @@ throws IOException { Path f = new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME); final Path fetch = new Path(f, name); - final MapFile.Writer fetchOut = new MapFile.Writer(fs, + final MapFile.Writer fetchOut = new MapFile.Writer(job, fs, fetch.toString(), Text.class, CrawlDatum.class); // Write a cdx file. Write w/o compression. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-03-26 21:08:11
|
Revision: 1635 http://archive-access.svn.sourceforge.net/archive-access/?rev=1635&view=rev Author: stack-sf Date: 2007-03-26 14:08:10 -0700 (Mon, 26 Mar 2007) Log Message: ----------- M nutchwax/src/java/overview.html M nutchwax/README.txt Update hadoop and nutch versions. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/README.txt trunk/archive-access/projects/nutchwax/src/java/overview.html Modified: trunk/archive-access/projects/nutchwax/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/README.txt 2007-03-26 20:57:26 UTC (rev 1634) +++ trunk/archive-access/projects/nutchwax/README.txt 2007-03-26 21:08:10 UTC (rev 1635) @@ -6,144 +6,4 @@ HADOOP VERSION AND PATCHES -Hadoop release version is 0.9.2. 0.9.1 fails when you try to use local -filesystem. Turning of speculative reduce seems to fix things but the -hadoop 0.9.1. has it set to true in bundled hadoop-default.xml. See -HADOOP-827. - - -NUTCH VERSION AND PATCHES - -Below are patches made against the nutch thats built into nutchwax. -You may be able to do without them. Apply if you you are OOME'ing -because too many links found building crawldb or merging segments. - -# This patch fixes SegmentMerger OOME'ing. It puts upper bound on links -# we add to a page (Saw OOME in 1.8 Gig heap trying to add 500k links -# to single key. Also includes part of NUTCH-333. -Index: src/java/org/apache/nutch/segment/SegmentMerger.java -=================================================================== ---- src/java/org/apache/nutch/segment/SegmentMerger.java (revision 486923) -+++ src/java/org/apache/nutch/segment/SegmentMerger.java (working copy) -@@ -41,6 +41,7 @@ - import org.apache.nutch.parse.ParseText; - import org.apache.nutch.protocol.Content; - import org.apache.nutch.util.NutchConfiguration; -+import org.apache.nutch.util.NutchJob; - - /** - * This tool takes several segments and merges their data together. 
Only the -@@ -98,6 +99,7 @@ - private URLFilters filters = null; - private long sliceSize = -1; - private long curCount = 0; -+ private int maxLinked; - - /** - * Wraps inputs in an {@link MetaWrapper}, to permit merging different -@@ -257,6 +259,7 @@ - if (sliceSize > 0) { - sliceSize = sliceSize / conf.getNumReduceTasks(); - } -+ this.maxLinked = conf.getInt("db.linked.max", 1000); - } - - private Text newKey = new Text(); -@@ -301,7 +304,7 @@ - String lastPDname = null; - String lastPTname = null; - TreeMap linked = new TreeMap(); -- while (values.hasNext()) { -+ VALUES_LOOP: while (values.hasNext()) { - MetaWrapper wrapper = (MetaWrapper)values.next(); - Object o = wrapper.get(); - String spString = wrapper.getMeta(SEGMENT_PART_KEY); -@@ -355,6 +358,17 @@ - linked.put(sp.segmentName, segLinked); - } - segLinked.add(val); -+ if (segLinked.size() <= this.maxLinked) { -+ segLinked.add(val); -+ } else { -+ LOG.info("SKIPPING SEGLINKED LARGE " + -+ segLinked.size() + ", * linked size " + linked.size() + -+ ", name " + sp.segmentName + ", key " + key); -+ break VALUES_LOOP; -+ } -+ if ((segLinked.size() % 1000) == 0) { -+ LOG.info("SEGLINKED SIZE " + segLinked.size() + ", key " + key); -+ } - } else { - throw new IOException("Cannot determine segment part: " + sp.partName); - } -@@ -460,7 +474,7 @@ - if (LOG.isInfoEnabled()) { - LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName); - } -- JobConf job = new JobConf(getConf()); -+ JobConf job = new NutchJob(getConf()); - job.setJobName("mergesegs " + out + "/" + segmentName); - job.setBoolean("segment.merger.filter", filter); - job.setLong("segment.merger.slice", slice); -Index: src/java/org/apache/nutch/segment/SegmentReader.java -=================================================================== ---- src/java/org/apache/nutch/segment/SegmentReader.java (revision 486923) -+++ src/java/org/apache/nutch/segment/SegmentReader.java (working copy) -@@ -36,6 +36,7 @@ - import 
org.apache.nutch.protocol.Content; - import org.apache.nutch.util.LogUtil; - import org.apache.nutch.util.NutchConfiguration; -+import org.apache.nutch.util.NutchJob; - - /** Dump the content of a segment. */ - public class SegmentReader extends Configured implements Reducer { -@@ -147,7 +148,7 @@ - } - - private JobConf createJobConf() { -- JobConf job = new JobConf(getConf()); -+ JobConf job = new NutchJob(getConf()); - job.setBoolean("segment.reader.co", this.co); - job.setBoolean("segment.reader.fe", this.fe); - job.setBoolean("segment.reader.ge", this.ge); - -# NUTCH-311 -# -Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java -=================================================================== ---- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revision 486923) -+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (working copy) -@@ -38,11 +38,13 @@ - private ArrayList linked = new ArrayList(); - private ScoringFilters scfilters = null; - private boolean additionsAllowed; -+ private int maxLinked; - - public void configure(JobConf job) { - retryMax = job.getInt("db.fetch.retry.max", 3); - scfilters = new ScoringFilters(job); - additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true); -+ this.maxLinked = job.getInt("db.linked.max", 10000); - } - - public void close() {} -@@ -56,7 +58,7 @@ - byte[] signature = null; - linked.clear(); - -- while (values.hasNext()) { -+ VALUES_LOOP: while (values.hasNext()) { - CrawlDatum datum = (CrawlDatum)values.next(); - - if (highest == null || datum.getStatus() > highest.getStatus()) { -@@ -71,6 +73,10 @@ - break; - case CrawlDatum.STATUS_LINKED: - linked.add(datum); -+ if (linked.size() > this.maxLinked) { -+ LOG.info("Breaking. " + key + " has > than " + this.maxLinked); -+ break VALUES_LOOP; -+ } - break; - case CrawlDatum.STATUS_SIGNATURE: - signature = datum.getSignature(); +NutchWAX is built against nutch 0.9 which in turn uses hadoop 0.12.2. 
Modified: trunk/archive-access/projects/nutchwax/src/java/overview.html =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/overview.html 2007-03-26 20:57:26 UTC (rev 1634) +++ trunk/archive-access/projects/nutchwax/src/java/overview.html 2007-03-26 21:08:10 UTC (rev 1635) @@ -51,8 +51,7 @@ the platform we use to run indexing jobs atop. Hadoop is an open source implementation of <a href="http://labs.google.com/papers/mapreduce.html">Google mapreduce</a> and <a href="http://labs.google.com/papers/gfs.html">Google -GFS</a>. NutchWAX 0.10.0 requires Hadoop 0.9.2. It will not work with later -versions. Hadoop has its own set of requirements. See +GFS</a>. NutchWAX requires Hadoop 0.12.2. Hadoop has its own set of requirements. See <i>Requirements</i> about midways down on the <a href="http://lucene.apache.org/hadoop/docs/api/overview-summary.html">Hadoop API</a> page. Hadoop binaries are available for download off the This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-03-27 01:22:35
|
Revision: 1642 http://archive-access.svn.sourceforge.net/archive-access/?rev=1642&view=rev Author: stack-sf Date: 2007-03-26 18:22:31 -0700 (Mon, 26 Mar 2007) Log Message: ----------- M nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java Use LineRecordReader (TODO: Finish analysis). D nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java A nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java Renamed as LogsReporter. M nutchwax/.classpath Removed missing plugins. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/.classpath trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java Added Paths: ----------- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java Removed Paths: ------------- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java Modified: trunk/archive-access/projects/nutchwax/.classpath =================================================================== --- trunk/archive-access/projects/nutchwax/.classpath 2007-03-27 01:20:31 UTC (rev 1641) +++ trunk/archive-access/projects/nutchwax/.classpath 2007-03-27 01:22:31 UTC (rev 1642) @@ -71,10 +71,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-ext/parse-ext.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-html/parse-html.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-js/parse-js.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-msexcel/parse-msexcel.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-mspowerpoint/parse-mspowerpoint.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-msword/parse-msword.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-oo/parse-oo.jar"/> <classpathentry kind="lib" 
path="nutchwax-thirdparty/nutch/build/parse-pdf/parse-pdf.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-rss/parse-rss.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-swf/parse-swf.jar"/> @@ -183,7 +179,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar" /> + <classpathentry sourcepath="/home/stack/checkouts/hadoop/src/java" kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t-0.5.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/> Deleted: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java 2007-03-27 01:20:31 UTC (rev 1641) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java 2007-03-27 01:22:31 UTC (rev 1642) @@ -1,119 +0,0 @@ -package org.archive.access.nutch.jobs; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapred.JobClient; -import 
org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Mapper; -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.hadoop.mapred.TextOutputFormat; -import org.apache.hadoop.util.ToolBase; -import org.archive.access.nutch.NutchwaxConfiguration; - - -/** - * Makes a report based off passed log inputs. - * Inputs are logs of a NutchWAX import. Report lists counts of errors, - * problematic ARCs, etc. - * @author stack - * @see org.apache.hadoop.tool.Logalyzer - */ -public class ImportLogsReporter extends ToolBase implements Mapper { - // private final Log LOG = LogFactory.getLog(this.getClass().getName()); - // private long lineCount = 0; - - /** - * Parse first part of the log line. Here are some sample log lines: - * <pre> - * 2007-01-24 15:33:24,954 WARN regex.RegexURLNormalizer - can't find rules for scope 'outlink', using default - * 2007-01-24 15:33:24,570 INFO nutch.ImportArcs - adding http://www.bbswitzerland.ch/images/sbb1.gif http://www.bbswitzerland.ch:80/images/sbb1.gif 1105 image/gif - * </pre> - * Group one of the below regex is WARN or INFO in above. Group two the - * name of the logger (nutch.importArcs in the above). Group three is all - * the rest of the log string. 
- */ - private static final Pattern PREFIX = - Pattern.compile("\\S+\\s+\\S+\\s+(\\S+)\\s+(\\S+)\\s+-\\s+(.*)"); - - public void map(WritableComparable key, Writable value, - OutputCollector output, Reporter reporter) - throws IOException { - // lineCount++; - Matcher m = PREFIX.matcher(value.toString()); - if (!m.matches() || isWARN(m.group(1)) || isERROR(m.group(1))) { - output.collect(key, new Text(value.toString())); - } - } - - protected boolean isWARN(final String level) { - return level.equals("WARN"); - } - - protected boolean isERROR(final String level) { - return level.equals("ERROR"); - } - - public void configure(JobConf job) { - - // TODO Auto-generated method stub - } - - public void close() throws IOException { - // System.out.println(lineCount); - } - - protected void report(final String input, final String output) - throws IOException { - Path inputDir = new Path(input); - if (!FileSystem.get(getConf()).exists(inputDir)) { - throw new FileNotFoundException(input); - } - Path outputDir = new Path(output); - - JobConf jc = new JobConf(getConf()); - jc.setJobName("Import logs reporter"); - - jc.setInputPath(inputDir); - jc.setInputFormat(TextInputFormat.class); - - jc.setMapperClass(this.getClass()); - - jc.setOutputPath(outputDir); - jc.setOutputFormat(TextOutputFormat.class); - jc.setOutputKeyClass(LongWritable.class); - jc.setOutputValueClass(Text.class); - - // Write a single file - jc.setNumReduceTasks(1); - - JobClient.runJob(jc); - } - - public int run(String[] args) throws Exception { - final String usage = "Usage: ImportLogsReporter <input> <output>\n" + - " input Directory of input files listing log file URIs\n" + - " output Where we write resulting report."; - if (args.length != 2) { - System.err.print(usage); - return -1; - } - report(args[0], args[1]); - return 0; - } - - public static void main(String[] args) throws Exception { - System.exit(new ImportLogsReporter(). 
- doMain(NutchwaxConfiguration.getConfiguration(), args)); - } -} Added: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java (rev 0) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java 2007-03-27 01:22:31 UTC (rev 1642) @@ -0,0 +1,123 @@ +package org.archive.access.nutch.jobs; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.util.ToolBase; +import org.archive.access.nutch.NutchwaxConfiguration; +import org.archive.access.nutch.mapred.TaskLogMapRunner; +import org.archive.mapred.ARCMapRunner; + + +/** + * Makes a report based off passed log inputs. + * Inputs are logs of a NutchWAX import. Report lists counts of errors, + * problematic ARCs, etc. + * @author stack + * @see org.apache.hadoop.tool.Logalyzer + */ +public class LogsReporter extends ToolBase implements Mapper { + // private final Log LOG = LogFactory.getLog(this.getClass().getName()); + // private long lineCount = 0; + + /** + * Parse first part of the log line. 
Here are some sample log lines: + * <pre> + * 2007-01-24 15:33:24,954 WARN regex.RegexURLNormalizer - can't find rules for scope 'outlink', using default + * 2007-01-24 15:33:24,570 INFO nutch.ImportArcs - adding http://www.bbswitzerland.ch/images/sbb1.gif http://www.bbswitzerland.ch:80/images/sbb1.gif 1105 image/gif + * </pre> + * Group one of the below regex is WARN or INFO in above. Group two the + * name of the logger (nutch.importArcs in the above). Group three is all + * the rest of the log string. + */ + private static final Pattern PREFIX = + Pattern.compile("\\S+\\s+\\S+\\s+(\\S+)\\s+(\\S+)\\s+-\\s+(.*)"); + + public void map(WritableComparable key, Writable value, + OutputCollector output, Reporter reporter) + throws IOException { + // lineCount++; + Matcher m = PREFIX.matcher(value.toString()); + if (!m.matches() || isWARN(m.group(1)) || isERROR(m.group(1))) { + output.collect(key, new Text(value.toString())); + } + } + + protected boolean isWARN(final String level) { + return level.equals("WARN"); + } + + protected boolean isERROR(final String level) { + return level.equals("ERROR"); + } + + public void configure(JobConf job) { + + // TODO Auto-generated method stub + } + + public void close() throws IOException { + // System.out.println(lineCount); + } + + protected void report(final String input, final String output) + throws IOException { + Path inputDir = new Path(input); + if (!FileSystem.get(getConf()).exists(inputDir)) { + throw new FileNotFoundException(input); + } + Path outputDir = new Path(output); + + JobConf jc = new JobConf(getConf()); + jc.setJobName("Import logs reporter"); + + jc.setMapRunnerClass(TaskLogMapRunner.class); + + jc.setInputPath(inputDir); + jc.setInputFormat(TextInputFormat.class); + + jc.setMapperClass(this.getClass()); + + jc.setOutputPath(outputDir); + jc.setOutputFormat(TextOutputFormat.class); + jc.setOutputKeyClass(LongWritable.class); + jc.setOutputValueClass(Text.class); + + // Write a single file + 
jc.setNumReduceTasks(1); + + JobClient.runJob(jc); + } + + public int run(String[] args) throws Exception { + final String usage = "Usage: ImportLogsReporter <input> <output>\n" + + " input Directory of input files listing log file URIs\n" + + " output Where we write resulting report."; + if (args.length != 2) { + System.err.print(usage); + return -1; + } + report(args[0], args[1]); + return 0; + } + + public static void main(String[] args) throws Exception { + System.exit(new LogsReporter(). + doMain(NutchwaxConfiguration.getConfiguration(), args)); + } +} Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java 2007-03-27 01:20:31 UTC (rev 1641) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java 2007-03-27 01:22:31 UTC (rev 1642) @@ -27,9 +27,12 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.LineRecordReader; import org.apache.hadoop.mapred.MapRunnable; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; @@ -38,7 +41,7 @@ import org.apache.hadoop.util.ReflectionUtils; /** - * Calls a map for every line in a hadoop userlog directory. + * Calls a map for every line of a hadoop userlog directory. * @author stack */ public class TaskLogMapRunner implements MapRunnable { @@ -70,6 +73,12 @@ throws IOException { URL u = new URL(logurl); TaskLogReader tlr = new TaskLogReader(u); - // TODO: Need to upgrade hadoop so can get new LineRecordReader. 
+ LineRecordReader lrr = new LineRecordReader(tlr.getInputStream(), 0, + tlr.getTotalLogSize()); + LongWritable lineKey = new LongWritable(); + Text lineValue = new Text(); + while(lrr.next(lineKey, lineValue)) { + LOG.info(lineKey.toString() + " " + lineValue.toString()); + } } } \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-04-06 20:42:37
|
Revision: 1705 http://archive-access.svn.sourceforge.net/archive-access/?rev=1705&view=rev Author: stack-sf Date: 2007-04-06 13:42:38 -0700 (Fri, 06 Apr 2007) Log Message: ----------- Move m2'ing moving src and plugins to m2 expected locations Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-plugins/pom.xml Added Paths: ----------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/ trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/ trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/main/ trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/main/plugin/ Removed Paths: ------------- trunk/archive-access/projects/nutchwax/src/java/ trunk/archive-access/projects/nutchwax/src/plugin/ Copied: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java (from rev 1704, trunk/archive-access/projects/nutchwax/src/java) Modified: trunk/archive-access/projects/nutchwax/nutchwax-plugins/pom.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-plugins/pom.xml 2007-04-06 19:49:00 UTC (rev 1704) +++ trunk/archive-access/projects/nutchwax/nutchwax-plugins/pom.xml 2007-04-06 20:42:38 UTC (rev 1705) @@ -33,35 +33,35 @@ at the maven dependencies when we go to compile. 
--> <echo>Compiling plugins</echo> - <ant dir="../src/plugin/index-wax" target="deploy" inheritAll="false" + <ant dir="src/plugin/index-wax" target="deploy" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-wax" target="deploy" inheritAll="false" + <ant dir="src/plugin/query-wax" target="deploy" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/parse-default" target="deploy" inheritAll="false" + <ant dir="src/plugin/parse-default" target="deploy" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/parse-waxext" target="deploy" inheritAll="false" + <ant dir="src/plugin/parse-waxext" target="deploy" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-host" target="deploy" inheritAll="false" + <ant dir="src/plugin/query-host" target="deploy" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-anchor" target="deploy" inheritAll="false" + <ant dir="src/plugin/query-anchor" target="deploy" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-title" target="deploy" inheritAll="false" + <ant dir="src/plugin/query-title" target="deploy" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-content" target="deploy" inheritAll="false" + <ant dir="src/plugin/query-content" target="deploy" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> @@ -81,29 +81,29 @@ we can invoke it from eclipse. 
--> <echo>Cleaning plugins</echo> - <ant dir="../src/plugin/index-wax" target="clean" inheritAll="false" + <ant dir="src/plugin/index-wax" target="clean" inheritAll="false" inheritRefs="true"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-wax" target="clean" inheritAll="false"> + <ant dir="src/plugin/query-wax" target="clean" inheritAll="false"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/parse-default" target="clean" inheritAll="false"> + <ant dir="src/plugin/parse-default" target="clean" inheritAll="false"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/parse-waxext" target="clean" inheritAll="false"> + <ant dir="src/plugin/parse-waxext" target="clean" inheritAll="false"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-host" target="clean" inheritAll="false"> + <ant dir="src/plugin/query-host" target="clean" inheritAll="false"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-anchor" target="clean" inheritAll="false"> + <ant dir="src/plugin/query-anchor" target="clean" inheritAll="false"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-title" target="clean" inheritAll="false"> + <ant dir="src/plugin/query-title" target="clean" inheritAll="false"> <property name="build.compiler" value="extJavac" /> </ant> - <ant dir="../src/plugin/query-content" target="clean" inheritAll="false"> + <ant dir="src/plugin/query-content" target="clean" inheritAll="false"> <property name="build.compiler" value="extJavac" /> </ant> </tasks> Copied: trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/main/plugin (from rev 1704, trunk/archive-access/projects/nutchwax/src/plugin) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-04-06 22:54:36
|
Revision: 1707 http://archive-access.svn.sourceforge.net/archive-access/?rev=1707&view=rev Author: stack-sf Date: 2007-04-06 15:54:36 -0700 (Fri, 06 Apr 2007) Log Message: ----------- More m2'ing moving classes to m2 default. * nutchwax-plugins/src/plugin/build-plugin.xml Get core classes from new location. * nutchwax-core/pom.xml Point to new java class locations. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/pom.xml trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/build-plugin.xml Removed Paths: ------------- trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/main/ Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/pom.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/pom.xml 2007-04-06 20:43:49 UTC (rev 1706) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/pom.xml 2007-04-06 22:54:36 UTC (rev 1707) @@ -18,7 +18,7 @@ </dependency> </dependencies> <build> - <sourceDirectory>../src/java</sourceDirectory> + <sourceDirectory>src/main/java</sourceDirectory> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> Modified: trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/build-plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/build-plugin.xml 2007-04-06 20:43:49 UTC (rev 1706) +++ trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/build-plugin.xml 2007-04-06 22:54:36 UTC (rev 1707) @@ -21,7 +21,7 @@ <!--Point at nutchwax home instead of at nutch. --> - <property name="nutch.root" location="${root}/../../../"/> + <property name="nutch.root" location="${root}/../../../../"/> <!--Point at nutch under third-party subdir. 
--> <property name="real.nutch.root" location="${nutch.root}/nutchwax-thirdparty/nutch"/> @@ -52,6 +52,7 @@ <path id="classpath"> <pathelement location="${build.classes}"/> <pathelement location="${nutch.root}/target/classes"/> + <pathelement location="${nutch.root}/nutchwax-core/target/classes"/> <!--IA: Add the nutch jars.--> <fileset dir="${real.nutch.root}/lib"> <include name="*.jar" /> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-04-06 23:27:42
|
Revision: 1708 http://archive-access.svn.sourceforge.net/archive-access/?rev=1708&view=rev Author: stack-sf Date: 2007-04-06 16:27:43 -0700 (Fri, 06 Apr 2007) Log Message: ----------- M nutchwax/nutchwax-core/src/main/java/overview.html M nutchwax/README.txt Update versions. M nutchwax/pom.xml Fix javadoc source pointers so they point to new locations under nutchwax-core. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/README.txt trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/overview.html trunk/archive-access/projects/nutchwax/pom.xml Modified: trunk/archive-access/projects/nutchwax/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/README.txt 2007-04-06 22:54:36 UTC (rev 1707) +++ trunk/archive-access/projects/nutchwax/README.txt 2007-04-06 23:27:43 UTC (rev 1708) @@ -6,4 +6,4 @@ HADOOP VERSION AND PATCHES -NutchWAX is built against nutch 0.9 which in turn uses hadoop 0.12.2. +NutchWAX is built against the nutch 0.9 branch which in turn uses hadoop 0.12.X. Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/overview.html =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/overview.html 2007-04-06 22:54:36 UTC (rev 1707) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/overview.html 2007-04-06 23:27:43 UTC (rev 1708) @@ -51,7 +51,7 @@ the platform we use to run indexing jobs atop. Hadoop is an open source implementation of <a href="http://labs.google.com/papers/mapreduce.html">Google mapreduce</a> and <a href="http://labs.google.com/papers/gfs.html">Google -GFS</a>. NutchWAX requires Hadoop 0.12.2. Hadoop has its own set of +GFS</a>. NutchWAX requires Hadoop 0.12.X. Hadoop has its own set of equirements. 
See <i>Requirements</i> about midways down on the <a href="http://lucene.apache.org/hadoop/docs/api/overview-summary.html">Hadoop API</a> page. Hadoop binaries are available for download off the Modified: trunk/archive-access/projects/nutchwax/pom.xml =================================================================== --- trunk/archive-access/projects/nutchwax/pom.xml 2007-04-06 22:54:36 UTC (rev 1707) +++ trunk/archive-access/projects/nutchwax/pom.xml 2007-04-06 23:27:43 UTC (rev 1708) @@ -275,10 +275,10 @@ <artifactId>maven-javadoc-plugin</artifactId> <configuration> <source> - ${basedir}/src/java + ${basedir}/nutchwax-core/src/main/java </source> <overview> - ${basedir}/src/java/overview.html + ${basedir}/nutchwax-core/src/main/java/overview.html </overview> <aggregate> true This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-04-10 00:03:30
|
Revision: 1711 http://archive-access.svn.sourceforge.net/archive-access/?rev=1711&view=rev Author: stack-sf Date: 2007-04-09 17:03:29 -0700 (Mon, 09 Apr 2007) Log Message: ----------- * nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java Add in new 'multiple' parameter for running multiple concurrent non-mapreduce tasks such as merge and sort. * nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java Job to run multiple concurrent non-mapreduce tasks. * .classpath Update classpath so points to new locations for src. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/.classpath trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java Added Paths: ----------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java Modified: trunk/archive-access/projects/nutchwax/.classpath =================================================================== --- trunk/archive-access/projects/nutchwax/.classpath 2007-04-09 16:45:35 UTC (rev 1710) +++ trunk/archive-access/projects/nutchwax/.classpath 2007-04-10 00:03:29 UTC (rev 1711) @@ -1,14 +1,6 @@ <?xml version="1.0" encoding="UTF-8"?> <classpath> - <classpathentry kind="src" path="src/java"/> - <classpathentry kind="src" path="src/plugin/index-wax/src/java"/> - <classpathentry kind="src" path="src/plugin/parse-default/src/java"/> - <classpathentry kind="src" path="src/plugin/parse-waxext/src/java"/> - <classpathentry kind="src" path="src/plugin/query-anchor/src/java"/> - <classpathentry kind="src" path="src/plugin/query-content/src/java"/> - <classpathentry kind="src" path="src/plugin/query-host/src/java"/> - <classpathentry kind="src" path="src/plugin/query-title/src/java"/> - <classpathentry kind="src" path="src/plugin/query-wax/src/java"/> + <classpathentry kind="src" path="nutchwax-core/src/main/java"/> <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/java"/> 
<classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/index-basic/src/java"/> <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/index-more/src/java"/> @@ -45,7 +37,6 @@ <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-pass/src/test"/> <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-regex/src/java"/> <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/plugin/urlnormalizer-regex/src/test"/> - <classpathentry kind="src" path="nutchwax-thirdparty/nutch/src/test"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/clustering-carrot2/clustering-carrot2.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/creativecommons/creativecommons.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/index-basic/index-basic.jar"/> @@ -65,7 +56,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-xml/saxpath.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-xml/xercesImpl.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/microformats-reltag/microformats-reltag.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/nutch-0.9-dev.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/nutch-extensionpoints/nutch-extensionpoints.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/ontology/ontology.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-ext/parse-ext.jar"/> @@ -179,7 +169,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/> - <classpathentry sourcepath="/home/stack/checkouts/hadoop/src/java" kind="lib" 
path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar" sourcepath="/home/stack/checkouts/hadoop/src/java"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t-0.5.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/> Added: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java (rev 0) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 00:03:29 UTC (rev 1711) @@ -0,0 +1,219 @@ +package org.archive.access.nutch; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Timer; +import java.util.TimerTask; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.ToolBase; +import org.apache.nutch.util.NutchConfiguration; + +/** + * Run multiple concurrent tasks. 
+ * Takes input that has per line the name of the class to run and the arguments + * to pass. Use this mapreduce job to run multiple concurrent merges or + * multiple concurrent sorts, etc. Will run as many tasks as there are input + * lines. + * @author stack + */ +public class Multiple extends ToolBase implements Mapper { + public void map(WritableComparable key, Writable value, + OutputCollector output, final Reporter reporter) + throws IOException { + final String [] words = ("PADDING_FOR_DOCLASS_BELOW " + + value.toString()).split("\\s"); + if (words.length <= 1) { + return; + } + // Set a timer running that will update reporter on a period. + Timer t = new Timer(false); + t.scheduleAtFixedRate(new TimerTask() { + @Override + public void run() { + try { + reporter.setStatus("Running " + words[1]); + } catch (IOException e) { + e.printStackTrace(); + } + }}, 0, 3000); + try { + Nutchwax.doClass(words); + } finally { + t.cancel(); + } + } + + public void configure(JobConf job) { + // Nothing to configure. + } + + public void close() throws IOException { + // TODO Auto-generated method stub + } + + public static class MultipleInputFormat implements InputFormat { + + public RecordReader getRecordReader(final InputSplit split, + final JobConf job, final Reporter reporter) + throws IOException { + // Only one record/line to read. 
+ return new RecordReader() { + private final String line = ((LineInputSplit)split).line; + private boolean read = false; + + public void close() throws IOException { + // TODO Auto-generated method stub + } + + public WritableComparable createKey() { + return new Text(""); + } + + public Writable createValue() { + return new Text(""); + } + + public long getPos() throws IOException { + return 0; + } + + public float getProgress() throws IOException { + return getPos(); + } + + public boolean next(Writable key, Writable value) + throws IOException { + if (read) { + return false; + } + read = true; + ((Text)value).set(this.line); + return true; + } + }; + } + + public InputSplit[] getSplits(JobConf job, int numSplits) + throws IOException { + Path[] inputs = job.getInputPaths(); + List<String> lines = new ArrayList<String>(); + for (int i = 0; i < inputs.length; i++) { + Path p = inputs[i]; + FileSystem fs = p.getFileSystem(job); + Path [] ps = fs.listPaths(p); + for (int j = 0; j < ps.length; j++) { + if (fs.isDirectory(ps[j])) { + continue; + } + addFileLines(lines, fs, ps[j]); + } + } + List<LineInputSplit> splits = + new ArrayList<LineInputSplit>(lines.size()); + for (String line: lines) { + splits.add(new LineInputSplit(line)); + } + return splits.toArray(new LineInputSplit [0]); + } + + private void addFileLines(final List<String> lines, final FileSystem fs, + final Path p) + throws IOException { + InputStream is = (InputStream)fs.open(p); + LineNumberReader lnr = null; + try { + lnr = new LineNumberReader(new InputStreamReader(is)); + for (String l = null; (l = lnr.readLine()) != null;) { + if (l.length() > 0 && !l.trim().startsWith("#")) { + lines.add(l); + } + } + } finally { + if (lnr != null) { + lnr.close(); + } + is.close(); + } + } + + public void validateInput(JobConf job) throws IOException { + // Nothing to validate. 
+ } + } + + public static class LineInputSplit implements InputSplit { + private String line; + + protected LineInputSplit() { + super(); + } + + public LineInputSplit(final String l) { + line = l; + } + + public long getLength() throws IOException { + return line.length(); + } + + public String[] getLocations() throws IOException { + return new String[0]; + } + + public void readFields(DataInput in) throws IOException { + this.line = in.readLine(); + } + + public void write(DataOutput out) throws IOException { + out.writeBytes(this.line); + } + } + + public static void usage() { + System.out.println("Usage: multiple <input> <output>"); + System.out.println("Arguments:"); + System.out.println(" <input> Directory of input files with " + + "each line describing task to run"); + System.out.println(" <output> Output directory."); + } + + public int run(String[] args) throws Exception { + if (args.length != 2 || + (args.length == 1 && + (args[0].equals("-h") || args[0].equals("--help")))) { + usage(); + return -1; + } + JobConf job = new JobConf(MultipleInputFormat.class); + job.setInputFormat(MultipleInputFormat.class); + job.setInputPath(new Path(args[0])); + job.setMapperClass(Multiple.class); + job.setOutputPath(new Path(args[1])); + JobClient.runJob(job); + return 0; + } + + public static void main(String[] args) throws Exception { + int res = new Multiple().doMain(NutchConfiguration.create(), args); + System.exit(res); + } +} \ No newline at end of file Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-04-09 16:45:35 UTC (rev 1710) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-04-10 00:03:29 UTC (rev 1711) @@ -65,7 +65,7 @@ private final static List 
JOBS = Arrays.asList(new String[] { "import", "update", "invert", "index", "dedup", "merge", "all", - "class", "search"}); + "class", "search", "multiple"}); // Lazy initialize these two variables to delay complaint about hadoop not @@ -269,7 +269,7 @@ od.getIndex(), od.getTmpDir()); } - protected String [] rewriteArgs(final String [] args, final int offset) { + static String [] rewriteArgs(final String [] args, final int offset) { final String [] newArgs = new String[args.length - offset]; for (int i = 0; i < args.length; i++) { if (i < offset) { @@ -280,7 +280,7 @@ return newArgs; } - protected void doClass(final String [] args) { + static void doClass(final String [] args) { // Redo args so absent our nutchwax 'class' command. final String className = args[1]; String [] newArgs = rewriteArgs(args, 2); @@ -331,6 +331,10 @@ } } + protected void doMultiple(final String [] args) throws Exception { + (new Multiple()).run(rewriteArgs(args, 1)); + } + protected void doJob(final String jobName, final String [] args) throws Exception { if (jobName.equals("import")) { @@ -431,6 +435,8 @@ doClassUsage("ERROR: Wrong number of arguments passed.", 2); } doSearch(args); + } else if (jobName.equals("multiple")) { + doMultiple(args); } else { usage("ERROR: No handler for job name " + jobName, 4); System.exit(0); @@ -534,17 +540,17 @@ System.out.println("Jobs (usually) must be run in the order " + "listed below."); System.out.println("Available jobs:"); - System.out.println(" import Import ARCs."); - System.out.println(" update Update dbs with recent imports."); - System.out.println(" invert Invert links."); - System.out.println(" index Index segments."); - System.out.println(" dedup Deduplicate by URL or content MD5."); - System.out.println(" merge Merge segment indices into one."); - System.out.println(" all Runs all above jobs in order."); - System.out.println(" class Run the passed class's main."); - System.out.println(" search Run a query against index under " + + 
System.out.println(" import Import ARCs."); + System.out.println(" update Update dbs with recent imports."); + System.out.println(" invert Invert links."); + System.out.println(" index Index segments."); + System.out.println(" dedup Deduplicate by URL or content MD5."); + System.out.println(" merge Merge segment indices into one."); + System.out.println(" all Runs all above jobs in order."); + System.out.println(" class Run the passed class's main."); + System.out.println(" search Run a query against index under " + "property 'searcher.dir'"); - + System.out.println(" multiple Run multiple concurrent tasks."); System.exit(exitCode); } @@ -621,6 +627,15 @@ " to merge reside."); System.exit(exitCode); } + + public static void doMultipleUsage(final String message, + final int exitCode) { + if (message != null && message.length() > 0) { + System.out.println(message); + } + Multiple.usage(); + System.exit(exitCode); + } public static void doSearchUsage(final String message, final int exitCode) { @@ -704,6 +719,8 @@ doAllUsage(null, 1); } else if (jobName.equals("search")) { doSearchUsage(null, 1); + } else if (jobName.equals("multiple")) { + doMultipleUsage(null, 1); } else if (jobName.equals("class")) { doClassUsage(null, 1); } else { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-04-10 17:08:00
|
Revision: 1712 http://archive-access.svn.sourceforge.net/archive-access/?rev=1712&view=rev Author: stack-sf Date: 2007-04-10 10:07:52 -0700 (Tue, 10 Apr 2007) Log Message: ----------- Implement '[ 1697808 ] [nutchwax] Use MR to run multiple concurrent index merges' * xdocs/faq.fml Add note on how to run many concurrent merges. * nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java (doClass): Rename as doClassMain since that's what it actually does. Roll all exceptions up into the generic Exception capture since all are given the same treatment anyways. (doSearch): Refactor to use doClassMain. * nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java Cleanup that comes of a bunch of exercising of this class. Added logger. Narrow what this class does; now it will only run the doMain of hadoop ToolBase classes. Set number of map tasks to be the number of splits. * .classpath Remove /home/stack reference. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/.classpath trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java trunk/archive-access/projects/nutchwax/xdocs/faq.fml Modified: trunk/archive-access/projects/nutchwax/.classpath =================================================================== --- trunk/archive-access/projects/nutchwax/.classpath 2007-04-10 00:03:29 UTC (rev 1711) +++ trunk/archive-access/projects/nutchwax/.classpath 2007-04-10 17:07:52 UTC (rev 1712) @@ -169,7 +169,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar"
sourcepath="/home/stack/checkouts/hadoop/src/java"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t-0.5.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/> @@ -224,5 +223,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/conf"/> <classpathentry kind="lib" path="nutchwax-plugins/target"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar"/> <classpathentry kind="output" path="target"/> </classpath> Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 00:03:29 UTC (rev 1711) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 17:07:52 UTC (rev 1712) @@ -11,6 +11,8 @@ import java.util.Timer; import java.util.TimerTask; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; @@ -28,42 +30,74 @@ import org.apache.nutch.util.NutchConfiguration; /** - * Run multiple concurrent tasks. + * Run multiple concurrent non-mapreduce {@link ToolBase} tasks such as + * {@link org.apache.nutch.indexer.IndexMerger} or + * {@link org.apache.indexer.IndexSorter}. + * * Takes input that has per line the name of the class to run and the arguments - * to pass. Use this mapreduce job to run multiple concurrent merges or - * multiple concurrent sorts, etc. Will run as many tasks as there are input - * lines. + * to pass. 
Here is an example line for IndexMerger: + * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new indexes + * </code>. We run as many tasks as there are input lines. + * * @author stack */ public class Multiple extends ToolBase implements Mapper { + public final Log LOG = LogFactory.getLog(this.getClass()); + private JobConf job; + public void map(WritableComparable key, Writable value, OutputCollector output, final Reporter reporter) throws IOException { - final String [] words = ("PADDING_FOR_DOCLASS_BELOW " + - value.toString()).split("\\s"); - if (words.length <= 1) { + final String [] words = value.toString().split("\\s"); + if (words.length <= 0) { return; } + final String className = words[0]; // Set a timer running that will update reporter on a period. Timer t = new Timer(false); t.scheduleAtFixedRate(new TimerTask() { @Override public void run() { try { - reporter.setStatus("Running " + words[1]); + reporter.setStatus("Running " + className); } catch (IOException e) { e.printStackTrace(); } - }}, 0, 3000); + }}, 0, 10000); try { - Nutchwax.doClass(words); - } finally { + int result = doMain(words); + reporter.setStatus("Done running " + className + ": " + result); + if (result != 0) { + throw new IOException(className + " returned non-null: " + + result + ", check logs."); + } + } finally { t.cancel(); } } - public void configure(JobConf job) { - // Nothing to configure. + /** + * Call {@link ToolBase#doMain(org.apache.hadoop.conf.Configuration, String[])} + * on the passed classname. + * @param args + * @return Result from call to doMain. + */ + private int doMain(final String [] args) { + final String className = args[0]; + // Redo args so absent our 'class' command. 
+ String [] newArgs = Nutchwax.rewriteArgs(args, 1); + int result = -1; + try { + Object obj = Class.forName(className).newInstance(); + result = ((ToolBase)obj).doMain(this.job, newArgs); + } catch (Exception e) { + LOG.error(className, e); + } + return result; + } + + public void configure(final JobConf j) { + this.job = j; } public void close() throws IOException { @@ -132,6 +166,7 @@ for (String line: lines) { splits.add(new LineInputSplit(line)); } + job.setNumMapTasks(lines.size()); return splits.toArray(new LineInputSplit [0]); } Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-04-10 00:03:29 UTC (rev 1711) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-04-10 17:07:52 UTC (rev 1712) @@ -26,7 +26,6 @@ import java.io.FileNotFoundException; import java.io.IOException; -import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Arrays; @@ -280,55 +279,32 @@ return newArgs; } - static void doClass(final String [] args) { + static Object doClassMain(final String [] args) { // Redo args so absent our nutchwax 'class' command. 
final String className = args[1]; String [] newArgs = rewriteArgs(args, 2); // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html Class [] argTypes = new Class[1]; argTypes[0] = String[].class; + Object result = null; try { Method mainMethod = Class.forName(className).getDeclaredMethod("main", argTypes); - mainMethod.invoke(newArgs, new Object [] {newArgs}); - } catch (SecurityException e) { - throw new RuntimeException(e); - } catch (NoSuchMethodException e) { - throw new RuntimeException(e); - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); - } catch (IllegalArgumentException e) { - throw new RuntimeException(e); - } catch (IllegalAccessException e) { - throw new RuntimeException(e); - } catch (InvocationTargetException e) { - throw new RuntimeException(e); + result = mainMethod.invoke(newArgs, new Object [] {newArgs}); + } catch (Throwable t) { + t.printStackTrace(); } + return result; } - protected void doSearch(final String [] args) { - // Redo args so absent our nutchwax 'query' command. - String [] newArgs = rewriteArgs(args, 1); - // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html - Class [] argTypes = new Class[1]; - argTypes[0] = String[].class; - try { - Method mainMethod = Class.forName(NutchwaxBean.class.getName()). 
- getDeclaredMethod("main", argTypes); - mainMethod.invoke(newArgs, new Object [] {newArgs}); - } catch (SecurityException e) { - throw new RuntimeException(e); - } catch (NoSuchMethodException e) { - throw new RuntimeException(e); - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); - } catch (IllegalArgumentException e) { - throw new RuntimeException(e); - } catch (IllegalAccessException e) { - throw new RuntimeException(e); - } catch (InvocationTargetException e) { - throw new RuntimeException(e); + protected Object doSearch(final String [] args) { + String [] newArgs = new String[args.length + 1]; + newArgs[0] = args[0]; + newArgs[1] = NutchwaxBean.class.getName(); + for (int i = 1; i < args.length; i++) { + newArgs[i + 1] = args[i]; } + return doClassMain(newArgs); } protected void doMultiple(final String [] args) throws Exception { @@ -429,7 +405,7 @@ if (args.length < 2) { doClassUsage("ERROR: Wrong number of arguments passed.", 2); } - doClass(args); + doClassMain(args); } else if (jobName.equals("search")) { if (args.length < 1) { doClassUsage("ERROR: Wrong number of arguments passed.", 2); Modified: trunk/archive-access/projects/nutchwax/xdocs/faq.fml =================================================================== --- trunk/archive-access/projects/nutchwax/xdocs/faq.fml 2007-04-10 00:03:29 UTC (rev 1711) +++ trunk/archive-access/projects/nutchwax/xdocs/faq.fml 2007-04-10 17:07:52 UTC (rev 1712) @@ -105,6 +105,19 @@ Run the following to see the usage: <pre>$ ${HADOOP_HOME}/bin/hadoop jar nutchwax-job-0.11.0-SNAPSHOT.jar class org.apache.nutch.segment.SegmentMerger ~/tmp/crawl/segments_merged/ ~/tmp/crawl/segments/20070406155807-test/ ~/tmp/crawl/segments/20070406155856-test/</pre> </p> +<p>If creating multiple indices, you may want to make use of the NutchWAX facility +that runs a mapreduce job to farm out the multiple index merges across the cluster +so they run conccurrently rather than in series. 
For usage, run the following: +<pre>stack@debord:~/workspace$ ${HADOOP_HOME}/bin/hadoop jar nutchwax.jar help multiple +</pre> +It takes an inputs directory and an outputs (The latter is usually not used). The +inputs lists per line a job to run on a remote machine. Here is an example line from +an input that would run an index merge of the directory <code>indexes-monday</code> into +<code>index-monday</index> using <code>/tmp</code> as working directory: +<pre> +org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-monday indexes-monday +</pre>. +</p> </answer> </faq> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-04-10 18:13:42
|
Revision: 1714 http://archive-access.svn.sourceforge.net/archive-access/?rev=1714&view=rev Author: stack-sf Date: 2007-04-10 11:12:24 -0700 (Tue, 10 Apr 2007) Log Message: ----------- M nutchwax/xdocs/faq.fml M nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java Add examples of how to run multiple concurrent index sorts. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java trunk/archive-access/projects/nutchwax/xdocs/faq.fml Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 17:30:07 UTC (rev 1713) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 18:12:24 UTC (rev 1714) @@ -37,7 +37,10 @@ * Takes input that has per line the name of the class to run and the arguments * to pass. Here is an example line for IndexMerger: * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new indexes - * </code>. We run as many tasks as there are input lines. + * </code>. Here is one for IndexSorter: + * <code>org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl</code> + * (Note that IndexSorter wants to refer to the local system; the indexes to + * sort must be on local disk). We run as many tasks as there are input lines. 
* * @author stack */ @@ -234,8 +237,16 @@ System.out.println("Examples:"); System.out.println(" org.apache.nutch.indexer.IndexMerger " + "-workingdir /3/hadoop-tmp index-monday indexes-monday"); - System.out.println(" (Note that named class must implement " + - "org.apache.hadoop.util.ToolBase)"); + System.out.println(" Note that named class must implement " + + "org.apache.hadoop.util.ToolBase"); + System.out.println(); + System.out.println(" org.apache.nutch.indexer.IndexSorter " + + "/home/stack/tmp/crawl"); + System.out.println(" Note that IndexSorter refers to local " + + "filesystem and not to hdfs and is RAM-bound. Set"); + System.out.println(" task child RAM with the mapred.child.java.opts " + + "property in your hadoop-site.xml."); + } public int run(String[] args) throws Exception { Modified: trunk/archive-access/projects/nutchwax/xdocs/faq.fml =================================================================== --- trunk/archive-access/projects/nutchwax/xdocs/faq.fml 2007-04-10 17:30:07 UTC (rev 1713) +++ trunk/archive-access/projects/nutchwax/xdocs/faq.fml 2007-04-10 18:12:24 UTC (rev 1714) @@ -80,20 +80,6 @@ </answer> </faq> -<faq id="sort"> -<title>How do I sort an index in NutchWAX</title> -<question>How do I sort an index with NutchWAX</question> -<answer><p>Sorting an index will usually return better -quality results in less time. Most of Nutch is built into the NutchWAX jar. 
-To run the nutch indexer sorter, do the following: -<pre>$ hadoop jar nutchwax.jar class org.apache.nutch.indexer.IndexerSorter</pre> -</p> -<p>When the index is sorted, you might as well set the -searcher.max.hits to, e.g., 1000, since you are getting back the top ranked -documents and limit the number of hits someone is allowed to see to 1000.</p> -</answer> -</faq> - <faq id="segmentmerge"> <title>How do I merge segments in NutchWAX</title> <question>How do I merge segments in NutchWAX</question> @@ -118,9 +104,34 @@ org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-monday indexes-monday </pre>. </p> +<p>In a similar fashion its possible to run multiple concurrent index sorts. +Here is an example line from the inputs: +<pre>org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl</pre> +Note that the IndexSorter references the local filesystem explicitly (Your +index cannot be in hdfs when you run the sort). Also index sorting is RAM-bound +so you will probably need to up the RAM allocated to task children (Set the +mapred.child.java.opts property in your hadoop-site.xml). +</p> </answer> </faq> +<faq id="sort"> +<title>How do I sort an index in NutchWAX</title> +<question>How do I sort an index with NutchWAX</question> +<answer><p>Sorting an index will usually return better +quality results in less time. Most of Nutch is built into the NutchWAX jar. 
+To run the nutch indexer sorter, do the following: +<pre>$ hadoop jar nutchwax.jar class org.apache.nutch.indexer.IndexerSorter</pre> +</p> +<p>When the index is sorted, you might as well set the +searcher.max.hits to, e.g., 1000, since you are getting back the top ranked +documents and limit the number of hits someone is allowed to see to 1000.</p> +<p>See the end of <a href="#segmentmerge">How do I merge segments in NutchWAX</a> +for how to run multiple concurrent sorts.</p> +</answer> +</faq> + + <faq id="incremental"> <question>Is it possible to do incremental updates?</question> <answer><p>Here is a sketch of how to do it for now. Later we'll add better This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-04-10 18:18:58
|
Revision: 1715 http://archive-access.svn.sourceforge.net/archive-access/?rev=1715&view=rev Author: stack-sf Date: 2007-04-10 11:18:57 -0700 (Tue, 10 Apr 2007) Log Message: ----------- M nutchwax/xdocs/faq.fml Fix broke xml (close code element). M nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java Fix javadoc link. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java trunk/archive-access/projects/nutchwax/xdocs/faq.fml Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 18:12:24 UTC (rev 1714) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 18:18:57 UTC (rev 1715) @@ -32,7 +32,7 @@ /** * Run multiple concurrent non-mapreduce {@link ToolBase} tasks such as * {@link org.apache.nutch.indexer.IndexMerger} or - * {@link org.apache.indexer.IndexSorter}. + * {@link org.apache.nutch.indexer.IndexSorter}. * * Takes input that has per line the name of the class to run and the arguments * to pass. 
Here is an example line for IndexMerger: Modified: trunk/archive-access/projects/nutchwax/xdocs/faq.fml =================================================================== --- trunk/archive-access/projects/nutchwax/xdocs/faq.fml 2007-04-10 18:12:24 UTC (rev 1714) +++ trunk/archive-access/projects/nutchwax/xdocs/faq.fml 2007-04-10 18:18:57 UTC (rev 1715) @@ -85,11 +85,11 @@ <question>How do I merge segments in NutchWAX</question> <answer><p> Run the following to see the usage: -<pre>$ ${HADOOP_HOME}/bin/hadoop jar nutchwax-job-0.11.0-SNAPSHOT.jar class org.apache.nutch.segment.SegmentMerger</pre> +<pre>% ${HADOOP_HOME}/bin/hadoop jar nutchwax-job-0.11.0-SNAPSHOT.jar class org.apache.nutch.segment.SegmentMerger</pre> </p> <p> Run the following to see the usage: -<pre>$ ${HADOOP_HOME}/bin/hadoop jar nutchwax-job-0.11.0-SNAPSHOT.jar class org.apache.nutch.segment.SegmentMerger ~/tmp/crawl/segments_merged/ ~/tmp/crawl/segments/20070406155807-test/ ~/tmp/crawl/segments/20070406155856-test/</pre> +<pre>% ${HADOOP_HOME}/bin/hadoop jar nutchwax-job-0.11.0-SNAPSHOT.jar class org.apache.nutch.segment.SegmentMerger ~/tmp/crawl/segments_merged/ ~/tmp/crawl/segments/20070406155807-test/ ~/tmp/crawl/segments/20070406155856-test/</pre> </p> <p>If creating multiple indices, you may want to make use of the NutchWAX facility that runs a mapreduce job to farm out the multiple index merges across the cluster @@ -99,10 +99,10 @@ It takes an inputs directory and an outputs (The latter is usually not used). The inputs lists per line a job to run on a remote machine. Here is an example line from an input that would run an index merge of the directory <code>indexes-monday</code> into -<code>index-monday</index> using <code>/tmp</code> as working directory: +<code>index-monday</code> using <code>/tmp</code> as working directory: <pre> org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-monday indexes-monday -</pre>. 
+</pre> </p> <p>In a similar fashion its possible to run multiple concurrent index sorts. Here is an example line from the inputs: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-04-10 20:36:21
|
Revision: 1717 http://archive-access.svn.sourceforge.net/archive-access/?rev=1717&view=rev Author: stack-sf Date: 2007-04-10 13:36:21 -0700 (Tue, 10 Apr 2007) Log Message: ----------- M nutchwax/xdocs/faq.fml M nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java Add an example of a distributed copy from hdfs to local filesystem. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java trunk/archive-access/projects/nutchwax/xdocs/faq.fml Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 18:30:28 UTC (rev 1716) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-04-10 20:36:21 UTC (rev 1717) @@ -234,14 +234,27 @@ System.out.println(" <input> Directory of input files with " + "each line describing task to run"); System.out.println(" <output> Output directory."); - System.out.println("Examples:"); + System.out.println("Example input lines:"); + System.out.println(); + System.out.println(" An input line to specify a merge would look " + + "like:"); + System.out.println(); System.out.println(" org.apache.nutch.indexer.IndexMerger " + "-workingdir /3/hadoop-tmp index-monday indexes-monday"); + System.out.println(); System.out.println(" Note that named class must implement " + - "org.apache.hadoop.util.ToolBase"); + "org.apache.hadoop.util.ToolBase"); System.out.println(); + System.out.println(" To copy from " + + "hdfs://HOST:PORT/user/stack/index-monday to"); + System.out.println( " file:///0/searcher.dir/index:"); + System.out.println(); + System.out.println(" org.apache.hadoop.fs.FsShell " + + "/user/stack/index-monday /0/searcher.dir/index"); + 
System.out.println(); System.out.println(" org.apache.nutch.indexer.IndexSorter " + "/home/stack/tmp/crawl"); + System.out.println(); System.out.println(" Note that IndexSorter refers to local " + "filesystem and not to hdfs and is RAM-bound. Set"); System.out.println(" task child RAM with the mapred.child.java.opts " + Modified: trunk/archive-access/projects/nutchwax/xdocs/faq.fml =================================================================== --- trunk/archive-access/projects/nutchwax/xdocs/faq.fml 2007-04-10 18:30:28 UTC (rev 1716) +++ trunk/archive-access/projects/nutchwax/xdocs/faq.fml 2007-04-10 20:36:21 UTC (rev 1717) @@ -46,8 +46,6 @@ </part> - - <part id="indexing"> <title>Indexing</title> @@ -91,11 +89,56 @@ Run the following to see the usage: <pre>% ${HADOOP_HOME}/bin/hadoop jar nutchwax-job-0.11.0-SNAPSHOT.jar class org.apache.nutch.segment.SegmentMerger ~/tmp/crawl/segments_merged/ ~/tmp/crawl/segments/20070406155807-test/ ~/tmp/crawl/segments/20070406155856-test/</pre> </p> +</answer> +</faq> + +<faq id="sort"> +<title>How do I sort an index in NutchWAX</title> +<question>How do I sort an index with NutchWAX</question> +<answer><p>Sorting an index will usually return better +quality results in less time. Most of Nutch is built into the NutchWAX jar. 
+To run the nutch indexer sorter, do the following: +<pre>$ hadoop jar nutchwax.jar class org.apache.nutch.indexer.IndexerSorter</pre> +</p> +<p>When the index is sorted, you might as well set the +searcher.max.hits to, e.g., 1000, since you are getting back the top ranked +documents and limit the number of hits someone is allowed to see to 1000.</p> +<p>See the end of <a href="#segmentmerge">How do I merge segments in NutchWAX</a> +for how to run multiple concurrent sorts.</p> +</answer> +</faq> + +<faq id="multiples"> +<title>How to run multiple merges/sorts/copies concurrently?</title> +<question>How to run multiple merges/sorts/copies concurrently</question> +<answer> <p>If creating multiple indices, you may want to make use of the NutchWAX facility -that runs a mapreduce job to farm out the multiple index merges across the cluster -so they run concurrently rather than in series. For the usage on how to run -multiple concurrent jobs, run the following: +that runs a mapreduce job to farm out the multiple index merges, copy from hdfs to local, +and index sorting across the cluster so they run concurrently rather than in series. For +the usage on how to run multiple concurrent jobs, run the following: <pre>stack@debord:~/workspace$ ${HADOOP_HOME}/bin/hadoop jar nutchwax.jar help multiple +Usage: multiple <input> <output> +Runs concurrently all commands listed in <inputs>. +Arguments: + <input> Directory of input files with each line describing task to run + <output> Output directory. 
+Example input lines: + + An input line to specify a merge would look like: + + org.apache.nutch.indexer.IndexMerger -workingdir /3/hadoop-tmp index-monday indexes-monday + + Note that named class must implement org.apache.hadoop.util.ToolBase + + To copy from hdfs://HOST:PORT/user/stack/index-monday to + file:///0/searcher.dir/index: + + org.apache.hadoop.fs.FsShell /user/stack/index-monday /0/searcher.dir/index + + org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl + + Note that IndexSorter refers to local filesystem and not to hdfs and is RAM-bound. Set + task child RAM with the mapred.child.java.opts property in your hadoop-site.xml. </pre> It takes inputs and outputs directories. The latter is usually not used but required by the framework. The inputs directory contains files that list per line a job to @@ -105,7 +148,14 @@ <pre> org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-monday indexes-monday </pre> +If the inputs had a line per day of the week then we'd run seven tasks with +each task merging a day's indices. If the cluster had 7 machines, then we'd the +7 tasks would run concurrently. </p> +<p>Here is how you would specify a copy task that copyied <code>hdfs:///user/stack/index-monday</code> +to <code>file:///0/searcher.dir/index</code>: +<pre>org.apache.hadoop.fs.FsShell -get /user/stack/index-monday /0/searcher.dir/index</pre> +</p> <p>In a similar fashion its possible to run multiple concurrent index sorts. Here is an example line from the inputs: <pre>org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl</pre> @@ -117,23 +167,6 @@ </answer> </faq> -<faq id="sort"> -<title>How do I sort an index in NutchWAX</title> -<question>How do I sort an index with NutchWAX</question> -<answer><p>Sorting an index will usually return better -quality results in less time. Most of Nutch is built into the NutchWAX jar. 
-To run the nutch indexer sorter, do the following: -<pre>$ hadoop jar nutchwax.jar class org.apache.nutch.indexer.IndexerSorter</pre> -</p> -<p>When the index is sorted, you might as well set the -searcher.max.hits to, e.g., 1000, since you are getting back the top ranked -documents and limit the number of hits someone is allowed to see to 1000.</p> -<p>See the end of <a href="#segmentmerge">How do I merge segments in NutchWAX</a> -for how to run multiple concurrent sorts.</p> -</answer> -</faq> - - <faq id="incremental"> <question>Is it possible to do incremental updates?</question> <answer><p>Here is a sketch of how to do it for now. Later we'll add better This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <jle...@us...> - 2007-08-01 21:44:29
|
Revision: 1896 http://archive-access.svn.sourceforge.net/archive-access/?rev=1896&view=rev Author: jlee-archive Date: 2007-08-01 14:44:31 -0700 (Wed, 01 Aug 2007) Log Message: ----------- Just cleaned up the code with some whitespace, consistent indenting, etc. No functional changes. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxBean.java trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxConfiguration.java trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxDistributedSearch.java trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxOpenSearchServlet.java trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxQuery.java trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxTest.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/index-wax/src/java/org/archive/access/nutch/indexer/WaxIndexingFilter.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/parse-default/src/java/org/archive/access/nutch/parse/MetadataOnlyParser.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/parse-waxext/src/java/org/apache/nutch/parse/ext/WaxExtParser.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-anchor/src/java/org/apache/nutch/searcher/anchor/AnchorQueryFilter.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-content/src/java/org/apache/nutch/searcher/content/ContentQueryFilter.java 
trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-host/src/java/org/apache/nutch/searcher/host/HostQueryFilter.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-title/src/java/org/apache/nutch/searcher/title/TitleQueryFilter.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxArcfileQueryFilter.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxCollectionQueryFilter.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxDateQueryFilter.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxExacturlQueryFilter.java trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxTypeQueryFilter.java Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-07-26 21:53:47 UTC (rev 1895) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java 2007-08-01 21:44:31 UTC (rev 1896) @@ -34,252 +34,329 @@ * {@link org.apache.nutch.indexer.IndexMerger} or * {@link org.apache.nutch.indexer.IndexSorter}. * - * Takes input that has per line the name of the class to run and the arguments - * to pass. Here is an example line for IndexMerger: - * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new indexes - * </code>. Here is one for IndexSorter: + * Takes input that has per line the name of the class to run and the + * arguments to pass. 
Here is an example line for IndexMerger: + * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new + * indexes</code>. Here is one for IndexSorter: * <code>org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl</code> * (Note that IndexSorter wants to refer to the local system; the indexes to * sort must be on local disk). We run as many tasks as there are input lines. * * @author stack */ -public class Multiple extends ToolBase implements Mapper { - public final Log LOG = LogFactory.getLog(this.getClass()); - private JobConf job; +public class Multiple extends ToolBase implements Mapper +{ + public final Log LOG = LogFactory.getLog(this.getClass()); + private JobConf job; - public void map(WritableComparable key, Writable value, - OutputCollector output, final Reporter reporter) - throws IOException { - final String [] words = value.toString().split("\\s"); - if (words.length <= 0) { - return; - } - final String className = words[0]; - // Set a timer running that will update reporter on a period. - Timer t = new Timer(false); - t.scheduleAtFixedRate(new TimerTask() { - @Override - public void run() { - try { - reporter.setStatus("Running " + className); - } catch (IOException e) { - e.printStackTrace(); - } - }}, 0, 10000); - try { - int result = doMain(words); - reporter.setStatus("Done running " + className + ": " + result); - if (result != 0) { - throw new IOException(className + " returned non-null: " + - result + ", check logs."); - } - } finally { - t.cancel(); - } - } - - /** - * Call {@link ToolBase#doMain(org.apache.hadoop.conf.Configuration, String[])} - * on the passed classname. - * @param args - * @return Result from call to doMain. - */ - private int doMain(final String [] args) { - final String className = args[0]; - // Redo args so absent our 'class' command. 
- String [] newArgs = Nutchwax.rewriteArgs(args, 1); - int result = -1; - try { - Object obj = Class.forName(className).newInstance(); - result = ((ToolBase)obj).doMain(this.job, newArgs); - } catch (Exception e) { - LOG.error(className, e); + public void map(WritableComparable key, Writable value, + OutputCollector output, final Reporter reporter) + throws IOException + { + final String [] words = value.toString().split("\\s"); + + if (words.length <= 0) + { + return; + } + + final String className = words[0]; + + // Set a timer running that will update reporter on a period. + Timer t = new Timer(false); + + t.scheduleAtFixedRate(new TimerTask() + { + @Override + public void run() + { + try + { + reporter.setStatus("Running " + className); } - return result; + catch (IOException e) + { + e.printStackTrace(); + } + } + }, 0, 10000); + + try + { + int result = doMain(words); + + reporter.setStatus("Done running " + className + ": " + result); + + if (result != 0) + { + throw new IOException(className + " returned non-null: " + + result + ", check logs."); + } } + finally + { + t.cancel(); + } + } - public void configure(final JobConf j) { - this.job = j; - } + /** + * Call {@link ToolBase#doMain(org.apache.hadoop.conf.Configuration, String[])} + * on the passed classname. + * @param args + * @return Result from call to doMain. + */ + private int doMain(final String [] args) + { + final String className = args[0]; + + // Redo args so absent our 'class' command. 
+ String [] newArgs = Nutchwax.rewriteArgs(args, 1); + int result = -1; + + try + { + Object obj = Class.forName(className).newInstance(); + result = ((ToolBase)obj).doMain(this.job, newArgs); + } + catch (Exception e) + { + LOG.error(className, e); + } + + return result; + } - public void close() throws IOException { - // TODO Auto-generated method stub - } + public void configure(final JobConf j) + { + this.job = j; + } - public static class MultipleInputFormat implements InputFormat { - - public RecordReader getRecordReader(final InputSplit split, - final JobConf job, final Reporter reporter) - throws IOException { - // Only one record/line to read. - return new RecordReader() { - private final String line = ((LineInputSplit)split).line; - private boolean read = false; - - public void close() throws IOException { - // TODO Auto-generated method stub - } + public void close() throws IOException + { + // TODO Auto-generated method stub + } - public WritableComparable createKey() { - return new Text(""); - } + public static class MultipleInputFormat implements InputFormat + { + public RecordReader getRecordReader(final InputSplit split, + final JobConf job, final Reporter reporter) + throws IOException + { + // Only one record/line to read. 
+ return new RecordReader() + { + private final String line = ((LineInputSplit)split).line; + private boolean read = false; + + public void close() throws IOException + { + // TODO Auto-generated method stub + } - public Writable createValue() { - return new Text(""); - } + public WritableComparable createKey() + { + return new Text(""); + } - public long getPos() throws IOException { - return 0; - } + public Writable createValue() { + return new Text(""); + } - public float getProgress() throws IOException { - return getPos(); - } + public long getPos() throws IOException + { + return 0; + } - public boolean next(Writable key, Writable value) - throws IOException { - if (read) { - return false; - } - read = true; - ((Text)value).set(this.line); - return true; - } - }; - } + public float getProgress() throws IOException + { + return getPos(); + } - public InputSplit[] getSplits(JobConf job, int numSplits) - throws IOException { - Path[] inputs = job.getInputPaths(); - List<String> lines = new ArrayList<String>(); - for (int i = 0; i < inputs.length; i++) { - Path p = inputs[i]; - FileSystem fs = p.getFileSystem(job); - Path [] ps = fs.listPaths(p); - for (int j = 0; j < ps.length; j++) { - if (fs.isDirectory(ps[j])) { - continue; - } - addFileLines(lines, fs, ps[j]); - } - } - List<LineInputSplit> splits = - new ArrayList<LineInputSplit>(lines.size()); - for (String line: lines) { - splits.add(new LineInputSplit(line)); - } - job.setNumMapTasks(lines.size()); - return splits.toArray(new LineInputSplit [0]); - } - - private void addFileLines(final List<String> lines, final FileSystem fs, - final Path p) - throws IOException { - InputStream is = (InputStream)fs.open(p); - LineNumberReader lnr = null; - try { - lnr = new LineNumberReader(new InputStreamReader(is)); - for (String l = null; (l = lnr.readLine()) != null;) { - if (l.length() > 0 && !l.trim().startsWith("#")) { - lines.add(l); - } - } - } finally { - if (lnr != null) { - lnr.close(); - } - is.close(); - } 
- } + public boolean next(Writable key, Writable value) + throws IOException + { + if (read) + { + return false; + } + + read = true; + + ((Text)value).set(this.line); - public void validateInput(JobConf job) throws IOException { - // Nothing to validate. - } - } - - public static class LineInputSplit implements InputSplit { - private String line; - - protected LineInputSplit() { - super(); - } - - public LineInputSplit(final String l) { - line = l; - } - - public long getLength() throws IOException { - return line.length(); - } + return true; + } + }; + } - public String[] getLocations() throws IOException { - return new String[0]; - } + public InputSplit[] getSplits(JobConf job, int numSplits) + throws IOException + { + Path[] inputs = job.getInputPaths(); - public void readFields(DataInput in) throws IOException { - this.line = in.readLine(); - } + List<String> lines = new ArrayList<String>(); - public void write(DataOutput out) throws IOException { - out.writeBytes(this.line); - } - } - - public static void usage() { - System.out.println("Usage: multiple <input> <output>"); - System.out.println("Runs concurrently all commands listed in " + - "<inputs>."); - System.out.println("Arguments:"); - System.out.println(" <input> Directory of input files with " + - "each line describing task to run"); - System.out.println(" <output> Output directory."); - System.out.println("Example input lines:"); - System.out.println(); - System.out.println(" An input line to specify a merge would look " + - "like:"); - System.out.println(); - System.out.println(" org.apache.nutch.indexer.IndexMerger " + - "-workingdir /3/hadoop-tmp index-monday indexes-monday"); - System.out.println(); - System.out.println(" Note that named class must implement " + - "org.apache.hadoop.util.ToolBase"); - System.out.println(); - System.out.println(" To copy from " + - "hdfs://HOST:PORT/user/stack/index-monday to"); - System.out.println( " file:///0/searcher.dir/index:"); - System.out.println(); - 
System.out.println(" org.apache.hadoop.fs.FsShell " + - "/user/stack/index-monday /0/searcher.dir/index"); - System.out.println(); - System.out.println(" org.apache.nutch.indexer.IndexSorter " + - "/home/stack/tmp/crawl"); - System.out.println(); - System.out.println(" Note that IndexSorter refers to local " + - "filesystem and not to hdfs and is RAM-bound. Set"); - System.out.println(" task child RAM with the mapred.child.java.opts " + - "property in your hadoop-site.xml."); + for (int i = 0; i < inputs.length; i++) + { + Path p = inputs[i]; + FileSystem fs = p.getFileSystem(job); + Path [] ps = fs.listPaths(p); + + for (int j = 0; j < ps.length; j++) + { + if (fs.isDirectory(ps[j])) + { + continue; + } + + addFileLines(lines, fs, ps[j]); + } + } + + List<LineInputSplit> splits = + new ArrayList<LineInputSplit>(lines.size()); - } - - public int run(String[] args) throws Exception { - if (args.length != 2 || - (args.length == 1 && - (args[0].equals("-h") || args[0].equals("--help")))) { - usage(); - return -1; - } - JobConf job = new JobConf(MultipleInputFormat.class); - job.setInputFormat(MultipleInputFormat.class); - job.setInputPath(new Path(args[0])); - job.setMapperClass(Multiple.class); - job.setOutputPath(new Path(args[1])); - JobClient.runJob(job); - return 0; - } - - public static void main(String[] args) throws Exception { - int res = new Multiple().doMain(NutchConfiguration.create(), args); - System.exit(res); - } + for (String line: lines) + { + splits.add(new LineInputSplit(line)); + } + + job.setNumMapTasks(lines.size()); + + return splits.toArray(new LineInputSplit [0]); + } + + private void addFileLines(final List<String> lines, final FileSystem fs, + final Path p) + throws IOException + { + InputStream is = (InputStream)fs.open(p); + LineNumberReader lnr = null; + + try + { + lnr = new LineNumberReader(new InputStreamReader(is)); + + for (String l = null; (l = lnr.readLine()) != null;) + { + if (l.length() > 0 && !l.trim().startsWith("#")) + { + 
lines.add(l); + } + } + } + finally + { + if (lnr != null) + { + lnr.close(); + } + + is.close(); + } + } + + public void validateInput(JobConf job) throws IOException + { + // Nothing to validate. + } + } + + public static class LineInputSplit implements InputSplit + { + private String line; + + protected LineInputSplit() + { + super(); + } + + public LineInputSplit(final String l) + { + line = l; + } + + public long getLength() throws IOException + { + return line.length(); + } + + public String[] getLocations() throws IOException + { + return new String[0]; + } + + public void readFields(DataInput in) throws IOException + { + this.line = in.readLine(); + } + + public void write(DataOutput out) throws IOException + { + out.writeBytes(this.line); + } + } + + public static void usage() + { + System.out.println("Usage: multiple <input> <output>"); + System.out.println("Runs concurrently all commands listed in " + + "<inputs>."); + System.out.println("Arguments:"); + System.out.println(" <input> Directory of input files with " + + "each line describing task to run"); + System.out.println(" <output> Output directory."); + System.out.println("Example input lines:"); + System.out.println(); + System.out.println(" An input line to specify a merge would look like:"); + System.out.println(); + System.out.println(" org.apache.nutch.indexer.IndexMerger " + + "-workingdir /3/hadoop-tmp index-monday indexes-monday"); + System.out.println(); + System.out.println(" Note that named class must implement " + + "org.apache.hadoop.util.ToolBase"); + System.out.println(); + System.out.println(" To copy from " + + "hdfs://HOST:PORT/user/stack/index-monday to"); + System.out.println( " file:///0/searcher.dir/index:"); + System.out.println(); + System.out.println(" org.apache.hadoop.fs.FsShell " + + "/user/stack/index-monday /0/searcher.dir/index"); + System.out.println(); + System.out.println(" org.apache.nutch.indexer.IndexSorter " + + "/home/stack/tmp/crawl"); + System.out.println(); 
+ System.out.println(" Note that IndexSorter refers to local " + + "filesystem and not to hdfs and is RAM-bound. Set"); + System.out.println(" task child RAM with the mapred.child.java.opts " + + "property in your hadoop-site.xml."); + } + + public int run(String[] args) throws Exception + { + if (args.length != 2 || + (args.length == 1 && + (args[0].equals("-h") || args[0].equals("--help")))) + { + usage(); + return -1; + } + + JobConf job = new JobConf(MultipleInputFormat.class); + job.setInputFormat(MultipleInputFormat.class); + job.setInputPath(new Path(args[0])); + job.setMapperClass(Multiple.class); + job.setOutputPath(new Path(args[1])); + + JobClient.runJob(job); + + return 0; + } + + public static void main(String[] args) throws Exception + { + int res = new Multiple().doMain(NutchConfiguration.create(), args); + + System.exit(res); + } } \ No newline at end of file Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-07-26 21:53:47 UTC (rev 1895) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-08-01 21:44:31 UTC (rev 1896) @@ -53,677 +53,916 @@ /** * Script to run all indexing jobs from index through merge of final index. 
*/ -public class Nutchwax { - public static final Log LOG = - LogFactory.getLog(Nutchwax.class.getName()); +public class Nutchwax +{ + public static final Log LOG = + LogFactory.getLog(Nutchwax.class.getName()); - private static final String KEY_COLLECTION_PREFIX = "c="; - private static final String KEY_COLLECTION_SUFFIX = ",u="; - private static final Pattern COLLECTION = - Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL); + private static final String KEY_COLLECTION_PREFIX = "c="; + private static final String KEY_COLLECTION_SUFFIX = ",u="; + private static final Pattern COLLECTION = + Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL); - private final static List JOBS = Arrays.asList(new String[] { - "import", "update", "invert", "index", "dedup", "merge", "all", - "class", "search", "multiple"}); + private final static List JOBS = Arrays.asList(new String[] { + "import", "update", "invert", "index", "dedup", "merge", "all", + "class", "search", "multiple"}); - // Lazy initialize these two variables to delay complaint about hadoop not - // being present -- if its not. Meantime I get command-line processing - // done. - private FileSystem fs = null; - private JobConf conf = null; + // Lazy initialize these two variables to delay complaint about hadoop not + // being present -- if its not. Meantime I get command-line processing + // done. + private FileSystem fs = null; + private JobConf conf = null; - /** - * Default constructor. - * @throws IOException - */ - public Nutchwax() throws IOException { - super(); - } + /** + * Default constructor. 
+ * @throws IOException + */ + public Nutchwax() throws IOException + { + super(); + } - public synchronized JobConf getJobConf() { - if (this.conf == null) { - this.conf = new JobConf(NutchwaxConfiguration.getConfiguration()); - } - return this.conf; + public synchronized JobConf getJobConf() + { + if (this.conf == null) + { + this.conf = new JobConf(NutchwaxConfiguration.getConfiguration()); } + + return this.conf; + } - public synchronized FileSystem getFS() throws IOException { - if (this.fs == null) { - this.fs = FileSystem.get(getJobConf()); - } - return this.fs; + public synchronized FileSystem getFS() throws IOException + { + if (this.fs == null) + { + this.fs = FileSystem.get(getJobConf()); } + + return this.fs; + } - protected class OutputDirectories { - private final Path output; - private final Path crawlDb; - private final Path linkDb; - private final Path segments; - private final Path indexes; - private final Path index; - private final Path tmpDir; + protected class OutputDirectories + { + private final Path output; + private final Path crawlDb; + private final Path linkDb; + private final Path segments; + private final Path indexes; + private final Path index; + private final Path tmpDir; - public OutputDirectories(final Path output) throws IOException { - this.output = output; - this.crawlDb = new Path(output + "/crawldb"); - this.linkDb = new Path(output + "/linkdb"); - this.segments = new Path(output + "/segments"); - this.indexes = new Path(output + "/indexes"); - this.index = new Path(output + "/index"); - this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir", - Generator.generateSegmentName()); - } + public OutputDirectories(final Path output) throws IOException + { + this.output = output; + this.crawlDb = new Path(output + "/crawldb"); + this.linkDb = new Path(output + "/linkdb"); + this.segments = new Path(output + "/segments"); + this.indexes = new Path(output + "/indexes"); + this.index = new Path(output + "/index"); + this.tmpDir = 
getJobConf().getLocalPath("mapred.temp.dir", + Generator.generateSegmentName()); + } - public Path getCrawlDb() { - return crawlDb; - } + public Path getCrawlDb() + { + return crawlDb; + } - public Path getIndexes() { - return indexes; - } + public Path getIndexes() + { + return indexes; + } - public Path getLinkDb() { - return linkDb; - } + public Path getLinkDb() + { + return linkDb; + } - public Path getSegments() { - return segments; - } + public Path getSegments() + { + return segments; + } - public Path getTmpDir() { - return tmpDir; - } + public Path getTmpDir() + { + return tmpDir; + } - public Path getIndex() { - return index; - } + public Path getIndex() + { + return index; + } - public Path getOutput() { - return output; - } + public Path getOutput() + { + return output; } + } - /** - * Run passed list of mapreduce indexing jobs. Jobs are always run in - * order: import, update, etc. - * - * @throws Exception - */ - protected void doAll(final Path input, final String collectionName, - final OutputDirectories od) - throws Exception { - doImport(input, collectionName, od); - doUpdate(od); - doInvert(od); - doIndexing(od); - doDedup(od); - doMerge(od); - LOG.info("Nutchwax finished."); - } + /** + * Run passed list of mapreduce indexing jobs. Jobs are always run in + * order: import, update, etc. + * + * @throws Exception + */ + protected void doAll(final Path input, final String collectionName, + final OutputDirectories od) + throws Exception + { + doImport(input, collectionName, od); + doUpdate(od); + doInvert(od); + doIndexing(od); + doDedup(od); + doMerge(od); + + LOG.info("Nutchwax finished."); + } - protected void doImport(final Path input, String collectionName, - final OutputDirectories od) - throws IOException { - Path segment = new Path(od.getSegments(), - Generator.generateSegmentName() + - ((collectionName == null || collectionName.length() <= 0)? 
- "": "-" + collectionName)); - new ImportArcs(getJobConf()).importArcs(input, segment, - collectionName); - } + protected void doImport(final Path input, String collectionName, + final OutputDirectories od) + throws IOException + { + Path segment = new Path(od.getSegments(), + Generator.generateSegmentName() + + ((collectionName == null || collectionName.length() <= 0)? + "": "-" + collectionName)); + + new ImportArcs(getJobConf()).importArcs(input, segment, collectionName); + } - protected void doUpdate(final OutputDirectories od) - throws IOException { - doUpdate(od, null); + protected void doUpdate(final OutputDirectories od) + throws IOException + { + doUpdate(od, null); + } + + protected void doUpdate(final OutputDirectories od, + final String[] segments) + throws IOException + { + LOG.info("updating crawldb " + od.getCrawlDb()); + + // Need to make sure the db dir exists before progressing. + Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME); + + if (!getFS().exists(dbPath)) + { + getFS().mkdirs(dbPath); } - - protected void doUpdate(final OutputDirectories od, - final String[] segments) - throws IOException { - LOG.info("updating crawldb " + od.getCrawlDb()); - // Need to make sure the db dir exists before progressing. 
- Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME); - if (!getFS().exists(dbPath)) { - getFS().mkdirs(dbPath); + + CrawlDb cdb = new NutchwaxCrawlDb(getJobConf()); + + if (segments != null) + { + List<Path> paths = new ArrayList<Path>(segments.length); + + for (int i = 0; i < segments.length; i++) + { + Path p = new Path(segments[i]); + + if (!getFS().exists(p)) + { + throw new FileNotFoundException(p.toString()); } - CrawlDb cdb = new NutchwaxCrawlDb(getJobConf()); - if (segments != null) { - List<Path> paths = new ArrayList<Path>(segments.length); - for (int i = 0; i < segments.length; i++) { - Path p = new Path(segments[i]); - if (!getFS().exists(p)) { - throw new FileNotFoundException(p.toString()); - } - paths.add(p); - } - cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]), - true, true); - } else { - Path[] allSegments = getSegments(od); - // This just does the last segment created. - cdb.update(od.getCrawlDb(), - new Path[] {allSegments[allSegments.length - 1]}, true, true); - } + + paths.add(p); + } + + cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]), + true, true); } + else + { + Path[] allSegments = getSegments(od); + + // This just does the last segment created. 
+ cdb.update(od.getCrawlDb(), + new Path[] {allSegments[allSegments.length - 1]}, true, true); + } + } - protected Path [] getSegments(final OutputDirectories od) - throws IOException { - Path[] allSegments = getFS().listPaths(od.getSegments()); - if (allSegments == null || allSegments.length <= 0) { - throw new FileNotFoundException(od.getSegments().toString()); - } - return allSegments; + protected Path [] getSegments(final OutputDirectories od) + throws IOException + { + Path[] allSegments = getFS().listPaths(od.getSegments()); + + if (allSegments == null || allSegments.length <= 0) + { + throw new FileNotFoundException(od.getSegments().toString()); } + + return allSegments; + } - protected void doInvert(final OutputDirectories od, final Path [] segments) - throws IOException { - createLinkdb(od); - new NutchwaxLinkDb(getJobConf()). - invert(od.getLinkDb(), segments, true, true, false); - } + protected void doInvert(final OutputDirectories od, final Path [] segments) + throws IOException + { + createLinkdb(od); + + new NutchwaxLinkDb(getJobConf()). + invert(od.getLinkDb(), segments, true, true, false); + } - protected void doInvert(final OutputDirectories od) - throws IOException { - LOG.info("inverting links in " + od.getSegments()); - new NutchwaxLinkDb(getJobConf()). - invert(od.getLinkDb(), getSegments(od), true, true, false); + protected void doInvert(final OutputDirectories od) + throws IOException + { + LOG.info("inverting links in " + od.getSegments()); + + new NutchwaxLinkDb(getJobConf()). + invert(od.getLinkDb(), getSegments(od), true, true, false); + } + + protected boolean createLinkdb(final OutputDirectories od) + throws IOException + { + boolean result = false; + + // Make sure the linkdb exists. Otherwise the install where + // the temporary location gets moved to the permanent fails. 
+ if (getFS().mkdirs(new Path(od.getLinkDb(), + NutchwaxLinkDb.CURRENT_NAME))) + { + LOG.info("Created " + od.getLinkDb()); + + result = true; } + + return result; + } - protected boolean createLinkdb(final OutputDirectories od) - throws IOException { - boolean result = false; - // Make sure the linkdb exists. Otherwise the install where - // the temporary location gets moved to the permanent fails. - if (getFS().mkdirs(new Path(od.getLinkDb(), - NutchwaxLinkDb.CURRENT_NAME))) { - LOG.info("Created " + od.getLinkDb()); - result = true; - } - return result; - } + protected void doIndexing(final OutputDirectories od) + throws IOException + { + doIndexing(od, getFS().listPaths(od.getSegments())); + } + + protected void doIndexing(final OutputDirectories od, + final Path [] segments) + throws IOException + { + LOG.info(" indexing " + segments); + + new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), + od.getCrawlDb(), od.getLinkDb(), segments); + } + + protected void doDedup(final OutputDirectories od) throws IOException + { + LOG.info("dedup " + od.getIndex()); + + new DeleteDuplicates(getJobConf()).dedup(new Path[] {od.getIndexes()}); + } - protected void doIndexing(final OutputDirectories od) - throws IOException { - doIndexing(od, getFS().listPaths(od.getSegments())); + protected void doMerge(final OutputDirectories od) throws IOException + { + LOG.info("index merge " + od.getOutput() + " using tmpDir=" + + od.getTmpDir()); + + new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()), + od.getIndex(), od.getTmpDir()); + } + + static String [] rewriteArgs(final String [] args, final int offset) + { + final String [] newArgs = new String[args.length - offset]; + + for (int i = 0; i < args.length; i++) + { + if (i < offset) + { + continue; + } + + newArgs[i - offset] = args[i]; } - - protected void doIndexing(final OutputDirectories od, - final Path [] segments) - throws IOException { - LOG.info(" indexing " + segments); - new 
NutchwaxIndexer(getJobConf()).index(od.getIndexes(), - od.getCrawlDb(), od.getLinkDb(), segments); + + return newArgs; + } + + static Object doClassMain(final String [] args) + { + // Redo args so absent our nutchwax 'class' command. + final String className = args[1]; + String [] newArgs = rewriteArgs(args, 2); + + // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html + Class [] argTypes = new Class[1]; + argTypes[0] = String[].class; + Object result = null; + + try + { + Method mainMethod = + Class.forName(className).getDeclaredMethod("main", argTypes); + result = mainMethod.invoke(newArgs, new Object [] {newArgs}); } + catch (Throwable t) + { + t.printStackTrace(); + } + + return result; + } - protected void doDedup(final OutputDirectories od) throws IOException { - LOG.info("dedup " + od.getIndex()); - new DeleteDuplicates(getJobConf()).dedup(new Path[] {od.getIndexes()}); + protected Object doSearch(final String [] args) + { + String [] newArgs = new String[args.length + 1]; + newArgs[0] = args[0]; + newArgs[1] = NutchwaxBean.class.getName(); + + for (int i = 1; i < args.length; i++) + { + newArgs[i + 1] = args[i]; } + + return doClassMain(newArgs); + } + + protected void doMultiple(final String [] args) throws Exception + { + (new Multiple()).run(rewriteArgs(args, 1)); + } - protected void doMerge(final OutputDirectories od) throws IOException { - LOG.info("index merge " + od.getOutput() + " using tmpDir=" + - od.getTmpDir()); - new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()), - od.getIndex(), od.getTmpDir()); + protected void doJob(final String jobName, final String [] args) + throws Exception + { + if (jobName.equals("import")) + { + // Usage: hadoop jar nutchwax.jar import input output name + if (args.length != 4) + { + ImportArcs.doImportUsage( + "ERROR: Wrong number of arguments passed.", 2); + } + + final Path input = new Path(args[1]); + final Path output = new Path(args[2]); + final String collectionName = 
args[3]; + + checkArcsDir(input); + OutputDirectories od = new OutputDirectories(output); + doImport(input, collectionName, od); } - - static String [] rewriteArgs(final String [] args, final int offset) { - final String [] newArgs = new String[args.length - offset]; - for (int i = 0; i < args.length; i++) { - if (i < offset) { - continue; - } - newArgs[i - offset] = args[i]; + else if (jobName.equals("update")) + { + // Usage: hadoop jar nutchwax.jar update output + if (args.length < 2) + { + doUpdateUsage("ERROR: Wrong number of arguments passed.", 2); + } + + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doUpdate(od); + } + else + { + for (int i = 2; i < args.length; i++) + { + doUpdate(od, new String [] {args[i]}); } - return newArgs; + } } + else if (jobName.equals("invert")) + { + // Usage: hadoop jar nutchwax.jar invert output + if (args.length < 2) + { + doInvertUsage("ERROR: Wrong number of arguments passed.", 2); + } - static Object doClassMain(final String [] args) { - // Redo args so absent our nutchwax 'class' command. - final String className = args[1]; - String [] newArgs = rewriteArgs(args, 2); - // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html - Class [] argTypes = new Class[1]; - argTypes[0] = String[].class; - Object result = null; - try { - Method mainMethod = - Class.forName(className).getDeclaredMethod("main", argTypes); - result = mainMethod.invoke(newArgs, new Object [] {newArgs}); - } catch (Throwable t) { - t.printStackTrace(); + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doInvert(od); + } + else + { + final int offset = 2; + Path [] segments = new Path[args.length - offset]; + + for (int i = offset; i < args.length; i++) + { + Path f = new Path(args[i]); + + if (! 
getFS().exists(f)) + { + throw new FileNotFoundException(f.toString()); + } + + segments[i - offset] = f; } - return result; + + doInvert(od, segments); + } } - - protected Object doSearch(final String [] args) { - String [] newArgs = new String[args.length + 1]; - newArgs[0] = args[0]; - newArgs[1] = NutchwaxBean.class.getName(); - for (int i = 1; i < args.length; i++) { - newArgs[i + 1] = args[i]; + else if (jobName.equals("index")) + { + // Usage: hadoop jar nutchwax.jar index output + if (args.length < 2) + { + doIndexUsage("ERROR: Wrong number of arguments passed.", 2); + } + + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doIndexing(od); + } + else + { + final int offset = 2; + Path [] segments = new Path[args.length - offset]; + + for (int i = offset; i < args.length; i++) + { + Path f = new Path(args[i]); + + if (! getFS().exists(f)) + { + throw new FileNotFoundException(f.toString()); + } + + segments[i - offset] = f; } - return doClassMain(newArgs); + + doIndexing(od, segments); + } } - - protected void doMultiple(final String [] args) throws Exception { - (new Multiple()).run(rewriteArgs(args, 1)); + else if (jobName.equals("dedup")) + { + // Usage: hadoop jar nutchwax.jar dedup output + if (args.length != 2) + { + doDedupUsage("Wrong number of arguments passed.", 2); + } + + doDedup(new OutputDirectories(new Path(args[1]))); } - - protected void doJob(final String jobName, final String [] args) - throws Exception { - if (jobName.equals("import")) { - // Usage: hadoop jar nutchwax.jar import input output name - if (args.length != 4) { - ImportArcs.doImportUsage( - "ERROR: Wrong number of arguments passed.", 2); - } - final Path input = new Path(args[1]); - final Path output = new Path(args[2]); - final String collectionName = args[3]; - checkArcsDir(input); - OutputDirectories od = new OutputDirectories(output); - doImport(input, collectionName, od); - } else if (jobName.equals("update")) { - // Usage: 
hadoop jar nutchwax.jar update output - if (args.length < 2) { - doUpdateUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doUpdate(od); - } else { - for (int i = 2; i < args.length; i++) { - doUpdate(od, new String [] {args[i]}); - } - } - } else if (jobName.equals("invert")) { - // Usage: hadoop jar nutchwax.jar invert output - if (args.length < 2) { - doInvertUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doInvert(od); - } else { - final int offset = 2; - Path [] segments = new Path[args.length - offset]; - for (int i = offset; i < args.length; i++) { - Path f = new Path(args[i]); - if (!getFS().exists(f)) { - throw new FileNotFoundException(f.toString()); - } - segments[i - offset] = f; - } - doInvert(od, segments); - } - } else if (jobName.equals("index")) { - // Usage: hadoop jar nutchwax.jar index output - if (args.length < 2) { - doIndexUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doIndexing(od); - } else { - final int offset = 2; - Path [] segments = new Path[args.length - offset]; - for (int i = offset; i < args.length; i++) { - Path f = new Path(args[i]); - if (!getFS().exists(f)) { - throw new FileNotFoundException(f.toString()); - } - segments[i - offset] = f; - } - doIndexing(od, segments); - } - } else if (jobName.equals("dedup")) { - // Usage: hadoop jar nutchwax.jar dedup output - if (args.length != 2) { - doDedupUsage("Wrong number of arguments passed.", 2); - } - doDedup(new OutputDirectories(new Path(args[1]))); - } else if (jobName.equals("merge")) { - // Usage: hadoop jar nutchwax.jar merge output"); - if (args.length != 2) { - doMergeUsage("ERROR: Wrong number of arguments passed.", 2); - } - doMerge(new OutputDirectories(new 
Path(args[1]))); - } else if (jobName.equals("all")) { - // Usage: hadoop jar nutchwax.jar import input output name - if (args.length != 4) { - doAllUsage("ERROR: Wrong number of arguments passed.", 2); - } - final Path input = new Path(args[1]); - final Path output = new Path(args[2]); - final String collectionName = args[3]; - checkArcsDir(input); - OutputDirectories od = new OutputDirectories(output); - doAll(input, collectionName, od); - } else if (jobName.equals("class")) { - if (args.length < 2) { - doClassUsage("ERROR: Wrong number of arguments passed.", 2); - } - doClassMain(args); - } else if (jobName.equals("search")) { - if (args.length < 1) { - doClassUsage("ERROR: Wrong number of arguments passed.", 2); - } - doSearch(args); - } else if (jobName.equals("multiple")) { - doMultiple(args); - } else { - usage("ERROR: No handler for job name " + jobName, 4); - System.exit(0); - } + else if (jobName.equals("merge")) + { + // Usage: hadoop jar nutchwax.jar merge output"); + if (args.length != 2) + { + doMergeUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doMerge(new OutputDirectories(new Path(args[1]))); } + else if (jobName.equals("all")) + { + // Usage: hadoop jar nutchwax.jar import input output name + if (args.length != 4) + { + doAllUsage("ERROR: Wrong number of arguments passed.", 2); + } - /** - * Check the arcs dir exists and looks like it has files that list ARCs - * (rather than ARCs themselves). - * - * @param arcsDir Directory to examine. 
- * @throws IOException - */ - protected void checkArcsDir(final Path arcsDir) - throws IOException { - if (!getFS().exists(arcsDir)) { - throw new IOException(arcsDir + " does not exist."); - } - if (!fs.isDirectory(arcsDir)) { - throw new IOException(arcsDir + " is not a directory."); - } + final Path input = new Path(args[1]); + final Path output = new Path(args[2]); + final String collectionName = args[3]; - final Path [] files = getFS().listPaths(arcsDir); - for (int i = 0; i < files.length; i++) { - if (!getFS().isFile(files[i])) { - throw new IOException(files[i] + " is not a file."); - } - if (files[i].getName().toLowerCase().endsWith(".arc.gz")) { - throw new IOException(files[i] + " is an ARC file (ARCSDIR " + - "should contain text file listing ARCs rather than " + - "actual ARCs)."); - } - } + checkArcsDir(input); + + OutputDirectories od = new OutputDirectories(output); + + doAll(input, collectionName, od); } + else if (jobName.equals("class")) + { + if (args.length < 2) + { + doClassUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doClassMain(args); + } + else if (jobName.equals("search")) + { + if (args.length < 1) + { + doClassUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doSearch(args); + } + else if (jobName.equals("multiple")) + { + doMultiple(args); + } + else + { + usage("ERROR: No handler for job name " + jobName, 4); + System.exit(0); + } + } + + /** + * Check the arcs dir exists and looks like it has files that list ARCs + * (rather than ARCs themselves). + * + * @param arcsDir Directory to examine. + * @throws IOException + */ + protected void checkArcsDir(final Path arcsDir) + throws IOException + { + if (! getFS().exists(arcsDir)) + { + throw new IOException(arcsDir + " does not exist."); + } + + if (! fs.isDirectory(arcsDir)) + { + throw new IOException(arcsDir + " is not a directory."); + } + + final Path [] files = getFS().listPaths(arcsDir); + + for (int i = 0; i < files.length; i++) + { + if (! 
getFS().isFile(files[i])) + { + throw new IOException(files[i] + " is not a file."); + } + + if (files[i].getName().toLowerCase().endsWith(".arc.gz")) + { + throw new IOException(files[i] + " is an ARC file (ARCSDIR " + + "should contain text file listing ARCs rather than " + + "actual ARCs)."); + } + } + } + + public static Text generateWaxKey(WritableComparable key, + final String collection) + { + return generateWaxKey(key.toString(), collection); + } - public static Text generateWaxKey(WritableComparable key, - final String collection) { - return generateWaxKey(key.toString(), collection); + public static Text generateWaxKey(final String keyStr, + final String collection) + { + if (collection == null) + { + throw new NullPointerException("Collection is null for " + keyStr); } - public static Text generateWaxKey(final String keyStr, - final String collection) { - if (collection == null) { - throw new NullPointerException("Collection is null for " + keyStr); - } - if (keyStr == null) { - throw new NullPointerException("keyStr is null"); - } - if (keyStr.startsWith(KEY_COLLECTION_PREFIX)) { - LOG.warn("Key already has collection prefix: " + keyStr - + ". Skipping."); - return new Text(keyStr); - } + if (keyStr == null) + { + throw new NullPointerException("keyStr is null"); + } + + if (keyStr.startsWith(KEY_COLLECTION_PREFIX)) + { + LOG.warn("Key already has collection prefix: " + keyStr + + ". 
Skipping."); - return new Text(KEY_COLLECTION_PREFIX + collection.trim() + - KEY_COLLECTION_SUFFIX + keyStr.trim()); + return new Text(keyStr); } + + return new Text(KEY_COLLECTION_PREFIX + collection.trim() + + KEY_COLLECTION_SUFFIX + keyStr.trim()); + } + + public static String getCollectionFromWaxKey(final WritableComparable key) + throws IOException + { + Matcher m = COLLECTION.matcher(key.toString()); - public static String getCollectionFromWaxKey(final WritableComparable key) - throws IOException { - Matcher m = COLLECTION.matcher(key.toString()); - if (m == null || !m.matches()) { - throw new IOException("Key doesn't have collection " + - "prefix <" + key.toString() + ">"); - } - return m.group(1); + if (m == null || !m.matches()) + { + throw new IOException("Key doesn't have collection " + + "prefix <" + key.toString() + ">"); } - public static String getUrlFromWaxKey(final WritableComparable key) - throws IOException { - Matcher m = COLLECTION.matcher(key.toString()); - if (m == null || !m.matches()) { - throw new IOException("Key doesn't have collection " + - " prefix: " + key); - } - return m.group(2); + return m.group(1); + } + + public static String getUrlFromWaxKey(final WritableComparable key) + throws IOException + { + Matcher m = COLLECTION.matcher(key.toString()); + + if (m == null || !m.matches()) + { + throw new IOException("Key doesn't have collection " + + " prefix: " + key); } - public static long getDate(String d) - throws IOException { - long date = 0; - try { - date = ArchiveUtils.getDate(d).getTime(); - } catch (final java.text.ParseException e) { - throw new IOException("Failed parse of date: " + d + ": " + - e.getMessage()); - } - // Date can be < 0 if pre-1970 (Seen in some old ARCs). - return date >= 0? 
date: 0; + return m.group(2); + } + + public static long getDate(String d) throws IOException + { + long date = 0; + + try + { + date = ArchiveUtils.getDate(d).getTime(); } + catch (final java.text.ParseException e) + { + throw new IOException("Failed parse of date: " + d + ": " + + e.getMessage()); + } + + // Date can be < 0 if pre-1970 (Seen in some old ARCs). + return date >= 0? date: 0; + } - public static void usage(final String message, final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } + public static void usage(final String message, final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); + } - System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]"); - System.out.println("Launch NutchWAX job(s) on a hadoop platform."); - System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" + - " help on a specific job."); - System.out.println("Jobs (usually) must be run in the order " + - "listed below."); - System.out.println("Available jobs:"); - System.out.println(" import Import ARCs."); - System.out.println(" update Update dbs with recent imports."); - System.out.println(" invert Invert links."); - System.out.println(" index Index segments."); - System.out.println(" dedup Deduplicate by URL or content MD5."); - System.out.println(" merge Merge segment indices into one."); - System.out.println(" all Runs all above jobs in order."); - System.out.println(" class Run the passed class's main."); - System.out.println(" search Run a query against index under " + - "property 'searcher.dir'"); - System.out.println(" multiple Run multiple concurrent tasks."); - System.exit(exitCode); - } + System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]"); + System.out.println("Launch NutchWAX job(s) on a hadoop platform."); + System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" + + " help on a specific job."); + System.out.println("Jobs 
(usually) must be run in the order " + + "listed below."); + System.out.println("Available jobs:"); + System.out.println(" import Import ARCs."); + System.out.println(" update Update dbs with recent imports."); + System.out.println(" invert Invert links."); + System.out.println(" index Index segments."); + System.out.println(" dedup Deduplicate by URL or content MD5."); + System.out.println(" merge Merge segment indices into one."); + System.out.println(" all Runs all above jobs in order."); + System.out.println(" class Run the passed class's main."); + System.out.println(" search Run a query against index under " + + "property 'searcher.dir'"); + System.out.println(" multiple Run multiple concurrent tasks."); - public static void doUpdateUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar update <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write crawldb under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to update crawldb " + - "with. If none supplied, updates"); - System.out.println(" using latest segment found."); - System.exit(exitCode); + System.exit(exitCode); + } + + public static void doUpdateUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doInvertUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write linkdb under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to update linkdb " + - "with. 
If none supplied, all under"); - System.out.println(" '<output>/segments/' " + - "are passed."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar update <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write crawldb under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to update crawldb " + + "with. If none supplied, updates"); + System.out.println(" using latest segment found."); + + System.exit(exitCode); + } + + public static void doInvertUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doIndexUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar index <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write indexes under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to index. " + - "If none supplied, all under"); - System.out.println(" '<output>/segments/' " + - "are indexed."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write linkdb under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to update linkdb " + + "with. 
If none supplied, all under"); + System.out.println(" '<output>/segments/' " + + "are passed."); + + System.exit(exitCode); + } + + public static void doIndexUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doDedupUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>"); - System.out.println("Arguments:"); - System.out.println(" output Directory in which indices" + - " to dedup reside."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar index <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write indexes under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to index. " + + "If none supplied, all under"); + System.out.println(" '<output>/segments/' " + + "are indexed."); + + System.exit(exitCode); + } + + public static void doDedupUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doMergeUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar merge <output>"); - System.out.println("Arguments:"); - System.out.println(" output Directory in which indices" + - " to merge reside."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>"); + System.out.println("Arguments:"); + System.out.println(" output Direc... [truncated message content] |
From: <mi...@us...> - 2008-07-11 14:00:47
|
Revision: 2427 http://archive-access.svn.sourceforge.net/archive-access/?rev=2427&view=rev Author: miklosh Date: 2008-07-11 07:00:57 -0700 (Fri, 11 Jul 2008) Log Message: ----------- Initial commit of the image search contrib. Added Paths: ----------- trunk/archive-access/projects/nutchwax/imagesearch/ trunk/archive-access/projects/nutchwax/imagesearch/README.txt trunk/archive-access/projects/nutchwax/imagesearch/bin/ trunk/archive-access/projects/nutchwax/imagesearch/build.xml trunk/archive-access/projects/nutchwax/imagesearch/conf/ trunk/archive-access/projects/nutchwax/imagesearch/lib/ trunk/archive-access/projects/nutchwax/imagesearch/src/ trunk/archive-access/projects/nutchwax/imagesearch/src/java/ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java 
trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ThumbnailGenerator.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/WrappedWritable.java trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/build-plugin.xml trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/build.xml trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/plugin.xml trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParser.java Added: trunk/archive-access/projects/nutchwax/imagesearch/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/README.txt (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/README.txt 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,59 @@ +Nutch(WAX) Image Search Contrib 
+=============================== + +Getting the source +------------------ +Check out Nutch-1.0-dev as usual, then check out the image search +contrib into Nutch's "contrib" directory. + + $ cd contrib + $ svn checkout http://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/nutchwax/imagesearch + +This will create a sub-directory named "imagesearch" containing the +sources for the image search contrib. + + +Configuring +----------- +Enable the 'image-search' plugin in Nutch's configuration by appending +it to the 'plugin.includes' property. This registers three plugins: + o ImageParseFilter (HTML parse filter) + o ImageParser (fake JPEG and GIF parser) + o ImageIndexingFilter + +If you are using NutchWAX 0.12 or newer, make sure you add the following +line to the 'indexingfilter.order' property: + + org.archive.nutchwax.imagesearch.plugin.ImageIndexingFilter + + +Build and install +----------------- +Build the contrib by executing the 'ant' build command in + + nutch/contrib/imagesearch + +as you normally would. + +For example + + $ cd nutch/contrib/imagesearch + $ ant tar + +This command will build all of Nutch, then the image search add-ons and +finally will package everything up into the "nutch-1.0-dev.tar.gz" +release package. + +Then install the "nutch-1.0-dev.tar.gz" tarball as normal. + + +Searching +--------- +After performing the usual steps to import or fetch the files, invert +the links and index the documents, you can search the resulting indexes +for images by: + + bin/nutch org.archive.nutchwax.imagesearch.ImageSearcherBean product + +This calls the ImageSearcherBean to execute a simple keyword search for +"product".
Added: trunk/archive-access/projects/nutchwax/imagesearch/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/build.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/build.xml 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,138 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="nutchwax-imagesearch" default="job"> + + <property name="nutch.dir" value="../../" /> + + <property name="src.dir" value="src" /> + <property name="lib.dir" value="lib" /> + <property name="build.dir" value="${nutch.dir}/build" /> + <!-- HACK: Need to import default.properties like Nutch does --> + <property name="dist.dir" value="${build.dir}/nutch-1.0-dev" /> + + <target name="nutch-compile-core"> + <ant dir="${nutch.dir}" target="compile-core" inheritAll="false" /> + </target> + + <target name="nutch-compile-plugins"> + <ant dir="${nutch.dir}" target="compile-plugins" inheritAll="false" /> + </target> + + <target name="compile-core" depends="nutch-compile-core"> + <javac + destdir="${build.dir}/classes" + debug="true" + verbose="false" + source="1.5" + target="1.5" + encoding="UTF-8" + fork="true" + nowarn="true" + deprecation="false"> + <src path="${src.dir}/java" /> + <include name="**/*.java" /> + <classpath> + <pathelement location="${build.dir}/classes" /> + <fileset dir="${lib.dir}"> + <include name="*.jar"/> + </fileset> + <fileset dir="${nutch.dir}/lib"> + <include name="*.jar"/> + </fileset> + </classpath> + </javac> + </target> + + <target name="compile-plugins"> + <ant dir="src/plugin" target="deploy" inheritAll="false" /> + </target> + + <!-- + These targets all call down to the corresponding target in the + Nutch build.xml file. This way all of the 'ant' build commands + can be executed from this directory and everything should get + built as expected. 
+ --> + <target name="compile" depends="compile-core, compile-plugins, nutch-compile-plugins"> + </target> + + <target name="jar" depends="compile-core"> + <ant dir="${nutch.dir}" target="jar" inheritAll="false" /> + </target> + + <target name="job" depends="compile"> + <ant dir="${nutch.dir}" target="job" inheritAll="false" /> + </target> + + <target name="war" depends="compile"> + <ant dir="${nutch.dir}" target="war" inheritAll="false" /> + </target> + + <target name="javadoc" depends="compile"> + <ant dir="${nutch.dir}" target="javadoc" inheritAll="false" /> + </target> + + <target name="tar" depends="package"> + <ant dir="${nutch.dir}" target="tar" inheritAll="false" /> + </target> + + <target name="clean"> + <ant dir="${nutch.dir}" target="clean" inheritAll="false" /> + </target> + + <!-- This one does a little more after calling down to the relevant + Nutch target. After Nutch has copied everything into the + distribution directory, we add our script, libraries, etc. + + Rather than over-write the standard Nutch configuration files, + we place ours in a newly created directory + + contrib/imagesearch/conf + + and let the individual user decide whether or not to + incorporate our modifications. 
+ --> + <target name="package" depends="jar, job, war, javadoc"> + <ant dir="${nutch.dir}" target="package" inheritAll="false" /> + + <copy todir="${dist.dir}/lib" includeEmptyDirs="false"> + <fileset dir="lib"/> + </copy> + + <copy todir="${dist.dir}/bin"> + <fileset dir="bin"/> + </copy> + + <chmod perm="ugo+x" type="file"> + <fileset dir="${dist.dir}/bin"/> + </chmod> + + <mkdir dir="${dist.dir}/contrib/imagesearch/conf"/> + <copy todir="${dist.dir}/contrib/imagesearch/conf"> + <fileset dir="conf" /> + </copy> + + <copy todir="${dist.dir}/contrib/imagesearch"> + <fileset dir="."> + <include name="*.txt" /> + </fileset> + </copy> + + </target> + +</project> Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.imagesearch; + +public class ImageHit { + public int doc; + public float docScore; + + public String imageId; + public String url; + + public float docSim; + public float proximity; + public float score; + + public ImageHit(String id, String url, int doc) { + this.imageId = id; + this.url = url; + this.doc = doc; + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHitQueue.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.imagesearch; + +import org.apache.lucene.util.PriorityQueue; + +final class ImageHitQueue extends PriorityQueue { + + ImageHitQueue(int size) { + initialize(size); + } + + protected final boolean lessThan(Object a, Object b) { + ImageHit hitA = (ImageHit)a; + ImageHit hitB = (ImageHit)b; + if (hitA.score == hitB.score) { + return hitA.doc > hitB.doc; + } else { + return hitA.score < hitB.score; + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHits.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,70 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +/** A set of image hits matching a query. Adapted from Nutch's Hits class. 
*/ +public final class ImageHits { + + private long total; + private boolean totalIsExact = true; + private ImageHit[] top; + + public ImageHits() { + } + + public ImageHits(long total, ImageHit[] top) { + this.total = total; + this.top = top; + } + + /** Returns the total number of hits for this query. This may be an estimate + * when {@link #totalIsExact()} is false. */ + public long getTotal() { + return total; + } + + /** True if {@link #getTotal()} gives the exact number of hits, or false if + * it is only an estimate of the total number of hits. */ + public boolean totalIsExact() { + return totalIsExact; + } + + /** Set {@link #totalIsExact()}. */ + public void setTotalIsExact(boolean isExact) { + totalIsExact = isExact; + } + + /** Returns the number of hits included in this current listing. */ + public int getLength() { + return top.length; + } + + /** Returns the <code>i</code><sup>th</sup> hit in this list. */ + public ImageHit getHit(int i) { + return top[i]; + } + + /** Returns a subset of the hit objects. */ + public ImageHit[] getHits(int start, int length) { + ImageHit[] results = new ImageHit[length]; + for (int i = 0; i < length; i++) { + results[i] = top[start + i]; + } + return results; + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageIndexer.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,432 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.analysis.*; + +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.CrawlDb; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.crawl.LinkDb; +import org.apache.nutch.crawl.NutchWritable; + +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilters; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.protocols.Response; + +/** Create indexes for segments. 
*/ +public class ImageIndexer extends Configured implements Tool, + Reducer<Text, WrappedWritable, Text, Writable>, Mapper<Text, Writable, Text, WrappedWritable> { + + public static final String DONE_NAME = "index.done"; + public static final Log LOG = LogFactory.getLog(ImageIndexer.class); + + /** A utility class used to pass a lucene document from Indexer.reduce + * to Indexer.OutputFormat. + * Note: Despite its name, it can't properly wrap a lucene document - it + * doesn't know how to serialize/deserialize a lucene document. + */ + private static class LuceneDocumentWrapper implements Writable { + + private Document doc; + + public LuceneDocumentWrapper(Document doc) { + this.doc = doc; + } + + public Document get() { + return doc; + } + + public void readFields(DataInput in) throws IOException { + // intentionally left blank + } + + public void write(DataOutput out) throws IOException { + // intentionally left blank + } + } + + /** Unwrap Lucene Documents created by reduce and add them to an index. 
*/ + public static class OutputFormat + extends org.apache.hadoop.mapred.OutputFormatBase<WritableComparable, LuceneDocumentWrapper> { + + public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job, + String name, final Progressable progress) throws IOException { + final Path perm = new Path(job.getOutputPath(), name); + final Path temp = + job.getLocalPath("index/_" + Integer.toString(new Random().nextInt())); + + fs.delete(perm); // delete old, if any + + final AnalyzerFactory factory = new AnalyzerFactory(job); + final IndexWriter writer = // build locally first + new IndexWriter(fs.startLocalOutput(perm, temp).toString(), + new NutchDocumentAnalyzer(job), true); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000)); + writer.setInfoStream(LogUtil.getInfoStream(LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() { + + boolean closed; + + public void write(WritableComparable key, LuceneDocumentWrapper value) + throws IOException { // unwrap & index doc + Document doc = value.get(); + NutchAnalyzer analyzer = factory.get(doc.get("lang")); + if (LOG.isInfoEnabled()) { + LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" + + " with analyzer " + analyzer + + " (" + doc.get("lang") + ")"); + } + writer.addDocument(doc, analyzer); + progress.progress(); + } + + public void close(final Reporter reporter) throws IOException { + // spawn a thread to give progress heartbeats + Thread prog = new Thread() { + + public void run() { + while (!closed) { + try { + reporter.setStatus("closing"); + 
Thread.sleep(1000); + } catch (InterruptedException e) { + continue; + } catch (Throwable e) { + return; + } + } + } + }; + + try { + prog.start(); + if (LOG.isInfoEnabled()) { + LOG.info("Optimizing index."); + } + // optimize & close index + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, DONE_NAME)); + } finally { + closed = true; + } + } + }; + } + } + private IndexingFilters filters; + private ScoringFilters scfilters; + + public ImageIndexer() { + } + + public ImageIndexer(Configuration conf) { + setConf(conf); + } + + public void configure(JobConf job) { + setConf(job); + this.filters = new IndexingFilters(getConf()); + this.scfilters = new ScoringFilters(getConf()); + } + + public void close() { + } + + /** + * Copies key/value pairs from one metadata container to another. + * Overwrites the destination if the source has a value with greater length. + * + * @param from Metadata to copy from + * @param to target metadata container + */ + private void mergeMetadata(Metadata from, Metadata to) { + String[] names = from.names(); + for (String name : names) { + String newValue = from.get(name); + String value = to.get(name); + if (value != null) { + if (newValue.length() > value.length()) { + to.set(name, newValue); + } + } else { + to.add(name, newValue); + } + } + } + + public void reduce(Text key, Iterator<WrappedWritable> values, + OutputCollector<Text, Writable> output, Reporter reporter) + throws IOException { + Inlinks inlinks = null; + CrawlDatum dbDatum = null; + CrawlDatum fetchDatum = null; + ParseData parseData = null; + ParseText parseText = null; + + Metadata metadata = null; + Metadata contentMetadata = null; + String segmentName = null; + String signature = null; + while (values.hasNext()) { + Writable value = values.next().get(); + if (value instanceof ImageWritable) { + ImageWritable imgData = (ImageWritable) value; + Metadata imgMeta = imgData.getMetadata(); + if 
(metadata == null) { + metadata = imgMeta; + } else { + mergeMetadata(imgMeta, metadata); + } + } else if (value instanceof Inlinks) { + inlinks = (Inlinks) value; + } else if (value instanceof CrawlDatum) { + CrawlDatum datum = (CrawlDatum) value; + if (CrawlDatum.hasDbStatus(datum)) { + dbDatum = datum; + } else if (CrawlDatum.hasFetchStatus(datum)) { + // don't index unmodified (empty) pages + if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) { + fetchDatum = datum; + } + } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() || + CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) { + continue; + } else { + throw new RuntimeException("Unexpected status: " + datum.getStatus()); + } + } else if (value instanceof ParseData) { + if (parseData != null) { + ParseData newParse = (ParseData) value; + Metadata parseMeta = newParse.getParseMeta(); + // Check if this is the parse meta from ImageParseFilter + // If so, use its parse meta, otherwise use the content meta + if (parseMeta.get(ImageSearch.PARENT_URL_KEY) != null) { + mergeMetadata(parseMeta, metadata); + } else { + contentMetadata = newParse.getContentMeta(); + } + } else { + parseData = (ParseData) value; + metadata = parseData.getParseMeta(); + contentMetadata = parseData.getContentMeta(); + } + } else if (value instanceof ParseText) { + ParseText newParseText = (ParseText) value; + if (parseText == null || (parseText != null && + parseText.getText().length() < newParseText.getText().length())) { + parseText = (ParseText) value; + } + } else if (LOG.isWarnEnabled()) { + LOG.warn("Unrecognized type: " + value.getClass()); + } + // Save segment name and signature + if (contentMetadata != null) { + if (segmentName == null || signature == null) { + String stringValue = contentMetadata.get(Nutch.SEGMENT_NAME_KEY); + if (stringValue != null) { + segmentName = stringValue; + } + stringValue = contentMetadata.get(Nutch.SIGNATURE_KEY); + if (stringValue != null) { + signature = stringValue; + } + } + } + } 
+ + if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) { + return; // only have inlinks + } + if (!parseData.getStatus().isSuccess() || + fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) { + return; + } + + // Skip possibly non-images + if (metadata.get(ImageSearch.PARENT_URL_KEY) == null) { + return; + } + // Make sure segment name and signature are set + contentMetadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); + contentMetadata.set(Nutch.SIGNATURE_KEY, signature); + + Document doc = new Document(); + + // add segment, used to map from merged index back to segment files + doc.add(new Field("segment", contentMetadata.get(Nutch.SEGMENT_NAME_KEY), + Field.Store.YES, Field.Index.NO)); + + // add digest, used by dedup + doc.add(new Field("digest", contentMetadata.get(Nutch.SIGNATURE_KEY), + Field.Store.YES, Field.Index.NO)); + + ParseData combinedParseData = new ParseData(parseData.getStatus(), + parseData.getTitle(), parseData.getOutlinks(), contentMetadata, + metadata); + + Parse parse = new ParseImpl(parseText, combinedParseData); + try { + // extract information from dbDatum and pass it to + // fetchDatum so that indexing filters can use it + Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + if (url != null) { + fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url); + } + // run indexing filters + doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks); + } catch (IndexingException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error indexing " + key + ": " + e); + } + return; + } + + // skip documents discarded by indexing filters + if (doc == null) { + return; + } + + float boost = 1.0f; + // run scoring filters + try { + boost = this.scfilters.indexerScore((Text) key, doc, dbDatum, + fetchDatum, parse, inlinks, boost); + } catch (ScoringFilterException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error calculating score " + key + ": " + e); + } + return; + } + // apply boost 
to all indexed fields. + doc.setBoost(boost); + // store boost for use by explain and dedup + doc.add(new Field("boost", Float.toString(boost), + Field.Store.YES, Field.Index.NO)); + + output.collect(key, new LuceneDocumentWrapper(doc)); + } + + public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) + throws IOException { + + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: starting"); + LOG.info("ImageIndexer: linkdb: " + linkDb); + } + + JobConf job = new NutchJob(getConf()); + job.setJobName("index " + indexDir); + + for (int i = 0; i < segments.length; i++) { + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: adding segment: " + segments[i]); + } + job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME)); + job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME)); + job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); + job.addInputPath(new Path(segments[i], ParseText.DIR_NAME)); + job.addInputPath(new Path(segments[i], ImageWritable.IMAGE_DATA_DIR)); + } + + job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME)); + job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME)); + job.setInputFormat(SequenceFileInputFormat.class); + + job.setMapperClass(ImageIndexer.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(WrappedWritable.class); + job.setReducerClass(ImageIndexer.class); + + job.setOutputPath(indexDir); + job.setOutputFormat(OutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(NutchWritable.class); + + JobClient.runJob(job); + if (LOG.isInfoEnabled()) { + LOG.info("ImageIndexer: done"); + } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new ImageIndexer(), args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + + if (args.length < 4) { + System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ..."); + return -1; + } + + Path[] 
segments = new Path[args.length - 3]; + for (int i = 3; i < args.length; i++) { + segments[i - 3] = new Path(args[i]); + } + + try { + index(new Path(args[0]), new Path(args[1]), new Path(args[2]), + segments); + return 0; + } catch (Exception e) { + LOG.fatal("ImageIndexer: " + StringUtils.stringifyException(e)); + return -1; + } + } + + public void map(Text key, Writable value, + OutputCollector<Text, WrappedWritable> output, Reporter reporter) throws IOException { + output.collect(key, new WrappedWritable(value)); + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,136 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.imagesearch; + +import java.io.IOException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +public class ImageProcessor extends Configured implements Tool, + Mapper<Text, Content, Text, ImageWritable> { + + private static final Log LOG = LogFactory.getLog(ImageProcessor.class); + + private int thumbQuality; + private int thumbMaxSize; + + ImageProcessor() {} + ImageProcessor(Configuration conf) { + setConf(conf); + } + + public void map(Text key, Content content, + OutputCollector<Text, ImageWritable> output, + Reporter reporter) throws IOException { + + Metadata metadata = new Metadata(); + // Check content type + if (!content.getContentType().contains("image/")) { + return; + } + + // Generate thumbnail + byte[] data = content.getContent(); + StoredImage thumb = ThumbnailGenerator.generateThumbnail(data, + thumbMaxSize, thumbMaxSize, thumbQuality, metadata); + + // Create and setup an ImageWritable + ImageWritable image = new ImageWritable(key.toString()); + image.setMetadata(metadata); + image.setThumbnail(thumb); + + output.collect(key, image); + } + + public void processImageContent(Path segment) + throws 
IOException { + + JobConf job = new NutchJob(getConf()); + job.setJobName("ImageProcessor " + segment); + + if (LOG.isInfoEnabled()) { + LOG.info("ImageProcessor: processing " + segment); + } + job.addInputPath(new Path(segment, Content.DIR_NAME)); + + job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(ImageProcessor.class); + + job.setOutputPath(new Path(segment, ImageWritable.IMAGE_DATA_DIR)); + job.setOutputFormat(MapFileOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(ImageWritable.class); + + JobClient.runJob(job); + + if (LOG.isInfoEnabled()) { + LOG.info("ImageProcessor: done"); + } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), + new ImageProcessor(), args); + System.exit(res); + } + public int run(String[] args) throws Exception { + + if (args.length == 0) { + System.err.println("Usage: imageprocessor <segment>"); + return -1; + } + + Path segment = new Path(args[0]); + try { + processImageContent(segment); + return 0; + } catch (Exception e) { + LOG.fatal("ImageProcessor: " + StringUtils.stringifyException(e)); + return -1; + } + } + + private Configuration conf; + public void configure(JobConf conf) { + setConf(conf); + + this.thumbQuality = conf.getInt("imagesearcher.thumbnail.quality", 50); + this.thumbMaxSize = conf.getInt("imagesearcher.thumbnail.maxSize", 100); + } + + public void close() throws IOException { + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

package org.archive.nutchwax.imagesearch;

/** Metadata and index field names shared by the image search classes. */
public class ImageSearch {
  // Parse-metadata key; ImageIndexer.reduce() skips records that lack it.
  public static final String PARENT_URL_KEY = "parent_url";
  // Presumably the image's alt text; no reader or writer is visible here - TODO confirm.
  public static final String ALT_TEXT_KEY = "alt";

  // Index field read by ImageSearcherBean and split on ':' into image ids.
  public static final String IMAGE_IDS_KEY = "image_ids";
  // Index field read by ImageSearcherBean: ':'-separated integer positions.
  public static final String IMAGE_POS_KEY = "image_pos";
  // Index field read by ImageSearcherBean and split on ' ' into image urls.
  public static final String IMAGE_URLS_KEY = "image_urls";
  // Index field; ImageSearcherBean skips documents whose value is "0".
  public static final String HAS_IMAGE_KEY = "has_image";
}
Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,346 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.File; +import java.io.IOException; +import java.lang.Math; +import java.util.Iterator; +import java.util.Vector; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.indexer.FsDirectory; +import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.util.NutchConfiguration; + +public class ImageSearcherBean { + + public static final Log LOG = LogFactory.getLog(ImageSearcherBean.class); + + private IndexReader reader; + + private Path baseDir; + private Configuration conf; + private FileSystem fs; + + private int distThreshold; // Maximum allowed distance of image from hit + // to be considered + + /** Construct given a configuration. 
*/ + public ImageSearcherBean(Configuration conf) throws IOException { + this.conf = conf; + this.fs = FileSystem.get(conf); + this.baseDir = new Path(conf.get("searcher.dir", "crawl")); + this.distThreshold = conf.getInt("imagesearch.maxDist", 300); + + // Try to load unmerged indexes + Path indexesDir = new Path(baseDir, "indexes"); + if (this.fs.exists(indexesDir)) { + Vector<Path> doneDirs = new Vector<Path>(); + Path[] dirs = fs.listPaths(indexesDir, new PathFilter() { + + public boolean accept(Path f) { + try { + if (fs.isDirectory(f)) { + return true; + } + } catch (IOException ioe) { + } + return false; + } + }); + for (Path dir : dirs) { + Path indexdone = new Path(dir, Indexer.DONE_NAME); + if (fs.isFile(indexdone)) { + doneDirs.add(dir); + } + } + dirs = new Path[doneDirs.size()]; + Iterator<Path> it = doneDirs.iterator(); + int i = 0; + while (it.hasNext()) { + dirs[i++] = it.next(); + } + init(dirs); + } else { + Path[] indexDir = {new Path(baseDir, "index")}; + init(indexDir); + } + } + + /** Init given a set of indexes or just one index. 
*/ + public void init(Path[] indexes) throws IOException { + IndexReader[] indexReaders = new IndexReader[indexes.length]; + for (int i = 0; i < indexes.length; i++) { + indexReaders[i] = IndexReader.open(getDirectory(indexes[i])); + } + if (indexes.length > 1) { + this.reader = new MultiReader(indexReaders); + } else { + this.reader = IndexReader.open(getDirectory(indexes[0])); + } + } + + private Directory getDirectory(Path file) throws IOException { + if ("file".equals(this.fs.getUri().getScheme())) { + Path qualified = file.makeQualified(FileSystem.getLocal(conf)); + File fsLocal = new File(qualified.toUri()); + return FSDirectory.getDirectory(fsLocal.getAbsolutePath()); + } else { + return new FsDirectory(this.fs, file, false, this.conf); + } + } + + public void close() throws IOException { + if (reader != null) { + reader.close(); + } + } + + public IndexReader getReader() { + return reader; + } + + /** + * Calculate the score for an image hit. + * @param hit found hit + * @param doc parent document + * @return float score + */ + private float scoreHit(ImageHit hit, Document doc) { + float a = 0.2f; + float b = 0.1f; + return a*hit.docScore + (1.0f-a)*(b*hit.docSim + (1.0f-b)*hit.proximity); + } + + /** + * Find query-related images in the content of documents based on proximity. 
*
   * A span-near query over the given terms is run against the reader; for
   * every matching span the nearest image position of the current document
   * is looked up, and a hit is produced when the distance does not exceed
   * distThreshold.
   *
   * @param queryTerms terms that must occur near each other
   * @param hitCollector queue that receives the hits
   * @param maxHits number of hits below which every hit is inserted
   * @return number of spans that produced a hit, whether or not the hit
   *         was inserted into the queue
   * @throws java.io.IOException
   */
  private long getImagesFromContent(Term[] queryTerms, ImageHitQueue hitCollector,
          int maxHits)
          throws IOException {

    // Construct SpanQuery
    SpanQuery[] clauses = new SpanTermQuery[queryTerms.length];
    for (int i=0; i<queryTerms.length; i++) {
      clauses[i] = new SpanTermQuery(queryTerms[i]);
    }
    SpanNearQuery snq = new SpanNearQuery(clauses, queryTerms.length+1, false);
    Spans spans = snq.getSpans(reader);

    // Per document info
    Document doc = null;
    int currentDoc = -1;
    int numDocImages = 0;
    int[] imagePositions = null;
    String[] imageIds = null;
    String[] imageUrls = null;
    float docBoost = 1.0f;
    float docSim = 0.0f; // never reassigned in this method
    float maxDist = Float.MAX_VALUE;
    float minScore = 0.0f;

    long totalHits = 0;

    boolean more = spans.next();
    while (more) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("currentDoc "+currentDoc);
      }
      if (currentDoc != spans.doc()) {
        currentDoc = spans.doc();
        doc = reader.document(currentDoc);
        // Skip document with no images
        // NOTE(review): getField() presumably returns null for documents
        // indexed without this field - confirm every document carries it.
        if ("0".equals(doc.getField(ImageSearch.HAS_IMAGE_KEY).stringValue())) {
          while (more && spans.doc() == currentDoc) {
            more = spans.next();
          }
          continue;
        }

        // Get document's global score
        // NOTE(review): Lucene documents loaded from an index may not carry
        // the index-time boost; confirm this is not always 1.0.
        docBoost = doc.getBoost();

        // Get image positions
        String posField = doc.getField(ImageSearch.IMAGE_POS_KEY).stringValue();
        String[] positions = posField.split(":");
        imagePositions = new int[positions.length];
        numDocImages = positions.length;
        for (int i = 0; i < numDocImages; i++) {
          imagePositions[i] = Integer.parseInt(positions[i]);
        }
        // The last position is used as the normalizer for proximity.
        // NOTE(review): assumes positions are ascending and the last one is
        // non-zero; a zero here makes the division below yield NaN/Infinity.
        maxDist = (float)imagePositions[numDocImages-1];

        // Get image ids
        String idField = doc.getField(ImageSearch.IMAGE_IDS_KEY).stringValue();
        imageIds = idField.split(":");

        // Get image urls
        String urlField = doc.getField(ImageSearch.IMAGE_URLS_KEY).stringValue();
        imageUrls = urlField.split(" ");
      }

      int pos = 0;
      int end = 0;
      int imgIndex = 0;
      int prevDist = Integer.MAX_VALUE; // only read by the disabled block below
      while (more && spans.doc() == currentDoc) {
        // All images of this document are used up: drain its remaining spans.
        if (imgIndex >= numDocImages) {
          more = spans.next();
          continue;
        }
        if (LOG.isDebugEnabled()) {
          LOG.debug("sp " + spans.start() + " " + spans.end());
        }
        pos = spans.start();
        end = spans.end();
        // Distance = offset between image and span start plus the span length.
        int dist = Math.abs(imagePositions[imgIndex] - pos) + (end-pos);
        int nextDist = imgIndex < numDocImages-1 ?
          Math.abs(imagePositions[imgIndex + 1] - pos) + (end-pos) : Integer.MAX_VALUE;
        /*if (prevDist < dist) {
          more = spans.next();
          prevDist = dist;
          if (LOG.isDebugEnabled()) {
            LOG.debug("p<d");
          }
          continue;
        }*/
        // Advance image pointer till a nearer image can be found
        while (imgIndex < numDocImages && nextDist <= dist) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("adv " + nextDist + " " + dist + " id " + imageUrls[imgIndex].substring(imageUrls[imgIndex].lastIndexOf("/")));
          }
          dist = nextDist;
          imgIndex++;
          nextDist = imgIndex < numDocImages-1 ?
            Math.abs(imagePositions[imgIndex+1] - pos) + (end-pos) : Integer.MAX_VALUE;
        }
        // Check if this image is in the allowed proximity of the span
        if (dist > distThreshold) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("d>t: " + dist);
          }
          more = spans.next();
          continue;
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("hit " + imageUrls[imgIndex].substring(imageUrls[imgIndex].lastIndexOf("/")) + " " + dist + " next " + nextDist);
        }
        // Found hit
        // NOTE(review): assumes the ids and urls fields have as many entries
        // as the positions field - confirm against the indexing side.
        ImageHit newHit = new ImageHit(imageIds[imgIndex], imageUrls[imgIndex], currentDoc);
        newHit.docSim = docSim;
        newHit.docScore = docBoost;
        newHit.proximity = Math.min(1.0f, 1.0f-((float)dist/maxDist));
        newHit.score = scoreHit(newHit, doc);

        // The image pointer only moves on when the hit is inserted.
        if (hitCollector.size() < maxHits || newHit.score >= minScore) {
          hitCollector.insert(newHit);
          minScore = ((ImageHit)hitCollector.top()).score;

          prevDist = dist;
          imgIndex++;
        }
        totalHits++;
        more = spans.next();
      }
    }

    return totalHits;
  }

  /**
   * Search for images matching the query.
+ * + * @param query query + * @param maxHits maximum number of hits to retrieve + * @return ImageHits the matching hits + * @throws java.io.IOException + */ + public ImageHits search(String query, int maxHits) throws IOException { + String[] keywords = query.split("\\s"); + if (keywords == null) { + return new ImageHits(0, new ImageHit[0]); + } + + // Create query term array + Term[] queryTerms = new Term[keywords.length]; + for (int i=0; i<queryTerms.length; i++) { + queryTerms[i] = new Term("content", keywords[i]); + } + + ImageHitQueue hitQueue = new ImageHitQueue(maxHits); + long totalHits = getImagesFromContent(queryTerms, hitQueue, maxHits); + + // Extract top results + ImageHit[] resultSet = new ImageHit[hitQueue.size()]; + for (int i = resultSet.length - 1; i >= 0; i--) { + resultSet[i] = (ImageHit) hitQueue.pop(); + } + + return new ImageHits(totalHits, resultSet); + } + + /** For debugging purposes. */ + public static void main(String[] args) throws Exception { + if (args.length == 0) { + System.err.println("Usage: ImageSearcherBean <query>"); + System.exit(-1); + } + + Configuration conf = NutchConfiguration.create(); + ImageSearcherBean isb = new ImageSearcherBean(conf); + + // Construct query string + StringBuffer sb = new StringBuffer(); + for (String arg : args) { + if (sb.length() > 0) { + sb.append(' '); + } + sb.append(arg); + } + // Conduct search + int maxHits = 10; + ImageHits hits = isb.search(sb.toString(), maxHits); + // Show results + System.out.println("Total hits: " + hits.getTotal()); + ImageHit[] top = hits.getHits(0, + hits.getTotal() >= maxHits ? 
maxHits : (int)hits.getTotal()); + for (ImageHit hit : top) { + System.out.println(hit.score + " " + hit.url + " " + hit.imageId); + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageWritable.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,78 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.imagesearch; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import org.apache.hadoop.io.MD5Hash; +import org.apache.hadoop.io.Writable; +import org.apache.nutch.metadata.Metadata; + +public class ImageWritable implements Writable { + + public static final String IMAGE_DATA_DIR = "image_data"; + + private MD5Hash id; + private Metadata metadata; + private StoredImage thumbnail; + + public ImageWritable() {} + + public ImageWritable(String url) { + this.id = MD5Hash.digest(url); + this.metadata = new Metadata(); + } + + public Metadata getMetadata() { + return metadata; + } + + public void setMetadata(Metadata metadata) { + this.metadata = metadata; + } + + public void setThumbnail(StoredImage thumbnail) { + this.thumbnail = thumbnail; + } + + public StoredImage getThumbnail() { + return thumbnail; + } + + public void write(DataOutput out) throws IOException { + id.write(out); + metadata.write(out); + if (thumbnail != null) { + out.writeBoolean(true); + thumbnail.write(out); + } else { + out.writeBoolean(false); + } + } + + public void readFields(DataInput in) throws IOException { + id = MD5Hash.read(in); + metadata = new Metadata(); + metadata.readFields(in); + thumbnail = new StoredImage(); + if (in.readBoolean()) { + thumbnail.readFields(in); + } + } +} Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/StoredImage.java 2008-07-11 14:00:57 UTC (rev 2427) @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import org.apache.hadoop.io.Writable; + +/** Represents binary image data as a Writable. */ +public class StoredImage implements Writable { + + public static final byte TYPE_JPEG = 'j'; + public static final byte T... [truncated message content] |