From: <sta...@us...> - 2007-03-26 20:57:25
|
Revision: 1634 http://archive-access.svn.sourceforge.net/archive-access/?rev=1634&view=rev Author: stack-sf Date: 2007-03-26 13:57:26 -0700 (Mon, 26 Mar 2007) Log Message: ----------- Updating underlying nutch and hadoop. Nutch is about to release 0.9 so trying TRUNK. Hadoop in nutch is 0.12.2 so updating to that too. * conf/wax-default.xml Set default for new parser.caching.forbidden.policy to 'all' rather than 'noarchive'. * src/java/org/archive/access/nutch/jobs/ImportArcs.java Signature for MapFile.Writer changed. * nutchwax-thirdparty Update from r508238 to r521933 * .classpath Update nutch lib references to point at newer jars. * nutchwax-job/src/main/assembly/assemble-job.xml In distributed mode, seems like the nutchwax classes CANNOT be inside a jar -- else they won't be found. * nutchwax-webapp/src/main/webapp/search.jsp Add in handling for no caching. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/.classpath trunk/archive-access/projects/nutchwax/conf/wax-default.xml trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/webapp/search.jsp trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportArcs.java Property Changed: ---------------- trunk/archive-access/projects/nutchwax/nutchwax-thirdparty/ Modified: trunk/archive-access/projects/nutchwax/.classpath =================================================================== --- trunk/archive-access/projects/nutchwax/.classpath 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/.classpath 2007-03-26 20:57:26 UTC (rev 1634) @@ -55,7 +55,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-jakarta-poi/poi-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-jakarta-poi/poi-scratchpad-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-log4j/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-lucene-analyzers/lucene-analyzers-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-lucene-analyzers/lucene-analyzers-2.1.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-nekohtml/nekohtml-0.9.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-parsems/lib-parsems.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/lib-regex-filter/lib-regex-filter.jar"/> @@ -98,7 +98,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-jakarta-poi/poi-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-jakarta-poi/poi-scratchpad-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-log4j/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-lucene-analyzers/lucene-analyzers-2.0.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-nekohtml/nekohtml-0.9.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-parsems/lib-parsems.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/lib-regex-filter/lib-regex-filter.jar"/> @@ -142,7 +141,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/scoring-opic/scoring-opic.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/subcollection/subcollection.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-basic/summary-basic.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-lucene/lucene-highlighter-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-lucene/lucene-highlighter-2.1.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/summary-lucene/summary-lucene.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-automaton/automaton.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/plugins/urlfilter-automaton/urlfilter-automaton.jar"/> @@ -184,9 +183,9 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.10.1-core.jar" /> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar" /> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t-0.5.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/ant.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/commons-el.jar"/> @@ -195,8 +194,8 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-ext/jsp-api.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/junit-3.8.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/log4j-1.2.13.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/lucene-core-2.0.0.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/lucene-misc-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/lucene-core-2.1.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/lucene-misc-2.1.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/jakarta-oro-2.0.8.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/jaxen-1.1-beta-7.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/pmd-ext/pmd-3.6.jar"/> @@ -216,7 +215,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-log4j/lib/log4j-1.2.11.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar"/> + <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.1.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/jaxen-core.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/lib-xml/lib/jaxen-jdom.jar"/> @@ -232,7 +231,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-rss/lib/xmlrpc-1.2.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/parse-swf/lib/javaswf.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/src/plugin/urlfilter-automaton/lib/automaton.jar"/> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/> Modified: trunk/archive-access/projects/nutchwax/conf/wax-default.xml =================================================================== --- trunk/archive-access/projects/nutchwax/conf/wax-default.xml 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/conf/wax-default.xml 2007-03-26 20:57:26 UTC (rev 1634) @@ -198,6 +198,16 @@ </property> <property> +<name>parser.caching.forbidden.policy</name> + <value>all</value> + <description>If a site (or a page) requests through its robot metatags + that it should not be shown as cached content, apply this policy. Currently + three keywords are recognized: "none" ignores any "noarchive" directives. + "content" doesn't show the content, but shows summaries (snippets). + "all" doesn't show either content or summaries.</description> +</property> + +<property> <name>wax.index.all</name> <value>true</value> <description>If set to true, all content types are indexed. Otherwise Modified: trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/nutchwax-job/src/main/assembly/assemble-job.xml 2007-03-26 20:57:26 UTC (rev 1634) @@ -74,12 +74,6 @@ <include>concurrent*</include> </includes> </fileSet> - <!-- - For some reason, adding the nutch jar does not - work. You must add the classes at root level - of the job jar. Strange given the main nutchwax - class can be inside of a jar. - <fileSet> <directory>../nutchwax-thirdparty/nutch/build</directory> <outputDirectory>/lib</outputDirectory> @@ -87,11 +81,24 @@ <include>nutch*jar</include> </includes> </fileSet> - --> + <!-- + I used to add in the nutchwax-core jar but when I do that, + running in distributed mode, ClassNotFound issues. It + starts with not being able to find nutch classes and then + even if I put the nutch jar in $HADOOP_HOME/lib, then it + cannot find the content of archive-mapred jar. Including + the nutchwax-core classes seems to do the trick (Its how + it was done before move to m2). + <fileSet> <directory>../nutchwax-thirdparty/nutch/build/classes</directory> <outputDirectory>/</outputDirectory> </fileSet> + --> + <fileSet> + <directory>../nutchwax-core/target/classes</directory> + <outputDirectory>/</outputDirectory> + </fileSet> </fileSets> <dependencySets> <dependencySet> @@ -107,8 +114,10 @@ <exclude>com.sleepycat:je</exclude> <exclude>junit:junit</exclude> <exclude>javax.servlet:servlet-api</exclude> + <exclude>it.unimi.dsi:mg4j</exclude> <exclude>org.archive.nutchwax:nutchwax-thirdparty</exclude> <exclude>org.archive.nutchwax:nutchwax-plugins</exclude> + <exclude>org.archive.nutchwax:nutchwax-core</exclude> </excludes> </dependencySet> </dependencySets> Property changes on: trunk/archive-access/projects/nutchwax/nutchwax-thirdparty ___________________________________________________________________ Name: svn:externals - nutch -r 508238 http://svn.apache.org/repos/asf/lucene/nutch/trunk + nutch -r 521933 http://svn.apache.org/repos/asf/lucene/nutch/trunk Modified: trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/webapp/search.jsp =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/webapp/search.jsp 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/nutchwax-webapp/src/main/webapp/search.jsp 2007-03-26 20:57:26 UTC (rev 1634) @@ -10,6 +10,7 @@ import="java.util.regex.Pattern" import="org.apache.nutch.html.Entities" + import="org.apache.nutch.metadata.Nutch" import="org.apache.nutch.searcher.*" import="org.apache.nutch.searcher.Summary.Fragment" import="org.apache.nutch.plugin.*" @@ -189,7 +190,15 @@ Hit[] show = hits.getHits(start, realEnd-start); HitDetails[] details = bean.getDetails(show); Summary[] summaries = bean.getSummary(details, query); + String caching = detail.getValue("cache"); + boolean showSummary = true; + boolean showCached = true; + if (caching != null) { + showSummary = !caching.equals(Nutch.CACHING_FORBIDDEN_ALL); + showCached = !caching.equals(Nutch.CACHING_FORBIDDEN_NONE); + } + bean.LOG.info("total hits: " + hits.getTotal()); String collectionsHost = nutchConf.get("wax.host", "examples.com"); @@ -245,7 +254,7 @@ %> <b><a href="<%=target%>"><%=Entities.encode(title)%></a></b> - <% if (!"".equals(summary)) { %> + <% if (!"".equals(summary) && showSummary) { %> <br><%=summary%> <% } %> <br> Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportArcs.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-03-26 17:13:06 UTC (rev 1633) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-03-26 20:57:26 UTC (rev 1634) @@ -646,7 +646,7 @@ throws IOException { Path f = new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME); final Path fetch = new Path(f, name); - final MapFile.Writer fetchOut = new MapFile.Writer(fs, + final MapFile.Writer fetchOut = new MapFile.Writer(job, fs, fetch.toString(), Text.class, CrawlDatum.class); // Write a cdx file. Write w/o compression. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |