From: <bi...@us...> - 2008-07-25 20:24:45
|
Revision: 2494 http://archive-access.svn.sourceforge.net/archive-access/?rev=2494&view=rev Author: binzino Date: 2008-07-25 20:24:53 +0000 (Fri, 25 Jul 2008) Log Message: ----------- Added HTTPStatusCodeFilter and configuration thereof in nutch-site.xml. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-25 02:46:16 UTC (rev 2493) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-25 20:24:53 UTC (rev 2494) @@ -32,7 +32,7 @@ </property> <property> - <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the ArcsToSegment are mapped to the Lucene documents during indexing. + <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing. 
The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key" Where the only required part is the "src-key", the rest will assume the following defaults: lowercase = true @@ -111,9 +111,16 @@ <property> <name>nutchwax.urlfilter.wayback.canonicalizer</name> <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value> - <description></description> + <description>Implementation of URL canonicalizer to use.</description> </property> +<property> + <name>nutchwax.filter.http.status</name> + <value> + 200-299 + </value> +</property> + <!-- Similar to Nutch's file.content.limit http.content.limit Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-25 02:46:16 UTC (rev 2493) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-25 20:24:53 UTC (rev 2494) @@ -20,6 +20,8 @@ import java.net.MalformedURLException; import java.util.Map.Entry; import java.util.Iterator; +import java.util.List; +import java.util.ArrayList; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -106,12 +108,8 @@ private ParseUtil parseUtil; private URLNormalizers normalizers; private int interval; + private HTTPStatusCodeFilter httpStatusCodeFilter; - private long numSkipped; - private long numImported; - private long bytesSkipped; - private long bytesImported; - /** * ?: Is this necessary? 
*/ @@ -146,6 +144,8 @@ this.parseUtil = new ParseUtil ( jobConf ); this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER ); this.interval = jobConf.getInt( "db.fetch.interval.default", 2592000 ); + + this.httpStatusCodeFilter = new HTTPStatusCodeFilter( jobConf.get( "nutchwax.filter.http.status" ) ); } /** @@ -233,6 +233,13 @@ if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); + if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() ); + + return false; + } + try { // Skip the HTTP headers in the response body, so that the @@ -313,6 +320,7 @@ contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) ); + contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY, String.valueOf( record.getStatusCode() ) ); Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); @@ -677,3 +685,96 @@ } } + + +/** + * This should all be moved into some sort of filtering plugin. + * Unfortunately the URLFilter plugin interface isn't adequate as it + * only looks at a URL string. Rather than jamming a response code + * through that interface, we do a one-off filter class here. + * + * A long-term solution would be to create a new Nutch extension point + * interface that takes an ARCRecord rather than a URL string. That + * way we can write filters that can operate on any part of an + * ARCRecord, not just the URL. 
+ */ +class HTTPStatusCodeFilter +{ + List<Range> ranges = new ArrayList<Range>( ); + + public HTTPStatusCodeFilter( String configuration ) + { + if ( configuration == null ) + { + return ; + } + + configuration = configuration.trim( ); + + for ( String value : configuration.split( "\\s+" ) ) + { + Range range = new Range( ); + + // Special handling for "none" where an ARCRecord doesn't have + // an HTTP status code. The ARCRecord.getStatusCode() returns + // -1 in that case, so we make a range for it. + if ( value.toLowerCase( ).equals( "none" ) ) + { + range.lower = -1; + range.upper = -1; + + this.ranges.add( range ); + + continue; + } + + String values[] = value.split( "[-]" ); + + try + { + switch ( values.length ) + { + case 2: + // It's a range, N-M + range.lower = Integer.parseInt( values[0] ); + range.upper = Integer.parseInt( values[1] ); + break; + + case 1: + // It's a single value, convert to a single-value range + range.lower = Integer.parseInt( values[0] ); + range.upper = range.lower; + break; + + default: + // Bad format + Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + range ); + continue ; + } + + this.ranges.add( range ); + } + catch ( NumberFormatException nfe ) + { + Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + range, nfe ); + } + } + + } + + public boolean isAllowed( int code ) + { + for ( Range r : this.ranges ) + { + return ( r.lower <= code && code <= r.upper ); + } + + return false; + } + + static class Range + { + int lower; + int upper; + } +} Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-25 02:46:16 UTC (rev 2493) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-25 20:24:53 UTC (rev 2494) @@ -31,4 +31,5 @@ 
public static final String DIGEST_KEY = "digest"; public static final String CONTENT_TYPE_KEY = "type"; public static final String CONTENT_LENGTH_KEY = "length"; + public static final String HTTP_RESPONSE_KEY = "http_response_code"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-28 19:49:58
|
Revision: 2509 http://archive-access.svn.sourceforge.net/archive-access/?rev=2509&view=rev Author: binzino Date: 2008-07-28 19:50:07 +0000 (Mon, 28 Jul 2008) Log Message: ----------- Updated for NutchWAX 0.12.1 release. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/HOWTO.txt trunk/archive-access/projects/nutchwax/archive/INSTALL.txt trunk/archive-access/projects/nutchwax/archive/README.txt trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt Modified: trunk/archive-access/projects/nutchwax/archive/HOWTO.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/HOWTO.txt 2008-07-28 19:43:10 UTC (rev 2508) +++ trunk/archive-access/projects/nutchwax/archive/HOWTO.txt 2008-07-28 19:50:07 UTC (rev 2509) @@ -1,6 +1,6 @@ HOWTO.txt -2008-05-20 +2008-07-28 Aaron Binns Table of Contents Modified: trunk/archive-access/projects/nutchwax/archive/INSTALL.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2008-07-28 19:43:10 UTC (rev 2508) +++ trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2008-07-28 19:50:07 UTC (rev 2509) @@ -1,6 +1,6 @@ INSTALL.txt -2008-07-02 +2008-07-28 Aaron Binns This installation guide assumes the reader is already familiar with @@ -46,11 +46,11 @@ Nutch SVN trunk. 
The specific SVN revision that NutchWAX 0.12 is built against is: - 673823 + 676736 To checkout this revision of Nutch, use: - $ svn checkout -r 673823 http://svn.apache.org/repos/asf/lucene/nutch/trunk nutch + $ svn checkout -r 676736 http://svn.apache.org/repos/asf/lucene/nutch/trunk nutch $ cd nutch Modified: trunk/archive-access/projects/nutchwax/archive/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/README.txt 2008-07-28 19:43:10 UTC (rev 2508) +++ trunk/archive-access/projects/nutchwax/archive/README.txt 2008-07-28 19:50:07 UTC (rev 2509) @@ -1,9 +1,9 @@ README.txt -2008-07-02 +2008-07-25 Aaron Binns -Welcome to NutchWAX 0.12! +Welcome to NutchWAX 0.12.1! NutchWAX is a set of add-ons to Nutch in order to index and search archived web data. @@ -76,15 +76,15 @@ ====================================================================== -This 0.12 release of NutchWAX is radically different in source-code +This 0.12.x release of NutchWAX is radically different in source-code form compared to the previous release, 0.10. -One of the design goals of 0.12 was to reduce or even eliminate the +One of the design goals of 0.12.x was to reduce or even eliminate the "copy/paste/edit" approach of 0.10. The 0.10 (and prior) NutchWAX releases had to copy/paste/edit large chunks of Nutch source code in order to add the NutchWAX features. -Also, the NutchWAX 0.12 sources and build are designed to one day be +Also, the NutchWAX 0.12.x sources and build are designed to one day be added into mainline Nutch as a proper "contrib" package; then eventually be fully integrated into the core Nutch source code. 
Modified: trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2008-07-28 19:43:10 UTC (rev 2508) +++ trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2008-07-28 19:50:07 UTC (rev 2509) @@ -1,9 +1,9 @@ RELEASE-NOTES.TXT -2007-07-03 +2007-07-25 Aaron Binns -Release notes for NutchWAX 0.12 +Release notes for NutchWAX 0.12.1 For the most recent updates and information on NutchWAX, please visit the project wiki at: @@ -15,28 +15,10 @@ Overview ====================================================================== -NutchWAX 0.12-beta-1 was released on June 2, 2008. We anticipated -releasing another beta mid-June with bug fixes and some minor -enhancements based on feedback from the community. +NutchWAX 0.12.1 contains some minor enhancements and fixes to NutchWAX +0.12. One of the driving forces behind some of the enhancements was +integration with the Wayback machine. -During internal testing by the Internet Archive Web Team, a few -serious problems were found, the most critical being the failure to -store different copies of the same URL when importing large batches of -archive files. - -The NutchWAX team canceled the mid-month release in order to focus on -fixing this problem. - -The good news is that not only has that problem been fixed, but the -solution is part of a broader enhancement to manage the de-duplication -of archive contnet during import and indexing. 
- -For more details on de-duplication in NutchWAX, please see - - HOWTO-dedup.txt - README-dedup.txt - - ====================================================================== Issues ====================================================================== @@ -47,16 +29,24 @@ Issues resolved in this release: -WAX-9 Entire file not imported -WAX-8 Investigate why so many PDFs fail to parse +WAX-16 + Option to skip ARC record import based on HTTP status code of + content - Fixing the first one caused nearly all of the PDF parsing errors to - disappear. +WAX-12 + Add metadata field "fileoffset" -WAX-7 Change config to that URL filters are not applied during link inversion +WAX-11 + Change metadata field name in search results from "arcname" to + "filename" - This is easily achieved by using command-line options when invoking - the Nutch "invertlinks" command. +WAX-10 + Add "exacturl" metadata field to indexing so it can be searched + as-is, not parsed/tokenized like the "url" field. -WAX-3 Observe content size limit on importing -WAX-2 Date queries cause TooManyClauses exceptions +WAX-6 + Change DateAdder to allow for implementation of URLCanonicalizer to + be defined in property. + +WAX-4 + Implementor/user-provided XSLT for OpenSearch results This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-09-02 23:26:04
|
Revision: 2589 http://archive-access.svn.sourceforge.net/archive-access/?rev=2589&view=rev Author: binzino Date: 2008-09-02 23:26:08 +0000 (Tue, 02 Sep 2008) Log Message: ----------- Changed parsing of dup/date file to allow for extra, unused fields. Also updated dedup-cdx script to add archive filename to output. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-09-02 23:26:08 UTC (rev 2589) @@ -11,16 +11,26 @@ echo "Duplicate records are found by sorting all the CDX records, then" echo "comparing subsequent records by URL+digest." echo - echo "Output is in abbreviated form of \"URL digest date\", ex:" + echo "Output is in abbreviated form of \"URL digest date arcname\", ex:" echo - echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443" - echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505" + echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443 foo.arc.gz" + echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505 bar.arc.gz" echo echo "The output of this script can be used as an exclusions file for" echo "importing ARC files with NutchWAX, and also for adding dates" echo "to a parallel index." echo + echo "NOTE: This script uses Unix 'sort' binary. 
If you wish to use a different" + echo "implementation, specify it via the SORT shell variable, e.g.:" + echo + echo " SORT=my_cool_sort dedup-cdx file1.cdx" + echo exit 1; fi -cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | sort -u | awk '{ if ( url == $1 && digest == $2 ) print $1 " " $2 " " $3 ; url = $1 ; digest = $2 }' +# Use Unix 'sort', unless over-ridden by caller. +if [ -z "$SORT" ]; then + SORT=sort +fi + +cat $@ | awk '{ print $1, "sha1:" $6, $2, $9 }' | $SORT -u | awk '{ if ( url == $1 && digest == $2 ) print $1, $2, $3, $4 ; url = $1 ; digest = $2 }' Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-09-02 23:26:08 UTC (rev 2589) @@ -84,24 +84,24 @@ String line; while ( (line = br.readLine( )) != null ) { - String parts[] = line.split("\\s+"); - if ( parts.length != 3 ) + String fields[] = line.split("\\s+"); + if ( fields.length < 3 ) { - System.out.println( "Malformed line: " + line ); + System.out.println( "Malformed line, not enough fields (" + fields.length +"): " + line ); continue; } // Key is hash+url, value is String which is a " "-separated list of dates - String key = parts[0] + parts[1]; + String key = fields[0] + fields[1]; String dates = dateRecords.get( key ); if ( dates != null ) { - dates += " " + parts[2]; + dates += " " + fields[2]; dateRecords.put( key, dates ); } else { - dateRecords.put( key , parts[2] ); + dateRecords.put( key , fields[2] ); } } Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java =================================================================== --- 
trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2008-09-02 23:26:08 UTC (rev 2589) @@ -48,7 +48,6 @@ * same logic as the Wayback. By making Wayback canonicalization * available, we can use exclusion rules generated from CDX files. */ -// TODO: Add logging public class WaybackURLFilter implements URLFilter { public static final Log LOG = LogFactory.getLog( WaybackURLFilter.class ); @@ -75,7 +74,7 @@ if ( s.length != 3 ) { // Don't filter. - LOG.info( "Allowing: " + urlString ); + LOG.info( "Allowing : " + urlString ); return urlString; } @@ -94,7 +93,7 @@ // Then, build a key to be compared against the exclusion // list. - String key = url + " " + digest + " " + date; + String key = url + digest + date; exclude = this.exclusions.contains( key ); } @@ -192,6 +191,20 @@ String line; while ( (line = reader.readLine()) != null ) { + String fields[] = line.split( "\\s+" ); + + if ( fields.length < 3 ) + { + LOG.warn( "Malformed exclusion, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + // We only want the first three fields. Chop-off anything extra. + if ( fields.length >= 3 ) + { + line = fields[0] + fields[1] + fields[2]; + } + exclusions.add( line ); } } @@ -222,5 +235,5 @@ return exclusions; } - + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-09-22 19:56:03
|
Revision: 2594 http://archive-access.svn.sourceforge.net/archive-access/?rev=2594&view=rev Author: binzino Date: 2008-09-22 19:55:50 +0000 (Mon, 22 Sep 2008) Log Message: ----------- Updates in anticipation for 0.12.2 release. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/INSTALL.txt trunk/archive-access/projects/nutchwax/archive/README.txt trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt Modified: trunk/archive-access/projects/nutchwax/archive/INSTALL.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2008-09-22 19:08:40 UTC (rev 2593) +++ trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2008-09-22 19:55:50 UTC (rev 2594) @@ -1,6 +1,6 @@ INSTALL.txt -2008-07-28 +2008-10-01 Aaron Binns This installation guide assumes the reader is already familiar with @@ -43,14 +43,14 @@ ------------- As mentioned above, NutchWAX 0.12 is built against Nutch-1.0-dev. Nutch doesn't have a 1.0 release package yet, so we have to use the -Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12 is +Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12.2 is built against is: - 676736 + 697964 To checkout this revision of Nutch, use: - $ svn checkout -r 676736 http://svn.apache.org/repos/asf/lucene/nutch/trunk nutch + $ svn checkout -r 697964 http://svn.apache.org/repos/asf/lucene/nutch/trunk nutch $ cd nutch Modified: trunk/archive-access/projects/nutchwax/archive/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/README.txt 2008-09-22 19:08:40 UTC (rev 2593) +++ trunk/archive-access/projects/nutchwax/archive/README.txt 2008-09-22 19:55:50 UTC (rev 2594) @@ -1,9 +1,9 @@ README.txt -2008-07-25 +2008-10-01 Aaron Binns -Welcome to NutchWAX 0.12.1! +Welcome to NutchWAX 0.12.2! NutchWAX is a set of add-ons to Nutch in order to index and search archived web data. 
Modified: trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2008-09-22 19:08:40 UTC (rev 2593) +++ trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2008-09-22 19:55:50 UTC (rev 2594) @@ -1,9 +1,9 @@ RELEASE-NOTES.TXT -2007-07-25 +2008-10-01 Aaron Binns -Release notes for NutchWAX 0.12.1 +Release notes for NutchWAX 0.12.2 For the most recent updates and information on NutchWAX, please visit the project wiki at: @@ -15,9 +15,8 @@ Overview ====================================================================== -NutchWAX 0.12.1 contains some minor enhancements and fixes to NutchWAX -0.12. One of the driving forces behind some of the enhancements was -integration with the Wayback machine. +NutchWAX 0.12.2 contains some minor enhancements and fixes to NutchWAX +0.12.1. ====================================================================== Issues @@ -29,24 +28,16 @@ Issues resolved in this release: -WAX-16 - Option to skip ARC record import based on HTTP status code of - content +WAX-23 + Add a "field setter" filter to set a field to a static value in the + Lucene document during indexing. -WAX-12 - Add metadata field "fileoffset" +WAX-22 + Various code clean-ups based on code review using PMD tool. -WAX-11 - Change metadata field name in search results from "arcname" to - "filename" +WAX-21 + Allow for blank lines and comment lines in manifest file. -WAX-10 - Add "exacturl" metadata field to indexing so it can be searched - as-is, not parsed/tokenized like the "url" field. - -WAX-6 - Change DateAdder to allow for implementation of URLCanonicalizer to - be defined in property. - -WAX-4 - Implementor/user-provided XSLT for OpenSearch results +WAX-19 + Add strict/loose option to DateAdder for revisit lines with extra + data on end. 
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-12-16 03:00:15
|
Revision: 2671 http://archive-access.svn.sourceforge.net/archive-access/?rev=2671&view=rev Author: binzino Date: 2008-12-16 03:00:10 +0000 (Tue, 16 Dec 2008) Log Message: ----------- Updated documentation for 0.12.3 release. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/HOWTO-dedup.txt trunk/archive-access/projects/nutchwax/archive/HOWTO-xslt.txt trunk/archive-access/projects/nutchwax/archive/INSTALL.txt trunk/archive-access/projects/nutchwax/archive/README.txt trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/HOWTO-pagerank.txt Modified: trunk/archive-access/projects/nutchwax/archive/HOWTO-dedup.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/HOWTO-dedup.txt 2008-12-16 02:59:10 UTC (rev 2670) +++ trunk/archive-access/projects/nutchwax/archive/HOWTO-dedup.txt 2008-12-16 03:00:10 UTC (rev 2671) @@ -157,62 +157,36 @@ ====================================================================== -Index +Index and Index merging ====================================================================== -The only chage we make to the indexing step is the destination of the -index directory. +Perform the index step as normal, yielding an 'indexes' directory. -By default, Nutch expects the per-segment index directory to live in a -sub-directory called 'indexes' and the index command is accordingly +E.g. $ nutch index indexes crawldb linkdb segments/* -Resulting in an index directory structure of the form +Then, merge the 'indexes' directory into a single Lucene index by +invoking the Nutch 'merge' command - indexes/part-00000 + $ nutch merge index indexes -For de-duplication, we use a slightly different directory structure, -which will be used by a de-duplication-aware NutchWaxBean at -search-time. 
The directory structure we use is: - pindexes/<segment>/part-00000 - -Using the segment name is not strictly required, but it is a good -practice and is strongly recommended. This way the segment and its -corresponding index directory are easily matched. - -Let's assume that the segment directory created during the import is -named - - segments/20080703050349 - -In that case, our index command becomes: - - $ nutch index pindexes/20080703050349 crawldb linkdb segments/20080703050349 - -Upon completion, the Lucene index is created in - - pindexes/20080703050349/part-0000 - -This index is exactly the same as one normally created by Nutch, the -only difference is the location. - - ====================================================================== Add Revisit Dates ====================================================================== -Now that we have the Nutch index, we add the revisit dates to it. +Now that we have a single, merged index, we create a "parallel" index +directory which contains the additional revisit dates. 
Examine the "all.dup" file again, it has lines of the form - example.org/robots.txt sha1:4G3PAROKCYJNRGZIHJO5PVLZ724FX3GN 20080618133034 - example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080613194800 - example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080616061312 - example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080618132204 - example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080618132213 - example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080619132911 + example.org/robots.txt sha1:4G3PAROKCYJNRGZIHJO5PVLZ724FX3GN 20080618133034 + example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080613194800 + example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080616061312 + example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080618132204 + example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080618132213 + example.org/robots.txt sha1:AGW5DJIEUBL67473477TDVBBGDZ37AEZ 20080619132911 These are the revisit dates that need to be added to the records in the Lucene index. When we generated the index, only the date of the @@ -220,35 +194,47 @@ As explained in README-dedup.txt, modifying the Lucene index to actually add these dates is infeasible. What we do is create a -parallel index next to the main index (the part-00000 created above) -that contains all the dates for each record. +parallel index next to the merged index that contains all the dates +for each record. The NutchWAX 'add-dates' command creates this parallel index for us. - $ nutchwax add-dates pindexes/20080703050349/part-0000 \ - pindexes/20080703050349/part-0000 \ - pindexes/20080703050349/dates \ + $ nutchwax add-dates index \ + index \ + dates \ all.dup -Yes, the part-0000 argument does appear twice. This is beacuse it is +Yes, the 'index' argument does appear twice. This is because it is both the "key" index and the "source" index. 
- Suppose we did another crawl and had even more dates to add to the existing index. In that case we would run - $ nutchwax add-dates pindexes/20080703050349/part-0000 \ - pindexes/20080703050349/dates \ - pindexes/20080703050349/new-dates \ + $ nutchwax add-dates index \ + dates \ + new-dates \ new-crawl.dup - $ rm -r pindexes/20080703050349/dates - $ mv pindexes/20080703050349/new-dates pindexes/20080703050349/dates + $ rm -r dates + $ mv new-dates dates This copies the existing dates from "dates" to "new-dates" and adds additional ones from "new-crawl.dup" along the way. Then we replace the previous "dates" index with the new one. +Now, Nutch doesn't know what to do with the extra 'dates' parallel +index, but NutchWAX does and it requires them to be arranged +in a directory structure of the following form: + pindexes/<name>/dates + /index + +Where "name" is any name of your choosing. For example, + + $ mkdir -p pindexes/200812180000 + $ mv dates pindexes/200812180000/ + $ mv index pindexes/200812180000/ + + WARC ---- This step is the same for ARCs and WARCs. 
@@ -318,6 +304,8 @@ <listener> <listener-class>org.apache.nutch.searcher.NutchBean$NutchBeanConstructor</listener-class> + </listener> + <listener> <listener-class>org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor</listener-class> </listener> Added: trunk/archive-access/projects/nutchwax/archive/HOWTO-pagerank.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/HOWTO-pagerank.txt (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/HOWTO-pagerank.txt 2008-12-16 03:00:10 UTC (rev 2671) @@ -0,0 +1,129 @@ + +HOWTO-pagerank.txt +2008-12-18 +Aaron Binns + +Table of Contents + o Prerequisites + o Overview + o Generate PageRank + o PageRank Scoring and Boosting + o Configuration and Indexing + + +====================================================================== +Prerequisites +====================================================================== + +This HOWTO assumes you've already read the main NutchWAX HOWTO and are +familiar with importing and indexing archive files with NutchWAX. + +Also, we assume that you are familiar with deploying the Nutch(WAX) +web application into a servlet container such as Tomcat. + + +====================================================================== +Overview +====================================================================== + +NutchWAX provides a pair of tools for extracting and utilizing +simplistic "page rank" information for scoring and sorting documents +in the full-text search index. + +Nutch's 'invertlinks' step inverts links and stores them in the +'linkdb' directory. We use the inlinks to boost the Lucene score of +documents in proportion to the number of inlinks. 
+ + +====================================================================== +Generate PageRank +====================================================================== + +After the Nutch 'invertlinks' step is performed, run the NutchWAX +'pagerank' command to extract inlink information from the 'linkdb'. + +For example + + $ nutch invertlinks linkdb -dir segments + $ nutchwax pagerank pagerank.txt linkdb + +The resulting "pagerank.txt" file is a simple text file containing +a count of the number of inlinks followed by the URL. + + $ sort -n pagerank.txt | tail + 367762 http://informe.presidencia.gob.mx/ + 367809 http://comovamos.presidencia.gob.mx/ + 367852 http://ocho.presidencia.gob.mx/ + 372681 http://www.gob.mx/ + 398073 http://pnd.presidencia.gob.mx/ + 399321 http://zedillo.presidencia.gob.mx/ + 496993 http://www.google-analytics.com/urchin.js + 702448 http://www.elbalero.gob.mx/ + 703517 http://www.mexicoenlinea.gob.mx/ + 764195 http://www.brasil.gov.br + +In the above example, the most linked-to URL has 764195 inlinks. + + +====================================================================== +PageRank Scoring and Boosting +====================================================================== + +During indexing, the NutchWAX PageRankScoringFilter uses the page rank +information to boost the Lucene document's score in proportion to the +number of inlinks. + +The formula used for boosting the Lucene document score is a simple +log10()-based calculation + + boost = log10( # inlinks ) + 1 + +In Lucene, the boost is a multiplier where a boost of 1.0 means "no +change" or "no boost" for the document score. By default, all +documents have a boost of 1.0 unless a scoring filter changes it. + +Thus, we add 1 to the log10() value so that our boost scores start at +1.0 and go up from there. + +The use of log10() gives us a linear boost based on the order of +magnitude of the number of inlinks.
Consider the following boost +scores as determined by our formula: + + # inlinks boost + 1 1.00 + 10 2.00 + 82 2.91 + 100 3.00 + 532 3.72 + 1000 4.00 + 14892 5.17 + +A document with 1000 inlinks will have its score boosted 4x compared +to a document with 1 inlink. + + +====================================================================== +Configuration and Indexing +====================================================================== + +To use the PageRankScoringFilter during indexing, replace the Nutch +OPIC scoring filter in the Nutch(WAX) configuration: + +nutch-site.xml + <property> + <name>plugin.includes</name> + <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + </property> + +Where we change 'scoring-opic' to 'scoring-nutchwax'. + +Then, when we invoke the Nutch(WAX) 'index' command, we specify the +location of the page rank file. For example, + + $ nutch index \ + -Dnutchwax.scoringfilter.pagerank.ranks=pagerank.txt \ + indexes \ + linkdb \ + crawldb \ + segments/* + Modified: trunk/archive-access/projects/nutchwax/archive/HOWTO-xslt.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/HOWTO-xslt.txt 2008-12-16 02:59:10 UTC (rev 2670) +++ trunk/archive-access/projects/nutchwax/archive/HOWTO-xslt.txt 2008-12-16 03:00:10 UTC (rev 2671) @@ -1,13 +1,15 @@ HOWTO-xslt.txt -2008-07-25 +2008-12-18 Aaron Binns Table of Contents o Prerequisites - NutchWAX HOWTO.txt o Overview + o NutchWAX OpenSearchServlet o XSLTFilter and web.xml + o Sample ====================================================================== @@ -31,9 +33,10 @@ Servlet : OpenSearchServlet If you read the OpenSearchServlet.java source code and the search.jsp -page, you'll notice a lot of similarity, if not duplication of code. +page, you'll notice a lot of similarity, if not outright duplication +of code.
-The Internet Archive Web Team plans to improve and expand upon the +The Internet Archive Web Team has improved and expanded upon the existing OpenSearchServlet interface as well as adding more XML-based capabilities, including replacements for the existing JSP pages. In short, moving away from JSP and toward XML. @@ -48,6 +51,21 @@ ====================================================================== +NutchWAX OpenSearchServlet +====================================================================== + +NutchWAX contains an enhanced OpenSearch servlet which is a drop-in +replacement for the default Nutch OpenSearch servlet. To use the +NutchWAX implementation, modify the 'web.xml' + +from: + <servlet-class>org.apache.nutch.searcher.OpenSearchServlet</servlet-class> + +to: + <servlet-class>org.archive.nutchwax.OpenSearchServlet</servlet-class> + + +====================================================================== XSLTFilter and web.xml ====================================================================== @@ -55,11 +73,11 @@ OpenSearchServlet is straightforward. Simply add the XSLTFilter to the servlet's path and specify the XSL transform to apply. 
-For example, consider the default Nutch web.xml +For example, consider the default NutchWAX web.xml <servlet> <servlet-name>OpenSearch</servlet-name> - <servlet-class>org.apache.nutch.searcher.OpenSearchServlet</servlet-class> + <servlet-class>org.archive.nutchwax.OpenSearchServlet</servlet-class> </servlet> <servlet-mapping> @@ -68,13 +86,13 @@ </servlet-mapping> Let's say we want to retain the '/opensearch' path for the XML output, -and add the human-friendly HTML page at '/coolsearch' +and add the human-friendly HTML page at '/search' First, we add an additional 'servlet-mapping' for our new path: <servlet-mapping> <servlet-name>OpenSearch</servlet-name> - <url-pattern>/coolsearch</url-pattern> + <url-pattern>/search</url-pattern> </servlet-mapping> Then, we add the XSLTFilter, passing it a URL to the XSLT file @@ -93,7 +111,7 @@ <filter-mapping> <filter-name>XSLT Filter</filter-name> - <url-pattern>/coolsearch</url-pattern> + <url-pattern>/search</url-pattern> </filter-mapping> This way, we have two URLs, which run the exact same @@ -101,11 +119,11 @@ output whereas the other produces human-friendly HTML output. 
OpenSearch XML : http://someserver/opensearch?query=foo - Human-friendly HTML : http://someserver/coolsearch?query=foo + Human-friendly HTML : http://someserver/search?query=foo ====================================================================== -Samples +Sample ====================================================================== You can find sample 'web.xml' and 'search.xsl' files in Modified: trunk/archive-access/projects/nutchwax/archive/INSTALL.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2008-12-16 02:59:10 UTC (rev 2670) +++ trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2008-12-16 03:00:10 UTC (rev 2671) @@ -1,6 +1,6 @@ INSTALL.txt -2008-10-01 +2008-12-18 Aaron Binns This installation guide assumes the reader is already familiar with @@ -43,7 +43,7 @@ ------------- As mentioned above, NutchWAX 0.12 is built against Nutch-1.0-dev. Nutch doesn't have a 1.0 release package yet, so we have to use the -Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12.2 is +Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12.3 is built against is: 701524 Modified: trunk/archive-access/projects/nutchwax/archive/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/README.txt 2008-12-16 02:59:10 UTC (rev 2670) +++ trunk/archive-access/projects/nutchwax/archive/README.txt 2008-12-16 03:00:10 UTC (rev 2671) @@ -1,9 +1,9 @@ README.txt -2008-10-01 +2008-12-18 Aaron Binns -Welcome to NutchWAX 0.12.2! +Welcome to NutchWAX 0.12.3! NutchWAX is a set of add-ons to Nutch in order to index and search archived web data. @@ -60,6 +60,15 @@ Filtering plugin which can be used to exclude URLs from import. It can be used as part of a NutchWAX de-duplication scheme. 
+ plugins/scoring-nutchwax + + Scoring plugin for use at index-time which reads from an external + "pagerank.txt" file for scoring documents based on the log10 of the + number of inlinks to a document. + + The use of this plugin is optional but can improve the quality of + search results, especially for very large collections. + conf/nutch-site.xml Sample configuration properties file showing suggested settings for @@ -131,6 +140,4 @@ contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); ... - ====================================================================== - Modified: trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2008-12-16 02:59:10 UTC (rev 2670) +++ trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2008-12-16 03:00:10 UTC (rev 2671) @@ -1,9 +1,9 @@ RELEASE-NOTES.TXT -2008-10-13 +2008-12-18 Aaron Binns -Release notes for NutchWAX 0.12.2 +Release notes for NutchWAX 0.12.3 For the most recent updates and information on NutchWAX, please visit the project wiki at: @@ -15,9 +15,14 @@ Overview ====================================================================== -NutchWAX 0.12.2 contains some minor enhancements and fixes to NutchWAX -0.12.1. +NutchWAX 0.12.3 contains numerous enhancements and fixes to 0.12.2 + o PageRank calculation and scoring + o Enhanced OpenSearchServlet + o Improved XSLT sample for OpenSearch + o System init.d script for searcher slaves + o Enhanced searcher slave aware of NutchWAX extensions + ====================================================================== Issues ====================================================================== @@ -28,23 +33,6 @@ Issues resolved in this release: -WAX-19 - Add strict/loose option to DateAdder for revisit lines with extra - data on end. - -WAX-21 - Allow for blank lines and comment lines in manifest file. 
- -WAX-22 - Various code clean-ups based on code review using PMD tool. - -WAX-23 - Add a "field setter" filter to set a field to a static value in the - Lucene document during indexing. - -WAX-24 - DateAdder fails due to uncaught exception in URL canonicalization - -WAX-25 - Add utility/tool to dump unique values of a field in an index. - +WAX-26 + Add XML elements containing all search URL params for self-link + generation This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-12-16 06:24:10
|
Revision: 2673 http://archive-access.svn.sourceforge.net/archive-access/?rev=2673&view=rev Author: binzino Date: 2008-12-16 06:24:01 +0000 (Tue, 16 Dec 2008) Log Message: ----------- Moved conf sub-dir so that it's automatically copied over into Nutch directory during build. This way the NutchWAX extensions are automatically included in the Nutch build. Operators/users don't have to do hand-editing of Nutch conf files to get NutchWAX enhancements. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/ Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/conf/ Property changes on: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf ___________________________________________________________________ Added: svn:mergeinfo + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-12-18 18:37:45
|
Revision: 2678 http://archive-access.svn.sourceforge.net/archive-access/?rev=2678&view=rev Author: binzino Date: 2008-12-18 18:37:40 +0000 (Thu, 18 Dec 2008) Log Message: ----------- Updated documenation for 0.12.3 release. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/HOWTO-pagerank.txt trunk/archive-access/projects/nutchwax/archive/HOWTO.txt trunk/archive-access/projects/nutchwax/archive/INSTALL.txt trunk/archive-access/projects/nutchwax/archive/README.txt trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/BUILD-NOTES.txt Added: trunk/archive-access/projects/nutchwax/archive/BUILD-NOTES.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/BUILD-NOTES.txt (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/BUILD-NOTES.txt 2008-12-18 18:37:40 UTC (rev 2678) @@ -0,0 +1,392 @@ + +BUILD-NOTES.txt +2008-12-18 +Aaron Binns + +====================================================================== +Build notes +====================================================================== + +This document contains supplemental notes regarding the NutchWAX +build, expanding upon the information found in the various READMEs and +HOWTOs. + +====================================================================== + +This 0.12.x release of NutchWAX is radically different in source-code +form compared to the previous release, 0.10. + +One of the design goals of 0.12.x was to reduce or even eliminate the +"copy/paste/edit" approach of 0.10. The 0.10 (and prior) NutchWAX +releases had to copy/paste/edit large chunks of Nutch source code in +order to add the NutchWAX features. + +Also, the NutchWAX 0.12.x sources and build are designed to one day be +added into mainline Nutch as a proper "contrib" package; then +eventually be fully integrated into the core Nutch source code. 
+ +====================================================================== + +Most of the NutchWAX source code is relatively straightforward to those +already familiar with the inner workings of Nutch. Still, special +attention on one class is worthwhile: + + src/java/org/archive/nutchwax/Importer.java + +This is where ARC/WARC files are read and their documents are imported +into a Nutch segment. + +It is inspired by: + + nutch/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java + +on the Nutch SVN head. + +Our implementation differs in a few important ways: + + o Rather than taking a directory with ARC files as input, we take a + manifest file with URLs to ARC files. This way, the manifest is + split up among the distributed Hadoop jobs and the ARC files are + processed in whole by each worker. + + In the Nutch SVN, the ArcSegmentCreator.java expects the input + directory to contain the ARC files and (AFAICT) splits them up and + distributes them across the Hadoop workers. + + o We use the standard Internet Archive ARCReader and WARCReader + classes. Thus, NutchWAX can read both ARC and WARC files, whereas + the ArcSegmentCreator class can only read ARC files. + + o We add metadata fields to the document, which are then available + to the "index-nutchwax" plugin at indexing-time. + + Importer.importRecord() + ... + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); + contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); + contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); + contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); + ... + + +====================================================================== +Patching +====================================================================== + +When NutchWAX is built, a number of patches are automatically applied +to the Nutch source and configuration files.
+ +---------------------------------------------------------------------- +The file + + /opt/nutchwax-0.12.3/conf/tika-mimetypes.xml + +contains two errors: one where a mimetype is referenced before it is +defined; and a second where a definition has an illegal character. + +These errors cause Nutch to not recognize certain mimetypes and +therefore will ignore documents matching those mimetypes. + +There are two fixes: + + 1. Move + + <mime-type type="application/xml"> + <alias type="text/xml" /> + <glob pattern="*.xml" /> + </mime-type> + + definition higher up in the file, before the reference to it. + + 2. Remove + + <mime-type type="application/x-ms-dos-executable"> + <alias type="application/x-dosexec;exe" /> + </mime-type> + + as the ';' character is illegal according to the comments in the + Nutch code. + +You can either apply these patches yourself, or copy an already-patched +copy from: + + /opt/nutchwax-0.12.3/contrib/archive/conf/tika-mimetypes.xml + +to + + /opt/nutchwax-0.12.3/conf/tika-mimetypes.xml + +---------------------------------------------------------------------- + +In the file 'conf/nutch-site.xml' we define some properties to +over-ride the values in 'conf/nutch-default.xml'. + +-------------------------------------------------- +plugin.includes +-------------------------------------------------- +Change the list of plugins from: + + protocol-http|urlfilter-regex|parse-(text|html|js)|index-(basic|anchor)|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic) + +to + + protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax + +In short, we add: + + index-nutchwax + query-nutchwax + urlfilter-nutchwax + parse-pdf + +and remove: + + urlfilter-regex + urlnormalizer-(pass|regex|basic) + +The only *required* changes are the additions of the NutchWAX index +and query plugins. The rest are optional, but recommended. 
+ +The "parse-pdf" plugin is added simply because we have lots of PDFs in +our archives and we want to index them. We sometimes remove the +"parse-js" plugin if we don't care to index JavaScript files. + +We also remove the default Nutch URL filtering and normalizing plugins +because we do not need the URLs normalized nor filtered. We trust +that the tool that produced the ARC/WARC file will have normalized the +URLs contained therein according to its own rules so there's no need +to normalize here. Also, we don't filter by URL since we want to +index as much of the ARC/WARC file as we have parsers for. + +We do, however, add the NutchWAX URL filter. If de-duplication is +being performed upon import, this plugin is required. It performs URL +filtering of the list of ARC records to exclude based on +URL+digest+date. + +-------------------------------------------------- +indexingfilter.order +-------------------------------------------------- + +Add this property with a value of + + org.apache.nutch.indexer.basic.BasicIndexingFilter + org.archive.nutchwax.index.ConfigurableIndexingFilter + +So that the NutchWAX indexing filter is run after the Nutch basic +indexing filter. + +A full explanation is given in "README-dedup.txt". + +-------------------------------------------------- +mime.type.magic +-------------------------------------------------- +We disable mimetype detection in Nutch for two reasons: + +1. The ARC/WARC file specifies the Content-Type of the document. We + trust that the tool that created the ARC/WARC file got it right. + +2. The implementation in Nutch can use a lot of memory as the *entire* + document is read into memory as a byte[], then converted to a + String, then checked against the MIME database. This can lead to + out of memory errors for large files, such as music and video. + +To disable, simply set the property value to false. 
+ + <property> + <name>mime.type.magic</name> + <value>false</value> + </property> + +-------------------------------------------------- +nutchwax.filter.index +-------------------------------------------------- +Configure the 'index-nutchwax' plugin. Specify how the metadata +fields added by the Importer are mapped to the Lucene documents during +indexing. + +The specifications here are of the form: + + src-key:lowercase:store:tokenize:exclusive:dest-key + +where the only required part is the "src-key", the rest will assume +the following defaults: + + lowercase = true + store = true + tokenize = false + exclusive = true + dest-key = src-key + +We recommend: + +<property> + <name>nutchwax.filter.index</name> + <value> + url:false:true:true + url:false:true:false:true:exacturl + orig:false + digest:false + filename:false + fileoffset:false + collection + date + type + length + </value> +</property> + +The "url", "orig" and "digest" values are required, the rest are +optional, but strongly recommended. + +-------------------------------------------------- +nutchwax.filter.query +-------------------------------------------------- +Configure the 'query-nutchwax' plugin. Specify which fields to make +searchable via "field:[term|phrase]" query syntax, and whether they +are "raw" fields or not. + +The specification format is one of: + + field:<name>:<boost> + raw:<name>:<lowercase>:<boost> + group:<name>:<lowercase>:<delimiter>:<boost> + +Default values are + + lowercase = true + delimiter = "," + boost = 1.0f + +There is no "lowercase" property for "field" specification because the +Nutch FieldQueryFilter doesn't expose the option, unlike the +RawFieldQueryFilter. + +The "group" fields are raw fields that can accept multiple values, +separated by a delimiter.
Multiple values appearing in a query are +automagically translated into required OR-groups, such as + + collection:"193,221,36" => +(collection:193 collection:221 collection:36) + +NOTE: We do *not* use this filter for handling "date" queries, there +is a specific filter for that: DateQueryFilter + +We recommend: + +<property> + <name>nutchwax.filter.query</name> + <value> + raw:digest:false + raw:filename:false + raw:fileoffset:false + raw:exacturl:false + group:collection + group:type + field:anchor + field:content + field:host + field:title + </value> +</property> + + +-------------------------------------------------- +nutchwax.urlfilter.wayback.exclusions +-------------------------------------------------- +File containing the exclusion list for importing. + +Normally, this is specified on the command line when the NutchWAX +Importer is invoked. It can be specified here if preferred. + +-------------------------------------------------- +nutchwax.urlfilter.wayback.canonicalizer +-------------------------------------------------- + +For CDX-based de-duplication, the same URL canonicalization algorithm +must be used here as was used to generate the CDX files. + +The default canonicalizer in Wayback's '(w)arc-indexer' utility +is + + org.archive.wayback.util.url.AggressiveUrlCanonicalizer + +which is the value provided in "nutch-site.xml". + +If the '(w)arc-indexer' is executed with the "-i" (identity) +command-line option, then the matching canonicalizer + + org.archive.wayback.util.url.IdentityUrlCanonicalizer + +must be specified here. + +-------------------------------------------------- +nutchwax.filter.http.status +-------------------------------------------------- +This property configures a filter with a list of ranges +of HTTP status codes to allow. + +Typically, most NutchWAX implementors do not wish to import and index +404, 500, 302 and other non-success pages.
This is an inclusion +filter, meaning that only ARC records with an HTTP status code +matching any of the values will be imported. + +There is a special "unknown" value which can be used to include ARC +records that don't have an HTTP status code (for whatever reason). + +The default setting provided in nutch-site.xml is to allow any 2XX +success code: + + <property> + <name>nutchwax.filter.http.status</name> + <value> + 200-299 + </value> + </property> + +But some other examples are: + + Allow any 2XX success code *and* redirects, use: + <property> + <name>nutchwax.filter.http.status</name> + <value> + 200-299 + 300-399 + </value> + </property> + + Be really strict about only certain codes, use: + <property> + <name>nutchwax.filter.http.status</name> + <value> + 200 + 301 + 302 + 304 + </value> + </property> + + Mix of ranges and specific codes, including the "unknown" + <property> + <name>nutchwax.filter.http.status</name> + <value> + Unknown + 200 + 300-399 + </value> + </property> + +-------------------------------------------------- +nutchwax.import.content.limit +-------------------------------------------------- +Similar to Nutch's + + file.content.limit + http.content.limit + ftp.content.limit + +properties, this specifies a limit on the size of a document imported +via NutchWAX. + +We recommend setting this to a size compatible with the memory +capacity of the computers performing the import. Something in the +1-4MB range is typical. + Modified: trunk/archive-access/projects/nutchwax/archive/HOWTO-pagerank.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/HOWTO-pagerank.txt 2008-12-16 19:53:25 UTC (rev 2677) +++ trunk/archive-access/projects/nutchwax/archive/HOWTO-pagerank.txt 2008-12-18 18:37:40 UTC (rev 2678) @@ -31,7 +31,7 @@ in the full-text search index. Nutch's 'invertlinks' step inverts links and stores them in the -'linkdb' directory. 
We use the inlinks to boost the Lucene score of +'linkdb' directory. We use these inlinks to boost the Lucene score of documents in proportion to the number of inlinks. Modified: trunk/archive-access/projects/nutchwax/archive/HOWTO.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/HOWTO.txt 2008-12-16 19:53:25 UTC (rev 2677) +++ trunk/archive-access/projects/nutchwax/archive/HOWTO.txt 2008-12-18 18:37:40 UTC (rev 2678) @@ -5,9 +5,8 @@ Table of Contents o Prerequisites - - Nutch(WAX) installation + - NutchWAX installation - ARC/WARC files - o Configuration & Patching o Create a manifest o Import, Invert and Index o Search @@ -27,7 +26,7 @@ This HOWTO assumes it is installed in - /opt/nutch-1.0-dev + /opt/nutchwax-0.12.3 2. ARC/WARC files. @@ -40,348 +39,6 @@ ====================================================================== -Patching -====================================================================== - -The vanilla NutchWAX as built according to the INSTALL.txt guide is -not quite ready to be used out-of-the-box. - -Before you can use NutchWAX, you must first patch a bug that exists in -the current Nutch SVN head. - -The file - - /opt/nutch-1.0-dev/conf/tika-mimetypes.xml - -contains two errors: one where a mimetype is referenced before it is -defined; and a second where a definition has an illegal character. - -These errors cause Nutch to not recognize certain mimetypes and -therefore will ignore documents matching those mimetypes. - -There are two fixes: - - 1. Move - - <mime-type type="application/xml"> - <alias type="text/xml" /> - <glob pattern="*.xml" /> - </mime-type> - - definition higher up in the file, before the reference to it. - - 2. Remove - - <mime-type type="application/x-ms-dos-executable"> - <alias type="application/x-dosexec;exe" /> - </mime-type> - - as the ';' character is illegal according to the comments in the - Nutch code. 
- -You can either apply these patches yourself, or copy an already-patched -copy from: - - /opt/nutch-1.0-dev/contrib/archive/conf/tika-mimetypes.xml - -to - - /opt/nutch-1.0-dev/conf/tika-mimetypes.xml - - -====================================================================== -Configuring -====================================================================== - -Since we assume that you are already familiar with Nutch, then you -should already be familiar with configuring it. The configuration -is mainly defined in - - /opt/nutch-1.0-dev/conf/nutch-default.xml - -NutchWAX requires the modification of two existing properties and the -addition of two new ones. - -All of the modifications described below can be found in: - - /opt/nutch-1.0-dev/contrib/archive/conf/nutch-site.xml - -You can either apply the configuration changes yourself, or copy that -file to - - /opt/nutch-1.0-dev/conf/nutch-site.xml - --------------------------------------------------- -plugin.includes --------------------------------------------------- -Change the list of plugins from: - - protocol-http|urlfilter-regex|parse-(text|html|js)|index-(basic|anchor)|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic) - -to - - protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax - -In short, we add: - - index-nutchwax - query-nutchwax - urlfilter-nutchwax - parse-pdf - -and remove: - - urlfilter-regex - urlnormalizer-(pass|regex|basic) - -The only *required* changes are the additions of the NutchWAX index -and query plugins. The rest are optional, but recommended. - -The "parse-pdf" plugin is added simply because we have lots of PDFs in -our archives and we want to index them. We sometimes remove the -"parse-js" plugin if we don't care to index JavaScript files. 
- -We also remove the default Nutch URL filtering and normalizing plugins -because we do not need the URLs normalized nor filtered. We trust -that the tool that produced the ARC/WARC file will have normalized the -URLs contained therein according to its own rules so there's no need -to normalize here. Also, we don't filter by URL since we want to -index as much of the ARC/WARC file as we have parsers for. - -We do, however, add the NutchWAX URL filter. If de-duplication is -being performed upon import, this plugin is required. It performs URL -filtering of the list of ARC records to exclude based on -URL+digest+date. - --------------------------------------------------- -indexingfilter.order --------------------------------------------------- - -Add this property with a value of - - org.apache.nutch.indexer.basic.BasicIndexingFilter - org.archive.nutchwax.index.ConfigurableIndexingFilter - -So that the NutchWAX indexing filter is run after the Nutch basic -indexing filter. - -A full explanation is given in "README-dedup.txt". - --------------------------------------------------- -mime.type.magic --------------------------------------------------- -We disable mimetype detection in Nutch for two reasons: - -1. The ARC/WARC file specifies the Content-Type of the document. We - trust that the tool that created the ARC/WARC file got it right. - -2. The implementation in Nutch can use a lot of memory as the *entire* - document is read into memory as a byte[], then converted to a - String, then checked against the MIME database. This can lead to - out of memory errors for large files, such as music and video. - -To disable, simply set the property value to false. - - <property> - <name>mime.type.magic</name> - <value>false</value> - </property> - --------------------------------------------------- -nutchwax.filter.index --------------------------------------------------- -Configure the 'index-nutchwax' plugin. 
Specify how the metadata -fields added by the Importer are mapped to the Lucene documents during -indexing. - -The specifications here are of the form: - - src-key:lowercase:store:tokenize:exclusive:dest-key - -where the only required part is the "src-key", the rest will assume -the following defaults: - - lowercase = true - store = true - tokenize = false - exclusive = true - dest-key = src-key - -We recommend: - -<property> - <name>nutchwax.filter.index</name> - <value> - url:false:true:true - url:false:true:false:true:exacturl - orig:false - digest:false - filename:false - fileoffset:false - collection - date - type - length - </value> -</property> - -The "url", "orig" and "digest" values are required, the rest are -optional, but strongly recommended. - --------------------------------------------------- -nutchwax.filter.query --------------------------------------------------- -Configure the 'query-nutchwax' plugin. Specify which fields to make -searchable via "field:[term|phrase]" query syntax, and whether they -are "raw" fields or not. - -The specification format is one of: - - field:<name>:<boost> - raw:<name>:<lowercase>:<boost> - group:<name>:<lowercase>:<delimiter>:<boost> - -Default values are - - lowercase = true - delimiter = "," - boost = 1.0f - -There is no "lowercase" property for "field" specification because the -Nutch FieldQueryFilter doesn't expose the option, unlike the -RawFieldQueryFilter. - -The "group" fields are raw fields that can accept multiple values, -separated by a delimiter.
Multiple values appearing in a query are -automagically translated into required OR-groups, such as - - collection:"193,221,36" => +(collection:193 collection:221 collection:36) - -NOTE: We do *not* use this filter for handling "date" queries, there -is a specific filter for that: DateQueryFilter - -We recommend: - -<property> - <name>nutchwax.filter.query</name> - <value> - raw:digest:false - raw:filename:false - raw:fileoffset:false - raw:exacturl:false - group:collection - group:type - field:anchor - field:content - field:host - field:title - </value> -</property> - - --------------------------------------------------- -nutchwax.urlfilter.wayback.exclusions --------------------------------------------------- -File containing the exclusion list for importing. - -Normally, this is specified on the command line when the NutchWAX -Importer is invoked. It can be specified here if preferred. - --------------------------------------------------- -nutchwax.urlfilter.wayback.canonicalizer --------------------------------------------------- - -For CDX-based de-duplication, the same URL canonicalization algorithm -must be used here as was used to generate the CDX files. - -The default canonicalizer in Wayback's '(w)arc-indexer' utility -is - - org.archive.wayback.util.url.AggressiveUrlCanonicalizer - -which is the value provided in "nutch-site.xml". - -If the '(w)arc-indexer' is executed with the "-i" (identity) -command-line option, then the matching canonicalizer - - org.archive.wayback.util.url.IdentityUrlCanonicalizer - -must be specified here. - --------------------------------------------------- -nutchwax.filter.http.status --------------------------------------------------- -This property configures a filter with a list of ranges -of HTTP status codes to allow. - -Typically, most NutchWAX implementors do not wish to import and index -404, 500, 302 and other non-success pages.
This is an inclusion -filter, meaning that only ARC records with an HTTP status code -matching any of the values will be imported. - -There is a special "unknown" value which can be used to include ARC -records that don't have an HTTP status code (for whatever reason). - -The default setting provided in nutch-site.xml is to allow any 2XX -success code: - - <property> - <name>nutchwax.filter.http.status</name> - <value> - 200-299 - </value> - </property> - -But some other examples are: - - Allow any 2XX success code *and* redirects, use: - <property> - <name>nutchwax.filter.http.status</name> - <value> - 200-299 - 300-399 - </value> - </property> - - Be really strict about only certain codes, use: - <property> - <name>nutchwax.filter.http.status</name> - <value> - 200 - 301 - 302 - 304 - </value> - </property> - - Mix of ranges and specific codes, including the "unknown" - <property> - <name>nutchwax.filter.http.status</name> - <value> - Unknown - 200 - 300-399 - </value> - </property> - --------------------------------------------------- -nutchwax.import.content.limit --------------------------------------------------- -Similar to Nutch's - - file.content.limit - http.content.limit - ftp.content.limit - -properties, this specifies a limit on the size of a document imported -via NutchWAX. - -We recommend setting this to a size compatible with the memory -capacity of the computers performing the import. Something in the -1-4MB range is typical. 
- - -====================================================================== Create a manifest ====================================================================== @@ -411,10 +68,10 @@ $ mkdir crawl $ cd crawl - $ /opt/nutch-1.0-dev/bin/nutchwax import ../manifest - $ /opt/nutch-1.0-dev/bin/nutch updatedb crawldb -dir segments - $ /opt/nutch-1.0-dev/bin/nutch invertlinks linkdb -dir segments - $ /opt/nutch-1.0-dev/bin/nutch index indexes crawldb linkdb segments/* + $ /opt/nutchwax-0.12.3/bin/nutchwax import ../manifest + $ /opt/nutchwax-0.12.3/bin/nutch updatedb crawldb -dir segments + $ /opt/nutchwax-0.12.3/bin/nutch invertlinks linkdb -dir segments + $ /opt/nutchwax-0.12.3/bin/nutch index indexes crawldb linkdb segments/* $ ls -F1 crawldb/ indexes/ @@ -439,7 +96,7 @@ $ cd ../ $ ls -F1 crawl/ - $ /opt/nutch-1.0-dev/bin/nutch org.apache.nutch.searcher.NutchBean computer + $ /opt/nutchwax-0.12.3/bin/nutch org.apache.nutch.searcher.NutchBean computer This calls the NutchBean to execute a simple keyword search for "computer". Use whatever query term you think appears in the @@ -450,17 +107,9 @@ Web Deployment ====================================================================== -As users of Nutch are aware, the web application (nutch-1.0-dev.war) -bundled with Nutch contains duplicate copies of the configuration -files. +The Nutch(WAX) web application is bundled with NutchWAX as -So, all patches and configuration changes that we made to the -files in + /opt/nutchwax-0.12.3/nutch-1.0-dev.war - /opt/nutch-1.0-dev/conf - -will have to be duplicated in the Nutch webapp when it is deployed. - -This is not due to NutchWAX, this is a "feature" of regular Nutch. I -just thought it would be good to remind everyone since we did make -configuration changes for NutchWAX. +Simply deploy that web application in the same fashion as with +Nutch. 
Modified: trunk/archive-access/projects/nutchwax/archive/INSTALL.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2008-12-16 19:53:25 UTC (rev 2677) +++ trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2008-12-18 18:37:40 UTC (rev 2678) @@ -3,10 +3,22 @@ 2008-12-18 Aaron Binns +Table of Contents + o Introduction + o Build from source + - SVN: Nutch 1.0-dev + - SVN: NutchWAX + - Build and Install + o Install binary package + + +====================================================================== +Introduction +====================================================================== + This installation guide assumes the reader is already familiar with building, packaging and deploying Nutch 1.0-dev. - The NutchWAX 0.12 source and build system are designed to integrate into the existing Nutch 1.0-dev source and build. @@ -20,12 +32,12 @@ proper, then builds NutchWAX components and integrates them into the Nutch build directory. -We recommend that you execute all build commands from the NutchWAX -directory. This way, NutchWAX will ensure that any and all +In order to build NutchWAX, execute all build commands from the +NutchWAX directory. This way, NutchWAX will ensure that any and all dependencies in Nutch will be properly built and kept up-to-date. Towards this goal, we have duplicated the most common build targets -from the Nutch 'build.xml' file to the NutchWAX 'build.xml' file, -such as: +from the Nutch 'build.xml' file to the NutchWAX 'build.xml' file, such +as: o compile o jar @@ -39,8 +51,15 @@ sub-directory as normal. -Nutch-1.0-dev -------------- +====================================================================== +Build from Source +====================================================================== + +To build from source, you must check-out the Nutch and NutchWAX sources +from their respective 'subversion' source control servers. 
+ +SVN: nutch-1.0-dev ------------------ As mentioned above, NutchWAX 0.12 is built against Nutch-1.0-dev. Nutch doesn't have a 1.0 release package yet, so we have to use the Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12.3 is @@ -53,9 +72,12 @@ $ svn checkout -r 701524 http://svn.apache.org/repos/asf/lucene/nutch/trunk nutch $ cd nutch +Please be sure to check-out this specific version of the Nutch source. +If you just grab the head of the trunk, there may be newer and +incompatible changes to Nutch. -NutchWAX --------- +SVN: NutchWAX +------------- Once you have Nutch-1.0-dev checked-out, check-out NutchWAX into Nutch's "contrib" directory. @@ -65,7 +87,6 @@ This will create a sub-directory named "archive" containing the NutchWAX sources. - Build and install ----------------- Assuming you already have the required tool-set for building Nutch, @@ -91,3 +112,18 @@ $ cd /opt $ tar xvfz nutch-1.0-dev.tar.gz + $ mv nutch-1.0-dev nutchwax-0.12.3 + + +====================================================================== +Install binary package +====================================================================== + +Alternatively, grab a "binary" release package from the Internet +Archive's NutchWAX home page. + +Install it simply by untarring it, for example: + + $ cd /opt + $ tar xvfz nutchwax-0.12.3.tar.gz + Modified: trunk/archive-access/projects/nutchwax/archive/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/README.txt 2008-12-16 19:53:25 UTC (rev 2677) +++ trunk/archive-access/projects/nutchwax/archive/README.txt 2008-12-18 18:37:40 UTC (rev 2678) @@ -3,6 +3,16 @@ 2008-12-18 Aaron Binns +Table of Contents + o Introduction + o Build and Install + o Tutorial + + +====================================================================== +Introduction +====================================================================== + Welcome to NutchWAX 0.12.3!
NutchWAX is a set of add-ons to Nutch in order to index and search @@ -17,7 +27,6 @@ Since NutchWAX is a set of add-ons to Nutch, you should already be familiar with Nutch before using NutchWAX. -====================================================================== The goal of NutchWAX is to enable full-text indexing and searching of documents stored in web archive file formats (ARC and WARC). @@ -26,13 +35,13 @@ to Nutch to read documents directly from ARC/WARC files. We call this process "importing" archive files. -Importing produces a Nutch segment, similar to Nutch crawling the -documents itself. In this scenario, document importing replaces the +Importing produces a Nutch segment, the same as when Nutch is used to +crawl documents itself. In essence, document importing replaces the conventional "generate/fetch/update" cycle of Nutch. Once the archival documents have been imported into a segment, the -regular Nutch commands to update the 'crawldb', invert the links and -index the document contents can proceed as normal. +regular Nutch commands to index the document contents can proceed as +normal. ====================================================================== @@ -71,73 +80,25 @@ conf/nutch-site.xml - Sample configuration properties file showing suggested settings for - Nutch and NutchWAX. + Additional configuration properties for NutchWAX, including + over-rides for properties defined in 'nutch-default.xml' There is no separate 'lib/nutchwax.jar' file for NutchWAX. NutchWAX is distributed in source code form and is intended to be built in conjunction with Nutch. -See "INSTALL.txt" for details on building NutchWAX and Nutch. -See "HOWTO.txt" for a quick tutorial on importing, indexing and -searching a set of documents in a web archive file. - ====================================================================== - -This 0.12.x release of NutchWAX is radically different in source-code -form compared to the previous release, 0.10. 
- -One of the design goals of 0.12.x was to reduce or even eliminate the -"copy/paste/edit" approach of 0.10. The 0.10 (and prior) NutchWAX -releases had to copy/paste/edit large chunks of Nutch source code in -order to add the NutchWAX features. - -Also, the NutchWAX 0.12.x sources and build are designed to one day be -added into mainline Nutch as a proper "contrib" package; then -eventually be fully integrated into the core Nutch source code. - +Build and Install ====================================================================== -Most of the NutchWAX source code is relatively straightforward to those -already familiar with the inner workings of Nutch. Still, special -attention on one class is worthwhile: +See "INSTALL.txt" for detailed instructions to build NutchWAX from +source or install a binary package. - src/java/org/archive/nutchwax/Importer.java -This is where ARC/WARC files are read and their documents are imported -into a Nutch segment. - -It is inspired by: - - nutch/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java - -on the Nutch SVN head. - -Our implementation differs in a few important ways: - - o Rather than taking a directory with ARC files as input, we take a - manifest file with URLs to ARC files. This way, the manifest is - split up among the distributed Hadoop jobs and the ARC files are - processed in whole by each worker. - - In the Nutch SVN, the ArcSegmentCreator.java expects the input - directory to contain the ARC files and (AFAICT) splits them up and - distributes them across the Hadoop workers. - - o We use the standard Internet Archive ARCReader and WARCReader - classes. Thus, NutchWAX can read both ARC and WARC files, whereas - the ArcSegmentCreator class can only read ARC files. - - o We add metadata fields to the document, which are then available - to the "index-nutchwax" plugin at indexing-time. - - Importer.importRecord() - ...
- contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); - contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); - contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); - contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); - ... - ====================================================================== +Tutorial +====================================================================== + +See "HOWTO.txt" for a quick tutorial on importing, indexing and +searching a set of documents in a web archive file. Modified: trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2008-12-16 19:53:25 UTC (rev 2677) +++ trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2008-12-18 18:37:40 UTC (rev 2678) @@ -21,8 +21,45 @@ o Enhanced OpenSearchServlet o Improved XSLT sample for OpenSearch o System init.d script for searcher slaves - o Enhanced searcher slave aware of NutchWAX extensions + o Enhanced searcher slave which supports NutchWAX extensions + +One of the major changes to 0.12.3 is not a feature, enhancement or +bug-fix, but the way the NutchWAX source is "integrated" into the +Nutch source. + +Yes, the NutchWAX source is still kept in the contrib/archive +sub-directory, but when you invoke a build command from the +NutchWAX directory, such as + + $ cd nutch/contrib/archive + $ ant tar + +Many files from the NutchWAX source tree are copied directly into the +Nutch source tree before the build process begins. + +The reason for this is to make NutchWAX easier to use. + +In previous versions of NutchWAX, once 'ant' build command was +finished, the operator had to manually patch configuration files in +the Nutch directory. Upon a subsequent build, the files would be +over-written by Nutch's and would have to be patched again. + +It was a major hassle and complication. 
+ +Another impetus for copying files into the Nutch source was to patch +bugs and make enhancements in the Nutch Java code which couldn't be +effectively done keeping the sources separate. When an 'ant' build +command is run a few Java files are copied from the NutchWAX source +tree into the Nutch source tree. + +In release 0.12.3, the NutchWAX build file: 'build.xml' handles all of +this. Simply execute your build commands from 'contrib/archive' as +instructed in the HOWTO and no longer worry about patching +configuration files. If you wish to alter the NutchWAX configuration +file, make those changes in the NutchWAX source tree. + + ====================================================================== Issues ====================================================================== This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-03-08 21:43:45
|
Revision: 2693 http://archive-access.svn.sourceforge.net/archive-access/?rev=2693&view=rev Author: binzino Date: 2009-03-08 21:43:33 +0000 (Sun, 08 Mar 2009) Log Message: ----------- Updated documentation for 0.12.4 release. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/BUILD-NOTES.txt trunk/archive-access/projects/nutchwax/archive/HOWTO.txt trunk/archive-access/projects/nutchwax/archive/INSTALL.txt trunk/archive-access/projects/nutchwax/archive/README.txt trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt Modified: trunk/archive-access/projects/nutchwax/archive/BUILD-NOTES.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/BUILD-NOTES.txt 2009-03-08 20:44:25 UTC (rev 2692) +++ trunk/archive-access/projects/nutchwax/archive/BUILD-NOTES.txt 2009-03-08 21:43:33 UTC (rev 2693) @@ -79,7 +79,7 @@ ---------------------------------------------------------------------- The file - /opt/nutchwax-0.12.3/conf/tika-mimetypes.xml + /opt/nutchwax-0.12.4/conf/tika-mimetypes.xml contains two errors: one where a mimetype is referenced before it is defined; and a second where a definition has an illegal character. 
@@ -110,11 +110,11 @@ You can either apply these patches yourself, or copy an already-patched copy from: - /opt/nutchwax-0.12.3/contrib/archive/conf/tika-mimetypes.xml + /opt/nutchwax-0.12.4/contrib/archive/conf/tika-mimetypes.xml to - /opt/nutchwax-0.12.3/conf/tika-mimetypes.xml + /opt/nutchwax-0.12.4/conf/tika-mimetypes.xml ---------------------------------------------------------------------- @@ -166,7 +166,6 @@ -------------------------------------------------- indexingfilter.order -------------------------------------------------- - Add this property with a value of org.apache.nutch.indexer.basic.BasicIndexingFilter @@ -300,7 +299,6 @@ -------------------------------------------------- nutchwax.urlfilter.wayback.canonicalizer -------------------------------------------------- - For CDX-based de-duplication, the same URL canonicalization algorithm must be used here as was used to generate the CDX files. @@ -390,3 +388,43 @@ capacity of the computers performing the import. Something in the 1-4MB range is typical. +-------------------------------------------------- +nutchwax.FetchedSegments.perCollection +-------------------------------------------------- +Enable per-collection segment sub-dirs, e.g. + + segments/<collectionId>/segment1 + /segment2 + ... + +Default value: false + +For example, + + <property> + <name>nutchwax.FetchedSegments.perCollection</name> + <value>true</value> + </property> + +-------------------------------------------------- +nutchwax.import.content.store +-------------------------------------------------- +Whether or not we store the full content in the segment's "content" +directory. Most NutchWAX users are also using Wayback to serve the +archived content, so there's no need for NutchWAX to keep a "cached" +copy as well. + +Setting to 'true' yields the same behavior as in previous versions of +NutchWAX, and as in Nutch. The content is stored in the segment's +"content" directory.
+ +Setting to 'false' results in an empty "content" directory in the +segment. The content is not stored. + +Default value is 'false'. + + <property> + <name>nutchwax.import.store.content</name> + <value>false</value> + </property> + Modified: trunk/archive-access/projects/nutchwax/archive/HOWTO.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/HOWTO.txt 2009-03-08 20:44:25 UTC (rev 2692) +++ trunk/archive-access/projects/nutchwax/archive/HOWTO.txt 2009-03-08 21:43:33 UTC (rev 2693) @@ -26,7 +26,7 @@ This HOWTO assumes it is installed in - /opt/nutchwax-0.12.3 + /opt/nutchwax-0.12.4 2. ARC/WARC files. @@ -68,10 +68,10 @@ $ mkdir crawl $ cd crawl - $ /opt/nutchwax-0.12.3/bin/nutchwax import ../manifest - $ /opt/nutchwax-0.12.3/bin/nutch updatedb crawldb -dir segments - $ /opt/nutchwax-0.12.3/bin/nutch invertlinks linkdb -dir segments - $ /opt/nutchwax-0.12.3/bin/nutch index indexes crawldb linkdb segments/* + $ /opt/nutchwax-0.12.4/bin/nutchwax import ../manifest + $ /opt/nutchwax-0.12.4/bin/nutch updatedb crawldb -dir segments + $ /opt/nutchwax-0.12.4/bin/nutch invertlinks linkdb -dir segments + $ /opt/nutchwax-0.12.4/bin/nutch index indexes crawldb linkdb segments/* $ ls -F1 crawldb/ indexes/ @@ -96,7 +96,7 @@ $ cd ../ $ ls -F1 crawl/ - $ /opt/nutchwax-0.12.3/bin/nutch org.apache.nutch.searcher.NutchBean computer + $ /opt/nutchwax-0.12.4/bin/nutch org.apache.nutch.searcher.NutchBean computer This calls the NutchBean to execute a simple keyword search for "computer". Use whatever query term you think appears in the @@ -109,7 +109,7 @@ The Nutch(WAX) web application is bundled with NutchWAX as - /opt/nutchwax-0.12.3/nutch-1.0-dev.war + /opt/nutchwax-0.12.4/nutch-1.0-dev.war Simply deploy that web application in the same fashion as with Nutch. 
Modified: trunk/archive-access/projects/nutchwax/archive/INSTALL.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2009-03-08 20:44:25 UTC (rev 2692) +++ trunk/archive-access/projects/nutchwax/archive/INSTALL.txt 2009-03-08 21:43:33 UTC (rev 2693) @@ -1,6 +1,6 @@ INSTALL.txt -2008-12-18 +2009-03-08 Aaron Binns Table of Contents @@ -10,6 +10,7 @@ - SVN: NutchWAX - Build and Install o Install binary package + o Install start-up scripts ====================================================================== @@ -62,7 +63,7 @@ ------------------ As mentioned above, NutchWAX 0.12 is built against Nutch-1.0-dev. Nutch doesn't have a 1.0 release package yet, so we have to use the -Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12.3 is +Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12.4 is built against is: 701524 @@ -78,14 +79,14 @@ SVN: NutchWAX ------------- -Once you have Nutch-1.0-dev checked-out, check-out NutchWAX into -Nutch's "contrib" directory. +Once you have Nutch-1.0-dev checked-out, check-out NutchWAX 0.12.4 +source into Nutch's "contrib" directory. $ cd contrib - $ svn checkout http://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/nutchwax/archive + $ svn checkout http://archive-access.svn.sourceforge.net/svnroot/archive-access/tags/nutchwax-0_12_4/archive This will create a sub-directory named "archive" containing the -NutchWAX sources. +NutchWAX 0.12.4 sources. 
Build and install ----------------- @@ -112,7 +113,7 @@ $ cd /opt $ tar xvfz nutch-1.0-dev.tar.gz - $ mv nutch-1.0-dev nutchwax-0.12.3 + $ mv nutch-1.0-dev nutchwax-0.12.4 ====================================================================== @@ -125,5 +126,50 @@ Install it simply by untarring it, for example: $ cd /opt - $ tar xvfz nutchwax-0.12.3.tar.gz + $ tar xvfz nutchwax-0.12.4.tar.gz + +====================================================================== +Install start-up scripts +====================================================================== + +NutchWAX 0.12.4 comes with a Unix init.d script which can be used to +automatically start the searcher slaves for a multi-node search +configuration. + +Assuming you installed NutchWAX as + + /opt/nutchwax-0.12.4 + +the script is found at + + /opt/nutchwax-0.12.4/contrib/archive/etc/init.d/searcher-slave + +This script can be placed in /etc/init.d then added to the list of +startup scripts to run at bootup by using commands appropriate to your +Linux distribution. + +You must edit a few of the environment variables defined in the +'searcher-slave' specifying where NutchWAX is installed and where the +index(s) are deployed. In 'searcher-slave' you will find the: + + export NUTCH_HOME=TODO + export DEPLOYMENT_DIR=TODO + +edit those appropriately for your system. + + +The "master" in the multi-node search deployment is the NutchWAX +webapp running in a webapp server, such as Tomcat or Jetty. + +Jetty comes with a start/stop script appropriate for use as an init.d +script, similar to the 'searcher-slave' script described above. If you +use Jetty, create a symlink + + /etc/init.d/jetty.sh -> /opt/jetty/bin/jetty.sh + +Then add this script to the list of startup scripts to run at bootup +by using commands appropriate to your Linux distribution. + +Follow the instructions from Jetty on the deployment of the NutchWAX +webapp (nutch-1.0-dev.war) in the Jetty web application server. 
Modified: trunk/archive-access/projects/nutchwax/archive/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/README.txt 2009-03-08 20:44:25 UTC (rev 2692) +++ trunk/archive-access/projects/nutchwax/archive/README.txt 2009-03-08 21:43:33 UTC (rev 2693) @@ -1,6 +1,6 @@ README.txt -2008-12-18 +2008-03-08 Aaron Binns Table of Contents @@ -13,7 +13,7 @@ Introduction ====================================================================== -Welcome to NutchWAX 0.12.3! +Welcome to NutchWAX 0.12.4! NutchWAX is a set of add-ons to Nutch in order to index and search archived web data. Modified: trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2009-03-08 20:44:25 UTC (rev 2692) +++ trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2009-03-08 21:43:33 UTC (rev 2693) @@ -1,9 +1,9 @@ RELEASE-NOTES.TXT -2008-12-18 +2008-03-08 Aaron Binns -Release notes for NutchWAX 0.12.3 +Release notes for NutchWAX 0.12.4 For the most recent updates and information on NutchWAX, please visit the project wiki at: @@ -15,61 +15,44 @@ Overview ====================================================================== -NutchWAX 0.12.3 contains numerous enhancements and fixes to 0.12.2 +NutchWAX 0.12.4 contains numerous enhancements and fixes to 0.12.3 - o PageRank calculation and scoring - o Enhanced OpenSearchServlet - o Improved XSLT sample for OpenSearch - o System init.d script for searcher slaves - o Enhanced searcher slave which supports NutchWAX extensions + o Option to omit storing of content during import. + o Support for per-collection segments in master/slave config. + o Additional diagnostic/log messages to help troubleshoot common + deployment mistakes. + o PageRankDb similar to LinkDb but only keeping inlink counts. 
+ o Improved paging through results, handling "paging past the end". -One of the major changes to 0.12.3 is not a feature, enhancement or -bug-fix, but the way the NutchWAX source is "integrated" into the -Nutch source. +====================================================================== +Issues +====================================================================== -Yes, the NutchWAX source is still kept in the contrib/archive -sub-directory, but when you invoke a build command from the -NutchWAX directory, such as +For an up-to-date list of NutchWAX issues: - $ cd nutch/contrib/archive - $ ant tar + http://webteam.archive.org/jira/browse/WAX -Many files from the NutchWAX source tree are copied directly into the -Nutch source tree before the build process begins. +Issues resolved in this release: -The reason for this is to make NutchWAX easier to use. +WAX-27 Sensible output for requesting page of results past the end. -In previous versions of NutchWAX, once 'ant' build command was -finished, the operator had to manually patch configuration files in -the Nutch directory. Upon a subsequent build, the files would be -over-written by Nutch's and would have to be patched again. +WAX-34 Add option to omit storing of content in segment -It was a major hassle and complication. +WAX-35 Add pagerankdb similar to linkdb but which only keeps counts + rather than actual inlinks. -Another impetus for copying files into the Nutch source was to patch -bugs and make enhancements in the Nutch Java code which couldn't be -effectively done keeping the sources separate. When an 'ant' build -command is run a few Java files are copied from the NutchWAX source -tree into the Nutch source tree. +WAX-36 Some additional diagnostics on connecting results to segments + and snippets would be very helpful. -In release 0.12.3, the NutchWAX build file: 'build.xml' handles all of -this. 
Simply execute your build commands from 'contrib/archive' as -instructed in the HOWTO and no longer worry about patching -configuration files. If you wish to alter the NutchWAX configuration -file, make those changes in the NutchWAX source tree. +WAX-37 Per-collection segments not supported in distributed + master-slave configuration. +WAX-38 Build omits necessary libraries from .job file. -====================================================================== -Issues -====================================================================== +WAX-39 Write more efficient, specialized segment parse_text merging. -For an up-to-date list of NutchWAX issues: - http://webteam.archive.org/jira/browse/WAX -Issues resolved in this release: -WAX-26 - Add XML elements containing all search URL params for self-link - generation + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-05-05 21:46:49
|
Revision: 2703 http://archive-access.svn.sourceforge.net/archive-access/?rev=2703&view=rev Author: binzino Date: 2009-05-05 21:46:40 +0000 (Tue, 05 May 2009) Log Message: ----------- Updated for NutchWAX 0.12.4 release. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/README.txt trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt Modified: trunk/archive-access/projects/nutchwax/archive/README.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/README.txt 2009-05-05 21:44:29 UTC (rev 2702) +++ trunk/archive-access/projects/nutchwax/archive/README.txt 2009-05-05 21:46:40 UTC (rev 2703) @@ -1,6 +1,6 @@ README.txt -2008-03-08 +2009-05-05 Aaron Binns Table of Contents Modified: trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt =================================================================== --- trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2009-05-05 21:44:29 UTC (rev 2702) +++ trunk/archive-access/projects/nutchwax/archive/RELEASE-NOTES.txt 2009-05-05 21:46:40 UTC (rev 2703) @@ -1,6 +1,6 @@ RELEASE-NOTES.TXT -2008-03-08 +2009-05-05 Aaron Binns Release notes for NutchWAX 0.12.4 @@ -52,7 +52,6 @@ WAX-39 Write more efficient, specialized segment parse_text merging. +WAX-41 Option to enable/disable the FIELDCACHE in the Nutch IndexSearcher - - - +WAX-42 Add option to continue importing if an arcfile cannot be read. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-01-29 00:50:29
|
Revision: 2953 http://archive-access.svn.sourceforge.net/archive-access/?rev=2953&view=rev Author: binzino Date: 2010-01-29 00:20:42 +0000 (Fri, 29 Jan 2010) Log Message: ----------- Updated to use NutchBean since NutchWaxBean is deprecated. Also fixed bug in NutchBean not observing the -n parameter. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java Modified: trunk/archive-access/projects/nutchwax/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2010-01-19 22:11:50 UTC (rev 2952) +++ trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2010-01-29 00:20:42 UTC (rev 2953) @@ -80,7 +80,7 @@ ;; search) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.NutchWaxBean "$@" + ${NUTCH_HOME}/bin/nutch org.apache.nutch.searcher.NutchBean "$@" ;; *) echo "" Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java 2010-01-19 22:11:50 UTC (rev 2952) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java 2010-01-29 00:20:42 UTC (rev 2953) @@ -431,10 +431,10 @@ try { final Query query = Query.parse( queryString, conf); - final Hits hits = bean.search(query, 10); + final Hits hits = bean.search(query, numHits); System.out.println( "Total hits : " + hits.getTotal () ); System.out.println( "Hits length: " + hits.getLength() ); - final int length = (int)Math.min(hits.getTotal(), 10); + final int length = (int)Math.min(hits.getTotal(), numHits); final Hit[] show = hits.getHits(0, length); final HitDetails[] details = bean.getDetails(show); 
final Summary[] summaries = bean.getSummary(details, query); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-02-20 03:21:06
|
Revision: 2958 http://archive-access.svn.sourceforge.net/archive-access/?rev=2958&view=rev Author: binzino Date: 2010-02-20 03:20:59 +0000 (Sat, 20 Feb 2010) Log Message: ----------- WAX-72 and WAX-71: Re-did build system. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/build.xml Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/plugin/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/plugin/build.xml Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml Modified: trunk/archive-access/projects/nutchwax/archive/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/build.xml 2010-02-20 03:18:57 UTC (rev 2957) +++ trunk/archive-access/projects/nutchwax/archive/build.xml 2010-02-20 03:20:59 UTC (rev 2958) @@ -25,81 +25,52 @@ <!-- HACK: Need to import default.properties like Nutch does --> <property name="final.name" value="nutch-1.0" /> <property name="dist.dir" value="${build.dir}/${final.name}" /> - - <target name="nutch-compile-core"> - <!-- First, copy over Nutch source overlays --> + + <target name="init"> <exec executable="rsync"> <arg value="-vacC"/> <arg value="src/nutch/"/> <arg value="../../"/> </exec> - <ant dir="${nutch.dir}" target="compile-core" inheritAll="false" /> + <exec executable="rsync"> + <arg value="-vacC"/> + <arg value="lib/"/> + <arg value="../../lib/"/> + </exec> + <exec executable="rsync"> + <arg value="-vacC"/> + <arg value="bin/"/> + <arg value="../../bin/"/> + </exec> + <exec executable="rsync"> + <arg value="-vacC"/> + <arg value="src/java/"/> + <arg value="../../src/java/"/> + </exec> + <exec executable="rsync"> + <arg value="-vacC"/> + <arg value="src/plugin/"/> + <arg value="../../src/plugin/"/> + </exec> </target> - - <target name="nutch-compile-plugins"> - <ant 
dir="${nutch.dir}" target="compile-plugins" inheritAll="false" /> - </target> - - <target name="compile-core" depends="nutch-compile-core"> - <javac - destdir="${build.dir}/classes" - debug="true" - verbose="false" - source="1.5" - target="1.5" - encoding="UTF-8" - fork="true" - nowarn="true" - deprecation="false"> - <src path="${src.dir}/java" /> - <include name="**/*.java" /> - <classpath> - <pathelement location="${build.dir}/classes" /> - <fileset dir="${lib.dir}"> - <include name="*.jar"/> - </fileset> - <fileset dir="${nutch.dir}/lib"> - <include name="*.jar"/> - </fileset> - </classpath> - </javac> - </target> - - <target name="compile-plugins"> - <ant dir="src/plugin" target="deploy" inheritAll="false" /> - </target> - - <!-- - These targets all call down to the corresponding target in the - Nutch build.xml file. This way all of the 'ant' build commands - can be executed from this directory and everything should get - built as expected. - --> - <target name="compile" depends="compile-core, compile-plugins, nutch-compile-plugins"> - </target> - - <target name="jar" depends="compile-core"> + + <target name="jar" depends="init"> <ant dir="${nutch.dir}" target="jar" inheritAll="false" /> </target> - <target name="job" depends="compile"> + <target name="job" depends="init"> <ant dir="${nutch.dir}" target="job" inheritAll="false" /> - - <!-- Add our NutchWAX libs to the .job created by Nutch's build. 
--> - <jar jarfile="${build.dir}/${final.name}.job" update="true"> - <zipfileset dir="lib" prefix="lib" includes="*.jar"/> - </jar> </target> - <target name="war" depends="compile"> + <target name="war" depends="init"> <ant dir="${nutch.dir}" target="war" inheritAll="false" /> </target> - <target name="javadoc" depends="compile"> + <target name="javadoc" depends="init"> <ant dir="${nutch.dir}" target="javadoc" inheritAll="false" /> </target> - <target name="tar" depends="package"> + <target name="tar" depends="init"> <ant dir="${nutch.dir}" target="tar" inheritAll="false" /> </target> @@ -107,24 +78,12 @@ <ant dir="${nutch.dir}" target="clean" inheritAll="false" /> </target> - <!-- This one does a little more after calling down to the relevant - Nutch target. After Nutch has copied everything into the - distribution directory, we add our script, libraries, etc. - --> - <target name="package" depends="jar, job, war, javadoc" > + <target name="package" depends="init"> <ant dir="${nutch.dir}" target="package" inheritAll="false" /> <ant target="onlypack" /> </target> <target name="onlypack"> - <copy todir="${dist.dir}/lib" includeEmptyDirs="false"> - <fileset dir="lib"/> - </copy> - - <copy todir="${dist.dir}/bin"> - <fileset dir="bin"/> - </copy> - <chmod perm="ugo+x" type="file"> <fileset dir="${dist.dir}/bin"/> </chmod> Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/plugin/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/plugin/build.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/plugin/build.xml 2010-02-20 03:20:59 UTC (rev 2958) @@ -0,0 +1,204 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="Nutch" default="deploy-core" basedir="."> + + <target name="deploy-core"> + <ant target="compile-core" inheritall="false" dir="../.."/> + <ant target="deploy"/> + </target> + + <!-- ====================================================== --> + <!-- Build & deploy all the plugin jars. --> + <!-- ====================================================== --> + <target name="deploy"> + <ant dir="clustering-carrot2" target="deploy"/> + <ant dir="creativecommons" target="deploy"/> + <ant dir="feed" target="deploy"/> + <ant dir="index-basic" target="deploy"/> + <ant dir="index-anchor" target="deploy"/> + <ant dir="index-more" target="deploy"/> + <ant dir="field-basic" target="deploy"/> + <ant dir="field-boost" target="deploy"/> + <ant dir="languageidentifier" target="deploy"/> + <ant dir="lib-http" target="deploy"/> + <ant dir="lib-jakarta-poi" target="deploy"/> + <ant dir="lib-lucene-analyzers" target="deploy"/> + <ant dir="lib-nekohtml" target="deploy"/> + <ant dir="lib-parsems" target="deploy"/> + <ant dir="lib-regex-filter" target="deploy"/> + <ant dir="lib-xml" target="deploy"/> + <ant dir="microformats-reltag" target="deploy"/> + <ant dir="nutch-extensionpoints" target="deploy"/> + <ant dir="ontology" target="deploy"/> + <ant dir="protocol-file" target="deploy"/> + <ant dir="protocol-ftp" target="deploy"/> + <ant dir="protocol-http" target="deploy"/> + <ant dir="protocol-httpclient" 
target="deploy"/> + <ant dir="parse-ext" target="deploy"/> + <ant dir="parse-html" target="deploy"/> + <ant dir="parse-js" target="deploy"/> + <!-- <ant dir="parse-mp3" target="deploy"/> --> + <ant dir="parse-msexcel" target="deploy"/> + <ant dir="parse-mspowerpoint" target="deploy"/> + <ant dir="parse-msword" target="deploy"/> + <ant dir="parse-oo" target="deploy"/> + <ant dir="parse-pdf" target="deploy"/> + <ant dir="parse-rss" target="deploy"/> + <!-- <ant dir="parse-rtf" target="deploy"/> --> + <ant dir="parse-swf" target="deploy"/> + <ant dir="parse-text" target="deploy"/> + <ant dir="parse-zip" target="deploy"/> + <ant dir="query-basic" target="deploy"/> + <ant dir="query-more" target="deploy"/> + <ant dir="query-site" target="deploy"/> + <ant dir="query-custom" target="deploy"/> + <ant dir="query-url" target="deploy"/> + <ant dir="response-json" target="deploy"/> + <ant dir="response-xml" target="deploy"/> + <ant dir="scoring-opic" target="deploy"/> + <ant dir="scoring-link" target="deploy"/> + <ant dir="summary-basic" target="deploy"/> + <ant dir="subcollection" target="deploy"/> + <ant dir="summary-lucene" target="deploy"/> + <ant dir="tld" target="deploy"/> + <ant dir="urlfilter-automaton" target="deploy"/> + <ant dir="urlfilter-domain" target="deploy" /> + <ant dir="urlfilter-prefix" target="deploy"/> + <ant dir="urlfilter-regex" target="deploy"/> + <ant dir="urlfilter-suffix" target="deploy"/> + <ant dir="urlfilter-validator" target="deploy"/> + <ant dir="urlnormalizer-basic" target="deploy"/> + <ant dir="urlnormalizer-pass" target="deploy"/> + <ant dir="urlnormalizer-regex" target="deploy"/> + + <ant dir="index-nutchwax" target="deploy" /> + <ant dir="query-nutchwax" target="deploy" /> + <ant dir="scoring-nutchwax" target="deploy" /> + <ant dir="urlfilter-nutchwax" target="deploy" /> + + </target> + + <!-- ====================================================== --> + <!-- Test all of the plugins. 
--> + <!-- ====================================================== --> + <target name="test"> + <parallel threadCount="2"> + <ant dir="creativecommons" target="test"/> + <ant dir="index-more" target="test"/> + <ant dir="languageidentifier" target="test"/> + <ant dir="lib-http" target="test"/> + <ant dir="ontology" target="test"/> + <ant dir="protocol-httpclient" target="test"/> + <!--ant dir="parse-ext" target="test"/--> + <ant dir="parse-html" target="test"/> + <!-- <ant dir="parse-mp3" target="test"/> --> + <ant dir="parse-msexcel" target="test"/> + <ant dir="parse-mspowerpoint" target="test"/> + <ant dir="parse-msword" target="test"/> + <ant dir="parse-oo" target="test"/> + <ant dir="parse-pdf" target="test"/> + <ant dir="parse-rss" target="test"/> + <ant dir="feed" target="test"/> + <!-- <ant dir="parse-rtf" target="test"/> --> + <ant dir="parse-swf" target="test"/> + <ant dir="parse-zip" target="test"/> + <ant dir="query-url" target="test"/> + <ant dir="subcollection" target="test"/> + <ant dir="urlfilter-automaton" target="test"/> + <ant dir="urlfilter-domain" target="test" /> + <ant dir="urlfilter-regex" target="test"/> + <ant dir="urlfilter-suffix" target="test"/> + <ant dir="urlnormalizer-basic" target="test"/> + <ant dir="urlnormalizer-pass" target="test"/> + <ant dir="urlnormalizer-regex" target="test"/> + </parallel> + </target> + + <!-- ====================================================== --> + <!-- Clean all of the plugins. 
--> + <!-- ====================================================== --> + <target name="clean"> + <ant dir="analysis-de" target="clean"/> + <ant dir="analysis-fr" target="clean"/> + <ant dir="clustering-carrot2" target="clean"/> + <ant dir="creativecommons" target="clean"/> + <ant dir="feed" target="clean"/> + <ant dir="index-basic" target="clean"/> + <ant dir="index-anchor" target="clean"/> + <ant dir="index-more" target="clean"/> + <ant dir="field-basic" target="clean"/> + <ant dir="field-boost" target="clean"/> + <ant dir="languageidentifier" target="clean"/> + <ant dir="lib-commons-httpclient" target="clean"/> + <ant dir="lib-http" target="clean"/> + <ant dir="lib-jakarta-poi" target="clean"/> + <ant dir="lib-lucene-analyzers" target="clean"/> + <ant dir="lib-nekohtml" target="clean"/> + <ant dir="lib-parsems" target="clean"/> + <ant dir="lib-regex-filter" target="clean"/> + <ant dir="lib-xml" target="clean"/> + <ant dir="microformats-reltag" target="clean"/> + <ant dir="nutch-extensionpoints" target="clean"/> + <ant dir="ontology" target="clean"/> + <ant dir="protocol-file" target="clean"/> + <ant dir="protocol-ftp" target="clean"/> + <ant dir="protocol-http" target="clean"/> + <ant dir="protocol-httpclient" target="clean"/> + <ant dir="parse-ext" target="clean"/> + <ant dir="parse-html" target="clean"/> + <ant dir="parse-js" target="clean"/> + <ant dir="parse-mp3" target="clean"/> + <ant dir="parse-msexcel" target="clean"/> + <ant dir="parse-mspowerpoint" target="clean"/> + <ant dir="parse-msword" target="clean"/> + <ant dir="parse-oo" target="clean"/> + <ant dir="parse-pdf" target="clean"/> + <ant dir="parse-rss" target="clean"/> + <ant dir="parse-rtf" target="clean"/> + <ant dir="parse-swf" target="clean"/> + <ant dir="parse-text" target="clean"/> + <ant dir="parse-zip" target="clean"/> + <ant dir="query-basic" target="clean"/> + <ant dir="query-more" target="clean"/> + <ant dir="query-site" target="clean"/> + <ant dir="query-url" target="clean"/> + <ant 
dir="query-custom" target="clean"/> + <ant dir="response-json" target="clean"/> + <ant dir="response-xml" target="clean"/> + <ant dir="scoring-opic" target="clean"/> + <ant dir="scoring-link" target="clean"/> + <ant dir="subcollection" target="clean"/> + <ant dir="summary-basic" target="clean"/> + <ant dir="summary-lucene" target="clean"/> + <ant dir="tld" target="clean"/> + <ant dir="urlfilter-automaton" target="clean"/> + <ant dir="urlfilter-domain" target="clean" /> + <ant dir="urlfilter-prefix" target="clean"/> + <ant dir="urlfilter-regex" target="clean"/> + <ant dir="urlfilter-suffix" target="clean"/> + <ant dir="urlfilter-validator" target="clean"/> + <ant dir="urlnormalizer-basic" target="clean"/> + <ant dir="urlnormalizer-pass" target="clean"/> + <ant dir="urlnormalizer-regex" target="clean"/> + + <ant dir="index-nutchwax" target="clean" /> + <ant dir="query-nutchwax" target="clean" /> + <ant dir="scoring-nutchwax" target="clean" /> + <ant dir="urlfilter-nutchwax" target="clean" /> + </target> +</project> Property changes on: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/plugin/build.xml ___________________________________________________________________ Added: svn:executable + * Deleted: trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml 2010-02-20 03:18:57 UTC (rev 2957) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml 2010-02-20 03:20:59 UTC (rev 2958) @@ -1,216 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<!-- Imported by plugin build.xml files to define default targets. --> -<project> - - <property name="name" value="${ant.project.name}"/> - <property name="root" value="${basedir}"/> - - <!-- load plugin-specific properties first --> - <property file="${user.home}/${name}.build.properties" /> - <property file="${root}/build.properties" /> - - <property name="nutch.root" location="${root}/../../../../../"/> - - <property name="src.dir" location="${root}/src/java"/> - <property name="src.test" location="${root}/src/test"/> - - <available file="${src.test}" type="dir" property="test.available"/> - - <property name="conf.dir" location="${nutch.root}/conf"/> - - <property name="build.dir" location="${nutch.root}/build/${name}"/> - <property name="build.classes" location="${build.dir}/classes"/> - <property name="build.test" location="${build.dir}/test"/> - - <property name="deploy.dir" location="${nutch.root}/build/plugins/${name}"/> - - <!-- load nutch defaults last so that they can be overridden above --> - <property file="${nutch.root}/default.properties" /> - - <path id="plugin.deps"/> - - <fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/> - - <!-- the normal classpath --> - <path id="classpath"> - <pathelement location="${build.classes}"/> - <fileset refid="lib.jars"/> - <pathelement location="${nutch.root}/build/classes"/> - <fileset dir="${nutch.root}/lib"> - <include name="*.jar" /> - 
</fileset> - <!-- This is the contrib/archive/lib directory --> - <fileset dir="../../../lib"> - <include name="*.jar" /> - </fileset> - <path refid="plugin.deps"/> - </path> - - <!-- the unit test classpath --> - <path id="test.classpath"> - <pathelement location="${build.test}" /> - <pathelement location="${nutch.root}/build/test/classes"/> - <pathelement location="${nutch.root}/src/test"/> - <pathelement location="${conf.dir}"/> - <pathelement location="${nutch.root}/build"/> - <path refid="classpath"/> - </path> - - <!-- ====================================================== --> - <!-- Stuff needed by all targets --> - <!-- ====================================================== --> - <target name="init"> - <mkdir dir="${build.dir}"/> - <mkdir dir="${build.classes}"/> - <mkdir dir="${build.test}"/> - - <antcall target="init-plugin"/> - </target> - - <!-- to be overridden by sub-projects --> - <target name="init-plugin"/> - - <!-- - ! Used to build plugin compilation dependencies - ! (to be overridden by plugins) - !--> - <target name="deps-jar"/> - - <!-- - ! Used to deploy plugin runtime dependencies - ! 
(to be overridden by plugins) - !--> - <target name="deps-test"/> - - <!-- ====================================================== --> - <!-- Compile the Java files --> - <!-- ====================================================== --> - <target name="compile" depends="init,deps-jar"> - <echo message="Compiling plugin: ${name}"/> - <javac - encoding="${build.encoding}" - srcdir="${src.dir}" - includes="**/*.java" - destdir="${build.classes}" - debug="${javac.debug}" - optimize="${javac.optimize}" - target="${javac.version}" - source="${javac.version}" - deprecation="${javac.deprecation}"> - <classpath refid="classpath"/> - </javac> - </target> - - <target name="compile-core"> - <ant target="compile-core" inheritall="false" dir="${nutch.root}"/> - <ant target="compile"/> - </target> - - <!-- ================================================================== --> - <!-- Make plugin .jar --> - <!-- ================================================================== --> - <!-- --> - <!-- ================================================================== --> - <target name="jar" depends="compile"> - <jar - jarfile="${build.dir}/${name}.jar" - basedir="${build.classes}" - /> - </target> - - <target name="jar-core" depends="compile-core"> - <jar - jarfile="${build.dir}/${name}.jar" - basedir="${build.classes}" - /> - </target> - - <!-- ================================================================== --> - <!-- Deploy plugin to ${deploy.dir} --> - <!-- ================================================================== --> - <!-- --> - <!-- ================================================================== --> - <target name="deploy" depends="jar, deps-test"> - <mkdir dir="${deploy.dir}"/> - <copy file="plugin.xml" todir="${deploy.dir}" - preservelastmodified="true"/> - <available property="lib-available" - file="${build.dir}/${name}.jar"/> - <antcall target="copy-generated-lib"/> - <copy todir="${deploy.dir}" flatten="true"> - <fileset refid="lib.jars"/> - </copy> - 
</target> - - <target name="copy-generated-lib" if="lib-available"> - <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" failonerror="false"/> - </target> - - <!-- ================================================================== --> - <!-- Compile test code --> - <!-- ================================================================== --> - <target name="compile-test" depends="compile" if="test.available"> - <javac - encoding="${build.encoding}" - srcdir="${src.test}" - includes="**/*.java" - destdir="${build.test}" - debug="${javac.debug}" - optimize="${javac.optimize}" - target="${javac.version}" - source="${javac.version}" - deprecation="${javac.deprecation}"> - <classpath refid="test.classpath"/> - </javac> - </target> - - <!-- ================================================================== --> - <!-- Run unit tests --> - <!-- ================================================================== --> - <target name="test" depends="compile-test, deploy" if="test.available"> - <echo message="Testing plugin: ${name}"/> - - <junit printsummary="yes" haltonfailure="no" fork="yes" - errorProperty="tests.failed" failureProperty="tests.failed"> - <sysproperty key="test.data" value="${build.test}/data"/> - <sysproperty key="test.input" value="${root}/data"/> - <classpath refid="test.classpath"/> - <formatter type="plain" /> - <batchtest todir="${build.test}" unless="testcase"> - <fileset dir="${src.test}" - includes="**/Test*.java" excludes="**/${test.exclude}.java" /> - </batchtest> - <batchtest todir="${build.test}" if="testcase"> - <fileset dir="${src.test}" includes="**/${testcase}.java"/> - </batchtest> - </junit> - - <fail if="tests.failed">Tests failed!</fail> - - </target> - - <!-- ================================================================== --> - <!-- Clean. 
Delete the build files, and their directories --> - <!-- ================================================================== --> - <target name="clean"> - <delete dir="${build.dir}"/> - <delete dir="${deploy.dir}"/> - </target> - -</project> Deleted: trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml 2010-02-20 03:18:57 UTC (rev 2957) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml 2010-02-20 03:20:59 UTC (rev 2958) @@ -1,45 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="nutchwax" default="deploy-core" basedir="."> - - <target name="deploy-core"> - <ant target="compile-core" inheritall="false" dir="../../../../"/> - <ant target="deploy"/> - </target> - - <!-- ====================================================== --> - <!-- Build & deploy all the plugin jars. 
--> - <!-- ====================================================== --> - <target name="deploy"> - <ant dir="index-nutchwax" target="deploy"/> - <ant dir="query-nutchwax" target="deploy"/> - <ant dir="urlfilter-nutchwax" target="deploy"/> - <ant dir="scoring-nutchwax" target="deploy"/> - </target> - - <!-- ====================================================== --> - <!-- Clean all of the plugins. --> - <!-- ====================================================== --> - <target name="clean"> - <ant dir="index-nutchwax" target="clean"/> - <ant dir="query-nutchwax" target="clean"/> - <ant dir="urlfilter-nutchwax" target="clean"/> - <ant dir="scoring-nutchwax" target="clean"/> - </target> - -</project> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-02-22 05:17:29
|
Revision: 2960 http://archive-access.svn.sourceforge.net/archive-access/?rev=2960&view=rev Author: binzino Date: 2010-02-22 05:17:20 +0000 (Mon, 22 Feb 2010) Log Message: ----------- Initial revision of OpenSearch master/slave system. Work-in-progress. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/lib/jdom.LICENSE trunk/archive-access/projects/nutchwax/archive/lib/jdom.jar trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java Added: trunk/archive-access/projects/nutchwax/archive/lib/jdom.LICENSE =================================================================== --- trunk/archive-access/projects/nutchwax/archive/lib/jdom.LICENSE (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/lib/jdom.LICENSE 2010-02-22 05:17:20 UTC (rev 2960) @@ -0,0 +1,56 @@ +/*-- + + $Id: LICENSE.txt,v 1.11 2004/02/06 09:32:57 jhunter Exp $ + + Copyright (C) 2000-2004 Jason Hunter & Brett McLaughlin. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer that follows + these conditions in the documentation and/or other materials + provided with the distribution. + + 3. The name "JDOM" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact <request_AT_jdom_DOT_org>. + + 4. 
Products derived from this software may not be called "JDOM", nor + may "JDOM" appear in their name, without prior written permission + from the JDOM Project Management <request_AT_jdom_DOT_org>. + + In addition, we request (but do not require) that you include in the + end-user documentation provided with the redistribution and/or in the + software itself an acknowledgement equivalent to the following: + "This product includes software developed by the + JDOM Project (http://www.jdom.org/)." + Alternatively, the acknowledgment may be graphical using the logos + available at http://www.jdom.org/images/logos. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the JDOM Project and was originally + created by Jason Hunter <jhunter_AT_jdom_DOT_org> and + Brett McLaughlin <brett_AT_jdom_DOT_org>. For more information + on the JDOM Project, please see <http://www.jdom.org/>. 
+ + */ + Added: trunk/archive-access/projects/nutchwax/archive/lib/jdom.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nutchwax/archive/lib/jdom.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java 2010-02-22 05:17:20 UTC (rev 2960) @@ -0,0 +1,364 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.archive.nutchwax;
+
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.FileInputStream;
+import java.util.Comparator;
+import java.util.Collections;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.LinkedList;
+
+import org.jdom.Document;
+import org.jdom.Element;
+import org.jdom.Namespace;
+import org.jdom.output.XMLOutputter;
+
+
+/**
+ * Sends one OpenSearch query to every configured slave in parallel and
+ * merges the slaves' RSS responses into a single result document.
+ *
+ * The slaves are read from a text file, one URL template per line; blank
+ * lines and lines starting with '#' are ignored.
+ */
+public class OpenSearchMaster
+{
+  /** Namespace of the OpenSearch elements: totalResults, startIndex, itemsPerPage. */
+  static final Namespace NS_OPENSEARCH = Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" );
+
+  /** Namespace of the Nutch extension elements: site, score. */
+  static final Namespace NS_NUTCH = Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" );
+
+  List<OpenSearchSlave> slaves = new ArrayList<OpenSearchSlave>( );
+
+  /** Maximum time, in milliseconds, that query() waits for the slaves. */
+  long timeout = 30 * 1000;
+
+  /**
+   * @param slavesFile path of the file listing the slave URL templates
+   * @param timeout    maximum time in milliseconds to wait for the slaves
+   * @throws IOException if the slaves file cannot be read
+   */
+  public OpenSearchMaster( String slavesFile, long timeout )
+    throws IOException
+  {
+    this( slavesFile );
+    this.timeout = timeout;
+  }
+
+  /**
+   * Reads the slave URL templates from <code>slavesFile</code> (UTF-8),
+   * keeping the default timeout.
+   *
+   * @param slavesFile path of the file listing the slave URL templates
+   * @throws IOException if the slaves file cannot be read
+   */
+  public OpenSearchMaster( String slavesFile )
+    throws IOException
+  {
+    BufferedReader r = null;
+    try
+    {
+      r = new BufferedReader( new InputStreamReader( new FileInputStream( slavesFile ), "utf-8" ) );
+
+      String line;
+      while ( (line = r.readLine()) != null )
+      {
+        line = line.trim();
+        if ( line.length() == 0 || line.charAt( 0 ) == '#' )
+        {
+          // Blank line or comment: ignore it.
+          continue ;
+        }
+
+        this.slaves.add( new OpenSearchSlave( line ) );
+      }
+    }
+    finally
+    {
+      // Best-effort close: a failure here must not mask an exception from the read.
+      try { if ( r != null ) r.close(); } catch ( IOException ignored ) { }
+    }
+  }
+
+  /**
+   * Queries all slaves and returns the merged page of results.
+   *
+   * Each slave is asked for its first (startIndex+numResults) hits, since
+   * any of them could end up in the requested window after merging.
+   * Slaves that fail, or that have not answered when the timeout expires,
+   * are left out of the result.
+   *
+   * @param query       the search terms
+   * @param startIndex  zero-based index of the first merged result to return;
+   *                    negative values are treated as 0
+   * @param numResults  maximum number of results to return; negative values
+   *                    are treated as 0
+   * @param hitsPerSite maximum number of results per site, or 0 (or less)
+   *                    for no per-site limit
+   * @return an RSS document with one channel holding the merged items
+   */
+  public Document query( String query, int startIndex, int numResults, int hitsPerSite )
+  {
+    long startTime = System.currentTimeMillis( );
+
+    // Clamp the requested window: negative values, or an int overflow of
+    // startIndex+numResults, would otherwise make subList() throw below.
+    int firstIndex = Math.max( startIndex, 0 );
+    int pageSize   = Math.max( numResults, 0 );
+    int endIndex   = (int) Math.min( (long) firstIndex + pageSize, (long) Integer.MAX_VALUE );
+
+    List<SlaveQueryThread> slaveThreads = new ArrayList<SlaveQueryThread>( this.slaves.size() );
+
+    for ( OpenSearchSlave slave : this.slaves )
+    {
+      SlaveQueryThread sqt = new SlaveQueryThread( slave, query, 0, endIndex, hitsPerSite );
+
+      sqt.start( );
+
+      slaveThreads.add( sqt );
+    }
+
+    waitForThreads( slaveThreads, this.timeout, startTime );
+
+    LinkedList<Element> items = new LinkedList<Element>( );
+    long totalResults = 0;
+
+    for ( SlaveQueryThread sqt : slaveThreads )
+    {
+      // Read the response once: the thread may still be running if we timed out.
+      Document response = sqt.response;
+
+      if ( sqt.throwable != null || response == null )
+      {
+        // The slave failed, or has not answered within the timeout
+        // (both fields are still null in that case).
+        // TODO: Handle problems with slaves
+        continue ;
+      }
+
+      // Dump all the results ("item" elements) into a single list, and
+      // detach them from the slave's document so they can be re-parented.
+      Element slaveChannel = response.getRootElement( ).getChild( "channel" );
+      @SuppressWarnings("unchecked")
+      List<Element> slaveItems = (List<Element>) slaveChannel.getChildren( "item" );
+      items.addAll( slaveItems );
+      slaveChannel.removeChildren( "item" );
+
+      Element slaveTotal = slaveChannel.getChild( "totalResults", NS_OPENSEARCH );
+      if ( slaveTotal != null )
+      {
+        try
+        {
+          totalResults += Long.parseLong( slaveTotal.getTextTrim( ) );
+        }
+        catch ( NumberFormatException nfe )
+        {
+          // TODO: Log error getting total.
+        }
+      }
+    }
+
+    if ( items.size( ) > 0 && hitsPerSite > 0 )
+    {
+      // Group the items by site (best score first within a site), then
+      // keep at most hitsPerSite consecutive items per site.
+      Collections.sort( items, new ElementSiteThenScoreComparator( ) );
+
+      LinkedList<Element> collapsed = new LinkedList<Element>( );
+
+      collapsed.add( items.removeFirst( ) );
+
+      int count = 1;
+      for ( Element item : items )
+      {
+        String lastSite = ElementSiteThenScoreComparator.siteOf( collapsed.getLast( ) );
+
+        // Items with no site value are never collapsed.
+        if ( lastSite.length( ) == 0 ||
+             !lastSite.equals( ElementSiteThenScoreComparator.siteOf( item ) ) )
+        {
+          collapsed.add( item );
+          count = 1;
+        }
+        else if ( count < hitsPerSite )
+        {
+          collapsed.add( item );
+          count++;
+        }
+        else
+        {
+          // TODO: Log collapse of item.
+        }
+      }
+
+      // Replace the list of items with the collapsed list.
+      items = collapsed;
+    }
+
+    Collections.sort( items, new ElementScoreComparator( ) );
+
+    // Build the final results OpenSearch XML document.
+    Element channel = new Element( "channel" );
+    channel.addContent( new Element( "title" ) );
+    channel.addContent( new Element( "description" ) );
+    channel.addContent( new Element( "link" ) );
+
+    Element eTotalResults = new Element( "totalResults", NS_OPENSEARCH );
+    Element eStartIndex   = new Element( "startIndex",   NS_OPENSEARCH );
+    Element eItemsPerPage = new Element( "itemsPerPage", NS_OPENSEARCH );
+
+    eTotalResults.setText( Long.toString( totalResults ) );
+    eStartIndex.  setText( Long.toString( firstIndex ) );
+    eItemsPerPage.setText( Long.toString( pageSize ) );
+
+    channel.addContent( eTotalResults );
+    channel.addContent( eStartIndex );
+    channel.addContent( eItemsPerPage );
+
+    // Get a sub-list of only the items we want: [firstIndex, endIndex),
+    // cut down to however many items we actually have.
+    List<Element> subList = items.subList( Math.min( firstIndex, items.size( ) ),
+                                           Math.min( endIndex,   items.size( ) ) );
+    channel.addContent( subList );
+
+    Element rss = new Element( "rss" );
+    rss.addContent( channel );
+
+    return new Document( rss );
+  }
+
+
+  /**
+   * Convenience method to wait for a collection of threads to complete,
+   * or until a timeout after a startTime expires.
+   *
+   * If the calling thread is interrupted, waiting stops and the interrupt
+   * status is restored so the caller can see it.
+   */
+  private void waitForThreads( List<SlaveQueryThread> threads, long timeout, long startTime )
+  {
+    for ( Thread t : threads )
+    {
+      long timeRemaining = timeout - (System.currentTimeMillis( ) - startTime);
+
+      // If we are out of time, don't wait for any more threads.
+      // (This also keeps us from calling join(0), which waits forever.)
+      if ( timeRemaining <= 0 )
+      {
+        break;
+      }
+
+      // Otherwise, wait for the next unfinished thread to finish.
+      try
+      {
+        t.join( timeRemaining );
+      }
+      catch ( InterruptedException ie )
+      {
+        Thread.currentThread( ).interrupt( );
+        break;
+      }
+    }
+  }
+
+
+  /**
+   * Command-line entry point: runs one query against the slaves listed in
+   * SLAVES.txt and writes the merged result document to standard output.
+   */
+  public static void main( String args[] )
+    throws Exception
+  {
+    String usage = "OpenSearchMaster [OPTIONS] SLAVES.txt query"
+      + "\n\t-h <n> Hits per site"
+      + "\n\t-n <n> Number of results"
+      + "\n\t-s <n> Start index"
+      + "\n";
+
+    if ( args.length < 2 )
+    {
+      System.err.println( usage );
+      System.exit( 1 );
+    }
+
+    // The last two arguments are positional; everything before them is options.
+    String slavesFile = args[args.length - 2];
+    String query      = args[args.length - 1];
+
+    int startIndex  = 0;
+    int hitsPerSite = 0;
+    int numHits     = 10;
+    for ( int i = 0 ; i < args.length - 2 ; i++ )
+    {
+      String option = args[i];
+
+      if ( ! "-h".equals( option ) && ! "-n".equals( option ) && ! "-s".equals( option ) )
+      {
+        // Unrecognized arguments are ignored.
+        continue ;
+      }
+
+      // The option's value must not be one of the two positional arguments.
+      if ( i + 1 >= args.length - 2 )
+      {
+        System.err.println( "Error: missing value for option: " + option );
+        System.err.println( usage );
+        System.exit( 1 );
+      }
+
+      i++;
+      try
+      {
+        int value = Integer.parseInt( args[i] );
+
+        if      ( "-h".equals( option ) ) hitsPerSite = value;
+        else if ( "-n".equals( option ) ) numHits     = value;
+        else                              startIndex  = value;
+      }
+      catch ( NumberFormatException nfe )
+      {
+        System.err.println( "Error: not a numeric value: " + args[i] );
+        System.err.println( usage );
+        System.exit( 1 );
+      }
+    }
+
+    OpenSearchMaster master = new OpenSearchMaster( slavesFile );
+
+    Document doc = master.query( query, startIndex, numHits, hitsPerSite );
+
+    (new XMLOutputter()).output( doc, System.out );
+  }
+
+}
+
+
+/**
+ * Thread that runs one query against one slave and records either the
+ * response document or the Throwable that the query raised.
+ */
+class SlaveQueryThread extends Thread
+{
+  OpenSearchSlave slave;
+
+  String query;
+  int    startIndex;
+  int    numResults;
+  int    hitsPerSite;
+
+  // Both stay null until run() completes.  They are volatile because the
+  // master may read them while this thread is still running, when its
+  // timeout expires before the slave has answered.
+  volatile Document  response;
+  volatile Throwable throwable;
+
+
+  SlaveQueryThread( OpenSearchSlave slave, String query, int startIndex, int numResults, int hitsPerSite )
+  {
+    this.slave       = slave;
+    this.query       = query;
+    this.startIndex  = startIndex;
+    this.numResults  = numResults;
+    this.hitsPerSite = hitsPerSite;
+  }
+
+  public void run( )
+  {
+    try
+    {
+      this.response = this.slave.query( this.query, this.startIndex, this.numResults, this.hitsPerSite );
+    }
+    catch ( Throwable t )
+    {
+      // Catch everything so the master can tell a failed slave from a slow one.
+      this.throwable = t;
+    }
+  }
+}
+
+
+/**
+ * Orders "item" elements by descending score.  Null items and items with
+ * no score element sort last; a score that cannot be parsed as a number
+ * counts as 0.
+ */
+class ElementScoreComparator implements Comparator<Element>
+{
+  public int compare( Element e1, Element e2 )
+  {
+    if ( e1 == e2 ) return 0;
+    if ( e1 == null ) return 1;
+    if ( e2 == null ) return -1;
+
+    Element score1 = scoreElement( e1 );
+    Element score2 = scoreElement( e2 );
+
+    if ( score1 == score2 ) return 0;   // Both are missing.
+    if ( score1 == null ) return 1;
+    if ( score2 == null ) return -1;
+
+    // Arguments swapped for descending order.  Float.compare gives a
+    // consistent total order, which Collections.sort requires.
+    return Float.compare( scoreValue( score2 ), scoreValue( score1 ) );
+  }
+
+  /**
+   * Returns the item's score element.  OpenSearchSlave.validate puts the
+   * score in the Nutch namespace, so that is looked up first; an
+   * un-namespaced "score" is accepted as a fall-back.
+   */
+  static Element scoreElement( Element item )
+  {
+    Element score = item.getChild( "score", OpenSearchMaster.NS_NUTCH );
+
+    return score != null ? score : item.getChild( "score" );
+  }
+
+  /**
+   * Returns the numeric value of a score element, or 0 if its text is
+   * empty, not a number, or NaN.
+   */
+  static float scoreValue( Element score )
+  {
+    try
+    {
+      float value = Float.parseFloat( score.getText( ).trim( ) );
+
+      return Float.isNaN( value ) ? 0.0f : value;
+    }
+    catch ( NumberFormatException nfe )
+    {
+      return 0.0f;
+    }
+  }
+}
+
+/**
+ * Orders "item" elements by site name, then by descending score within
+ * the same site.  Null items sort last.
+ */
+class ElementSiteThenScoreComparator extends ElementScoreComparator
+{
+  public int compare( Element e1, Element e2 )
+  {
+    if ( e1 == e2 ) return 0;
+    if ( e1 == null ) return 1;
+    if ( e2 == null ) return -1;
+
+    String site1 = siteOf( e1 );
+    String site2 = siteOf( e2 );
+
+    if ( site1.equals( site2 ) )
+    {
+      // Sites are equal, then compare scores.
+      return super.compare( e1, e2 );
+    }
+
+    return site1.compareTo( site2 );
+  }
+
+  /**
+   * Returns the trimmed text of the item's Nutch "site" element, or the
+   * empty string if the item has no such element.
+   */
+  static String siteOf( Element item )
+  {
+    Element site = item.getChild( "site", OpenSearchMaster.NS_NUTCH );
+
+    return site == null ? "" : site.getTextTrim( );
+  }
+}
\ No newline at end of file
Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java 2010-02-22 05:17:20 UTC (rev 2960)
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax;
+
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.FileInputStream;
+import java.util.List;
+import java.util.ArrayList;
+import javax.servlet.ServletException;
+import javax.servlet.ServletConfig;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+
+/**
+ * Servlet skeleton for the OpenSearch master.  Neither init nor doGet
+ * does any work of its own yet.
+ */
+public class OpenSearchMasterServlet extends HttpServlet
+{
+
+  /**
+   * Initializes the servlet.
+   *
+   * @param config the servlet configuration supplied by the container
+   * @throws ServletException if the superclass initialization fails
+   */
+  public void init( ServletConfig config )
+    throws ServletException
+  {
+    // Required when overriding this form of init(): GenericServlet only
+    // stores the config (used by getServletConfig() and
+    // getServletContext()) in its own init(ServletConfig).
+    super.init( config );
+  }
+
+  /**
+   * Handles GET requests.  The body is empty: nothing is written to the
+   * response here.
+   */
+  public void doGet( HttpServletRequest request, HttpServletResponse response )
+    throws ServletException, IOException
+  {
+
+  }
+
+}
Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java 2010-02-22 05:17:20 UTC (rev 2960)
@@ -0,0 +1,209 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+import java.util.List;
+
+import org.jdom.Document;
+import org.jdom.Element;
+import org.jdom.Namespace;
+import org.jdom.input.SAXBuilder;
+import org.jdom.output.XMLOutputter;
+
+/**
+ * Client for a single OpenSearch server, addressed by a URL template
+ * containing the placeholders {searchTerms}, {count}, {startIndex} and
+ * {hitsPerSite}.
+ */
+public class OpenSearchSlave
+{
+  private String urlTemplate;
+
+  /**
+   * @param urlTemplate the request URL with {placeholder} parameters
+   */
+  public OpenSearchSlave( String urlTemplate )
+  {
+    this.urlTemplate = urlTemplate;
+  }
+
+  /**
+   * Sends the query to the server and returns its parsed, validated
+   * response.
+   *
+   * @param query               the search terms
+   * @param startIndex          index of the first result to request
+   * @param requestedNumResults number of results to request
+   * @param hitsPerSite         per-site limit to request
+   * @return the response document; every item in it has a Nutch "site"
+   *         and "score" element (see validate)
+   * @throws Exception if the request fails, or the response is not XML or
+   *                   lacks the /rss/channel structure
+   */
+  public Document query( String query, int startIndex, int requestedNumResults, int hitsPerSite )
+    throws Exception
+  {
+    URL url = buildRequestUrl( query, startIndex, requestedNumResults, hitsPerSite );
+
+    InputStream is = null;
+    try
+    {
+      is = getInputStream( url );
+
+      Document doc = (new SAXBuilder()).build( is );
+
+      doc = validate( doc );
+
+      return doc;
+    }
+    finally
+    {
+      // Ensure the InputStream is closed, which should trigger the
+      // underlying HTTP connection to be cleaned-up.
+      try { if ( is != null ) is.close( ); } catch ( IOException ignored ) { } // Not much we can do
+    }
+  }
+
+  /**
+   * Checks that the document has the /rss/channel structure, and adds an
+   * empty Nutch "site" and/or "score" element to any item lacking one, so
+   * that callers can rely on both being present.
+   *
+   * @return the same document, possibly with elements added
+   * @throws Exception if /rss or /rss/channel is missing
+   */
+  private Document validate( Document doc )
+    throws Exception
+  {
+    if ( doc.getRootElement( ) == null ) throw new Exception( "Invalid OpenSearch response: missing /rss" );
+    Element root = doc.getRootElement( );
+
+    if ( ! "rss".equals( root.getName( ) ) ) throw new Exception( "Invalid OpenSearch response: missing /rss" );
+    Element channel = root.getChild( "channel" );
+
+    if ( channel == null ) throw new Exception( "Invalid OpenSearch response: missing /rss/channel" );
+
+    Namespace nutch = Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" );
+
+    for ( Element item : (List<Element>) channel.getChildren( "item" ) )
+    {
+      Element site = item.getChild( "site", nutch );
+      if ( site == null )
+      {
+        item.addContent( new Element( "site", nutch ) );
+      }
+
+      Element score = item.getChild( "score", nutch );
+      if ( score == null )
+      {
+        score = new Element( "score", nutch );
+        score.setText( "" );
+
+        item.addContent( score );
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Builds the request URL by filling in the URL template.
+   *
+   * @return the template with {searchTerms} replaced by the URL-encoded
+   *         query, {count}, {startIndex} and {hitsPerSite} replaced by the
+   *         given numbers, and all optional parameters removed
+   * @throws MalformedURLException if the filled-in template is not a valid URL
+   */
+  public URL buildRequestUrl( String query, int startIndex, int requestedNumResults, int hitsPerSite )
+    throws MalformedURLException, UnsupportedEncodingException
+  {
+    String url = this.urlTemplate;
+
+    // Use literal replacement rather than replaceAll, so that neither the
+    // placeholders nor the replacement text is interpreted as a regex
+    // (where \ and $ would be special).
+    url = url.replace( "{searchTerms}", URLEncoder.encode( query, "utf-8" ) );
+    url = url.replace( "{count}"      , String.valueOf( requestedNumResults ) );
+    url = url.replace( "{startIndex}" , String.valueOf( startIndex ) );
+    url = url.replace( "{hitsPerSite}", String.valueOf( hitsPerSite ) );
+
+    // We don't know about any optional parameters (those of the form
+    // {name?}), so we remove them (per the OpenSearch spec.)
+    url = url.replaceAll( "[{][^}]+[?][}]", "" );
+
+    return new URL( url );
+  }
+
+
+  /**
+   * Opens the URL and returns the response body stream.
+   *
+   * NOTE(review): no connect or read timeout is set, so a slave that
+   * never answers blocks the calling thread indefinitely.
+   *
+   * @throws IOException if the connection fails, or an HTTP server
+   *                     answers with any status other than 200
+   */
+  public InputStream getInputStream( URL url )
+    throws IOException
+  {
+    URLConnection connection = url.openConnection( );
+    connection.setDoOutput( false );
+    connection.setRequestProperty( "User-Agent", "Mozilla/4.0 (compatible; NutchWAX OpenSearchMaster)" );
+    connection.connect( );
+
+    if ( connection instanceof HttpURLConnection )
+    {
+      HttpURLConnection hc = (HttpURLConnection) connection;
+
+      int status = hc.getResponseCode( );
+      if ( status != HttpURLConnection.HTTP_OK )
+      {
+        // Problems! Release the connection and bail out.  The status code
+        // is reported too, since the response message may be null.
+        String message = hc.getResponseMessage( );
+        hc.disconnect( );
+
+        throw new IOException( "HTTP error from " + url + ": " + status + " " + message );
+      }
+    }
+
+    return connection.getInputStream( );
+  }
+
+  /**
+   * @return the URL template of this slave
+   */
+  public String toString()
+  {
+    return this.urlTemplate;
+  }
+
+  /**
+   * Command-line entry point: runs one query against one server and
+   * writes the validated response document to standard output.
+   */
+  public static void main( String args[] )
+    throws Exception
+  {
+    String usage = "OpenSearchSlave [OPTIONS] urlTemplate query"
+      + "\n\t-h <n> Hits per site"
+      + "\n\t-n <n> Number of results"
+      + "\n";
+
+    if ( args.length < 2 )
+    {
+      System.err.println( usage );
+      System.exit( 1 );
+    }
+
+    // The last two arguments are positional; everything before them is options.
+    String urlTemplate = args[args.length - 2];
+    String query       = args[args.length - 1];
+
+    int hitsPerSite = 0;
+    int numHits     = 10;
+    for ( int i = 0 ; i < args.length - 2 ; i++ )
+    {
+      String option = args[i];
+
+      if ( ! "-h".equals( option ) && ! "-n".equals( option ) )
+      {
+        // Unrecognized arguments are ignored.
+        continue ;
+      }
+
+      // The option's value must not be one of the two positional arguments.
+      if ( i + 1 >= args.length - 2 )
+      {
+        System.err.println( "Error: missing value for option: " + option );
+        System.err.println( usage );
+        System.exit( 1 );
+      }
+
+      i++;
+      try
+      {
+        int value = Integer.parseInt( args[i] );
+
+        if ( "-h".equals( option ) ) hitsPerSite = value;
+        else                         numHits     = value;
+      }
+      catch ( NumberFormatException nfe )
+      {
+        System.err.println( "Error: not a numeric value: " + args[i] );
+        System.err.println( usage );
+        System.exit( 1 );
+      }
+    }
+
+    OpenSearchSlave osl = new OpenSearchSlave( urlTemplate );
+
+    Document doc = osl.query( query, 0, numHits, hitsPerSite );
+
+    (new XMLOutputter()).output( doc, System.out );
+  }
+
+}
\ No newline at end of file
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |