From: <bi...@us...> - 2009-10-28 00:29:34
Revision: 2855 http://archive-access.svn.sourceforge.net/archive-access/?rev=2855&view=rev Author: binzino Date: 2009-10-28 00:29:23 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Ported from NW 0.12.9. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexMerger.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexMerger.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexMerger.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexMerger.java 2009-10-28 00:29:23 UTC (rev 2855) @@ -0,0 +1,211 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.fs.*; +import org.apache.hadoop.mapred.FileAlreadyExistsException; +import org.apache.hadoop.util.*; +import org.apache.hadoop.conf.*; + +import org.apache.nutch.util.HadoopFSUtil; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.indexer.FsDirectory; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.ArchiveParallelReader; + +/************************************************************************* + * IndexMerger creates an index for the output corresponding to a + * single fetcher run. 
+ * + * @author Doug Cutting + * @author Mike Cafarella + *************************************************************************/ +public class IndexMerger extends Configured implements Tool { + public static final Log LOG = LogFactory.getLog(IndexMerger.class); + + public static final String DONE_NAME = "merge.done"; + + public IndexMerger() { + + } + + public IndexMerger(Configuration conf) { + setConf(conf); + } + + /** + * Merge all input indexes to the single output index + */ + public void merge(IndexReader[] readers, Path outputIndex, Path localWorkingDir, boolean parallel) throws IOException { + LOG.info("merging indexes to: " + outputIndex); + + FileSystem localFs = FileSystem.getLocal(getConf()); + if (localFs.exists(localWorkingDir)) { + localFs.delete(localWorkingDir, true); + } + localFs.mkdirs(localWorkingDir); + + // Get local output target + // + FileSystem fs = FileSystem.get(getConf()); + if (fs.exists(outputIndex)) { + throw new FileAlreadyExistsException("Output directory " + outputIndex + " already exists!"); + } + + Path tmpLocalOutput = new Path(localWorkingDir, "merge-output"); + Path localOutput = fs.startLocalOutput(outputIndex, tmpLocalOutput); + + // + // Merge indices + // + IndexWriter writer = new IndexWriter(localOutput.toString(), null, true); + writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR)); + writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS)); + writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS)); + writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL)); + writer.setInfoStream(LogUtil.getDebugStream(LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + writer.addIndexes(readers); + writer.close(); + + // + // Put target back + // + fs.completeLocalOutput(outputIndex, tmpLocalOutput); + LOG.info("done merging"); + } + + /** + * Create an index for the input files in the named directory. + */ + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new IndexMerger(), args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + String usage = "IndexMerger [-workingdir <workingdir>] [-p] outputIndex indexesDir...\n\t-p Input directories contain parallel indexes.\n"; + if (args.length < 2) + { + System.err.println("Usage: " + usage); + return -1; + } + + // + // Parse args, read all index directories to be processed + // + FileSystem fs = FileSystem.get(getConf()); + List<Path> indexDirs = new ArrayList<Path>(); + + Path workDir = new Path("indexmerger-" + System.currentTimeMillis()); + int i = 0; + + boolean parallel=false; + + while ( args[i].startsWith( "-" ) ) + { + if ( "-workingdir".equals(args[i]) ) + { + i++; + workDir = new Path(args[i++], "indexmerger-" + System.currentTimeMillis()); + } + else if ( "-p".equals(args[i]) ) + { + i++; + parallel=true; + } + } + + Path outputIndex = new Path(args[i++]); + + List<IndexReader> readers = new ArrayList<IndexReader>( ); + + if ( ! 
parallel ) + { + for (; i < args.length; i++) + { + FileStatus[] fstats = fs.listStatus(new Path(args[i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); + + for ( Path p : HadoopFSUtil.getPaths(fstats) ) + { + LOG.info( "Adding reader for: " + p ); + readers.add( IndexReader.open( new FsDirectory( fs, p, false, getConf( ) ) ) ); + } + } + } + else + { + for (; i < args.length; i++) + { + FileStatus[] fstats = fs.listStatus(new Path(args[i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path parallelDirs[] = HadoopFSUtil.getPaths( fstats ); + + if ( parallelDirs.length < 1 ) + { + LOG.info( "No sub-directories, skipping: " + args[i] ); + + continue; + } + else + { + LOG.info( "Adding parallel reader for: " + args[i] ); + } + + ArchiveParallelReader preader = new ArchiveParallelReader( ); + + // Sort the parallelDirs so that we add them in order. Order + // matters to the ParallelReader. + Arrays.sort( parallelDirs ); + + for ( Path p : parallelDirs ) + { + LOG.info( " Adding to parallel reader: " + p.getName( ) ); + preader.add( IndexReader.open( new FsDirectory( fs, p, false, getConf( ) ) ) ); + } + + readers.add( preader ); + } + } + + // + // Merge the indices + // + + try { + merge(readers.toArray(new IndexReader[readers.size()]), outputIndex, workDir, parallel); + return 0; + } catch (Exception e) { + LOG.fatal("IndexMerger: " + StringUtils.stringifyException(e)); + return -1; + } finally { + FileSystem.getLocal(getConf()).delete(workDir, true); + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
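A minimal driver for the merger above, assuming the class is on the classpath together with Hadoop and Nutch; the index directory names are placeholders, and the invocation mirrors the usage string parsed in run() ("IndexMerger [-workingdir <workingdir>] [-p] outputIndex indexesDir..."):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.archive.nutchwax.IndexMerger;

public class MergeDriver {
  public static void main(String[] args) throws Exception {
    // Merge two directories of parallel indexes into "merged-index".
    // The -p flag makes IndexMerger build an ArchiveParallelReader for the
    // sorted sub-directories of each input, as in run() above.
    Configuration conf = NutchConfiguration.create();
    int rc = ToolRunner.run(conf, new IndexMerger(),
        new String[] { "-p", "merged-index", "indexes-a", "indexes-b" });
    System.exit(rc);
  }
}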
From: <bi...@us...> - 2009-10-28 00:25:36
Revision: 2854 http://archive-access.svn.sourceforge.net/archive-access/?rev=2854&view=rev Author: binzino Date: 2009-10-28 00:25:23 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2009-10-28 00:18:16 UTC (rev 2853) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2009-10-28 00:25:23 UTC (rev 2854) @@ -107,12 +107,12 @@ public String getExplanation(Query query, Hit hit) throws IOException { return luceneSearcher.explain(this.queryFilters.filter(query), - hit.getIndexDocNo()).toHtml(); + Integer.valueOf(hit.getUniqueKey())).toHtml(); } public HitDetails getDetails(Hit hit) throws IOException { - Document doc = luceneSearcher.doc(hit.getIndexDocNo()); + Document doc = luceneSearcher.doc(Integer.valueOf(hit.getUniqueKey())); List docFields = doc.getFields(); String[] fields = new String[docFields.size()]; @@ -173,13 +173,14 @@ { if ( "site".equals( dedupField ) ) { - String exactUrl = reader.document( doc ).get( "exacturl"); try { - java.net.URL u = new java.net.URL( exactUrl ); + String url = reader.document( doc ).get( "url"); + + java.net.URL u = new java.net.URL( url ); dedupValue = u.getHost(); - System.out.println("Dedup value hack:" + dedupValue); + System.out.println( "Dedup value hack:" + dedupValue ); } catch ( java.net.MalformedURLException e ) { @@ -192,7 +193,7 @@ } } - hits[i] = new Hit(doc, sortValue, dedupValue); + hits[i] = new Hit(Integer.toString(doc), sortValue, dedupValue); } return new Hits(topDocs.totalHits, hits); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
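The change replaces hit.getIndexDocNo() with the hit's unique key, which revision 2854 now populates with Integer.toString(doc), i.e. the Lucene document number as a String. A hedged sketch of the resulting lookup pattern (variable names are illustrative):

// The Hit carries its Lucene document number as a String unique key, so
// consumers parse it back to an int before asking the Lucene searcher.
int luceneDocId = Integer.valueOf(hit.getUniqueKey());
Document doc = luceneSearcher.doc(luceneDocId);
List docFields = doc.getFields();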
Revision: 2853 http://archive-access.svn.sourceforge.net/archive-access/?rev=2853&view=rev Author: bradtofel Date: 2009-10-28 00:18:16 +0000 (Wed, 28 Oct 2009) Log Message: ----------- FEATURE: optionally can add results that are filtered to an "annotater" which is really a QueryFilterGroup. Later the QueryFilterGroup can annotate the search results with "close matches" Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HostMatchFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HostMatchFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HostMatchFilter.java 2009-10-28 00:16:36 UTC (rev 2852) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HostMatchFilter.java 2009-10-28 00:18:16 UTC (rev 2853) @@ -25,6 +25,7 @@ package org.archive.wayback.resourceindex.filters; import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.filterfactory.QueryCaptureFilterGroup; import org.archive.wayback.util.ObjectFilter; /** @@ -37,10 +38,20 @@ public class HostMatchFilter implements ObjectFilter<CaptureSearchResult> { private String hostname = null; + private QueryCaptureFilterGroup annotationTarget = null; /** * @param hostname String of original host to match */ + public HostMatchFilter(final String hostname, + QueryCaptureFilterGroup annotationTarget) { + this.hostname = hostname; + this.annotationTarget = annotationTarget; + } + + /** + * @param hostname String of original host to match + */ public HostMatchFilter(final String hostname) { this.hostname = hostname; } @@ -50,6 +61,13 @@ */ public int filterObject(CaptureSearchResult r) { String origHost = r.getOriginalHost(); - return hostname.equals(origHost) ? FILTER_INCLUDE : FILTER_EXCLUDE; + if(hostname.equals(origHost)) { + return FILTER_INCLUDE; + } else { + if(annotationTarget != null) { + annotationTarget.addCloseMatch(origHost, r.getOriginalUrl()); + } + return FILTER_EXCLUDE; + } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
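A sketch of how the new two-argument constructor might be wired, assuming the QueryCaptureFilterGroup introduced in revision 2851 acts as the annotation target; the hostname and the result object are placeholders, and exception handling is omitted:

// Non-matching hosts are reported to the group rather than silently dropped,
// so the group can later surface them as "close matches" in the results.
QueryCaptureFilterGroup group = new QueryCaptureFilterGroup(request, canonicalizer);
HostMatchFilter filter = new HostMatchFilter("www.example.org", group);
int decision = filter.filterObject(result); // FILTER_INCLUDE or FILTER_EXCLUDE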
From: <bra...@us...> - 2009-10-28 00:17:42
Revision: 2852 http://archive-access.svn.sourceforge.net/archive-access/?rev=2852&view=rev Author: bradtofel Date: 2009-10-28 00:16:36 +0000 (Wed, 28 Oct 2009) Log Message: ----------- INITIAL REV: experimental result filters to include results based on the file field, either as a string prefix, or a regex Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FilePrefixFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FileRegexFilter.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FilePrefixFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FilePrefixFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FilePrefixFilter.java 2009-10-28 00:16:36 UTC (rev 2852) @@ -0,0 +1,26 @@ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +public class FilePrefixFilter implements ObjectFilter<CaptureSearchResult> { + + private String prefixes[] = null; + + public String[] getPrefixes() { + return prefixes; + } + public void setPrefixes(String[] prefixes) { + this.prefixes = prefixes; + } + + public int filterObject(CaptureSearchResult o) { + final String file = o.getFile(); + for(String prefix : prefixes) { + if(file.startsWith(prefix)) { + return FILTER_INCLUDE; + } + } + return FILTER_EXCLUDE; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FileRegexFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FileRegexFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/FileRegexFilter.java 2009-10-28 00:16:36 UTC (rev 2852) @@ -0,0 +1,39 @@ +package org.archive.wayback.resourceindex.filters; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +public class FileRegexFilter implements ObjectFilter<CaptureSearchResult> { + + private Pattern patterns[] = null; + + public List<String> getPatterns() { + ArrayList<String> s = new ArrayList<String>(); + for(Pattern p : patterns) { + s.add(p.pattern()); + } + return s; + } + + public void setPatterns(List<String> patternStrings) { + int size = patternStrings.size(); + patterns = new Pattern[size]; + for(int i = 0; i < size; i++) { + patterns[i] = Pattern.compile(patternStrings.get(i)); + } + } + + public int filterObject(CaptureSearchResult o) { + final String file = o.getFile(); + for(Pattern pattern : patterns) { + if(pattern.matcher(file).find()) { + return FILTER_INCLUDE; + } + } + return FILTER_EXCLUDE; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
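Both filters are plain bean-style classes, so wiring them by hand looks roughly like the sketch below; the prefix and pattern strings are invented for illustration and would normally come from AccessPoint configuration (see revision 2851):

// Include only captures whose (W)ARC file name starts with a known prefix...
FilePrefixFilter prefixFilter = new FilePrefixFilter();
prefixFilter.setPrefixes(new String[] { "IA-2009-", "IA-2008-" });

// ...or matches a regular expression.
FileRegexFilter regexFilter = new FileRegexFilter();
regexFilter.setPatterns(java.util.Arrays.asList("^IA-\\d{4}-.*\\.arc\\.gz$"));

// Each filterObject(result) call returns FILTER_INCLUDE when the capture's
// file field matches, FILTER_EXCLUDE otherwise.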
From: <bra...@us...> - 2009-10-28 00:17:15
Revision: 2851 http://archive-access.svn.sourceforge.net/archive-access/?rev=2851&view=rev Author: bradtofel Date: 2009-10-28 00:14:40 +0000 (Wed, 28 Oct 2009) Log Message: ----------- REFACTOR: major overhaul of resource index query filtering, moving much of the logic out of LocalResourceIndex into ...wayback.resourceindex.filterfactory.* Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2009-10-28 00:08:00 UTC (rev 2850) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -25,7 +25,9 @@ package org.archive.wayback.resourceindex; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import org.apache.commons.httpclient.URIException; import org.archive.wayback.ResourceIndex; @@ -41,36 +43,59 @@ import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; -import org.archive.wayback.resourceindex.adapters.ConditionalGetAnnotationSearchResultAdapter; import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultAdapter; -import org.archive.wayback.resourceindex.adapters.DeduplicationSearchResultAnnotationAdapter; -import 
org.archive.wayback.resourceindex.filters.CounterFilter; -import org.archive.wayback.resourceindex.filters.DateRangeFilter; -import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; -import org.archive.wayback.resourceindex.filters.GuardRailFilter; -import org.archive.wayback.resourceindex.filters.HostMatchFilter; -import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; -import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; -import org.archive.wayback.resourceindex.filters.UrlMatchFilter; -import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter; -import org.archive.wayback.resourceindex.filters.WindowEndFilter; -import org.archive.wayback.resourceindex.filters.WindowStartFilter; -import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup; +import org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.FilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.QueryCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.WindowFilterGroup; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.ObjectFilterChain; import org.archive.wayback.util.ObjectFilterIterator; -import org.archive.wayback.util.Timestamp; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; /** + * ResourceIndex implementation which assumes a "local" SearchResultSource. + * + * Extracting SearchResults from the source involves several layered steps: + * + * 1) extraction of results based on a prefix into the index + * 2) passing each result through a series of adapters + * these adapters can create new fields based on existing fields, or can + * annotate fields as they are scanned in order + * 3) filtering results based on request filters, which may come from + * * WaybackRequest-specific parameters. + * Ex. exact host match only, exact scheme match only, ... + * * AccessPoint-specific configuration + * Ex. only return records with (ARC/WARC) filename prefixed with XXX + * Ex. block any dates not older than 6 months + * 4) filtering based on AccessControl configurations + * Ex. block any urls with prefixes in file X + * 5) windowing filters, which provide pagination of the results, allowing + * requests to specify "show results between 10 and 20" + * 6) post filter adapters, which may annotate final results with other + * information + * Ex. for each result, consult DB to see if user-contributed messages + * apply to the results + * + * After all results have been processed, we annotate the final SearchResultS + * object with summary information about the results included. As we set up the + * chain of filters, we instrument the chain with counters that observe the + * number of results that went into, and came out of the Exclusion filters. + * + * If there were results presented to the Exclusion filter, but none were + * emitted from it, an AccessControlException is thrown. 
* - * * @author brad * @version $Date$, $Revision$ */ public class LocalResourceIndex implements ResourceIndex { + public final static int TYPE_REPLAY = 0; + public final static int TYPE_CAPTURE = 1; + public final static int TYPE_URL = 2; /** * maximum number of records to return @@ -89,25 +114,18 @@ private ObjectFilter<CaptureSearchResult> filter = null; + + List<FilterGroupFactory> fgFactories = null; + public LocalResourceIndex() { canonicalizer = new AggressiveUrlCanonicalizer(); + fgFactories = new ArrayList<FilterGroupFactory>(); + fgFactories.add(new CoreCaptureFilterGroupFactory()); + fgFactories.add(new QueryCaptureFilterGroupFactory()); + fgFactories.add(new ExclusionCaptureFilterGroupFactory()); + fgFactories.add(new AccessPointCaptureFilterGroupFactory()); } - private CloseableIterator<CaptureSearchResult> getCaptureIterator(String k) - throws ResourceIndexNotAvailableException { - - CloseableIterator<CaptureSearchResult> captures = - source.getPrefixIterator(k); - if(dedupeRecords) { - // hack hack!!! - captures = new AdaptedIterator<CaptureSearchResult, CaptureSearchResult> - (captures, new ConditionalGetAnnotationSearchResultAdapter()); - captures = new AdaptedIterator<CaptureSearchResult, CaptureSearchResult> - (captures, new DeduplicationSearchResultAnnotationAdapter()); - } - return captures; - } - private void cleanupIterator(CloseableIterator<? extends SearchResult> itr) throws ResourceIndexNotAvailableException { try { @@ -119,42 +137,64 @@ } } + private List<CaptureFilterGroup> getRequestFilterGroups(WaybackRequest r) + throws BadQueryException { + + ArrayList<CaptureFilterGroup> groups = + new ArrayList<CaptureFilterGroup>(); + for(FilterGroupFactory f : fgFactories) { + groups.add(f.getGroup(r, canonicalizer, this)); + } + return groups; + } + + public CaptureSearchResults doCaptureQuery(WaybackRequest wbRequest, int type) throws ResourceIndexNotAvailableException, ResourceNotInArchiveException, BadQueryException, AccessControlException { - - CaptureSearchResults results = new CaptureSearchResults(); - CaptureQueryFilterState filterState = - new CaptureQueryFilterState(wbRequest, canonicalizer, type, - getUserFilters(wbRequest)); - String keyUrl = filterState.getKeyUrl(); + String urlKey; + try { + urlKey = canonicalizer.urlStringToKey(wbRequest.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad URL(" + + wbRequest.getRequestUrl() + ")"); + } - CloseableIterator<CaptureSearchResult> itr = getCaptureIterator(keyUrl); - // set up the common Filters: - ObjectFilter<CaptureSearchResult> filter = filterState.getFilter(); - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,filter); + // the CaptureSearchResults we are about to return: + CaptureSearchResults results = new CaptureSearchResults(); + // the various filters to apply to the results: + ObjectFilterChain<CaptureSearchResult> filters = + new ObjectFilterChain<CaptureSearchResult>(); - // Windowing: - WindowFilterState<CaptureSearchResult> window = - new WindowFilterState<CaptureSearchResult>(wbRequest); - ObjectFilter<CaptureSearchResult> windowFilter = window.getFilter(); - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,windowFilter); - - - if(annotater != null) { - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,annotater); + // Groupings of filters for... 
sanity and summary annotation of results: + // Windows: + WindowFilterGroup<CaptureSearchResult> window = + new WindowFilterGroup<CaptureSearchResult>(wbRequest,this); + List<CaptureFilterGroup> groups = getRequestFilterGroups(wbRequest); + + for(CaptureFilterGroup cfg : groups) { + filters.addFilters(cfg.getFilters()); } + filters.addFilters(window.getFilters()); + CloseableIterator<CaptureSearchResult> itr = + new ObjectFilterIterator<CaptureSearchResult>( + source.getPrefixIterator(urlKey),filters); + while(itr.hasNext()) { results.addSearchResult(itr.next()); } - filterState.annotateResults(results); + for(CaptureFilterGroup cfg : groups) { + cfg.annotateResults(results); + } window.annotateResults(results); + cleanupIterator(itr); - return results; + + return results; } public UrlSearchResults doUrlQuery(WaybackRequest wbRequest) @@ -162,38 +202,61 @@ ResourceNotInArchiveException, BadQueryException, AccessControlException { + String urlKey; + try { + urlKey = canonicalizer.urlStringToKey(wbRequest.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad URL(" + + wbRequest.getRequestUrl() + ")"); + } + UrlSearchResults results = new UrlSearchResults(); - CaptureQueryFilterState filterState = - new CaptureQueryFilterState(wbRequest,canonicalizer, - CaptureQueryFilterState.TYPE_URL, getUserFilters(wbRequest)); - String keyUrl = filterState.getKeyUrl(); + // the various CAPTURE filters to apply to the results: + ObjectFilterChain<CaptureSearchResult> cFilters = + new ObjectFilterChain<CaptureSearchResult>(); - CloseableIterator<CaptureSearchResult> citr = getCaptureIterator(keyUrl); - // set up the common Filters: - ObjectFilter<CaptureSearchResult> filter = filterState.getFilter(); - citr = new ObjectFilterIterator<CaptureSearchResult>(citr,filter); - // adapt into UrlSearchResult: + // Groupings of filters for clarity(?) 
and summary annotation of + // results: + List<CaptureFilterGroup> groups = getRequestFilterGroups(wbRequest); + for(CaptureFilterGroup cfg : groups) { + cFilters.addFilters(cfg.getFilters()); + } - CloseableIterator<UrlSearchResult> itr = - CaptureToUrlSearchResultAdapter.adaptCaptureIterator(citr); + CloseableIterator<CaptureSearchResult> itrC = + new ObjectFilterIterator<CaptureSearchResult>( + source.getPrefixIterator(urlKey),cFilters); + + // we've filtered the appropriate CaptureResult objects within the + // iterator, now we're going to convert whatever records make it past + // the filters into UrlSearchResults, and then do further window + // filtering on those results: + // Windows: + // the window URL filters to apply to the results, once they're + // UrlSearchResult objects + ObjectFilterChain<UrlSearchResult> uFilters = + new ObjectFilterChain<UrlSearchResult>(); + WindowFilterGroup<UrlSearchResult> window = + new WindowFilterGroup<UrlSearchResult>(wbRequest,this); + uFilters.addFilters(window.getFilters()); + CloseableIterator<UrlSearchResult> itrU = + new ObjectFilterIterator<UrlSearchResult>( + CaptureToUrlSearchResultAdapter.adaptCaptureIterator(itrC), + uFilters); - // Windowing: - WindowFilterState<UrlSearchResult> window = - new WindowFilterState<UrlSearchResult>(wbRequest); - ObjectFilter<UrlSearchResult> windowFilter = window.getFilter(); - itr = new ObjectFilterIterator<UrlSearchResult>(itr,windowFilter); - - while(itr.hasNext()) { - results.addSearchResult(itr.next()); + while(itrU.hasNext()) { + results.addSearchResult(itrU.next()); } - filterState.annotateResults(results); + for(CaptureFilterGroup cfg : groups) { + cfg.annotateResults(results); + } window.annotateResults(results); - cleanupIterator(itr); - - return results; + + cleanupIterator(itrU); + + return results; } /* @@ -209,15 +272,13 @@ if (wbRequest.isReplayRequest()) { - results = doCaptureQuery(wbRequest, - CaptureQueryFilterState.TYPE_REPLAY); + results = doCaptureQuery(wbRequest, TYPE_REPLAY); results.putFilter(WaybackRequest.REQUEST_TYPE, WaybackRequest.REQUEST_REPLAY_QUERY); } else if (wbRequest.isCaptureQueryRequest()) { - results = doCaptureQuery(wbRequest, - CaptureQueryFilterState.TYPE_CAPTURE); + results = doCaptureQuery(wbRequest, TYPE_CAPTURE); results.putFilter(WaybackRequest.REQUEST_TYPE, WaybackRequest.REQUEST_CAPTURE_QUERY); @@ -259,7 +320,11 @@ public void setMaxRecords(int maxRecords) { this.maxRecords = maxRecords; } + public int getMaxRecords() { + return maxRecords; + } + /** * @param source the source to set */ @@ -302,190 +367,4 @@ public void setFilter(ObjectFilter<CaptureSearchResult> filter) { this.filter = filter; } - - public ObjectFilterChain<CaptureSearchResult> getUserFilters(WaybackRequest request) { - ObjectFilterChain<CaptureSearchResult> userFilters = - new ObjectFilterChain<CaptureSearchResult>(); - - // has the user asked for only results on the exact host specified? 
- if(request.isExactHost()) { - userFilters.addFilter(new HostMatchFilter( - UrlOperations.urlToHost(request.getRequestUrl()))); - } - - if(request.isExactScheme()) { - userFilters.addFilter(new SchemeMatchFilter( - UrlOperations.urlToScheme(request.getRequestUrl()))); - } - if(filter != null) { - userFilters.addFilter(filter); - } - - return userFilters; - } - - private class CaptureQueryFilterState { - public final static int TYPE_REPLAY = 0; - public final static int TYPE_CAPTURE = 1; - public final static int TYPE_URL = 2; - - private ObjectFilterChain<CaptureSearchResult> filter = null; - private CounterFilter finalCounter = null; - private CounterFilter preExclusionCounter = null; - private String keyUrl = null; - private String startDate; - private String endDate; - private String exactDate; - - public CaptureQueryFilterState(WaybackRequest request, - UrlCanonicalizer canonicalizer, int type, - ObjectFilterChain<CaptureSearchResult> userFilter) - throws BadQueryException { - - String searchUrl = request.getRequestUrl(); - try { - keyUrl = canonicalizer.urlStringToKey(searchUrl); - } catch (URIException e) { - throw new BadQueryException("invalid " - + WaybackRequest.REQUEST_URL + " " + searchUrl); - } - - filter = new ObjectFilterChain<CaptureSearchResult>(); - startDate = request.getStartTimestamp(); - if(startDate == null) { - startDate = Timestamp.earliestTimestamp().getDateStr(); - } - endDate = request.getEndTimestamp(); - if(endDate == null) { - endDate = Timestamp.latestTimestamp().getDateStr(); - } - if(type == TYPE_REPLAY) { - exactDate = request.getReplayTimestamp(); - if(exactDate == null) { - exactDate = Timestamp.latestTimestamp().getDateStr(); - } - } - - finalCounter = new CounterFilter(); - preExclusionCounter = new CounterFilter(); - DateRangeFilter drFilter = new DateRangeFilter(startDate,endDate); - - // checks an exclusion service for every matching record - ObjectFilter<CaptureSearchResult> exclusion = - request.getExclusionFilter(); - - - // makes sure we don't inspect too many records: prevents DOS - filter.addFilter(new GuardRailFilter(maxRecords)); - filter.addFilter(new DuplicateRecordFilter()); - - if(type == TYPE_REPLAY) { - filter.addFilter(new UrlMatchFilter(keyUrl)); - filter.addFilter(drFilter); - SelfRedirectFilter selfRedirectFilter= new SelfRedirectFilter(); - selfRedirectFilter.setCanonicalizer(canonicalizer); - filter.addFilter(selfRedirectFilter); - } else if(type == TYPE_CAPTURE){ - filter.addFilter(new UrlMatchFilter(keyUrl)); - filter.addFilter(drFilter); - } else if(type == TYPE_URL) { - filter.addFilter(new UrlPrefixMatchFilter(keyUrl)); - filter.addFilter(drFilter); - } else { - throw new BadQueryException("Unknown type"); - } - - if(userFilter != null) { - filter.addFilters(userFilter.getFilters()); - } - - // count how many results got to the ExclusionFilter: - filter.addFilter(preExclusionCounter); - - if(exclusion != null) { - filter.addFilter(exclusion); - } - - // count how many results got past the ExclusionFilter, or how - // many total matched, if there was no ExclusionFilter: - filter.addFilter(finalCounter); - } - public String getKeyUrl() { - return keyUrl; - } - public ObjectFilter<CaptureSearchResult> getFilter() { - return filter; - } - public void annotateResults(SearchResults results) - throws AccessControlException, ResourceNotInArchiveException { - - int matched = finalCounter.getNumMatched(); - if (matched == 0) { - if (preExclusionCounter != null) { - if(preExclusionCounter.getNumMatched() > 0) { - throw new 
AccessControlException("All results Excluded"); - } - } - throw new ResourceNotInArchiveException("the URL " + keyUrl - + " is not in the archive."); - } - // now we need to set some filter properties on the results: - results.putFilter(WaybackRequest.REQUEST_URL, keyUrl); - results.putFilter(WaybackRequest.REQUEST_START_DATE, startDate); - results.putFilter(WaybackRequest.REQUEST_END_DATE, endDate); - if(exactDate != null) { - results.putFilter(WaybackRequest.REQUEST_EXACT_DATE, exactDate); - } - } - } - - private class WindowFilterState<T> { - int startResult; // calculated based on hits/page * pagenum - int resultsPerPage; - int pageNum; - ObjectFilterChain<T> windowFilters; - WindowStartFilter<T> startFilter; - WindowEndFilter<T> endFilter; - public WindowFilterState(WaybackRequest request) - throws BadQueryException { - - windowFilters = new ObjectFilterChain<T>(); - // first grab all the info from the WaybackRequest, and validate it: - resultsPerPage = request.getResultsPerPage(); - pageNum = request.getPageNum(); - - if (resultsPerPage < 1) { - throw new BadQueryException("resultsPerPage cannot be < 1"); - } - if (resultsPerPage > maxRecords) { - throw new BadQueryException("resultsPerPage cannot be > " - + maxRecords); - } - if (pageNum < 1) { - throw new BadQueryException("pageNum must be > 0"); - } - startResult = (pageNum - 1) * resultsPerPage; - startFilter = new WindowStartFilter<T>(startResult); - endFilter = new WindowEndFilter<T>(resultsPerPage); - windowFilters.addFilter(startFilter); - windowFilters.addFilter(endFilter); - } - public ObjectFilter<T> getFilter() { - return windowFilters; - } - public void annotateResults(SearchResults results) - throws BadQueryException { - results.setFirstReturned(startResult); - results.setNumRequested(resultsPerPage); - int numSeen = endFilter.getNumSeen(); - if(numSeen == 0) { - throw new BadQueryException("No results in requested window"); - } - // how many went by the filters: - results.setMatchingCount(startFilter.getNumSeen()); - - // how many were actually returned: - results.setReturnedCount(endFilter.getNumReturned()); - } - } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,48 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.filters.FilePrefixFilter; +import org.archive.wayback.resourceindex.filters.FileRegexFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class AccessPointCaptureFilterGroup implements CaptureFilterGroup { + private ObjectFilterChain<CaptureSearchResult> chain = null; + private final static String[] sA = 
null; + + public AccessPointCaptureFilterGroup(WaybackRequest request) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + List<String> prefixes = null; + if(request.getAccessPoint() != null) { + prefixes = request.getAccessPoint().getFilePrefixes(); + if(prefixes != null && prefixes.size() > 0) { + FilePrefixFilter f = new FilePrefixFilter(); + f.setPrefixes(prefixes.toArray(sA)); + chain.addFilter(f); + } + List<String> patterns = request.getAccessPoint().getFilePatterns(); + if(patterns != null && patterns.size() > 0) { + FileRegexFilter f = new FileRegexFilter(); + f.setPatterns(patterns); + chain.addFilter(f); + } + } + } + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException { + + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class AccessPointCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new AccessPointCaptureFilterGroup(request); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,18 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.util.ObjectFilter; + +public interface CaptureFilterGroup { + public List<ObjectFilter<CaptureSearchResult>> getFilters(); + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,35 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.resourceindex.LocalResourceIndex; +import org.archive.wayback.resourceindex.filters.ConditionalGetAnnotationFilter; +import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; +import org.archive.wayback.resourceindex.filters.GuardRailFilter; +import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class CoreCaptureFilterGroup implements CaptureFilterGroup { + private ObjectFilterChain<CaptureSearchResult> chain = null; + + public CoreCaptureFilterGroup(LocalResourceIndex index) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + chain.addFilter(new GuardRailFilter(index.getMaxRecords())); + chain.addFilter(new DuplicateRecordFilter()); + if(index.isDedupeRecords()) { + chain.addFilter(new WARCRevisitAnnotationFilter()); + chain.addFilter(new ConditionalGetAnnotationFilter()); + } + } + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) { + // TODO: ask guardRailFilter if it aborted processing (too many records) + // and annotate the results with info about how to continue the request? 
+ } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class CoreCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new CoreCaptureFilterGroup(index); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,61 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.filters.CounterFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class ExclusionCaptureFilterGroup implements CaptureFilterGroup { + + private ObjectFilterChain<CaptureSearchResult> chain = null; + private CounterFilter preCounter = null; + private CounterFilter postCounter = null; + String requestUrl = null; + + public ExclusionCaptureFilterGroup(WaybackRequest request) { + + // checks an exclusion service for every matching record + ObjectFilter<CaptureSearchResult> exclusion = + request.getExclusionFilter(); + chain = new ObjectFilterChain<CaptureSearchResult>(); + if(exclusion != null) { + preCounter = new CounterFilter(); + // count how many results got to the ExclusionFilter: + chain.addFilter(preCounter); + chain.addFilter(exclusion); + // count how many results got past the ExclusionFilter: + requestUrl = request.getRequestUrl(); + } + postCounter = new CounterFilter(); + chain.addFilter(postCounter); + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) + throws AccessControlException, ResourceNotInArchiveException { + if(postCounter.getNumMatched() == 0) { + + // nothing got to the counter after exclusions. 
If we have + // exclusions (detected by preCounter being non-null, and the + // preCounter passed any results, then they were all filtered by + // the exclusions filter. + if(preCounter != null && preCounter.getNumMatched() > 0) { + throw new AccessControlException("All results Excluded"); + } + ResourceNotInArchiveException e = + new ResourceNotInArchiveException("the URL " + requestUrl + + " is not in the archive."); + e.setCloseMatches(results.getCloseMatches()); + throw e; + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class ExclusionCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new ExclusionCaptureFilterGroup(request); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,12 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public interface FilterGroupFactory { + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,120 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.httpclient.URIException; +import org.archive.wayback.UrlCanonicalizer; +import 
org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.filters.DateRangeFilter; +import org.archive.wayback.resourceindex.filters.HostMatchFilter; +import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; +import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; +import org.archive.wayback.resourceindex.filters.UrlMatchFilter; +import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; +import org.archive.wayback.util.Timestamp; +import org.archive.wayback.util.url.UrlOperations; + +public class QueryCaptureFilterGroup implements CaptureFilterGroup { +// private ObjectFilter<CaptureSearchResult> prefixFilter = null; +// private ObjectFilter<CaptureSearchResult> dateFilter = null; +// private ObjectFilter<CaptureSearchResult> selfRedirectFilter = null; +// private ObjectFilter<CaptureSearchResult> exactHost = null; +// private ObjectFilter<CaptureSearchResult> exactScheme = null; + private ObjectFilterChain<CaptureSearchResult> chain = null; + private String requestType = null; + private String keyUrl = null; + private String startDate; + private String endDate; + private String exactDate; + /** + * List of URL Strings that are "close" to the current request, but not + * included in the current CaptureSearchResults. + */ + private Map<String,String> closeMatches = new HashMap<String,String>(); + + + public QueryCaptureFilterGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer) + throws BadQueryException { + + requestType = request.get(WaybackRequest.REQUEST_TYPE); + + // URL-Filters: + chain = new ObjectFilterChain<CaptureSearchResult>(); + try { + keyUrl = canonicalizer.urlStringToKey(request.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad request URL(" + + request.getRequestUrl() +")"); + } + if(request.isReplayRequest()) { + exactDate = request.getReplayTimestamp(); + if(exactDate == null) { + exactDate = Timestamp.latestTimestamp().getDateStr(); + } + chain.addFilter(new UrlMatchFilter(keyUrl)); + chain.addFilter(new SelfRedirectFilter(canonicalizer)); + + } else if(request.isCaptureQueryRequest()) { + chain.addFilter(new UrlMatchFilter(keyUrl)); + } else if(request.isUrlQueryRequest()) { + chain.addFilter(new UrlPrefixMatchFilter(keyUrl)); + } + + // Date-Filters: + startDate = request.getStartTimestamp(); + if(startDate == null) { + startDate = Timestamp.earliestTimestamp().getDateStr(); + } + endDate = request.getEndTimestamp(); + if(endDate == null) { + endDate = Timestamp.latestTimestamp().getDateStr(); + } + chain.addFilter(new DateRangeFilter(startDate, endDate)); + + // Other Filters: + if(request.isExactHost()) { + chain.addFilter( + new HostMatchFilter( + UrlOperations.urlToHost(request.getRequestUrl()), + this) + ); + } + + if(request.isExactScheme()) { + chain.addFilter(new SchemeMatchFilter( + UrlOperations.urlToScheme(request.getRequestUrl()),this)); + } + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) { + + // set the filter properties on the results: + results.putFilter(WaybackRequest.REQUEST_URL, keyUrl); + results.putFilter(WaybackRequest.REQUEST_START_DATE, startDate); + 
results.putFilter(WaybackRequest.REQUEST_END_DATE, endDate); + if(exactDate != null) { + results.putFilter(WaybackRequest.REQUEST_EXACT_DATE, exactDate); + } + results.putFilter(WaybackRequest.REQUEST_TYPE, requestType); + if(!closeMatches.isEmpty()) { + results.setCloseMatches(new ArrayList<String>(closeMatches.values())); + } + } + + public void addCloseMatch(String host, String closeMatch) { + closeMatches.put(host, closeMatch); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class QueryCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new QueryCaptureFilterGroup(request,canonicalizer); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,63 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; +import org.archive.wayback.resourceindex.filters.WindowEndFilter; +import org.archive.wayback.resourceindex.filters.WindowStartFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class WindowFilterGroup<T> { + int startResult; // calculated based on hits/page * pagenum + int resultsPerPage; + int pageNum; + ObjectFilterChain<T> windowFilters; + WindowStartFilter<T> startFilter; + WindowEndFilter<T> endFilter; + public WindowFilterGroup(WaybackRequest request, LocalResourceIndex index) + throws BadQueryException { + + windowFilters = new ObjectFilterChain<T>(); + // first grab all the info from the WaybackRequest, and validate it: + resultsPerPage = request.getResultsPerPage(); + pageNum = request.getPageNum(); + + if (resultsPerPage < 1) { + throw new BadQueryException("resultsPerPage cannot be < 1"); + } + if (resultsPerPage > index.getMaxRecords()) { + throw new BadQueryException("resultsPerPage cannot be > " + + index.getMaxRecords()); + } + if (pageNum < 1) { + throw new BadQueryException("pageNum must be > 
0"); + } + startResult = (pageNum - 1) * resultsPerPage; + startFilter = new WindowStartFilter<T>(startResult); + endFilter = new WindowEndFilter<T>(resultsPerPage); + windowFilters.addFilter(startFilter); + windowFilters.addFilter(endFilter); + } + public List<ObjectFilter<T>> getFilters() { + return windowFilters.getFilters(); + } + + public void annotateResults(SearchResults results) + throws BadQueryException { + results.setFirstReturned(startResult); + results.setNumRequested(resultsPerPage); + int numSeen = endFilter.getNumSeen(); + if(numSeen == 0) { + throw new BadQueryException("No results in requested window"); + } + // how many went by the filters: + results.setMatchingCount(startFilter.getNumSeen()); + + // how many were actually returned: + results.setReturnedCount(endFilter.getNumReturned()); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-28 00:08:09
|
Revision: 2850 http://archive-access.svn.sourceforge.net/archive-access/?rev=2850&view=rev Author: bradtofel Date: 2009-10-28 00:08:00 +0000 (Wed, 28 Oct 2009) Log Message: ----------- BUGFIX(unreported): test for null before attempting to decode String into long Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java 2009-10-28 00:06:30 UTC (rev 2849) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java 2009-10-28 00:08:00 UTC (rev 2850) @@ -213,7 +213,8 @@ } public long getEndOffset() { if(cachedEndOffset == -1) { - cachedEndOffset = Long.parseLong(get(CAPTURE_END_OFFSET)); + String tmp = get(CAPTURE_END_OFFSET); + cachedEndOffset = tmp == null ? -1 : Long.parseLong(tmp); } return cachedEndOffset; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
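A small illustration of the null-guard pattern the fix above introduces: the lazily parsed offset stays at its -1 sentinel when the underlying field is absent, instead of letting Long.parseLong fail. The LazyLongField class and the "endoffset" key are hypothetical stand-ins for CaptureSearchResult and CAPTURE_END_OFFSET.

// Hypothetical illustration of the lazy parse-with-null-guard pattern.
import java.util.HashMap;
import java.util.Map;

public class LazyLongField {
  private final Map<String, String> data = new HashMap<String, String>();
  private long cachedEndOffset = -1;

  public long getEndOffset() {
    if (cachedEndOffset == -1) {
      String tmp = data.get("endoffset");                        // field may legitimately be absent
      cachedEndOffset = tmp == null ? -1 : Long.parseLong(tmp);  // avoids the NumberFormatException parseLong(null) would throw
    }
    return cachedEndOffset;
  }

  public static void main(String[] args) {
    System.out.println(new LazyLongField().getEndOffset());      // prints -1 rather than throwing
  }
}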
Revision: 2849 http://archive-access.svn.sourceforge.net/archive-access/?rev=2849&view=rev Author: bradtofel Date: 2009-10-28 00:06:30 +0000 (Wed, 28 Oct 2009) Log Message: ----------- FEATURE: Added parsing of ArchivalURL requests within proxy mode, with the main benefit of being able to include a date in the incoming requests. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyArchivalRequestParser.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyArchivalRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyArchivalRequestParser.java 2009-10-28 00:04:43 UTC (rev 2848) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyArchivalRequestParser.java 2009-10-28 00:06:30 UTC (rev 2849) @@ -1,3 +1,27 @@ +/* ProxyArchivalRequestParser + * + * $Id$ + * + * Created on 4:01:04 PM Apr 6, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ package org.archive.wayback.proxy; import java.util.List; @@ -2,2 +26,4 @@ +import javax.servlet.http.HttpServletRequest; + import org.archive.wayback.RequestParser; @@ -7,10 +33,24 @@ import org.archive.wayback.archivalurl.requestparser.PathDateRangeQueryRequestParser; import org.archive.wayback.archivalurl.requestparser.PathPrefixDatePrefixQueryRequestParser; import org.archive.wayback.archivalurl.requestparser.PathPrefixDateRangeQueryRequestParser; +import org.archive.wayback.archivalurl.requestparser.ReplayRequestParser; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.BetterRequestException; +import org.archive.wayback.requestparser.CompositeRequestParser; import org.archive.wayback.requestparser.FormRequestParser; import org.archive.wayback.requestparser.OpenSearchRequestParser; +import org.archive.wayback.util.bdb.BDBMap; +import org.archive.wayback.webapp.AccessPoint; -public class ProxyArchivalRequestParser extends ProxyRequestParser { +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class ProxyArchivalRequestParser extends CompositeRequestParser { private ProxyReplayRequestParser prrp = new ProxyReplayRequestParser(this); protected RequestParser[] getRequestParsers() { prrp.init(); @@ -20,6 +60,7 @@ new PathDateRangeQueryRequestParser(this), new PathPrefixDatePrefixQueryRequestParser(this), new PathPrefixDateRangeQueryRequestParser(this), + new ReplayRequestParser(this), new OpenSearchRequestParser(this), new FormRequestParser(this) }; @@ -31,4 +72,37 @@ public void setLocalhostNames(List<String> 
localhostNames) { prrp.setLocalhostNames(localhostNames); } + + public WaybackRequest parse(HttpServletRequest httpRequest, + AccessPoint wbContext) throws BadQueryException, BetterRequestException { + + WaybackRequest wbRequest = super.parse(httpRequest, wbContext); + if (wbRequest != null) { + String id = httpRequest.getHeader("Proxy-Id"); + if (id == null) + id = httpRequest.getRemoteAddr(); + + // Get the id from the request. + // If no id, use the ip-address instead. + // Check if the parser parsed a replay request and found a + // timestamp. If so, then we need to store the timestamp and + // redirect, which is done with a BetterRequestException: + if(wbRequest.isReplayRequest()) { + String replayTimestamp = wbRequest.getReplayTimestamp(); + if(replayTimestamp != null) { + BDBMap.addTimestampForId(httpRequest.getContextPath(), + id, replayTimestamp); + } + throw new BetterRequestException(wbRequest.getRequestUrl()); + } + + // Then get the timestamp (or rather datestr) matching this id. + // TODO: This is hacky - need generic way to store session data + String replayDateStr = BDBMap.getTimestampForId( + httpRequest.getContextPath(), id); + wbRequest.setReplayTimestamp(replayDateStr); + wbRequest.setAnchorTimestamp(replayDateStr); + } + return wbRequest; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
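A rough sketch of the session idea behind the proxy change above: a replay request that carries a timestamp records it against the client's Proxy-Id (or IP address), and later requests from the same client are anchored to that date. A plain HashMap stands in for BDBMap here; the class name, ids, and timestamps are illustrative only.

// Sketch of per-client timestamp anchoring; a HashMap stands in for BDBMap.
import java.util.HashMap;
import java.util.Map;

public class ProxyTimestampAnchor {
  private final Map<String, String> timestampById = new HashMap<String, String>();

  // A replay request that carries a timestamp stores it under the client's id ...
  public void rememberTimestamp(String proxyId, String timestamp14) {
    timestampById.put(proxyId, timestamp14);
  }

  // ... and later proxied requests from the same client are anchored to that date.
  public String timestampFor(String proxyId, String fallback) {
    String ts = timestampById.get(proxyId);
    return ts != null ? ts : fallback;
  }

  public static void main(String[] args) {
    ProxyTimestampAnchor anchor = new ProxyTimestampAnchor();
    anchor.rememberTimestamp("10.0.0.7", "20091028000000");
    System.out.println(anchor.timestampFor("10.0.0.7", "latest")); // prints 20091028000000
  }
}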
From: <bi...@us...> - 2009-10-28 00:04:54
|
Revision: 2848 http://archive-access.svn.sourceforge.net/archive-access/?rev=2848&view=rev Author: binzino Date: 2009-10-28 00:04:43 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Ported fixes and updates from NW 0.12.9. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax Modified: trunk/archive-access/projects/nutchwax/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2009-10-28 00:04:14 UTC (rev 2847) +++ trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2009-10-28 00:04:43 UTC (rev 2848) @@ -40,32 +40,48 @@ case "$1" in import) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Importer $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Importer "$@" ;; pagerankdb) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.PageRankDb $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.PageRankDb "$@" ;; pagerankdbmerger) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.PageRankDbMerger $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.PageRankDbMerger "$@" ;; + pageranker) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.PageRanker "$@" + ;; + parsetextmerger) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.ParseTextCombiner "$@" + ;; add-dates) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DateAdder $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DateAdder "$@" ;; - dumpindex) + index) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpParallelIndex $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Indexer "$@" ;; - pageranker) + merge) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.PageRanker $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.IndexMerger "$@" ;; - parsetextmerger) + reboost) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.ParseTextCombiner $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.LengthNormUpdater "$@" ;; + dumpindex) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpParallelIndex "$@" + ;; + search) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.NutchWaxBean "$@" + ;; *) echo "" echo "Usage: nutchwax COMMAND" @@ -76,7 +92,11 @@ echo " pageranker Generate pagerank.txt file from 'pagerankdb's or 'linkdb's" echo " parsetextmerger Merge segement parse_text/part-nnnnn directories." echo " add-dates Add dates to a parallel index" + echo " index Build Lucene index from segment(s) without crawl & linkdbs" + echo " merge Merge indexes or parallel indexes" + echo " reboost Update document boosts based on pagerank info" echo " dumpindex Dump an index or set of parallel indices to stdout" + echo " search Query a search index" echo "" exit 1 ;; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 00:04:23
|
Revision: 2847 http://archive-access.svn.sourceforge.net/archive-access/?rev=2847&view=rev Author: binzino Date: 2009-10-28 00:04:14 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Obsoleted by DumpParallelIndex.java. Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java 2009-10-28 00:02:34 UTC (rev 2846) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java 2009-10-28 00:04:14 UTC (rev 2847) @@ -1,105 +0,0 @@ -/* - * Copyright (C) 2008 Internet Archive. - * - * This file is part of the archive-access tools project - * (http://sourceforge.net/projects/archive-access). - * - * The archive-access tools are free software; you can redistribute them and/or - * modify them under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or any - * later version. - * - * The archive-access tools are distributed in the hope that they will be - * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser - * Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License along with - * the archive-access tools; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.nutchwax.tools; - -import java.io.File; -import java.util.Iterator; - -import org.apache.lucene.index.IndexReader; - -public class DumpIndex -{ - public static void main(String[] args) throws Exception - { - String option = ""; - String indexDir = ""; - - if (args.length == 1) - { - indexDir = args[0]; - } - else if (args.length == 2) - { - option = args[0]; - indexDir = args[1]; - } - - if (! (new File(indexDir)).exists()) - { - usageAndExit(); - } - - if (option.equals("-f")) - { - listFields(indexDir); - } - else - { - dumpIndex(indexDir); - } - } - - private static void dumpIndex(String indexDir) throws Exception - { - IndexReader reader = IndexReader.open(indexDir); - - Object[] fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL).toArray(); - - for (int i = 0; i < fieldNames.length; i++) - { - System.out.print(fieldNames[i] + "\t"); - } - - System.out.println(); - - int numDocs = reader.numDocs(); - - for (int i = 0; i < numDocs; i++) - { - for (int j = 0; j < fieldNames.length; j++) - { - System.out.print(reader.document(i).get((String) fieldNames[j]) + "\t"); - } - - System.out.println(); - } - } - - private static void listFields(String indexDir) throws Exception - { - IndexReader reader = IndexReader.open(indexDir); - - Iterator it = reader.getFieldNames(IndexReader.FieldOption.ALL).iterator(); - - while (it.hasNext()) - { - System.out.println(it.next()); - } - - reader.close(); - } - - private static void usageAndExit() - { - System.out.println("Usage: DumpIndex [-f] index"); - System.exit(1); - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 00:02:45
|
Revision: 2846 http://archive-access.svn.sourceforge.net/archive-access/?rev=2846&view=rev Author: binzino Date: 2009-10-28 00:02:34 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Ported from NW 0.12.9. Factored into two classes to match the refactoring that occurred in Nutch 1.0. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Indexer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Indexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Indexer.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Indexer.java 2009-10-28 00:02:34 UTC (rev 2846) @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.indexer.lucene.LuceneWriter; +import org.apache.nutch.indexer.NutchIndexWriterFactory; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +/** Create indexes for segments. 
*/ +public class Indexer extends Configured implements Tool { + + public static final String DONE_NAME = "index.done"; + + public static final Log LOG = LogFactory.getLog(Indexer.class); + + public Indexer() { + super(null); + } + + public Indexer(Configuration conf) { + super(conf); + } + + public void index(Path luceneDir, List<Path> segments) + throws IOException { + LOG.info("Indexer: starting"); + + final JobConf job = new NutchJob(getConf()); + job.setJobName("index-lucene " + luceneDir); + + IndexerMapReduce.initMRJob(segments, job); + + FileOutputFormat.setOutputPath(job, luceneDir); + + LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job); + LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job); + + NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class); + + JobClient.runJob(job); + LOG.info("Indexer: done"); + } + + public int run(String[] args) throws Exception { + if (args.length < 2) { + System.err.println("Usage: Indexer <index> <segment> ..."); + return -1; + } + + final Path luceneDir = new Path(args[0]); + + final List<Path> segments = new ArrayList<Path>(); + for (int i = 1; i < args.length; i++) { + segments.add(new Path(args[i])); + } + + try { + index(luceneDir, segments); + return 0; + } catch (final Exception e) { + LOG.fatal("Indexer: " + StringUtils.stringifyException(e)); + return -1; + } + } + + public static void main(String[] args) throws Exception { + final int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args); + System.exit(res); + } +} Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2009-10-28 00:02:34 UTC (rev 2846) @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.indexer.IndexerOutputFormat; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilters; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseText; + +public class IndexerMapReduce extends Configured +implements Mapper<Text, Writable, Text, NutchWritable>, + Reducer<Text, NutchWritable, Text, NutchDocument> { + + public static final Log LOG = LogFactory.getLog(IndexerMapReduce.class); + + private IndexingFilters filters; + + public void configure(JobConf job) { + setConf(job); + this.filters = new IndexingFilters(getConf()); + } + + public void map(Text key, Writable value, + OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { + output.collect(key, new NutchWritable(value)); + } + + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, NutchDocument> output, Reporter reporter) + throws IOException { + ParseData parseData = null; + ParseText parseText = null; + while (values.hasNext()) { + final Writable value = values.next().get(); // unwrap + + if (value instanceof ParseData) { + parseData = (ParseData)value; + } else if (value instanceof ParseText) { + parseText = (ParseText)value; + } else if (LOG.isWarnEnabled()) { + LOG.warn("Unrecognized type: "+value.getClass()); + } + } + + if ( parseText == null || parseData == null ) { + return; + } + + NutchDocument doc = new NutchDocument(); + final Metadata metadata = parseData.getContentMeta(); + + if ( metadata.get(Nutch.SEGMENT_NAME_KEY) == null || + metadata.get(Nutch.SIGNATURE_KEY) == null ) + { + LOG.warn( "Skipping document, insufficient metadata: key=" + key + " metadata=" + metadata ); + return ; + } + + // add segment, used to map from merged index back to segment files + doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY)); + + // add digest, used by dedup + doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY)); + + final Parse parse = new ParseImpl(parseText, parseData); + try { + // run indexing filters + doc = this.filters.filter(doc, parse, key, /*fetchDatum*/ null, /*inlinks*/ null); + } catch (final IndexingException e) { + if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); } + return; + } + + // skip documents discarded by indexing filters + if (doc == null) return; + + output.collect(key, doc); + } + + public void close() throws IOException { } + + public static void initMRJob(Collection<Path> segments, + JobConf job) { + + for (final Path segment : segments) { + LOG.info("IndexerMapReduces: adding segment: " + 
segment); + FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME)); + FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME)); + } + + job.setInputFormat(SequenceFileInputFormat.class); + + job.setMapperClass(IndexerMapReduce.class); + job.setReducerClass(IndexerMapReduce.class); + + job.setOutputFormat(IndexerOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setMapOutputValueClass(NutchWritable.class); + job.setOutputValueClass(NutchWritable.class); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
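Since the new Indexer is a standard Hadoop Tool, it can also be driven programmatically instead of through the nutchwax wrapper script. A minimal sketch, with hypothetical index and segment paths; it assumes the NutchWAX and Hadoop jars are on the classpath.

// Hypothetical driver for the NutchWAX Indexer added above.
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.archive.nutchwax.Indexer;

public class RunIndexer {
  public static void main(String[] args) throws Exception {
    // Equivalent to: nutchwax index crawl/index-new crawl/segments/20091028000000
    String[] indexerArgs = { "crawl/index-new", "crawl/segments/20091028000000" };
    int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), indexerArgs);
    System.exit(res);
  }
}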
From: <bi...@us...> - 2009-10-28 00:01:20
|
Revision: 2845 http://archive-access.svn.sourceforge.net/archive-access/?rev=2845&view=rev Author: binzino Date: 2009-10-28 00:01:06 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Ported from NW 0.12.9. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java 2009-10-28 00:01:06 UTC (rev 2845) @@ -0,0 +1,333 @@ +package org.archive.nutchwax.tools; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.Collection; +import java.util.HashSet; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; + + +import org.apache.nutch.indexer.NutchSimilarity; + +/** + * This is heavily cribbed from org.apache.lucene.misc.LengthNormModifier + */ +public class LengthNormUpdater +{ + private static final String USAGE = + "Usage: LengthNormUpdater [OPTIONS] <pageranks> <index> [field1]...\n" + + "\n" + + "Update the norms of <index> with boosts based on values from <pageranks>\n" + + "\n" + + "Options:\n" + + "\t-s <classname> similarity implementation to use\n" + + "\t-v increase verbosity\n" + + "\n" + + "Reads the pagerank values from the <pageranks> file and calculates new\n" + + "norms for the documents based on the formula:\n" + + "\n" + + "\tnorm = similarity.lengthNorm * log10(pagerank)\n" + + "\n" + + "If fields are specified on the command-line, only they will be updated.\n" + + "If a specified field does not have norms, an error message is given and\n" + + "the program terminates without performing any updates.\n" + + "\n" + + "If no fields are given, all the fields in the index that have norms will\n" + + "be updated.\n" + + "\n" + + "The default similarity implementation is NutchSimilarity\n" + + "\n" + + "Examples:\n" + + "\n" + + "\tLengthNormUpdater pagerank.txt index\n" + + "\tLengthNormUpdater -v -v pagerank.txt index title content\n" + + "\n" + ; + + private static int VERBOSE = 0; + + /** + * + */ + public static void main( String[] args ) throws IOException + { + if ( args.length < 1 ) + { + System.err.print( USAGE ); + System.exit(1); + } + + 
Similarity s = new NutchSimilarity( ); + + int pos = 0; + for ( ; (pos < args.length) && args[pos].startsWith( "-" ) ; pos++ ) + { + if ( "-h".equals( args[pos] ) ) + { + System.out.println( USAGE ); + System.exit( 0 ); + } + else if ( "-v".equals( args[pos] ) ) + { + VERBOSE++; + } + else if ( "-s".equals( args[pos] ) ) + { + pos++; + + if ( pos == args.length ) + { + System.err.println( "Error: missing argument to option -s" ); + System.exit( 1 ); + } + + try + { + Class simClass = Class.forName(args[pos]); + s = (Similarity)simClass.newInstance(); + } + catch (Exception e) + { + System.err.println( "Couldn't instantiate similarity with empty constructor: " + args[pos] ); + e.printStackTrace(System.err); + System.exit( 1 ); + } + } + } + + if ( (pos + 2) > args.length ) + { + System.out.println( USAGE ); + System.exit( 1 ); + } + + String pagerankFile = args[pos++]; + + IndexReader reader = IndexReader.open( args[pos++] ); + + try + { + Set<String> fieldNames = new HashSet<String>( ); + if ( pos == args.length ) + { + // No fields specified on command-line, get a list of all + // fields in the index that have norms. + for ( String fieldName : (Collection<String>) reader.getFieldNames( IndexReader.FieldOption.ALL ) ) + { + if ( reader.hasNorms( fieldName ) ) + { + fieldNames.add( fieldName ); + } + } + } + else + { + // Verify all explicitly specified fields have norms. + for ( int i = pos ; i < args.length ; i++ ) + { + if ( ! reader.hasNorms( args[i] ) ) + { + System.err.println( "Error: No norms for field: " + args[i] ); + System.exit( 1 ); + } + + fieldNames.add( args[i] ); + } + } + + if ( fieldNames.isEmpty( ) ) + { + System.out.println( "Warning: No fields with norms to update" ); + System.exit( 0 ); + } + + Map<String,Integer> ranks = getPageRanks( pagerankFile ); + + for ( String fieldName : fieldNames ) + { + reSetNorms( reader, fieldName, ranks, s ); + } + + } + finally + { + if ( reader != null ) + { + reader.close( ); + } + + } + } + + + /** + * + */ + public static void reSetNorms( IndexReader reader, + String fieldName, + Map<String,Integer> ranks, + Similarity sim ) throws IOException + { + if ( VERBOSE > 0 ) System.out.println( "Updating field: " + fieldName ); + + int[] termCounts = new int[0]; + + TermEnum termEnum = null; + TermDocs termDocs = null; + + termCounts = new int[reader.maxDoc()]; + try + { + termEnum = reader.terms(new Term(fieldName,"")); + try + { + termDocs = reader.termDocs(); + do + { + Term term = termEnum.term(); + if (term != null && term.field().equals(fieldName)) + { + termDocs.seek(termEnum.term()); + while (termDocs.next()) + { + termCounts[termDocs.doc()] += termDocs.freq(); + } + } + } + while (termEnum.next()); + } + finally + { + if (null != termDocs) termDocs.close(); + } + } + finally + { + if (null != termEnum) termEnum.close(); + } + + for (int d = 0; d < termCounts.length; d++) + { + if ( ! 
reader.isDeleted(d) ) + { + Document doc = reader.document( d ); + + String url = doc.get( "url" ); + + if ( url != null ) + { + Integer rank = ranks.get( url ); + if ( rank == null ) continue; + + float originalNorm = sim.lengthNorm(fieldName, termCounts[d]); + byte encodedOrig = sim.encodeNorm(originalNorm); + float rankedNorm = originalNorm * (float) ( Math.log10( rank ) + 1 ); + byte encodedRank = sim.encodeNorm(rankedNorm); + + if ( VERBOSE > 1 ) System.out.println( fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t" + rankedNorm + "\t" + encodedRank ); + + reader.setNorm(d, fieldName, encodedRank); + } + } + } + } + + /** + * Utility function to read a list of page-rank records from a file + * specified in the configuration. + */ + public static Map<String,Integer> getPageRanks( String filename ) + { + if ( VERBOSE > 0 ) System.out.println( "Reading pageranks from: " + filename ); + + Map<String,Integer> pageranks = new HashMap<String,Integer>( ); + + BufferedReader reader = null; + try + { + reader = new BufferedReader( new InputStreamReader( new FileInputStream( filename), "UTF-8" ) ); + + String line; + while ( (line = reader.readLine()) != null ) + { + String fields[] = line.split( "\\s+" ); + + if ( fields.length < 2 ) + { + System.err.println( "Malformed pagerank, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + try + { + int rank = Integer.parseInt( fields[0] ); + String url = fields[1]; + + if ( rank < 0 ) + { + System.err.println( "Malformed pagerank, rank less than 0: " + line ); + } + + pageranks.put( url, rank ); + } + catch ( NumberFormatException nfe ) + { + System.err.println( "Malformed pagerank, rank not an integer: " + line ); + continue ; + } + } + } + catch ( IOException e ) + { + // Umm, what to do? + throw new RuntimeException( e ); + } + finally + { + try + { + if ( reader != null ) + { + reader.close( ); + } + } + catch ( IOException e ) + { + // Ignore it. + } + } + + return pageranks; + } + + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
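A worked illustration of the boost formula LengthNormUpdater applies above, rankedNorm = originalNorm * (log10(rank) + 1): rank 1 leaves the norm unchanged, and every additional factor of ten in the pagerank adds one more multiple of the original norm. The 0.5 starting norm is an arbitrary example value.

// Worked illustration of the pagerank boost applied to length norms above.
public class RankBoostDemo {
  public static void main(String[] args) {
    float originalNorm = 0.5f;                 // hypothetical lengthNorm for some field/document
    int[] ranks = { 1, 10, 100, 1000 };
    for (int rank : ranks) {
      double boost = Math.log10(rank) + 1;     // 1, 2, 3, 4 for the ranks above
      float rankedNorm = originalNorm * (float) boost;
      System.out.println("rank=" + rank + " boost=" + boost + " norm=" + rankedNorm);
    }
  }
}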
From: <bi...@us...> - 2009-10-27 23:17:30
|
Revision: 2844 http://archive-access.svn.sourceforge.net/archive-access/?rev=2844&view=rev Author: binzino Date: 2009-10-27 23:17:15 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Also added use of Java generics to avoid type-casts. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java 2009-10-27 23:00:46 UTC (rev 2843) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java 2009-10-27 23:17:15 UTC (rev 2844) @@ -53,7 +53,7 @@ */ public class OpenSearchServlet extends HttpServlet { - private static final Map NS_MAP = new HashMap(); + private static final Map<String,String> NS_MAP = new HashMap<String,String>(); private int MAX_HITS_PER_PAGE; static { @@ -61,7 +61,7 @@ NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/"); } - private static final Set SKIP_DETAILS = new HashSet(); + private static final Set<String> SKIP_DETAILS = new HashSet<String>(); static { SKIP_DETAILS.add("url"); // redundant with RSS link SKIP_DETAILS.add("title"); // redundant with RSS title @@ -92,9 +92,8 @@ // get parameters from request request.setCharacterEncoding("UTF-8"); String queryString = request.getParameter("query"); - if (queryString == null) - queryString = ""; - String urlQuery = URLEncoder.encode(queryString, "UTF-8"); + if (queryString == null) queryString = ""; + //String urlQuery = URLEncoder.encode(queryString, "UTF-8"); // the query language String queryLang = request.getParameter("lang"); @@ -133,12 +132,6 @@ } } - // Make up query string for use later drawing the 'rss' logo. - String params = "&hitsPerPage=" + hitsPerPage + - (queryLang == null ? "" : "&lang=" + queryLang) + - (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") + - (dedupField == null ? 
"" : "&dedupField=" + dedupField)); - Query query = Query.parse(queryString, queryLang, this.conf); if (NutchBean.LOG.isInfoEnabled()) { NutchBean.LOG.info("query: " + queryString); @@ -183,9 +176,6 @@ HitDetails[] details = bean.getDetails(show); Summary[] summaries = bean.getSummary(details, query); - String requestUrl = request.getRequestURL().toString(); - String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); - try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); @@ -194,20 +184,14 @@ Element rss = addNode(doc, doc, "rss"); addAttribute(doc, rss, "version", "2.0"); addAttribute(doc, rss, "xmlns:opensearch", - (String)NS_MAP.get("opensearch")); - addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch")); + NS_MAP.get("opensearch")); + addAttribute(doc, rss, "xmlns:nutch", NS_MAP.get("nutch")); Element channel = addNode(doc, rss, "channel"); addNode(doc, channel, "title", "Nutch: " + queryString); - addNode(doc, channel, "description", "Nutch search results for query: " - + queryString); - addNode(doc, channel, "link", - base+"/search.jsp" - +"?query="+urlQuery - +"&start="+start - +"&hitsPerDup="+hitsPerDup - +params); + addNode(doc, channel, "description", "Nutch search results for query: " + queryString); + addNode(doc, channel, "link", "" ); addNode(doc, channel, "opensearch", "totalResults", ""+totalResults); addNode(doc, channel, "opensearch", "startIndex", ""+start); @@ -217,7 +201,7 @@ addNode(doc, channel, "nutch", "responseTime", Double.toString( ((long) responseTime / 1000 / 1000 ) / 1000.0 ) ); // Add a <nutch:urlParams> element containing a list of all the URL parameters. - Element urlParams = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:urlParams" ); + Element urlParams = doc.createElementNS(NS_MAP.get("nutch"), "nutch:urlParams" ); channel.appendChild( urlParams ); for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) ) @@ -225,43 +209,19 @@ String key = e.getKey( ); for ( String value : e.getValue( ) ) { - Element urlParam = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:param" ); + Element urlParam = doc.createElementNS(NS_MAP.get("nutch"), "nutch:param" ); addAttribute( doc, urlParam, "name", key ); addAttribute( doc, urlParam, "value", value ); urlParams.appendChild(urlParam); } } - // Hmm, we should indicate whether or not the "totalResults" - // number as being exact some other way; perhaps just have a - // <nutch:totalIsExact>true</nutch:totalIsExact> element. - /* - if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show - || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){ - addNode(doc, channel, "nutch", "nextPage", requestUrl - +"?query="+urlQuery - +"&start="+end - +"&hitsPerDup="+hitsPerDup - +params); - } - */ - - // Same here, this seems odd. 
- /* - if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { - addNode(doc, channel, "nutch", "showAllHits", requestUrl - +"?query="+urlQuery - +"&hitsPerDup="+0 - +params); - } - */ - for (int i = 0; i < length; i++) { Hit hit = show[i]; HitDetails detail = details[i]; String title = detail.getValue("title"); String url = detail.getValue("url"); - String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); + String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getUniqueKey(); if (title == null || title.equals("")) { // use url for docs w/o title title = url; @@ -274,24 +234,8 @@ addNode(doc, item, "description", summaries[i].toString() ); } addNode(doc, item, "link", url); - addNode(doc, item, "nutch", "site", hit.getDedupValue()); - addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id); - addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id - +"&query="+urlQuery+"&lang="+queryLang); - - // Probably don't need this as the XML processor/front-end can - // easily do this themselves. - if (hit.moreFromDupExcluded()) { - addNode(doc, item, "nutch", "moreFromSite", requestUrl - +"?query=" - +URLEncoder.encode("site:"+hit.getDedupValue() - +" "+queryString, "UTF-8") - +"&hitsPerSite="+0 - +params); - } - for (int j = 0; j < detail.getLength(); j++) { // add all from detail String field = detail.getField(j); if (!SKIP_DETAILS.contains(field)) @@ -304,9 +248,9 @@ DOMSource source = new DOMSource(doc); TransformerFactory transFactory = TransformerFactory.newInstance(); Transformer transformer = transFactory.newTransformer(); - transformer.setOutputProperty("indent", "yes"); + transformer.setOutputProperty( javax.xml.transform.OutputKeys.ENCODING, "UTF-8" ); StreamResult result = new StreamResult(response.getOutputStream()); - response.setContentType("text/xml"); + response.setContentType("application/rss+xml"); transformer.transform(source, result); } catch (javax.xml.parsers.ParserConfigurationException e) { @@ -334,7 +278,7 @@ private static void addNode(Document doc, Node parent, String ns, String name, String text) { if ( text == null ) text = ""; - Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name); + Element child = doc.createElementNS(NS_MAP.get(ns), ns+":"+name); child.appendChild(doc.createTextNode(getLegalXml(text))); parent.appendChild(child); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-27 23:00:58
|
Revision: 2843 http://archive-access.svn.sourceforge.net/archive-access/?rev=2843&view=rev Author: binzino Date: 2009-10-27 23:00:46 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Ported changes/fixes from NW 0.12.9. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-27 22:52:46 UTC (rev 2842) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-27 23:00:46 UTC (rev 2843) @@ -10,19 +10,18 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> -<!-- The indexing filter order *must* be specified in order for - NutchWAX's ConfigurableIndexingFilter to be called *after* the - BasicIndexingFilter. This is necessary so that the - ConfigurableIndexingFilter can over-write some of the values put - into the Lucene document by the BasicIndexingFilter. - - The over-written values are the 'url' and 'digest' fields, which - NutchWAX needs to handle specially in order for de-duplication to - work properly. - --> +<!-- + When using *only* the 'index-nutchwax' in 'plugin.includes' above, + we don't need to specify an order since there is only one plugin. + + However, if you choose to use the Nutch 'index-basic', then you have + to specify the order such that the NutchWAX ConfigurableIndexingFilter + is after it. Whichever plugin comes last over-writes the values + of those that come before it. + <property> <name>indexingfilter.order</name> <value> @@ -30,29 +29,31 @@ org.archive.nutchwax.index.ConfigurableIndexingFilter </value> </property> + --> <property> <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing. 
- The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key" + The specifications here are of the form "src-key:lowercase:store:index:dest-key" Where the only required part is the "src-key", the rest will assume the following defaults: lowercase = true store = true - tokenize = false + index = tokenized exclusive = true dest-key = src-key --> <name>nutchwax.filter.index</name> <value> - url:false:true:true - url:false:true:false:true:exacturl - orig:false - digest:false - filename:false - fileoffset:false - collection - date - type - length + title:false:true:tokenized + content:false:false:tokenized + site:false:false:untokenized + + url:false:true:tokenized + digest:false:true:no + + collection:true:true:no_norms + date:true:true:no_norms + type:true:true:no_norms + length:false:true:no </value> </property> @@ -70,15 +71,10 @@ <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> <name>nutchwax.filter.query</name> <value> - raw:digest:false - raw:filename:false - raw:fileoffset:false - raw:exacturl:false group:collection + group:site:false group:type - field:anchor field:content - field:host field:title </value> </property> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
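A simplified, standalone sketch of how one nutchwax.filter.index entry is interpreted, mirroring the spec parsing in ConfigurableIndexingFilter (full form src-key:lowercase:store:index:exclusive:dest-key, with trailing parts optional). The FieldSpecDemo class is illustrative only; the example entry is taken from the configuration above.

// Simplified, hypothetical parser for one nutchwax.filter.index entry.
public class FieldSpecDemo {
  public static void main(String[] args) {
    String entry = "collection:true:true:no_norms";   // example entry from the config above
    String[] spec = entry.split(":");

    String srcKey     = spec[0];
    boolean lowerCase = spec.length > 1 ? Boolean.parseBoolean(spec[1]) : true;
    boolean store     = spec.length > 2 ? Boolean.parseBoolean(spec[2]) : true;
    String index      = spec.length > 3 ? spec[3] : "tokenized";
    String destKey    = spec.length > 5 ? spec[5] : srcKey;

    // "collection" is lower-cased, stored, and indexed without norms, keeping its field name.
    System.out.println(srcKey + " -> " + destKey + " store=" + store
        + " index=" + index + " lowercase=" + lowerCase);
  }
}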
From: <bi...@us...> - 2009-10-27 22:53:01
|
Revision: 2842 http://archive-access.svn.sourceforge.net/archive-access/?rev=2842&view=rev Author: binzino Date: 2009-10-27 22:52:46 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/ParseTextCombiner.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/ParseTextCombiner.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/ParseTextCombiner.java 2009-10-27 22:47:30 UTC (rev 2841) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/ParseTextCombiner.java 2009-10-27 22:52:46 UTC (rev 2842) @@ -128,9 +128,9 @@ WritableComparable[] keys = new WritableComparable[readers.length]; Writable[] values = new Writable [readers.length]; - WritableComparator wc = WritableComparator.get( readers[0].getKeyClass() ); + WritableComparator wc = WritableComparator.get( (Class<WritableComparable>) readers[0].getKeyClass() ); - MapFile.Writer writer = new MapFile.Writer( conf, fs, outputPath.toString(), readers[0].getKeyClass(), readers[0].getValueClass( ) ); + MapFile.Writer writer = new MapFile.Writer( conf, fs, outputPath.toString(), (Class<WritableComparable>) readers[0].getKeyClass(), readers[0].getValueClass( ) ); int readCount = 0; int writeCount = 0; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-27 22:47:40
|
Revision: 2841 http://archive-access.svn.sourceforge.net/archive-access/?rev=2841&view=rev Author: binzino Date: 2009-10-27 22:47:30 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Ported from NW 0.12.9. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/CacheSettingsFilter.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/CacheSettingsFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/CacheSettingsFilter.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/CacheSettingsFilter.java 2009-10-27 22:47:30 UTC (rev 2841) @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; + +import javax.servlet.Filter; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ServletContext; +import javax.servlet.ServletException; +import javax.servlet.ServletOutputStream; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServletResponse; +import javax.servlet.http.HttpServletResponseWrapper; + +import javax.xml.transform.Source; +import javax.xml.transform.stream.StreamSource; +import javax.xml.transform.Templates; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.stream.StreamResult; + + +public class CacheSettingsFilter implements Filter +{ + private String maxAge; + + public void init( FilterConfig config ) + throws ServletException + { + this.maxAge = config.getInitParameter( "max-age" ); + + if ( this.maxAge != null ) + { + this.maxAge = this.maxAge.trim( ); + + if ( this.maxAge.length( ) == 0 ) + { + this.maxAge = null; + } + else + { + this.maxAge = "max-age=" + this.maxAge; + } + } + } + + public void doFilter( ServletRequest request, ServletResponse response, FilterChain chain ) + throws IOException, ServletException + { + HttpServletResponse res = (HttpServletResponse) response; + + res.setDateHeader( "Date", System.currentTimeMillis( ) ); + + if ( this.maxAge != null ) + { + res.addHeader( "Cache-Control", this.maxAge ); + } + + chain.doFilter( request, res ); + } + + public void destroy() + { + + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-27 22:46:39
|
Revision: 2840 http://archive-access.svn.sourceforge.net/archive-access/?rev=2840&view=rev Author: binzino Date: 2009-10-27 22:46:25 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Minor edits to conform to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2009-10-27 21:38:28 UTC (rev 2839) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2009-10-27 22:46:25 UTC (rev 2840) @@ -30,14 +30,16 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.JobStatus; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.hadoop.mapred.RunningJob; -import org.apache.hadoop.mapred.JobStatus; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; @@ -46,8 +48,8 @@ import org.apache.nutch.fetcher.FetcherOutputFormat; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; -import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseImpl; @@ -323,7 +325,7 @@ // We store both the normal URL and the URL+digest key for // later retrieval by the indexing plugin(s). contentMetadata.set( NutchWax.URL_KEY, url ); - contentMetadata.set( NutchWax.ORIG_KEY, key ); + //contentMetadata.set( NutchWax.ORIG_KEY, key ); contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); @@ -650,12 +652,14 @@ job.setJobName( "Importer " + manifestPath ); job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() ); - job.setInputPath ( manifestPath); + //job.setInputPath ( manifestPath); + FileInputFormat.addInputPath( job, manifestPath ); job.setInputFormat( TextInputFormat.class ); job.setMapperClass( Importer.class ); - job.setOutputPath ( segmentPath ); + //job.setOutputPath ( segmentPath ); + FileOutputFormat.setOutputPath( job, segmentPath ); job.setOutputFormat ( FetcherOutputFormat.class ); job.setOutputKeyClass ( Text.class ); job.setOutputValueClass( NutchWritable.class ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
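The Importer change above swaps the older JobConf.setInputPath/setOutputPath calls for the static FileInputFormat/FileOutputFormat helpers used by the Hadoop API that Nutch 1.0 targets. A minimal sketch of the new-style wiring, with hypothetical manifest and segment paths:

// Minimal sketch of the Hadoop job wiring style the Importer now uses (hypothetical paths).
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class JobWiringSketch {
  public static JobConf configure(JobConf job) {
    // Old API: job.setInputPath(...) / job.setOutputPath(...)
    // New API: paths are added through the input/output format helpers instead.
    FileInputFormat.addInputPath(job, new Path("manifest.txt"));
    FileOutputFormat.setOutputPath(job, new Path("segments/20091028000000"));
    return job;
  }

  public static void main(String[] args) {
    JobConf job = configure(new JobConf());
    System.out.println(FileOutputFormat.getOutputPath(job));   // prints segments/20091028000000
  }
}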
From: <bi...@us...> - 2009-10-27 21:38:38
|
Revision: 2839 http://archive-access.svn.sourceforge.net/archive-access/?rev=2839&view=rev Author: binzino Date: 2009-10-27 21:38:28 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Port of fix for WAX-53 from NW-0.12.9 branch. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-27 21:31:15 UTC (rev 2838) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-27 21:38:28 UTC (rev 2839) @@ -472,6 +472,8 @@ private TermEnum termEnum; public ParallelTermEnum() throws IOException { + if ( fieldToReader.isEmpty( ) ) return ; + field = (String)fieldToReader.firstKey(); if (field != null) termEnum = ((IndexReader)fieldToReader.get(field)).terms(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
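A short illustration of why the one-line guard above matters: firstKey() on an empty sorted map throws NoSuchElementException, so a ParallelTermEnum built over an ArchiveParallelReader with no readers would fail without the early return. The demo class below is hypothetical.

// Illustration of the empty-map guard ported in the fix above.
import java.util.SortedMap;
import java.util.TreeMap;

public class EmptyMapGuardDemo {
  public static void main(String[] args) {
    SortedMap<String, Object> fieldToReader = new TreeMap<String, Object>();
    if (fieldToReader.isEmpty()) {
      System.out.println("no fields - nothing to enumerate");   // the ported fix takes this path
      return;
    }
    String firstField = fieldToReader.firstKey();               // would throw NoSuchElementException if empty
    System.out.println(firstField);
  }
}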
Revision: 2838 http://archive-access.svn.sourceforge.net/archive-access/?rev=2838&view=rev Author: binzino Date: 2009-10-27 21:31:15 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2009-10-27 21:29:00 UTC (rev 2837) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2009-10-27 21:31:15 UTC (rev 2838) @@ -41,6 +41,7 @@ import org.apache.lucene.document.Document; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.protocol.Content; @@ -158,7 +159,7 @@ // Not implemented } - public float indexerScore(Text key, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + public float indexerScore(Text key, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { synchronized ( this ) @@ -196,7 +197,7 @@ return initScore; } - float newScore = initScore * (float) ( Math.log( rank ) + 1 ); + float newScore = initScore * (float) ( Math.log10( rank ) + 1 ); LOG.info( "PageRankScoringFilter: initScore = " + newScore + " ; key = " + key ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
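The scoring change above also moves from the natural logarithm to log10, which softens the boost for highly linked pages and matches the formula used by LengthNormUpdater. A quick comparison of the two boosts (the ranks are example values):

// Quick comparison of the old natural-log boost and the new log10 boost used above.
public class ScoreBoostComparison {
  public static void main(String[] args) {
    int[] ranks = { 10, 100, 1000 };
    for (int rank : ranks) {
      double oldBoost = Math.log(rank) + 1;     // previous behaviour (natural log): ~3.3, ~5.6, ~7.9
      double newBoost = Math.log10(rank) + 1;   // ported behaviour: 2, 3, 4
      System.out.println("rank=" + rank + " old=" + oldBoost + " new=" + newBoost);
    }
  }
}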
From: <bi...@us...> - 2009-10-27 21:29:07
|
Revision: 2837 http://archive-access.svn.sourceforge.net/archive-access/?rev=2837&view=rev Author: binzino Date: 2009-10-27 21:29:00 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated fields to match what is expected in the code. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2009-10-27 21:18:32 UTC (rev 2836) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2009-10-27 21:29:00 UTC (rev 2837) @@ -40,8 +40,8 @@ point="org.apache.nutch.searcher.QueryFilter"> <implementation id="ConfigurableQueryFilter" class="org.archive.nutchwax.query.ConfigurableQueryFilter"> - <parameter name="raw-fields" value="collection,date,digest,exacturl,filename,fileoffset,type" /> - <parameter name="fields" value="anchor,content,host,title" /> + <parameter name="raw-fields" value="collection,site,type" /> + <parameter name="fields" value="content,title" /> </implementation> </extension> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2836 http://archive-access.svn.sourceforge.net/archive-access/?rev=2836&view=rev Author: binzino Date: 2009-10-27 21:18:32 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2009-10-27 21:14:24 UTC (rev 2835) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2009-10-27 21:18:32 UTC (rev 2836) @@ -20,19 +20,22 @@ */ package org.archive.nutchwax.index; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; import java.util.List; -import java.util.ArrayList; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.lucene.LuceneWriter; +import org.apache.nutch.indexer.lucene.LuceneWriter.INDEX; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; @@ -46,9 +49,13 @@ private Configuration conf; private List<FieldSpecification> fieldSpecs; + private int MAX_TITLE_LENGTH; + public void setConf( Configuration conf ) { this.conf = conf; + + this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); String filterSpecs = conf.get( "nutchwax.filter.index" ); @@ -65,12 +72,12 @@ { String spec[] = filterSpec.split("[:]"); - String srcKey = spec[0]; - boolean lowerCase = true; - boolean store = true; - boolean tokenize = false; - boolean exclusive = true; - String destKey = srcKey; + String srcKey = spec[0]; + boolean lowerCase = true; + boolean store = true; + INDEX index = INDEX.TOKENIZED; + boolean exclusive = true; + String destKey = srcKey; switch ( spec.length ) { default: @@ -79,7 +86,10 @@ case 5: exclusive = Boolean.parseBoolean( spec[4] ); case 4: - tokenize = Boolean.parseBoolean( spec[3] ); + index = "tokenized". equals(spec[3]) ? INDEX.TOKENIZED : + "untokenized".equals(spec[3]) ? INDEX.UNTOKENIZED : + "no_norms". equals(spec[3]) ? 
INDEX.NO_NORMS : + INDEX.NO; case 3: store = Boolean.parseBoolean( spec[2] ); case 2: @@ -89,9 +99,9 @@ ; } - LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + exclusive + ":" + destKey ); + LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + index + ":" + exclusive + ":" + destKey ); - this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, tokenize, exclusive, destKey ) ); + this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, index, exclusive, destKey ) ); } } @@ -100,16 +110,16 @@ String srcKey; boolean lowerCase; boolean store; - boolean tokenize; + INDEX index; boolean exclusive; String destKey; - public FieldSpecification( String srcKey, boolean lowerCase, boolean store, boolean tokenize, boolean exclusive, String destKey ) + public FieldSpecification( String srcKey, boolean lowerCase, boolean store, INDEX index, boolean exclusive, String destKey ) { this.srcKey = srcKey; this.lowerCase = lowerCase; this.store = store; - this.tokenize = tokenize; + this.index = index; this.exclusive = exclusive; this.destKey = destKey; } @@ -124,14 +134,47 @@ * Transfer NutchWAX field values stored in the parsed content to * the Lucene document. */ - public Document filter( Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) + public NutchDocument filter( NutchDocument doc, Parse parse, Text key, CrawlDatum datum, Inlinks inlinks ) throws IndexingException { Metadata meta = parse.getData().getContentMeta(); for ( FieldSpecification spec : this.fieldSpecs ) { - String value = meta.get( spec.srcKey ); + String value = null; + if ( "site".equals( spec.srcKey ) || "host".equals( spec.srcKey ) ) + { + try + { + value = (new URL( meta.get( "url" ) ) ).getHost( ); + } + catch ( MalformedURLException mue ) { /* Eat it */ } + } + else if ( "content".equals( spec.srcKey ) ) + { + value = parse.getText( ); + } + else if ( "title".equals( spec.srcKey ) ) + { + value = parse.getData().getTitle(); + if ( value.length() > MAX_TITLE_LENGTH ) // truncate title if needed + { + value = value.substring( 0, MAX_TITLE_LENGTH ); + } + } + else if ( "type".equals( spec.srcKey ) ) + { + value = meta.get( spec.srcKey ); + + if ( value == null ) continue ; + + int p = value.indexOf( ';' ); + if ( p >= 0 ) value = value.substring( 0, p ); + } + else + { + value = meta.get( spec.srcKey ); + } if ( value == null ) continue; @@ -142,16 +185,33 @@ if ( spec.exclusive ) { - doc.removeFields( spec.destKey ); + doc.removeField( spec.destKey ); } - - doc.add( new Field( spec.destKey, - value, - spec.store ? Field.Store.YES : Field.Store.NO, - spec.tokenize ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED ) ); + + if ( spec.store || spec.index != INDEX.NO ) + { + doc.add( spec.destKey, value ); + } + } return doc; } - + + public void addIndexBackendOptions( Configuration conf ) + { + for ( FieldSpecification spec : this.fieldSpecs ) + { + if ( ! spec.store && spec.index == INDEX.NO ) + { + continue ; + } + + LuceneWriter.addFieldOptions( spec.destKey, + spec.store ? LuceneWriter.STORE.YES : LuceneWriter.STORE.NO, + spec.index, + conf ); + } + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
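The port keeps the colon-separated field specifications read from the nutchwax.filter.index property, only swapping the old boolean tokenize flag for the four-valued INDEX mode (tokenized, untokenized, no_norms, no). A self-contained sketch of that fall-through parsing with a made-up property value; the whitespace separator between entries and the sample field names are assumptions, not taken from the diff:

public class FieldSpecParseExample {
  public static void main(String[] args) {
    // Hypothetical nutchwax.filter.index value, entries of the form
    // srcKey[:lowerCase[:store[:index[:exclusive[:destKey]]]]]
    String filterSpecs = "title:false:true:tokenized type:true:true:untokenized date:true:true:no_norms:true:date";

    for (String filterSpec : filterSpecs.split("\\s+")) {
      String[] spec = filterSpec.split("[:]");
      String  srcKey    = spec[0];
      boolean lowerCase = spec.length > 1 ? Boolean.parseBoolean(spec[1]) : true;  // default: true
      boolean store     = spec.length > 2 ? Boolean.parseBoolean(spec[2]) : true;  // default: true
      String  index     = spec.length > 3 ? spec[3] : "tokenized";                 // default: TOKENIZED
      boolean exclusive = spec.length > 4 ? Boolean.parseBoolean(spec[4]) : true;  // default: true
      String  destKey   = spec.length > 5 ? spec[5] : srcKey;                      // default: srcKey
      System.out.println(srcKey + " -> " + destKey + " lowerCase=" + lowerCase
                         + " store=" + store + " index=" + index + " exclusive=" + exclusive);
    }
  }
}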
Revision: 2835 http://archive-access.svn.sourceforge.net/archive-access/?rev=2835&view=rev Author: binzino Date: 2009-10-27 21:14:24 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java 2009-10-26 23:02:50 UTC (rev 2834) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java 2009-10-27 21:14:24 UTC (rev 2835) @@ -26,14 +26,14 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.lucene.LuceneWriter; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; @@ -152,7 +152,7 @@ * Remove field if specified value is <code>null</code>. * </p> */ - public Document filter( Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) + public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) throws IndexingException { Metadata meta = parse.getData().getContentMeta(); @@ -160,20 +160,29 @@ for ( FieldSetting setting : this.settings ) { // First, remove the existing field. - doc.removeFields( setting.key ); + doc.removeField( setting.key ); // Add the value if it is given. if ( setting.value != null ) { - doc.add( new Field( setting.key, - setting.value, - setting.store ? Field.Store.YES : Field.Store.NO, - setting.tokenize ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED ) ); + doc.add( setting.key, setting.value ); } } return doc; } + public void addIndexBackendOptions( Configuration conf ) + { + + for ( FieldSetting setting : this.settings ) + { + LuceneWriter.addFieldOptions( setting.key, + setting.store ? LuceneWriter.STORE.YES : LuceneWriter.STORE.NO, + setting.tokenize ? LuceneWriter.INDEX.TOKENIZED : LuceneWriter.INDEX.UNTOKENIZED, + conf ); + } + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
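Taken together, these two ports show the Nutch 1.0 indexing-filter shape: filter() fills a NutchDocument, which carries only field names and values, while the per-field Lucene store/tokenize options are registered separately through LuceneWriter.addFieldOptions in addIndexBackendOptions. A stripped-down filter along those lines; the class name and the "example" field are hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.parse.Parse;

public class ExampleIndexingFilter implements IndexingFilter {
  private Configuration conf;

  public void setConf(Configuration conf) { this.conf = conf; }
  public Configuration getConf() { return this.conf; }

  public NutchDocument filter(NutchDocument doc, Parse parse, Text key,
                              CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // NutchDocument holds plain name/value pairs; no Lucene Field flags here.
    String value = parse.getData().getContentMeta().get("example");
    if (value != null) {
      doc.add("example", value);
    }
    return doc;
  }

  // Store/index decisions for the Lucene backend are declared once per field.
  public void addIndexBackendOptions(Configuration conf) {
    LuceneWriter.addFieldOptions("example", LuceneWriter.STORE.YES,
                                 LuceneWriter.INDEX.UNTOKENIZED, conf);
  }
}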
From: <bi...@us...> - 2009-10-26 23:02:58
Revision: 2834 http://archive-access.svn.sourceforge.net/archive-access/?rev=2834&view=rev Author: binzino Date: 2009-10-26 23:02:50 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Added command-line driver for 'index' to call NutchWAX Indexer. Modified Paths: -------------- tags/nutchwax-0_12_9/archive/bin/nutchwax Modified: tags/nutchwax-0_12_9/archive/bin/nutchwax =================================================================== --- tags/nutchwax-0_12_9/archive/bin/nutchwax 2009-10-26 23:01:57 UTC (rev 2833) +++ tags/nutchwax-0_12_9/archive/bin/nutchwax 2009-10-26 23:02:50 UTC (rev 2834) @@ -62,6 +62,10 @@ shift ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DateAdder "$@" ;; + index) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Indexer "$@" + ;; merge) shift ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.IndexMerger "$@" @@ -88,6 +92,7 @@ echo " pageranker Generate pagerank.txt file from 'pagerankdb's or 'linkdb's" echo " parsetextmerger Merge segement parse_text/part-nnnnn directories." echo " add-dates Add dates to a parallel index" + echo " index Build Lucene index from segment(s) without crawl & linkdbs" echo " merge Merge indexes or parallel indexes" echo " reboost Update document boosts based on pagerank info" echo " dumpindex Dump an index or set of parallel indices to stdout" This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
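The new sub-command simply shifts its arguments and forwards to org.archive.nutchwax.Indexer via bin/nutch, so the equivalent call can also be made straight through Hadoop's ToolRunner, exactly as the Indexer's own main() does. The index and segment paths below are placeholders:

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.archive.nutchwax.Indexer;

public class RunNutchwaxIndex {
  public static void main(String[] args) throws Exception {
    // Same as: bin/nutchwax index crawl/index-new crawl/segments/20091026123456
    // First argument is the output index dir, remaining arguments are segments.
    int rc = ToolRunner.run(NutchConfiguration.create(), new Indexer(),
                            new String[] { "crawl/index-new", "crawl/segments/20091026123456" });
    System.exit(rc);
  }
}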
From: <bi...@us...> - 2009-10-26 23:02:14
Revision: 2833 http://archive-access.svn.sourceforge.net/archive-access/?rev=2833&view=rev Author: binzino Date: 2009-10-26 23:01:57 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Added NutchWAX version of Indexer that doesn't need/use crawldb nor linkdb. Also has a special check for non-null values of required metadata fields: segment, digest. Added Paths: ----------- tags/nutchwax-0_12_9/archive/src/java/org/archive/nutchwax/Indexer.java Added: tags/nutchwax-0_12_9/archive/src/java/org/archive/nutchwax/Indexer.java =================================================================== --- tags/nutchwax-0_12_9/archive/src/java/org/archive/nutchwax/Indexer.java (rev 0) +++ tags/nutchwax-0_12_9/archive/src/java/org/archive/nutchwax/Indexer.java 2009-10-26 23:01:57 UTC (rev 2833) @@ -0,0 +1,294 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.analysis.*; + +import org.apache.nutch.indexer.IndexingFilters; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchSimilarity; + +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.CrawlDb; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.crawl.LinkDb; +import org.apache.nutch.crawl.NutchWritable; + +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; + +/** Create indexes for segments. */ +public class Indexer extends Configured implements Tool, Reducer<Text, NutchWritable, Text, Writable>, Mapper<Text, Writable, Text, NutchWritable> { + + public static final String DONE_NAME = "index.done"; + + public static final Log LOG = LogFactory.getLog(Indexer.class); + + /** A utility class used to pass a lucene document from Indexer.reduce + * to Indexer.OutputFormat. + * Note: Despite its name, it can't properly wrap a lucene document - it + * doesn't know how to serialize/deserialize a lucene document. 
+ */ + public static class LuceneDocumentWrapper implements Writable { + private Document doc; + + public LuceneDocumentWrapper(Document doc) { + this.doc = doc; + } + + public Document get() { + return doc; + } + + public void readFields(DataInput in) throws IOException { + // intentionally left blank + } + + public void write(DataOutput out) throws IOException { + // intentionally left blank + } + + } + + /** Unwrap Lucene Documents created by reduce and add them to an index. */ + public static class OutputFormat + extends org.apache.hadoop.mapred.FileOutputFormat<WritableComparable, LuceneDocumentWrapper> { + public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job, + String name, final Progressable progress) throws IOException { + final Path perm = new Path(FileOutputFormat.getOutputPath(job), name); + final Path temp = + job.getLocalPath("index/_"+Integer.toString(new Random().nextInt())); + + int maxTokens = job.getInt("indexer.max.tokens", 10000); + if (maxTokens < 0) maxTokens = Integer.MAX_VALUE; + + fs.delete(perm, true); // delete old, if any + + final AnalyzerFactory factory = new AnalyzerFactory(job); + final IndexWriter writer = // build locally first + new IndexWriter(fs.startLocalOutput(perm, temp).toString(), + new NutchDocumentAnalyzer(job), true); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval + (job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(maxTokens); + writer.setInfoStream(LogUtil.getInfoStream(LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() { + boolean closed; + + public void write(WritableComparable key, LuceneDocumentWrapper value) + throws IOException { // unwrap & index doc + Document doc = value.get(); + NutchAnalyzer analyzer = factory.get(doc.get("lang")); + if (LOG.isInfoEnabled()) { + LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" + + " with analyzer " + analyzer + + " (" + doc.get("lang") + ")"); + } + writer.addDocument(doc, analyzer); + progress.progress(); + } + + public void close(final Reporter reporter) throws IOException { + // spawn a thread to give progress heartbeats + Thread prog = new Thread() { + public void run() { + while (!closed) { + try { + reporter.setStatus("closing"); + Thread.sleep(1000); + } catch (InterruptedException e) { continue; } + catch (Throwable e) { return; } + } + } + }; + + try { + prog.start(); + if (LOG.isInfoEnabled()) { LOG.info("Optimizing index."); } + // optimize & close index + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, DONE_NAME)); + } finally { + closed = true; + } + } + }; + } + } + + private IndexingFilters filters; + + public Indexer() { + + } + + public Indexer(Configuration conf) { + setConf(conf); + } + + public void configure(JobConf job) { + setConf(job); + this.filters = new IndexingFilters(getConf()); + } + + public void close() {} + + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, Writable> output, Reporter reporter) + throws IOException { + ParseData parseData = null; + ParseText parseText = null; + while (values.hasNext()) { + Writable value = values.next().get(); // unwrap + 
+ if (value instanceof ParseData) { + parseData = (ParseData)value; + } else if (value instanceof ParseText) { + parseText = (ParseText)value; + } else if (LOG.isWarnEnabled()) { + LOG.warn("Unrecognized type: "+value.getClass()); + } + } + + if ( parseText == null || parseData == null) { + return; // only have inlinks + } + + Document doc = new Document(); + Metadata metadata = parseData.getContentMeta(); + + if ( metadata.get(Nutch.SEGMENT_NAME_KEY) == null || + metadata.get(Nutch.SIGNATURE_KEY) == null ) + { + LOG.warn( "Skipping document, insufficient metadata: key=" + key + " metadata=" + metadata ); + return ; + } + + // add segment, used to map from merged index back to segment files + doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY), + Field.Store.YES, Field.Index.NO)); + + // add digest, used by dedup + doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY), + Field.Store.YES, Field.Index.NO)); + + Parse parse = new ParseImpl(parseText, parseData); + try { + doc = this.filters.filter(doc, parse, key, /*fetchDatum*/ null, /*inlinks*/ null); + } catch (IndexingException e) { + if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); } + return; + } + + // skip documents discarded by indexing filters + if (doc == null) return; + + output.collect(key, new LuceneDocumentWrapper(doc)); + } + + public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) + throws IOException { + + if (LOG.isInfoEnabled()) { + LOG.info("Indexer: starting"); + } + + JobConf job = new NutchJob(getConf()); + job.setJobName("index " + indexDir); + + for (int i = 0; i < segments.length; i++) { + if (LOG.isInfoEnabled()) { + LOG.info("Indexer: adding segment: " + segments[i]); + } + FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME)); + FileInputFormat.addInputPath(job, new Path(segments[i], ParseText.DIR_NAME)); + } + + job.setInputFormat(SequenceFileInputFormat.class); + + job.setMapperClass(Indexer.class); + job.setReducerClass(Indexer.class); + + FileOutputFormat.setOutputPath(job, indexDir); + job.setOutputFormat(OutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(NutchWritable.class); + + JobClient.runJob(job); + if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + + if (args.length < 2) { + System.err.println("Usage: <index> <segment> ..."); + return -1; + } + + Path[] segments = new Path[args.length-1]; + for (int i = 1; i < args.length; i++) { + segments[i-1] = new Path(args[i]); + } + + try { + index(new Path(args[0]), null, null, segments); + return 0; + } catch (Exception e) { + LOG.fatal("Indexer: " + StringUtils.stringifyException(e)); + return -1; + } + } + + public void map(Text key, Writable value, + OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { + output.collect(key, new NutchWritable(value)); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
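When a Configuration and segment Paths are already in hand, the same job can be kicked off programmatically through index(); crawlDb and linkDb are passed as null because this Indexer deliberately ignores them. The paths below are placeholders:

import org.apache.hadoop.fs.Path;
import org.apache.nutch.util.NutchConfiguration;
import org.archive.nutchwax.Indexer;

public class IndexSegmentsExample {
  public static void main(String[] args) throws Exception {
    Indexer indexer = new Indexer(NutchConfiguration.create());

    Path[] segments = { new Path("crawl/segments/20091026123456"),
                        new Path("crawl/segments/20091027093011") };

    // crawldb and linkdb arguments are unused by this Indexer, hence the nulls.
    indexer.index(new Path("crawl/index-new"), null, null, segments);
  }
}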
From: <bi...@us...> - 2009-10-26 22:57:34
Revision: 2832 http://archive-access.svn.sourceforge.net/archive-access/?rev=2832&view=rev Author: binzino Date: 2009-10-26 22:57:25 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Fix WAX-67. One-line change to include metadata passed in from Importer. Added Paths: ----------- tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java Added: tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java =================================================================== --- tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java (rev 0) +++ tags/nutchwax-0_12_9/archive/src/nutch/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java 2009-10-26 22:57:25 UTC (rev 2832) @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.oo; + +import java.io.*; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.List; +import java.util.zip.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.*; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.jaxen.*; +import org.jaxen.jdom.JDOMXPath; +import org.jdom.*; +import org.jdom.input.*; + +/** + * Parser for OpenOffice and OpenDocument formats. This should handle + * the following formats: Text, Spreadsheet, Presentation, and + * corresponding templates and "master" documents. 
+ * + * @author Andrzej Bialecki + */ +public class OOParser implements Parser { + public static final Log LOG = LogFactory.getLog(OOParser.class); + + private Configuration conf; + + public OOParser () { + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return conf; + } + + public ParseResult getParse(Content content) { + String text = null; + String title = null; + Metadata metadata = new Metadata(); + ArrayList outlinks = new ArrayList(); + + try { + byte[] raw = content.getContent(); + String contentLength = content.getMetadata().get("Content-Length"); + if (contentLength != null + && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, + "Content truncated at "+raw.length + +" bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), conf); + } + ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw)); + ZipEntry ze = null; + while ((ze = zis.getNextEntry()) != null) { + if (ze.getName().equals("content.xml")) { + text = parseContent(ze, zis, outlinks); + } else if (ze.getName().equals("meta.xml")) { + parseMeta(ze, zis, metadata); + } + } + zis.close(); + } catch (Exception e) { // run time exception + e.printStackTrace(LogUtil.getWarnStream(LOG)); + return new ParseStatus(ParseStatus.FAILED, + "Can't be handled as OO document. " + e).getEmptyParseResult(content.getUrl(), conf); + } + + title = metadata.get(Metadata.TITLE); + if (text == null) + text = ""; + + if (title == null) + title = ""; + + Outlink[] links = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, links, content.getMetadata(), metadata); + return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); + } + + // extract as much plain text as possible. + private String parseContent(ZipEntry ze, ZipInputStream zis, ArrayList outlinks) throws Exception { + StringBuffer res = new StringBuffer(); + FilterInputStream fis = new FilterInputStream(zis) { + public void close() {}; + }; + SAXBuilder builder = new SAXBuilder(); + Document doc = builder.build(fis); + Element root = doc.getRootElement(); + // XXX this is expensive for very large documents. In those cases another + // XXX method (direct processing of SAX events, or XMLPull) should be used. 
+ XPath path = new JDOMXPath("//text:span | //text:p | //text:tab | //text:tab-stop | //text:a"); + path.addNamespace("text", root.getNamespace("text").getURI()); + Namespace xlink = Namespace.getNamespace("xlink", "http://www.w3.org/1999/xlink"); + List list = path.selectNodes(doc); + boolean lastp = true; + for (int i = 0; i < list.size(); i++) { + Element el = (Element)list.get(i); + String text = el.getText(); + if (el.getName().equals("p")) { + // skip empty paragraphs + if (!text.equals("")) { + if (!lastp) res.append("\n"); + res.append(text + "\n"); + lastp = true; + } + } else if (el.getName().startsWith("tab")) { + res.append("\t"); + lastp = false; + } else if (el.getName().equals("a")) { + List nl = el.getChildren(); + String a = null; + for (int k = 0; k < nl.size(); k++) { + Element anchor = (Element)nl.get(k); + String nsName = anchor.getNamespacePrefix() + ":" + anchor.getName(); + if (!nsName.equals("text:span")) continue; + a = anchor.getText(); + break; + } + String u = el.getAttributeValue("href", xlink); + if (u == null) u = a; // often anchors are URLs + try { + Outlink o = new Outlink(u, a); + outlinks.add(o); + } catch (MalformedURLException mue) { + // skip + } + if (a != null && !a.equals("")) { + if (!lastp) res.append(' '); + res.append(a); + lastp = false; + } + } else { + if (!text.equals("")) { + if (!lastp) res.append(' '); + res.append(text); + } + lastp = false; + } + } + return res.toString(); + } + + // extract metadata and convert them to Nutch format + private void parseMeta(ZipEntry ze, ZipInputStream zis, Metadata metadata) throws Exception { + FilterInputStream fis = new FilterInputStream(zis) { + public void close() {}; + }; + SAXBuilder builder = new SAXBuilder(); + Document doc = builder.build(fis); + XPath path = new JDOMXPath("/office:document-meta/office:meta/*"); + Element root = doc.getRootElement(); + path.addNamespace("office", root.getNamespace("office").getURI()); + List list = path.selectNodes(doc); + for (int i = 0; i < list.size(); i++) { + Element n = (Element)list.get(i); + String text = n.getText(); + if (text.trim().equals("")) continue; + String name = n.getName(); + if (name.equals("title")) + metadata.add(Metadata.TITLE, text); + else if (name.equals("language")) + metadata.add(Metadata.LANGUAGE, text); + else if (name.equals("creation-date")) + metadata.add(Metadata.DATE, text); + else if (name.equals("print-date")) + metadata.add(Metadata.LAST_PRINTED, text); + else if (name.equals("generator")) + metadata.add(Metadata.APPLICATION_NAME, text); + else if (name.equals("creator")) + metadata.add(Metadata.CREATOR, text); + } + } + + public static void main(String[] args) throws Exception { + OOParser oo = new OOParser(); + Configuration conf = NutchConfiguration.create(); + oo.setConf(conf); + FileInputStream fis = new FileInputStream(args[0]); + byte[] bytes = new byte[fis.available()]; + fis.read(bytes); + fis.close(); + Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf); + Parse p = oo.getParse(c).get(c.getUrl()); + System.out.println(p.getData()); + System.out.println("Text: '" + p.getText() + "'"); + /* + // create the test output file + OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("e:\\ootest.txt"), "UTF-8"); + osw.write(p.getText()); + osw.flush(); + osw.close(); + */ + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
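The parser's own main() above already shows the intended standalone use: read the raw bytes, wrap them in a Content with an OpenDocument MIME type, and call getParse. A trimmed-down version of the same flow; the file name is a placeholder:

import java.io.FileInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.oo.OOParser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

public class OOParseExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    OOParser parser = new OOParser();
    parser.setConf(conf);

    // Slurp the whole document; "report.odt" is a placeholder path.
    FileInputStream fis = new FileInputStream("report.odt");
    byte[] bytes = new byte[fis.available()];
    fis.read(bytes);
    fis.close();

    Content content = new Content("local", "local", bytes,
        "application/vnd.oasis.opendocument.text", new Metadata(), conf);

    Parse parse = parser.getParse(content).get(content.getUrl());
    System.out.println(parse.getData());   // title, outlinks, parse metadata
    System.out.println(parse.getText());   // extracted plain text
  }
}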
From: <bi...@us...> - 2009-10-26 19:35:41
Revision: 2831 http://archive-access.svn.sourceforge.net/archive-access/?rev=2831&view=rev Author: binzino Date: 2009-10-26 19:35:33 +0000 (Mon, 26 Oct 2009) Log Message: ----------- Creation of 0.12.9 branch from 0.12.8. Added Paths: ----------- tags/nutchwax-0_12_9/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |