From: <bi...@us...> - 2010-07-11 00:09:35
Revision: 3168 http://archive-access.svn.sourceforge.net/archive-access/?rev=3168&view=rev Author: binzino Date: 2010-07-11 00:09:27 +0000 (Sun, 11 Jul 2010) Log Message: ----------- A whole mess of accumulated hacks to get Importing and Indexing working with Hadoop 0.20 (Cloudera) on our Hadoop rack. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8 tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java Removed Paths: ------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSegmentBean.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneSearchBean.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -34,9 +34,10 @@ import org.apache.nutch.indexer.NutchSimilarity; import org.apache.nutch.indexer.FsDirectory; -import org.apache.lucene.store.Directory; +import org.apache.lucene.store.NIOFSDirectory; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LogMergePolicy; import org.apache.lucene.index.ArchiveParallelReader; /************************************************************************* @@ -84,10 +85,10 @@ // // Merge indices // - IndexWriter writer = new IndexWriter(localOutput.toString(), null, true); - 
writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR)); + IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( localOutput.toString() ) ), null, IndexWriter.MaxFieldLength.UNLIMITED ); + writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", LogMergePolicy.DEFAULT_MERGE_FACTOR)); writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS)); - writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS)); + writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", LogMergePolicy.DEFAULT_MAX_MERGE_DOCS)); writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL)); writer.setInfoStream(LogUtil.getDebugStream(LOG)); writer.setUseCompoundFile(false); Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -63,8 +63,8 @@ FileOutputFormat.setOutputPath(job, luceneDir); - LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job); - LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job); + //LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job); + //LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job); NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class); Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -20,13 +20,16 @@ */ package org.archive.nutchwax.tools; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.analysis.WhitespaceAnalyzer; +import java.io.*; + import org.apache.hadoop.conf.Configured; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.store.NIOFSDirectory; import org.apache.nutch.util.NutchConfiguration; @@ -50,12 +53,12 @@ String fieldValue = args[2].trim(); int count = Integer.parseInt( args[3].trim() ); - IndexWriter writer = new IndexWriter( indexDir, new WhitespaceAnalyzer( ), true ); + IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( indexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED ); for ( int i = 0 ; i < count ; i++ ) { Document newDoc = new Document( ); - newDoc.add( new Field( fieldKey, fieldValue, Field.Store.YES, Field.Index.TOKENIZED ) ); + newDoc.add( new Field( fieldKey, fieldValue, Field.Store.YES, Field.Index.ANALYZED ) ); writer.addDocument( newDoc ); } Modified: 
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -20,21 +20,15 @@ */ package org.archive.nutchwax.tools; -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.io.*; +import java.util.*; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.store.NIOFSDirectory; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.conf.Configuration; @@ -104,15 +98,15 @@ } - IndexReader reader = IndexReader.open( mainIndexDir ); + IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( mainIndexDir ) ), true ); IndexReader sourceReaders[] = new IndexReader[args.length-3]; for ( int i = 0 ; i < sourceReaders.length ; i++ ) { - sourceReaders[i] = IndexReader.open( args[i+1] ); + sourceReaders[i] = IndexReader.open( new NIOFSDirectory( new File( args[i+1] ) ), true ); } - IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true ); + IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED ); UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) ); @@ -134,7 +128,7 @@ } for ( String date : uniqueDates ) { - newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); + newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) ); } // Obtain the new dates for the document. 
@@ -162,7 +156,7 @@ { for ( String date : newDates.split("\\s+") ) { - newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); + newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) ); } } Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -27,7 +27,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.ArchiveParallelReader; +import org.apache.lucene.store.NIOFSDirectory; + public class DumpParallelIndex { public static void main( String[] args ) throws Exception @@ -58,7 +60,7 @@ ArchiveParallelReader reader = new ArchiveParallelReader( ); for ( String dir : dirs ) { - reader.add( IndexReader.open( dir ) ); + reader.add( IndexReader.open( new NIOFSDirectory( new File( dir ) ) ) ); } if ( args[0].equals( "-l" ) ) Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -20,13 +20,11 @@ */ package org.archive.nutchwax.tools; -import java.io.File; -import java.util.Iterator; -import java.util.Set; -import java.util.HashSet; -import java.util.Collection; +import java.io.*; +import java.util.*; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.store.NIOFSDirectory; /** * A quick-n-dirty command-line utility to get the unique values for a @@ -55,7 +53,7 @@ private static void dumpUniqValues( String fieldName, String indexDir ) throws Exception { - IndexReader reader = IndexReader.open(indexDir); + IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( indexDir) ) ); Collection fieldNames = reader.getFieldNames( IndexReader.FieldOption.ALL ); Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -16,15 +16,8 @@ * limitations under the License. 
*/ -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; -import java.util.Collection; -import java.util.HashSet; +import java.io.*; +import java.util.*; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; @@ -32,12 +25,11 @@ import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Similarity; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.NIOFSDirectory; - import org.apache.nutch.indexer.NutchSimilarity; + /** * This is heavily cribbed from org.apache.lucene.misc.LengthNormModifier */ @@ -132,7 +124,7 @@ String pagerankFile = args[pos++]; - IndexReader reader = IndexReader.open( args[pos++] ); + IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( args[pos++] ) ) ); try { Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8 =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8 (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8 2010-07-11 00:09:27 UTC (rev 3168) @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Common terms and phrases which will be indexed in n-grams +# in order to optimize search. +#content:a +#content:and +#content:for +#content:in +#content:of +#content:the +#content:to +#url:com +#url:http +#url:http-www +#url:www Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-07-11 00:09:27 UTC (rev 3168) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. 
--> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(text|html|pdf|msword|mspowerpoint|oo)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> <!-- @@ -42,6 +42,7 @@ dest-key = src-key --> <name>nutchwax.filter.index</name> +<!-- <value> title:false:true:tokenized content:false:compress:tokenized @@ -55,6 +56,16 @@ type:true:true:no_norms length:false:true:no </value> +--> + <value> + title:false:true:tokenized + content:false:compress:tokenized + site:false:false:untokenized + url:false:true:tokenized + type:true:true:no_norms + date:false:true:no + length:false:true:no + </value> </property> <property> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2010-07-10 23:34:25 UTC (rev 3167) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -1,3 +1,5 @@ +package org.apache.lucene.index; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -14,24 +16,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -/** - * ARCHIVE: This must be in the lucene index package because it needs - * to call protected methods on other IndexReader objects. - */ -package org.apache.lucene.index; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.index.TermFreqVector; -import org.apache.lucene.index.TermPositions; -import org.apache.lucene.index.TermVectorMapper; import java.io.IOException; import java.util.*; @@ -55,10 +44,12 @@ * undefined behavior</em>. */ public class ArchiveParallelReader extends IndexReader { - private List readers = new ArrayList(); - private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close + private List<IndexReader> readers = new ArrayList<IndexReader>(); + private List<Boolean> decrefOnClose = new ArrayList<Boolean>(); // remember which subreaders to decRef on close boolean incRefReaders = false; - private SortedMap fieldToReader = new TreeMap(); + private SortedMap<String,IndexReader> fieldToReader = new TreeMap<String,IndexReader>(); + private Map<IndexReader,Collection<String>> readerToFields = new HashMap<IndexReader,Collection<String>>(); + private List<IndexReader> storedFieldReaders = new ArrayList<IndexReader>(); private int maxDoc; private int numDocs; @@ -81,9 +72,25 @@ /** Add an IndexReader. 
* @throws IOException if there is a low-level IO error */ - public void add(IndexReader reader) throws IOException - { + public void add(IndexReader reader) throws IOException { ensureOpen(); + add(reader, false); + } + + /** Add an IndexReader whose stored fields will not be returned. This can + * accelerate search when stored fields are only needed from a subset of + * the IndexReaders. + * + * @throws IllegalArgumentException if not all indexes contain the same number + * of documents + * @throws IllegalArgumentException if not all indexes have the same value + * of {@link IndexReader#maxDoc()} + * @throws IOException if there is a low-level IO error + */ + public void add(IndexReader reader, boolean ignoreStoredFields) + throws IOException { + + ensureOpen(); if (readers.size() == 0) { this.maxDoc = reader.maxDoc(); this.numDocs = reader.numDocs(); @@ -97,14 +104,15 @@ throw new IllegalArgumentException ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); - Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); - Iterator i = fields.iterator(); - while (i.hasNext()) { // update fieldToReader map - String field = (String)i.next(); + Collection<String> fields = reader.getFieldNames(IndexReader.FieldOption.ALL); + readerToFields.put(reader, fields); + for (final String field : fields) { // update fieldToReader map if (fieldToReader.get(field) == null) fieldToReader.put(field, reader); } + if (!ignoreStoredFields) + storedFieldReaders.add(reader); // add to storedFieldReaders readers.add(reader); if (incRefReaders) { @@ -112,7 +120,16 @@ } decrefOnClose.add(Boolean.valueOf(incRefReaders)); } - + + @Override + public synchronized Object clone() { + try { + return doReopen(true); + } catch (Exception ex) { + throw new RuntimeException(ex); + } + } + /** * Tries to reopen the subreaders. 
* <br> @@ -132,63 +149,42 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public IndexReader reopen() throws CorruptIndexException, IOException { + @Override + public synchronized IndexReader reopen() throws CorruptIndexException, IOException { + return doReopen(false); + } + + protected IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException { ensureOpen(); boolean reopened = false; - List newReaders = new ArrayList(); - List newDecrefOnClose = new ArrayList(); + List<IndexReader> newReaders = new ArrayList<IndexReader>(); boolean success = false; try { - - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = oldReader.reopen(); + for (final IndexReader oldReader : readers) { + IndexReader newReader = null; + if (doClone) { + newReader = (IndexReader) oldReader.clone(); + } else { + newReader = oldReader.reopen(); + } newReaders.add(newReader); // if at least one of the subreaders was updated we remember that - // and return a new MultiReader + // and return a new ArchiveParallelReader if (newReader != oldReader) { reopened = true; } } - - if (reopened) { - ArchiveParallelReader pr = new ArchiveParallelReader(); - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = (IndexReader) newReaders.get(i); - if (newReader == oldReader) { - newDecrefOnClose.add(Boolean.TRUE); - newReader.incRef(); - } else { - // this is a new subreader instance, so on close() we don't - // decRef but close it - newDecrefOnClose.add(Boolean.FALSE); - } - pr.add(newReader); - } - pr.decrefOnClose = newDecrefOnClose; - pr.incRefReaders = incRefReaders; - success = true; - return pr; - } else { - success = true; - // No subreader was refreshed - return this; - } + success = true; } finally { if (!success && reopened) { for (int i = 0; i < newReaders.size(); i++) { - IndexReader r = (IndexReader) newReaders.get(i); - if (r != null) { + IndexReader r = newReaders.get(i); + if (r != readers.get(i)) { try { - if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) { - r.decRef(); - } else { - r.close(); - } + r.close(); } catch (IOException ignore) { // keep going - we want to clean up as much as possible } @@ -196,46 +192,74 @@ } } } + + if (reopened) { + List<Boolean> newDecrefOnClose = new ArrayList<Boolean>(); + ArchiveParallelReader pr = new ArchiveParallelReader(); + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = readers.get(i); + IndexReader newReader = newReaders.get(i); + if (newReader == oldReader) { + newDecrefOnClose.add(Boolean.TRUE); + newReader.incRef(); + } else { + // this is a new subreader instance, so on close() we don't + // decRef but close it + newDecrefOnClose.add(Boolean.FALSE); + } + pr.add(newReader, !storedFieldReaders.contains(oldReader)); + } + pr.decrefOnClose = newDecrefOnClose; + pr.incRefReaders = incRefReaders; + return pr; + } else { + // No subreader was refreshed + return this; + } } + @Override public int numDocs() { // Don't call ensureOpen() here (it could affect performance) return numDocs; } + @Override public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return maxDoc; } + @Override public boolean hasDeletions() { // Don't call ensureOpen() here (it could affect performance) return hasDeletions; } // check first reader + @Override public boolean isDeleted(int n) { // Don't call ensureOpen() 
here (it could affect performance) if (readers.size() > 0) - return ((IndexReader)readers.get(0)).isDeleted(n); + return readers.get(0).isDeleted(n); return false; } // delete in all readers + @Override protected void doDelete(int n) throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).deleteDocument(n); + for (final IndexReader reader : readers) { + reader.deleteDocument(n); } hasDeletions = true; } - /** - * @see org.apache.lucene.index.ParallelReader.doUndeleteAll - */ + // undeleteAll in all readers + @Override protected void doUndeleteAll() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).undeleteAll(); + for (final IndexReader reader : readers) { + reader.undeleteAll(); } hasDeletions = false; } @@ -289,111 +313,150 @@ return result; } + /* + // append fields from storedFieldReaders + @Override + public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { + ensureOpen(); + Document result = new Document(); + for (final IndexReader reader: storedFieldReaders) { + + boolean include = (fieldSelector==null); + if (!include) { + Collection<String> fields = readerToFields.get(reader); + for (final String field : fields) + if (fieldSelector.accept(field) != FieldSelectorResult.NO_LOAD) { + include = true; + break; + } + } + if (include) { + List<Fieldable> fields = reader.document(n, fieldSelector).getFields(); + for (Fieldable field : fields) { + result.add(field); + } + } + } + return result; + } + */ + // get all vectors + @Override public TermFreqVector[] getTermFreqVectors(int n) throws IOException { ensureOpen(); - ArrayList results = new ArrayList(); - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); + ArrayList<TermFreqVector> results = new ArrayList<TermFreqVector>(); + for (final Map.Entry<String,IndexReader> e: fieldToReader.entrySet()) { + + String field = e.getKey(); + IndexReader reader = e.getValue(); TermFreqVector vector = reader.getTermFreqVector(n, field); if (vector != null) results.add(vector); } - return (TermFreqVector[]) - results.toArray(new TermFreqVector[results.size()]); + return results.toArray(new TermFreqVector[results.size()]); } + @Override public TermFreqVector getTermFreqVector(int n, String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? 
null : reader.getTermFreqVector(n, field); } + @Override public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader != null) { reader.getTermFreqVector(docNumber, field, mapper); } } + @Override public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { ensureOpen(); - ensureOpen(); - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); + for (final Map.Entry<String,IndexReader> e : fieldToReader.entrySet()) { + + String field = e.getKey(); + IndexReader reader = e.getValue(); reader.getTermFreqVector(docNumber, field, mapper); } } + @Override public boolean hasNorms(String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? false : reader.hasNorms(field); } + @Override public byte[] norms(String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? null : reader.norms(field); } + @Override public void norms(String field, byte[] result, int offset) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) reader.norms(field, result, offset); } + @Override protected void doSetNorm(int n, String field, byte value) throws CorruptIndexException, IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) reader.doSetNorm(n, field, value); } + @Override public TermEnum terms() throws IOException { ensureOpen(); return new ParallelTermEnum(); } + @Override public TermEnum terms(Term term) throws IOException { ensureOpen(); return new ParallelTermEnum(term); } + @Override public int docFreq(Term term) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); return reader==null ? 0 : reader.docFreq(term); } + @Override public TermDocs termDocs(Term term) throws IOException { ensureOpen(); return new ParallelTermDocs(term); } + @Override public TermDocs termDocs() throws IOException { ensureOpen(); return new ParallelTermDocs(); } + @Override public TermPositions termPositions(Term term) throws IOException { ensureOpen(); return new ParallelTermPositions(term); } + @Override public TermPositions termPositions() throws IOException { ensureOpen(); return new ParallelTermPositions(); @@ -402,9 +465,10 @@ /** * Checks recursively if all subreaders are up to date. 
*/ + @Override public boolean isCurrent() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isCurrent()) { + for (final IndexReader reader : readers) { + if (!reader.isCurrent()) { return false; } } @@ -416,9 +480,10 @@ /** * Checks recursively if all subindexes are optimized */ + @Override public boolean isOptimized() { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isOptimized()) { + for (final IndexReader reader : readers) { + if (!reader.isOptimized()) { return false; } } @@ -431,36 +496,39 @@ /** Not implemented. * @throws UnsupportedOperationException */ + @Override public long getVersion() { throw new UnsupportedOperationException("ArchiveParallelReader does not support this method."); } // for testing IndexReader[] getSubReaders() { - return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); + return readers.toArray(new IndexReader[readers.size()]); } - protected void doCommit() throws IOException { - for (int i = 0; i < readers.size(); i++) - ((IndexReader)readers.get(i)).commit(); + @Override + protected void doCommit(Map<String,String> commitUserData) throws IOException { + for (final IndexReader reader : readers) + reader.commit(commitUserData); } + @Override protected synchronized void doClose() throws IOException { for (int i = 0; i < readers.size(); i++) { - if (((Boolean) decrefOnClose.get(i)).booleanValue()) { - ((IndexReader)readers.get(i)).decRef(); + if (decrefOnClose.get(i).booleanValue()) { + readers.get(i).decRef(); } else { - ((IndexReader)readers.get(i)).close(); + readers.get(i).close(); } } } - public Collection getFieldNames (IndexReader.FieldOption fieldNames) { + @Override + public Collection<String> getFieldNames (IndexReader.FieldOption fieldNames) { ensureOpen(); - Set fieldSet = new HashSet(); - for (int i = 0; i < readers.size(); i++) { - IndexReader reader = ((IndexReader)readers.get(i)); - Collection names = reader.getFieldNames(fieldNames); + Set<String> fieldSet = new HashSet<String>(); + for (final IndexReader reader : readers) { + Collection<String> names = reader.getFieldNames(fieldNames); fieldSet.addAll(names); } return fieldSet; @@ -468,24 +536,28 @@ private class ParallelTermEnum extends TermEnum { private String field; - private Iterator fieldIterator; + private Iterator<String> fieldIterator; private TermEnum termEnum; public ParallelTermEnum() throws IOException { - if ( fieldToReader.isEmpty( ) ) return ; - - field = (String)fieldToReader.firstKey(); + try { + field = fieldToReader.firstKey(); + } catch(NoSuchElementException e) { + // No fields, so keep field == null, termEnum == null + return; + } if (field != null) - termEnum = ((IndexReader)fieldToReader.get(field)).terms(); + termEnum = fieldToReader.get(field).terms(); } public ParallelTermEnum(Term term) throws IOException { field = term.field(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) termEnum = reader.terms(term); } + @Override public boolean next() throws IOException { if (termEnum==null) return false; @@ -502,8 +574,8 @@ fieldIterator.next(); // Skip field to get next one } while (fieldIterator.hasNext()) { - field = (String) fieldIterator.next(); - termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, "")); + field = fieldIterator.next(); + termEnum = fieldToReader.get(field).terms(new Term(field)); Term term = termEnum.term(); if (term!=null && 
term.field()==field) return true; @@ -514,6 +586,7 @@ return false; // no more fields } + @Override public Term term() { if (termEnum==null) return null; @@ -521,6 +594,7 @@ return termEnum.term(); } + @Override public int docFreq() { if (termEnum==null) return 0; @@ -528,6 +602,7 @@ return termEnum.docFreq(); } + @Override public void close() throws IOException { if (termEnum!=null) termEnum.close(); @@ -540,13 +615,18 @@ protected TermDocs termDocs; public ParallelTermDocs() {} - public ParallelTermDocs(Term term) throws IOException { seek(term); } + public ParallelTermDocs(Term term) throws IOException { + if (term == null) + termDocs = readers.isEmpty() ? null : readers.get(0).termDocs(null); + else + seek(term); + } public int doc() { return termDocs.doc(); } public int freq() { return termDocs.freq(); } public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); termDocs = reader!=null ? reader.termDocs(term) : null; } @@ -588,8 +668,9 @@ public ParallelTermPositions() {} public ParallelTermPositions(Term term) throws IOException { seek(term); } + @Override public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); termDocs = reader!=null ? reader.termPositions(term) : null; } @@ -614,3 +695,8 @@ } } + + + + + Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.lucene; + +import java.io.File; +import java.io.IOException; +import java.io.ByteArrayOutputStream; +import java.io.OutputStreamWriter; +import java.util.zip.GZIPOutputStream; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.CompressionTools; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.analysis.AnalyzerFactory; +import org.apache.nutch.analysis.NutchAnalyzer; +import org.apache.nutch.analysis.NutchDocumentAnalyzer; +import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchIndexWriter; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.util.LogUtil; + +public class LuceneWriter implements NutchIndexWriter { + + public static enum STORE { YES, NO, COMPRESS } + + public static enum INDEX { NO, NO_NORMS, TOKENIZED, UNTOKENIZED } + + public static enum VECTOR { NO, OFFSET, POS, POS_OFFSET, YES } + + private IndexWriter writer; + + private AnalyzerFactory analyzerFactory; + + private Path perm; + + private Path temp; + + private FileSystem fs; + + private final Map<String, Field.Store> fieldStore; + private final Set<String> fieldCompress; + + private final Map<String, Field.Index> fieldIndex; + + private final Map<String, Field.TermVector> fieldVector; + + public LuceneWriter() { + fieldStore = new HashMap<String, Field.Store>(); + fieldCompress = new HashSet<String>(); + fieldIndex = new HashMap<String, Field.Index>(); + fieldVector = new HashMap<String, Field.TermVector>(); + } + + private Document createLuceneDoc(NutchDocument doc) { + final Document out = new Document(); + + out.setBoost(doc.getScore()); + + final Metadata documentMeta = doc.getDocumentMeta(); + for (final Entry<String, List<String>> entry : doc) { + final String fieldName = entry.getKey(); + + Field.Store store = fieldStore.get(fieldName); + boolean compress = fieldCompress.contains(fieldName); + Field.Index index = fieldIndex.get(fieldName); + Field.TermVector vector = fieldVector.get(fieldName); + + // default values + if (store == null) { + store = Field.Store.NO; + } + + if (index == null) { + index = Field.Index.NO; + } + + if (vector == null) { + vector = Field.TermVector.NO; + } + + // read document-level field information + final String[] fieldMetas = + documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName); + if (fieldMetas.length != 0) { + for (final String val : fieldMetas) { + System.out.println( fieldName + " : " + val ); + if (LuceneConstants.STORE_YES.equals(val)) { + store = Field.Store.YES; + } else if (LuceneConstants.STORE_NO.equals(val)) { + store = Field.Store.NO; + } else if (LuceneConstants.STORE_COMPRESS.equals(val)) { + compress = true; + } else if (LuceneConstants.INDEX_TOKENIZED.equals(val)) { + index = Field.Index.ANALYZED; + } else if (LuceneConstants.INDEX_NO.equals(val)) { + index = 
Field.Index.NO; + } else if (LuceneConstants.INDEX_UNTOKENIZED.equals(val)) { + index = Field.Index.NOT_ANALYZED; + } else if (LuceneConstants.INDEX_NO_NORMS.equals(val)) { + index = Field.Index.ANALYZED_NO_NORMS; + } else if (LuceneConstants.VECTOR_NO.equals(val)) { + vector = Field.TermVector.NO; + } else if (LuceneConstants.VECTOR_YES.equals(val)) { + vector = Field.TermVector.YES; + } else if (LuceneConstants.VECTOR_POS.equals(val)) { + vector = Field.TermVector.WITH_POSITIONS; + } else if (LuceneConstants.VECTOR_POS_OFFSET.equals(val)) { + vector = Field.TermVector.WITH_POSITIONS_OFFSETS; + } else if (LuceneConstants.VECTOR_OFFSET.equals(val)) { + vector = Field.TermVector.WITH_OFFSETS; + } + } + } + + for (final String fieldValue : entry.getValue()) { + if ( compress ) + { + out.add( new Field( fieldName, CompressionTools.compressString( fieldValue ), Field.Store.YES ) ); + } + out.add(new Field(fieldName, fieldValue, store, index, vector)); + } + } + + return out; + } + + @SuppressWarnings("unchecked") + private void processOptions(Configuration conf) { + final Iterator iterator = conf.iterator(); + while (iterator.hasNext()) { + final String key = (String) ((Map.Entry)iterator.next()).getKey(); + if (!key.startsWith(LuceneConstants.LUCENE_PREFIX)) { + continue; + } + if (key.startsWith(LuceneConstants.FIELD_STORE_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_STORE_PREFIX.length()); + final LuceneWriter.STORE store = LuceneWriter.STORE.valueOf(conf.get(key)); + switch (store) { + case YES: + fieldStore.put(field, Field.Store.YES); + break; + case NO: + fieldStore.put(field, Field.Store.NO); + break; + case COMPRESS: + fieldCompress.add(field); + break; + } + } else if (key.startsWith(LuceneConstants.FIELD_INDEX_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_INDEX_PREFIX.length()); + final LuceneWriter.INDEX index = LuceneWriter.INDEX.valueOf(conf.get(key)); + switch (index) { + case NO: + fieldIndex.put(field, Field.Index.NO); + break; + case NO_NORMS: + fieldIndex.put(field, Field.Index.NOT_ANALYZED_NO_NORMS); + break; + case TOKENIZED: + fieldIndex.put(field, Field.Index.ANALYZED); + break; + case UNTOKENIZED: + fieldIndex.put(field, Field.Index.NOT_ANALYZED); + break; + } + } else if (key.startsWith(LuceneConstants.FIELD_VECTOR_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_VECTOR_PREFIX.length()); + final LuceneWriter.VECTOR vector = LuceneWriter.VECTOR.valueOf(conf.get(key)); + switch (vector) { + case NO: + fieldVector.put(field, Field.TermVector.NO); + break; + case OFFSET: + fieldVector.put(field, Field.TermVector.WITH_OFFSETS); + break; + case POS: + fieldVector.put(field, Field.TermVector.WITH_POSITIONS); + break; + case POS_OFFSET: + fieldVector.put(field, Field.TermVector.WITH_POSITIONS_OFFSETS); + break; + case YES: + fieldVector.put(field, Field.TermVector.YES); + break; + } + } + } + } + + public void open(JobConf job, String name) + throws IOException { + this.fs = FileSystem.get(job); + perm = new Path(FileOutputFormat.getOutputPath(job), name); + temp = job.getLocalPath("index/_" + + Integer.toString(new Random().nextInt())); + + fs.delete(perm, true); // delete old, if any + analyzerFactory = new AnalyzerFactory(job); + writer = new IndexWriter( + FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), + new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + 
writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job + .getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000)); + writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + processOptions(job); + } + + public void close() throws IOException { + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, Indexer.DONE_NAME)); + } + + public void write(NutchDocument doc) throws IOException { + final Document luceneDoc = createLuceneDoc(doc); + final NutchAnalyzer analyzer = analyzerFactory.get(luceneDoc.get("lang")); + if (Indexer.LOG.isDebugEnabled()) { + Indexer.LOG.debug("Indexing [" + luceneDoc.get("url") + + "] with analyzer " + analyzer + " (" + luceneDoc.get("lang") + + ")"); + } + writer.addDocument(luceneDoc, analyzer); + + } + + /** Adds a lucene field. + * <p> + * This method is provided for backward-compatibility with + * older indexing filters. This should not be used by newer + * implementations since this is slower than + * {@link NutchDocument#add(String, String)} and will be removed + * in a future release. + * </p> + * @param f Lucene field to be added. + * @deprecated Use {@link NutchDocument#add(String, String)} instead and + * set index-level metadata for field information. + * */ + @Deprecated + public static void add(NutchDocument doc, Field f) { + final String fieldName = f.name(); + final String key = LuceneConstants.FIELD_PREFIX + fieldName; + final Metadata documentMeta = doc.getDocumentMeta(); + if (f.isStored()) { + documentMeta.add(key, LuceneConstants.STORE_YES); + } else { + documentMeta.add(key, LuceneConstants.STORE_NO); + } + + if (f.isIndexed()) { + if (f.isTokenized()) { + documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED); + } else if (f.getOmitNorms()) { + documentMeta.add(key, LuceneConstants.INDEX_NO_NORMS); + } else { + documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED); + } + } else { + documentMeta.add(key, LuceneConstants.INDEX_NO); + } + + if (f.isStoreOffsetWithTermVector() && f.isStorePositionWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_POS_OFFSET); + } else if (f.isStoreOffsetWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_OFFSET); + } else if (f.isStorePositionWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_POS); + } else if (f.isTermVectorStored()) { + documentMeta.add(key, LuceneConstants.VECTOR_YES); + } else { + documentMeta.add(key, LuceneConstants.VECTOR_NO); + } + } + + public static void addFieldOptions(String field, LuceneWriter.STORE store, + LuceneWriter.INDEX index, LuceneWriter.VECTOR vector, Configuration conf) { + + conf.set(LuceneConstants.FIELD_STORE_PREFIX + field, store.toString()); + conf.set(LuceneConstants.FIELD_INDEX_PREFIX + field, index.toString()); + conf.set(LuceneConstants.FIELD_VECTOR_PREFIX + field, vector.toString()); + } + + public static void addFieldOptions(String field, LuceneWriter.STORE store, + LuceneWriter.INDEX index, Configuration conf) { + LuceneWriter.addFieldOptions(field, store, index, LuceneWriter.VECTOR.NO, conf); + } +} Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
=================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.io.File; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLDecoder; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.apache.commons.logging.Log; + +import org.apache.hadoop.conf.Configuration; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +/** + * The <code>PluginManifestParser</code> parser just parse the manifest file + * in all plugin directories. + * + * @author joa23 + */ +public class PluginManifestParser { + private static final String ATTR_NAME = "name"; + private static final String ATTR_CLASS = "class"; + private static final String ATTR_ID = "id"; + + public static final Log LOG = PluginRepository.LOG; + + private static final boolean WINDOWS = System.getProperty("os.name") + .startsWith("Windows"); + + private Configuration conf; + + private PluginRepository pluginRepository; + + public PluginManifestParser(Configuration conf, + PluginRepository pluginRepository) { + this.conf = conf; + this.pluginRepository = pluginRepository; + } + + /** + * Returns a list of all found plugin descriptors. + * + * @param pluginFolders + * folders to search plugins from + * @return A {@link Map} of all found {@link PluginDescriptor}s. 
+ */ + public Map<String, PluginDescriptor> parsePluginFolder(String[] pluginFolders) { + Map<String, PluginDescriptor> map = new HashMap<String, PluginDescriptor>(); + + if (pluginFolders == null) { + throw new IllegalArgumentException("plugin.folders is not defined"); + } + + for (String name : pluginFolders) { + File directory = getPluginFolder(name); + if (directory == null) { + continue; + } + LOG.info("Plugins: looking in: " + directory.getAbsolutePath()); + for (File oneSubFolder : directory.listFiles()) { + if (oneSubFolder.isDirectory()) { + String manifestPath = oneSubFolder.getAbsolutePath() + File.separator + + "plugin.xml"; + try { + LOG.debug("parsing: " + manifestPath); + PluginDescriptor p = parseManifestFile(manifestPath); + map.put(p.getPluginId(), p); + } catch (MalformedURLException e) { + LOG.warn(e.toString()); + } catch (SAXException e) { + LOG.warn(e.toString()); + } catch (IOException e) { + LOG.warn(e.toString()); + } catch (ParserConfigurationException e) { + LOG.warn(e.toString()); + } + } + } + } + return map; + } + + /** + * Return the named plugin folder. If the name is absolute then it is + * returned. Otherwise, for relative names, the classpath is scanned. + */ + public File getPluginFolder(String name) { + File directory = new File(name); + if (!directory.isAbsolute()) { + URL url = PluginManifestParser.class.getClassLoader().getResource(name); + if (url == null && directory.exists() && directory.isDirectory() + && directory.listFiles().length > 0) { + return directory; // relative path that is not in the classpath + } else if (url == null) { + LOG.warn("Plugins: directory not found: " + name); + return null; + } else if ( "jar".equals(url.getProtocol()) ) { + try + { + // HACK to find directory containing .jar file and look for plugins there. + LOG.warn( "HACK to look for plugin directory next to jar file: " + url ); + java.net.JarURLConnection connection = (java.net.JarURLConnection) url.openConnection(); + URL url2 = connection.getJarFileURL(); + if ( !"file".equals(url2.getProtocol()) ) + { + LOG.warn( "Jar file is not a file: " + url2 ); + return null; + } + directory = new File( new File( url2.getFile() ).getParent( ) + "/" + name ); + LOG.warn( "Plugin directory: " + directory ); + return directory; + } + catch ( IOException ioe ) + { + LOG.warn( ioe ); + return null; + } + } else if (!"file".equals(url.getProtocol())) { + LOG.warn("Plugins: not a file: url. 
Can't load plugins from: " + url); + return null; + } + String path = url.getPath(); + if (WINDOWS && path.startsWith("/")) // patch a windows bug + path = path.substring(1); + try { + path = URLDecoder.decode(path, "UTF-8"); // decode the url path + } catch (UnsupportedEncodingException e) { + } + directory = new File(path); + } + return directory; + } + + /** + * @param manifestPath + * @throws ParserConfigurationException + * @throws IOException + * @throws SAXException + * @throws MalformedURLException + */ + private PluginDescriptor parseManifestFile(String pManifestPath) + throws MalformedURLException, SAXException, IOException, + ParserConfigurationException { + Document document = parseXML(new File(pManifestPath).toURL()); + String pPath = new File(pManifestPath).getParent(); + return parsePlugin(document, pPath); + } + + /** + * @param url + * @return Document + * @throws IOException + * @throws SAXException + * @throws ParserConfigurationException + * @throws DocumentException + */ + private Document parseXML(URL url) throws SAXException, IOException, + ParserConfigurationException { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + return builder.parse(url.openStream()); + } + + /** + * @param pDocument + * @throws MalformedURLException + */ + private PluginDescriptor parsePlugin(Document pDocument, String pPath) + throws MalformedURLException { + Element rootElement = pDocument.getDocumentElement(); + String id = rootElement.getAttribute(ATTR_ID); + String name = rootElement.getAttribute(ATTR_NAME); + String version = rootElement.getAttribute("version"); + String providerName = rootElement.getAttribute("provider-name"); + String pluginClazz = null; + if (rootElement.getAttribute(ATTR_CLASS).trim().length() > 0) { + pluginClazz = rootElement.getAttribute(ATTR_CLASS); + } + PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name, + providerName, pluginClazz, pPath, this.conf); + LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version + + " provider=" + providerName + "class=" + pluginClazz); + parseExtension(rootElement, pluginDescriptor); + parseExtensionPoints(rootElement, pluginDescriptor); + parseLibraries(rootElement, pluginDescriptor); + parseRequires(rootElement, pluginDescriptor); + return pluginDescriptor; + } + + /** + * @param pRootElement + * @param pDescriptor + * @throws MalformedURLException + */ + private void parseRequires(Element pRootElement, PluginDescriptor pDescriptor) + throws MalformedURLException { + + NodeList nodelist = pRootElement.getElementsByTagName("requires"); + if (nodelist.getLength() > 0) { + + Element requires = (Element) nodelist.item(0); + + NodeList imports = requires.getElementsByTagName("import"); + for (int i = 0; i < imports.getLength(); i++) { + Element anImport = (Element) imports.item(i); + String plugin = anImport.getAttribute("plugin"); + if (plugin != null) { + pDescriptor.addDependency(plugin); + } + } + } + } + + /** + * @param pRootElement + * @param pDescriptor + * @throws MalformedURLException + */ + private void parseLibraries(Element pRootElement, PluginDescriptor pDescriptor) + throws MalformedURLException { + NodeList nodelist = pRootElement.getElementsByTagName("runtime"); + if (nodelist.getLength() > 0) { + + Element runtime = (Element) nodelist.item(0); + + NodeList libraries = runtime.getElementsByTagName("library"); + for (int i = 0; i < libraries.getLength(); i++)... [truncated message content] |