From: <bi...@us...> - 2010-07-11 00:09:35

Revision: 3168
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3168&view=rev
Author:   binzino
Date:     2010-07-11 00:09:27 +0000 (Sun, 11 Jul 2010)

Log Message:
-----------
A whole mess of accumulated hacks to get Importing and Indexing working with Hadoop 0.20 (Cloudera) on our Hadoop rack.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java

Added Paths:
-----------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java

Removed Paths:
-------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSegmentBean.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneSearchBean.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -34,9 +34,10 @@
 import org.apache.nutch.indexer.NutchSimilarity;
 import org.apache.nutch.indexer.FsDirectory;
 
-import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.NIOFSDirectory;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LogMergePolicy;
 import org.apache.lucene.index.ArchiveParallelReader;
 
 /*************************************************************************
@@ -84,10 +85,10 @@
     //
     // Merge indices
     //
-    IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
-    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
+    IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( localOutput.toString() ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
+    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", LogMergePolicy.DEFAULT_MERGE_FACTOR));
     writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
-    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
+    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", LogMergePolicy.DEFAULT_MAX_MERGE_DOCS));
     writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
     writer.setInfoStream(LogUtil.getDebugStream(LOG));
     writer.setUseCompoundFile(false);

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -63,8 +63,8 @@
 
     FileOutputFormat.setOutputPath(job, luceneDir);
 
-    LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
-    LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
+    //LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
+    //LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
 
     NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class);

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -20,13 +20,16 @@
  */
 package org.archive.nutchwax.tools;
 
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import java.io.*;
+
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.NIOFSDirectory;
 
 import org.apache.nutch.util.NutchConfiguration;
 
@@ -50,12 +53,12 @@
     String fieldValue = args[2].trim();
     int    count      = Integer.parseInt( args[3].trim() );
 
-    IndexWriter writer = new IndexWriter( indexDir, new WhitespaceAnalyzer( ), true );
+    IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( indexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
 
     for ( int i = 0 ; i < count ; i++ )
       {
         Document newDoc = new Document( );
-        newDoc.add( new Field( fieldKey, fieldValue, Field.Store.YES, Field.Index.TOKENIZED ) );
+        newDoc.add( new Field( fieldKey, fieldValue, Field.Store.YES, Field.Index.ANALYZED ) );
         writer.addDocument( newDoc );
       }

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -20,21 +20,15 @@
  */
 package org.archive.nutchwax.tools;
 
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
+import java.io.*;
+import java.util.*;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.store.NIOFSDirectory;
 
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.conf.Configuration;
@@ -104,15 +98,15 @@
       }
 
-    IndexReader reader = IndexReader.open( mainIndexDir );
+    IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( mainIndexDir ) ), true );
 
     IndexReader sourceReaders[] = new IndexReader[args.length-3];
 
     for ( int i = 0 ; i < sourceReaders.length ; i++ )
       {
-        sourceReaders[i] = IndexReader.open( args[i+1] );
+        sourceReaders[i] = IndexReader.open( new NIOFSDirectory( new File( args[i+1] ) ), true );
      }
 
-    IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true );
+    IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
 
     UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
 
@@ -134,7 +128,7 @@
           }
 
         for ( String date : uniqueDates )
          {
-            newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+            newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
          }
 
        // Obtain the new dates for the document.
@@ -162,7 +156,7 @@
          {
            for ( String date : newDates.split("\\s+") )
              {
-                newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+                newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
              }
          }

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -27,7 +27,9 @@
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.ArchiveParallelReader;
+import org.apache.lucene.store.NIOFSDirectory;
+
 
 public class DumpParallelIndex
 {
   public static void main( String[] args ) throws Exception
@@ -58,7 +60,7 @@
     ArchiveParallelReader reader = new ArchiveParallelReader( );
     for ( String dir : dirs )
       {
-        reader.add( IndexReader.open( dir ) );
+        reader.add( IndexReader.open( new NIOFSDirectory( new File( dir ) ) ) );
      }
 
     if ( args[0].equals( "-l" ) )

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -20,13 +20,11 @@
  */
 package org.archive.nutchwax.tools;
 
-import java.io.File;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.Collection;
+import java.io.*;
+import java.util.*;
 
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.store.NIOFSDirectory;
 
 /**
  * A quick-n-dirty command-line utility to get the unique values for a
@@ -55,7 +53,7 @@
 
   private static void dumpUniqValues( String fieldName, String indexDir ) throws Exception
   {
-    IndexReader reader = IndexReader.open(indexDir);
+    IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( indexDir) ) );
 
     Collection fieldNames = reader.getFieldNames( IndexReader.FieldOption.ALL );

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -16,15 +16,8 @@
  * limitations under the License.
  */
 
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.Collection;
-import java.util.HashSet;
+import java.io.*;
+import java.util.*;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.Term;
@@ -32,12 +25,11 @@
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.Similarity;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.NIOFSDirectory;
 
-
 import org.apache.nutch.indexer.NutchSimilarity;
 
+
 /**
  * This is heavily cribbed from org.apache.lucene.misc.LengthNormModifier
 */
@@ -132,7 +124,7 @@
 
     String pagerankFile = args[pos++];
 
-    IndexReader reader = IndexReader.open( args[pos++] );
+    IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( args[pos++] ) ) );
 
     try
       {

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8	(rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8	2010-07-11 00:09:27 UTC (rev 3168)
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Common terms and phrases which will be indexed in n-grams
+# in order to optimize search.
+#content:a
+#content:and
+#content:for
+#content:in
+#content:of
+#content:the
+#content:to
+#url:com
+#url:http
+#url:http-www
+#url:www

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml	2010-07-11 00:09:27 UTC (rev 3168)
@@ -10,7 +10,7 @@
   <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list.
     -->
   <!-- Also, add 'parse-pdf' -->
   <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
-  <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
+  <value>protocol-http|parse-(text|html|pdf|msword|mspowerpoint|oo)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
 </property>
 
 <!--
@@ -42,6 +42,7 @@
      dest-key = src-key
   -->
   <name>nutchwax.filter.index</name>
+<!--
   <value>
     title:false:true:tokenized
     content:false:compress:tokenized
@@ -55,6 +56,16 @@
     type:true:true:no_norms
     length:false:true:no
   </value>
+-->
+  <value>
+    title:false:true:tokenized
+    content:false:compress:tokenized
+    site:false:false:untokenized
+    url:false:true:tokenized
+    type:true:true:no_norms
+    date:false:true:no
+    length:false:true:no
+  </value>
 </property>
 
 <property>

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -1,3 +1,5 @@
+package org.apache.lucene.index;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -14,24 +16,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/**
- * ARCHIVE: This must be in the lucene index package because it needs
- * to call protected methods on other IndexReader objects.
- */
-package org.apache.lucene.index;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.document.FieldSelectorResult;
 import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.index.TermPositions;
-import org.apache.lucene.index.TermVectorMapper;
 
 import java.io.IOException;
 import java.util.*;
@@ -55,10 +44,12 @@
  * undefined behavior</em>.
  */
 public class ArchiveParallelReader extends IndexReader {
-  private List readers = new ArrayList();
-  private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close
-  boolean incRefReaders = false;
-  private SortedMap fieldToReader = new TreeMap();
+  private List<IndexReader> readers = new ArrayList<IndexReader>();
+  private List<Boolean> decrefOnClose = new ArrayList<Boolean>(); // remember which subreaders to decRef on close
+  boolean incRefReaders = false;
+  private SortedMap<String,IndexReader> fieldToReader = new TreeMap<String,IndexReader>();
+  private Map<IndexReader,Collection<String>> readerToFields = new HashMap<IndexReader,Collection<String>>();
+  private List<IndexReader> storedFieldReaders = new ArrayList<IndexReader>();
 
   private int maxDoc;
   private int numDocs;
@@ -81,9 +72,25 @@
   /** Add an IndexReader.
* @throws IOException if there is a low-level IO error */ - public void add(IndexReader reader) throws IOException - { + public void add(IndexReader reader) throws IOException { ensureOpen(); + add(reader, false); + } + + /** Add an IndexReader whose stored fields will not be returned. This can + * accelerate search when stored fields are only needed from a subset of + * the IndexReaders. + * + * @throws IllegalArgumentException if not all indexes contain the same number + * of documents + * @throws IllegalArgumentException if not all indexes have the same value + * of {@link IndexReader#maxDoc()} + * @throws IOException if there is a low-level IO error + */ + public void add(IndexReader reader, boolean ignoreStoredFields) + throws IOException { + + ensureOpen(); if (readers.size() == 0) { this.maxDoc = reader.maxDoc(); this.numDocs = reader.numDocs(); @@ -97,14 +104,15 @@ throw new IllegalArgumentException ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); - Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); - Iterator i = fields.iterator(); - while (i.hasNext()) { // update fieldToReader map - String field = (String)i.next(); + Collection<String> fields = reader.getFieldNames(IndexReader.FieldOption.ALL); + readerToFields.put(reader, fields); + for (final String field : fields) { // update fieldToReader map if (fieldToReader.get(field) == null) fieldToReader.put(field, reader); } + if (!ignoreStoredFields) + storedFieldReaders.add(reader); // add to storedFieldReaders readers.add(reader); if (incRefReaders) { @@ -112,7 +120,16 @@ } decrefOnClose.add(Boolean.valueOf(incRefReaders)); } - + + @Override + public synchronized Object clone() { + try { + return doReopen(true); + } catch (Exception ex) { + throw new RuntimeException(ex); + } + } + /** * Tries to reopen the subreaders. 
* <br> @@ -132,63 +149,42 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public IndexReader reopen() throws CorruptIndexException, IOException { + @Override + public synchronized IndexReader reopen() throws CorruptIndexException, IOException { + return doReopen(false); + } + + protected IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException { ensureOpen(); boolean reopened = false; - List newReaders = new ArrayList(); - List newDecrefOnClose = new ArrayList(); + List<IndexReader> newReaders = new ArrayList<IndexReader>(); boolean success = false; try { - - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = oldReader.reopen(); + for (final IndexReader oldReader : readers) { + IndexReader newReader = null; + if (doClone) { + newReader = (IndexReader) oldReader.clone(); + } else { + newReader = oldReader.reopen(); + } newReaders.add(newReader); // if at least one of the subreaders was updated we remember that - // and return a new MultiReader + // and return a new ArchiveParallelReader if (newReader != oldReader) { reopened = true; } } - - if (reopened) { - ArchiveParallelReader pr = new ArchiveParallelReader(); - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = (IndexReader) newReaders.get(i); - if (newReader == oldReader) { - newDecrefOnClose.add(Boolean.TRUE); - newReader.incRef(); - } else { - // this is a new subreader instance, so on close() we don't - // decRef but close it - newDecrefOnClose.add(Boolean.FALSE); - } - pr.add(newReader); - } - pr.decrefOnClose = newDecrefOnClose; - pr.incRefReaders = incRefReaders; - success = true; - return pr; - } else { - success = true; - // No subreader was refreshed - return this; - } + success = true; } finally { if (!success && reopened) { for (int i = 0; i < newReaders.size(); i++) { - IndexReader r = (IndexReader) newReaders.get(i); - if (r != null) { + IndexReader r = newReaders.get(i); + if (r != readers.get(i)) { try { - if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) { - r.decRef(); - } else { - r.close(); - } + r.close(); } catch (IOException ignore) { // keep going - we want to clean up as much as possible } @@ -196,46 +192,74 @@ } } } + + if (reopened) { + List<Boolean> newDecrefOnClose = new ArrayList<Boolean>(); + ArchiveParallelReader pr = new ArchiveParallelReader(); + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = readers.get(i); + IndexReader newReader = newReaders.get(i); + if (newReader == oldReader) { + newDecrefOnClose.add(Boolean.TRUE); + newReader.incRef(); + } else { + // this is a new subreader instance, so on close() we don't + // decRef but close it + newDecrefOnClose.add(Boolean.FALSE); + } + pr.add(newReader, !storedFieldReaders.contains(oldReader)); + } + pr.decrefOnClose = newDecrefOnClose; + pr.incRefReaders = incRefReaders; + return pr; + } else { + // No subreader was refreshed + return this; + } } + @Override public int numDocs() { // Don't call ensureOpen() here (it could affect performance) return numDocs; } + @Override public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return maxDoc; } + @Override public boolean hasDeletions() { // Don't call ensureOpen() here (it could affect performance) return hasDeletions; } // check first reader + @Override public boolean isDeleted(int n) { // Don't call ensureOpen() 
here (it could affect performance) if (readers.size() > 0) - return ((IndexReader)readers.get(0)).isDeleted(n); + return readers.get(0).isDeleted(n); return false; } // delete in all readers + @Override protected void doDelete(int n) throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).deleteDocument(n); + for (final IndexReader reader : readers) { + reader.deleteDocument(n); } hasDeletions = true; } - /** - * @see org.apache.lucene.index.ParallelReader.doUndeleteAll - */ + // undeleteAll in all readers + @Override protected void doUndeleteAll() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).undeleteAll(); + for (final IndexReader reader : readers) { + reader.undeleteAll(); } hasDeletions = false; } @@ -289,111 +313,150 @@ return result; } + /* + // append fields from storedFieldReaders + @Override + public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { + ensureOpen(); + Document result = new Document(); + for (final IndexReader reader: storedFieldReaders) { + + boolean include = (fieldSelector==null); + if (!include) { + Collection<String> fields = readerToFields.get(reader); + for (final String field : fields) + if (fieldSelector.accept(field) != FieldSelectorResult.NO_LOAD) { + include = true; + break; + } + } + if (include) { + List<Fieldable> fields = reader.document(n, fieldSelector).getFields(); + for (Fieldable field : fields) { + result.add(field); + } + } + } + return result; + } + */ + // get all vectors + @Override public TermFreqVector[] getTermFreqVectors(int n) throws IOException { ensureOpen(); - ArrayList results = new ArrayList(); - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); + ArrayList<TermFreqVector> results = new ArrayList<TermFreqVector>(); + for (final Map.Entry<String,IndexReader> e: fieldToReader.entrySet()) { + + String field = e.getKey(); + IndexReader reader = e.getValue(); TermFreqVector vector = reader.getTermFreqVector(n, field); if (vector != null) results.add(vector); } - return (TermFreqVector[]) - results.toArray(new TermFreqVector[results.size()]); + return results.toArray(new TermFreqVector[results.size()]); } + @Override public TermFreqVector getTermFreqVector(int n, String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? 
null : reader.getTermFreqVector(n, field); } + @Override public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader != null) { reader.getTermFreqVector(docNumber, field, mapper); } } + @Override public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { ensureOpen(); - ensureOpen(); - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); + for (final Map.Entry<String,IndexReader> e : fieldToReader.entrySet()) { + + String field = e.getKey(); + IndexReader reader = e.getValue(); reader.getTermFreqVector(docNumber, field, mapper); } } + @Override public boolean hasNorms(String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? false : reader.hasNorms(field); } + @Override public byte[] norms(String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? null : reader.norms(field); } + @Override public void norms(String field, byte[] result, int offset) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) reader.norms(field, result, offset); } + @Override protected void doSetNorm(int n, String field, byte value) throws CorruptIndexException, IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) reader.doSetNorm(n, field, value); } + @Override public TermEnum terms() throws IOException { ensureOpen(); return new ParallelTermEnum(); } + @Override public TermEnum terms(Term term) throws IOException { ensureOpen(); return new ParallelTermEnum(term); } + @Override public int docFreq(Term term) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); return reader==null ? 0 : reader.docFreq(term); } + @Override public TermDocs termDocs(Term term) throws IOException { ensureOpen(); return new ParallelTermDocs(term); } + @Override public TermDocs termDocs() throws IOException { ensureOpen(); return new ParallelTermDocs(); } + @Override public TermPositions termPositions(Term term) throws IOException { ensureOpen(); return new ParallelTermPositions(term); } + @Override public TermPositions termPositions() throws IOException { ensureOpen(); return new ParallelTermPositions(); @@ -402,9 +465,10 @@ /** * Checks recursively if all subreaders are up to date. 
*/ + @Override public boolean isCurrent() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isCurrent()) { + for (final IndexReader reader : readers) { + if (!reader.isCurrent()) { return false; } } @@ -416,9 +480,10 @@ /** * Checks recursively if all subindexes are optimized */ + @Override public boolean isOptimized() { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isOptimized()) { + for (final IndexReader reader : readers) { + if (!reader.isOptimized()) { return false; } } @@ -431,36 +496,39 @@ /** Not implemented. * @throws UnsupportedOperationException */ + @Override public long getVersion() { throw new UnsupportedOperationException("ArchiveParallelReader does not support this method."); } // for testing IndexReader[] getSubReaders() { - return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); + return readers.toArray(new IndexReader[readers.size()]); } - protected void doCommit() throws IOException { - for (int i = 0; i < readers.size(); i++) - ((IndexReader)readers.get(i)).commit(); + @Override + protected void doCommit(Map<String,String> commitUserData) throws IOException { + for (final IndexReader reader : readers) + reader.commit(commitUserData); } + @Override protected synchronized void doClose() throws IOException { for (int i = 0; i < readers.size(); i++) { - if (((Boolean) decrefOnClose.get(i)).booleanValue()) { - ((IndexReader)readers.get(i)).decRef(); + if (decrefOnClose.get(i).booleanValue()) { + readers.get(i).decRef(); } else { - ((IndexReader)readers.get(i)).close(); + readers.get(i).close(); } } } - public Collection getFieldNames (IndexReader.FieldOption fieldNames) { + @Override + public Collection<String> getFieldNames (IndexReader.FieldOption fieldNames) { ensureOpen(); - Set fieldSet = new HashSet(); - for (int i = 0; i < readers.size(); i++) { - IndexReader reader = ((IndexReader)readers.get(i)); - Collection names = reader.getFieldNames(fieldNames); + Set<String> fieldSet = new HashSet<String>(); + for (final IndexReader reader : readers) { + Collection<String> names = reader.getFieldNames(fieldNames); fieldSet.addAll(names); } return fieldSet; @@ -468,24 +536,28 @@ private class ParallelTermEnum extends TermEnum { private String field; - private Iterator fieldIterator; + private Iterator<String> fieldIterator; private TermEnum termEnum; public ParallelTermEnum() throws IOException { - if ( fieldToReader.isEmpty( ) ) return ; - - field = (String)fieldToReader.firstKey(); + try { + field = fieldToReader.firstKey(); + } catch(NoSuchElementException e) { + // No fields, so keep field == null, termEnum == null + return; + } if (field != null) - termEnum = ((IndexReader)fieldToReader.get(field)).terms(); + termEnum = fieldToReader.get(field).terms(); } public ParallelTermEnum(Term term) throws IOException { field = term.field(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) termEnum = reader.terms(term); } + @Override public boolean next() throws IOException { if (termEnum==null) return false; @@ -502,8 +574,8 @@ fieldIterator.next(); // Skip field to get next one } while (fieldIterator.hasNext()) { - field = (String) fieldIterator.next(); - termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, "")); + field = fieldIterator.next(); + termEnum = fieldToReader.get(field).terms(new Term(field)); Term term = termEnum.term(); if (term!=null && 
term.field()==field) return true; @@ -514,6 +586,7 @@ return false; // no more fields } + @Override public Term term() { if (termEnum==null) return null; @@ -521,6 +594,7 @@ return termEnum.term(); } + @Override public int docFreq() { if (termEnum==null) return 0; @@ -528,6 +602,7 @@ return termEnum.docFreq(); } + @Override public void close() throws IOException { if (termEnum!=null) termEnum.close(); @@ -540,13 +615,18 @@ protected TermDocs termDocs; public ParallelTermDocs() {} - public ParallelTermDocs(Term term) throws IOException { seek(term); } + public ParallelTermDocs(Term term) throws IOException { + if (term == null) + termDocs = readers.isEmpty() ? null : readers.get(0).termDocs(null); + else + seek(term); + } public int doc() { return termDocs.doc(); } public int freq() { return termDocs.freq(); } public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); termDocs = reader!=null ? reader.termDocs(term) : null; } @@ -588,8 +668,9 @@ public ParallelTermPositions() {} public ParallelTermPositions(Term term) throws IOException { seek(term); } + @Override public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); termDocs = reader!=null ? reader.termPositions(term) : null; } @@ -614,3 +695,8 @@ } } + + + + + Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.lucene; + +import java.io.File; +import java.io.IOException; +import java.io.ByteArrayOutputStream; +import java.io.OutputStreamWriter; +import java.util.zip.GZIPOutputStream; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.CompressionTools; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.analysis.AnalyzerFactory; +import org.apache.nutch.analysis.NutchAnalyzer; +import org.apache.nutch.analysis.NutchDocumentAnalyzer; +import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchIndexWriter; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.util.LogUtil; + +public class LuceneWriter implements NutchIndexWriter { + + public static enum STORE { YES, NO, COMPRESS } + + public static enum INDEX { NO, NO_NORMS, TOKENIZED, UNTOKENIZED } + + public static enum VECTOR { NO, OFFSET, POS, POS_OFFSET, YES } + + private IndexWriter writer; + + private AnalyzerFactory analyzerFactory; + + private Path perm; + + private Path temp; + + private FileSystem fs; + + private final Map<String, Field.Store> fieldStore; + private final Set<String> fieldCompress; + + private final Map<String, Field.Index> fieldIndex; + + private final Map<String, Field.TermVector> fieldVector; + + public LuceneWriter() { + fieldStore = new HashMap<String, Field.Store>(); + fieldCompress = new HashSet<String>(); + fieldIndex = new HashMap<String, Field.Index>(); + fieldVector = new HashMap<String, Field.TermVector>(); + } + + private Document createLuceneDoc(NutchDocument doc) { + final Document out = new Document(); + + out.setBoost(doc.getScore()); + + final Metadata documentMeta = doc.getDocumentMeta(); + for (final Entry<String, List<String>> entry : doc) { + final String fieldName = entry.getKey(); + + Field.Store store = fieldStore.get(fieldName); + boolean compress = fieldCompress.contains(fieldName); + Field.Index index = fieldIndex.get(fieldName); + Field.TermVector vector = fieldVector.get(fieldName); + + // default values + if (store == null) { + store = Field.Store.NO; + } + + if (index == null) { + index = Field.Index.NO; + } + + if (vector == null) { + vector = Field.TermVector.NO; + } + + // read document-level field information + final String[] fieldMetas = + documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName); + if (fieldMetas.length != 0) { + for (final String val : fieldMetas) { + System.out.println( fieldName + " : " + val ); + if (LuceneConstants.STORE_YES.equals(val)) { + store = Field.Store.YES; + } else if (LuceneConstants.STORE_NO.equals(val)) { + store = Field.Store.NO; + } else if (LuceneConstants.STORE_COMPRESS.equals(val)) { + compress = true; + } else if (LuceneConstants.INDEX_TOKENIZED.equals(val)) { + index = Field.Index.ANALYZED; + } else if (LuceneConstants.INDEX_NO.equals(val)) { + index = 
Field.Index.NO; + } else if (LuceneConstants.INDEX_UNTOKENIZED.equals(val)) { + index = Field.Index.NOT_ANALYZED; + } else if (LuceneConstants.INDEX_NO_NORMS.equals(val)) { + index = Field.Index.ANALYZED_NO_NORMS; + } else if (LuceneConstants.VECTOR_NO.equals(val)) { + vector = Field.TermVector.NO; + } else if (LuceneConstants.VECTOR_YES.equals(val)) { + vector = Field.TermVector.YES; + } else if (LuceneConstants.VECTOR_POS.equals(val)) { + vector = Field.TermVector.WITH_POSITIONS; + } else if (LuceneConstants.VECTOR_POS_OFFSET.equals(val)) { + vector = Field.TermVector.WITH_POSITIONS_OFFSETS; + } else if (LuceneConstants.VECTOR_OFFSET.equals(val)) { + vector = Field.TermVector.WITH_OFFSETS; + } + } + } + + for (final String fieldValue : entry.getValue()) { + if ( compress ) + { + out.add( new Field( fieldName, CompressionTools.compressString( fieldValue ), Field.Store.YES ) ); + } + out.add(new Field(fieldName, fieldValue, store, index, vector)); + } + } + + return out; + } + + @SuppressWarnings("unchecked") + private void processOptions(Configuration conf) { + final Iterator iterator = conf.iterator(); + while (iterator.hasNext()) { + final String key = (String) ((Map.Entry)iterator.next()).getKey(); + if (!key.startsWith(LuceneConstants.LUCENE_PREFIX)) { + continue; + } + if (key.startsWith(LuceneConstants.FIELD_STORE_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_STORE_PREFIX.length()); + final LuceneWriter.STORE store = LuceneWriter.STORE.valueOf(conf.get(key)); + switch (store) { + case YES: + fieldStore.put(field, Field.Store.YES); + break; + case NO: + fieldStore.put(field, Field.Store.NO); + break; + case COMPRESS: + fieldCompress.add(field); + break; + } + } else if (key.startsWith(LuceneConstants.FIELD_INDEX_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_INDEX_PREFIX.length()); + final LuceneWriter.INDEX index = LuceneWriter.INDEX.valueOf(conf.get(key)); + switch (index) { + case NO: + fieldIndex.put(field, Field.Index.NO); + break; + case NO_NORMS: + fieldIndex.put(field, Field.Index.NOT_ANALYZED_NO_NORMS); + break; + case TOKENIZED: + fieldIndex.put(field, Field.Index.ANALYZED); + break; + case UNTOKENIZED: + fieldIndex.put(field, Field.Index.NOT_ANALYZED); + break; + } + } else if (key.startsWith(LuceneConstants.FIELD_VECTOR_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_VECTOR_PREFIX.length()); + final LuceneWriter.VECTOR vector = LuceneWriter.VECTOR.valueOf(conf.get(key)); + switch (vector) { + case NO: + fieldVector.put(field, Field.TermVector.NO); + break; + case OFFSET: + fieldVector.put(field, Field.TermVector.WITH_OFFSETS); + break; + case POS: + fieldVector.put(field, Field.TermVector.WITH_POSITIONS); + break; + case POS_OFFSET: + fieldVector.put(field, Field.TermVector.WITH_POSITIONS_OFFSETS); + break; + case YES: + fieldVector.put(field, Field.TermVector.YES); + break; + } + } + } + } + + public void open(JobConf job, String name) + throws IOException { + this.fs = FileSystem.get(job); + perm = new Path(FileOutputFormat.getOutputPath(job), name); + temp = job.getLocalPath("index/_" + + Integer.toString(new Random().nextInt())); + + fs.delete(perm, true); // delete old, if any + analyzerFactory = new AnalyzerFactory(job); + writer = new IndexWriter( + FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), + new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + 
writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job + .getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000)); + writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + processOptions(job); + } + + public void close() throws IOException { + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, Indexer.DONE_NAME)); + } + + public void write(NutchDocument doc) throws IOException { + final Document luceneDoc = createLuceneDoc(doc); + final NutchAnalyzer analyzer = analyzerFactory.get(luceneDoc.get("lang")); + if (Indexer.LOG.isDebugEnabled()) { + Indexer.LOG.debug("Indexing [" + luceneDoc.get("url") + + "] with analyzer " + analyzer + " (" + luceneDoc.get("lang") + + ")"); + } + writer.addDocument(luceneDoc, analyzer); + + } + + /** Adds a lucene field. + * <p> + * This method is provided for backward-compatibility with + * older indexing filters. This should not be used by newer + * implementations since this is slower than + * {@link NutchDocument#add(String, String)} and will be removed + * in a future release. + * </p> + * @param f Lucene field to be added. + * @deprecated Use {@link NutchDocument#add(String, String)} instead and + * set index-level metadata for field information. + * */ + @Deprecated + public static void add(NutchDocument doc, Field f) { + final String fieldName = f.name(); + final String key = LuceneConstants.FIELD_PREFIX + fieldName; + final Metadata documentMeta = doc.getDocumentMeta(); + if (f.isStored()) { + documentMeta.add(key, LuceneConstants.STORE_YES); + } else { + documentMeta.add(key, LuceneConstants.STORE_NO); + } + + if (f.isIndexed()) { + if (f.isTokenized()) { + documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED); + } else if (f.getOmitNorms()) { + documentMeta.add(key, LuceneConstants.INDEX_NO_NORMS); + } else { + documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED); + } + } else { + documentMeta.add(key, LuceneConstants.INDEX_NO); + } + + if (f.isStoreOffsetWithTermVector() && f.isStorePositionWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_POS_OFFSET); + } else if (f.isStoreOffsetWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_OFFSET); + } else if (f.isStorePositionWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_POS); + } else if (f.isTermVectorStored()) { + documentMeta.add(key, LuceneConstants.VECTOR_YES); + } else { + documentMeta.add(key, LuceneConstants.VECTOR_NO); + } + } + + public static void addFieldOptions(String field, LuceneWriter.STORE store, + LuceneWriter.INDEX index, LuceneWriter.VECTOR vector, Configuration conf) { + + conf.set(LuceneConstants.FIELD_STORE_PREFIX + field, store.toString()); + conf.set(LuceneConstants.FIELD_INDEX_PREFIX + field, index.toString()); + conf.set(LuceneConstants.FIELD_VECTOR_PREFIX + field, vector.toString()); + } + + public static void addFieldOptions(String field, LuceneWriter.STORE store, + LuceneWriter.INDEX index, Configuration conf) { + LuceneWriter.addFieldOptions(field, store, index, LuceneWriter.VECTOR.NO, conf); + } +} Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
=================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.io.File; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLDecoder; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.apache.commons.logging.Log; + +import org.apache.hadoop.conf.Configuration; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +/** + * The <code>PluginManifestParser</code> parser just parse the manifest file + * in all plugin directories. + * + * @author joa23 + */ +public class PluginManifestParser { + private static final String ATTR_NAME = "name"; + private static final String ATTR_CLASS = "class"; + private static final String ATTR_ID = "id"; + + public static final Log LOG = PluginRepository.LOG; + + private static final boolean WINDOWS = System.getProperty("os.name") + .startsWith("Windows"); + + private Configuration conf; + + private PluginRepository pluginRepository; + + public PluginManifestParser(Configuration conf, + PluginRepository pluginRepository) { + this.conf = conf; + this.pluginRepository = pluginRepository; + } + + /** + * Returns a list of all found plugin descriptors. + * + * @param pluginFolders + * folders to search plugins from + * @return A {@link Map} of all found {@link PluginDescriptor}s. 
+ */ + public Map<String, PluginDescriptor> parsePluginFolder(String[] pluginFolders) { + Map<String, PluginDescriptor> map = new HashMap<String, PluginDescriptor>(); + + if (pluginFolders == null) { + throw new IllegalArgumentException("plugin.folders is not defined"); + } + + for (String name : pluginFolders) { + File directory = getPluginFolder(name); + if (directory == null) { + continue; + } + LOG.info("Plugins: looking in: " + directory.getAbsolutePath()); + for (File oneSubFolder : directory.listFiles()) { + if (oneSubFolder.isDirectory()) { + String manifestPath = oneSubFolder.getAbsolutePath() + File.separator + + "plugin.xml"; + try { + LOG.debug("parsing: " + manifestPath); + PluginDescriptor p = parseManifestFile(manifestPath); + map.put(p.getPluginId(), p); + } catch (MalformedURLException e) { + LOG.warn(e.toString()); + } catch (SAXException e) { + LOG.warn(e.toString()); + } catch (IOException e) { + LOG.warn(e.toString()); + } catch (ParserConfigurationException e) { + LOG.warn(e.toString()); + } + } + } + } + return map; + } + + /** + * Return the named plugin folder. If the name is absolute then it is + * returned. Otherwise, for relative names, the classpath is scanned. + */ + public File getPluginFolder(String name) { + File directory = new File(name); + if (!directory.isAbsolute()) { + URL url = PluginManifestParser.class.getClassLoader().getResource(name); + if (url == null && directory.exists() && directory.isDirectory() + && directory.listFiles().length > 0) { + return directory; // relative path that is not in the classpath + } else if (url == null) { + LOG.warn("Plugins: directory not found: " + name); + return null; + } else if ( "jar".equals(url.getProtocol()) ) { + try + { + // HACK to find directory containing .jar file and look for plugins there. + LOG.warn( "HACK to look for plugin directory next to jar file: " + url ); + java.net.JarURLConnection connection = (java.net.JarURLConnection) url.openConnection(); + URL url2 = connection.getJarFileURL(); + if ( !"file".equals(url2.getProtocol()) ) + { + LOG.warn( "Jar file is not a file: " + url2 ); + return null; + } + directory = new File( new File( url2.getFile() ).getParent( ) + "/" + name ); + LOG.warn( "Plugin directory: " + directory ); + return directory; + } + catch ( IOException ioe ) + { + LOG.warn( ioe ); + return null; + } + } else if (!"file".equals(url.getProtocol())) { + LOG.warn("Plugins: not a file: url. 
Can't load plugins from: " + url); + return null; + } + String path = url.getPath(); + if (WINDOWS && path.startsWith("/")) // patch a windows bug + path = path.substring(1); + try { + path = URLDecoder.decode(path, "UTF-8"); // decode the url path + } catch (UnsupportedEncodingException e) { + } + directory = new File(path); + } + return directory; + } + + /** + * @param manifestPath + * @throws ParserConfigurationException + * @throws IOException + * @throws SAXException + * @throws MalformedURLException + */ + private PluginDescriptor parseManifestFile(String pManifestPath) + throws MalformedURLException, SAXException, IOException, + ParserConfigurationException { + Document document = parseXML(new File(pManifestPath).toURL()); + String pPath = new File(pManifestPath).getParent(); + return parsePlugin(document, pPath); + } + + /** + * @param url + * @return Document + * @throws IOException + * @throws SAXException + * @throws ParserConfigurationException + * @throws DocumentException + */ + private Document parseXML(URL url) throws SAXException, IOException, + ParserConfigurationException { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + return builder.parse(url.openStream()); + } + + /** + * @param pDocument + * @throws MalformedURLException + */ + private PluginDescriptor parsePlugin(Document pDocument, String pPath) + throws MalformedURLException { + Element rootElement = pDocument.getDocumentElement(); + String id = rootElement.getAttribute(ATTR_ID); + String name = rootElement.getAttribute(ATTR_NAME); + String version = rootElement.getAttribute("version"); + String providerName = rootElement.getAttribute("provider-name"); + String pluginClazz = null; + if (rootElement.getAttribute(ATTR_CLASS).trim().length() > 0) { + pluginClazz = rootElement.getAttribute(ATTR_CLASS); + } + PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name, + providerName, pluginClazz, pPath, this.conf); + LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version + + " provider=" + providerName + "class=" + pluginClazz); + parseExtension(rootElement, pluginDescriptor); + parseExtensionPoints(rootElement, pluginDescriptor); + parseLibraries(rootElement, pluginDescriptor); + parseRequires(rootElement, pluginDescriptor); + return pluginDescriptor; + } + + /** + * @param pRootElement + * @param pDescriptor + * @throws MalformedURLException + */ + private void parseRequires(Element pRootElement, PluginDescriptor pDescriptor) + throws MalformedURLException { + + NodeList nodelist = pRootElement.getElementsByTagName("requires"); + if (nodelist.getLength() > 0) { + + Element requires = (Element) nodelist.item(0); + + NodeList imports = requires.getElementsByTagName("import"); + for (int i = 0; i < imports.getLength(); i++) { + Element anImport = (Element) imports.item(i); + String plugin = anImport.getAttribute("plugin"); + if (plugin != null) { + pDescriptor.addDependency(plugin); + } + } + } + } + + /** + * @param pRootElement + * @param pDescriptor + * @throws MalformedURLException + */ + private void parseLibraries(Element pRootElement, PluginDescriptor pDescriptor) + throws MalformedURLException { + NodeList nodelist = pRootElement.getElementsByTagName("runtime"); + if (nodelist.getLength() > 0) { + + Element runtime = (Element) nodelist.item(0); + + NodeList libraries = runtime.getElementsByTagName("library"); + for (int i = 0; i < libraries.getLength(); i++)... [truncated message content] |
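The common thread in the diffs above is the Lucene API migration: the deprecated String-path forms of IndexWriter and IndexReader.open are replaced with explicit Directory instances (NIOFSDirectory), the writer gains an IndexWriter.MaxFieldLength argument, the merge constants move to LogMergePolicy, and Field.Index.TOKENIZED/UN_TOKENIZED become ANALYZED/NOT_ANALYZED. A minimal before/after sketch of that pattern, assuming a Lucene 2.9/3.0 jar on the classpath (the class name here is illustrative, not part of the commit):

    import java.io.File;

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.NIOFSDirectory;

    public class DirectoryMigrationSketch
    {
      public static void main( String[] args ) throws Exception
      {
        // Old, pre-2.9 style (removed upstream):
        //   IndexWriter writer = new IndexWriter( args[0], new WhitespaceAnalyzer( ), true );
        //   IndexReader reader = IndexReader.open( args[0] );

        // New style: an explicit Directory plus a MaxFieldLength policy.
        NIOFSDirectory dir = new NIOFSDirectory( new File( args[0] ) );

        IndexWriter writer = new IndexWriter( dir, new WhitespaceAnalyzer( ), IndexWriter.MaxFieldLength.UNLIMITED );
        writer.close( );

        // Read-only open, matching the DateAdder change above.
        IndexReader reader = IndexReader.open( dir, true );
        reader.close( );
      }
    }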
From: <bi...@us...> - 2010-07-16 20:25:47
|
Revision: 3170 http://archive-access.svn.sourceforge.net/archive-access/?rev=3170&view=rev Author: binzino Date: 2010-07-16 20:25:38 +0000 (Fri, 16 Jul 2010) Log Message: ----------- Changed logging levels to be less chatty. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-07-12 02:26:34 UTC (rev 3169) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-07-16 20:25:38 UTC (rev 3170) @@ -193,13 +193,14 @@ if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl ); - ArchiveReader r = ArchiveReaderFactory.get( arcUrl ); - r.setDigest( true ); - - ArcReader reader = new ArcReader( r ); - + ArchiveReader r = null; try { + r = ArchiveReaderFactory.get( arcUrl ); + r.setDigest( true ); + + ArcReader reader = new ArcReader( r ); + for ( ARCRecord record : reader ) { // When reading WARC files, records of type other than @@ -214,7 +215,7 @@ } catch ( Exception e ) { - LOG.warn( "Error processing archive file: " + arcUrl, e ); + LOG.error( "Error processing archive file: " + arcUrl, e ); if ( jobConf.getBoolean( "nutchwax.import.abortOnArchiveReadError", false ) ) { @@ -223,7 +224,7 @@ } finally { - r.close(); + if ( r != null ) r.close(); if ( LOG.isInfoEnabled() ) { @@ -246,11 +247,11 @@ { ARCRecordMetaData meta = record.getMetaData(); - if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) ) { - if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Skip URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() ); return false; } @@ -291,7 +292,7 @@ if ( url == null ) { - if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Skip URL: " + meta.getUrl() ); return false; } @@ -375,11 +376,11 @@ } catch ( MalformedURLException mue ) { - if ( LOG.isInfoEnabled() ) LOG.info( "MalformedURL: " + candidateUrl ); + if ( LOG.isDebugEnabled() ) LOG.debug( "MalformedURL: " + candidateUrl ); } catch ( URLFilterException ufe ) { - if ( LOG.isInfoEnabled() ) LOG.info( "URL filtered: " + candidateUrl ); + if ( LOG.isDebugEnabled() ) LOG.debug( "URL filtered: " + candidateUrl ); } return null; @@ -439,9 +440,9 @@ { parseResult = this.parseUtil.parse( content ); } - catch ( Exception e ) + catch ( Throwable t ) { - LOG.warn( "Error parsing: " + key, e ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Error parsing: " + key, t ); } // ?: This is taken from Nutch Fetcher. 
I believe the signatures are used in the Fetcher @@ -590,7 +591,7 @@ count += record.read( buf, 0, Math.min( buf.length, record.available( ) ) ); } - if ( LOG.isInfoEnabled() ) LOG.info( "Bytes read: expected=" + contentLength + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Bytes read: expected=" + contentLength + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count ); // Sanity check. The number of bytes read into our bytes[] // buffer, plus the count of extra stuff read after it should Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2010-07-12 02:26:34 UTC (rev 3169) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2010-07-16 20:25:38 UTC (rev 3170) @@ -70,7 +70,7 @@ if ( s.length != 3 ) { // Don't filter. - LOG.info( "Allowing : " + urlString ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Allowing : " + urlString ); return urlString; } @@ -101,12 +101,12 @@ if ( exclude ) { - LOG.info( "Excluding: " + urlString ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Excluding: " + urlString ); return null; } - LOG.info( "Allowing : " + urlString ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Allowing : " + urlString ); return urlString; } |
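Every call site in this commit uses the same commons-logging guard: test the level first, so the per-record message string is never concatenated when that level is disabled. A minimal sketch of the idiom (GuardedLoggingSketch is a hypothetical class, assuming commons-logging on the classpath):

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class GuardedLoggingSketch {
  private static final Log LOG = LogFactory.getLog(GuardedLoggingSketch.class);

  public void consider(String url, int status) {
    // Without the guard, the concatenation below runs for every record
    // even when DEBUG output is discarded; with it, the call is nearly free.
    if (LOG.isDebugEnabled()) {
      LOG.debug("Skip URL: " + url + " HTTP status:" + status);
    }
  }
}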
From: <bi...@us...> - 2010-11-22 22:44:55
|
Revision: 3344 http://archive-access.svn.sourceforge.net/archive-access/?rev=3344&view=rev Author: binzino Date: 2010-11-22 22:44:48 +0000 (Mon, 22 Nov 2010) Log Message: ----------- Add PDF parser that uses external 'pdftotext' tool. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. 
--> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(pdf2|tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> <!-- Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -31,7 +31,7 @@ </mimeType> <mimeType name="application/pdf"> - <plugin id="parse-tika" /> + <plugin id="parse-pdf2" /> </mimeType> <mimeType name="application/vnd.ms-excel"> @@ -152,6 +152,7 @@ <alias name="parse-ext" extension-id="ExtParser" /> <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" /> <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> + <alias name="parse-pdf2" extension-id="org.archive.nutchwax.parse.pdf.PDFParser" /> <!-- <alias name="parse-js" extension-id="JSParser" /> <alias name="parse-msexceld" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" /> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -91,6 +91,7 @@ <ant dir="query-nutchwax" target="deploy" /> <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> + <ant dir="parse-pdf2" target="deploy" /> </target> @@ -202,5 +203,6 @@ <ant dir="query-nutchwax" target="clean" /> <ant dir="scoring-nutchwax" target="clean" /> <ant dir="urlfilter-nutchwax" target="clean" /> + <ant dir="parse-pdf2" target="clean" /> </target> </project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-pdf2" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,49 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Copyright (C) 2010 Internet Archive. + + This file is part of the archive-access tools project + (http://sourceforge.net/projects/archive-access). + + The archive-access tools are free software; you can redistribute them and/or + modify them under the terms of the GNU Lesser Public License as published by + the Free Software Foundation; either version 2.1 of the License, or any + later version. + + The archive-access tools are distributed in the hope that they will be + useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + Public License for more details. + + You should have received a copy of the GNU Lesser Public License along with + the archive-access tools; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +--> +<plugin + id="parse-pdf2" + name="External PDF Parser" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="parse-pdf2.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.parse.pdf" + name="NutchWAX External PDF Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.archive.nutchwax.parse.pdf.PDFParser" + class="org.archive.nutchwax.parse.pdf.PDFParser"> + <parameter name="contentType" value="application/pdf" /> + <parameter name="pathSuffix" value="" /> + </implementation> + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2010 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.parse.pdf; + +import java.io.*; +import java.util.*; +import java.util.regex.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; + +import org.apache.nutch.protocol.Content; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.util.LogUtil; + + +/** + * PDF parser that extracts the title and body text by shelling out to + * the external 'pdftotext' tool. + */ +public class PDFParser implements Parser +{ + public static final Log LOG = LogFactory.getLog( PDFParser.class ); + + private Configuration conf; + + public void setConf( Configuration conf ) + { + this.conf = conf; + } + + public Configuration getConf( ) + { + return this.conf; + } + + public ParseResult getParse( Content content ) + { + Metadata metadata = new Metadata(); + String title = ""; + String text = ""; + + byte[] raw = content.getContent( ); + + File tmpfile = null; + try + { + tmpfile = File.createTempFile( "pdf2-", ".pdf" ); + + // Write the PDF document to the tmp file. + FileOutputStream fos = new FileOutputStream( tmpfile ); + fos.write( raw ); + fos.close(); + + // Now create a Process to call 'pdftotext' to extract the metadata. + ProcessBuilder pb = new ProcessBuilder( "/usr/bin/pdftotext", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); + + Process p = pb.start(); + + p.getOutputStream( ).close(); + String head = suck( new InputStreamReader( p.getInputStream( ) ) ); + byte[] err = suck( p.getErrorStream( ) ); + + if ( err.length > 0 ) + { + LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + } + + p.destroy( ); + + pb = new ProcessBuilder( "/usr/bin/pdftotext", tmpfile.toString(), "-" ); + p = pb.start( ); + + p.getOutputStream( ).close( ); + text = suck( new InputStreamReader( p.getInputStream( ) ) ); + err = suck( p.getErrorStream( ) ); + + if ( err.length > 0 ) + { + LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + } + + p.destroy( ); + + Matcher m = Pattern.compile( "<html>.*?<title>(.*?)</title>.*?</head>", Pattern.DOTALL ).matcher( head ); + if ( m.find( ) ) + { + title = m.group(1); + } + + //System.out.println( "head = " + head ); + //System.out.println( "title = " + title ); + + // No outlinks. + Outlink[] outlinks = new Outlink[0]; + + ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, + title, + outlinks, + content.getMetadata(), + metadata ); + + return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) ); + } + catch ( Exception e ) + { + LOG.error( e ); + } + finally + { + if ( tmpfile != null ) + { + tmpfile.delete(); + } + } + + // TODO!
+ return null; + } + + private byte[] suck( InputStream is ) + throws IOException + { + ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 ); + byte[] buf = new byte[1024*4]; + int c = -1; + while ( (c = is.read( buf )) != -1 ) + { + baos.write( buf, 0, c ); + } + + return baos.toByteArray(); + } + + private String suck( InputStreamReader reader ) + throws IOException + { + StringBuilder sb = new StringBuilder( 1024 * 4 ); + char[] buf = new char[1024*4]; + int c = -1; + + while ( (c = reader.read( buf )) != -1 ) + { + sb.append( buf, 0, c ); + } + + return sb.toString(); + } + +} |
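The parser shells out to pdftotext twice per document: once with -htmlmeta restricted to page 1, so the title can be pulled out of the generated HTML head with a regex, and once plain over the whole file for the body text. A standalone sketch of those two invocations (PdftotextSketch is hypothetical; it assumes the same hard-coded /usr/bin/pdftotext path the plugin uses):

import java.io.BufferedReader;
import java.io.InputStreamReader;

public class PdftotextSketch {
  public static void main(String[] args) throws Exception {
    String pdf = args[0];
    // Pass 1: first page only, wrapped in HTML so the <title> is recoverable.
    run("/usr/bin/pdftotext", "-htmlmeta", "-f", "1", "-l", "1", pdf, "-");
    // Pass 2: plain text of the whole document, written to stdout ("-").
    run("/usr/bin/pdftotext", pdf, "-");
  }

  private static void run(String... cmd) throws Exception {
    Process p = new ProcessBuilder(cmd).redirectErrorStream(true).start();
    p.getOutputStream().close(); // pdftotext reads nothing from stdin
    try (BufferedReader r = new BufferedReader(
             new InputStreamReader(p.getInputStream(), "UTF-8"))) {
      String line;
      while ((line = r.readLine()) != null) {
        System.out.println(line);
      }
    }
    p.waitFor();
  }
}

Running the extractor in a child process also isolates the Hadoop task from pathological PDFs: a crash or runaway allocation kills only the pdftotext process, which is presumably part of the motivation for routing application/pdf away from parse-tika here.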
From: <bi...@us...> - 2012-01-26 20:53:09
|
Revision: 3607 http://archive-access.svn.sourceforge.net/archive-access/?rev=3607&view=rev Author: binzino Date: 2012-01-26 20:53:00 +0000 (Thu, 26 Jan 2012) Log Message: ----------- Initial revision of NutchWAX custom version of parse-html plugin. Main diffs are not enforcing robots meta tag nor trying to process redirects. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/XMLCharacterRecognizer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2012-01-26 20:51:04 UTC (rev 3606) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -92,6 +92,7 @@ <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> <ant dir="parse-pdf2" target="deploy" /> + <ant dir="parse-html2" target="deploy" /> <ant dir="html-decorator" target="deploy" /> </target> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-html2" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parse-html2" + name="NutchWAX Html Parse Plug-in" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="parse-html2.jar"> + <export name="*"/> + </library> + <library name="tagsoup-1.2.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.parse.html" + name="NutchWAX HTML Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.archive.nutchwax.parse.html.HtmlParser" + class="org.archive.nutchwax.parse.html.HtmlParser"> + <parameter name="contentType" value="text/html"/> + <parameter name="pathSuffix" value=""/> + </implementation> + + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,740 @@ +/* + * XXX ab...@ap...: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to + * avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $ + */ +package org.archive.nutchwax.parse.html; + +import java.util.Stack; + +import org.w3c.dom.Comment; +import org.w3c.dom.Document; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.Text; +import org.w3c.dom.CDATASection; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.ext.LexicalHandler; +/** + * This class takes SAX events (in addition to some extra events + * that SAX doesn't handle yet) and adds the result to a document + * or document fragment. 
+ */ +public class DOMBuilder + implements ContentHandler, LexicalHandler +{ + + /** Root document */ + public Document m_doc; + + /** Current node */ + protected Node m_currentNode = null; + + /** First node of document fragment or null if not a DocumentFragment */ + public DocumentFragment m_docFrag = null; + + /** Vector of element nodes */ + protected Stack m_elemStack = new Stack(); + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document fragment. + * + * @param doc Root document + * @param node Current node + */ + public DOMBuilder(Document doc, Node node) + { + m_doc = doc; + m_currentNode = node; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document fragment. + * + * @param doc Root document + * @param docFrag Document fragment + */ + public DOMBuilder(Document doc, DocumentFragment docFrag) + { + m_doc = doc; + m_docFrag = docFrag; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document. + * + * @param doc Root document + */ + public DOMBuilder(Document doc) + { + m_doc = doc; + } + + /** + * Get the root node of the DOM being created. This + * is either a Document or a DocumentFragment. + * + * @return The root document or document fragment if not null + */ + public Node getRootNode() + { + return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; + } + + /** + * Get the node currently being processed. + * + * @return the current node being processed + */ + public Node getCurrentNode() + { + return m_currentNode; + } + + /** + * Return null since there is no Writer for this class. + * + * @return null + */ + public java.io.Writer getWriter() + { + return null; + } + + /** + * Append a node to the current container. + * + * @param newNode New node to append + */ + protected void append(Node newNode) throws org.xml.sax.SAXException + { + + Node currentNode = m_currentNode; + + if (null != currentNode) + { + currentNode.appendChild(newNode); + + // System.out.println(newNode.getNodeName()); + } + else if (null != m_docFrag) + { + m_docFrag.appendChild(newNode); + } + else + { + boolean ok = true; + short type = newNode.getNodeType(); + + if (type == Node.TEXT_NODE) + { + String data = newNode.getNodeValue(); + + if ((null != data) && (data.trim().length() > 0)) + { + throw new org.xml.sax.SAXException("Warning: can't output text before document element! Ignoring..."); + } + + ok = false; + } + else if (type == Node.ELEMENT_NODE) + { + if (m_doc.getDocumentElement() != null) + { + throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!"); + } + } + + if (ok) + m_doc.appendChild(newNode); + } + } + + /** + * Receive an object for locating the origin of SAX document events. + * + * <p>SAX parsers are strongly encouraged (though not absolutely + * required) to supply a locator: if it does so, it must supply + * the locator to the application by invoking this method before + * invoking any of the other methods in the ContentHandler + * interface.</p> + * + * <p>The locator allows the application to determine the end + * position of any document-related event, even if the parser is + * not reporting an error. Typically, the application will + * use this information for reporting its own errors (such as + * character content that does not match an application's + * business rules). 
The information returned by the locator + * is probably not sufficient for use with a search engine.</p> + * + * <p>Note that the locator will return correct information only + * during the invocation of the events in this interface. The + * application should not attempt to use it at any other time.</p> + * + * @param locator An object that can return the location of + * any SAX document event. + * @see org.xml.sax.Locator + */ + public void setDocumentLocator(Locator locator) + { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of a document. + * + * <p>The SAX parser will invoke this method only once, before any + * other methods in this interface or in DTDHandler (except for + * setDocumentLocator).</p> + */ + public void startDocument() throws org.xml.sax.SAXException + { + + // No action for the moment. + } + + /** + * Receive notification of the end of a document. + * + * <p>The SAX parser will invoke this method only once, and it will + * be the last method invoked during the parse. The parser shall + * not invoke this method until it has either abandoned parsing + * (because of an unrecoverable error) or reached the end of + * input.</p> + */ + public void endDocument() throws org.xml.sax.SAXException + { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of an element. + * + * <p>The Parser will invoke this method at the beginning of every + * element in the XML document; there will be a corresponding + * endElement() event for every startElement() event (even when the + * element is empty). All of the element's content will be + * reported, in order, before the corresponding endElement() + * event.</p> + * + * <p>If the element name has a namespace prefix, the prefix will + * still be attached. Note that the attribute list provided will + * contain only attributes with explicit values (specified or + * defaulted): #IMPLIED attributes will be omitted.</p> + * + * + * @param ns The namespace of the node + * @param localName The local part of the qualified name + * @param name The element name. + * @param atts The attributes attached to the element, if any. + * @see #endElement + * @see org.xml.sax.Attributes + */ + public void startElement( + String ns, String localName, String name, Attributes atts) + throws org.xml.sax.SAXException + { + + Element elem; + + // Note that the namespace-aware call must be used to correctly + // construct a Level 2 DOM, even for non-namespaced nodes. + if ((null == ns) || (ns.length() == 0)) + elem = m_doc.createElementNS(null,name); + else + elem = m_doc.createElementNS(ns, name); + + append(elem); + + try + { + int nAtts = atts.getLength(); + + if (0 != nAtts) + { + for (int i = 0; i < nAtts; i++) + { + + //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) ); + // First handle a possible ID attribute + if (atts.getType(i).equalsIgnoreCase("ID")) + setIDAttribute(atts.getValue(i), elem); + + String attrNS = atts.getURI(i); + + if("".equals(attrNS)) + attrNS = null; // DOM represents no-namespace as null + + // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) + // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); + // Crimson won't let us set an xmlns: attribute on the DOM. 
+ String attrQName = atts.getQName(i); + + // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace + if (attrQName.startsWith("xmlns:")) + attrNS = "http://www.w3.org/2000/xmlns/"; + + // ALWAYS use the DOM Level 2 call! + elem.setAttributeNS(attrNS,attrQName, atts.getValue(i)); + } + } + + // append(elem); + + m_elemStack.push(elem); + + m_currentNode = elem; + + // append(elem); + } + catch(java.lang.Exception de) + { + // de.printStackTrace(); + throw new org.xml.sax.SAXException(de); + } + + } + + /** + + + + * Receive notification of the end of an element. + * + * <p>The SAX parser will invoke this method at the end of every + * element in the XML document; there will be a corresponding + * startElement() event for every endElement() event (even when the + * element is empty).</p> + * + * <p>If the element name has a namespace prefix, the prefix will + * still be attached to the name.</p> + * + * + * @param ns the namespace of the element + * @param localName The local part of the qualified name of the element + * @param name The element name + */ + public void endElement(String ns, String localName, String name) + throws org.xml.sax.SAXException + { + m_elemStack.pop(); + m_currentNode = m_elemStack.isEmpty() ? null : (Node)m_elemStack.peek(); + } + + /** + * Set an ID string to node association in the ID table. + * + * @param id The ID string. + * @param elem The associated ID. + */ + public void setIDAttribute(String id, Element elem) + { + + // Do nothing. This method is meant to be overiden. + } + + /** + * Receive notification of character data. + * + * <p>The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity, so that the Locator + * provides useful information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * <p>Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating + * parsers must do so).</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + if (m_inCData) + { + cdata(ch, start, length); + + return; + } + + String s = new String(ch, start, length); + Node childNode; + childNode = m_currentNode != null ? m_currentNode.getLastChild(): null; + if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){ + ((Text)childNode).appendData(s); + } + else{ + Text text = m_doc.createTextNode(s); + append(text); + } + } + + /** + * If available, when the disable-output-escaping attribute is used, + * output raw text without escaping. A PI will be inserted in front + * of the node with the name "lotusxsl-next-is-raw" and a value of + * "formatter-to-dom". 
+ * + * @param ch Array containing the characters + * @param start Index to start of characters in the array + * @param length Number of characters in the array + */ + public void charactersRaw(char ch[], int start, int length) + throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + + String s = new String(ch, start, length); + + append(m_doc.createProcessingInstruction("xslt-next-is-raw", + "formatter-to-dom")); + append(m_doc.createTextNode(s)); + } + + /** + * Report the beginning of an entity. + * + * The start and end of the document entity are not reported. + * The start and end of the external DTD subset are reported + * using the pseudo-name "[dtd]". All other events must be + * properly nested within start/end entity events. + * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%'. + * @see #endEntity + * @see org.xml.sax.ext.DeclHandler#internalEntityDecl + * @see org.xml.sax.ext.DeclHandler#externalEntityDecl + */ + public void startEntity(String name) throws org.xml.sax.SAXException + { + + // Almost certainly the wrong behavior... + // entityReference(name); + } + + /** + * Report the end of an entity. + * + * @param name The name of the entity that is ending. + * @see #startEntity + */ + public void endEntity(String name) throws org.xml.sax.SAXException{} + + /** + * Receive notivication of a entityReference. + * + * @param name name of the entity reference + */ + public void entityReference(String name) throws org.xml.sax.SAXException + { + append(m_doc.createEntityReference(name)); + } + + /** + * Receive notification of ignorable whitespace in element content. + * + * <p>Validating Parsers must use this method to report each chunk + * of ignorable whitespace (see the W3C XML 1.0 recommendation, + * section 2.10): non-validating parsers may also use this method + * if they are capable of parsing and using content models.</p> + * + * <p>SAX parsers may return all contiguous whitespace in a single + * chunk, or they may split it into several chunks; however, all of + * the characters in any single event must come from the same + * external entity, so that the Locator provides useful + * information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #characters + */ + public void ignorableWhitespace(char ch[], int start, int length) + throws org.xml.sax.SAXException + { + if(isOutsideDocElem()) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + append(m_doc.createTextNode(s)); + } + + /** + * Tell if the current node is outside the document element. + * + * @return true if the current node is outside the document element. + */ + private boolean isOutsideDocElem() + { + return (null == m_docFrag) && m_elemStack.size() == 0 && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); + } + + /** + * Receive notification of a processing instruction. 
+ * + * <p>The Parser will invoke this method once for each processing + * instruction found: note that processing instructions may occur + * before or after the main document element.</p> + * + * <p>A SAX parser should never report an XML declaration (XML 1.0, + * section 2.8) or a text declaration (XML 1.0, section 4.3.1) + * using this method.</p> + * + * @param target The processing instruction target. + * @param data The processing instruction data, or null if + * none was supplied. + */ + public void processingInstruction(String target, String data) + throws org.xml.sax.SAXException + { + append(m_doc.createProcessingInstruction(target, data)); + } + + /** + * Report an XML comment anywhere in the document. + * + * This callback will be used for comments inside or outside the + * document element, including comments in the external DTD + * subset (if read). + * + * @param ch An array holding the characters in the comment. + * @param start The starting position in the array. + * @param length The number of characters to use from the array. + */ + public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException + { + // tagsoup sometimes submits invalid values here + if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) return; + append(m_doc.createComment(new String(ch, start, length))); + } + + /** Flag indicating that we are processing a CData section */ + protected boolean m_inCData = false; + + /** + * Report the start of a CDATA section. + * + * @see #endCDATA + */ + public void startCDATA() throws org.xml.sax.SAXException + { + m_inCData = true; + append(m_doc.createCDATASection("")); + } + + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + public void endCDATA() throws org.xml.sax.SAXException + { + m_inCData = false; + } + + /** + * Receive notification of cdata. + * + * <p>The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity, so that the Locator + * provides useful information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * <p>Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating + * parsers must do so).</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + // XXX ab...@ap...: modified from the original, to accomodate TagSoup. + Node n = m_currentNode.getLastChild(); + if (n instanceof CDATASection) + ((CDATASection)n).appendData(s); + else if (n instanceof Comment) + ((Comment)n).appendData(s); + } + + /** + * Report the start of DTD declarations, if any. + * + * Any declarations are assumed to be in the internal subset + * unless otherwise indicated. + * + * @param name The document type name. 
+ * @param publicId The declared public identifier for the + * external DTD subset, or null if none was declared. + * @param systemId The declared system identifier for the + * external DTD subset, or null if none was declared. + * @see #endDTD + * @see #startEntity + */ + public void startDTD(String name, String publicId, String systemId) + throws org.xml.sax.SAXException + { + + // Do nothing for now. + } + + /** + * Report the end of DTD declarations. + * + * @see #startDTD + */ + public void endDTD() throws org.xml.sax.SAXException + { + + // Do nothing for now. + } + + /** + * Begin the scope of a prefix-URI Namespace mapping. + * + * <p>The information from this event is not necessary for + * normal Namespace processing: the SAX XML reader will + * automatically replace prefixes for element and attribute + * names when the http://xml.org/sax/features/namespaces + * feature is true (the default).</p> + * + * <p>There are cases, however, when applications need to + * use prefixes in character data or in attribute values, + * where they cannot safely be expanded automatically; the + * start/endPrefixMapping event supplies the information + * to the application to expand prefixes in those contexts + * itself, if necessary.</p> + * + * <p>Note that start/endPrefixMapping events are not + * guaranteed to be properly nested relative to each-other: + * all startPrefixMapping events will occur before the + * corresponding startElement event, and all endPrefixMapping + * events will occur after the corresponding endElement event, + * but their order is not guaranteed.</p> + * + * @param prefix The Namespace prefix being declared. + * @param uri The Namespace URI the prefix is mapped to. + * @see #endPrefixMapping + * @see #startElement + */ + public void startPrefixMapping(String prefix, String uri) + throws org.xml.sax.SAXException + { + + /* + // Not sure if this is needed or wanted + // Also, it fails in the stree. + if((null != m_currentNode) + && (m_currentNode.getNodeType() == Node.ELEMENT_NODE)) + { + String qname; + if(((null != prefix) && (prefix.length() == 0)) + || (null == prefix)) + qname = "xmlns"; + else + qname = "xmlns:"+prefix; + + Element elem = (Element)m_currentNode; + String val = elem.getAttribute(qname); // Obsolete, should be DOM2...? + if(val == null) + { + elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", + qname, uri); + } + } + */ + } + + /** + * End the scope of a prefix-URI mapping. + * + * <p>See startPrefixMapping for details. This event will + * always occur after the corresponding endElement event, + * but the order of endPrefixMapping events is not otherwise + * guaranteed.</p> + * + * @param prefix The prefix that was being mapping. + * @see #startPrefixMapping + * @see #endElement + */ + public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{} + + /** + * Receive notification of a skipped entity. + * + * <p>The Parser will invoke this method once for each entity + * skipped. Non-validating processors may skip entities if they + * have not seen the declarations (because, for example, the + * entity was declared in an external DTD subset). All processors + * may skip external entities, depending on the values of the + * http://xml.org/sax/features/external-general-entities and the + * http://xml.org/sax/features/external-parameter-entities + * properties.</p> + * + * @param name The name of the skipped entity. If it is a + * parameter entity, the name will begin with '%'. 
+ */ + public void skippedEntity(String name) throws org.xml.sax.SAXException{} +} Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,419 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.parse.html; + +import java.net.URL; +import java.net.MalformedURLException; +import java.util.Collection; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Stack; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.util.NodeWalker; +import org.apache.hadoop.conf.Configuration; + +import org.w3c.dom.*; + +/** + * A collection of methods for extracting content from DOM trees. + * + * This class holds a few utility methods for pulling content out of + * DOM nodes, such as getOutlinks, getText, etc. 
+ * + */ +public class DOMContentUtils { + + public static class LinkParams { + public String elName; + public String attrName; + public int childLen; + + public LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap linkParams = new HashMap(); + private Configuration conf; + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection<String> forceTags = new ArrayList<String>(1); + + this.conf = conf; + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", true)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) { + if ( ! forceTags.contains(ignoreTags[i]) ) + linkParams.remove(ignoreTags[i]); + } + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, + * and will append all the content text found beneath the DOM node to + * the <code>StringBuffer</code>. + * + * <p> + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will + * be aborted and the <code>StringBuffer</code> will not contain + * any text encountered after a nested anchor is found. + * + * <p> + * + * @return true if nested anchors were found + */ + public boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + + /** + * This is a convenience method, equivalent to {@link + * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. 
+   *
+   */
+  public void getText(StringBuffer sb, Node node) {
+    getText(sb, node, false);
+  }
+
+  // returns true if abortOnNestedAnchors is true and we find nested
+  // anchors
+  private boolean getTextHelper(StringBuffer sb, Node node,
+                                boolean abortOnNestedAnchors,
+                                int anchorDepth) {
+    boolean abort = false;
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("script".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if ("style".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
+        anchorDepth++;
+        if (anchorDepth > 1) {
+          abort = true;
+          break;
+        }
+      }
+      if (nodeType == Node.COMMENT_NODE) {
+        walker.skipChildren();
+      }
+      if (nodeType == Node.TEXT_NODE) {
+        // cleanup and trim the value
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        text = text.trim();
+        if (text.length() > 0) {
+          if (sb.length() > 0) sb.append(' ');
+          sb.append(text);
+        }
+      }
+    }
+
+    return abort;
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node},
+   * and will append the content text found beneath the first
+   * <code>title</code> node to the <code>StringBuffer</code>.
+   *
+   * @return true if a title node was found, false otherwise
+   */
+  public boolean getTitle(StringBuffer sb, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+        return false;
+      }
+
+      if (nodeType == Node.ELEMENT_NODE) {
+        if ("title".equalsIgnoreCase(nodeName)) {
+          getText(sb, currentNode);
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  /** If the Node contains a BASE tag then its HREF is returned. */
+  public URL getBase(Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      // is this node a BASE tag?
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+          return null;
+        }
+
+        if ("base".equalsIgnoreCase(nodeName)) {
+          NamedNodeMap attrs = currentNode.getAttributes();
+          for (int i= 0; i < attrs.getLength(); i++ ) {
+            Node attr = attrs.item(i);
+            if ("href".equalsIgnoreCase(attr.getNodeName())) {
+              try {
+                return new URL(attr.getNodeValue());
+              } catch (MalformedURLException e) {}
+            }
+          }
+        }
+      }
+    }
+
+    // no.
+    return null;
+  }
+
+
+  private boolean hasOnlyWhiteSpace(Node node) {
+    String val= node.getNodeValue();
+    for (int i= 0; i < val.length(); i++) {
+      if (!Character.isWhitespace(val.charAt(i)))
+        return false;
+    }
+    return true;
+  }
+
+  // this only covers a few cases of empty links that are symptomatic
+  // of nekohtml's DOM-fixup process...
+  private boolean shouldThrowAwayLink(Node node, NodeList children,
+                                      int childLen, LinkParams params) {
+    if (childLen == 0) {
+      // this has no inner structure
+      if (params.childLen == 0) return false;
+      else return true;
+    } else if ((childLen == 1)
+               && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+               && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
+      // single nested link
+      return true;
+
+    } else if (childLen == 2) {
+
+      Node c0= children.item(0);
+      Node c1= children.item(1);
+
+      if ((c0.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+          && (c1.getNodeType() == Node.TEXT_NODE)
+          && hasOnlyWhiteSpace(c1) ) {
+        // single link followed by whitespace node
+        return true;
+      }
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE)
+          && hasOnlyWhiteSpace(c0) ) {
+        // whitespace node followed by single link
+        return true;
+      }
+
+    } else if (childLen == 3) {
+      Node c0= children.item(0);
+      Node c1= children.item(1);
+      Node c2= children.item(2);
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE)
+          && (c2.getNodeType() == Node.TEXT_NODE)
+          && hasOnlyWhiteSpace(c0)
+          && hasOnlyWhiteSpace(c2) ) {
+        // single link surrounded by whitespace nodes
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Handles cases where the URL param information is encoded into the base
+   * URL as opposed to the target.
+   * <p>
+   * If the target contains params (i.e. ';xxxx') information then the target
+   * params information is assumed to be correct and any base params information
+   * is ignored. If the base contains params information but the target does
+   * not, then the params information is moved to the target allowing it to be
+   * correctly determined by the java.net.URL class.
+   *
+   * @param base The base URL.
+   * @param target The target path from the base URL.
+   *
+   * @return URL A URL with the params information correctly encoded.
+   *
+   * @throws MalformedURLException If the url is not a well formed URL.
+   */
+  private URL fixEmbeddedParams(URL base, String target)
+    throws MalformedURLException{
+
+    // if the target contains params information, or the base doesn't, then no
+    // conversion is necessary; return a regular URL
+    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+      return new URL(base, target);
+    }
+
+    // get the base url and its params information
+    String baseURL = base.toString();
+    int startParams = baseURL.indexOf(';');
+    String params = baseURL.substring(startParams);
+
+    // if the target has a query string then put the params information after
+    // any path but before the query string, otherwise just append to the path
+    int startQS = target.indexOf('?');
+    if (startQS >= 0) {
+      target = target.substring(0, startQS) + params
+               + target.substring(startQS);
+    }
+    else {
+      target += params;
+    }
+
+    return new URL(base, target);
+  }
+
+  /**
+   * This method finds all anchors below the supplied DOM
+   * <code>node</code>, and creates appropriate {@link Outlink}
+   * records for each (relative to the supplied <code>base</code>
+   * URL), and adds them to the <code>outlinks</code> {@link
+   * ArrayList}.
+   *
+   * <p>
+   *
+   * Links without inner structure (tags, text, etc) are discarded, as
+   * are links which contain only single nested links and empty text
+   * nodes (this is a common DOM-fixup artifact, at least with
+   * nekohtml).
+   */
+  public void getOutlinks(URL base, ArrayList outlinks,
+                          Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+      NodeList children = currentNode.getChildNodes();
+      int childLen = (children != null) ? children.getLength() : 0;
+
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        nodeName = nodeName.toLowerCase();
+        LinkParams params = (LinkParams)linkParams.get(nodeName);
+        if (params != null) {
+          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
+
+            StringBuffer linkText = new StringBuffer();
+            getText(linkText, currentNode, true);
+
+            NamedNodeMap attrs = currentNode.getAttributes();
+            String target = null;
+            boolean noFollow = false;
+            boolean post = false;
+            for (int i= 0; i < attrs.getLength(); i++ ) {
+              Node attr = attrs.item(i);
+              String attrName = attr.getNodeName();
+              if (params.attrName.equalsIgnoreCase(attrName)) {
+                target = attr.getNodeValue();
+              } else if ("rel".equalsIgnoreCase(attrName) &&
+                         "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+                noFollow = true;
+              } else if ("method".equalsIgnoreCase(attrName) &&
+                         "post".equalsIgnoreCase(attr.getNodeValue())) {
+                post = true;
+              }
+            }
+            if (target != null && !noFollow && !post)
+              try {
+
+                URL url = (base.toString().indexOf(';') > 0) ?
+                          fixEmbeddedParams(base, target) : new URL(base, target);
+                outlinks.add(new Outlink(url.toString(),
+                                         linkText.toString().trim()));
+              } catch (MalformedURLException e) {
+                // don't care
+              }
+          }
+          // this should not have any children, skip them
+          if (params.childLen == 0) continue;
+        }
+      }
+    }
+  }
+
+}

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java	2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.parse.html;
+
+import java.net.URL;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees.  This class
+ * specifically handles Robots META directives (all, none, nofollow,
+ * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
+ * instructions.  All meta directives are stored in an HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+  /**
+   * Utility class with indicators for the robots directives "noindex"
+   * and "nofollow", and HTTP-EQUIV/no-cache
+   */
+
+  /**
+   * Sets the indicators in <code>robotsMeta</code> to appropriate
+   * values, based on any META tags found under the given
+   * <code>node</code>.
+   */
+  public static final void getMetaTags (
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  private static final void getMetaTagsHelper(
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) {
+        // META tags should not be under body
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node nameNode = null;
+        Node equivNode = null;
+        Node contentNode = null;
+        // Retrieves name, http-equiv and content attributes
+        for (int i=0; i<attrs.g... [truncated message content] |
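For readers unfamiliar with the parse-html2 plugin, the sketch below shows one way the DOMContentUtils API added above can be driven. It is a minimal, hypothetical harness, not part of the patch: the class name OutlinkDemo and the inline XHTML are invented for illustration, the stock JAXP parser stands in for the NekoHTML/TagSoup-repaired DOM the plugin actually supplies, and the Nutch, Hadoop, and NutchWAX jars are assumed to be on the classpath.

import java.io.ByteArrayInputStream;
import java.net.URL;
import java.util.ArrayList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.Outlink;
import org.archive.nutchwax.parse.html.DOMContentUtils;
import org.w3c.dom.Document;

public class OutlinkDemo {  // hypothetical harness, not part of the patch
  public static void main(String[] args) throws Exception {
    // Well-formed XHTML so the stock JAXP parser can build the DOM;
    // the plugin itself feeds DOMContentUtils a NekoHTML-repaired tree.
    String html = "<html><head><title>Demo</title></head>"
                + "<body><a href=\"/about\">About us</a></body></html>";
    DocumentBuilder builder =
        DocumentBuilderFactory.newInstance().newDocumentBuilder();
    Document doc =
        builder.parse(new ByteArrayInputStream(html.getBytes("UTF-8")));

    // An empty Hadoop Configuration leaves the default link tags (a, area,
    // form, frame, iframe, script, link, img) registered in setConf().
    DOMContentUtils utils = new DOMContentUtils(new Configuration());

    StringBuffer title = new StringBuffer();
    utils.getTitle(title, doc);               // title now holds "Demo"

    ArrayList outlinks = new ArrayList();     // raw type, matching the API above
    utils.getOutlinks(new URL("http://example.org/"), outlinks, doc);
    for (Object o : outlinks) {
      Outlink link = (Outlink) o;
      // prints: http://example.org/about | About us
      System.out.println(link.getToUrl() + " | " + link.getAnchor());
    }
  }
}

Note that if the base URL carried a param segment (e.g. ';jsessionid=...'), getOutlinks would route relative targets through fixEmbeddedParams so that segment is preserved on the resolved outlink URL.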