From: <bi...@us...> - 2008-12-11 22:58:33
Revision: 2660
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2660&view=rev
Author:   binzino
Date:     2008-12-11 22:58:28 +0000 (Thu, 11 Dec 2008)

Log Message:
-----------
Initial checkin of Nutch source-files that are over-ridden and copied into the Nutch source tree when compiling.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/nutch/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java

Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java	2008-12-11 22:58:28 UTC (rev 2660)
@@ -0,0 +1,375 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.BufferedReader;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.*;
+import org.apache.nutch.crawl.*;
+
+/** Implements {@link HitSummarizer} and {@link HitContent} for a set of
+ * fetched segments.
*/ +public class FetchedSegments implements HitSummarizer, HitContent +{ + public static final Log LOG = LogFactory.getLog(FetchedSegments.class); + + private static class Segment implements Closeable { + + private static final Partitioner PARTITIONER = new HashPartitioner(); + + private FileSystem fs; + private Path segmentDir; + + private MapFile.Reader[] content; + private MapFile.Reader[] parseText; + private MapFile.Reader[] parseData; + private MapFile.Reader[] crawl; + private Configuration conf; + + public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException { + this.fs = fs; + this.segmentDir = segmentDir; + this.conf = conf; + } + + public CrawlDatum getCrawlDatum(Text url) throws IOException { + synchronized (this) { + if (crawl == null) + crawl = getReaders(CrawlDatum.FETCH_DIR_NAME); + } + return (CrawlDatum)getEntry(crawl, url, new CrawlDatum()); + } + + public byte[] getContent(Text url) throws IOException { + synchronized (this) { + if (content == null) + content = getReaders(Content.DIR_NAME); + } + return ((Content)getEntry(content, url, new Content())).getContent(); + } + + public ParseData getParseData(Text url) throws IOException { + synchronized (this) { + if (parseData == null) + parseData = getReaders(ParseData.DIR_NAME); + } + return (ParseData)getEntry(parseData, url, new ParseData()); + } + + public ParseText getParseText(Text url) throws IOException { + synchronized (this) { + if (parseText == null) + parseText = getReaders(ParseText.DIR_NAME); + } + return (ParseText)getEntry(parseText, url, new ParseText()); + } + + private MapFile.Reader[] getReaders(String subDir) throws IOException { + return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf); + } + + private Writable getEntry(MapFile.Reader[] readers, Text url, + Writable entry) throws IOException { + return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry); + } + + public void close() throws IOException { + if (content != null) { closeReaders(content); } + if (parseText != null) { closeReaders(parseText); } + if (parseData != null) { closeReaders(parseData); } + if (crawl != null) { closeReaders(crawl); } + } + + private void closeReaders(MapFile.Reader[] readers) throws IOException { + for (int i = 0; i < readers.length; i++) { + readers[i].close(); + } + } + + } + + private HashMap segments = new HashMap( ); + private boolean perCollection = false; + private Summarizer summarizer; + + /** Construct given a directory containing fetcher output. */ + public FetchedSegments(FileSystem fs, String segmentsDir, Configuration conf) throws IOException + { + this.summarizer = new SummarizerFactory(conf).getSummarizer(); + + Path[] segmentDirs = HadoopFSUtil.getPaths( fs.listStatus(new Path(segmentsDir), HadoopFSUtil.getPassDirectoriesFilter(fs)) ); + if ( segmentDirs == null ) + { + LOG.warn( "No segment directories: " + segmentsDir ); + return ; + } + + this.perCollection = conf.getBoolean( "nutchwax.FetchedSegments.perCollection", false ); + + LOG.info( "Per-collection segments: " + this.perCollection ); + + for ( int i = 0; i < segmentDirs.length; i++ ) + { + if ( this.perCollection ) + { + // Assume segmentDir is actually a 'collection' dir which + // contains a list of segments, such as: + // crawl/segments/194/segment-foo + // /segment-bar + // /segment-baz + // crawl/segments/366/segment-frotz + // /segment-fizzle + // /segment-bizzle + // The '194' and '366' are collection dirs, which contain the + // actual segment dirs. 
+ Path collectionDir = segmentDirs[i]; + + Map perCollectionSegments = (Map) this.segments.get( collectionDir.getName( ) ); + if ( perCollectionSegments == null ) + { + perCollectionSegments = new HashMap( ); + this.segments.put( collectionDir.getName( ), perCollectionSegments ); + } + + // Now, get a list of all the sub-dirs of the collectionDir, + // and create segments for them, adding them to the + // per-collection map. + Path[] perCollectionSegmentDirs = HadoopFSUtil.getPaths( fs.listStatus( collectionDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ) ); + for ( Path segmentDir : perCollectionSegmentDirs ) + { + perCollectionSegments.put( segmentDir.getName( ), new Segment( fs, segmentDir, conf ) ); + } + + addRemaps( fs, collectionDir, (Map<String,Segment>) perCollectionSegments ); + } + else + { + Path segmentDir = segmentDirs[i]; + segments.put(segmentDir.getName(), new Segment(fs, segmentDir, conf)); + } + } + + // If we not-doing perCollection segments, process a single + // "remap" file for the "segments" dir. + if ( ! this.perCollection ) + { + addRemaps( fs, new Path(segmentsDir), (Map<String,Segment>) segments ); + } + + LOG.info( "segments: " + segments ); + } + + protected void addRemaps( FileSystem fs, Path segmentDir, Map<String,Segment> segments ) + throws IOException + { + Path segmentRemapFile = new Path( segmentDir, "remap" ); + + if ( ! fs.exists( segmentRemapFile ) ) + { + LOG.warn( "Remap file doesn't exist: " + segmentRemapFile ); + + return ; + } + + // InputStream is = segmentRemapFile.getFileSystem( conf ).open( segmentRemapFile ); + InputStream is = fs.open( segmentRemapFile ); + + BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + + String line; + while ( (line = reader.readLine()) != null ) + { + String fields[] = line.trim( ).split( "\\s+" ); + + if ( fields.length < 2 ) + { + LOG.warn( "Malformed remap line, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + // Look for the "to" name in the segments. + Segment toSegment = segments.get( fields[1] ); + if ( toSegment == null ) + { + LOG.warn( "Segment remap destination doesn't exist: " + fields[1] ); + } + else + { + LOG.warn( "Remap: " + fields[0] + " => " + fields[1] ); + segments.put( fields[0], toSegment ); + } + } + } + + + public String[] getSegmentNames() { + return (String[])segments.keySet().toArray(new String[segments.size()]); + } + + public byte[] getContent(HitDetails details) throws IOException { + return getSegment(details).getContent(getUrl(details)); + } + + public ParseData getParseData(HitDetails details) throws IOException { + return getSegment(details).getParseData(getUrl(details)); + } + + public long getFetchDate(HitDetails details) throws IOException { + return getSegment(details).getCrawlDatum(getUrl(details)) + .getFetchTime(); + } + + public ParseText getParseText(HitDetails details) throws IOException { + return getSegment(details).getParseText(getUrl(details)); + } + + public Summary getSummary(HitDetails details, Query query) + throws IOException { + + if (this.summarizer == null) { return new Summary(); } + + Segment segment = getSegment(details); + ParseText parseText = segment.getParseText(getUrl(details)); + String text = (parseText != null) ? 
parseText.getText() : ""; + + return this.summarizer.getSummary(text, query); + } + + private class SummaryThread extends Thread { + private HitDetails details; + private Query query; + + private Summary summary; + private Throwable throwable; + + public SummaryThread(HitDetails details, Query query) { + this.details = details; + this.query = query; + } + + public void run() { + try { + this.summary = getSummary(details, query); + } catch (Throwable throwable) { + this.throwable = throwable; + } + } + + } + + + public Summary[] getSummary(HitDetails[] details, Query query) + throws IOException { + SummaryThread[] threads = new SummaryThread[details.length]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new SummaryThread(details[i], query); + threads[i].start(); + } + + Summary[] results = new Summary[details.length]; + for (int i = 0; i < threads.length; i++) { + try { + threads[i].join(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + if (threads[i].throwable instanceof IOException) { + throw (IOException)threads[i].throwable; + } else if (threads[i].throwable != null) { + throw new RuntimeException(threads[i].throwable); + } + results[i] = threads[i].summary; + } + return results; + } + + + private Segment getSegment(HitDetails details) + { + if ( this.perCollection ) + { + LOG.info( "getSegment: " + details ); + LOG.info( " collection: " + details.getValue("collection") ); + LOG.info( " segment : " + details.getValue("segment") ); + + String collectionId = details.getValue("collection"); + String segmentName = details.getValue("segment"); + + Map perCollectionSegments = (Map) this.segments.get( collectionId ); + + Segment segment = (Segment) perCollectionSegments.get( segmentName ); + + if ( segment == null ) + { + LOG.warn( "Didn't find segment: collection=" + collectionId + " segment=" + segmentName ); + } + + return segment; + } + else + { + LOG.info( "getSegment: " + details ); + LOG.info( " segment : " + details.getValue("segment") ); + + String segmentName = details.getValue( "segment" ); + Segment segment = (Segment) segments.get( segmentName ); + + if ( segment == null ) + { + LOG.warn( "Didn't find segment: " + segmentName ); + } + + return segment; + } + } + + private Text getUrl(HitDetails details) { + String url = details.getValue("orig"); + if (StringUtils.isBlank(url)) { + url = details.getValue("url"); + } + return new Text(url); + } + + public void close() throws IOException { + Iterator iterator = segments.values().iterator(); + while (iterator.hasNext()) { + ((Segment) iterator.next()).close(); + } + } + +} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2008-12-11 22:58:28 UTC (rev 2660) @@ -0,0 +1,179 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.indexer.FsDirectory; +import org.apache.nutch.indexer.NutchSimilarity; + +/** Implements {@link Searcher} and {@link HitDetailer} for either a single + * merged index, or a set of indexes. */ +public class IndexSearcher implements Searcher, HitDetailer { + + private org.apache.lucene.search.Searcher luceneSearcher; + private org.apache.lucene.index.IndexReader reader; + private LuceneQueryOptimizer optimizer; + private FileSystem fs; + private Configuration conf; + private QueryFilters queryFilters; + + /** Construct given a number of indexes. */ + public IndexSearcher(Path[] indexDirs, Configuration conf) throws IOException { + IndexReader[] readers = new IndexReader[indexDirs.length]; + this.conf = conf; + this.fs = FileSystem.get(conf); + for (int i = 0; i < indexDirs.length; i++) { + readers[i] = IndexReader.open(getDirectory(indexDirs[i])); + } + init(new MultiReader(readers), conf); + } + + /** Construct given a single merged index. 
*/ + public IndexSearcher(Path index, Configuration conf) + throws IOException { + this.conf = conf; + this.fs = FileSystem.get(conf); + init(IndexReader.open(getDirectory(index)), conf); + } + + private void init(IndexReader reader, Configuration conf) throws IOException { + this.reader = reader; + this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader); + this.luceneSearcher.setSimilarity(new NutchSimilarity()); + this.optimizer = new LuceneQueryOptimizer(conf); + this.queryFilters = new QueryFilters(conf); + } + + private Directory getDirectory(Path file) throws IOException { + if ("file".equals(this.fs.getUri().getScheme())) { + Path qualified = file.makeQualified(FileSystem.getLocal(conf)); + File fsLocal = new File(qualified.toUri()); + return FSDirectory.getDirectory(fsLocal.getAbsolutePath()); + } else { + return new FsDirectory(this.fs, file, false, this.conf); + } + } + + public Hits search(Query query, int numHits, + String dedupField, String sortField, boolean reverse) + + throws IOException { + org.apache.lucene.search.BooleanQuery luceneQuery = + this.queryFilters.filter(query); + + System.out.println( "Nutch query: " + query ); + System.out.println( "Lucene query: " + luceneQuery ); + + return translateHits + (optimizer.optimize(luceneQuery, luceneSearcher, numHits, + sortField, reverse), + dedupField, sortField); + } + + public String getExplanation(Query query, Hit hit) throws IOException { + return luceneSearcher.explain(this.queryFilters.filter(query), + hit.getIndexDocNo()).toHtml(); + } + + public HitDetails getDetails(Hit hit) throws IOException { + + Document doc = luceneSearcher.doc(hit.getIndexDocNo()); + + List docFields = doc.getFields(); + String[] fields = new String[docFields.size()]; + String[] values = new String[docFields.size()]; + for (int i = 0; i < docFields.size(); i++) { + Field field = (Field)docFields.get(i); + fields[i] = field.name(); + values[i] = field.stringValue(); + } + + return new HitDetails(fields, values); + } + + public HitDetails[] getDetails(Hit[] hits) throws IOException { + HitDetails[] results = new HitDetails[hits.length]; + for (int i = 0; i < hits.length; i++) + results[i] = getDetails(hits[i]); + return results; + } + + private Hits translateHits(TopDocs topDocs, + String dedupField, String sortField) + throws IOException { + + String[] dedupValues = null; + if (dedupField != null) + dedupValues = FieldCache.DEFAULT.getStrings(reader, dedupField); + + ScoreDoc[] scoreDocs = topDocs.scoreDocs; + int length = scoreDocs.length; + Hit[] hits = new Hit[length]; + for (int i = 0; i < length; i++) { + + int doc = scoreDocs[i].doc; + + WritableComparable sortValue; // convert value to writable + if (sortField == null) { + sortValue = new FloatWritable(scoreDocs[i].score); + } else { + Object raw = ((FieldDoc)scoreDocs[i]).fields[0]; + if (raw instanceof Integer) { + sortValue = new IntWritable(((Integer)raw).intValue()); + } else if (raw instanceof Float) { + sortValue = new FloatWritable(((Float)raw).floatValue()); + } else if (raw instanceof String) { + sortValue = new Text((String)raw); + } else { + throw new RuntimeException("Unknown sort value type!"); + } + } + + String dedupValue = dedupValues == null ? 
null : dedupValues[doc]; + + hits[i] = new Hit(doc, sortValue, dedupValue); + } + return new Hits(topDocs.totalHits, hits); + } + + public void close() throws IOException { + if (luceneSearcher != null) { luceneSearcher.close(); } + if (reader != null) { reader.close(); } + } + +} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java 2008-12-11 22:58:28 UTC (rev 2660) @@ -0,0 +1,333 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.IOException; +import java.net.URLEncoder; +import java.util.Map; +import java.util.HashMap; +import java.util.Set; +import java.util.HashSet; + +import javax.servlet.ServletException; +import javax.servlet.ServletConfig; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import javax.xml.parsers.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.w3c.dom.*; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + + +/** Present search results using A9's OpenSearch extensions to RSS, plus a few + * Nutch-specific extensions. 
*/ +public class OpenSearchServlet extends HttpServlet { + private static final Map NS_MAP = new HashMap(); + private int MAX_HITS_PER_PAGE; + + static { + NS_MAP.put("opensearch", "http://a9.com/-/spec/opensearchrss/1.0/"); + NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/"); + } + + private static final Set SKIP_DETAILS = new HashSet(); + static { + SKIP_DETAILS.add("url"); // redundant with RSS link + SKIP_DETAILS.add("title"); // redundant with RSS title + } + + private NutchBean bean; + private Configuration conf; + + public void init(ServletConfig config) throws ServletException { + try { + this.conf = NutchConfiguration.get(config.getServletContext()); + bean = NutchBean.get(config.getServletContext(), this.conf); + } catch (IOException e) { + throw new ServletException(e); + } + MAX_HITS_PER_PAGE = conf.getInt("searcher.max.hits.per.page", -1); + } + + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query request from " + request.getRemoteAddr()); + } + + // get parameters from request + request.setCharacterEncoding("UTF-8"); + String queryString = request.getParameter("query"); + if (queryString == null) + queryString = ""; + String urlQuery = URLEncoder.encode(queryString, "UTF-8"); + + // the query language + String queryLang = request.getParameter("lang"); + + int start = 0; // first hit to display + String startString = request.getParameter("start"); + if (startString != null) + start = Integer.parseInt(startString); + + int hitsPerPage = 10; // number of hits to display + String hitsString = request.getParameter("hitsPerPage"); + if (hitsString != null) + hitsPerPage = Integer.parseInt(hitsString); + if(MAX_HITS_PER_PAGE > 0 && hitsPerPage > MAX_HITS_PER_PAGE) + hitsPerPage = MAX_HITS_PER_PAGE; + + String sort = request.getParameter("sort"); + boolean reverse = + sort!=null && "true".equals(request.getParameter("reverse")); + + // De-Duplicate handling. Look for duplicates field and for how many + // duplicates per results to return. Default duplicates field is 'site' + // and duplicates per results default is '2'. + String dedupField = request.getParameter("dedupField"); + if (dedupField == null || dedupField.length() == 0) { + dedupField = "site"; + } + int hitsPerDup = 2; + String hitsPerDupString = request.getParameter("hitsPerDup"); + if (hitsPerDupString != null && hitsPerDupString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerDupString); + } else { + // If 'hitsPerSite' present, use that value. + String hitsPerSiteString = request.getParameter("hitsPerSite"); + if (hitsPerSiteString != null && hitsPerSiteString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerSiteString); + } + } + + // Make up query string for use later drawing the 'rss' logo. + String params = "&hitsPerPage=" + hitsPerPage + + (queryLang == null ? "" : "&lang=" + queryLang) + + (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") + + (dedupField == null ? 
"" : "&dedupField=" + dedupField)); + + Query query = Query.parse(queryString, queryLang, this.conf); + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query: " + queryString); + NutchBean.LOG.info("lang: " + queryLang); + } + + // execute the query + Hits hits; + try { + hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField, + sort, reverse); + } catch (IOException e) { + if (NutchBean.LOG.isWarnEnabled()) { + NutchBean.LOG.warn("Search Error", e); + } + hits = new Hits(0,new Hit[0]); + } + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("total hits: " + hits.getTotal()); + } + + // generate xml results + int end = (int)Math.min(hits.getLength(), start + hitsPerPage); + int length = end-start; + + Hit[] show = hits.getHits(start, end-start); + HitDetails[] details = bean.getDetails(show); + Summary[] summaries = bean.getSummary(details, query); + + String requestUrl = request.getRequestURL().toString(); + String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); + + + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + Document doc = factory.newDocumentBuilder().newDocument(); + + Element rss = addNode(doc, doc, "rss"); + addAttribute(doc, rss, "version", "2.0"); + addAttribute(doc, rss, "xmlns:opensearch", + (String)NS_MAP.get("opensearch")); + addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch")); + + Element channel = addNode(doc, rss, "channel"); + + addNode(doc, channel, "title", "Nutch: " + queryString); + addNode(doc, channel, "description", "Nutch search results for query: " + + queryString); + addNode(doc, channel, "link", + base+"/search.jsp" + +"?query="+urlQuery + +"&start="+start + +"&hitsPerDup="+hitsPerDup + +params); + + addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal()); + addNode(doc, channel, "opensearch", "startIndex", ""+start); + addNode(doc, channel, "opensearch", "itemsPerPage", ""+hitsPerPage); + + addNode(doc, channel, "nutch", "query", queryString); + + + if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show + || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){ + addNode(doc, channel, "nutch", "nextPage", requestUrl + +"?query="+urlQuery + +"&start="+end + +"&hitsPerDup="+hitsPerDup + +params); + } + + if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { + addNode(doc, channel, "nutch", "showAllHits", requestUrl + +"?query="+urlQuery + +"&hitsPerDup="+0 + +params); + } + + for (int i = 0; i < length; i++) { + Hit hit = show[i]; + HitDetails detail = details[i]; + String title = detail.getValue("title"); + String url = detail.getValue("url"); + String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); + + if (title == null || title.equals("")) { // use url for docs w/o title + title = url; + } + + Element item = addNode(doc, channel, "item"); + + addNode(doc, item, "title", title); + if (summaries[i] != null) { + addNode(doc, item, "description", summaries[i].toString() ); + } + addNode(doc, item, "link", url); + + addNode(doc, item, "nutch", "site", hit.getDedupValue()); + + addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id); + addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id + +"&query="+urlQuery+"&lang="+queryLang); + + if (hit.moreFromDupExcluded()) { + addNode(doc, item, "nutch", "moreFromSite", requestUrl + +"?query=" + +URLEncoder.encode("site:"+hit.getDedupValue() + +" "+queryString, "UTF-8") + +"&hitsPerSite="+0 + 
+params);
+        }
+
+        for (int j = 0; j < detail.getLength(); j++) { // add all from detail
+          String field = detail.getField(j);
+          if (!SKIP_DETAILS.contains(field))
+            addNode(doc, item, "nutch", field, detail.getValue(j));
+        }
+      }
+
+      // dump DOM tree
+
+      DOMSource source = new DOMSource(doc);
+      TransformerFactory transFactory = TransformerFactory.newInstance();
+      Transformer transformer = transFactory.newTransformer();
+      transformer.setOutputProperty("indent", "yes");
+      StreamResult result = new StreamResult(response.getOutputStream());
+      response.setContentType("text/xml");
+      transformer.transform(source, result);
+
+    } catch (javax.xml.parsers.ParserConfigurationException e) {
+      throw new ServletException(e);
+    } catch (javax.xml.transform.TransformerException e) {
+      throw new ServletException(e);
+    }
+  }
+
+  private static Element addNode(Document doc, Node parent, String name) {
+    Element child = doc.createElement(name);
+    parent.appendChild(child);
+    return child;
+  }
+
+  private static void addNode(Document doc, Node parent,
+                              String name, String text) {
+    Element child = doc.createElement(name);
+    child.appendChild(doc.createTextNode(getLegalXml(text)));
+    parent.appendChild(child);
+  }
+
+  private static void addNode(Document doc, Node parent,
+                              String ns, String name, String text) {
+    Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
+    child.appendChild(doc.createTextNode(getLegalXml(text)));
+    parent.appendChild(child);
+  }
+
+  private static void addAttribute(Document doc, Element node,
+                                   String name, String value) {
+    Attr attribute = doc.createAttribute(name);
+    attribute.setValue(getLegalXml(value));
+    node.getAttributes().setNamedItem(attribute);
+  }
+
+  /*
+   * Ensure string is legal xml.
+   * @param text String to verify.
+   * @return Passed <code>text</code> or a new string with illegal
+   * characters removed if any found in <code>text</code>.
+   * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
+   */
+  protected static String getLegalXml(final String text) {
+    if (text == null) {
+      return null;
+    }
+    StringBuffer buffer = null;
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      if (!isLegalXml(c)) {
+        if (buffer == null) {
+          // Start up a buffer.  Copy characters here from now on
+          // now we've found at least one bad character in original.
+          buffer = new StringBuffer(text.length());
+          buffer.append(text.substring(0, i));
+        }
+      } else {
+        if (buffer != null) {
+          buffer.append(c);
+        }
+      }
+    }
+    return (buffer != null)? buffer.toString(): text;
+  }
+
+  private static boolean isLegalXml(final char c) {
+    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
+      || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
+  }
+
+}
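For reference, the per-collection segment handling that this commit adds to FetchedSegments can be exercised roughly as below. This is a minimal sketch and not part of the commit: the crawl/segments path, the "194" collection name, and the segment names are illustrative (borrowed from the comment in the constructor), while the nutchwax.FetchedSegments.perCollection property, the FetchedSegments(FileSystem, String, Configuration) constructor, getSegmentNames(), close(), and the whitespace-separated "from to" format of the optional remap file are taken from the code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.searcher.FetchedSegments;

public class PerCollectionSegmentsExample {
  public static void main(String[] args) throws Exception {
    // Expected layout when per-collection segments are enabled
    // (from the comment in the FetchedSegments constructor):
    //
    //   crawl/segments/194/segment-foo
    //   crawl/segments/194/segment-bar
    //   crawl/segments/366/segment-frotz
    //
    // An optional "remap" file may sit inside each collection dir
    // (e.g. crawl/segments/194/remap -- hypothetical path), with one
    // whitespace-separated "<from-segment> <to-segment>" pair per line:
    //
    //   segment-old segment-foo
    //
    // Lookups for "segment-old" are then served by "segment-foo".

    Configuration conf = new Configuration();
    conf.setBoolean("nutchwax.FetchedSegments.perCollection", true);

    FileSystem fs = FileSystem.get(conf);
    FetchedSegments segments =
        new FetchedSegments(fs, "crawl/segments", conf);

    // Print the registered top-level names (collection dirs in
    // per-collection mode, plain segment dirs otherwise).
    for (String name : segments.getSegmentNames()) {
      System.out.println(name);
    }
    segments.close();
  }
}

On the query side, OpenSearchServlet above reads everything from request parameters, so a request such as ?query=archive&hitsPerPage=10&hitsPerDup=2&dedupField=site&start=0 would exercise the de-duplication handling; the servlet's mount path comes from web.xml and is not part of this commit.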