archive-access-cvs Mailing List for Web Archive Access Utilities (Page 33)

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3005
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3005&view=rev
Author:   binzino
Date:     2010-03-24 01:08:53 +0000 (Wed, 24 Mar 2010)

Log Message:
-----------
Various hacks for ARI-2260.

Modified Paths:
--------------
    tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java
    tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java

Added Paths:
-----------
    tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java
    tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
    tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java
    tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java

Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java
===================================================================

--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java	                        (rev 0)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java	2010-03-24 01:08:53 UTC (rev 3005)
@@ -0,0 +1,246 @@
+package org.apache.nutch.searcher;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.lucene.search.TopDocCollector;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+
+public class CollapsingHitCollector extends TopDocCollector
+{
+  public static final Comparator<Hit> SCORE_COMPARATOR = new Comparator<Hit>( )
+  {
+    public int compare( Hit h1, Hit h2 )
+    {
+      if ( h1.score <  h2.score ) return -1;
+      if ( h1.score >  h2.score ) return 1;
+
+      // must be equal
+      return 0;
+    }
+  };
+
+  public static final Comparator<Hit> SITE_COMPARATOR_PARTIAL = new Comparator<Hit>( )
+  {
+    public int compare( Hit h1, Hit h2 )
+    {
+      return String.CASE_INSENSITIVE_ORDER.compare( h1.site, h2.site );
+    }
+  };
+
+  public static final Comparator<Hit> SITE_COMPARATOR_TOTAL = new Comparator<Hit>( )
+  {
+    public int compare( Hit h1, Hit h2 )
+    {
+      return String.CASE_INSENSITIVE_ORDER.compare( h1.site, h2.site );
+    }
+  };
+
+  public static class Hit 
+  {
+    int    id;
+    float  score;
+    String site;
+    
+    public Hit( int id, float score, String site )
+    {
+      this.id    = id;
+      this.score = score;
+      this.site  = site;
+    }
+    
+    public int compareTo( Hit that )
+    {
+      if ( this.score < that.score ) return -1;
+      if ( this.score > that.score ) return  1;
+      
+      if ( this.id < that.id ) return -1;
+      if ( this.id > that.id ) return  1;
+      
+      return 0;
+    }
+  }
+
+  public org.apache.lucene.search.Searcher searcher;
+  public int      numHits;
+  public int      hitsPerSite;
+
+  final Hit[] sortedByScore;
+  final Hit[] sortedBySite;
+  final Hit   candidate;
+
+  int numUncollapsedHits = 0;
+
+  public CollapsingHitCollector( org.apache.lucene.search.Searcher searcher, int numHits, int hitsPerSite )
+  {
+    super( numHits );
+    this.searcher    = searcher;
+    this.numHits     = numHits;
+    this.hitsPerSite = hitsPerSite;
+
+    this.sortedByScore = new Hit[numHits];
+    this.sortedBySite  = new Hit[numHits];
+
+    for ( int i = 0; i < numHits; i++ )
+      {
+        Hit sd = new Hit( -1, Float.NEGATIVE_INFINITY, "" );
+        this.sortedByScore[i] = sd;
+        this.sortedBySite [i] = sd;
+      }
+
+    this.candidate = new Hit( -1, Float.NEGATIVE_INFINITY, "" );
+
+  }
+
+  public void collect( int doc, float score )
+  {
+    this.numUncollapsedHits++;
+        
+    this.candidate.id    = doc;
+    this.candidate.score = score;
+
+    if ( this.candidate.score <= this.sortedByScore[0].score )
+      {
+        return ;
+      }
+
+    try
+      {
+        String url = this.searcher.doc( this.candidate.id  ).get( "url");
+        try 
+          {
+            java.net.URL u = new java.net.URL( url );
+
+            this.candidate.site = u.getHost();
+          }
+        catch ( java.net.MalformedURLException e ) {  }
+      }
+    catch ( IOException ioe ) { throw new RuntimeException( ioe ); }
+
+    // Use "" rather than null to keep searching and sorting simple.
+    if ( this.candidate.site == null ) this.candidate.site = "";
+    
+    int sitePos = findReplacementPosition( candidate );
+
+    // No existing hit to be replaced, so we replace the overall
+    // lowest-scoring one, which is always in position 0 in the
+    // sortedByScore list.
+    if ( sitePos < 0 )
+      {
+        this.sortedByScore[0].id    = candidate.id;
+        this.sortedByScore[0].score = candidate.score;
+        this.sortedByScore[0].site  = candidate.site;
+        
+        // Since we just added a new site, re-sort them.
+        Arrays.sort( this.sortedByScore, SCORE_COMPARATOR );
+
+        // No need to re-sort the sites if not collapsing.
+        if ( this.hitsPerSite != 0 )
+          {
+            Arrays.sort( this.sortedBySite, SITE_COMPARATOR_TOTAL );
+          }
+
+        // Done!
+        return ;
+      }
+
+    // We have an existing Hit from the same site which can be
+    // replaced *if* the candidate's score is better.
+    if ( candidate.score > this.sortedBySite[sitePos].score )
+      {
+        this.sortedBySite[sitePos].id    = this.candidate.id;
+        this.sortedBySite[sitePos].score = this.candidate.score;
+        
+        // We have to re-sort by scores.
+        Arrays.sort( this.sortedByScore, SCORE_COMPARATOR );
+
+        // If our hitsPerSite > 1, then we have to re-sort by site to
+        // ensure that the hit we just inserted is put into the proper
+        // sorted position within the site group.  If hitsPerSite==1,
+        // then the group size == 1 and therefore no need to re-sort.
+        if ( this.hitsPerSite > 1 )
+          {
+            Arrays.sort( this.sortedBySite, SITE_COMPARATOR_TOTAL );
+          }
+      }
+  }
+
+  private int findReplacementPosition( Hit candidate )
+  {
+    if ( this.hitsPerSite == 0 ) return -1;
+
+    int pos = Arrays.binarySearch( this.sortedBySite, candidate, SITE_COMPARATOR_PARTIAL );
+    
+    if ( pos < 0 || this.hitsPerSite == 1 ) return pos;
+    
+    int i = pos, j = pos;
+
+    final int mini = 0, maxj = this.sortedBySite.length - 1;
+
+    for ( ; i > mini && SITE_COMPARATOR_PARTIAL.compare( this.sortedBySite[i], this.sortedBySite[i-1] ) == 0; i-- )
+      ;
+
+    for ( ; j < maxj && SITE_COMPARATOR_PARTIAL.compare( this.sortedBySite[i], this.sortedBySite[j+1] ) == 0; j++ )
+      ;
+
+    // The number of hits from this site is (j-i+1), so if we are less
+    // than the max number of hits per site, then we return -1 to
+    // indicate there is still room for more Hits from the candidate
+    // site.
+    if ( (j - i + 1) < this.hitsPerSite ) return -1;
+
+    // Otherwise, the Hit to be potentially replaced is the lowest
+    // scoring hit, which is the one at position i.
+    return i;
+  }
+
+  public Hit[] getHits()
+  {
+    Hit[] hits = new Hit[this.getNumHits( )];
+
+    final int sortedByScoreEndPos = this.sortedByScore.length - 1;
+    for ( int i = 0; i < hits.length ; i++ )
+      {
+        hits[i] = this.sortedByScore[ sortedByScoreEndPos - i ];
+      }
+
+    return hits;
+  }
+
+  public int getNumHits( )
+  {
+    for ( int i = this.sortedByScore.length - this.numHits ; i < this.sortedByScore.length ; i++ )
+      {
+        if ( this.sortedByScore[i].score != Float.NEGATIVE_INFINITY )
+          {
+            return this.sortedByScore.length - i;
+          }
+      }
+    return 0;
+  }
+
+  public int getTotalHits()
+  {
+    if ( this.hitsPerSite == 0 ) return this.numUncollapsedHits;
+
+   int numCollapsedHits = getNumHits( );
+
+    if ( numCollapsedHits < this.numHits )
+      {
+        return numCollapsedHits;
+      }
+    return this.numUncollapsedHits;
+  }
+
+  public TopDocs topDocs() 
+  {
+    Hit[] hits = this.getHits( );
+    ScoreDoc[] sd = new ScoreDoc[hits.length];
+    for ( int i = 0 ; i < hits.length ; i++ )
+      {
+        sd[i] = new ScoreDoc( hits[i].id, hits[i].score );
+      }
+    return new TopDocs( getTotalHits(), sd, hits[hits.length-1].score );
+  }
+}
\ No newline at end of file

Modified: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java	2010-03-24 01:01:04 UTC (rev 3004)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java	2010-03-24 01:08:53 UTC (rev 3005)
@@ -254,7 +254,9 @@
     }
 
     public Hits search(final Query query, final int numHits,
-                       final String dedupField, final String sortField,
+                       final int maxHitsPerDup,
+                       final String dedupField, 
+                       final String sortField,
                        final boolean reverse) throws IOException {
       // Get the list of live servers.  It would be nice to build this
       // list in updateSegments(), but that would create concurrency issues.
@@ -282,8 +284,9 @@
         params[i][0] = query;
         params[i][1] = new Integer(numHits);
         params[i][2] = dedupField;
-        params[i][3] = sortField;
-        params[i][4] = Boolean.valueOf(reverse);
+        params[i][3] = maxHitsPerDup;
+        params[i][4] = sortField;
+        params[i][5] = Boolean.valueOf(reverse);
       }
       Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses, this.conf);
 
@@ -439,7 +442,7 @@
       Client client = new Client(addresses, NutchConfiguration.create());
       //client.setTimeout(Integer.MAX_VALUE);
 
-      Hits hits = client.search(query, 10, null, null, false);
+      Hits hits = client.search(query, 10, 0, null, null, false);
       System.out.println("Total hits: " + hits.getTotal());
       for (int i = 0; i < hits.getLength(); i++) {
         System.out.println(" "+i+" "+ client.getDetails(hits.getHit(i)));

Modified: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java	2010-03-24 01:01:04 UTC (rev 3004)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java	2010-03-24 01:08:53 UTC (rev 3005)
@@ -90,7 +90,9 @@
   }
 
   public Hits search(Query query, int numHits,
-                     String dedupField, String sortField, boolean reverse)
+                     int maxHitsPerDup,
+                     String dedupField, 
+                     String sortField, boolean reverse)
 
     throws IOException {
     org.apache.lucene.search.BooleanQuery luceneQuery =
@@ -100,7 +102,7 @@
     System.out.println( "Lucene query: " + luceneQuery );
 
     return translateHits
-      (optimizer.optimize(luceneQuery, luceneSearcher, numHits,
+      (optimizer.optimize(luceneQuery, luceneSearcher, numHits, maxHitsPerDup,
                           sortField, reverse),
        dedupField, sortField);
   }

Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java	                        (rev 0)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java	2010-03-24 01:08:53 UTC (rev 3005)
@@ -0,0 +1,278 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.QueryFilter;
+import org.apache.lucene.search.*;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.misc.ChainedFilter;
+
+import org.apache.hadoop.conf.Configuration;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.ArrayList;
+
+import java.io.IOException;
+
+/** Utility which converts certain query clauses into {@link QueryFilter}s and
+ * caches these.  Only required clauses whose boost is zero are converted to
+ * cached filters.  Range queries are converted to range filters.  This
+ * accelerates query constraints like date, language, document format, etc.,
+ * which do not affect ranking but might otherwise slow search considerably. */
+class LuceneQueryOptimizer {
+
+  // This thread provides a pseudo-clock service to all searching
+  // threads, so that they can count elapsed time with less overhead than
+  // repeatedly calling System.currentTimeMillis.
+  private TimerThread timerThread = null;
+
+  private static class TimerThread extends Thread {
+    private final int tick;
+    // NOTE: we can avoid explicit synchronization here for several reasons:
+    // * updates to 32-bit-sized variables are atomic
+    // * only single thread modifies this value
+    // * use of volatile keyword ensures that it does not reside in
+    //   a register, but in main memory (so that changes are visible to
+    //   other threads).
+    // * visibility of changes does not need to be instantaneous, we can
+    //   afford losing a tick or two.
+    //
+    // See section 17 of the Java Language Specification for details.
+    public volatile int timeCounter = 0;
+
+    // volatile so that a write from another thread is seen by the
+    // loop in run().
+    volatile boolean running = true;
+
+    public TimerThread(int tick) {
+      super("LQO timer thread");
+      this.tick = tick;
+      this.setDaemon(true);
+    }
+
+    public void run() {
+      while(running) {
+        timeCounter++;
+        try {
+          Thread.sleep(tick);
+        } catch (InterruptedException ignored) {
+          // Deliberately ignored: an early wake-up only shortens one tick.
+        }
+      }
+    }
+  }
+
+  /** Starts the timer thread with tick length <code>p</code> (ms) unless one
+   * is already alive. */
+  private void initTimerThread(int p) {
+    if (timerThread == null || !timerThread.isAlive()) {
+      timerThread = new TimerThread(p);
+      timerThread.start();
+    }
+  }
+  
+
+  /** Thrown by {@link LimitedCollector} when the tick budget is used up;
+   * <code>maxDoc</code> is the document being collected at that moment. */
+  private static class TimeExceeded extends RuntimeException {
+    public long maxTime;
+    private int maxDoc;
+    public TimeExceeded(long maxTime, int maxDoc) {
+      super("Exceeded search time: " + maxTime + " ms.");
+      this.maxTime = maxTime;
+      this.maxDoc = maxDoc;
+    }
+  }
+
+  /** A collector that aborts the search, by throwing, once a maximum number
+   * of hits has been collected or a maximum number of timer ticks has
+   * elapsed. */
+  private static class LimitedCollector extends TopDocCollector {
+    private int maxHits;
+    private int maxTicks;
+    private int startTicks;
+    private TimerThread timer;
+    private int curTicks;
+
+    public LimitedCollector(int numHits, int maxHits, int maxTicks,
+            TimerThread timer) {
+      super(numHits);
+      this.maxHits = maxHits;
+      this.maxTicks = maxTicks;
+      if (timer != null) {
+        this.timer = timer;
+        this.startTicks = timer.timeCounter;
+      }
+    }
+
+    public void collect(int doc, float score) {
+      if (maxHits > 0 && getTotalHits() >= maxHits) {
+        throw new LimitExceeded(doc);
+      }
+      if (timer != null) {
+        curTicks = timer.timeCounter;
+        // overflow check
+        if (curTicks < startTicks) curTicks += Integer.MAX_VALUE;
+        if (curTicks - startTicks > maxTicks) {
+          throw new TimeExceeded(timer.tick * (curTicks - startTicks), doc);
+        }
+      }
+      super.collect(doc, score);
+    }
+  }
+  
+  /** Thrown by {@link LimitedCollector} when the hit limit is reached;
+   * <code>maxDoc</code> is the document being collected at that moment. */
+  private static class LimitExceeded extends RuntimeException {
+    private int maxDoc;
+    public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; }    
+  }
+  
+  private LinkedHashMap<BooleanQuery, Filter> cache;                   // an LRU cache of QueryFilter
+  
+  private float threshold;
+
+  private int searcherMaxHits;
+
+  private int tickLength;
+
+  private int maxTickCount;
+  
+  /**
+   * Construct an optimizer that caches and uses filters for required clauses
+   * whose boost is zero.  All settings are read from <code>conf</code>:
+   * <ul>
+   * <li><code>searcher.filter.cache.size</code> (default 16): the number of
+   *     filters to cache</li>
+   * <li><code>searcher.filter.cache.threshold</code> (default 0.05): the
+   *     fraction of documents which must contain a term for its clause to be
+   *     turned into a filter</li>
+   * <li><code>searcher.max.hits</code> (default -1): hit limit handed to the
+   *     limited collector</li>
+   * <li><code>searcher.max.time.tick_length</code> (default 200) and
+   *     <code>searcher.max.time.tick_count</code> (default -1): tick length
+   *     in ms and tick budget; the timer thread is only started when the
+   *     tick count is positive</li>
+   * </ul>
+   * 
+   * @param conf
+   *          the configuration to read the settings from
+   */
+  public LuceneQueryOptimizer(Configuration conf) {
+    final int cacheSize = conf.getInt("searcher.filter.cache.size", 16);
+    this.threshold = conf.getFloat("searcher.filter.cache.threshold",
+        0.05f);
+    this.searcherMaxHits = conf.getInt("searcher.max.hits", -1);
+    this.cache = new LinkedHashMap<BooleanQuery, Filter>(cacheSize, 0.75f, true) {
+      protected boolean removeEldestEntry(Map.Entry<BooleanQuery, Filter> eldest) {
+        return size() > cacheSize; // limit size of cache
+      }
+    };
+    this.tickLength = conf.getInt("searcher.max.time.tick_length", 200);
+    this.maxTickCount = conf.getInt("searcher.max.time.tick_count", -1);
+    if (this.maxTickCount > 0) {
+      initTimerThread(this.tickLength);
+    }
+  }
+
+  /**
+   * Runs <code>original</code> against <code>searcher</code>, moving required
+   * zero-boost clauses into (cached) filters first.
+   *
+   * When <code>sortField</code> is null and <code>reverse</code> is false the
+   * hits are gathered with a collector: a {@link CollapsingHitCollector}
+   * using <code>maxHitsPerDup</code> if neither a hit limit nor a timer is
+   * configured, otherwise a limited collector, in which case
+   * <code>maxHitsPerDup</code> is not applied and the total is an estimate
+   * whenever a limit was hit.  In every other case a sorted search is run.
+   *
+   * @param original the query to optimize and execute
+   * @param searcher the searcher to run against
+   * @param numHits the number of top hits to return
+   * @param maxHitsPerDup passed to the collapsing collector as its
+   *          per-site limit
+   * @param sortField the field to sort on, or null
+   * @param reverse whether to reverse the sort
+   * @return the top documents
+   * @throws IOException if the underlying search fails
+   */
+  public TopDocs optimize(BooleanQuery original,
+                          Searcher searcher, int numHits, int maxHitsPerDup,
+                          String sortField, boolean reverse)
+    throws IOException {
+
+    BooleanQuery query = new BooleanQuery();
+    BooleanQuery cacheQuery = new BooleanQuery();
+    BooleanQuery filterQuery = new BooleanQuery();
+    ArrayList<Filter> filters = new ArrayList<Filter>();
+
+    BooleanClause[] clauses = original.getClauses();
+    for (int i = 0; i < clauses.length; i++) {
+      BooleanClause c = clauses[i];
+      if (c.isRequired()                          // required
+          && c.getQuery().getBoost() == 0.0f) {   // boost is zero
+
+        if (c.getQuery() instanceof TermQuery     // TermQuery
+            && (searcher.docFreq(((TermQuery)c.getQuery()).getTerm())
+                / (float)searcher.maxDoc()) < threshold) { // beneath threshold
+          query.add(c);                           // don't filterize
+          continue;
+        }
+          
+        if (c.getQuery() instanceof RangeQuery) { // RangeQuery
+          RangeQuery range = (RangeQuery)c.getQuery();
+          boolean inclusive = range.isInclusive();// convert to RangeFilter
+          Term lower = range.getLowerTerm();
+          Term upper = range.getUpperTerm();
+          filters.add(new RangeFilter(lower!=null?lower.field():upper.field(),
+                                      lower != null ? lower.text() : null,
+                                      upper != null ? upper.text() : null,
+                                      inclusive, inclusive));
+          cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it
+          continue;
+        }
+
+        // all other query types
+        filterQuery.add(c.getQuery(), BooleanClause.Occur.MUST);  // filter it
+        cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST);   // cache it
+        continue;
+      }
+
+      query.add(c);                               // query it
+    }
+
+    Filter filter = null;
+    if (cacheQuery.getClauses().length != 0) {
+      synchronized (cache) {                      // check cache
+        filter = cache.get(cacheQuery);
+      }
+      if (filter == null) {                       // miss
+
+        if (filterQuery.getClauses().length != 0) // add filterQuery to filters
+          filters.add(new CachingWrapperFilter(new QueryWrapperFilter(filterQuery)));
+
+        if (filters.size() == 1) {                // convert filters to filter
+          filter = filters.get(0);
+        } else {
+          filter = new ChainedFilter(filters.toArray
+                                     (new Filter[filters.size()]),
+                                     ChainedFilter.AND);
+        }
+        if (!(filter instanceof CachingWrapperFilter))     // make sure bits are cached
+          filter = new CachingWrapperFilter(filter);
+        
+        synchronized (cache) {
+          cache.put(cacheQuery, filter);          // cache the filter
+        }
+      }        
+    }
+    if (sortField == null && !reverse) {
+
+      // no hit limit
+      if (this.searcherMaxHits <= 0 && timerThread == null)  {
+        // Collapse by site, keeping at most maxHitsPerDup hits per site.
+        TopDocCollector c = new CollapsingHitCollector( searcher, numHits, maxHitsPerDup );
+        searcher.search(query, filter, c );
+        return c.topDocs( );
+      }
+
+      // hits limited in time or in count -- use a LimitedCollector
+      LimitedCollector collector = new LimitedCollector(numHits, searcherMaxHits,
+              maxTickCount, timerThread);
+      LimitExceeded exceeded = null;
+      TimeExceeded timeExceeded = null;
+      try {
+        searcher.search(query, filter, collector);
+      } catch (LimitExceeded le) {
+        exceeded = le;
+      } catch (TimeExceeded te) {
+        timeExceeded = te;
+      }
+      TopDocs results = collector.topDocs();
+      if (exceeded != null) {                     // limit was exceeded
+        results.totalHits = (int)                 // must estimate totalHits
+          (results.totalHits*(searcher.maxDoc()/(float)exceeded.maxDoc));
+      } else if (timeExceeded != null) {
+        // Estimate total hits.
+        results.totalHits = (int)(results.totalHits * (searcher.maxDoc()/(float)timeExceeded.maxDoc));
+      }
+      return results;
+
+    } else {
+      return searcher.search(query, filter, numHits,
+                             new Sort(sortField, reverse));
+    }
+  }
+}

Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java	                        (rev 0)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java	2010-03-24 01:08:53 UTC (rev 3005)
@@ -0,0 +1,434 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import java.io.*;
+import java.util.*;
+import javax.servlet.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.io.Closeable;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.indexer.*;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+
+/** 
+ * One stop shopping for search-related functionality.
+ * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
+ */   
+public class NutchBean
+  implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
+             DistributedSearch.Protocol, Closeable {
+
+  public static final Log LOG = LogFactory.getLog(NutchBean.class);
+  public static final String KEY = "nutchBean";
+
+//  static {
+//    LogFormatter.setShowThreadIDs(true);
+//  }
+
+  private String[] segmentNames;
+
+  private Searcher searcher;
+  private HitDetailer detailer;
+  private HitSummarizer summarizer;
+  private HitContent content;
+  private HitInlinks linkDb;
+
+
+  /** BooleanQuery won't permit more than 32 required/prohibited clauses.  We
+   * don't want to use too many of those. */ 
+  private static final int MAX_PROHIBITED_TERMS = 20;
+  
+  private Configuration conf;
+
+  private FileSystem fs;
+
+  /** Looks up the bean stored in the servlet context under {@link #KEY} and
+   * returns it cast to a NutchBean.  The <code>conf</code> argument is not
+   * consulted by this method.
+   * @see NutchBeanConstructor*/
+  public static NutchBean get(ServletContext app, Configuration conf) throws IOException {
+    return (NutchBean)app.getAttribute(KEY);
+  }
+
+
+  /**
+   * Builds a bean without naming a directory; delegates to the
+   * two-argument constructor with a null path.
+   * @param conf configuration handed on to the two-argument constructor
+   * @throws IOException propagated from the two-argument constructor
+   */
+  public NutchBean(Configuration conf) throws IOException {
+    this(conf, (Path)null);
+  }
+  
+  /**
+   *  Construct in a named directory. 
+   * @param conf
+   * @param dir
+   * @throws IOException
+   */
+  public NutchBean(Configuration conf, Path dir) throws IOException {
+        this.conf = conf;
+        this.fs = FileSystem.get(this.conf);
+        if (dir == null) {
+            dir = new Path(this.conf.get("searcher.dir", "crawl"));
+        }
+        Path servers = new Path(dir, "search-servers.txt");
+        if (fs.exists(servers)) {
+            if (LOG.isInfoEnabled()) {
+              LOG.info("searching servers in " + servers);
+            }
+            init(new DistributedSearch.Client(servers, conf));
+        } else {
+            init(new Path(dir, "index"), new Path(dir, "indexes"), new Path(
+                    dir, "segments"), new Path(dir, "linkdb"));
+        }
+    }
+
+  /**
+   * Wires the bean to local indexes, segments and link database.  Uses the
+   * merged index at <code>indexDir</code> if it exists; otherwise opens every
+   * directory under <code>indexesDir</code> that contains the indexer's
+   * "done" marker file.
+   * @param indexDir location of a merged index
+   * @param indexesDir parent of the per-part index directories
+   * @param segmentsDir location of the fetched segments
+   * @param linkDb location of the link database
+   * @throws IOException if any of the stores cannot be opened
+   */
+  private void init(Path indexDir, Path indexesDir, Path segmentsDir,
+                    Path linkDb)
+    throws IOException {
+    IndexSearcher indexSearcher;
+    if (this.fs.exists(indexDir)) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("opening merged index in " + indexDir);
+      }
+      indexSearcher = new IndexSearcher(indexDir, this.conf);
+    } else {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("opening indexes in " + indexesDir);
+      }
+      
+      FileStatus[] fstats = fs.listStatus(indexesDir,
+          HadoopFSUtil.getPassDirectoriesFilter(fs));
+
+      // Keep only the directories whose indexing finished, in listing order.
+      List<Path> finished = new ArrayList<Path>();
+      for (Path candidate : HadoopFSUtil.getPaths(fstats)) {
+        Path indexdone = new Path(candidate, Indexer.DONE_NAME);
+        if (fs.isFile(indexdone)) {
+          finished.add(candidate);
+        }
+      }
+      
+      Path[] directories = finished.toArray(new Path[finished.size()]);
+      
+      indexSearcher = new IndexSearcher(directories, this.conf);
+    }
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("opening segments in " + segmentsDir);
+    }
+    FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.conf);
+    
+    this.segmentNames = segments.getSegmentNames();
+
+    this.searcher = indexSearcher;
+    this.detailer = indexSearcher;
+    this.summarizer = segments;
+    this.content = segments;
+
+    if (LOG.isInfoEnabled()) { LOG.info("opening linkdb in " + linkDb); }
+    this.linkDb = new LinkDbInlinks(fs, linkDb, this.conf);
+  }
+
+  /** Wires the bean to a distributed search client, which serves as
+   * searcher, detailer, summarizer, content source and link database. */
+  private void init(DistributedSearch.Client client) {
+    // Every role is played by the one remote client.
+    this.searcher = client;
+    this.detailer = client;
+    this.summarizer = client;
+    this.content = client;
+    this.linkDb = client;
+
+    this.segmentNames = client.getSegmentNames();
+  }
+
+
+  /** Returns the segment names recorded when the bean was initialised
+   * (the internal array itself, not a copy). */
+  public String[] getSegmentNames() {
+    return this.segmentNames;
+  }
+
+  /** Searches with no dedup field, no sort field and no reversal. */
+  public Hits search(Query query, int numHits) throws IOException {
+    final String noDedupField = null;
+    final String noSortField = null;
+    return search(query, numHits, noDedupField, noSortField, false);
+  }
+  
+  /** Searches through the underlying searcher, passing 0 as the
+   * maximum number of hits per duplicate value. */
+  public Hits search(Query query, int numHits,
+                     String dedupField, String sortField, boolean reverse)
+    throws IOException {
+
+    final int noDupLimit = 0;
+    return searcher.search(query, numHits, noDupLimit,
+                           dedupField, sortField, reverse);
+  }
+  
+  private class DupHits extends ArrayList {  // list of hits kept per dedup value (stored in the dupToHits map by the six-argument search)
+    private boolean maxSizeExceeded;  // NOTE(review): presumably set once a value has more hits than maxHitsPerDup -- confirm against the search loop
+  }
+
+  /** Search for pages matching a query, eliminating excessive hits from the
+   * same site.  This is the six-argument search with the dedup field fixed
+   * to "site", no sort field and no reversal.  <p> If maxHitsPerDup is zero
+   * then all hits are returned.
+   * 
+   * @param query query
+   * @param numHits number of requested hits
+   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+   * @return Hits the matching hits
+   * @throws IOException
+   */
+  public Hits search(Query query, int numHits, int maxHitsPerDup)
+       throws IOException {
+    final String siteField = "site";
+    final String noSortField = null;
+    return search(query, numHits, maxHitsPerDup, siteField, noSortField, false);
+  }
+
+  /** Search for pages matching a query, eliminating excessive hits with
+   * matching values for a named field.  This is the six-argument search
+   * with no sort field and no reversal.  <p> If maxHitsPerDup is zero
+   * then all hits are returned.
+   * 
+   * @param query query
+   * @param numHits number of requested hits
+   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+   * @param dedupField field name to check for duplicates
+   * @return Hits the matching hits
+   * @throws IOException
+   */
+  public Hits search(Query query, int numHits,
+                     int maxHitsPerDup, String dedupField)
+       throws IOException {
+    final String noSortField = null;
+    return search(query, numHits, maxHitsPerDup, dedupField, noSortField, false);
+  }
+  /** Search for pages matching a query, eliminating excessive hits with
+   * matching values for a named field.  Hits after the first
+   * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
+   * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
+   * then all hits are returned.
+   * 
+   * @param query query
+   * @param numHits number of requested hits
+   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+   * @param dedupField field name to check for duplicates
+   * @param sortField Field to sort on (or null if no sorting).
+   * @param reverse True if we are to reverse sort by <code>sortField</code>.
+   * @return Hits the matching hits
+   * @throws IOException
+   */
+  public Hits search(Query query, int numHits,
+                     int maxHitsPerDup, 
+                     String dedupField,
+                     String sortField, boolean reverse)
+       throws IOException {
+    if (maxHitsPerDup <= 0)                      // disable dup checking
+      return search(query, numHits, dedupField, sortField, reverse);
+
+    // Ask for more raw hits than requested so that enough are left once
+    // excess duplicates have been dropped.
+    float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
+    int numHitsRaw = (int)(numHits * rawHitsFactor);
+    if (LOG.isInfoEnabled()) {
+      LOG.info("searching for "+numHitsRaw+" raw hits");
+    }
+    Hits hits = searcher.search(query, numHitsRaw, maxHitsPerDup, dedupField, sortField, reverse);
+    // The reported total always comes from this first, unrestricted search.
+    long total = hits.getTotal();
+    Map<String, DupHits> dupToHits = new HashMap<String, DupHits>();
+    List<Hit> resultList = new ArrayList<Hit>();
+    Set<Hit> seen = new HashSet<Hit>();
+    List<String> excludedValues = new ArrayList<String>();
+    boolean totalIsExact = true;
+    // Size of 'seen' at the moment the most recent search was issued.  The
+    // excluded values only change when a new hit is seen, so an unchanged
+    // size means the next optimized query would be identical.
+    int seenAtLastSearch = 0;
+    for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
+      // get the next raw hit
+      if (rawHitNum >= hits.getLength()) {
+        int nextNumHitsRaw = (int)(numHitsRaw * rawHitsFactor);
+        // Guard against an endless loop: if nothing new was seen since the
+        // last search and the request size cannot change (e.g. numHits is
+        // zero, the factor is 1.0, or the int cast has saturated), the next
+        // search would be identical to the previous one and make no progress.
+        if (seen.size() == seenAtLastSearch && nextNumHitsRaw == numHitsRaw) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("re-search would make no progress, stopping at "
+                     +resultList.size()+" hits");
+          }
+          break;
+        }
+
+        // optimize query by prohibiting more matches on some excluded values
+        Query optQuery = (Query)query.clone();
+        for (int i = 0; i < excludedValues.size(); i++) {
+          if (i == MAX_PROHIBITED_TERMS)
+            break;
+          optQuery.addProhibitedTerm(excludedValues.get(i), dedupField);
+        }
+        numHitsRaw = nextNumHitsRaw;
+        seenAtLastSearch = seen.size();
+        if (LOG.isInfoEnabled()) {
+          LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
+        }
+        hits = searcher.search(optQuery, numHitsRaw, maxHitsPerDup, dedupField, sortField, reverse);
+        if (LOG.isInfoEnabled()) {
+          LOG.info("found "+hits.getTotal()+" raw hits");
+        }
+        // restart the scan over the new result set; hits already processed
+        // are skipped via 'seen'
+        rawHitNum = -1;
+        continue;
+      }
+
+      Hit hit = hits.getHit(rawHitNum);
+      if (seen.contains(hit))
+        continue;
+      seen.add(hit);
+      
+      // get dup hits for its value
+      String value = hit.getDedupValue();
+      DupHits dupHits = dupToHits.get(value);
+      if (dupHits == null)
+        dupToHits.put(value, dupHits = new DupHits());
+
+      // does this hit exceed maxHitsPerDup?
+      if (dupHits.size() == maxHitsPerDup) {      // yes -- ignore the hit
+        if (!dupHits.maxSizeExceeded) {
+
+          // mark prior hits with moreFromDupExcluded
+          for (int i = 0; i < dupHits.size(); i++) {
+            ((Hit)dupHits.get(i)).setMoreFromDupExcluded(true);
+          }
+          dupHits.maxSizeExceeded = true;
+
+          excludedValues.add(value);              // exclude dup
+        }
+        totalIsExact = false;
+      } else {                                    // no -- collect the hit
+        resultList.add(hit);
+        dupHits.add(hit);
+
+        // are we done?
+        // we need to find one more than asked for, so that we can tell if
+        // there are more hits to be shown
+        if (resultList.size() > numHits)
+          break;
+      }
+    }
+
+    Hits results =
+      new Hits(total, resultList.toArray(new Hit[resultList.size()]));
+    results.setTotalIsExact(totalIsExact);
+    return results;
+  }
+    
+
+  /** Asks the underlying searcher to explain <code>hit</code> for
+   * <code>query</code> and returns its answer unchanged. */
+  public String getExplanation(Query query, Hit hit) throws IOException {
+    final String explanation = searcher.getExplanation(query, hit);
+    return explanation;
+  }
+
+  /** Fetches the details of a single hit from the detailer. */
+  public HitDetails getDetails(Hit hit) throws IOException {
+    final HitDetails details = detailer.getDetails(hit);
+    return details;
+  }
+
+  /** Fetches the details of several hits from the detailer in one call. */
+  public HitDetails[] getDetails(Hit[] hits) throws IOException {
+    final HitDetails[] allDetails = detailer.getDetails(hits);
+    return allDetails;
+  }
+
+  /** Obtains the summary of one hit for <code>query</code> from the
+   * summarizer. */
+  public Summary getSummary(HitDetails hit, Query query) throws IOException {
+    final Summary summary = summarizer.getSummary(hit, query);
+    return summary;
+  }
+
+  /** Obtains the summaries of several hits for <code>query</code> from the
+   * summarizer in one call. */
+  public Summary[] getSummary(HitDetails[] hits, Query query)
+    throws IOException {
+    final Summary[] summaries = summarizer.getSummary(hits, query);
+    return summaries;
+  }
+
+  /** Returns whatever bytes the content source holds for <code>hit</code>. */
+  public byte[] getContent(HitDetails hit) throws IOException {
+    final byte[] bytes = content.getContent(hit);
+    return bytes;
+  }
+
+  /** Returns the parse data the content source holds for <code>hit</code>. */
+  public ParseData getParseData(HitDetails hit) throws IOException {
+    final ParseData parseData = content.getParseData(hit);
+    return parseData;
+  }
+
+  /** Returns the parse text the content source holds for <code>hit</code>. */
+  public ParseText getParseText(HitDetails hit) throws IOException {
+    final ParseText parseText = content.getParseText(hit);
+    return parseText;
+  }
+
+  /** Returns the anchors the link database reports for <code>hit</code>. */
+  public String[] getAnchors(HitDetails hit) throws IOException {
+    final String[] anchors = linkDb.getAnchors(hit);
+    return anchors;
+  }
+
+  /** Returns the inlinks the link database reports for <code>hit</code>. */
+  public Inlinks getInlinks(HitDetails hit) throws IOException {
+    final Inlinks inlinks = linkDb.getInlinks(hit);
+    return inlinks;
+  }
+
+  /** Returns the fetch date the content source reports for <code>hit</code>. */
+  public long getFetchDate(HitDetails hit) throws IOException {
+    final long fetchDate = content.getFetchDate(hit);
+    return fetchDate;
+  }
+
+  /** Closes the content source, the searcher, the link database and the
+   * file system, in that order, skipping any that are null.  Every resource
+   * is attempted even if closing an earlier one fails; when several fail,
+   * the exception of the last failing close is the one that propagates.
+   *
+   * @throws IOException if closing any of the resources fails
+   */
+  public void close() throws IOException {
+    // Nested try/finally so that one failing close() cannot leak the
+    // resources that come after it.
+    try {
+      if (content != null) { content.close(); }
+    } finally {
+      try {
+        if (searcher != null) { searcher.close(); }
+      } finally {
+        try {
+          if (linkDb != null) { linkDb.close(); }
+        } finally {
+          if (fs != null) { fs.close(); }
+        }
+      }
+    }
+  }
+  
+  /** For debugging. */
+  public static void main(String[] args) throws Exception {
+    String usage = "NutchBean query";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    Configuration conf = NutchConfiguration.create();
+    NutchBean bean = new NutchBean(conf);
+    Query query = Query.parse(args[0], conf);
+    Hits hits = bean.search(query, 10);
+    System.out.println("Total hits: " + hits.getTotal());
+    // Show at most 10 hits, limited by the number of hits actually returned
+    // rather than by the reported total, which can be larger.
+    int length = (int)Math.min(hits.getLength(), 10);
+    Hit[] show = hits.getHits(0, length);
+    HitDetails[] details = bean.getDetails(show);
+    Summary[] summaries = bean.getSummary(details, query);
+
+    // Iterate over what was fetched for display; the search may return more
+    // hits than were requested for display (it looks for one extra hit).
+    int shown = Math.min(details.length, summaries.length);
+    for (int i = 0; i < shown; i++) {
+      System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]);
+    }
+  }
+
+  /** Reports protocol version 1 for the distributed-search protocol class
+   * and rejects every other class name.
+   *
+   * @param className name of the protocol class being asked about
+   * @param arg1 not used by this implementation
+   * @return 1 when <code>className</code> names the distributed-search protocol
+   * @throws IOException if <code>className</code> is any other value
+   */
+  public long getProtocolVersion(String className, long arg1) throws IOException {
+    final String knownProtocol = DistributedSearch.Protocol.class.getName();
+    if (!knownProtocol.equals(className)) {
+      throw new IOException("Unknown Protocol classname:" + className);
+    }
+    return 1;
+  }
+
+  /** Responsible for constructing a NutchBean singleton instance and 
+   *  caching it in the servlet context. This class should be registered in 
+   *  the deployment descriptor as a listener 
+   */
+  public static class NutchBeanConstructor implements ServletContextListener {
+    
+    /** Closes the bean cached in the servlet context under KEY, if there is
+     *  one, so its resources are released when the web application stops.
+     *  A failure to close is logged, not propagated.
+     */
+    public void contextDestroyed(ServletContextEvent sce) {
+      ServletContext app = sce.getServletContext();
+      Object cached = app.getAttribute(KEY);
+      if (cached instanceof NutchBean) {
+        LOG.info("closing bean");
+        try {
+          ((NutchBean)cached).close();
+        }
+        catch (IOException ex) {
+          LOG.error(StringUtils.stringifyException(ex));
+        }
+      }
+    }
+
+    /** Creates a NutchBean from the context's configuration and stores it in
+     *  the servlet context under KEY.  If construction fails the error is
+     *  logged and no attribute is set.
+     */
+    public void contextInitialized(ServletContextEvent sce) {
+      ServletContext app = sce.getServletContext();
+      Configuration conf = NutchConfiguration.get(app);
+      
+      LOG.info("creating new bean");
+      try {
+        NutchBean bean = new NutchBean(conf);
+        app.setAttribute(KEY, bean);
+      }
+      catch (IOException ex) {
+        LOG.error(StringUtils.stringifyException(ex));
+      }
+    }
+  }
+
+}

Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java	                        (rev 0)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java	2010-03-24 01:08:53 UTC (rev 3005)
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Closeable;
+
+/** Service that searches. */
+public interface Searcher extends Closeable {
+  /** Returns the top-scoring hits for <code>query</code>, at most
+   *  <code>numHits</code> of them.  The remaining arguments presumably limit
+   *  duplicates and control sorting; confirm against the implementations. */
+  Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField,
+              String sortField, boolean reverse) throws IOException;
+
+  /** Returns an HTML-formatted explanation of how <code>hit</code> scored
+   *  for <code>query</code>. */
+  String getExplanation(Query query, Hit hit) throws IOException;
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




2005	Jan	Feb	Mar	Apr	May	Jun	Jul (1)	Aug (10)	Sep (36)	Oct (339)	Nov (103)	Dec (152)
2006	Jan (141)	Feb (102)	Mar (125)	Apr (203)	May (57)	Jun (30)	Jul (139)	Aug (46)	Sep (64)	Oct (105)	Nov (34)	Dec (162)
2007	Jan (81)	Feb (57)	Mar (141)	Apr (72)	May (9)	Jun (1)	Jul (144)	Aug (88)	Sep (40)	Oct (43)	Nov (34)	Dec (20)
2008	Jan (44)	Feb (45)	Mar (16)	Apr (36)	May (8)	Jun (77)	Jul (177)	Aug (66)	Sep (8)	Oct (33)	Nov (13)	Dec (37)
2009	Jan (2)	Feb (5)	Mar (8)	Apr	May (36)	Jun (19)	Jul (46)	Aug (8)	Sep (1)	Oct (66)	Nov (61)	Dec (10)
2010	Jan (13)	Feb (16)	Mar (38)	Apr (76)	May (47)	Jun (32)	Jul (35)	Aug (45)	Sep (20)	Oct (61)	Nov (24)	Dec (16)
2011	Jan (22)	Feb (34)	Mar (11)	Apr (8)	May (24)	Jun (23)	Jul (11)	Aug (42)	Sep (81)	Oct (48)	Nov (21)	Dec (20)
2012	Jan (30)	Feb (25)	Mar (4)	Apr (6)	May (1)	Jun (5)	Jul (5)	Aug (8)	Sep (6)	Oct (6)	Nov	Dec

archive-access-cvs Mailing List for Web Archive Access Utilities (Page 33)

archive-access-cvs — CVS commits