[Archive-access-cvs] SF.net SVN: archive-access:[2845] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2845
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2845&view=rev
Author:   binzino
Date:     2009-10-28 00:01:06 +0000 (Wed, 28 Oct 2009)

Log Message:
-----------
Ported from NW 0.12.9.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java
===================================================================

--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java	2009-10-28 00:01:06 UTC (rev 2845)
@@ -0,0 +1,333 @@
+package org.archive.nutchwax.tools;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+
+import org.apache.nutch.indexer.NutchSimilarity;
+
+/**
+ * This is heavily cribbed from org.apache.lucene.misc.LengthNormModifier
+ */
+public class LengthNormUpdater 
+{
+  /**
+   * Help text printed for -h and whenever the command line is incomplete.
+   * The formula it quotes is the one applied in reSetNorms(): the length
+   * norm multiplied by (log10(pagerank) + 1).
+   */
+  private static final String USAGE =
+    "Usage: LengthNormUpdater [OPTIONS] <pageranks> <index> [field1]...\n\n"
+    + "Update the norms of <index> with boosts based on values from <pageranks>\n\n"
+    + "Options:\n"
+    + "\t-h                print this help and exit\n"
+    + "\t-s <classname>    similarity implementation to use\n"
+    + "\t-v                increase verbosity\n\n"
+    + "Reads the pagerank values from the <pageranks> file and calculates new\n"
+    + "norms for the documents based on the formula:\n\n"
+    + "\tnorm = similarity.lengthNorm * (log10(pagerank) + 1)\n\n"
+    + "If fields are specified on the command-line, only they will be updated.\n"
+    + "If a specified field does not have norms, an error message is given and\n"
+    + "the program terminates without performing any updates.\n\n"
+    + "If no fields are given, all the fields in the index that have norms will\n"
+    + "be updated.\n\n"
+    + "The default similarity implementation is NutchSimilarity\n\n"
+    + "Examples:\n\n"
+    + "\tLengthNormUpdater pagerank.txt index\n"
+    + "\tLengthNormUpdater -v -v pagerank.txt index title content\n\n"
+    ;
+
+  /**
+   * Verbosity level, incremented once per -v option.  Above 0 the tool
+   * names the file and fields it processes; above 1 it also prints the
+   * original and re-ranked norm of every updated document.
+   */
+  private static int VERBOSE = 0;
+
+  /**
+   * Command-line entry point.  Parses the options, opens the index and
+   * recomputes the norms of the selected fields from the pagerank file.
+   * Usage and option errors are reported on stderr with exit status 1.
+   *
+   * @param args options, pagerank file, index location, then optional field names
+   * @throws IOException if the index cannot be opened, read or updated
+   */
+  public static void main( String[] args ) throws IOException
+  {
+    if ( args.length < 1 )
+      {
+        System.err.print( USAGE );
+        System.exit(1);
+      }
+
+    Similarity s = new NutchSimilarity( );
+    int pos = 0;
+    for ( ; (pos < args.length) && args[pos].startsWith( "-" ) ; pos++ )
+      {
+        if ( "-h".equals( args[pos] ) )
+          {
+            System.out.println( USAGE );
+            System.exit( 0 );
+          }
+        else if ( "-v".equals( args[pos] ) )
+          {
+            VERBOSE++;
+          }
+        else if ( "-s".equals( args[pos] ) )
+          {
+            pos++;
+            if ( pos == args.length )
+              {
+                System.err.println( "Error: missing argument to option -s" );
+                System.exit( 1 );
+              }
+            try
+              {
+                Class<?> simClass = Class.forName(args[pos]);
+                s = (Similarity)simClass.newInstance();
+              }
+            catch (Exception e)
+              {
+                System.err.println( "Couldn't instantiate similarity with empty constructor: " + args[pos] );
+                e.printStackTrace(System.err);
+                System.exit( 1 );
+              }
+          }
+        else
+          {
+            // Refuse to modify the index when an option is not understood.
+            System.err.println( "Error: unknown option: " + args[pos] );
+            System.exit( 1 );
+          }
+      }
+
+    if ( (pos + 2) > args.length )
+      {
+        System.err.println( USAGE );
+        System.exit( 1 );
+      }
+
+    String pagerankFile = args[pos++];
+    IndexReader reader = IndexReader.open( args[pos++] );
+    try
+      {
+        Set<String> fieldNames = new HashSet<String>( );
+        if ( pos == args.length )
+          {
+            // No fields specified on command-line, get a list of all
+            // fields in the index that have norms.
+            for ( String fieldName : (Collection<String>) reader.getFieldNames( IndexReader.FieldOption.ALL ) )
+              {
+                if ( reader.hasNorms( fieldName ) )
+                  {
+                    fieldNames.add( fieldName );
+                  }
+              }
+          }
+        else
+          {
+            // Verify all explicitly specified fields have norms.
+            for ( int i = pos ; i < args.length ; i++ )
+              {
+                if ( ! reader.hasNorms( args[i] ) )
+                  {
+                    System.err.println( "Error: No norms for field: " + args[i] );
+                    reader.close( );  // System.exit() skips the finally block
+                    System.exit( 1 );
+                  }
+                fieldNames.add( args[i] );
+              }
+          }
+
+        if ( fieldNames.isEmpty( ) )
+          {
+            System.out.println( "Warning: No fields with norms to update" );
+            reader.close( );  // System.exit() skips the finally block
+            System.exit( 0 );
+          }
+        Map<String,Integer> ranks = getPageRanks( pagerankFile );
+        for ( String fieldName : fieldNames )
+          {
+            reSetNorms( reader, fieldName, ranks, s );
+          }
+      }
+    finally
+      {
+        reader.close( );
+      }
+  }
+
+ 
+  /**
+   * Recomputes and stores the norm of one field for every non-deleted
+   * document whose "url" value has an entry in ranks.  The new norm is
+   * sim.lengthNorm(fieldName, termCount) * (log10(rank) + 1), where
+   * termCount is the sum of the field's term frequencies in the document.
+   * Documents with no url, no rank, or a rank below 1 keep their norm.
+   *
+   * @param reader    open index reader used to read terms and write norms
+   * @param fieldName name of the field whose norms are updated
+   * @param ranks     pagerank value keyed by document url
+   * @param sim       similarity used to compute and encode the norms
+   * @throws IOException if reading from or writing to the index fails
+   */
+  public static void reSetNorms( IndexReader reader,
+                                 String fieldName,
+                                 Map<String,Integer> ranks,
+                                 Similarity sim ) throws IOException
+  {
+    if ( VERBOSE > 0 ) System.out.println( "Updating field: " + fieldName );
+
+    // Total number of term occurrences of the field, per document number.
+    int[] termCounts = new int[reader.maxDoc()];
+    TermEnum termEnum = null;
+    TermDocs termDocs = null;
+    try
+      {
+        termEnum = reader.terms(new Term(fieldName,""));
+        try
+          {
+            termDocs = reader.termDocs();
+            do
+              {
+                Term term = termEnum.term();
+                // Terms are enumerated in field order: the first foreign field ends the scan.
+                if (term == null || !term.field().equals(fieldName)) break;
+                termDocs.seek(term);
+                while (termDocs.next())
+                  {
+                    termCounts[termDocs.doc()] += termDocs.freq();
+                  }
+              }
+            while (termEnum.next());
+          }
+        finally
+          {
+            if (null != termDocs) termDocs.close();
+          }
+      }
+    finally
+      {
+        if (null != termEnum) termEnum.close();
+      }
+
+    for (int d = 0; d < termCounts.length; d++)
+      {
+        if ( reader.isDeleted(d) ) continue;
+        String url = reader.document( d ).get( "url" );
+        if ( url == null ) continue;
+
+        // log10 is NaN or -Infinity for ranks below 1; leave those norms alone.
+        Integer rank = ranks.get( url );
+        if ( rank == null || rank < 1 ) continue;
+
+        float originalNorm = sim.lengthNorm(fieldName, termCounts[d]);
+        byte  encodedOrig  = sim.encodeNorm(originalNorm);
+        float rankedNorm   = originalNorm * (float) ( Math.log10( rank ) + 1 );
+        byte  encodedRank  = sim.encodeNorm(rankedNorm);
+        if ( VERBOSE > 1 ) System.out.println( fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t" + rankedNorm + "\t" + encodedRank );
+        reader.setNorm(d, fieldName, encodedRank);
+      }
+  }
+
+  /**
+   * Reads page-rank records from a UTF-8 text file into a map.  Each line
+   * holds an integer rank followed by a url, separated by whitespace.
+   * Malformed lines and negative ranks are reported on stderr and skipped;
+   * if a url appears more than once the last record wins.
+   *
+   * @param filename path of the pagerank file
+   * @return map from url to rank
+   * @throws RuntimeException wrapping the IOException if the file cannot be read
+   */
+  public static Map<String,Integer> getPageRanks( String filename )
+  {
+    if ( VERBOSE > 0 ) System.out.println( "Reading pageranks from: " + filename );
+
+    Map<String,Integer> pageranks = new HashMap<String,Integer>( );
+    BufferedReader reader = null;
+    try
+      {
+        reader = new BufferedReader( new InputStreamReader( new FileInputStream( filename), "UTF-8" ) );
+        String line;
+        while ( (line = reader.readLine()) != null )
+          {
+            String fields[] = line.split( "\\s+" );
+            if ( fields.length < 2 )
+              {
+                System.err.println( "Malformed pagerank, not enough fields ("+fields.length+"): " + line );
+                continue ;
+              }
+            try
+              {
+                int    rank = Integer.parseInt( fields[0] );
+                String url  = fields[1];
+                if ( rank < 0 )
+                  {
+                    System.err.println( "Malformed pagerank, rank less than 0: " + line );
+                    continue ;
+                  }
+                pageranks.put( url, rank );
+              }
+            catch ( NumberFormatException nfe )
+              {
+                System.err.println( "Malformed pagerank, rank not an integer: " + line );
+                continue ;
+              }
+          }
+      }
+    catch ( IOException e )
+      {
+        // The signature declares no checked exception, so rethrow unchecked.
+        throw new RuntimeException( e );
+      }
+    finally
+      {
+        try
+          {
+            if ( reader != null )
+              {
+                reader.close( );
+              }
+          }
+        catch  ( IOException e )
+          {
+            // Best effort: a failure to close is deliberately ignored.
+          }
+      }
+    return pageranks;
+  }
+
+  
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access:[2845] trunk/archive-access/projects/nutchwax/ arc

[Archive-access-cvs] SF.net SVN: archive-access:[2845] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java