From: <bi...@us...> - 2009-10-28 00:01:20
|
// Revision: 2845
//           http://archive-access.svn.sourceforge.net/archive-access/?rev=2845&view=rev
// Author:   binzino
// Date:     2009-10-28 00:01:06 +0000 (Wed, 28 Oct 2009)
//
// Log Message:
// -----------
// Ported from NW 0.12.9.
//
// Added Paths:
// -----------
//     trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java
//
// Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java
// ===================================================================
// --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java (rev 0)
// +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java 2009-10-28 00:01:06 UTC (rev 2845)
// @@ -0,0 +1,333 @@

package org.archive.nutchwax.tools;

/**
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Collection;
import java.util.HashSet;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;


import org.apache.nutch.indexer.NutchSimilarity;

/**
 * Command-line tool that rewrites the field norms of a Lucene index so
 * that each document's norm is scaled by a boost derived from a
 * per-URL "pagerank" value read from a text file.
 *
 * This is heavily cribbed from org.apache.lucene.misc.LengthNormModifier
 */
public class LengthNormUpdater
{
  private static final String USAGE =
    "Usage: LengthNormUpdater [OPTIONS] <pageranks> <index> [field1]...\n"
    + "\n"
    + "Update the norms of <index> with boosts based on values from <pageranks>\n"
    + "\n"
    + "Options:\n"
    + "\t-h print this message and exit\n"
    + "\t-s <classname> similarity implementation to use\n"
    + "\t-v increase verbosity\n"
    + "\n"
    + "Reads the pagerank values from the <pageranks> file and calculates new\n"
    + "norms for the documents based on the formula:\n"
    + "\n"
    + "\tnorm = similarity.lengthNorm * (log10(pagerank) + 1)\n"
    + "\n"
    + "Documents whose URL has no pagerank entry, or a pagerank less than 1, are\n"
    + "left unchanged.\n"
    + "\n"
    + "If fields are specified on the command-line, only they will be updated.\n"
    + "If a specified field does not have norms, an error message is given and\n"
    + "the program terminates without performing any updates.\n"
    + "\n"
    + "If no fields are given, all the fields in the index that have norms will\n"
    + "be updated.\n"
    + "\n"
    + "The default similarity implementation is NutchSimilarity\n"
    + "\n"
    + "Examples:\n"
    + "\n"
    + "\tLengthNormUpdater pagerank.txt index\n"
    + "\tLengthNormUpdater -v -v pagerank.txt index title content\n"
    + "\n"
    ;

  /** Verbosity level; incremented once for every -v on the command line. */
  private static int verbosity = 0;

  /**
   * Command-line entry point.  Parses the options, then updates the
   * norms of the requested fields (or of every field that has norms
   * when none are named).
   *
   * Exits with status 1 on a usage error, an unknown option, a
   * similarity class that cannot be instantiated, or a named field
   * without norms.
   *
   * @param args command-line arguments, see {@link #USAGE}
   * @throws IOException if the index cannot be read or written
   */
  public static void main( String[] args ) throws IOException
  {
    if ( args.length < 1 )
      {
        System.err.print( USAGE );
        System.exit( 1 );
      }

    Similarity s = new NutchSimilarity( );

    int pos = 0;
    for ( ; (pos < args.length) && args[pos].startsWith( "-" ) ; pos++ )
      {
        if ( "-h".equals( args[pos] ) )
          {
            System.out.println( USAGE );
            System.exit( 0 );
          }
        else if ( "-v".equals( args[pos] ) )
          {
            verbosity++;
          }
        else if ( "-s".equals( args[pos] ) )
          {
            pos++;

            if ( pos == args.length )
              {
                System.err.println( "Error: missing argument to option -s" );
                System.exit( 1 );
              }

            try
              {
                Class<?> simClass = Class.forName( args[pos] );
                s = (Similarity) simClass.newInstance( );
              }
            catch ( Exception e )
              {
                System.err.println( "Couldn't instantiate similarity with empty constructor: " + args[pos] );
                e.printStackTrace( System.err );
                System.exit( 1 );
              }
          }
        else
          {
            // Previously unknown options were silently skipped, which hid typos.
            System.err.println( "Error: unknown option: " + args[pos] );
            System.err.print( USAGE );
            System.exit( 1 );
          }
      }

    if ( (pos + 2) > args.length )
      {
        // Usage errors go to stderr, consistent with the check above.
        System.err.print( USAGE );
        System.exit( 1 );
      }

    String pagerankFile = args[pos++];
    String indexPath    = args[pos++];

    String[] requestedFields = new String[args.length - pos];
    System.arraycopy( args, pos, requestedFields, 0, requestedFields.length );

    // The real work happens in a helper that returns a status instead
    // of calling System.exit(), so its finally-block always gets the
    // chance to close the IndexReader.
    int status = updateIndex( pagerankFile, indexPath, requestedFields, s );
    if ( status != 0 )
      {
        System.exit( status );
      }
  }

  /**
   * Opens the index, works out which fields to update, reads the
   * pageranks and re-sets the norms of each selected field.  The
   * reader is closed before this method returns, whatever the outcome.
   *
   * @param pagerankFile    path of the pagerank file
   * @param indexPath       path of the Lucene index
   * @param requestedFields fields named on the command line; empty
   *                        means "every field that has norms"
   * @param sim             similarity used to compute and encode norms
   * @return process exit status: 0 on success (including "nothing to
   *         update"), 1 if a requested field has no norms
   * @throws IOException if the index cannot be read or written
   */
  private static int updateIndex( String pagerankFile,
                                  String indexPath,
                                  String[] requestedFields,
                                  Similarity sim ) throws IOException
  {
    IndexReader reader = IndexReader.open( indexPath );

    try
      {
        Set<String> fieldNames = new HashSet<String>( );
        if ( requestedFields.length == 0 )
          {
            // No fields specified on command-line, get a list of all
            // fields in the index that have norms.
            @SuppressWarnings("unchecked")
            Collection<String> allFields = (Collection<String>) reader.getFieldNames( IndexReader.FieldOption.ALL );
            for ( String fieldName : allFields )
              {
                if ( reader.hasNorms( fieldName ) )
                  {
                    fieldNames.add( fieldName );
                  }
              }
          }
        else
          {
            // Verify all explicitly specified fields have norms before
            // touching anything, so a bad field name updates nothing.
            for ( String fieldName : requestedFields )
              {
                if ( ! reader.hasNorms( fieldName ) )
                  {
                    System.err.println( "Error: No norms for field: " + fieldName );
                    return 1;
                  }

                fieldNames.add( fieldName );
              }
          }

        if ( fieldNames.isEmpty( ) )
          {
            System.out.println( "Warning: No fields with norms to update" );
            return 0;
          }

        Map<String,Integer> ranks = getPageRanks( pagerankFile );

        for ( String fieldName : fieldNames )
          {
            reSetNorms( reader, fieldName, ranks, sim );
          }

        return 0;
      }
    finally
      {
        if ( reader != null )
          {
            reader.close( );
          }
      }
  }


  /**
   * Recomputes the norm of <code>fieldName</code> for every live
   * document whose stored "url" field has a usable entry in
   * <code>ranks</code>, and writes it back through the reader.
   *
   * The new norm is
   * <code>sim.lengthNorm(fieldName, termCount) * (log10(rank) + 1)</code>,
   * where termCount is the sum of the term frequencies of the field in
   * that document.  Documents that are deleted, have no "url" value,
   * have no entry in <code>ranks</code>, or have a rank below 1 keep
   * their existing norm.
   *
   * @param reader    open index reader; norms are set on it
   * @param fieldName field whose norms are updated
   * @param ranks     map from URL to pagerank value
   * @param sim       similarity used to compute and encode the norm
   * @throws IOException if the index cannot be read or written
   */
  public static void reSetNorms( IndexReader reader,
                                 String fieldName,
                                 Map<String,Integer> ranks,
                                 Similarity sim ) throws IOException
  {
    if ( verbosity > 0 )
      {
        System.out.println( "Updating field: " + fieldName );
      }

    int[] termCounts = countTerms( reader, fieldName );

    for ( int d = 0; d < termCounts.length; d++ )
      {
        if ( reader.isDeleted( d ) )
          {
            continue;
          }

        Document doc = reader.document( d );
        String   url = doc.get( "url" );
        if ( url == null )
          {
            continue;
          }

        Integer rank = ranks.get( url );

        // log10() is -Infinity for 0 and NaN for negative values; either
        // would write a meaningless norm, so such ranks are treated the
        // same as "no rank known" and the document is left alone.
        if ( rank == null || rank.intValue( ) < 1 )
          {
            continue;
          }

        float originalNorm = sim.lengthNorm( fieldName, termCounts[d] );
        byte  encodedOrig  = sim.encodeNorm( originalNorm );
        float rankedNorm   = originalNorm * (float) ( Math.log10( rank.intValue( ) ) + 1 );
        byte  encodedRank  = sim.encodeNorm( rankedNorm );

        if ( verbosity > 1 )
          {
            System.out.println( fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t" + rankedNorm + "\t" + encodedRank );
          }

        reader.setNorm( d, fieldName, encodedRank );
      }
  }

  /**
   * Sums, for every document number, the frequencies of all terms of
   * <code>fieldName</code> occurring in that document.
   *
   * @param reader    open index reader
   * @param fieldName field whose terms are counted
   * @return array of length <code>reader.maxDoc()</code>, indexed by
   *         document number
   * @throws IOException if the index cannot be read
   */
  private static int[] countTerms( IndexReader reader, String fieldName ) throws IOException
  {
    int[] termCounts = new int[reader.maxDoc( )];

    TermEnum termEnum = null;
    TermDocs termDocs = null;
    try
      {
        // Positions the enumeration at the first term of the field.
        termEnum = reader.terms( new Term( fieldName, "" ) );
        termDocs = reader.termDocs( );
        do
          {
            Term term = termEnum.term( );

            // Terms are enumerated in field-then-text order, so the
            // first term of another field (or the end of the
            // enumeration) means this field is finished.
            if ( term == null || ! term.field( ).equals( fieldName ) )
              {
                break;
              }

            termDocs.seek( term );
            while ( termDocs.next( ) )
              {
                termCounts[termDocs.doc( )] += termDocs.freq( );
              }
          }
        while ( termEnum.next( ) );
      }
    finally
      {
        try
          {
            if ( termDocs != null )
              {
                termDocs.close( );
              }
          }
        finally
          {
            if ( termEnum != null )
              {
                termEnum.close( );
              }
          }
      }

    return termCounts;
  }

  /**
   * Reads page-rank records from the UTF-8 text file
   * <code>filename</code>.  Each record is one line of
   * whitespace-separated fields: an integer rank followed by a URL;
   * any further fields are ignored.  Blank lines are skipped.  Lines
   * with fewer than two fields, a non-integer rank or a negative rank
   * are reported on stderr and skipped.  If a URL appears more than
   * once, the last record wins.
   *
   * @param filename path of the pagerank file
   * @return map from URL to rank
   * @throws RuntimeException wrapping the IOException if the file
   *         cannot be opened or read
   */
  public static Map<String,Integer> getPageRanks( String filename )
  {
    if ( verbosity > 0 )
      {
        System.out.println( "Reading pageranks from: " + filename );
      }

    Map<String,Integer> pageranks = new HashMap<String,Integer>( );

    BufferedReader reader = null;
    try
      {
        reader = new BufferedReader( new InputStreamReader( new FileInputStream( filename ), "UTF-8" ) );

        String line;
        while ( (line = reader.readLine( )) != null )
          {
            // Trim first: leading whitespace would otherwise produce an
            // empty first field and a bogus "not an integer" report.
            String trimmed = line.trim( );
            if ( trimmed.length( ) == 0 )
              {
                continue;
              }

            String[] fields = trimmed.split( "\\s+" );

            if ( fields.length < 2 )
              {
                System.err.println( "Malformed pagerank, not enough fields (" + fields.length + "): " + line );
                continue;
              }

            int rank;
            try
              {
                rank = Integer.parseInt( fields[0] );
              }
            catch ( NumberFormatException nfe )
              {
                System.err.println( "Malformed pagerank, rank not an integer: " + line );
                continue;
              }

            if ( rank < 0 )
              {
                // Reported as malformed, so it must not be stored either.
                System.err.println( "Malformed pagerank, rank less than 0: " + line );
                continue;
              }

            pageranks.put( fields[1], Integer.valueOf( rank ) );
          }
      }
    catch ( IOException e )
      {
        // No sensible way to continue without the pageranks; preserve the cause.
        throw new RuntimeException( e );
      }
    finally
      {
        try
          {
            if ( reader != null )
              {
                reader.close( );
              }
          }
        catch ( IOException ignored )
          {
            // The file was only read; a failure to close it loses nothing.
          }
      }

    return pageranks;
  }


}

// This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.