Revision: 2654 http://archive-access.svn.sourceforge.net/archive-access/?rev=2654&view=rev Author: binzino Date: 2008-12-09 01:58:04 +0000 (Tue, 09 Dec 2008) Log Message: ----------- Added class-level javadoc description. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2008-12-09 01:42:08 UTC (rev 2653) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2008-12-09 01:58:04 UTC (rev 2654) @@ -48,6 +48,58 @@ import org.apache.nutch.scoring.ScoringFilterException; +/** + * Simple scoring plugin that applies a PageRank multiple to the + * document score/boost during index time. Only implements the + * <code>ScoringFilter</code> method associated with indexing, none of + * the other scoring methods are implemented. + * </p><p> + * Applies a simple log10 multipler to the document score based on the + * base-10 log value of the number of inlinks. For example, a page with + * 13,032 inlinks will have a score/boost of 5. The actual formula is + * </p> + * <code> + * initialScore *= ( floor( log10( # inlinks ) ) + 1 ) + * </code> + * <p> + * We use floor() to get an integer value from the log10() function + * since we're only interested in order of magnitude. We then add 1 + * so that a page with < 10 inlins will have a multipler of 1, and + * thus stay the same, 10-100 gets a multipler of 2, 100-1000 is 3, and + * so forth. 
+ * </p> + * <p> + * The number of inlinks for a page is not taken from the <code>inlinks</code> + * method parameter. Rather a map of <URL,rank> values is read from + * an external file. Confusing? Yes. + * </p> + * <p> + * We use an external file because the <code>inlinks</code> will + * <strong>always</strong> be empty. This is because the + * <code>linkdb</code> uses URLs where the <strong>key</strong> is not + * the URL rather the URL+digest. Thus the URLs in the + * <code>linkdb</code> never match the keys and Hadoop doesn't pass + * in the expected <code>linkdb</code> information. + * </p> + * <p> + * We work around this by using a NutchWAX command-line tool to + * extract the relevant PageRank information from the + * <code>linkdb</code> and write to an external file. We then read + * that external file here and use the information contained therein. + * </p> + * <p> + * Yes, this is a hassle. But it's the best we got right now. + * </p> + * <h2>Implementation note</h2> + * <p> + * Since the scoring plugins are used <em>only</em> during the + * <code>reduce</code> step during indexing, we delay the + * initialization of the <URL,rank> map until the first call to + * the <code>indexerScore</code> method. This way, we don't spend the + * effort to read the external file when we are instantiated during + * <code>map</code> phase. + * </p> + */ public class PageRankScoringFilter implements ScoringFilter { public static final Log LOG = LogFactory.getLog( PageRankScoringFilter.class ); @@ -247,5 +299,4 @@ return pageranks; } - } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2657 http://archive-access.svn.sourceforge.net/archive-access/?rev=2657&view=rev Author: binzino Date: 2008-12-10 05:01:14 +0000 (Wed, 10 Dec 2008) Log Message: ----------- Removed use of floor() in calculating the boost multiplier. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2008-12-10 04:59:10 UTC (rev 2656) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2008-12-10 05:01:14 UTC (rev 2657) @@ -56,17 +56,14 @@ * </p><p> * Applies a simple log10 multipler to the document score based on the * base-10 log value of the number of inlinks. For example, a page with - * 13,032 inlinks will have a score/boost of 5. The actual formula is + * 13,032 inlinks will have a score/boost of 5.115. The actual formula is * </p> * <code> - * initialScore *= ( floor( log10( # inlinks ) ) + 1 ) + * newScore = initialScore * ( log10( # inlinks ) + 1 ) * </code> * <p> - * We use floor() to get an integer value from the log10() function - * since we're only interested in order of magnitude. We then add 1 - * so that a page with < 10 inlins will have a multipler of 1, and - * thus stay the same, 10-100 gets a multipler of 2, 100-1000 is 3, and - * so forth. + * We add the extra 1 for pages with only 1 inlink since log10(1)=0 and we + * don't want a 0 multiplier. 
* </p> * <p> * The number of inlinks for a page is not taken from the <code>inlinks</code> @@ -115,8 +112,6 @@ public void setConf( Configuration conf ) { this.conf = conf; - - //this.ranks = getPageRanks( conf ); } public void injectedScore(Text url, CrawlDatum datum) @@ -181,7 +176,7 @@ return initScore; } - String keyParts[] = key.toString( ).split( "\\s+" ); + String keyParts[] = key.toString( ).split( "\\s+", 2 ); if ( keyParts.length != 2 ) { @@ -201,7 +196,7 @@ return initScore; } - float newScore = initScore * (float) ( Math.floor( Math.log( rank ) ) + 1 ); + float newScore = initScore * (float) ( Math.log( rank ) + 1 ); LOG.info( "PageRankScoringFilter: initScore = " + newScore + " ; key = " + key ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2838 http://archive-access.svn.sourceforge.net/archive-access/?rev=2838&view=rev Author: binzino Date: 2009-10-27 21:31:15 +0000 (Tue, 27 Oct 2009) Log Message: ----------- Updated to Nutch 1.0 API. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2009-10-27 21:29:00 UTC (rev 2837) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2009-10-27 21:31:15 UTC (rev 2838) @@ -41,6 +41,7 @@ import org.apache.lucene.document.Document; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.protocol.Content; @@ -158,7 +159,7 @@ // Not implemented } - public float indexerScore(Text key, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + public float indexerScore(Text key, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { synchronized ( this ) @@ -196,7 +197,7 @@ return initScore; } - float newScore = initScore * (float) ( Math.log( rank ) + 1 ); + float newScore = initScore * (float) ( Math.log10( rank ) + 1 ); LOG.info( "PageRankScoringFilter: initScore = " + newScore + " ; key = " + key ); This was sent by the SourceForge.net collaborative development platform, the world's largest 
Open Source development site. |