From: <bi...@us...> - 2008-12-09 02:22:13
|
Revision: 2651 http://archive-access.svn.sourceforge.net/archive-access/?rev=2651&view=rev Author: binzino Date: 2008-12-09 01:39:26 +0000 (Tue, 09 Dec 2008) Log Message: ----------- Initial revision. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/build.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/plugin.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/build.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/build.xml 2008-12-09 01:39:26 UTC (rev 2651) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="scoring-nutchwax" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/plugin.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/plugin.xml 2008-12-09 01:39:26 UTC (rev 2651) @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="scoring-nutchwax" + name="NutchWAX Scoring Filter" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="scoring-nutchwax.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.scoring" + name="PageRank" + point="org.apache.nutch.scoring.ScoringFilter"> + <implementation id="PageRank" + class="org.archive.nutchwax.scoring.PageRankScoringFilter"/> + </extension> + +</plugin> Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java 2008-12-09 01:39:26 UTC (rev 2651) @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. 
/*
 * (Remainder of the license header whose opening lines precede this block.)
 *
 * You should have received a copy of the GNU Lesser Public License along with
 * the archive-access tools; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.nutchwax.scoring;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.lucene.document.Document;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;


/**
 * Scoring filter that boosts a document's index-time score using an
 * externally supplied table of per-URL ranks.
 *
 * <p>The table is read from the file named by the configuration property
 * <code>nutchwax.scoringfilter.pagerank.ranks</code>.  Each line of that file
 * has the form <code>&lt;rank&gt; &lt;url&gt;</code>, separated by whitespace;
 * any further fields on the line are ignored.  If a URL appears more than
 * once, the last line wins.</p>
 *
 * <p>Only {@link #indexerScore} does any work; every other
 * <code>ScoringFilter</code> callback is a no-op that leaves scores
 * untouched.</p>
 *
 * <p>The rank table is loaded lazily on the first call to
 * {@link #indexerScore}; the load is guarded by this instance's monitor.</p>
 */
public class PageRankScoringFilter implements ScoringFilter
{
  public static final Log LOG = LogFactory.getLog( PageRankScoringFilter.class );

  /** Configuration property holding the path of the ranks file. */
  private static final String RANKS_PROPERTY = "nutchwax.scoringfilter.pagerank.ranks";

  private Configuration conf;

  /** URL to rank table; <code>null</code> until first needed. */
  private Map<String,Integer> ranks;

  /**
   * @return the configuration most recently passed to {@link #setConf}
   */
  public Configuration getConf( )
  {
    return this.conf;
  }

  /**
   * Stores the configuration.  The rank table is deliberately not loaded
   * here; it is read on the first call to {@link #indexerScore}.
   *
   * @param conf configuration naming the ranks file
   */
  public void setConf( Configuration conf )
  {
    this.conf = conf;
  }

  /** No-op: this filter does not influence injected scores. */
  public void injectedScore(Text url, CrawlDatum datum)
    throws ScoringFilterException
  {
    // Not implemented
  }

  /** No-op: this filter does not influence initial scores. */
  public void initialScore(Text url, CrawlDatum datum)
    throws ScoringFilterException
  {
    // Not implemented
  }

  /**
   * No-op.
   *
   * @return <code>initSort</code>, unchanged
   */
  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
    throws ScoringFilterException
  {
    // Not implemented
    return initSort;
  }

  /** No-op: nothing is passed along before parsing. */
  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
    throws ScoringFilterException
  {
    // Not implemented
  }

  /** No-op: nothing is passed along after parsing. */
  public void passScoreAfterParsing(Text url, Content content, Parse parse)
    throws ScoringFilterException
  {
    // Not implemented
  }

  /**
   * No-op.
   *
   * @return <code>adjust</code>, unchanged
   */
  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount)
    throws ScoringFilterException
  {
    // Not implemented
    return adjust;
  }

  /** No-op: the crawl-db score is left untouched. */
  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked)
    throws ScoringFilterException
  {
    // Not implemented
  }

  /**
   * Multiplies the incoming score by <code>floor(ln(rank)) + 1</code>, where
   * <code>rank</code> is the value recorded for the document's URL.
   *
   * <p>The key is expected to consist of exactly two whitespace-separated
   * parts, the first of which is the URL used for the lookup; the second part
   * is not used by this filter.</p>
   *
   * <p>The score is returned unchanged when it is not positive, when the key
   * does not have the expected shape, when no rank is known for the URL, or
   * when the rank is less than 1 (the logarithm of such a rank would turn the
   * score into <code>-Infinity</code> or <code>NaN</code>).</p>
   *
   * @param key       document key, "&lt;url&gt; &lt;other&gt;"
   * @param initScore score computed so far
   * @return the boosted score, or <code>initScore</code> if no boost applies
   */
  public float indexerScore(Text key, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException
  {
    Map<String,Integer> pageranks = loadedRanks( );

    // Called once per indexed document, so tracing stays at debug level.
    if ( LOG.isDebugEnabled( ) )
    {
      LOG.debug( "PageRankScoringFilter: initScore = " + initScore + " ; key = " + key );
    }

    if ( initScore <= 0 )
    {
      return initScore;
    }

    String keyParts[] = key.toString( ).split( "\\s+" );

    if ( keyParts.length != 2 )
    {
      LOG.warn( "Unexpected URL/key format: " + key );

      return initScore;
    }

    String url = keyParts[0];

    Integer rank = pageranks.get( url );

    if ( rank == null )
    {
      if ( LOG.isDebugEnabled( ) )
      {
        LOG.debug( "No rank found for: " + url );
      }

      return initScore;
    }

    // Math.log(0) is -Infinity and Math.log(negative) is NaN; either would
    // poison the score, so such ranks contribute no boost.
    if ( rank.intValue( ) < 1 )
    {
      if ( LOG.isDebugEnabled( ) )
      {
        LOG.debug( "Rank " + rank + " gives no boost for: " + url );
      }

      return initScore;
    }

    float newScore = initScore * (float) ( Math.floor( Math.log( rank.intValue( ) ) ) + 1 );

    if ( LOG.isDebugEnabled( ) )
    {
      LOG.debug( "PageRankScoringFilter: newScore = " + newScore + " ; key = " + key );
    }

    return newScore;
  }

  /**
   * Returns the rank table, reading it from the configured file on first use.
   */
  private synchronized Map<String,Integer> loadedRanks( )
  {
    if ( this.ranks == null )
    {
      this.ranks = getPageRanks( this.conf );
    }

    return this.ranks;
  }


  /**
   * Utility function to read a list of page-rank records from a file
   * specified in the configuration.
   *
   * <p>Blank lines are skipped.  Lines with fewer than two fields, a
   * non-integer rank or a negative rank are reported with a warning and
   * skipped.  A missing property, a missing file or a <code>null</code>
   * configuration yields an empty map.</p>
   *
   * @param conf configuration holding the
   *             <code>nutchwax.scoringfilter.pagerank.ranks</code> property
   * @return map from URL to rank; never <code>null</code>
   * @throws RuntimeException wrapping the <code>IOException</code> if the
   *                          file cannot be read
   */
  public static Map<String,Integer> getPageRanks( Configuration conf )
  {
    if ( conf == null )
    {
      LOG.warn( "No configuration set; cannot read property: \"" + RANKS_PROPERTY + "\"" );

      return Collections.<String,Integer>emptyMap( );
    }

    String pageranksPath = conf.get( RANKS_PROPERTY );

    if ( pageranksPath == null || pageranksPath.trim( ).length( ) == 0 )
    {
      LOG.warn( "No pagerank file set for property: \"" + RANKS_PROPERTY + "\"" );

      return Collections.<String,Integer>emptyMap( );
    }

    pageranksPath = pageranksPath.trim( );

    LOG.info( "Using pageranks: " + pageranksPath );

    Map<String,Integer> pageranks = new HashMap<String,Integer>( );

    BufferedReader reader = null;
    try
    {
      Path p = new Path( pageranksPath );

      // Resolve the filesystem from the path itself so that the existence
      // check and the open both hit the same filesystem, even when the path
      // names a scheme other than the configured default.
      FileSystem fs = p.getFileSystem( conf );

      if ( ! fs.exists( p ) )
      {
        LOG.warn( "Pagerank file doesn't exist: " + pageranksPath );

        return pageranks;
      }

      InputStream is = fs.open( p );

      reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );

      String line;
      while ( (line = reader.readLine( )) != null )
      {
        addRankRecord( line, pageranks );
      }
    }
    catch ( IOException e )
    {
      throw new RuntimeException( "Error reading pagerank file: " + pageranksPath, e );
    }
    finally
    {
      if ( reader != null )
      {
        try
        {
          reader.close( );
        }
        catch ( IOException e )
        {
          // Everything needed has already been read; a failed close is
          // not worth failing the load over.
          LOG.debug( "Error closing pagerank file: " + pageranksPath, e );
        }
      }
    }

    return pageranks;
  }

  /**
   * Parses one "&lt;rank&gt; &lt;url&gt;" line and, if valid, records it in the map.
   */
  private static void addRankRecord( String line, Map<String,Integer> pageranks )
  {
    String trimmed = line.trim( );

    if ( trimmed.length( ) == 0 )
    {
      return;
    }

    String fields[] = trimmed.split( "\\s+" );

    if ( fields.length < 2 )
    {
      LOG.warn( "Malformed pagerank, not enough fields ("+fields.length+"): " + line );
      return;
    }

    int rank;
    try
    {
      rank = Integer.parseInt( fields[0] );
    }
    catch ( NumberFormatException nfe )
    {
      LOG.warn( "Malformed pagerank, rank not an integer: " + line );
      return;
    }

    if ( rank < 0 )
    {
      LOG.warn( "Malformed pagerank, rank less than 0: " + line );
      return;
    }

    pageranks.put( fields[1], Integer.valueOf( rank ) );
  }

}

// This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.