From: <bi...@us...> - 2008-06-26 22:29:12
|
Revision: 2326 http://archive-access.svn.sourceforge.net/archive-access/?rev=2326&view=rev Author: binzino Date: 2008-06-26 15:29:21 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Initial revision of WaybackURLFilter and associated changes to build files. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/build.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/plugin.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml 2008-06-26 22:26:10 UTC (rev 2325) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -55,6 +55,10 
@@ <fileset dir="${nutch.root}/lib"> <include name="*.jar" /> </fileset> + <!-- This is the contrib/archive/lib directory --> + <fileset dir="../../../lib"> + <include name="*.jar" /> + </fileset> <path refid="plugin.deps"/> </path> Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml 2008-06-26 22:26:10 UTC (rev 2325) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -26,17 +26,18 @@ <!-- Build & deploy all the plugin jars. --> <!-- ====================================================== --> <target name="deploy"> - <ant dir="index-nutchwax" target="deploy"/> - <ant dir="query-nutchwax" target="deploy"/> + <ant dir="index-nutchwax" target="deploy"/> + <ant dir="query-nutchwax" target="deploy"/> + <ant dir="urlfilter-nutchwax" target="deploy"/> </target> <!-- ====================================================== --> <!-- Clean all of the plugins. 
--> <!-- ====================================================== --> <target name="clean"> - <ant dir="index-nutchwax" target="clean"/> - <ant dir="query-nutchwax" target="clean"/> + <ant dir="index-nutchwax" target="clean"/> + <ant dir="query-nutchwax" target="clean"/> + <ant dir="urlfilter-nutchwax" target="clean"/> </target> </project> - Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml 2008-06-26 22:26:10 UTC (rev 2325) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -21,7 +21,7 @@ --> <plugin id="index-nutchwax" - name="NutchWax Indexing Filter" + name="NutchWAX Indexing Filter" version="1.0.0" provider-name="archive.org"> @@ -36,9 +36,9 @@ </requires> <extension id="org.apache.nutch.indexer.basic" - name="NutchWax Indexing Filter" + name="Configurable Indexing Filter" point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="NutchWaxIndexingFilter" + <implementation id="ConfigurableIndexingFilter" class="org.archive.nutchwax.index.ConfigurableIndexingFilter" /> </extension> Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2008-06-26 22:26:10 UTC (rev 2325) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -21,7 +21,7 @@ --> <plugin id="query-nutchwax" - name="NutchWax Query Filter" + name="NutchWAX Query Filter" version="1.0.0" provider-name="archive.org"> Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/build.xml 
=================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/build.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/build.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlfilter-nutchwax" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/plugin.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/plugin.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlfilter-nutchwax" + name="NutchWAX URL Filter" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="urlfilter-nutchwax.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.urlfilter.wayback" + name="Wayback URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="WaybackURLFilter" + class="org.archive.nutchwax.urlfilter.WaybackURLFilter"/> + </extension> + +</plugin> Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2008-06-26 22:29:21 UTC (rev 2326) @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). 
+ * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.urlfilter; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.httpclient.URIException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.nutch.net.URLFilter; + +import org.archive.wayback.UrlCanonicalizer; + +/** + * Nutch URLFilter that filters a URL based on URL+digest+date + * metadata values, where the URL can also be canonicalized using the + * same logic as the Wayback. By making Wayback canonicalization + * available, we can use exclusion rules generated from CDX files. 
+ */
+public class WaybackURLFilter implements URLFilter
+{
+  public static final Log LOG = LogFactory.getLog( WaybackURLFilter.class );
+
+  /** Configuration property naming the UrlCanonicalizer implementation class. */
+  private static final String CANONICALIZER_PROPERTY = "nutchwax.urlfilter.wayback.canonicalizer";
+
+  /** Configuration property naming the file of exclusion records. */
+  private static final String EXCLUSIONS_PROPERTY = "nutchwax.urlfilter.wayback.exclusions";
+
+  private Configuration conf;
+  private UrlCanonicalizer canonicalizer;
+  private Set<String> exclusions;
+
+  public WaybackURLFilter( )
+  {
+  }
+
+  /**
+   * Decides whether a record should be kept or dropped.
+   *
+   * The input is expected to be three whitespace-separated values:
+   * url, digest and 14-digit timestamp.  The url is canonicalized
+   * with the configured UrlCanonicalizer and the resulting
+   * "url digest date" key is looked up in the exclusion set.
+   *
+   * @param urlString the record to test
+   * @return <code>urlString</code> unchanged if the record is allowed
+   *         (including when it does not have exactly three fields or
+   *         its url cannot be canonicalized); <code>null</code> if it
+   *         is excluded or if <code>urlString</code> is null
+   * @throws IllegalStateException if called before {@link #setConf}
+   */
+  public String filter( String urlString )
+  {
+    if ( urlString == null )
+    {
+      // Nothing to let through.
+      return null;
+    }
+
+    if ( this.canonicalizer == null || this.exclusions == null )
+    {
+      throw new IllegalStateException( "WaybackURLFilter used before setConf() was called" );
+    }
+
+    // Trim first so that leading whitespace does not produce an empty
+    // first field and cause a well-formed record to bypass filtering.
+    String[] fields = urlString.trim( ).split( "\\s+" );
+
+    if ( fields.length != 3 )
+    {
+      // Not in the expected form: don't filter.
+      if ( LOG.isInfoEnabled( ) )
+      {
+        LOG.info( "Allowing: " + urlString );
+      }
+
+      return urlString;
+    }
+
+    String url    = fields[0];
+    String digest = fields[1];
+    String date   = fields[2];
+
+    boolean exclude;
+    try
+    {
+      // Transform the URL into the same form that the Wayback uses
+      // for CDX files, then build the key that is compared against
+      // the exclusion list.
+      String key = this.canonicalizer.urlStringToKey( url ) + " " + digest + " " + date;
+
+      exclude = this.exclusions.contains( key );
+    }
+    catch ( URIException e )
+    {
+      // If we can't handle the URL, we let it through.
+      if ( LOG.isDebugEnabled( ) )
+      {
+        LOG.debug( "Unable to canonicalize, allowing: " + url, e );
+      }
+
+      exclude = false;
+    }
+
+    if ( exclude )
+    {
+      if ( LOG.isInfoEnabled( ) )
+      {
+        LOG.info( "Excluding: " + urlString );
+      }
+
+      return null;
+    }
+
+    if ( LOG.isInfoEnabled( ) )
+    {
+      LOG.info( "Allowing : " + urlString );
+    }
+
+    return urlString;
+  }
+
+  /**
+   * @return the configuration last passed to {@link #setConf}, or
+   *         null if none has been set
+   */
+  public Configuration getConf( )
+  {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration and (re)builds the canonicalizer and the
+   * exclusion set from it.
+   *
+   * @param conf configuration to read the nutchwax.urlfilter.wayback.*
+   *             properties from
+   * @throws RuntimeException if the canonicalizer cannot be created
+   *                          or the exclusions file cannot be read
+   */
+  public void setConf( Configuration conf )
+  {
+    // Build everything first so that a failure does not leave this
+    // filter holding a new configuration with stale or missing state.
+    UrlCanonicalizer newCanonicalizer = getCanonicalizer( conf );
+    Set<String>      newExclusions    = getExclusions   ( conf );
+
+    this.conf          = conf;
+    this.canonicalizer = newCanonicalizer;
+    this.exclusions    = newExclusions;
+  }
+
+  /**
+   * Utility function to instantiate a UrlCanonicalizer based on an
+   * implementation specified in the configuration.
+   *
+   * @param conf configuration holding the property
+   *             "nutchwax.urlfilter.wayback.canonicalizer"
+   * @return a new instance of the configured class
+   * @throws RuntimeException if the property is missing or blank, or
+   *                          if the class cannot be loaded,
+   *                          instantiated or cast to UrlCanonicalizer
+   */
+  public static UrlCanonicalizer getCanonicalizer( Configuration conf )
+  {
+    // Which Wayback canonicalizer to use: Aggressive, Identity, etc.
+    String canonicalizerClassName = conf.get( CANONICALIZER_PROPERTY );
+
+    if ( canonicalizerClassName == null || canonicalizerClassName.trim( ).length( ) == 0 )
+    {
+      throw new RuntimeException( "Missing value for property: nutchwax.urlfilter.wayback.canonicalizer" );
+    }
+
+    // Surrounding whitespace in the configured value would otherwise
+    // make Class.forName fail on an otherwise valid class name.
+    canonicalizerClassName = canonicalizerClassName.trim( );
+
+    try
+    {
+      return (UrlCanonicalizer) Class.forName( canonicalizerClassName ).newInstance( );
+    }
+    catch ( Exception e )
+    {
+      // If we can't instantiate it, there's not much else we can do
+      // other than just throw the Exception.
+      throw new RuntimeException( "Unable to instantiate UrlCanonicalizer: " + canonicalizerClassName, e );
+    }
+  }
+
+  /**
+   * Utility function to read a list of exclusion records from a file
+   * specified in the configuration.
+   *
+   * Each non-blank line of the file becomes one record.  Records are
+   * trimmed and runs of whitespace are collapsed to a single space so
+   * that they compare equal to the "url digest date" keys built by
+   * {@link #filter}.
+   *
+   * @param conf configuration holding the property
+   *             "nutchwax.urlfilter.wayback.exclusions"
+   * @return the set of exclusion records; empty if the property is
+   *         not set or the file does not exist
+   * @throws RuntimeException if the file exists but cannot be read
+   */
+  public static Set<String> getExclusions( Configuration conf )
+  {
+    String exclusionsPath = conf.get( EXCLUSIONS_PROPERTY );
+
+    if ( exclusionsPath == null || exclusionsPath.trim( ).length( ) == 0 )
+    {
+      LOG.warn( "No exclusions file set for property: \"nutchwax.urlfilter.wayback.exclusions\"" );
+
+      return Collections.<String>emptySet( );
+    }
+
+    exclusionsPath = exclusionsPath.trim( );
+
+    LOG.info( "Using exclusions: " + exclusionsPath );
+
+    Set<String> exclusions = new HashSet<String>( );
+
+    InputStream    is     = null;
+    BufferedReader reader = null;
+    try
+    {
+      Path p = new Path( exclusionsPath );
+
+      // Resolve the filesystem from the path itself so that the
+      // existence check and the open go to the same filesystem, even
+      // when the path is not on the default one.
+      FileSystem fs = p.getFileSystem( conf );
+
+      if ( ! fs.exists( p ) )
+      {
+        LOG.warn( "Exclusions doesn't exist: " + exclusionsPath );
+
+        return exclusions;
+      }
+
+      is     = fs.open( p );
+      reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
+
+      String line;
+      while ( (line = reader.readLine( )) != null )
+      {
+        String record = line.trim( ).replaceAll( "\\s+", " " );
+
+        if ( record.length( ) != 0 )
+        {
+          exclusions.add( record );
+        }
+      }
+    }
+    catch ( IOException e )
+    {
+      throw new RuntimeException( "Failed to read exclusions file: " + exclusionsPath, e );
+    }
+    finally
+    {
+      try
+      {
+        if ( reader != null )
+        {
+          // Closing the reader also closes the underlying stream.
+          reader.close( );
+        }
+        else if ( is != null )
+        {
+          is.close( );
+        }
+      }
+      catch ( IOException e )
+      {
+        // The records have already been read (or a more important
+        // exception is in flight), so just report it.
+        LOG.warn( "Failed to close exclusions file: " + exclusionsPath, e );
+      }
+    }
+
+    return exclusions;
+  }
+
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |