Revision: 3307
http://archive-access.svn.sourceforge.net/archive-access/?rev=3307&view=rev
Author: binzino
Date: 2010-10-27 07:06:04 +0000 (Wed, 27 Oct 2010)
Log Message:
-----------
Added type normalization and filtering. Added URI/path filtering.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2010-10-27 07:00:57 UTC (rev 3306)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2010-10-27 07:06:04 UTC (rev 3307)
@@ -1,30 +1,26 @@
/*
- * Copyright (C) 2008 Internet Archive.
- *
- * This file is part of the archive-access tools project
- * (http://sourceforge.net/projects/archive-access).
- *
- * The archive-access tools are free software; you can redistribute them and/or
- * modify them under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or any
- * later version.
- *
- * The archive-access tools are distributed in the hope that they will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
- * Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License along with
- * the archive-access tools; if not, write to the Free Software Foundation,
- * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
+
package org.archive.nutchwax.index;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
+import java.net.*;
+import java.util.*;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -51,6 +47,9 @@
private List<FieldSpecification> fieldSpecs;
private int MAX_TITLE_LENGTH;
+ private TypeNormalizer typenormalizer;
+ private TypeFilter typefilter;
+ private URLFilter urlfilter;
public void setConf( Configuration conf )
{
@@ -58,6 +57,13 @@
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+ // this.allowedTypes = new HashSet<String>( conf.get( "indexer.mimetypes.allowed", "" ).split( "\\s+" ) );
+ this.typenormalizer = new TypeNormalizer( );
+ this.typenormalizer.setAliases( typenormalizer.getDefaultAliases( ) );
+
+ this.typefilter = new TypeFilter( TypeFilter.getDefaultAllowed( ), this.typenormalizer );
+ this.urlfilter = new URLFilter ( URLFilter.getDefaultProhibited( ) );
+
String filterSpecs = conf.get( "nutchwax.filter.index" );
if ( null == filterSpecs )
@@ -143,6 +149,8 @@
{
Metadata meta = parse.getData().getContentMeta();
+ //
+
for ( FieldSpecification spec : this.fieldSpecs )
{
String value = null;
@@ -150,15 +158,24 @@
{
try
{
- value = (new URL( meta.get( "url" ) ) ).getHost( );
+ URI uri = new URI( meta.get( "url" ) );
+ if ( ! this.urlfilter.isAllowed( uri ) )
+ {
+ LOG.info( "Rejecting: " + key + " due to url: " + uri );
+
+ return null;
+ }
+
+ value = uri.getHost( );
+
// Strip off any "www." header.
if ( value.startsWith( "www." ) )
{
value = value.substring( 4 );
}
}
- catch ( MalformedURLException mue ) { /* Eat it */ }
+ catch ( URISyntaxException use ) { /* Eat it */ }
}
else if ( "content".equals( spec.srcKey ) )
{
@@ -178,8 +195,16 @@
if ( value == null ) continue ;
- int p = value.indexOf( ';' );
- if ( p >= 0 ) value = value.substring( 0, p );
+ //int p = value.indexOf( ';' );
+ //if ( p >= 0 ) value = value.substring( 0, p );
+ value = this.typenormalizer.normalize( value );
+
+ if ( ! this.typefilter.isAllowed( value ) )
+ {
+ LOG.info( "Rejecting: " + key + " due to type: " + value );
+
+ return null;
+ }
}
else if ( "collection".equals( spec.srcKey ) )
{
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|