Revision: 3307 http://archive-access.svn.sourceforge.net/archive-access/?rev=3307&view=rev Author: binzino Date: 2010-10-27 07:06:04 +0000 (Wed, 27 Oct 2010) Log Message: ----------- Add type normalization and filtering. Added uri/path filtering. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2010-10-27 07:00:57 UTC (rev 3306) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2010-10-27 07:06:04 UTC (rev 3307) @@ -1,30 +1,26 @@ /* - * Copyright (C) 2008 Internet Archive. - * - * This file is part of the archive-access tools project - * (http://sourceforge.net/projects/archive-access). - * - * The archive-access tools are free software; you can redistribute them and/or - * modify them under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or any - * later version. - * - * The archive-access tools are distributed in the hope that they will be - * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser - * Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License along with - * the archive-access tools; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + package org.archive.nutchwax.index; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; +import java.net.*; +import java.util.*; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -51,6 +47,9 @@ private List<FieldSpecification> fieldSpecs; private int MAX_TITLE_LENGTH; + private TypeNormalizer typenormalizer; + private TypeFilter typefilter; + private URLFilter urlfilter; public void setConf( Configuration conf ) { @@ -58,6 +57,13 @@ this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); + // this.allowedTypes = new HashSet<String>( conf.get( "indexer.mimetypes.allowed", "" ).split( "\\s+" ) ); + this.typenormalizer = new TypeNormalizer( ); + this.typenormalizer.setAliases( typenormalizer.getDefaultAliases( ) ); + + this.typefilter = new TypeFilter( TypeFilter.getDefaultAllowed( ), this.typenormalizer ); + this.urlfilter = new URLFilter ( URLFilter.getDefaultProhibited( ) ); + String filterSpecs = conf.get( "nutchwax.filter.index" ); if ( null == filterSpecs ) @@ -143,6 +149,8 @@ { Metadata meta = parse.getData().getContentMeta(); + // + for ( FieldSpecification spec : this.fieldSpecs ) { String value = null; @@ -150,15 +158,24 @@ { try { - value = (new URL( meta.get( "url" ) ) ).getHost( ); + URI uri = new URI( meta.get( "url" ) ); + if ( ! this.urlfilter.isAllowed( uri ) ) + { + LOG.info( "Rejecting: " + key + " due to url: " + uri ); + + return null; + } + + value = uri.getHost( ); + // Strip off any "www." header. if ( value.startsWith( "www." 
) ) { value = value.substring( 4 ); } } - catch ( MalformedURLException mue ) { /* Eat it */ } + catch ( URISyntaxException use ) { /* Eat it */ } } else if ( "content".equals( spec.srcKey ) ) { @@ -178,8 +195,16 @@ if ( value == null ) continue ; - int p = value.indexOf( ';' ); - if ( p >= 0 ) value = value.substring( 0, p ); + //int p = value.indexOf( ';' ); + //if ( p >= 0 ) value = value.substring( 0, p ); + value = this.typenormalizer.normalize( value ); + + if ( ! this.typefilter.isAllowed( value ) ) + { + LOG.info( "Rejecting: " + key + " due to type: " + value ); + + return null; + } } else if ( "collection".equals( spec.srcKey ) ) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.