|
From: <bi...@us...> - 2010-10-27 06:56:49
|
Revision: 3305
http://archive-access.svn.sourceforge.net/archive-access/?rev=3305&view=rev
Author: binzino
Date: 2010-10-27 06:56:42 +0000 (Wed, 27 Oct 2010)
Log Message:
-----------
Initial revision.
Added Paths:
-----------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeFilter.java
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeNormalizer.java
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/URLFilter.java
Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeFilter.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeFilter.java (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeFilter.java 2010-10-27 06:56:42 UTC (rev 3305)
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.index;
+
+import java.io.*;
+import java.util.*;
+
+/** Filters content types against an optional allow-list, de-aliasing them first when a normalizer is set. */
+public class TypeFilter
+{
+  /** Content types returned by {@link #getDefaultAllowed()}. */
+  public static final String[] DEFAULT_ALLOWED =
+  {
+    "text/html",
+    "text/plain",
+    "application/pdf",
+    // MS Office document types
+    "application/msword",
+    "application/vnd.ms-powerpoint",
+    // OpenOffice document types
+    "application/vnd.oasis.opendocument.text",
+    "application/vnd.oasis.opendocument.presentation",
+    "application/vnd.oasis.opendocument.spreadsheet",
+  };
+
+  private Set<String> allowed;
+  private TypeNormalizer normalizer;
+
+  /** Creates a filter with no allow-list and no normalizer; it allows every type. */
+  public TypeFilter( ) { }
+
+  /** Creates a filter from an allow-list and a normalizer; either may be null. */
+  public TypeFilter( Set<String> allowed, TypeNormalizer normalizer )
+  {
+    this.allowed = allowed;
+    this.normalizer = normalizer;
+  }
+
+  /** Sets the normalizer used to de-alias types; null disables de-aliasing. */
+  public void setTypeNormalizer( TypeNormalizer normalizer ) { this.normalizer = normalizer; }
+
+  /** Sets the allow-list; null or empty means every type is allowed. */
+  public void setAllowed( Set<String> allowed ) { this.allowed = allowed; }
+
+  /** Returns the allow-list as stored (not a copy); may be null. */
+  public Set<String> getAllowed( ) { return this.allowed; }
+
+  /**
+   * Tells whether a content type passes the filter.
+   *
+   * @param type content type to test; may be null
+   * @return true when no allow-list is set (null or empty); otherwise true
+   *         only if the de-aliased type is contained in the allow-list
+   */
+  public boolean isAllowed( String type )
+  {
+    // If no explicit list of allowed types, allow them all.
+    if ( this.allowed == null || this.allowed.isEmpty( ) ) return true;
+
+    // A missing type cannot match any entry of the allow-list.
+    if ( type == null ) return false;
+
+    // De-alias it, unless no normalizer was configured (e.g. no-arg constructor).
+    if ( this.normalizer != null ) type = this.normalizer.normalize( type );
+
+    return this.allowed.contains( type );
+  }
+
+  /**
+   * Parses a whitespace-separated list of types.
+   *
+   * @param s list to parse; null yields an empty set
+   * @return a new mutable set of the non-empty tokens
+   */
+  public static Set<String> parse( String s )
+  {
+    Set<String> types = new HashSet<String>( );
+    if ( s == null ) return types;
+
+    for ( String type : s.split( "\\s+" ) )
+    {
+      if ( type.length( ) > 0 ) types.add( type );
+    }
+
+    return types;
+  }
+
+  /** Returns a new mutable set holding the {@link #DEFAULT_ALLOWED} types. */
+  public static Set<String> getDefaultAllowed( )
+  {
+    return new HashSet<String>( Arrays.asList( DEFAULT_ALLOWED ) );
+  }
+}
Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeNormalizer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeNormalizer.java (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeNormalizer.java 2010-10-27 06:56:42 UTC (rev 3305)
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.index;
+
+import java.io.*;
+import java.util.*;
+
+/** Reduces a content type to its bare form and maps known aliases onto a canonical type. */
+public class TypeNormalizer
+{
+  /** Default {alias, canonical} pairs used by {@link #getDefaultAliases()}. */
+  public static final String[][] DEFAULT_ALIASES =
+  {
+    // PDF aliases
+    { "application/x-pdf", "application/pdf" },
+    // HTML aliases.
+    { "application/xhtml+xml", "text/html" },
+    // MS Word aliases.
+    { "application/vnd.ms-word", "application/msword" },
+    { "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword" },
+    // PowerPoint aliases.
+    {"application/mspowerpoint", "application/vnd.ms-powerpoint" },
+    {"application/ms-powerpoint", "application/vnd.ms-powerpoint" },
+    {"application/mspowerpnt", "application/vnd.ms-powerpoint" },
+    {"application/vnd-mspowerpoint", "application/vnd.ms-powerpoint" },
+    {"application/powerpoint", "application/vnd.ms-powerpoint" },
+    {"application/x-powerpoint", "application/vnd.ms-powerpoint" },
+    {"application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint" },
+  };
+
+  private Map<String,String> aliases;
+
+  /** Returns a new mutable alias-to-canonical map built from {@link #DEFAULT_ALIASES}. */
+  public static Map<String,String> getDefaultAliases( )
+  {
+    Map<String,String> defaults = new HashMap<String,String>( );
+    for ( String[] alias : DEFAULT_ALIASES )
+    {
+      defaults.put( alias[0], alias[1] );
+    }
+    return defaults;
+  }
+
+  /**
+   * Parses whitespace-separated entries of the form
+   * <code>canonical:alias1,alias2</code>; ':' and ',' are interchangeable.
+   * Entries lacking a canonical type or lacking any alias are skipped.
+   * @param s text to parse; null yields an empty map
+   * @return a new mutable map from each alias to its canonical type
+   */
+  public static Map<String,String> parseAliases( String s )
+  {
+    Map<String,String> aliases = new HashMap<String,String>( );
+    if ( s == null ) return aliases;
+
+    for ( String entry : s.split( "\\s+" ) )
+    {
+      String[] tokens = entry.split( "[:,]" );
+      if ( tokens.length < 2 || tokens[0].length( ) < 1 ) continue;
+      for ( int i = 1; i < tokens.length; i++ )
+      {
+        // Doubled separators ("a::b") yield an empty token; never register "" as an alias.
+        if ( tokens[i].length( ) > 0 ) aliases.put( tokens[i], tokens[0] );
+      }
+    }
+    return aliases;
+  }
+
+  /** Sets the alias-to-canonical map; null disables alias lookup. */
+  public void setAliases( Map<String,String> aliases ) { this.aliases = aliases; }
+
+  /** Returns the alias map as stored (not a copy); may be null. */
+  public Map<String,String> getAliases( ) { return this.aliases; }
+
+  /**
+   * Strips any parameters and surrounding whitespace from a content type,
+   * then replaces it with its canonical type when it is a known alias.
+   * @param type content type, e.g. "text/html; charset=utf-8"; may be null
+   * @return the normalized type, or null if <code>type</code> is null
+   */
+  public String normalize( String type )
+  {
+    if ( type == null ) return null;
+
+    // Chop off anything after a ';' character. This is
+    // for stuff like: "text/html; charset=utf-8"
+    int p = type.indexOf( ';' );
+    if ( p >= 0 ) type = type.substring( 0, p );
+    // Trim even when there was no ';' so that " text/html " still matches.
+    type = type.trim( );
+    String canonical = ( this.aliases == null ) ? null : this.aliases.get( type );
+    return ( canonical != null ) ? canonical : type;
+  }
+}
Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/URLFilter.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/URLFilter.java (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/URLFilter.java 2010-10-27 06:56:42 UTC (rev 3305)
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.index;
+
+import java.io.*;
+import java.net.*;
+import java.util.*;
+
+/** Rejects URIs whose raw path is contained in a set of prohibited paths. */
+public class URLFilter
+{
+  /** Paths returned by {@link #getDefaultProhibited()}. */
+  public static final String[] DEFAULT_PROHIBITED =
+  {
+    "/robots.txt",
+    "/favicon.ico",
+  };
+
+  private Set<String> prohibited;
+
+  /** Creates a filter over the given prohibited paths; null prohibits nothing. */
+  public URLFilter( Set<String> prohibited ) { this.prohibited = prohibited; }
+
+  /** Replaces the prohibited paths; null prohibits nothing. */
+  public void setProhibited( Set<String> prohibited ) { this.prohibited = prohibited; }
+
+  /** Returns the prohibited paths as stored (not a copy); may be null. */
+  public Set<String> getProhibited( ) { return this.prohibited; }
+
+  /**
+   * Tells whether a URI passes the filter; only its raw path is looked up, as an exact match.
+   * @param uri URI to test; must not be null
+   * @return false only if the URI's raw path is one of the prohibited paths
+   */
+  public boolean isAllowed( URI uri )
+  {
+    // A filter holding a null set prohibits nothing instead of throwing.
+    if ( this.prohibited == null ) return true;
+    // getRawPath() is null for opaque URIs (e.g. "mailto:..."); don't pass null to the set.
+    String path = uri.getRawPath( );
+    if ( path == null ) return true;
+
+    return ! this.prohibited.contains( path );
+  }
+
+  /**
+   * Parses a whitespace-separated list of paths.
+   * @param s list to parse; null yields an empty set
+   * @return a new mutable set of the non-empty tokens
+   */
+  public static Set<String> parse( String s )
+  {
+    Set<String> paths = new HashSet<String>( );
+    if ( s == null ) return paths;
+    for ( String path : s.split( "\\s+" ) )
+    {
+      if ( path.length( ) > 0 ) paths.add( path );
+    }
+    return paths;
+  }
+
+  /** Returns a new mutable set holding the {@link #DEFAULT_PROHIBITED} paths. */
+  public static Set<String> getDefaultProhibited( )
+  {
+    return new HashSet<String>( Arrays.asList( DEFAULT_PROHIBITED ) );
+  }
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|