archive-access-cvs Mailing List for Web Archive Access Utilities (Page 47)

Brought to you by: binzino, bradtofel, gojomo, ia_igor, and 5 others

archive-access-cvs — CVS commits

You can subscribe to this list here.

2005	Jan	Feb	Mar	Apr	May	Jun	Jul (1)	Aug (10)	Sep (36)	Oct (339)	Nov (103)	Dec (152)
2006	Jan (141)	Feb (102)	Mar (125)	Apr (203)	May (57)	Jun (30)	Jul (139)	Aug (46)	Sep (64)	Oct (105)	Nov (34)	Dec (162)
2007	Jan (81)	Feb (57)	Mar (141)	Apr (72)	May (9)	Jun (1)	Jul (144)	Aug (88)	Sep (40)	Oct (43)	Nov (34)	Dec (20)
2008	Jan (44)	Feb (45)	Mar (16)	Apr (36)	May (8)	Jun (77)	Jul (177)	Aug (66)	Sep (8)	Oct (33)	Nov (13)	Dec (37)
2009	Jan (2)	Feb (5)	Mar (8)	Apr	May (36)	Jun (19)	Jul (46)	Aug (8)	Sep (1)	Oct (66)	Nov (61)	Dec (10)
2010	Jan (13)	Feb (16)	Mar (38)	Apr (76)	May (47)	Jun (32)	Jul (35)	Aug (45)	Sep (20)	Oct (61)	Nov (24)	Dec (16)
2011	Jan (22)	Feb (34)	Mar (11)	Apr (8)	May (24)	Jun (23)	Jul (11)	Aug (42)	Sep (81)	Oct (48)	Nov (21)	Dec (20)
2012	Jan (30)	Feb (25)	Mar (4)	Apr (6)	May (1)	Jun (5)	Jul (5)	Aug (8)	Sep (6)	Oct (6)	Nov	Dec

Flat | Threaded

<< < 1 .. 45 46 47 48 49 .. 171 > >> (Page 47 of 171)

[Archive-access-cvs] SF.net SVN: archive-access:[2655] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/Importer.java

From: <bi...@us...> - 2008-12-10 04:58:28

Revision: 2655
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2655&view=rev
Author:   binzino
Date:     2008-12-10 04:58:24 +0000 (Wed, 10 Dec 2008)

Log Message:
-----------
Change output of messages from stderr to stdout. Add code to check return status of job and pass back to command-line via System.exit() call.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-12-09 01:58:04 UTC (rev 2654)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-12-10 04:58:24 UTC (rev 2655)
@@ -36,6 +36,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.JobStatus;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -601,7 +603,7 @@
       {
         if ( args.length < 2 )
           {
-            System.err.println( "ERROR: Missing filename for option \"-e\"\n" );
+            System.out.println( "ERROR: Missing filename for option \"-e\"\n" );
             usage( );
             return -1;
           }
@@ -613,7 +615,7 @@
     
     if ( args.length - pos < 1 )
       {
-        System.err.println( "ERROR: Missing manifest file.\n" );
+        System.out.println( "ERROR: Missing manifest file.\n" );
         usage( );
         return -1;
       }
@@ -645,17 +647,20 @@
         job.setOutputKeyClass  ( Text.class                );
         job.setOutputValueClass( NutchWritable.class       );
 
-        JobClient.runJob( job );
+        RunningJob rj = JobClient.runJob( job );
+
+        // Emit job id and status.
+        System.out.println( "JOB_STATUS: " + rj.getID( ) + ": " + (rj.isSuccessful( ) ? "SUCCESS" : "FAIL" ) );
+
+        return rj.isSuccessful( ) ? 0 : 1;
       }
     catch ( Exception e )
       {
         LOG.fatal( "Importer: ", e );
-        System.err.println( "Fatal error: " + e );
-        e.printStackTrace( System.err );
+        System.out.println( "Fatal error: " + e );
+        e.printStackTrace( System.out );
         return -1;
       }
-    
-    return 0;
   }
 
   /**
@@ -673,7 +678,7 @@
       + "necessary.  This is to mirror the behavior of other Nutch actions.\n"
       ;
     
-    System.err.println( usage );
+    System.out.println( usage );
   }
 
   /**


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2651] trunk/archive-access/projects/nutchwax/ archive/src/plugin

From: <bi...@us...> - 2008-12-09 02:22:13

Revision: 2651
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2651&view=rev
Author:   binzino
Date:     2008-12-09 01:39:26 +0000 (Tue, 09 Dec 2008)

Log Message:
-----------
Initial revision.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/build.xml
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/plugin.xml
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java

Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/build.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/build.xml	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/build.xml	2008-12-09 01:39:26 UTC (rev 2651)
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-nutchwax" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/plugin.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/plugin.xml	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/plugin.xml	2008-12-09 01:39:26 UTC (rev 2651)
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-nutchwax"
+   name="NutchWAX Scoring Filter"
+   version="1.0.0"
+   provider-name="archive.org">
+
+   <runtime>
+      <library name="scoring-nutchwax.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.archive.nutchwax.scoring"
+              name="PageRank"
+              point="org.apache.nutch.scoring.ScoringFilter">
+      <implementation id="PageRank"
+                      class="org.archive.nutchwax.scoring.PageRankScoringFilter"/>
+   </extension>
+
+</plugin>

Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java	2008-12-09 01:39:26 UTC (rev 2651)
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) 2008 Internet Archive.
+ * 
+ * This file is part of the archive-access tools project
+ * (http://sourceforge.net/projects/archive-access).
+ * 
+ * The archive-access tools are free software; you can redistribute them and/or
+ * modify them under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or any
+ * later version.
+ * 
+ * The archive-access tools are distributed in the hope that they will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+ * Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser Public License along with
+ * the archive-access tools; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.nutchwax.scoring;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+
+public class PageRankScoringFilter implements ScoringFilter
+{
+  public static final Log LOG = LogFactory.getLog( PageRankScoringFilter.class );
+  
+  private Configuration       conf;
+  private Map<String,Integer> ranks;
+
+  public Configuration getConf( )
+  {
+    return this.conf;
+  }
+
+  public void setConf( Configuration conf )
+  {
+    this.conf = conf;
+
+    //this.ranks = getPageRanks( conf );
+  }
+  
+  public void injectedScore(Text url, CrawlDatum datum) 
+    throws ScoringFilterException
+  {
+    // Not implemented
+  }
+  
+  public void initialScore(Text url, CrawlDatum datum) 
+    throws ScoringFilterException
+  {
+    // Not implemented
+  }
+  
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort) 
+    throws ScoringFilterException
+  {
+    // Not implemented
+    return initSort;
+  }
+  
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) 
+    throws ScoringFilterException
+  {
+    // Not implemented
+  }
+  
+  public void passScoreAfterParsing(Text url, Content content, Parse parse) 
+    throws ScoringFilterException
+  {
+    // Not implemented
+  }
+  
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) 
+    throws ScoringFilterException
+  {
+    // Not implemented
+    return adjust;
+  }
+  
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked)
+    throws ScoringFilterException
+  {
+    // Not implemented
+  }
+  
+  public float indexerScore(Text key, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+    throws ScoringFilterException
+  {
+    synchronized ( this )
+      {
+        if ( this.ranks == null )
+          {
+            this.ranks = getPageRanks( this.conf );
+          }
+      }
+
+    LOG.info( "PageRankScoringFilter: initScore = " + initScore + " ; key = " + key );
+
+    if ( initScore <= 0 )
+      {
+        return initScore;
+      }
+
+    String keyParts[] = key.toString( ).split( "\\s+" );
+
+    if ( keyParts.length != 2 )
+      {
+        LOG.warn( "Unexpected URL/key format: " + key );
+
+        return initScore;
+      }
+    
+    String url = keyParts[0];
+
+    Integer rank = this.ranks.get( url );
+
+    if ( rank == null )
+      {
+        LOG.info( "No rank found for: " + url );
+
+        return initScore;
+      }
+
+    float newScore = initScore * (float) ( Math.floor( Math.log( rank ) ) + 1 );
+
+    LOG.info( "PageRankScoringFilter: initScore = " + newScore + " ; key = " + key );
+
+    return newScore;
+  }
+  
+
+  /**
+   * Utility function to read a list of page-rank records from a file
+   * specified in the configuration.
+   */
+  public static Map<String,Integer> getPageRanks( Configuration conf )
+  {
+    String pageranksPath = conf.get( "nutchwax.scoringfilter.pagerank.ranks" );
+
+    if ( pageranksPath == null || pageranksPath.trim().length() == 0 )
+      {
+        LOG.warn( "No pagerank file set for property: \"nutchwax.scoringfilter.pagerank.ranks\"" );
+
+        return Collections.EMPTY_MAP;
+      }
+
+    LOG.warn( "Using pageranks: " + pageranksPath );
+
+    Map<String,Integer> pageranks = new HashMap<String,Integer>( );
+
+    BufferedReader reader = null;
+    try
+      {
+        Path p = new Path( pageranksPath.trim() );
+        
+        FileSystem fs = FileSystem.get( conf );
+        
+        if ( fs.exists( p ) )
+          {
+            InputStream is = p.getFileSystem( conf ).open( p );
+            
+            reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) );
+            
+            String line;
+            while ( (line = reader.readLine()) != null )
+              {
+                String fields[] = line.split( "\\s+" );
+                
+                if ( fields.length < 2 )
+                  {
+                    LOG.warn( "Malformed pagerank, not enough fields ("+fields.length+"): " + line );
+                    continue ;
+                  }
+
+                try
+                  {
+                    int    rank = Integer.parseInt( fields[0] );
+                    String url  = fields[1];
+
+                    if ( rank < 0 )
+                      {
+                        LOG.warn( "Malformed pagerank, rank less than 0: " + line );
+                      }
+                    
+                    pageranks.put( url, rank );
+                  }
+                catch ( NumberFormatException nfe )
+                  {
+                    LOG.warn( "Malformed pagerank, rank not an integer: " + line );
+                    continue ;
+                  }
+              }
+          }
+        else
+          {
+            LOG.warn( "Pagerank file doesn't exist: " + pageranksPath );
+          }
+      }
+    catch ( IOException e )
+      {
+        // Umm, what to do?
+        throw new RuntimeException( e );
+      }
+    finally
+      {
+        try
+          {
+            if ( reader != null )
+              {
+                reader.close( );
+              }
+          }
+        catch  ( IOException e )
+          {
+            // Ignore it.
+          }
+      }
+
+    return pageranks;
+  }
+
+
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2650] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/DistributedSearch.java

From: <bi...@us...> - 2008-12-09 02:22:09

Revision: 2650
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2650&view=rev
Author:   binzino
Date:     2008-12-09 01:38:51 +0000 (Tue, 09 Dec 2008)

Log Message:
-----------
Initial revision.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/DistributedSearch.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/DistributedSearch.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/DistributedSearch.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/DistributedSearch.java	2008-12-09 01:38:51 UTC (rev 2650)
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.nutchwax;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.searcher.NutchBean;
+
+/** 
+ * A command-line wrapper for the Nutch DistributedSearch$Server class
+ * which adds the NutchBeanModifier.modify() call to be able to handle
+ * parallel indices as well as other NutchWAX enhancements.
+ * </p>
+ * <p>
+ * Invoked the same as the regular Nutch DistributedSearch$Server, but
+ * with the NutchWAX package prefix, i.e.
+ * </p>
+ * <code>
+ * $ nutch org.archive.nutchwax.DistributedSearch\$Server 9000 &lt;crawl-dir&gt;
+ * </code>
+ */
+public class DistributedSearch 
+{
+  public static final Log LOG = LogFactory.getLog(DistributedSearch.class);
+
+  private DistributedSearch() {}                  // no public ctor
+
+  /** The search server. */
+  public static class Server  
+  {
+
+    private Server() 
+    {
+    }
+    
+    /** Runs a search server. */
+    public static void main(String[] args) throws Exception
+    {
+      String usage = "DistributedSearch$Server <port> <index dir>";
+      
+      if (args.length == 0 || args.length > 2)
+        {
+          System.err.println(usage);
+          System.exit(-1);
+        }
+      
+      int port = Integer.parseInt(args[0]);
+      Path directory = new Path(args[1]);
+      
+      Configuration conf = NutchConfiguration.create();
+      
+      org.apache.hadoop.ipc.Server server = getServer(conf, directory, port);
+      server.start();
+      server.join();
+    }
+    
+    static org.apache.hadoop.ipc.Server getServer(Configuration conf, Path directory, int port) throws IOException
+    {
+      NutchBean bean = new NutchBean(conf, directory);
+
+      // Modify the NutchBean, adding the WAX enhancements to it.
+      NutchWaxBean.NutchBeanModifier.modify( bean );
+
+      int numHandlers = conf.getInt("searcher.num.handlers", 10);      
+      return RPC.getServer(bean, "0.0.0.0", port, numHandlers, true, conf);
+    }
+
+  }
+
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2654] trunk/archive-access/projects/nutchwax/ archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/ PageRankScoringFilter.java

From: <bi...@us...> - 2008-12-09 01:58:08

Revision: 2654
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2654&view=rev
Author:   binzino
Date:     2008-12-09 01:58:04 +0000 (Tue, 09 Dec 2008)

Log Message:
-----------
Added class-level javadoc description.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java	2008-12-09 01:42:08 UTC (rev 2653)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/scoring-nutchwax/src/java/org/archive/nutchwax/scoring/PageRankScoringFilter.java	2008-12-09 01:58:04 UTC (rev 2654)
@@ -48,6 +48,58 @@
 import org.apache.nutch.scoring.ScoringFilterException;
 
 
+/**
+ * Simple scoring plugin that applies a PageRank multiple to the
+ * document score/boost during index time.  Only implements the
+ * <code>ScoringFilter</code> method associated with indexing, none of
+ * the other scoring methods are implemented.
+ * </p><p>
+ * Applies a simple log10 multiplier to the document score based on the
+ * base-10 log value of the number of inlinks.  For example, a page with
+ * 13,032 inlinks will have a score/boost of 5.  The actual formula is
+ * </p>
+ * <code>
+ *  initialScore *= ( floor( log10( # inlinks ) ) + 1 )
+ * </code>
+ * <p>
+ * We use floor() to get an integer value from the log10() function
+ * since we're only interested in order of magnitude.  We then add 1
+ * so that a page with &lt; 10 inlinks will have a multiplier of 1, and
+ * thus stay the same, 10-100 gets a multiplier of 2, 100-1000 is 3, and
+ * so forth.
+ * </p>
+ * <p>
+ * The number of inlinks for a page is not taken from the <code>inlinks</code>
+ * method parameter.  Rather a map of &lt;URL,rank&gt; values is read from
+ * an external file.  Confusing?  Yes.
+ * </p>
+ * <p>
+ * We use an external file because the <code>inlinks</code> will
+ * <strong>always</strong> be empty.  This is because the
+ * <code>linkdb</code> uses URLs where the <strong>key</strong> is not
+ * the URL rather the URL+digest.  Thus the URLs in the
+ * <code>linkdb</code> never match the keys and Hadoop doesn't pass
+ * in the expected <code>linkdb</code> information.
+ * </p>
+ * <p>
+ * We work around this by using a NutchWAX command-line tool to
+ * extract the relevant PageRank information from the
+ * <code>linkdb</code> and write to an external file.  We then read
+ * that external file here and use the information contained therein.
+ * </p>
+ * <p>
+ * Yes, this is a hassle.  But it's the best we got right now.
+ * </p>
+ * <h2>Implementation note</h2>
+ * <p>
+ * Since the scoring plugins are used <em>only</em> during the
+ * <code>reduce</code> step during indexing, we delay the
+ * initialization of the &lt;URL,rank&gt; map until the first call to
+ * the <code>indexerScore</code> method.  This way, we don't spend the
+ * effort to read the external file when we are instantiated during
+ * <code>map</code> phase.
+ * </p>
+ */
 public class PageRankScoringFilter implements ScoringFilter
 {
   public static final Log LOG = LogFactory.getLog( PageRankScoringFilter.class );
@@ -247,5 +299,4 @@
     return pageranks;
   }
 
-
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2652] trunk/archive-access/projects/nutchwax/ archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ FieldSetter.java

From: <bi...@us...> - 2008-12-09 01:45:48

Revision: 2652
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2652&view=rev
Author:   binzino
Date:     2008-12-09 01:41:27 +0000 (Tue, 09 Dec 2008)

Log Message:
-----------
Modified call to String.split() so that only one '=' delimiter in
key/value pair is used.  This way a value can have a '=' character in
it.  Also fixed a typo in a log message.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java	2008-12-09 01:39:26 UTC (rev 2651)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/FieldSetter.java	2008-12-09 01:41:27 UTC (rev 2652)
@@ -95,7 +95,7 @@
     for ( String field : s.split( "\\s+" ) )
       {
         // Split: "foo:true:false=bar" into ["foo:true:false","bar"]
-        String[] fieldParts = field.split("=");
+        String[] fieldParts = field.split("=", 2);
 
         // Split: "foo:true:false" into ["foo","true","false"]
         String[] keyParts = fieldParts[0].split( "[:]" );
@@ -118,7 +118,7 @@
 
         String value = fieldParts.length > 1 ? fieldParts[1] : null;
 
-        LOG.info( "Add field spetting: " + key + "[" + store + ":" + tokenize + "] = " + value );
+        LOG.info( "Add field setting: " + key + "[" + store + ":" + tokenize + "] = " + value );
 
         this.settings.add( new FieldSetting( key, store, tokenize, value ) );
       }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2653] trunk/archive-access/projects/nutchwax/ archive/src/plugin/build.xml

From: <bi...@us...> - 2008-12-09 01:45:43

Revision: 2653
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2653&view=rev
Author:   binzino
Date:     2008-12-09 01:42:08 +0000 (Tue, 09 Dec 2008)

Log Message:
-----------
Added scoring plugin.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml

Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml	2008-12-09 01:41:27 UTC (rev 2652)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml	2008-12-09 01:42:08 UTC (rev 2653)
@@ -29,6 +29,7 @@
     <ant dir="index-nutchwax"     target="deploy"/>
     <ant dir="query-nutchwax"     target="deploy"/>
     <ant dir="urlfilter-nutchwax" target="deploy"/>
+    <ant dir="scoring-nutchwax"   target="deploy"/>
   </target>
 
   <!-- ====================================================== -->
@@ -38,6 +39,7 @@
     <ant dir="index-nutchwax"     target="clean"/>
     <ant dir="query-nutchwax"     target="clean"/>
     <ant dir="urlfilter-nutchwax" target="clean"/>
+    <ant dir="scoring-nutchwax"   target="clean"/>
   </target>
 
 </project>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2649] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp

From: <bra...@us...> - 2008-12-05 22:46:34

Revision: 2649
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2649&view=rev
Author:   bradtofel
Date:     2008-12-05 22:46:30 +0000 (Fri, 05 Dec 2008)

Log Message:
-----------
ACC-56: two new XML Query .jsp renderers that output RSS opensearch format.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchCaptureResults.jsp
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchUrlResults.jsp
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/opensearchdescription.xml

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchCaptureResults.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchCaptureResults.jsp	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchCaptureResults.jsp	2008-12-05 22:46:30 UTC (rev 2649)
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8"?><%@
+ page language="java" pageEncoding="utf-8" contentType="text/xml;charset=utf-8"
+%><%@
+ page import="java.util.Iterator"
+%><%@
+ page import="java.util.ArrayList"
+%><%@
+ page import="java.util.Map"
+%><%@
+ page import="java.util.Enumeration"
+%><%@
+ page import="org.archive.wayback.core.CaptureSearchResult"
+%><%@
+ page import="org.archive.wayback.core.CaptureSearchResults"
+%><%@
+ page import="org.archive.wayback.core.SearchResults"
+%><%@
+ page import="org.archive.wayback.core.UIResults"
+%><%@
+ page import="org.archive.wayback.core.WaybackRequest"
+%><%@
+ page import="org.archive.wayback.requestparser.OpenSearchRequestParser"
+%><%@
+ page import="org.archive.wayback.util.StringFormatter"
+%><%
+UIResults uiResults = UIResults.extractCaptureQuery(request);
+
+WaybackRequest wbRequest = uiResults.getWbRequest();
+StringFormatter fmt = wbRequest.getFormatter();
+CaptureSearchResults results = uiResults.getCaptureResults();
+Iterator<CaptureSearchResult> itr = results.iterator();
+String contextRoot = wbRequest.getContextPrefix();
+String searchString = wbRequest.getRequestUrl();
+long firstResult = results.getFirstReturned();
+long shownResultCount = results.getReturnedCount();
+long lastResult = results.getReturnedCount() + firstResult;
+long resultCount = results.getMatchingCount();
+String searchTerms = "";
+Map<String,String[]> queryMap = request.getParameterMap();
+String arr[] = queryMap.get(OpenSearchRequestParser.SEARCH_QUERY);
+if(arr != null && arr.length > 1) {
+	searchTerms = arr[0];
+}
+%>
+<rss version="2.0" 
+      xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/"
+      xmlns:atom="http://www.w3.org/2005/Atom">
+   <channel>
+     <title>Wayback OpenSearch Results</title>
+     <link><%= contextRoot %>></link>
+     <description><%= fmt.format("PathQueryClassic.searchedFor",searchString) %></description>
+     <opensearch:totalResults><%= resultCount %></opensearch:totalResults>
+     <opensearch:startIndex><%= firstResult %></opensearch:startIndex>
+     <opensearch:itemsPerPage><%= shownResultCount %></opensearch:itemsPerPage>
+     <atom:link rel="search" type="application/opensearchdescription+xml" href="<%= contextRoot %>/opensearchdescription.xml"/>
+     <opensearch:Query role="request" searchTerms="<%= UIResults.encodeXMLContent(searchTerms) %>" startPage="<%= wbRequest.getPageNum() %>" />
+<%
+  while(itr.hasNext()) {
+    %>
+     <item>
+    <%
+    CaptureSearchResult result = itr.next();
+
+    String replayUrl = UIResults.encodeXMLEntity(
+    		uiResults.resultToReplayUrl(result));
+
+    String prettyDate = UIResults.encodeXMLEntity(
+    		fmt.format("MetaReplay.captureDateDisplay",result.getCaptureDate()));
+
+    String requestUrl = UIResults.encodeXMLEntity(
+    		wbRequest.getRequestUrl());
+    %>
+      <title><%= prettyDate %></title>
+      <link><%= replayUrl %></link>
+      <description><%= requestUrl %></description>
+    </item>
+    <%
+  }
+%>  
+   </channel>
+ </rss>

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchUrlResults.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchUrlResults.jsp	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchUrlResults.jsp	2008-12-05 22:46:30 UTC (rev 2649)
@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8"?><%@
+ page language="java" pageEncoding="utf-8" contentType="text/xml;charset=utf-8"
+%><%@
+ page import="java.util.Iterator"
+%><%@
+ page import="java.util.ArrayList"
+%><%@
+ page import="java.util.Date"
+%><%@
+ page import="java.util.Map"
+%><%@
+ page import="java.util.Enumeration"
+%><%@
+ page import="org.archive.wayback.core.UrlSearchResult"
+%><%@
+ page import="org.archive.wayback.core.UrlSearchResults"
+%><%@
+ page import="org.archive.wayback.core.SearchResults"
+%><%@
+ page import="org.archive.wayback.core.UIResults"
+%><%@
+ page import="org.archive.wayback.core.WaybackRequest"
+%><%@
+ page import="org.archive.wayback.requestparser.OpenSearchRequestParser"
+%><%@
+ page import="org.archive.wayback.util.StringFormatter"
+%><%
+UIResults uiResults = UIResults.extractUrlQuery(request);
+// Collect the values the RSS channel header below renders for this URL-prefix query.
+WaybackRequest wbRequest = uiResults.getWbRequest();
+StringFormatter fmt = wbRequest.getFormatter();
+UrlSearchResults results = uiResults.getUrlResults();
+Iterator<UrlSearchResult> itr = results.iterator();
+String contextRoot = wbRequest.getContextPrefix();
+String searchString = wbRequest.getRequestUrl();
+long firstResult = results.getFirstReturned();
+long shownResultCount = results.getReturnedCount();
+long lastResult = results.getReturnedCount() + firstResult;
+long resultCount = results.getMatchingCount();
+String searchTerms = ""; // echoed back in <opensearch:Query>; stays empty when the parameter is absent
+Map<String,String[]> queryMap = request.getParameterMap();
+String arr[] = queryMap.get(OpenSearchRequestParser.SEARCH_QUERY);
+if(arr != null && arr.length > 0) { // a single query value arrives as a one-element array
+    searchTerms = arr[0];           // if the parameter is repeated, only the first value is echoed
+}
+%>
+<rss version="2.0" 
+      xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/"
+      xmlns:atom="http://www.w3.org/2005/Atom">
+   <channel>
+     <title>Wayback OpenSearch Results</title>
+     <link><%= contextRoot %></link>
+     <description><%= fmt.format("PathQueryClassic.searchedFor",searchString) %></description>
+     <opensearch:totalResults><%= resultCount %></opensearch:totalResults>
+     <opensearch:startIndex><%= firstResult %></opensearch:startIndex>
+     <opensearch:itemsPerPage><%= shownResultCount %></opensearch:itemsPerPage>
+     <atom:link rel="search" type="application/opensearchdescription+xml" href="<%= contextRoot %>/opensearchdescription.xml"/>
+     <opensearch:Query role="request" searchTerms="<%= UIResults.encodeXMLContent(searchTerms) %>" startPage="<%= wbRequest.getPageNum() %>" />
+<%
+  while(itr.hasNext()) {
+    %>
+     <item>
+    <%
+    UrlSearchResult result = itr.next();
+
+    String originalUrl = result.getOriginalUrl();
+    String title = UIResults.encodeXMLEntity(originalUrl);
+    
+    String queryUrl = UIResults.encodeXMLEntity(
+            uiResults.makeCaptureQueryUrl(originalUrl));
+
+    String requestUrl = UIResults.encodeXMLEntity(
+            wbRequest.getRequestUrl());
+    long numCaptures = result.getNumCaptures();
+    long numVersions = result.getNumVersions();
+
+    Date firstDate = result.getFirstCaptureDate();
+    Date lastDate = result.getLastCaptureDate();
+    %>
+      <title><%= title %></title>
+      <link><%= queryUrl %></link>
+      <description>
+        <%= requestUrl %>
+        <span class="mainSearchText">
+          <%= fmt.format("PathPrefixQuery.versionCount",numVersions) %>
+        </span>
+        <span class="mainSearchText">
+          <%= fmt.format("PathPrefixQuery.multiCaptureDate",numCaptures,firstDate,lastDate) %>
+        </span>
+
+      </description>
+    </item>
+    <%
+  }
+%>  
+   </channel>
+ </rss>

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/opensearchdescription.xml
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/opensearchdescription.xml	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/opensearchdescription.xml	2008-12-05 22:46:30 UTC (rev 2649)
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
+  <ShortName>Wayback</ShortName>
+  <Description>Wayback Search Result RSS feed.</Description>
+  <Tags>wayback rss</Tags>
+  <Contact>arc...@ar...</Contact>
+  <Url type="application/rss+xml" 
+       template="http://wayback.archive-it.org/query?q={searchTerms}&amp;start_page={startPage?}&amp;count={count?}"/>
+</OpenSearchDescription>
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2648] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/WEB-INF/replay

From: <bra...@us...> - 2008-12-05 22:34:18

Revision: 2648
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2648&view=rev
Author:   bradtofel
Date:     2008-12-05 22:34:13 +0000 (Fri, 05 Dec 2008)

Log Message:
-----------
ACC-55: added explicit style="display:inline;" to all <img> tags, so they can't inherit a containing page's img style display:block declaration.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp	2008-12-05 22:24:43 UTC (rev 2647)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp	2008-12-05 22:34:13 UTC (rev 2648)
@@ -149,7 +149,7 @@
 						<table cellspacing="0" border="0" cellpadding="0"  width="100%">
 							<tr>
 								<td width="48%" nowrap><span><%= firstDate %></span></td>
-								<td align="center" valign="bottom" nowrap><img wmSpecial="1" src="<%= contextRoot %>/images/mark.jpg"></td>
+								<td align="center" valign="bottom" nowrap><img style="display: inline;" wmSpecial="1" src="<%= contextRoot %>/images/mark.jpg"></td>
 								<td width="48%" nowrap align="right"><span><%= lastDate %></span></td>
 							</tr>
 						</table>
@@ -165,7 +165,7 @@
 									first.getCaptureDate()) + "\"";
 							%><a wmSpecial="1" href="<%= results.resultToReplayUrl(first) %>"><%
 						}
-						%><img <%= titleString %> wmSpecial="1" border=0 width=19 height=20 src="<%= contextRoot %>/images/first.jpg"><%
+						%><img style="display: inline;" <%= titleString %> wmSpecial="1" border=0 width=19 height=20 src="<%= contextRoot %>/images/first.jpg"><%
 						if(first != null) {
 							%></a><%
 						}
@@ -176,7 +176,7 @@
 										prev.getCaptureDate()) + "\"";
 							%><a wmSpecial="1" href="<%= results.resultToReplayUrl(prev) %>"><%
 						}
-						%><img <%= titleString %> wmSpecial="1" border=0 width=13 height=20 src="<%= contextRoot %>/images/prev.jpg"><%
+						%><img style="display: inline;" <%= titleString %> wmSpecial="1" border=0 width=13 height=20 src="<%= contextRoot %>/images/prev.jpg"><%
 						if(first != null) {
 							%></a><%
 						}
@@ -205,17 +205,17 @@
 		}
 		if((i > 0) && (i < numPartitions)) {
 
-%><img wmSpecial="1" border=0 width=1 height=16 src="<%= contextRoot %>/images/linemark.jpg"><%
+%><img style="display: inline;" wmSpecial="1" border=0 width=1 height=16 src="<%= contextRoot %>/images/linemark.jpg"><%
 		
 		}
 
 		if(replayUrl == null) {
 
-%><img wmSpecial="1" border=0 width=7 height=16 src="<%= imageUrl %>"><%
+%><img style="display: inline;" wmSpecial="1" border=0 width=7 height=16 src="<%= imageUrl %>"><%
 		
 		} else {
 
-%><a wmSpecial="1" href="<%= replayUrl %>"><img wmSpecial="1" border=0 width=7 height=16 title="<%= prettyDateTime %>" src="<%= imageUrl %>"></a><%
+%><a wmSpecial="1" href="<%= replayUrl %>"><img style="display: inline;" wmSpecial="1" border=0 width=7 height=16 title="<%= prettyDateTime %>" src="<%= imageUrl %>"></a><%
 
 		}
 	}
@@ -229,7 +229,7 @@
 									next.getCaptureDate()) + "\"";
 							%><a wmSpecial="1" href="<%= results.resultToReplayUrl(next) %>"><%
 						}
-						%><img wmSpecial="1" <%= titleString %> border=0 width=13 height=20 src="<%= contextRoot %>/images/next.jpg"><%
+						%><img style="display: inline;" wmSpecial="1" <%= titleString %> border=0 width=13 height=20 src="<%= contextRoot %>/images/next.jpg"><%
 						if(first != null) {
 							%></a><%
 						}
@@ -240,7 +240,7 @@
 									last.getCaptureDate()) + "\"";
 							%><a wmSpecial="1" href="<%= results.resultToReplayUrl(last) %>"><%
 						}
-						%><img wmSpecial="1" <%= titleString %> border=0 width=19 height=20 src="<%= contextRoot %>/images/last.jpg"><%
+						%><img style="display: inline;" wmSpecial="1" <%= titleString %> border=0 width=19 height=20 src="<%= contextRoot %>/images/last.jpg"><%
 						if(first != null) {
 							%></a><%
 						}
@@ -283,7 +283,7 @@
       %></a>
 		</td>
 		<td>
-			<img wmSpecial="1" alt='' height='1' src='<%= contextRoot %>/images/1px.gif' width='5'>
+			<img style="display: inline;" wmSpecial="1" alt='' height='1' src='<%= contextRoot %>/images/1px.gif' width='5'>
 		</td>
 	</tr>
 </table>

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp	2008-12-05 22:24:43 UTC (rev 2647)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp	2008-12-05 22:34:13 UTC (rev 2648)
@@ -171,7 +171,7 @@
 						<table cellspacing="0" border="0" cellpadding="0"  width="100%">
 							<tr>
 								<td width="48%" nowrap><span><%= firstDate %></span></td>
-								<td align="center" valign="bottom" nowrap><img wmSpecial="1" src="<%= contextRoot %>/images/mark.jpg"></td>
+								<td align="center" valign="bottom" nowrap><img style="display: inline;" wmSpecial="1" src="<%= contextRoot %>/images/mark.jpg"></td>
 								<td width="48%" nowrap align="right"><span><%= lastDate %></span></td>
 							</tr>
 						</table>
@@ -187,7 +187,7 @@
 									first.getCaptureDate()) + "\"";
 							%><a wmSpecial="1" onclick="SetAnchorDate('<%= first.getCaptureTimestamp() %>');" href="<%= results.resultToReplayUrl(first) %>"><%
 						}
-						%><img <%= titleString %> wmSpecial="1" border=0 width=19 height=20 src="<%= contextRoot %>/images/first.jpg"><%
+						%><img style="display: inline;" <%= titleString %> wmSpecial="1" border=0 width=19 height=20 src="<%= contextRoot %>/images/first.jpg"><%
 						if(first != null) {
 							%></a><%
 						}
@@ -198,7 +198,7 @@
 									prev.getCaptureDate()) + "\"";
 							%><a wmSpecial="1" onclick="SetAnchorDate('<%= prev.getCaptureTimestamp() %>');" href="<%= results.resultToReplayUrl(prev) %>"><%
 						}
-						%><img <%= titleString %> wmSpecial="1" border=0 width=13 height=20 src="<%= contextRoot %>/images/prev.jpg"><%
+						%><img style="display: inline;" <%= titleString %> wmSpecial="1" border=0 width=13 height=20 src="<%= contextRoot %>/images/prev.jpg"><%
 						if(first != null) {
 							%></a><%
 						}
@@ -230,17 +230,17 @@
 		}
 		if((i > 0) && (i < numPartitions)) {
 
-%><img wmSpecial="1" border=0 width=1 height=16 src="<%= contextRoot %>/images/linemark.jpg"><%
+%><img style="display: inline;" wmSpecial="1" border=0 width=1 height=16 src="<%= contextRoot %>/images/linemark.jpg"><%
 		
 		}
 
 		if(replayUrl == null) {
 
-%><img wmSpecial="1" border=0 width=7 height=16 src="<%= imageUrl %>"><%
+%><img style="display: inline;" wmSpecial="1" border=0 width=7 height=16 src="<%= imageUrl %>"><%
 		
 		} else {
 
-%><a wmSpecial="1" onclick="SetAnchorDate('<%= ts %>');" href="<%= replayUrl %>"><img wmSpecial="1" border=0 width=7 height=16 title="<%= prettyDateTime %>" src="<%= imageUrl %>"></a><%
+%><a wmSpecial="1" onclick="SetAnchorDate('<%= ts %>');" href="<%= replayUrl %>"><img style="display: inline;" wmSpecial="1" border=0 width=7 height=16 title="<%= prettyDateTime %>" src="<%= imageUrl %>"></a><%
 
 		}
 	}
@@ -254,7 +254,7 @@
 									next.getCaptureDate()) + "\"";
 							%><a wmSpecial="1" onclick="SetAnchorDate('<%= next.getCaptureTimestamp() %>');" href="<%= results.resultToReplayUrl(next) %>"><%
 						}
-						%><img wmSpecial="1" <%= titleString %> border=0 width=13 height=20 src="<%= contextRoot %>/images/next.jpg"><%
+						%><img style="display: inline;" wmSpecial="1" <%= titleString %> border=0 width=13 height=20 src="<%= contextRoot %>/images/next.jpg"><%
 						if(next != null) {
 							%></a><%
 						}
@@ -265,7 +265,7 @@
 									last.getCaptureDate()) + "\"";
 							%><a wmSpecial="1" onclick="SetAnchorDate('<%= last.getCaptureTimestamp() %>');" href="<%= results.resultToReplayUrl(last) %>"><%
 						}
-						%><img wmSpecial="1" <%= titleString %> border=0 width=19 height=20 src="<%= contextRoot %>/images/last.jpg"><%
+						%><img style="display: inline;" wmSpecial="1" <%= titleString %> border=0 width=19 height=20 src="<%= contextRoot %>/images/last.jpg"><%
 						if(last != null) {
 							%></a><%
 						}
@@ -308,7 +308,7 @@
       %></a>
 		</td>
 		<td>
-			<img wmSpecial="1" alt='' height='1' src='<%= contextRoot %>/images/1px.gif' width='5'>
+			<img style="display: inline;" wmSpecial="1" alt='' height='1' src='<%= contextRoot %>/images/1px.gif' width='5'>
 		</td>
 	</tr>
 </table>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2647] trunk/archive-access/projects/wayback/ dist/src/site/xdoc

From: <bra...@us...> - 2008-12-05 22:24:46

Revision: 2647
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2647&view=rev
Author:   bradtofel
Date:     2008-12-05 22:24:43 +0000 (Fri, 05 Dec 2008)

Log Message:
-----------
oops. forgot to commit site updates within the 1.4.1 branch... this is deployed ad-hoc anyways at the moment, so we'll leave it committed here under 1.5.0.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml
    trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml

Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml
===================================================================
--- trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml	2008-12-05 22:21:42 UTC (rev 2646)
+++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml	2008-12-05 22:24:43 UTC (rev 2647)
@@ -74,6 +74,16 @@
         </p>
     </section>
     <section name="News">
+        <subsection name="Maintenance Release - 1.4.1, 11/10/2008">
+          <p>
+            Release 1.4.1 fixes several problems discovered in the 1.4.0 
+            release, and most notably disables by default the AnchorDate and
+            AnchorWindow features which generated some confusion. Please
+            see the <a href="release_notes.html">release notes</a> for
+            a detailed list of changes.
+          </p>
+        </subsection>
+
         <subsection name="New Release - 1.4.0, 8/20/2008">
           <p>
             Release 1.4.0 has several new features, as well as several 

Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml
===================================================================
--- trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml	2008-12-05 22:21:42 UTC (rev 2646)
+++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml	2008-12-05 22:24:43 UTC (rev 2647)
@@ -14,6 +14,61 @@
         to release 1.2.0.
       </p>
     </section>
+    <section name="Release 1.4.1">
+      <subsection name="Features">
+        <ul>
+          <li>
+            Index filter which allows including/excluding records based on HTTP 
+            response code field. (<i>ACC-43</i>)
+          </li>
+          <li>
+            Outputs log message instead of stack dump when failing to access
+            a Resource.
+          </li>
+        </ul>
+      </subsection>
+      <subsection name="Bug Fixes">
+        <ul>
+          <li>
+            Some redirect records were not being located in index due to bad
+            logic in Duplicate record filter. (<i>ACC-30</i>)
+          </li>
+          <li>
+            Wayback was not throwing a NotInArchiveException when 
+            Self-Redirect replay filter removes all records. (unreported)
+          </li>
+          <li>
+            Location HTTP header values were not being escaped before
+            placing in CDX, causing some records to have too many columns.
+            (<i>ACC-31</i>)
+          </li>
+          <li>
+            Search Result summary counts were incorrect in Url Prefix
+            searches. (<i>ACC-33</i>)
+          </li>
+          <li>
+            Implemented NoCache.jsp, a replay insert which adds a 
+            <b>Cache-Control: no-cache</b> HTTP header to all replayed
+            documents. (<i>ACC-34</i>)
+          </li>
+          <li>
+            Timeline.jsp was using Request Date, not Capture date, which
+            caused Proxy Mode Timeline to show the wrong date.
+            (<i>ACC-36</i>)
+          </li>
+          <li>
+            Advanced Search reference implementation .jsp was broken.
+            (<i>ACC-37</i>)
+          </li>
+          <li>
+            AnchorDate and AnchorWindow functionality is now disabled by
+            default, and can be enabled via configuration on an AccessPoint.
+            (<i>ACC-46</i>)
+          </li>
+        </ul>
+      </subsection>
+    </section>
+
     <section name="Release 1.4.0">
       <subsection name="Features">
         <ul>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2646] trunk/archive-access/projects/wayback/ dist/pom.xml

From: <bra...@us...> - 2008-12-05 22:21:45

Revision: 2646
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2646&view=rev
Author:   bradtofel
Date:     2008-12-05 22:21:42 +0000 (Fri, 05 Dec 2008)

Log Message:
-----------
added javax.servlet dependency for wayback-core

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/dist/pom.xml

Modified: trunk/archive-access/projects/wayback/dist/pom.xml
===================================================================
--- trunk/archive-access/projects/wayback/dist/pom.xml	2008-12-05 22:20:33 UTC (rev 2645)
+++ trunk/archive-access/projects/wayback/dist/pom.xml	2008-12-05 22:21:42 UTC (rev 2646)
@@ -62,6 +62,11 @@
       <artifactId>wayback-mapreduce</artifactId>
       <version>1.5.0-SNAPSHOT</version>
     </dependency>
+    <dependency>
+    	<groupId>javax.servlet</groupId>
+    	<artifactId>servlet-api</artifactId>
+    	<version>2.5</version>
+    </dependency>
   </dependencies>
     
   <build>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2645] trunk/archive-access/projects/wayback/ wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java

From: <bra...@us...> - 2008-12-05 22:20:38

Revision: 2645
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2645&view=rev
Author:   bradtofel
Date:     2008-12-05 22:20:33 +0000 (Fri, 05 Dec 2008)

Log Message:
-----------
Fixed bad tests of style rewrites. We were only testing one of the two CSS rewrite ops at a time, but both are applied within HTML pages, which caused some URLs to be rewritten twice -- and these tests didn't catch that. The tests now apply both rewrites, which works since URIConverter ops are now idempotent.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java	2008-12-05 22:17:55 UTC (rev 2644)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java	2008-12-05 22:20:33 UTC (rev 2645)
@@ -423,6 +423,13 @@
 //				"<table style=\"bg: url(\\\"http://w.a.org/wb/2004/http://f.au/css/b.gif\\\"); fg: url(\\\"http://w.a.org/wb/2004/http://f.au/css/f.gif\\\");\"></table>",
 //				"http://w.a.org/wb/","2004","http://f.au/");
 		
+		
+		checkStyleUrlMarkup("<td style=\"b-i:url(i/b.jpg);\n\"></td>",
+				"<td style=\"b-i:url(http://w.a.org/wb/2004/http://f.au/i/b.jpg);\n\"></td>",
+				"http://w.a.org/wb/","2004","http://f.au/");
+		
+//		    "<td style=\"background-image:url(images/banner.jpg);\n\"></td>"
+
 	
 	}
 	
@@ -449,11 +456,12 @@
 		ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter();
 		uriC.setReplayURIPrefix(prefix);
 		TagMagix.markupCSSImports(buf, uriC, ts, url);
+		TagMagix.markupStyleUrls(buf,uriC,ts,url);
 		String marked = buf.toString();
 		assertEquals(want,marked);
 	}
 	
-	private void checkStyleUrlMarkup(String orig, String want, String prefix, String ts, String url) {
+	private void checkStyleOnlyUrlMarkup(String orig, String want, String prefix, String ts, String url) {
 		StringBuilder buf = new StringBuilder(orig);
 		ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter();
 		uriC.setReplayURIPrefix(prefix);
@@ -461,6 +469,16 @@
 		String marked = buf.toString();
 		assertEquals(want,marked);
 	}
+
+	private void checkStyleUrlMarkup(String orig, String want,String prefix, String ts, String url) {
+		StringBuilder buf = new StringBuilder(orig);
+		ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter();
+		uriC.setReplayURIPrefix(prefix);
+		TagMagix.markupCSSImports(buf, uriC, ts, url);
+		TagMagix.markupStyleUrls(buf, uriC, ts, url);
+		String marked = buf.toString();
+		assertEquals(want,marked);
+	}
 	
 	private void checkMarkup(String orig, String want, String tag, String attr, String prefix, String ts, String url) {
 		StringBuilder buf = new StringBuilder(orig);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2644] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback

From: <bra...@us...> - 2008-12-05 22:17:59

Revision: 2644
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2644&view=rev
Author:   bradtofel
Date:     2008-12-05 22:17:55 +0000 (Fri, 05 Dec 2008)

Log Message:
-----------
ACC-53: now we check within the URI converters to (try to) ensure a URL is rewritten at most once, making the rewrite an idempotent operation.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixResultURIConverter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverter.java	2008-12-05 22:13:14 UTC (rev 2643)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverter.java	2008-12-05 22:17:55 UTC (rev 2644)
@@ -42,10 +42,14 @@
 	 * @see org.archive.wayback.ResultURIConverter#makeReplayURI(java.lang.String, java.lang.String)
 	 */
 	public String makeReplayURI(String datespec, String url) {
+		String suffix = datespec + "/" + url;
 		if(replayURIPrefix == null) {
-			return datespec + "/" + url;
+			return suffix;
 		} else {
-			return replayURIPrefix + datespec + "/" + url;
+			if(url.startsWith(replayURIPrefix)) {
+				return url;
+			}
+			return replayURIPrefix + suffix;
 		}
 	}
 

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixResultURIConverter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixResultURIConverter.java	2008-12-05 22:13:14 UTC (rev 2643)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixResultURIConverter.java	2008-12-05 22:17:55 UTC (rev 2644)
@@ -45,6 +45,9 @@
 	public String makeReplayURI(String datespec, String url) {
 		String replayURI = "";
 		try {
+			if(url.contains(hostPort)) {
+				return url;
+			}
 			URI uri = new URI(url);
 			StringBuilder sb = new StringBuilder(90);
 			sb.append("http://");


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2643] trunk/archive-access/projects/wayback

From: <bra...@us...> - 2008-12-05 22:13:17

Revision: 2643
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2643&view=rev
Author:   bradtofel
Date:     2008-12-05 22:13:14 +0000 (Fri, 05 Dec 2008)

Log Message:
-----------
ACC-35: Now has optional prefix, which via config is defaulted to "X-Archive-Orig-", which is prepended to all original HTTP headers, to clarify what headers are added by Wayback, and what were presented with the original document.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderProcessor.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/RedirectRewritingHttpHeaderProcessor.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/XArchiveHttpHeaderProcessor.java
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderProcessor.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderProcessor.java	2008-11-07 22:45:08 UTC (rev 2642)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderProcessor.java	2008-12-05 22:13:14 UTC (rev 2643)
@@ -49,6 +49,10 @@
 	public final static String HTTP_CONTENT_BASE_HEADER_UP = 
 		HTTP_CONTENT_BASE_HEADER.toUpperCase();
 
+	public final static String HTTP_CONTENT_LOCATION_HEADER = "Content-Location";
+	public final static String HTTP_CONTENT_LOCATION_HEADER_UP = 
+		HTTP_CONTENT_LOCATION_HEADER.toUpperCase();
+
 	public final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
 	public final static String HTTP_CONTENT_TYPE_HEADER_UP = 
 		HTTP_CONTENT_TYPE_HEADER.toUpperCase();

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/RedirectRewritingHttpHeaderProcessor.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/RedirectRewritingHttpHeaderProcessor.java	2008-11-07 22:45:08 UTC (rev 2642)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/RedirectRewritingHttpHeaderProcessor.java	2008-12-05 22:13:14 UTC (rev 2643)
@@ -39,6 +39,18 @@
 public class RedirectRewritingHttpHeaderProcessor 
 	implements HttpHeaderProcessor {
 
+	private static String DEFAULT_PREFIX = null;
+	private String prefix = DEFAULT_PREFIX; 
+
+	public String getPrefix() {
+		return prefix;
+	}
+
+	public void setPrefix(String prefix) {
+		this.prefix = prefix;
+	}
+
+
 	/* (non-Javadoc)
 	 * @see org.archive.wayback.replay.HttpHeaderProcessor#filter(java.util.Map, java.lang.String, java.lang.String, org.archive.wayback.ResultURIConverter, org.archive.wayback.core.CaptureSearchResult)
 	 */
@@ -47,9 +59,18 @@
 
 		String keyUp = key.toUpperCase();
 
+		// first stick it in as-is, or with prefix, then maybe we'll overwrite
+		// with the later logic.
+		if(prefix == null) {
+			output.put(key, value);
+		} else {
+			output.put(prefix + key, value);
+		}
+
 		// rewrite Location header URLs
 		if (keyUp.startsWith(HTTP_LOCATION_HEADER_UP) ||
-				keyUp.startsWith(HTTP_CONTENT_BASE_HEADER_UP)) {
+			keyUp.startsWith(HTTP_CONTENT_LOCATION_HEADER_UP) ||
+			keyUp.startsWith(HTTP_CONTENT_BASE_HEADER_UP)) {
 
 			String baseUrl = result.getOriginalUrl();
 			String cd = result.getCaptureTimestamp();
@@ -57,13 +78,10 @@
 			String u = UrlOperations.resolveUrl(baseUrl, value);
 
 			output.put(key, uriConverter.makeReplayURI(cd,u));
-//		} else if(keyUp.startsWith(HTTP_CONTENT_TYPE_HEADER_UP)) {
-//			output.put("X-Wayback-Orig-" + key,value);
-//			output.put(key,value);
-		} else {
-			// others go out as-is:
 
-			output.put(key, value);
+		} else if(keyUp.startsWith(HTTP_CONTENT_TYPE_HEADER_UP)) {
+			// let's leave this one as-is:
+			output.put(key,value);
 		}
 	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/XArchiveHttpHeaderProcessor.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/XArchiveHttpHeaderProcessor.java	2008-11-07 22:45:08 UTC (rev 2642)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/XArchiveHttpHeaderProcessor.java	2008-12-05 22:13:14 UTC (rev 2643)
@@ -22,13 +22,10 @@
 			ResultURIConverter uriConverter, CaptureSearchResult result) {
 		String keyUp = key.toUpperCase();
 
-		// rewrite Location header URLs
+		output.put(prefix + key,value);
 		if (keyUp.startsWith(HTTP_CONTENT_TYPE_HEADER_UP)) {
-			// let's leave this one alone... seems important.
+			// add this one as-is, too.
 			output.put(key, value);
-		} else {
-			// others go out with prefix:
-			output.put(prefix + key,value);
 		}
 	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml	2008-11-07 22:45:08 UTC (rev 2642)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml	2008-12-05 22:13:14 UTC (rev 2643)
@@ -4,7 +4,9 @@
        xsi:schemaLocation="http://www.springframework.org/schema/beans
            http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
 
-  <bean id="archivalurlhttpheaderprocessor" class="org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor" />
+  <bean id="archivalurlhttpheaderprocessor" class="org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor">
+    <property name="prefix" value="X-Archive-Orig-" />
+  </bean>
 
   <bean id="archivaldateredirectingreplayrenderer" class="org.archive.wayback.replay.DateRedirectReplayRenderer" />
   <bean id="archivalcssreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlCSSReplayRenderer">


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2642] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/ LocalResourceIndex.java

From: <bra...@us...> - 2008-11-07 22:45:11

Revision: 2642
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2642&view=rev
Author:   bradtofel
Date:     2008-11-07 22:45:08 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
FEATURE: now adds a SchemeMatchFilter if WaybackRequest specifies it is needed.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java	2008-11-07 22:41:50 UTC (rev 2641)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java	2008-11-07 22:45:08 UTC (rev 2642)
@@ -51,6 +51,7 @@
 import org.archive.wayback.resourceindex.filters.EndDateFilter;
 import org.archive.wayback.resourceindex.filters.GuardRailFilter;
 import org.archive.wayback.resourceindex.filters.HostMatchFilter;
+import org.archive.wayback.resourceindex.filters.SchemeMatchFilter;
 import org.archive.wayback.resourceindex.filters.SelfRedirectFilter;
 import org.archive.wayback.resourceindex.filters.UrlMatchFilter;
 import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter;
@@ -63,6 +64,7 @@
 import org.archive.wayback.util.ObjectFilterIterator;
 import org.archive.wayback.util.Timestamp;
 import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+import org.archive.wayback.util.url.UrlOperations;
 
 /**
  *
@@ -378,6 +380,10 @@
 				filter.addFilter(exactHost);
 			}
 
+			if(request.isExactScheme()) {
+				filter.addFilter(new SchemeMatchFilter(
+						UrlOperations.urlToScheme(request.getRequestUrl())));
+			}
 			// count how many results got to the ExclusionFilter:
 			filter.addFilter(preExclusionCounter);
 
@@ -417,6 +423,7 @@
 			}
 		}
 	}
+
 	private static HostMatchFilter getExactHostFilter(WaybackRequest r) { 
 
 		HostMatchFilter filter = null;


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2641] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/ CDXLineToSearchResultAdapter.java

From: <bra...@us...> - 2008-11-07 22:41:54

Revision: 2641
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2641&view=rev
Author:   bradtofel
Date:     2008-11-07 22:41:50 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
TWEAK: removed code which looked like it did something but had wrong signature, so was unused and confusing.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java	2008-11-07 22:40:02 UTC (rev 2640)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java	2008-11-07 22:41:50 UTC (rev 2641)
@@ -60,20 +60,6 @@
 		}
 	}
 
-	/* (non-Javadoc)
-	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
-	 */
-	public CaptureSearchResult adapt(CaptureSearchResult o) {
-		String urlKey = o.getUrlKey();
-		StringBuilder sb = new StringBuilder(urlKey.length());
-		sb.append(DEFAULT_SCHEME);
-		sb.append(o.getOriginalUrl());
-		sb.append(urlKey.substring(getEndOfHostIndex(urlKey)));
-		o.setOriginalUrl(sb.toString());
-		return o;
-	}
-	
-	
 	public CaptureSearchResult adapt(String line) {
 		return doAdapt(line);
 	}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2640] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ SchemeMatchFilter.java

From: <bra...@us...> - 2008-11-07 22:40:06

Revision: 2640
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2640&view=rev
Author:   bradtofel
Date:     2008-11-07 22:40:02 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
INITIAL REV: CaptureSearchResult ObjectFilter that only includes results matching the specified scheme.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java	2008-11-07 22:40:02 UTC (rev 2640)
@@ -0,0 +1,60 @@
+/* SchemeMatchFilter
+ *
+ * $Id$
+ *
+ * Created on 6:40:02 PM Nov 6, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.resourceindex.filters;
+
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.util.ObjectFilter;
+import org.archive.wayback.util.url.UrlOperations;
+
+/**
+ * ObjectFilter which omits CaptureSearchResult objects if their scheme does not
+ * match the specified scheme.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+
+public class SchemeMatchFilter implements ObjectFilter<CaptureSearchResult> {
+
+	private String scheme = null;
+	
+	/**
+	 * @param scheme String scheme prefix to match (e.g. "http://"), may be null
+	 */
+	public SchemeMatchFilter(final String scheme) {
+		this.scheme = scheme;
+	}
+
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object)
+	 */
+	public int filterObject(CaptureSearchResult r) {
+		String captureScheme = UrlOperations.urlToScheme(r.getOriginalUrl());
+		if(scheme == null) {
+			return captureScheme == null ? FILTER_INCLUDE : FILTER_EXCLUDE;
+		}
+		return scheme.equals(captureScheme) ? FILTER_INCLUDE : FILTER_EXCLUDE;
+	}
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2639] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer

From: <bra...@us...> - 2008-11-07 22:38:52

Revision: 2639
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2639&view=rev
Author:   bradtofel
Date:     2008-11-07 22:38:48 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
FEATURE: added -all option to warc-indexer command line tool, causing the tool to output records for request and metadata records as well as duplicate, capture, and dns records.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java	2008-11-07 22:35:24 UTC (rev 2638)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java	2008-11-07 22:38:48 UTC (rev 2639)
@@ -2,7 +2,7 @@
 
 import java.io.File;
 import java.io.IOException;
-//import java.util.logging.Logger;
+import java.util.logging.Logger;
 
 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HttpParser;
@@ -33,14 +33,23 @@
  */
 public class WARCRecordToSearchResultAdapter
 implements Adapter<WARCRecord,CaptureSearchResult>{
+	private static final Logger LOGGER =
+        Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName());
 	
 	private final static String DEFAULT_VALUE = "-"; 
 
-//	private static final Logger LOGGER = Logger.getLogger(
-//			WARCRecordToSearchResultAdapter.class.getName());
-
 	private UrlCanonicalizer canonicalizer = null;
+	
+	private boolean processAll = false;
 
+	public boolean isProcessAll() {
+		return processAll;
+	}
+
+	public void setProcessAll(boolean processAll) {
+		this.processAll = processAll;
+	}
+
 	public WARCRecordToSearchResultAdapter() {
 		canonicalizer = new AggressiveUrlCanonicalizer();
 	}
@@ -75,12 +84,19 @@
 		return output.toString();
 	}
 	
-	private static String transformHTTPMime(final String input) {
+	private static String escapeSpaces(final String input) {
+		if(input.contains(" ")) {
+			return input.replace(" ", "%20");
+		}
+		return input;
+	}
+	
+	private static String transformHTTPMime(String input) {
 		int semiIdx = input.indexOf(";");
 		if(semiIdx > 0) {
-			return input.substring(0,semiIdx).trim();
+			return escapeSpaces(input.substring(0,semiIdx).trim());
 		}
-		return input.trim();
+		return escapeSpaces(input.trim());
 	}
 
 	private String transformWarcFilename(String readerIdentifier) {
@@ -148,16 +164,21 @@
 		return result;
 	}
 
-	private CaptureSearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) 
+	private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header,
+			WARCRecord rec, String mime) 
 	throws IOException {
 
 		CaptureSearchResult result = getBlankSearchResult();
 
 		result.setCaptureTimestamp(transformDate(header.getDate()));
+		result.setFile(transformWarcFilename(header.getReaderIdentifier()));
+		result.setOffset(header.getOffset());
 		result.setDigest(transformDigest(header.getHeaderValue(
-						WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
+				WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
 		
 		addUrlDataToSearchResult(result,header.getUrl());
+		
+		result.setMimeType(mime);
 
 		return result;
 	}
@@ -243,7 +264,7 @@
 		}
 		return result;
 	}
-	
+
 	private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
 		
 		CaptureSearchResult result = null;
@@ -257,7 +278,17 @@
 				result = adaptResponse(header,rec);
 			}
 		} else if(type.equals(WARCConstants.REVISIT)) {
-			result = adaptRevisit(header,rec);
+			result = adaptGeneric(header,rec,"warc/revisit");
+		} else if(type.equals(WARCConstants.REQUEST)) {
+			if(processAll) {
+				result = adaptGeneric(header,rec,"warc/request");
+			}
+		} else if(type.equals(WARCConstants.METADATA)) {
+			if(processAll) {
+				result = adaptGeneric(header,rec,"warc/metadata");
+			}
+		} else {
+			LOGGER.info("Skipping record type : " + type);
 		}
 
 		return result;

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java	2008-11-07 22:35:24 UTC (rev 2638)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java	2008-11-07 22:38:48 UTC (rev 2639)
@@ -26,9 +26,19 @@
 	public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
 
 	private UrlCanonicalizer canonicalizer = null;
+	private boolean processAll = false;
 	public WarcIndexer() {
 		canonicalizer = new AggressiveUrlCanonicalizer();
 	}
+
+	public boolean isProcessAll() {
+		return processAll;
+	}
+
+	public void setProcessAll(boolean processAll) {
+		this.processAll = processAll;
+	}
+
 	
 	/**
 	 * @param warc
@@ -61,6 +71,7 @@
 		WARCRecordToSearchResultAdapter adapter2 = 
 			new WARCRecordToSearchResultAdapter();
 		adapter2.setCanonicalizer(canonicalizer);
+		adapter2.setProcessAll(processAll);
 
 		ArchiveReaderCloseableIterator itr1 = 
 			new ArchiveReaderCloseableIterator(reader,reader.iterator());
@@ -82,11 +93,12 @@
 	private static void USAGE() {
 		System.err.println("USAGE:");
 		System.err.println("");
-		System.err.println("warc-indexer [-identity] WARCFILE");
-		System.err.println("warc-indexer [-identity] WARCFILE CDXFILE");
+		System.err.println("warc-indexer [-identity] [-all] WARCFILE");
+		System.err.println("warc-indexer [-identity] [-all] WARCFILE CDXFILE");
 		System.err.println("");
 		System.err.println("Create a CDX format index at CDXFILE or to STDOUT");
 		System.err.println("With -identity, perform no url canonicalization.");
+		System.err.println("With -all, output request and metadata records.");
 		System.exit(1);
 	}
 
@@ -96,8 +108,14 @@
 	public static void main(String[] args) {
 		WarcIndexer indexer = new WarcIndexer();
 		int idx = 0;
-		if(args[0] != null && args[0].equals("-identity")) {
-			indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
+		while(args[idx] != null) {
+			if(args[idx].equals("-identity")) {
+				indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
+			} else if(args[idx].equals("-all")) {
+				indexer.setProcessAll(true);
+			} else {
+				break;
+			}
 			idx++;
 		}
 		File arc = new File(args[idx]);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2638] trunk/archive-access/projects/wayback/ wayback-core/src

From: <bra...@us...> - 2008-11-07 22:35:28

Revision: 2638
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2638&view=rev
Author:   bradtofel
Date:     2008-11-07 22:35:24 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
FEATURE: Now supports canonicalization of some non-http:// schemes.
TWEAK: removed unused commented out code

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java	2008-11-07 22:34:00 UTC (rev 2637)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java	2008-11-07 22:35:24 UTC (rev 2638)
@@ -206,25 +206,32 @@
 			return urlString;
 		}
 		String searchUrl = canonicalize(urlString);
-
-		// TODO: force https into http for the moment...
-		if(searchUrl.startsWith("https://")) {
-			searchUrl = searchUrl.substring(8);
+		String scheme = UrlOperations.urlToScheme(searchUrl);
+		if(scheme != null) {
+			searchUrl = searchUrl.substring(scheme.length());
+		} else {
+			scheme = UrlOperations.HTTP_SCHEME;
 		}
-		
-		// TODO: this will only work with http:// scheme. should work with all?
-		// force add of scheme and possible add '/' with empty path:
-		if (searchUrl.startsWith("http://")) {
-			if (-1 == searchUrl.indexOf('/', 8)) {
-				searchUrl = searchUrl + "/";
-			}
+	
+		if (-1 == searchUrl.indexOf("/")) {
+			searchUrl = scheme + searchUrl + "/";
 		} else {
-			if (-1 == searchUrl.indexOf("/")) {
-				searchUrl = searchUrl + "/";
-			}
-			searchUrl = "http://" + searchUrl;
+			searchUrl = scheme + searchUrl;
 		}
 
+		// TODO: this will only work with http:// scheme. should work with all?
+		// force add of scheme and possible add '/' with empty path:
+//		if (searchUrl.startsWith("http://")) {
+//			if (-1 == searchUrl.indexOf('/', 8)) {
+//				searchUrl = searchUrl + "/";
+//			}
+//		} else {
+//			if (-1 == searchUrl.indexOf("/")) {
+//				searchUrl = searchUrl + "/";
+//			}
+//			searchUrl = "http://" + searchUrl;
+//		}
+
 		// TODO: These next few lines look crazy -- need to be reworked.. This
 		// was the only easy way I could find to get the correct unescaping
 		// out of UURIs, possible a bug. Definitely needs some TLC in any case,
@@ -250,23 +257,18 @@
 //		if((newPath.length() > 1) && newPath.endsWith("/")) {
 //			newPath = newPath.substring(0,newPath.length()-1);
 //		}
-//		searchURI.setEscapedPath(newPath);
-//		searchURI.setRawPath(newPath.toCharArray());
-//		String query = searchURI.getEscapedQuery();
 		
-		// TODO: handle non HTTP port stripping, too.
-//		String portStr = "";
-//		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
-//			portStr = ":" + searchURI.getPort();
-//		}
-//		return searchURI.getHostBasename() + portStr + 
-//		searchURI.getEscapedPathQuery();
-		
 		StringBuilder sb = new StringBuilder(searchUrl.length());
 		sb.append(searchURI.getHostBasename());
-		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
+
+		// omit port if scheme default:
+		int defaultSchemePort = UrlOperations.schemeToDefaultPort(scheme);
+		if(searchURI.getPort() != defaultSchemePort 
+				&& searchURI.getPort() != -1) {
+
 			sb.append(":").append(searchURI.getPort());
 		}
+
 		sb.append(newPath);
 		if(searchURI.getEscapedQuery() != null) {
 			sb.append("?").append(searchURI.getEscapedQuery());

Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java	2008-11-07 22:34:00 UTC (rev 2637)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java	2008-11-07 22:35:24 UTC (rev 2638)
@@ -45,16 +45,15 @@
 		// simple strip of http://
 		checkCanonicalization("http://foo.com/","foo.com/");
 
-// would be nice to handle other protocols...
-//		// simple strip of https://
-//		checkCanonicalization("https://foo.com/","foo.com/");
-//
-//		// simple strip of ftp://
-//		checkCanonicalization("ftp://foo.com/","foo.com/");
-//
-//		// simple strip of rtsp://
-//		checkCanonicalization("rtsp://foo.com/","foo.com/");
+		// simple strip of https://
+		checkCanonicalization("https://foo.com/","foo.com/");
 
+		// simple strip of ftp://
+		checkCanonicalization("ftp://foo.com/","foo.com/");
+
+		// simple strip of rtsp://
+		checkCanonicalization("rtsp://foo.com/","foo.com/");
+
 		// strip leading 'www.'
 		checkCanonicalization("http://www.foo.com/","foo.com/");
 		
@@ -63,6 +62,9 @@
 		
 		// strip leading 'www##.'
 		checkCanonicalization("http://www12.foo.com/","foo.com/");
+
+		// strip leading 'www##.' with https
+		checkCanonicalization("https://www12.foo.com/","foo.com/");
 		
 		// strip leading 'www##.' with no protocol
 		checkCanonicalization("www12.foo.com/","foo.com/");
@@ -174,13 +176,53 @@
 		checkCanonicalization(
 				"http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules",
 				"legislature.mi.gov/mileg.aspx?page=sessionschedules");
+
+
+
+
+		// default port stripping:
 		
+		// FIRST the easy-on-the-eyes
+
 		// strip port 80
 		checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo");
 
 		// but not other ports...
 		checkCanonicalization("http://www.chub.org:8080/foo","chub.org:8080/foo");
+		
+		// but not other ports... with "www#." massage
+		checkCanonicalization("http://www232.chub.org:8080/foo","chub.org:8080/foo");
 
+		// default HTTP (:80) stripping without a scheme:
+		checkCanonicalization("www.chub.org:80/foo","chub.org/foo");
+		
+		// no strip https port (443) without scheme:
+		checkCanonicalization("www.chub.org:443/foo","chub.org:443/foo");
+
+		// yes strip https port (443) with scheme:
+		checkCanonicalization("https://www.chub.org:443/foo","chub.org/foo");
+		
+		// NEXT the exhaustive:
+		String origHost = "www.chub.org";
+		String massagedHost = "chub.org";
+		String path = "/foo";
+		for(String scheme : UrlOperations.ALL_SCHEMES) {
+
+			int defaultPort = UrlOperations.schemeToDefaultPort(scheme);
+			int nonDefaultPort = 19991;
+
+			String origDefault = scheme + origHost + ":" + defaultPort + path;
+			String canonDefault = massagedHost + path;
+
+			String origNonDefault = 
+				scheme + origHost + ":" + nonDefaultPort + path;
+			String canonNonDefault =
+				massagedHost + ":" + nonDefaultPort + path;
+
+			checkCanonicalization(origDefault,canonDefault);
+			checkCanonicalization(origNonDefault,canonNonDefault);
+		}
+
 	}
 	
 	private void checkCanonicalization(String orig, String want) {


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2637] trunk/archive-access/projects/wayback/ wayback-core/src

From: <bra...@us...> - 2008-11-07 22:34:04

Revision: 2637
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2637&view=rev
Author:   bradtofel
Date:     2008-11-07 22:34:00 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
FEATURE: added static methods urlToScheme() and schemeToDefaultPort()

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java	2008-11-07 22:31:42 UTC (rev 2636)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java	2008-11-07 22:34:00 UTC (rev 2637)
@@ -81,15 +81,16 @@
 	 * @return url resolved against baseUrl, unless it is absolute already
 	 */
 	public static String resolveUrl(String baseUrl, String url) {
-		// TODO: this only works for http://
-		if(url.startsWith("http://")) {
-			try {
-				return UURIFactory.getInstance(url).getEscapedURI();
-			} catch (URIException e) {
-				e.printStackTrace();
-				// can't let a space exist... send back close to whatever came
-				// in...
-				return url.replace(" ", "%20");
+		for(final String scheme : ALL_SCHEMES) {
+			if(url.startsWith(scheme)) {
+				try {
+					return UURIFactory.getInstance(url).getEscapedURI();
+				} catch (URIException e) {
+					e.printStackTrace();
+					// can't let a space exist... send back close to whatever came
+					// in...
+					return url.replace(" ", "%20");
+				}
 			}
 		}
 		UURI absBaseURI;
@@ -99,11 +100,39 @@
 			resolvedURI = UURIFactory.getInstance(absBaseURI, url);
 		} catch (URIException e) {
 			e.printStackTrace();
-			return url;
+			return url.replace(" ", "%20");
 		}
 		return resolvedURI.getEscapedURI();
 	}
 	
+	public static String urlToScheme(final String url) {
+		for(final String scheme : ALL_SCHEMES) {
+			if(url.startsWith(scheme)) {
+				return scheme;
+			}
+		}
+		return null;
+	}
+	
+	public static int schemeToDefaultPort(final String scheme) {
+		if(scheme.equals(HTTP_SCHEME)) {
+			return 80;
+		}
+		if(scheme.equals(HTTPS_SCHEME)) {
+			return 443;
+		}
+		if(scheme.equals(FTP_SCHEME)) {
+			return 21;
+		}
+		if(scheme.equals(RTSP_SCHEME)) {
+			return 554;
+		}
+		if(scheme.equals(MMS_SCHEME)) {
+			return 1755;
+		}
+		return -1;
+	}
+	
 	public static String urlToHost(String url) {
 		if(url.startsWith("dns:")) {
 			return url.substring(4);

Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java	2008-11-07 22:31:42 UTC (rev 2636)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java	2008-11-07 22:34:00 UTC (rev 2637)
@@ -62,7 +62,33 @@
 		assertEquals("foo.com",UrlOperations.urlToHost("http://foo.com/path:/"));
 		assertEquals("foo.com",UrlOperations.urlToHost("https://foo.com/path:/"));
 		assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com/path:/"));
+	}
+
+	public void testResolveUrl() {
+		for(String scheme : UrlOperations.ALL_SCHEMES) {
+
+			assertEquals(scheme + "a.org/1/2",
+				UrlOperations.resolveUrl(scheme + "a.org/3/","/1/2"));
+
+			assertEquals(scheme + "b.org/1/2",
+				UrlOperations.resolveUrl(scheme + "a.org/3/",
+						scheme + "b.org/1/2"));
+
+			assertEquals(scheme + "a.org/3/1/2",
+				UrlOperations.resolveUrl(scheme + "a.org/3/","1/2"));
+
+			assertEquals(scheme + "a.org/1/2",
+				UrlOperations.resolveUrl(scheme + "a.org/3","1/2"));
 		
+		}
 		
 	}
+	public void testUrlToScheme() {
+		assertEquals("http://",UrlOperations.urlToScheme("http://a.com/"));
+		assertEquals("https://",UrlOperations.urlToScheme("https://a.com/"));
+		assertEquals("ftp://",UrlOperations.urlToScheme("ftp://a.com/"));
+		assertEquals("rtsp://",UrlOperations.urlToScheme("rtsp://a.com/"));
+		assertEquals("mms://",UrlOperations.urlToScheme("mms://a.com/"));
+		assertNull(UrlOperations.urlToScheme("blah://a.com/"));
+	}	
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2636] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java

From: <bra...@us...> - 2008-11-07 22:31:45

Revision: 2636
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2636&view=rev
Author:   bradtofel
Date:     2008-11-07 22:31:42 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
FEATURE: added exactSchemeMatch configuration, which for now is set on all requests within this AccessPoint.
TWEAK: removed some code that had been commented out and is no longer used/needed.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java	2008-11-07 22:29:45 UTC (rev 2635)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java	2008-11-07 22:31:42 UTC (rev 2636)
@@ -80,6 +80,7 @@
 	
 	private boolean useServerName = false;
 	private boolean useAnchorWindow = false;
+	private boolean exactSchemeMatch = true;
 
 	private int contextPort = 0;
 	private String contextName = null;
@@ -217,11 +218,6 @@
 			prefix.append(":").append(waybackPort);
 		}
 		String contextPath = getContextPath(httpRequest);
-//		if(contextPath.length() > 1) {
-//			prefix.append(contextPath);
-//		} else {
-//			prefix.append(contextPath);
-//		}
 		prefix.append(contextPath);
 		return prefix.toString();
 	}
@@ -264,19 +260,6 @@
 		} catch(IOException e) {
 			// TODO: figure out if we got IO because of a missing dispatcher
 		}
-//		uiResults.storeInRequest(httpRequest,translated);
-//		RequestDispatcher dispatcher = null;
-//		// special case for the front '/' page:
-//		if(translated.length() == 0) {
-//			translated = "/";
-//		} else {
-//			translated = "/" + translated;
-//		}
-//		dispatcher = httpRequest.getRequestDispatcher(translated);
-//		if(dispatcher != null) {
-//			dispatcher.forward(httpRequest, httpResponse);
-//			return true;
-//		}
 		return false;
 	}
 	
@@ -299,9 +282,13 @@
 
 			if(wbRequest != null) {
 				handled = true;
+
+				// TODO: refactor this code into RequestParser implementations
 				wbRequest.setAccessPoint(this);
 				wbRequest.setContextPrefix(getAbsoluteLocalPrefix(httpRequest));
 				wbRequest.fixup(httpRequest);
+				// end of refactor
+
 				if(authentication != null) {
 					if(!authentication.isTrue(wbRequest)) {
 						throw new AuthenticationControlException("Not authorized");
@@ -311,6 +298,12 @@
 				if(exclusionFactory != null) {
 					wbRequest.setExclusionFilter(exclusionFactory.get());
 				}
+				// TODO: refactor this into RequestParser implementations, so a
+				// user could alter requests to change the behavior within a
+				// single AccessPoint. For now, this is a simple way to expose
+				// the feature to configuration.
+				wbRequest.setExactScheme(exactSchemeMatch);
+
 				if(wbRequest.isReplayRequest()) {
 
 					handleReplay(wbRequest,httpRequest,httpResponse);
@@ -488,7 +481,21 @@
 	public void setUseAnchorWindow(boolean useAnchorWindow) {
 		this.useAnchorWindow = useAnchorWindow;
 	}
+	
+	/**
+	 * @return the exactSchemeMatch
+	 */
+	public boolean isExactSchemeMatch() {
+		return exactSchemeMatch;
+	}
 
+	/**
+	 * @param exactSchemeMatch the exactSchemeMatch to set
+	 */
+	public void setExactSchemeMatch(boolean exactSchemeMatch) {
+		this.exactSchemeMatch = exactSchemeMatch;
+	}
+
 	public ExclusionFilterFactory getExclusionFactory() {
 		return exclusionFactory;
 	}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2635] trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java

From: <bra...@us...> - 2008-11-07 22:29:48

Revision: 2635
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2635&view=rev
Author:   bradtofel
Date:     2008-11-07 22:29:45 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
FEATURE: now supports more schemes within setRequestUrl() - any defined within org.archive.wayback.util.url.UrlOperations
FEATURE: now supports boolean exactScheme flag to indicate that user wishes only to match records with the same scheme as the requestUrl
TWEAK: removed some code that had been commented out and is no longer used/needed.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java	2008-11-07 02:11:09 UTC (rev 2634)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java	2008-11-07 22:29:45 UTC (rev 2635)
@@ -39,6 +39,7 @@
 import org.archive.wayback.util.ObjectFilter;
 import org.archive.wayback.util.StringFormatter;
 import org.archive.wayback.util.Timestamp;
+import org.archive.wayback.util.url.UrlOperations;
 import org.archive.wayback.webapp.AccessPoint;
 
 /**
@@ -186,6 +187,12 @@
 	public static final String REQUEST_EXACT_HOST_ONLY = "requestexacthost";
 
 	/**
+	 * Indicates user only wants results that were captured using the same 
+	 * scheme as that specified in REQUEST_URL.
+	 */
+	public static final String REQUEST_EXACT_SCHEME_ONLY = "requestexactscheme";
+	
+	/**
 	 * indicates positive value for any request boolean flag.
 	 */
 	public static final String REQUEST_YES = "yes";
@@ -556,16 +563,27 @@
      * @param urlStr Request URL.
      */
 	public void setRequestUrl(String urlStr) {
-		// TODO: fix this to use other schemes
-		if (!urlStr.startsWith("http://")) {
+
+		// This looks a little confusing: We're trying to fixup an incoming
+		// request URL that starts with: 
+		//       "http:/www.archive.org"
+		// so it becomes:
+		//       "http://www.archive.org"
+		// (note the missing second "/" in the first)
+		// 
+		// if that is not the case, then see if the incoming scheme
+		// is known, adding an implied "http://" scheme if there doesn't appear
+		// to be a scheme..
+		// TODO: make the default "http://" configurable.
+		if (!urlStr.startsWith(UrlOperations.HTTP_SCHEME)) {
 	    	if(urlStr.startsWith("http:/")) {
-	    		urlStr = "http://" + urlStr.substring(6);
+	    		urlStr = UrlOperations.HTTP_SCHEME + urlStr.substring(6);
 	    	} else {
-	    		urlStr = "http://" + urlStr;
+	    		if(UrlOperations.urlToScheme(urlStr) == null) {
+	    			urlStr = UrlOperations.HTTP_SCHEME + urlStr;
+	    		}
 	    	}
 	    }
-//	    UURI requestURI = UURIFactory.getInstance(urlStr);
-//	    put(REQUEST_URL_CLEANED, requestURI.toString());
         put(REQUEST_URL, urlStr);
 	}
 	
@@ -614,6 +632,13 @@
 	public boolean isExactHost() {
 		return getBoolean(REQUEST_EXACT_HOST_ONLY);
 	}
+
+	public void setExactScheme(boolean isExactScheme) {
+		setBoolean(REQUEST_EXACT_SCHEME_ONLY,isExactScheme);
+	}
+	public boolean isExactScheme() {
+		return getBoolean(REQUEST_EXACT_SCHEME_ONLY);
+	}
 	
 	public String getAnchorTimestamp() {
 		return get(REQUEST_ANCHOR_DATE);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2634] trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java

From: <bra...@us...> - 2008-11-07 02:11:14

Revision: 2634
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2634&view=rev
Author:   bradtofel
Date:     2008-11-07 02:11:09 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
REFACTOR: ResourceFactory already knows how to distinguish between URLs and Paths.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java	2008-11-07 00:44:24 UTC (rev 2633)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java	2008-11-07 02:11:09 UTC (rev 2634)
@@ -24,9 +24,7 @@
  */
 package org.archive.wayback.resourcestore;
 
-import java.io.File;
 import java.io.IOException;
-import java.net.URL;
 import java.util.logging.Logger;
 
 import org.archive.wayback.ResourceStore;
@@ -80,12 +78,7 @@
 				
 			try {
 
-				if(url.startsWith("http://")) {
-					r = ResourceFactory.getResource(new URL(url), offset);
-				} else {
-					// assume local path:
-					r = ResourceFactory.getResource(new File(url), offset);
-				}
+				r = ResourceFactory.getResource(url, offset);
 				// TODO: attempt to grab the first few KB? The underlying 
 				// 		InputStreams support mark(), so we could reset() after.
 				//      wait for now, currently this will parse HTTP headers, 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2633] trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java

From: <bra...@us...> - 2008-11-07 00:44:29

Revision: 2633
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2633&view=rev
Author:   bradtofel
Date:     2008-11-07 00:44:24 +0000 (Fri, 07 Nov 2008)

Log Message:
-----------
TWEAK: made static xform methods public.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java	2008-11-06 22:54:01 UTC (rev 2632)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java	2008-11-07 00:44:24 UTC (rev 2633)
@@ -94,7 +94,7 @@
 			|| name.endsWith(ArcWarcFilenameFilter.WARC_GZ_SUFFIX));	
 	}
 	
-	private static Resource ARCArchiveRecordToResource(ArchiveRecord rec,
+	public static Resource ARCArchiveRecordToResource(ArchiveRecord rec,
 			ARCReader reader) throws ResourceNotAvailableException, IOException {
 
 		if (!(rec instanceof ARCRecord)) {
@@ -105,7 +105,7 @@
 		return ar;
 	}
 
-	private static Resource WARCArchiveRecordToResource(ArchiveRecord rec,
+	public static Resource WARCArchiveRecordToResource(ArchiveRecord rec,
 			WARCReader reader) throws ResourceNotAvailableException, IOException {
 
 		if (!(rec instanceof WARCRecord)) {


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2632] trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java

From: <bra...@us...> - 2008-11-06 22:54:10

Revision: 2632
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2632&view=rev
Author:   bradtofel
Date:     2008-11-06 22:54:01 +0000 (Thu, 06 Nov 2008)

Log Message:
-----------
BUGFIX(ACC-46): anchorDate adherence is now configured on AccessPoint, and is disabled by default.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java	2008-11-06 22:53:25 UTC (rev 2631)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java	2008-11-06 22:54:01 UTC (rev 2632)
@@ -79,6 +79,8 @@
 			AccessPoint.class.getName());
 	
 	private boolean useServerName = false;
+	private boolean useAnchorWindow = false;
+
 	private int contextPort = 0;
 	private String contextName = null;
 	private String beanName = null;
@@ -343,7 +345,7 @@
 	
 			// TODO: check which versions are actually accessible right now?
 			CaptureSearchResult closest = captureResults.getClosest(wbRequest, 
-					true);
+					useAnchorWindow);
 			resource = collection.getResourceStore().retrieveResource(closest);
 			ReplayRenderer renderer = replay.getRenderer(wbRequest, closest, resource);
 			renderer.renderResource(httpRequest, httpResponse, wbRequest,
@@ -473,6 +475,20 @@
 		this.useServerName = useServerName;
 	}
 
+	/**
+	 * @return the useAnchorWindow
+	 */
+	public boolean isUseAnchorWindow() {
+		return useAnchorWindow;
+	}
+
+	/**
+	 * @param useAnchorWindow the useAnchorWindow to set
+	 */
+	public void setUseAnchorWindow(boolean useAnchorWindow) {
+		this.useAnchorWindow = useAnchorWindow;
+	}
+
 	public ExclusionFilterFactory getExclusionFactory() {
 		return exclusionFactory;
 	}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2631] trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java

From: <bra...@us...> - 2008-11-06 22:53:31

Revision: 2631
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2631&view=rev
Author:   bradtofel
Date:     2008-11-06 22:53:25 +0000 (Thu, 06 Nov 2008)

Log Message:
-----------
BUGFIX(ACC-46): anchorDate adherence is now configured on AccessPoint, and is disabled by default.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java	2008-11-06 22:51:24 UTC (rev 2630)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java	2008-11-06 22:53:25 UTC (rev 2631)
@@ -100,17 +100,22 @@
 	}
 	/**
 	 * @param wbRequest
-	 * @param err if true, then check Request Anchor Window and Date, throwing
-	 *        exception if no Result is within the Window.
+	 * @param useAnchor if true, then check Request Anchor Window and Date,
+	 * 		  throwing exception if no Result is within the Window.
 	 * @return The closest CaptureSearchResult to the request.
 	 */
-	public CaptureSearchResult getClosest(WaybackRequest wbRequest, boolean err) 
+	public CaptureSearchResult getClosest(WaybackRequest wbRequest, 
+			boolean useAnchor) 
 		throws AnchorWindowTooSmallException {
 
 		CaptureSearchResult closest = null;
 		long closestDistance = 0;
 		CaptureSearchResult cur = null;
-		String anchorDate = wbRequest.getAnchorTimestamp();
+		String anchorDate = null;
+		// TODO: check if HTTP request referrer is set before using? 
+		if(useAnchor) {
+			anchorDate = wbRequest.getAnchorTimestamp();
+		}
 		long maxWindow = -1;
 		long wantTime = wbRequest.getReplayDate().getTime();
 		if(anchorDate != null) {
@@ -129,7 +134,7 @@
 				closestDistance = curDistance;
 			}
 		}
-		if(err && (maxWindow > 0)) {
+		if(useAnchor && (maxWindow > 0)) {
 			if(closestDistance > maxWindow) {
 				throw new AnchorWindowTooSmallException("Closest is " + 
 						closestDistance + " seconds away, Window is " + 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

Flat | Threaded

<< < 1 .. 45 46 47 48 49 .. 171 > >> (Page 47 of 171)