archive-access-cvs Mailing List for Web Archive Access Utilities (Page 53)

Brought to you by: binzino, bradtofel, gojomo, ia_igor, and 5 others

archive-access-cvs — CVS commits

You can subscribe to this list here.

2005	Jan	Feb	Mar	Apr	May	Jun	Jul (1)	Aug (10)	Sep (36)	Oct (339)	Nov (103)	Dec (152)
2006	Jan (141)	Feb (102)	Mar (125)	Apr (203)	May (57)	Jun (30)	Jul (139)	Aug (46)	Sep (64)	Oct (105)	Nov (34)	Dec (162)
2007	Jan (81)	Feb (57)	Mar (141)	Apr (72)	May (9)	Jun (1)	Jul (144)	Aug (88)	Sep (40)	Oct (43)	Nov (34)	Dec (20)
2008	Jan (44)	Feb (45)	Mar (16)	Apr (36)	May (8)	Jun (77)	Jul (177)	Aug (66)	Sep (8)	Oct (33)	Nov (13)	Dec (37)
2009	Jan (2)	Feb (5)	Mar (8)	Apr	May (36)	Jun (19)	Jul (46)	Aug (8)	Sep (1)	Oct (66)	Nov (61)	Dec (10)
2010	Jan (13)	Feb (16)	Mar (38)	Apr (76)	May (47)	Jun (32)	Jul (35)	Aug (45)	Sep (20)	Oct (61)	Nov (24)	Dec (16)
2011	Jan (22)	Feb (34)	Mar (11)	Apr (8)	May (24)	Jun (23)	Jul (11)	Aug (42)	Sep (81)	Oct (48)	Nov (21)	Dec (20)
2012	Jan (30)	Feb (25)	Mar (4)	Apr (6)	May (1)	Jun (5)	Jul (5)	Aug (8)	Sep (6)	Oct (6)	Nov	Dec

Flat | Threaded

<< < 1 .. 51 52 53 54 55 .. 171 > >> (Page 53 of 171)

[Archive-access-cvs] SF.net SVN: archive-access:[2505] trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml

From: <bi...@us...> - 2008-07-28 19:29:07

Revision: 2505
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2505&view=rev
Author:   binzino
Date:     2008-07-28 19:29:16 +0000 (Mon, 28 Jul 2008)

Log Message:
-----------
Added length metadata field to list of indexed fields.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml

Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-07-26 15:47:56 UTC (rev 2504)
+++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-07-28 19:29:16 UTC (rev 2505)
@@ -52,6 +52,7 @@
     collection
     date
     type
+    length
   </value>
 </property>
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2504] trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java

From: <mi...@us...> - 2008-07-26 15:47:47

Revision: 2504
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2504&view=rev
Author:   miklosh
Date:     2008-07-26 15:47:56 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
Made modifications needed for using Hadoop 0.17.
Display image metadata in search results.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java

Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java	2008-07-26 15:46:55 UTC (rev 2503)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java	2008-07-26 15:47:56 UTC (rev 2504)
@@ -25,6 +25,7 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
@@ -40,13 +41,16 @@
 import org.apache.lucene.store.FSDirectory;
 import org.apache.nutch.indexer.FsDirectory;
 import org.apache.nutch.indexer.Indexer;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
-public class ImageSearcherBean {
+public class ImageSearcherBean implements ImageLoader {
 
     public static final Log LOG = LogFactory.getLog(ImageSearcherBean.class);
     
     private IndexReader reader;
+    private ImageDataReader imageReader;
 
     private Path baseDir;
     private Configuration conf;
@@ -66,18 +70,9 @@
         Path indexesDir = new Path(baseDir, "indexes");
         if (this.fs.exists(indexesDir)) {
             Vector<Path> doneDirs = new Vector<Path>();
-            Path[] dirs = fs.listPaths(indexesDir, new PathFilter() {
-
-                public boolean accept(Path f) {
-                    try {
-                        if (fs.isDirectory(f)) {
-                            return true;
-                        }
-                    } catch (IOException ioe) {
-                    }
-                    return false;
-                }
-            });
+            FileStatus[] fstats = fs.listStatus(indexesDir, 
+                    HadoopFSUtil.getPassDirectoriesFilter(fs));
+            Path[] dirs = HadoopFSUtil.getPaths(fstats);
             for (Path dir : dirs) {
                 Path indexdone = new Path(dir, Indexer.DONE_NAME);
                 if (fs.isFile(indexdone)) {
@@ -95,6 +90,8 @@
             Path[] indexDir = {new Path(baseDir, "index")};
             init(indexDir);
         }
+        this.imageReader = new ImageDataReader(FileSystem.get(conf), 
+                new Path(baseDir, "segments").toString(), conf);
     }
 
     /** Init given a set of indexes or just one index. */
@@ -124,12 +121,19 @@
         if (reader != null) {
             reader.close();
         }
+        if (imageReader != null) {
+            imageReader.close();
+        }
     }
 
     public IndexReader getReader() {
         return reader;
     }
     
+    public ImageWritable getImage(String id) throws IOException {
+        return imageReader.getImage(id);
+    }
+    
     /**
      * Calculate the score for an image hit.
      * @param hit found hit
@@ -245,6 +249,9 @@
                     nextDist = imgIndex < numDocImages-1 ? 
                         Math.abs(imagePositions[imgIndex+1] - pos) + (end-pos) : Integer.MAX_VALUE;
                 }
+                if (imgIndex >= numDocImages) {
+                    continue;
+                }
                 // Check if this image is in the allowed proximity of the span
                 if (dist > distThreshold) {
                     if (LOG.isDebugEnabled()) {
@@ -261,6 +268,7 @@
                 ImageHit newHit = new ImageHit(imageIds[imgIndex], imageUrls[imgIndex], currentDoc);
                 newHit.docSim = docSim;
                 newHit.docScore = docBoost;
+                newHit.parentUrl = doc.get("url");
                 newHit.proximity = Math.min(1.0f, 1.0f-((float)dist/maxDist));
                 newHit.score = scoreHit(newHit, doc);
 
@@ -338,6 +346,10 @@
                 hits.getTotal() >= maxHits ? maxHits : (int)hits.getTotal());
         for (ImageHit hit : top) {
             System.out.println(hit.score + " " + hit.url + " " + hit.imageId);
+            ImageWritable imageData = isb.getImage(hit.imageId);
+            if (imageData != null) {
+                System.out.println("[ " + imageData.getMetadata() + "]");
+            }
         }
     }
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2503] trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java

From: <mi...@us...> - 2008-07-26 15:46:46

Revision: 2503
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2503&view=rev
Author:   miklosh
Date:     2008-07-26 15:46:55 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
Normalize extracted image URLs.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java

Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java	2008-07-26 15:42:26 UTC (rev 2502)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java	2008-07-26 15:46:55 UTC (rev 2503)
@@ -33,6 +33,7 @@
 import org.apache.nutch.analysis.AnalyzerFactory;
 import org.apache.nutch.analysis.NutchAnalyzer;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.util.NodeWalker;
 import org.archive.nutchwax.imagesearch.ImageSearch;
 import org.w3c.dom.*;
@@ -42,7 +43,8 @@
 
     public static final Log LOG = LogFactory.getLog(ImageParseFilter.class);
 
-    
+    private URLNormalizers normalizers;
+
     private void findImages(Node doc, URL base, ParseData parentData, ParseResult parseResult) {
 
         // Get language
@@ -87,8 +89,20 @@
                         try {
                             imgSrc = new URL(base, attr.getValue());
                             imgUrl = imgSrc.toString();
+                            // Normalize it
+                            // Replace spaces with %20
+                            imgUrl = imgUrl.replaceAll("\\s", "%20");
+                            imgUrl = normalizers.normalize(imgUrl, 
+                                    URLNormalizers.SCOPE_FETCHER);
+                            // TODO: apply NutchWAX specific URL canonicalization
                         } catch (MalformedURLException mue) {
-                            skipNode = true;
+                            if (imgUrl != null) {
+                                if (LOG.isInfoEnabled()) {
+                                    LOG.info("MalformedURL: " + imgUrl);
+                                }
+                            } else {
+                                skipNode = true;
+                            }
                         }
                     } else if ("alt".equalsIgnoreCase(attr.getName())) {
                         altText = attr.getValue();
@@ -162,6 +176,7 @@
 
     public void setConf(Configuration conf) {
         this.conf = conf;
+        this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
     }
 
     public Configuration getConf() {


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2502] trunk/archive-access/projects/nutchwax/imagesearch

From: <mi...@us...> - 2008-07-26 15:42:17

Revision: 2502
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2502&view=rev
Author:   miklosh
Date:     2008-07-26 15:42:26 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
Added basic JSP-based user interface.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/imagesearch/build.xml

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/images/
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/images/logo.jpg
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/thumb.jsp
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch.properties
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch_en.properties
    trunk/archive-access/projects/nutchwax/imagesearch/src/web/web.xml

Modified: trunk/archive-access/projects/nutchwax/imagesearch/build.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/build.xml	2008-07-26 15:37:14 UTC (rev 2501)
+++ trunk/archive-access/projects/nutchwax/imagesearch/build.xml	2008-07-26 15:42:26 UTC (rev 2502)
@@ -22,6 +22,7 @@
   <property name="src.dir"   value="src" />
   <property name="lib.dir"   value="lib" />
   <property name="build.dir" value="${nutch.dir}/build" />
+  <property name="build.plugins" value="${nutch.dir}/build/plugins" />
   <!-- HACK: Need to import default.properties like Nutch does -->
   <property name="dist.dir"  value="${build.dir}/nutch-1.0-dev" />
 
@@ -135,4 +136,56 @@
 
   </target>
 
+  <!--
+    generate the servlet context file (nutch.xml)
+  -->
+  <target name="generate-context">
+    <!-- xmlcatalog definition for xslt task -->
+    <xmlcatalog id="docDTDs">
+     <dtd publicId="-//W3C//DTD XHTML 1.0 Transitional//EN"            
+          location="${xmlcatalog.dir}/xhtml1-transitional.dtd"/> 
+    </xmlcatalog> 
+    <xslt in="${nutch.dir}/conf/nutch-default.xml"
+          out="${build.dir}/nutch.xml"
+          style="${nutch.dir}/conf/context.xsl">
+        <xmlcatalog refid="docDTDs"/>
+    	<outputproperty name="indent" value="yes"/>
+    </xslt>
+  </target>
+
+  <target name="imagesearch-war" depends="generate-context, jar">
+    <war destfile="${build.dir}/imagesearch.war"
+	 webxml="${src.dir}/web/web.xml">
+      <fileset dir="${src.dir}/web/jsp"/>
+      <lib dir="${nutch.dir}/lib">
+	<include name="lucene*.jar"/>
+	<include name="taglibs-*.jar"/>
+ 	<include name="hadoop-*.jar"/>
+	<include name="dom4j-*.jar"/>
+	<include name="xerces-*.jar"/>
+        <include name="tika-*.jar"/>      	
+        <include name="commons-cli-*.jar"/>
+        <include name="commons-lang-*.jar"/>
+        <include name="commons-logging-*.jar"/>
+        <include name="log4j-*.jar"/>
+      </lib>
+      <lib dir="${build.dir}">
+        <include name="nutch-*.jar"/>
+      </lib>
+
+      <zipfileset prefix="WEB-INF/classes" dir="${build.dir}/classes"/>
+      <classes dir="${nutch.dir}/conf" excludes="**/*.template"/>
+      <classes dir="${src.dir}/web/locale"/>
+      <zipfileset prefix="WEB-INF/classes/plugins" dir="${build.plugins}">
+        <exclude name="parse-*/**"/>
+        <exclude name="protocol-*/**"/>
+        <exclude name="urlfilter-*/**"/>
+      </zipfileset>
+
+      <webinf dir="${nutch.dir}/lib">
+        <include name="taglibs-*.tld"/>
+      </webinf>
+    </war>
+  </target>
+
 </project>


Property changes on: trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/images/logo.jpg
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp	2008-07-26 15:42:26 UTC (rev 2502)
@@ -0,0 +1,226 @@
+<%@ page
+session="false"
+contentType="text/html; charset=UTF-8"
+pageEncoding="UTF-8"
+
+import="java.io.*"
+import="java.util.*"
+import="java.net.*"
+
+import="org.archive.nutchwax.imagesearch.*"
+import="org.apache.hadoop.conf.*"
+import="org.apache.nutch.html.Entities"
+import="org.apache.nutch.searcher.*"
+import="org.apache.nutch.metadata.*"
+import="org.apache.nutch.util.NutchConfiguration"
+import="org.apache.nutch.plugin.*"
+import="org.apache.hadoop.fs.Path"
+import="org.apache.lucene.index.*"
+import="org.apache.lucene.document.*"
+
+%>
+<%
+Configuration nutchConf = NutchConfiguration.get(application);
+
+// Get query from request
+boolean haveQuery = true;
+request.setCharacterEncoding("UTF-8");
+String queryString = request.getParameter("query");
+String mainTitle = "Internet Archive Image Search";
+if (queryString == null) {
+    queryString = "";
+    haveQuery = false;
+} else {
+    mainTitle = queryString + " - " + mainTitle;
+}
+String htmlQueryString = Entities.encode(queryString);
+
+int start = 0;
+String startString = request.getParameter("start");
+if (startString != null) {
+    start = Integer.parseInt(startString);
+}
+
+int hitsPerPage = 20;       // number of hits to display per page
+int rowLength = 5;          // number of images to display per row
+
+int end = start+hitsPerPage;
+
+%><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><%
+// To prevent the character encoding declared with 'contentType' page
+// directive from being overriden by JSTL (apache i18n), we freeze it
+// by flushing the output buffer. 
+// see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
+out.flush();
+
+String language = ResourceBundle.getBundle("org.archive.jsp.imagesearch", request.getLocale())
+    .getLocale().getLanguage();
+%>
+<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
+<i18n:bundle baseName="org.archive.jsp.imagesearch"/>
+<html>
+<head>
+<title><%=mainTitle%></title>
+
+<style type="text/css"><!--
+.xs {font-family:verdana,arial,helvetica,sans-serif;font-size: x-small}
+body,div,span,th,td,input,.s {font-family: arial, helvetica, sans-serif; font-size: small}
+.m {font-size: medium}
+.lbl,a.lbl:link,a.lbl:visited,a.lbl:active,a.lbl:hover{color:#444;font-weight:bold;font-family:verdana,arial,helvetica,sans-serif;;text-decoration:none}
+
+.resItem {font-family: arial, helvetica, sans-serif; font-size: small}
+
+.site{color:#55AA55} 
+a:link,a:active{color:#2249cc} a:visited{color:#573875} a{text-decoration:underline}
+a.res{color:#2249cc; font-size:1.3em }
+
+#res_pages {text-align: center; font-size:medium; font-weight:bold;}
+.nav_curr {font-size:1.3em;}
+#results {margin-right: 240px; text-align: justify;}
+.ellipsis {font-weight: bold;}
+.highlight{font-weight: bold;}
+
+form{margin-bottom:0px}
+
+-->
+</style>
+
+</head>
+<body>
+
+<table width="100%" border=0 cellspacing=0 cellpadding=0>
+<tr><td class=xs>&nbsp;</td></tr>
+<tr><td width=50><img src="images/logo.jpg"></td>
+<td><table border=0 cellspacing=0 cellpadding=1>
+	<tr><td width=10><form action="results.jsp">&nbsp;</td><td>
+	<input type=text name=query size=20 maxlength=500 value="<%=queryString%>" style='width:230px' />
+	</td>
+	<td>&nbsp;&nbsp; <input type=submit value="<i18n:message key="search"/>" class=btn /></td>
+	<td>&nbsp;</form>
+	</td></tr></table>
+</td></tr>
+<tr><td class=xs>&nbsp;</td></tr>
+</table>
+
+<%
+ImageHits hitList = null;
+ImageSearcherBean bean = new ImageSearcherBean(nutchConf);
+
+if (haveQuery) {
+    hitList = bean.search(queryString, end);
+
+    long numHits = hitList.getTotal();
+    if (end >= numHits) {
+        end = (int)numHits;
+    }
+
+    // Output info about the results
+    if (numHits > 0) {
+    %><hr size=1 />
+      <div align=left class=lbl style="padding: 0px 0px 0px">
+        <i18n:message key="showingHits">
+          <i18n:messageArg value="<%=new Long(start+1)%>"/>
+          <i18n:messageArg value="<%=new Long(end)%>"/>
+          <i18n:messageArg value="<%=numHits%>"/>
+        </i18n:message>
+    </div>
+
+    <table border=0 cellpadding=20 cellspacing=0 align=center width=700>
+    <%
+    }
+    // Get hits
+    ImageHit[] hits = null;
+    System.out.println(numHits);
+    if (numHits > 0) {
+        hits = hitList.getHits(start, (end-start) >= hitsPerPage ? hitsPerPage :
+            (end-start));
+    } else {
+        hits = new ImageHit[0];
+    }
+
+    // Output results
+    for (int i = 0; i < hits.length; i++) {
+        ImageHit hit = hits[i];
+        String url = hit.url;
+        String fileName = null;
+        int slashIndex = url.lastIndexOf("/");
+        if (slashIndex > 0) {
+            fileName = url.substring(slashIndex + 1);
+        } else {
+            fileName = url;
+        }
+        String parentUrl = hit.parentUrl;
+        String host = null;
+        String thumbnailUrl = "thumb.jsp?id=" + hit.imageId;
+        try {
+            URL u;
+            u = new URL(url);
+            host = u.getHost();
+        } catch (MalformedURLException e) {
+            host = "";
+        }
+        
+        // Load metadata
+        String size = null;
+        String dimensions = null;
+        String imgWidth = null;
+        ImageWritable image = bean.getImage(hit.imageId);
+        if (image != null) {
+            Metadata meta = image.getMetadata();
+            int sizeInt = Integer.parseInt(meta.get(ImageSearch.SIZE_KEY));
+            size = Integer.toString((int)Math.round(sizeInt / 1024.0)) + "k";
+            dimensions = meta.get("width") + "x" + meta.get("height");
+            imgWidth = "";
+        } else {
+            size = "??k";
+            dimensions = "??x??";
+            // Have the downscaled original image displayed by the browser
+            thumbnailUrl = url;
+            int maxSize = nutchConf.getInt("imagesearcher.thumbnail.maxSize", 150);
+            imgWidth = "width=" + maxSize;
+        }
+                
+        if (i % rowLength == 0) {
+            if (i > 0) {
+                %></tr><%
+            }
+%>      <tr class=resultGroup><%
+        }
+%>          <td class=resItem width=200 valign=top align=center>
+		<br/><a href="<%=parentUrl%>"><img src="<%=thumbnailUrl%>" border=0 <%=imgWidth%>/></a>
+		<br/><%=fileName%><br/>
+		<span class=resMeta><%=dimensions%> - <%=size%><br/></span>
+		<span class=site><%=host%></span>
+            </td><%
+    }
+    if (hits.length == 0) {
+    %><i18n:message key="noMatch"><i18n:messageArg value="<%=queryString%>"/></i18n:message><%
+    } else {    // Draw paging information
+    %>
+    </table>
+    <div id=res_pages>
+    <br/><%
+        int currentPage = (start - (start % hitsPerPage)) / hitsPerPage;
+        int currentLoc = currentPage*hitsPerPage;
+        int pageCounter = 0;
+        String encodedQuery = URLEncoder.encode(queryString, "UTF-8");
+
+        // Prev
+        if (currentPage > 0) {
+            int prevPageLoc = (currentPage-1)*hitsPerPage;
+            String prevUrl = "./results.jsp?query=" + encodedQuery + "&start=" +
+                    prevPageLoc;
+            %><a class=res href="<%=prevUrl%>">&lt;&lt; <i18n:message key="prev"/></a>&nbsp;&nbsp;<%
+        }
+        // Next
+        if (currentPage*hitsPerPage + hitsPerPage < numHits) {
+            int nextPageLoc = (currentPage+1)*hitsPerPage;
+            String nextUrl = "./results.jsp?query=" + encodedQuery + "&start=" +
+                    nextPageLoc;
+            %>&nbsp;&nbsp;<a class=res href="<%=nextUrl%>"><i18n:message key="next"/> &gt;&gt;</a><%
+        }
+    }
+    bean.close();
+}%>
+    </div>
+</body>
\ No newline at end of file

Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/thumb.jsp
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/thumb.jsp	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/thumb.jsp	2008-07-26 15:42:26 UTC (rev 2502)
@@ -0,0 +1,33 @@
+<%@ page
+import="java.io.*"
+import="java.util.*"
+
+import="org.archive.nutchwax.imagesearch.*"
+import="org.apache.hadoop.conf.*"
+import="org.apache.nutch.util.NutchConfiguration"
+import="org.apache.hadoop.fs.Path"
+
+%><%
+Configuration nutchConf = NutchConfiguration.get(application);
+
+// Get id from request
+request.setCharacterEncoding("UTF-8");
+String idString = request.getParameter("id");
+if (idString == null) {
+    response.sendRedirect("./results.jsp");
+    return;
+}
+
+ImageSearcherBean bean = new ImageSearcherBean(nutchConf);
+ImageWritable imageData = bean.getImage(idString);
+if (imageData != null) {
+    StoredImage thumb = imageData.getThumbnail();
+    response.setContentType("image/jpg");
+    OutputStream os = response.getOutputStream();
+    os.write(thumb.getData());
+    os.close();
+} else {
+    response.sendRedirect("./results.jsp");
+}
+bean.close();
+%>
\ No newline at end of file

Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch.properties
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch.properties	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch.properties	2008-07-26 15:42:26 UTC (rev 2502)
@@ -0,0 +1,5 @@
+search = Search Images
+showingHits = Results <b>{0}-{1}</b> out of about <b>{2}</b>.
+next = Next
+prev = Previous
+noMatch = Your query (<b>{0}</b>) did not match any documents.<br/>Please try different keywords.

Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch_en.properties
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch_en.properties	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch_en.properties	2008-07-26 15:42:26 UTC (rev 2502)
@@ -0,0 +1,5 @@
+search = Search Images
+showingHits = Results <b>{0}-{1}</b> out of about <b>{2}</b>.
+next = Next
+prev = Previous
+noMatch = Your query (<b>{0}</b>) did not match any documents.<br/>Please try different keywords.

Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/web.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/web/web.xml	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/web.xml	2008-07-26 15:42:26 UTC (rev 2502)
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!DOCTYPE web-app
+    PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
+    "http://java.sun.com/dtd/web-app_2_3.dtd">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<web-app>
+
+<!-- order is very important here -->
+
+<welcome-file-list>
+  <welcome-file>results.jsp</welcome-file>
+  <welcome-file>index.html</welcome-file>
+  <welcome-file>index.jsp</welcome-file>
+</welcome-file-list>
+
+<taglib>
+  <taglib-uri>http://jakarta.apache.org/taglibs/i18n</taglib-uri>
+  <taglib-location>/WEB-INF/taglibs-i18n.tld</taglib-location>
+ </taglib>
+
+</web-app>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2501] trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch

From: <mi...@us...> - 2008-07-26 15:37:06

Revision: 2501
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2501&view=rev
Author:   miklosh
Date:     2008-07-26 15:37:14 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
Added ImageDataReader for image metadata and thumbnail retrieval.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java

Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java	2008-07-26 15:37:14 UTC (rev 2501)
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.imagesearch;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.nutch.util.HadoopFSUtil;
+
+/** Retrieves image thumbnails and metadata from segments. */
+public class ImageDataReader implements ImageLoader {
+
+    private HashMap<String, MapFile.Reader[]> segments = new HashMap<String, MapFile.Reader[]>();
+
+    /** Construct given a directory containing segments. */
+    ImageDataReader(FileSystem fs, String segmentsDir, Configuration conf) throws IOException {
+        FileStatus[] fstats = fs.listStatus(new Path(segmentsDir), 
+                HadoopFSUtil.getPassDirectoriesFilter(fs));
+        Path[] segmentDirs = HadoopFSUtil.getPaths(fstats);
+
+        if (segmentDirs != null) {
+            for (Path segmentDir : segmentDirs) {
+                MapFile.Reader[] readers = MapFileOutputFormat.
+                        getReaders(fs, new Path(segmentDir, ImageWritable.IMAGE_DATA_DIR), conf);
+                if (readers != null) {
+                    segments.put(segmentDir.getName(), readers);
+                }
+            }
+        }
+    }
+
+    /**
+     * Loads the stored ImageWritable from disk.
+     * @param id identifier of the image to retrieve
+     */
+    public ImageWritable getImage(String id) throws IOException {
+        // TODO: try the segment in which the parent doc resides first
+        Text key = new Text(id);
+        ImageWritable holder = new ImageWritable();
+        Iterator<MapFile.Reader[]> it = segments.values().iterator();
+        while (it.hasNext()) {
+            MapFile.Reader[] readers = it.next();
+            for (MapFile.Reader reader : readers) {
+                ImageWritable result = (ImageWritable) reader.get(key, holder);
+                if (result != null) {
+                    return result;
+                }
+            }
+        }
+        return null;
+    }
+
+    public void close() throws IOException {
+        Iterator<MapFile.Reader[]> it = segments.values().iterator();
+        while (it.hasNext()) {
+            MapFile.Reader[] readers = it.next();
+            for (MapFile.Reader reader : readers) {
+                reader.close();
+            }
+        }
+    }
+}
\ No newline at end of file

Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java	2008-07-26 15:35:27 UTC (rev 2500)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java	2008-07-26 15:37:14 UTC (rev 2501)
@@ -23,6 +23,7 @@
 
     public String imageId;
     public String url;
+    public String parentUrl;
 
     public float docSim;
     public float proximity;

Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java	2008-07-26 15:37:14 UTC (rev 2501)
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.imagesearch;
+
+import java.io.IOException;
+
+/**
+ * Interface for loading image data from disk.
+ */
+public interface ImageLoader {
+
+    /**
+     * Retrieves the image stored under the given identifier.
+     *
+     * @param id identifier of the image to retrieve
+     * @return the stored image
+     * @throws IOException if the implementation fails to read the image
+     */
+    ImageWritable getImage(String id) throws IOException;
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2500] trunk/archive-access/projects/nutchwax/ imagesearch/src/java/org/archive/nutchwax/imagesearch

From: <mi...@us...> - 2008-07-26 15:35:18

Revision: 2500
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2500&view=rev
Author:   miklosh
Date:     2008-07-26 15:35:27 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
Updated ImageProcessor to store size of original image and deduplicate thumbnails based on digest.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java

Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java	2008-07-26 15:33:48 UTC (rev 2499)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java	2008-07-26 15:35:27 UTC (rev 2500)
@@ -18,6 +18,7 @@
 package org.archive.nutchwax.imagesearch;
 
 import java.io.IOException;
+import java.util.Iterator;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -29,18 +30,21 @@
 import org.apache.hadoop.mapred.MapFileOutputFormat;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
 public class ImageProcessor extends Configured implements Tool,
-        Mapper<Text, Content, Text, ImageWritable> {
+        Mapper<Text, Content, Text, ImageWritable>, 
+        Reducer<Text, ImageWritable, Text, ImageWritable> {
 
     private static final Log LOG = LogFactory.getLog(ImageProcessor.class);
     
@@ -56,25 +60,44 @@
             OutputCollector<Text, ImageWritable> output,
             Reporter reporter) throws IOException {
 
-        Metadata metadata = new Metadata();
         // Check content type
         if (!content.getContentType().contains("image/")) {
             return;
         }
 
         // Generate thumbnail
+        Metadata metadata = new Metadata();
         byte[] data = content.getContent();
         StoredImage thumb = ThumbnailGenerator.generateThumbnail(data, 
                 thumbMaxSize, thumbMaxSize, thumbQuality, metadata);
 
         // Create and setup an ImageWritable
         ImageWritable image = new ImageWritable(key.toString());
+        metadata.set(ImageSearch.SIZE_KEY, Integer.toString(data.length));
         image.setMetadata(metadata);
         image.setThumbnail(thumb);
 
-        output.collect(key, image);
+        // Get digest of image content
+        Metadata contentMeta = content.getMetadata();
+        String digest = contentMeta.get("digest");
+        if (digest == null) {
+            digest = contentMeta.get(Nutch.SIGNATURE_KEY);
+        }
+
+        output.collect(new Text(digest), image);
     }
+    
+    /**
+     * Collapses all images collected under one key into a single record by
+     * emitting only the first value of the group.
+     */
+    public void reduce(Text key, Iterator<ImageWritable> values,
+            OutputCollector<Text, ImageWritable> output, Reporter reporter)
+            throws IOException {
 
+        // Nothing to emit for an empty group
+        if (!values.hasNext()) {
+            return;
+        }
+        // Save only one instance
+        ImageWritable first = values.next();
+        output.collect(key, first);
+    }
+
     public void processImageContent(Path segment) 
         throws IOException {
         
@@ -88,6 +111,7 @@
 
         job.setInputFormat(SequenceFileInputFormat.class);
         job.setMapperClass(ImageProcessor.class);
+        job.setReducerClass(ImageProcessor.class);
 
         job.setOutputPath(new Path(segment, ImageWritable.IMAGE_DATA_DIR));
         job.setOutputFormat(MapFileOutputFormat.class);

Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java	2008-07-26 15:33:48 UTC (rev 2499)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java	2008-07-26 15:35:27 UTC (rev 2500)
@@ -20,9 +20,10 @@
 public class ImageSearch {
     public static final String PARENT_URL_KEY   = "parent_url";
     public static final String ALT_TEXT_KEY     = "alt";
-    
+
     public static final String IMAGE_IDS_KEY    = "image_ids";
     public static final String IMAGE_POS_KEY    = "image_pos";
     public static final String IMAGE_URLS_KEY   = "image_urls";
     public static final String HAS_IMAGE_KEY    = "has_image";
+    public static final String SIZE_KEY         = "size";
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2499] trunk/archive-access/projects/nutchwax/ imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java

From: <mi...@us...> - 2008-07-26 15:33:40

Revision: 2499
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2499&view=rev
Author:   miklosh
Date:     2008-07-26 15:33:48 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
Added DocIndexer for correctly indexing image digest information.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java

Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java	2008-07-26 15:33:48 UTC (rev 2499)
@@ -0,0 +1,483 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.nutchwax.imagesearch;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.analysis.*;
+
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.LogUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.LinkDb;
+import org.apache.nutch.crawl.NutchWritable;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.document.*;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilters;
+import org.apache.nutch.indexer.NutchSimilarity;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Content;
+
+/** Create indexes for segments suitable for image search. */
+public class DocIndexer extends Configured implements Tool,
+        Reducer<Text, NutchWritable, Text, Writable>,
+        Mapper<Text, Writable, Text, NutchWritable> {
+
+    public static final String DONE_NAME = "index.done";
+    public static final Log LOG = LogFactory.getLog(DocIndexer.class);
+
+    /**
+     * Carrier used to hand a Lucene document from reduce to the record
+     * writer created by OutputFormat.
+     * Note: despite implementing Writable, it cannot serialize or
+     * deserialize the document - both Writable methods are no-ops and the
+     * instance merely holds the reference it was constructed with.
+     */
+    private static class LuceneDocumentWrapper implements Writable {
+
+        private final Document wrapped;
+
+        public LuceneDocumentWrapper(Document doc) {
+            wrapped = doc;
+        }
+
+        /** Returns the document passed to the constructor. */
+        public Document get() {
+            return wrapped;
+        }
+
+        /** Intentionally left blank: nothing is read. */
+        public void readFields(DataInput in) throws IOException {
+        }
+
+        /** Intentionally left blank: nothing is written. */
+        public void write(DataOutput out) throws IOException {
+        }
+    }
+
+    /**
+     * Unwraps the Lucene documents created by reduce and adds them to an
+     * index that is built in a local directory and copied to the job output
+     * when the writer is closed.
+     */
+    public static class OutputFormat
+            extends org.apache.hadoop.mapred.OutputFormatBase<WritableComparable, LuceneDocumentWrapper> {
+
+        /**
+         * Creates a record writer backed by a Lucene IndexWriter.
+         * Any previous output at the target path is deleted first, and the
+         * IndexWriter is tuned from the "indexer.*" job properties.
+         *
+         * @param fs file system holding the final output
+         * @param job job configuration
+         * @param name name of the output below the job output path
+         * @param progress notified after every indexed document
+         * @return writer that indexes each wrapped document it receives
+         * @throws IOException if the index cannot be created
+         */
+        public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job,
+                String name, final Progressable progress) throws IOException {
+            final Path perm = new Path(job.getOutputPath(), name);
+            final Path temp =
+                    job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));
+
+            fs.delete(perm);                            // delete old, if any
+
+            final AnalyzerFactory factory = new AnalyzerFactory(job);
+            final IndexWriter writer = // build locally first
+                    new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
+                    new NutchDocumentAnalyzer(job), true);
+
+            writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
+            writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
+            writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
+            writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
+            writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
+            writer.setInfoStream(LogUtil.getInfoStream(LOG));
+            writer.setUseCompoundFile(false);
+            writer.setSimilarity(new NutchSimilarity());
+
+            return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
+
+                // Set by close() and polled by the heartbeat thread; volatile
+                // so that the update is guaranteed to become visible to that
+                // thread and its loop terminates.
+                volatile boolean closed;
+
+                /** Unwraps the document and adds it to the index. */
+                public void write(WritableComparable key, LuceneDocumentWrapper value)
+                        throws IOException {                  // unwrap & index doc
+                    Document doc = value.get();
+                    NutchAnalyzer analyzer = factory.get(doc.get("lang"));
+                    if (LOG.isInfoEnabled()) {
+                        LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
+                                " with analyzer " + analyzer +
+                                " (" + doc.get("lang") + ")");
+                    }
+                    writer.addDocument(doc, analyzer);
+                    progress.progress();
+                }
+
+                /**
+                 * Optimizes and closes the index, copies it to its final
+                 * location and creates the DONE_NAME marker file.
+                 */
+                public void close(final Reporter reporter) throws IOException {
+                    // spawn a thread to give progress heartbeats
+                    Thread prog = new Thread() {
+
+                        public void run() {
+                            while (!closed) {
+                                try {
+                                    reporter.setStatus("closing");
+                                    Thread.sleep(1000);
+                                } catch (InterruptedException e) {
+                                    continue;
+                                } catch (Throwable e) {
+                                    return;
+                                }
+                            }
+                        }
+                    };
+
+                    try {
+                        prog.start();
+                        if (LOG.isInfoEnabled()) {
+                            LOG.info("Optimizing index.");
+                        }
+                        // optimize & close index
+                        writer.optimize();
+                        writer.close();
+                        fs.completeLocalOutput(perm, temp);   // copy to dfs
+                        fs.createNewFile(new Path(perm, DONE_NAME));
+                    } finally {
+                        // stops the heartbeat thread, also on failure
+                        closed = true;
+                    }
+                }
+            };
+        }
+    }
+    private IndexingFilters filters;
+    private ScoringFilters scfilters;
+
+    /** Creates an indexer with no configuration set. */
+    public DocIndexer() {
+    }
+
+    /**
+     * Creates an indexer that uses the given configuration.
+     *
+     * @param conf configuration handed to setConf
+     */
+    public DocIndexer(Configuration conf) {
+        this.setConf(conf);
+    }
+
+    /**
+     * Stores the job configuration and builds the indexing and scoring
+     * filters from it.
+     *
+     * @param job configuration of the running job
+     */
+    public void configure(JobConf job) {
+        setConf(job);
+        final Configuration conf = getConf();
+        filters = new IndexingFilters(conf);
+        scfilters = new ScoringFilters(conf);
+    }
+
+    /** Does nothing; this class holds no resources of its own to release. */
+    public void close() {
+        // intentionally empty
+    }
+
+    /**
+     * Assembles one Lucene document from all records collected under a key.
+     * The values are sorted by type into inlinks, crawl db datum, fetch datum,
+     * parse data, parse text and image URL-to-digest mappings. Nothing is
+     * emitted when any of fetch datum, db datum, parse text or parse data is
+     * missing, when the parse or the fetch was not successful, or when an
+     * indexing or scoring filter rejects or fails on the document.
+     *
+     * @param key key shared by all values of the group
+     * @param values wrapped records belonging to the key
+     * @param output receives the document wrapped in a LuceneDocumentWrapper
+     * @param reporter not used by this method
+     * @throws IOException if collecting the output fails
+     * @throws RuntimeException if a CrawlDatum carries an unexpected status
+     */
+    public void reduce(Text key, Iterator<NutchWritable> values,
+            OutputCollector<Text, Writable> output, Reporter reporter)
+            throws IOException {
+        Inlinks inlinks = null;
+        CrawlDatum dbDatum = null;
+        CrawlDatum fetchDatum = null;
+        ParseData parseData = null;
+        ParseText parseText = null;
+        // image URL -> digest, merged from all Metadata values of this key
+        Metadata imageUrlMapping = new Metadata();
+        while (values.hasNext()) {
+            Writable value = values.next().get(); // unwrap
+            if (value instanceof Inlinks) {
+                inlinks = (Inlinks) value;
+            } else if (value instanceof CrawlDatum) {
+                CrawlDatum datum = (CrawlDatum) value;
+                if (CrawlDatum.hasDbStatus(datum)) {
+                    dbDatum = datum;
+                } else if (CrawlDatum.hasFetchStatus(datum)) {
+                    // don't index unmodified (empty) pages
+                    if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
+                        fetchDatum = datum;
+                    }
+                } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
+                        CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) {
+                    // linked and signature datums carry nothing needed here
+                    continue;
+                } else {
+                    throw new RuntimeException("Unexpected status: " + datum.getStatus());
+                }
+            } else if (value instanceof ParseData) {
+                parseData = (ParseData) value;
+            } else if (value instanceof ParseText) {
+                parseText = (ParseText) value;
+            } else if (value instanceof Metadata) {
+                // Add image URL->digest mapping
+                Metadata mapping = (Metadata) value;
+                String[] imageUrls = mapping.names();
+                for (String imageUrl : imageUrls) {
+                    // the first digest seen for a URL wins
+                    if (imageUrlMapping.get(imageUrl) == null) {
+                        imageUrlMapping.add(imageUrl, mapping.get(imageUrl));
+                    }
+                }
+            } else if (LOG.isWarnEnabled()) {
+                LOG.warn("Unrecognized type: " + value.getClass());
+            }
+        }
+
+        if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {
+            return;                                     // only have inlinks
+        }
+
+        // skip pages whose parse or fetch did not succeed
+        if (!parseData.getStatus().isSuccess() ||
+                fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
+            return;
+        }
+
+        Document doc = new Document();
+        Metadata metadata = parseData.getContentMeta();
+
+        // replace image ids with proper ones
+        if (imageUrlMapping.size() > 0) {
+            Metadata parseMeta = parseData.getParseMeta();
+            parseMeta.remove(ImageSearch.IMAGE_IDS_KEY);
+            String[] imageUrls = parseMeta.getValues(ImageSearch.IMAGE_URLS_KEY);
+            // one id is added per image URL, in the order of the URL values
+            for (String imageUrl : imageUrls) {
+                String mappedTo = imageUrlMapping.get(imageUrl);
+                if (mappedTo == null) {
+                    if (LOG.isInfoEnabled()) {
+                        LOG.info("No digest information for " + imageUrl);
+                    }
+                    // "-" stands in for an image without a known digest
+                    parseMeta.add(ImageSearch.IMAGE_IDS_KEY, "-");
+                    continue;
+                }
+                parseMeta.add(ImageSearch.IMAGE_IDS_KEY, mappedTo);
+            }
+        }
+
+        // add segment, used to map from merged index back to segment files
+        doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
+                Field.Store.YES, Field.Index.NO));
+
+        // add digest, used by dedup
+        doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
+                Field.Store.YES, Field.Index.NO));
+
+        Parse parse = new ParseImpl(parseText, parseData);
+        try {
+            // extract information from dbDatum and pass it to
+            // fetchDatum so that indexing filters can use it
+            Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
+            if (url != null) {
+                fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+            }
+            // run indexing filters
+            doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
+        } catch (IndexingException e) {
+            if (LOG.isWarnEnabled()) {
+                LOG.warn("Error indexing " + key + ": " + e);
+            }
+            return;
+        }
+
+        // skip documents discarded by indexing filters
+        if (doc == null) {
+            return;
+        }
+
+        float boost = 1.0f;
+        // run scoring filters
+        try {
+            boost = this.scfilters.indexerScore((Text) key, doc, dbDatum,
+                    fetchDatum, parse, inlinks, boost);
+        } catch (ScoringFilterException e) {
+            if (LOG.isWarnEnabled()) {
+                LOG.warn("Error calculating score " + key + ": " + e);
+            }
+            return;
+        }
+        // apply boost to all indexed fields.
+        doc.setBoost(boost);
+        // store boost for use by explain and dedup
+        doc.add(new Field("boost", Float.toString(boost),
+                Field.Store.YES, Field.Index.NO));
+
+        output.collect(key, new LuceneDocumentWrapper(doc));
+    }
+
+    /**
+     * Pairs every image URL with the pages that reference it.
+     * The map step emits, per image URL, either the key of a referencing
+     * page or the digest of the image content; the reduce step joins both
+     * and emits one (image URL -> digest) Metadata per referencing page.
+     */
+    public static class ImageUrlEmitter
+            implements Mapper<Text, Writable, Text, Text>,
+            Reducer<Text, Text, Text, Metadata> {
+
+        /**
+         * Emits (image URL, page key) for each image URL listed in a
+         * ParseData record, and (content URL, digest) for each Content
+         * record whose content type contains "image/".
+         * Image content without a content type or without any digest is
+         * skipped, because a Text value cannot be built from null.
+         */
+        public void map(Text key, Writable value,
+                OutputCollector<Text, Text> output, Reporter reporter)
+                throws IOException {
+
+            if (value instanceof ParseData) {
+                Metadata parseMeta = ((ParseData) value).getParseMeta();
+                for (String url : parseMeta.getValues(ImageSearch.IMAGE_URLS_KEY)) {
+                    output.collect(new Text(url), key);
+                }
+            } else if (value instanceof Content) {
+                Content content = (Content) value;
+                String contentType = content.getContentType();
+                if (contentType == null || !contentType.contains("image/")) {
+                    return;
+                }
+                Metadata meta = content.getMetadata();
+                // Using NutchWax.DIGEST_KEY here
+                String digest = meta.get("digest");
+                if (digest == null) {
+                    digest = meta.get(Nutch.SIGNATURE_KEY);
+                }
+                if (digest == null) {
+                    // Without a digest there is nothing to map the URL to.
+                    if (LOG.isWarnEnabled()) {
+                        LOG.warn("No digest found for image " + content.getUrl());
+                    }
+                    return;
+                }
+                output.collect(new Text(content.getUrl()), new Text(digest));
+            }
+        }
+
+        /**
+         * Joins the values collected for one image URL. A value containing
+         * "/" is treated as the key of a referencing page, any other value
+         * as the image digest. When a digest is present, a Metadata holding
+         * the single entry (image URL -> digest) is emitted once per page
+         * key; without a digest nothing is emitted.
+         */
+        public void reduce(Text key, Iterator<Text> values,
+                OutputCollector<Text, Metadata> output, Reporter reporter)
+                throws IOException {
+
+            List<Text> parents = new ArrayList<Text>();
+            String imageUrl = key.toString();
+            String imageDigest = null;
+            while (values.hasNext()) {
+                Text data = values.next();
+                String value = data.toString();
+                // Determine type of value
+                if (value.contains("/")) {
+                    // This value is a parent's key. Store a copy: the
+                    // framework may reuse the same Text instance for the
+                    // next value, which would overwrite a kept reference.
+                    parents.add(new Text(data));
+                } else {
+                    // This value is a digest
+                    imageDigest = value;
+                }
+            }
+            if (imageDigest == null) {
+                return;
+            }
+            Metadata meta = new Metadata();
+            meta.add(imageUrl, imageDigest);
+            for (Text parentKey : parents) {
+                output.collect(parentKey, meta);
+            }
+        }
+
+        public void configure(JobConf job) {}
+        public void close() {}
+    }
+
+    public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments)
+            throws IOException {
+
+        if (LOG.isInfoEnabled()) {
+            LOG.info("DocIndexer: starting");
+            LOG.info("DocIndexer: linkdb: " + linkDb);
+        }
+
+        /*
+         * First phase: determining image keys
+         */
+        Path outDir = new Path("imgkeys-"+
+                Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+        JobConf job = new NutchJob(getConf());
+        job.setJobName("imagekeys " + indexDir);
+        for (int i = 0; i < segments.length; i++) {
+            job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
+            job.addInputPath(new Path(segments[i], Content.DIR_NAME));
+        }
+
+        job.setInputFormat(SequenceFileInputFormat.class);
+        job.setMapperClass(ImageUrlEmitter.class);
+        job.setMapOutputKeyClass(Text.class);
+        job.setMapOutputValueClass(Text.class);
+        job.setReducerClass(ImageUrlEmitter.class);
+
+        job.setOutputPath(outDir);
+        job.setOutputFormat(SequenceFileOutputFormat.class);
+        job.setOutputKeyClass(Text.class);
+        job.setOutputValueClass(Metadata.class);
+
+        JobClient.runJob(job);
+
+        /*
+         * Second phase: creating Lucene index
+         */
+        job = new NutchJob(getConf());
+        job.setJobName("index " + indexDir);
+
+        for (int i = 0; i < segments.length; i++) {
+            if (LOG.isInfoEnabled()) {
+                LOG.info("DocIndexer: adding segment: " + segments[i]);
+            }
+            job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME));
+            job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME));
+            job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
+            job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
+        }
+
+        job.addInputPath(outDir);
+        job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
+        job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));
+        job.setInputFormat(SequenceFileInputFormat.class);
+
+        job.setMapperClass(DocIndexer.class);
+        job.setReducerClass(DocIndexer.class);
+
+        job.setOutputPath(indexDir);
+        job.setOutputFormat(OutputFormat.class);
+        job.setOutputKeyClass(Text.class);
+        job.setOutputValueClass(NutchWritable.class);
+
+        JobClient.runJob(job);
+
+        FileSystem fs = FileSystem.get(getConf());
+        fs.delete(outDir);
+
+        if (LOG.isInfoEnabled()) {
+            LOG.info("DocIndexer: done");
+        }
+    }
+
+    /**
+     * Command-line entry point: runs the tool through ToolRunner and exits
+     * with the code it returns.
+     */
+    public static void main(String[] args) throws Exception {
+        System.exit(ToolRunner.run(NutchConfiguration.create(), new DocIndexer(), args));
+    }
+
+    public int run(String[] args) throws Exception {
+
+        if (args.length < 4) {
+            System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ...");
+            return -1;
+        }
+
+        Path[] segments = new Path[args.length - 3];
+        for (int i = 3; i < args.length; i++) {
+            segments[i - 3] = new Path(args[i]);
+        }
+
+        try {
+            index(new Path(args[0]), new Path(args[1]), new Path(args[2]),
+                    segments);
+            return 0;
+        } catch (Exception e) {
+            LOG.fatal("DocIndexer: " + StringUtils.stringifyException(e));
+            return -1;
+        }
+    }
+
+    /**
+     * Wraps each input value in a NutchWritable and emits it under its
+     * original key.
+     */
+    public void map(Text key, Writable value,
+            OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
+        NutchWritable wrapped = new NutchWritable(value);
+        output.collect(key, wrapped);
+    }
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2498] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/ AlphaPartitionedIndex.java

From: <bra...@us...> - 2008-07-26 01:37:59

Revision: 2498
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2498&view=rev
Author:   bradtofel
Date:     2008-07-26 01:38:07 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
TWEAK: removed unneeded suppressWarnings

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java	2008-07-26 01:37:41 UTC (rev 2497)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java	2008-07-26 01:38:07 UTC (rev 2498)
@@ -152,7 +152,6 @@
 		}
 	}
 	
-	@SuppressWarnings("unchecked")
 	protected RangeGroup getRangeGroupForRequest(WaybackRequest wbRequest)
 		throws BadQueryException, ResourceIndexNotAvailableException {
 		


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2497] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/liveweb/ LiveWebLocalResourceIndex.java

From: <bra...@us...> - 2008-07-26 01:37:36

Revision: 2497
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2497&view=rev
Author:   bradtofel
Date:     2008-07-26 01:37:41 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
TWEAK: removed unneeded suppressWarnings

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java	2008-07-25 21:26:03 UTC (rev 2496)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java	2008-07-26 01:37:41 UTC (rev 2497)
@@ -45,7 +45,6 @@
 	 * @throws IOException 
 	 * @throws UnsupportedOperationException 
 	 */
-	@SuppressWarnings("unchecked")
 	public void addSearchResult(CaptureSearchResult result) 
 	throws UnsupportedOperationException, IOException {
 		


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2496] trunk/archive-access/projects/wayback/ wayback-core/pom.xml

From: <bra...@us...> - 2008-07-25 21:25:54

Revision: 2496
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2496&view=rev
Author:   bradtofel
Date:     2008-07-25 21:26:03 +0000 (Fri, 25 Jul 2008)

Log Message:
-----------
TWEAK: added beanshell dependency

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/pom.xml

Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/pom.xml	2008-07-25 20:33:59 UTC (rev 2495)
+++ trunk/archive-access/projects/wayback/wayback-core/pom.xml	2008-07-25 21:26:03 UTC (rev 2496)
@@ -79,6 +79,11 @@
       <artifactId>spring-beans</artifactId>
       <version>2.5.1</version>
     </dependency>
+    <dependency>
+      <groupId>org.beanshell</groupId>
+      <artifactId>bsh</artifactId>
+      <version>2.0b4</version>
+    </dependency>
     <!-- 
       Doh... I'm not sure what package is configuring org.apache.commons-logging
       to use log4j, but it's breaking some command line tools.


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2495] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/Importer.java

From: <bi...@us...> - 2008-07-25 20:33:50

Revision: 2495
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2495&view=rev
Author:   binzino
Date:     2008-07-25 20:33:59 +0000 (Fri, 25 Jul 2008)

Log Message:
-----------
Changed "none" to "unknown" for HTTPStatusCodeFilter to avoid
confusion over whether "none" means "nothing is allowed at all"
vs. "no code for this record".

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-25 20:24:53 UTC (rev 2494)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-25 20:33:59 UTC (rev 2495)
@@ -715,10 +715,10 @@
       {
         Range range = new Range( );
 
-        // Special handling for "none" where an ARCRecord doesn't have
+        // Special handling for "unknown" where an ARCRecord doesn't have
         // an HTTP status code.  The ARCRecord.getStatusCode() returns
         // -1 in that case, so we make a range for it.
-        if ( value.toLowerCase( ).equals( "none" ) )
+        if ( value.toLowerCase( ).equals( "unknown" ) )
           {
             range.lower = -1;
             range.upper = -1;


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2494] trunk/archive-access/projects/nutchwax/ archive

From: <bi...@us...> - 2008-07-25 20:24:45

Revision: 2494
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2494&view=rev
Author:   binzino
Date:     2008-07-25 20:24:53 +0000 (Fri, 25 Jul 2008)

Log Message:
-----------
Added HTTPStatusCodeFilter and configuration thereof in nutch-site.xml.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java

Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-07-25 20:24:53 UTC (rev 2494)
@@ -32,7 +32,7 @@
 </property>
 
 <property>
-  <!-- Configure the 'index-nutchwax' plugin.  Specify how the metadata fields added by the ArcsToSegment are mapped to the Lucene documents during indexing.
+  <!-- Configure the 'index-nutchwax' plugin.  Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing.
        The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key"
        Where the only required part is the "src-key", the rest will assume the following defaults:
           lowercase = true
@@ -111,9 +111,16 @@
 <property>
   <name>nutchwax.urlfilter.wayback.canonicalizer</name>
   <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value>
-  <description></description>
+  <description>Implementation of URL canonicalizer to use.</description>
 </property>
 
+<property>
+  <name>nutchwax.filter.http.status</name>
+  <value>
+    200-299
+  </value>
+</property>
+
 <!-- Similar to Nutch's
        file.content.limit
        http.content.limit

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-25 20:24:53 UTC (rev 2494)
@@ -20,6 +20,8 @@
 import java.net.MalformedURLException;
 import java.util.Map.Entry;
 import java.util.Iterator;
+import java.util.List;
+import java.util.ArrayList;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -106,12 +108,8 @@
   private ParseUtil      parseUtil;
   private URLNormalizers normalizers;
   private int            interval;
+  private HTTPStatusCodeFilter httpStatusCodeFilter;
 
-  private long           numSkipped;
-  private long           numImported;
-  private long           bytesSkipped;
-  private long           bytesImported;
-
   /**
    * ?: Is this necessary?
    */
@@ -146,6 +144,8 @@
     this.parseUtil   = new ParseUtil     ( jobConf );
     this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER );
     this.interval    = jobConf.getInt( "db.fetch.interval.default", 2592000      );
+
+    this.httpStatusCodeFilter = new HTTPStatusCodeFilter( jobConf.get( "nutchwax.filter.http.status" ) );
   }
 
   /**
@@ -233,6 +233,13 @@
     
     if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" );
 
+    if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) )
+      {
+        if ( LOG.isInfoEnabled() ) LOG.info( "Skip     URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() );
+
+        return false;
+      }
+
     try
       {
         // Skip the HTTP headers in the response body, so that the
@@ -313,6 +320,7 @@
         contentMetadata.set( NutchWax.DIGEST_KEY,         meta.getDigest()   );
         contentMetadata.set( NutchWax.CONTENT_TYPE_KEY,   meta.getMimetype() );
         contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) );
+        contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY,  String.valueOf( record.getStatusCode() ) );
 
         Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() );
 
@@ -677,3 +685,96 @@
   }
 
 }
+
+
+/**
+ * This should all be moved into some sort of filtering plugin.
+ * Unfortunately the URLFilter plugin interface isn't adequate as it
+ * only looks at a URL string.  Rather than jamming a response code
+ * through that interface, we do a one-off filter class here.
+ *
+ * A long-term solution would be to create a new Nutch extension point
+ * interface that takes an ARCRecord rather than a URL string.  That
+ * way we can write filters that can operate on any part of an
+ * ARCRecord, not just the URL.
+ */
+class HTTPStatusCodeFilter
+{
+  List<Range> ranges = new ArrayList<Range>( );
+
+  public HTTPStatusCodeFilter( String configuration )
+  {
+    if ( configuration == null )
+      {
+        return ;
+      }
+
+    configuration = configuration.trim( );
+
+    for ( String value : configuration.split( "\\s+" ) )
+      {
+        Range range = new Range( );
+
+        // Special handling for "none" where an ARCRecord doesn't have
+        // an HTTP status code.  The ARCRecord.getStatusCode() returns
+        // -1 in that case, so we make a range for it.
+        if ( value.toLowerCase( ).equals( "none" ) )
+          {
+            range.lower = -1;
+            range.upper = -1;
+
+            this.ranges.add( range );
+
+            continue;
+          }
+
+        String values[] = value.split( "[-]" );
+
+        try
+          {
+            switch ( values.length )
+              {
+              case 2:
+                // It's a range, N-M
+                range.lower = Integer.parseInt( values[0] );
+                range.upper = Integer.parseInt( values[1] );
+                break;
+                
+              case 1:
+                // It's a single value, convert to a single-value range
+                range.lower = Integer.parseInt( values[0] );
+                range.upper = range.lower;
+                break;
+                
+              default:
+                // Bad format
+                Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + range );
+                continue ;
+              }
+
+            this.ranges.add( range );
+          }
+        catch ( NumberFormatException nfe )
+          {
+            Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + range, nfe );
+          }
+      }
+
+  }
+
+  public boolean isAllowed( int code )
+  {
+    for ( Range r : this.ranges )
+      {
+          return ( r.lower <= code && code <= r.upper );
+      }
+
+    return false;
+  }
+
+  static class Range 
+  {
+    int lower;
+    int upper;
+  }
+}

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java	2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java	2008-07-25 20:24:53 UTC (rev 2494)
@@ -31,4 +31,5 @@
   public static final String DIGEST_KEY         = "digest";
   public static final String CONTENT_TYPE_KEY   = "type";
   public static final String CONTENT_LENGTH_KEY = "length";
+  public static final String HTTP_RESPONSE_KEY  = "http_response_code";
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2493] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/tools/DateAdder.java

From: <bi...@us...> - 2008-07-25 02:46:07

Revision: 2493
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2493&view=rev
Author:   binzino
Date:     2008-07-25 02:46:16 +0000 (Fri, 25 Jul 2008)

Log Message:
-----------
Integrated into Hadoop framework via Tool interface and Configured
superclass.  This enables us to read Nutch(WAX) configuration
properties, in particular the url canonicalizer implementation to use.
Fix JIRA: WAX-6.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2008-07-24 23:35:54 UTC (rev 2492)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2008-07-25 02:46:16 UTC (rev 2493)
@@ -38,8 +38,14 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import org.apache.nutch.util.NutchConfiguration;
+
 import org.archive.wayback.UrlCanonicalizer;
-import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
 
 import org.archive.nutchwax.NutchWax;
 
@@ -48,10 +54,9 @@
  * Reads series of (digest+URL,date) lines, finds corresponding
  * document in index, and adds the date to it.
  */
-public class DateAdder
+public class DateAdder extends Configured implements Tool
 {
-  public static void main(String[] args)
-    throws Exception
+  public int run( String[] args ) throws Exception
   {
     if ( args.length < 4 )
       {
@@ -111,7 +116,7 @@
 
     IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true );
     
-    UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer( );
+    UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
 
     for ( int i = 0 ; i < reader.numDocs( ) ; i++ )
       {
@@ -155,6 +160,47 @@
 
     reader.close( );
     writer.close( );
+
+    return 0;
   }
+
+  /**
+   * Utility function to instantiate a UrlCanonicalizer based on an
+   * implementation specified in the configuration.
+   */
+  public static UrlCanonicalizer getCanonicalizer( Configuration conf )
+  {
+    // Which Wayback canonicalizer to use: Aggressive, Identity, etc.
+    String canonicalizerClassName = conf.get( "nutchwax.urlfilter.wayback.canonicalizer" );
+
+    if ( canonicalizerClassName == null || canonicalizerClassName.trim().length() == 0 )
+      {
+        throw new RuntimeException( "Missing value for property: nutchwax.urlfilter.wayback.canonicalizer" );
+      }
+
+    try
+      {
+        UrlCanonicalizer canonicalizer = (UrlCanonicalizer) Class.forName( canonicalizerClassName ).newInstance( );
+
+        return canonicalizer;
+      }
+    catch ( Exception e )
+      {
+        // If we can't instantiate it, there's not much else we can do
+        // other than just throw the Exception.
+        throw new RuntimeException( e );
+      }
+  }
+
+  /**
+   * Command-line driver.  Runs the Importer as a Hadoop job.
+   */
+  public static void main( String args[] ) throws Exception
+  {
+    int result = ToolRunner.run( NutchConfiguration.create(), new DateAdder(), args );
+
+    System.exit( result );
+  }
+
   
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2492] trunk/archive-access/projects/nutchwax/ archive/lib/commons-2.0.1-SNAPSHOT.jar

From: <bi...@us...> - 2008-07-24 23:35:44

Revision: 2492
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2492&view=rev
Author:   binzino
Date:     2008-07-24 23:35:54 +0000 (Thu, 24 Jul 2008)

Log Message:
-----------
Fix bug in org.archive.net.RsyncURLHandler where only part of URL was being
used in call to 'rsync' command.  Commit updated version of library.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/lib/commons-2.0.1-SNAPSHOT.jar


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2491] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax

From: <bi...@us...> - 2008-07-24 23:34:37

Revision: 2491
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2491&view=rev
Author:   binzino
Date:     2008-07-24 23:34:46 +0000 (Thu, 24 Jul 2008)

Log Message:
-----------
Add content-length to metadata stored for imported document.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-24 23:31:54 UTC (rev 2490)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-24 23:34:46 UTC (rev 2491)
@@ -231,7 +231,7 @@
   {
     ARCRecordMetaData meta = record.getMetaData();
     
-    if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ")" );
+    if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" );
 
     try
       {
@@ -302,16 +302,18 @@
 
         // We store both the normal URL and the URL+digest key for
         // later retrieval by the indexing plugin(s).
-        contentMetadata.set( NutchWax.URL_KEY,          url  );
-        contentMetadata.set( NutchWax.ORIG_KEY,         key  );
+        contentMetadata.set( NutchWax.URL_KEY,            url  );
+        contentMetadata.set( NutchWax.ORIG_KEY,           key  );
 
-        contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype()          );
-        contentMetadata.set( NutchWax.FILENAME_KEY,     meta.getArcFile().getName() );
-        contentMetadata.set( NutchWax.FILEOFFSET_KEY,   String.valueOf( record.getHeader().getOffset( ) ) );
-        contentMetadata.set( NutchWax.COLLECTION_KEY,   collectionName              );
-        contentMetadata.set( NutchWax.DATE_KEY,         meta.getDate()              );
-        contentMetadata.set( NutchWax.DIGEST_KEY,       meta.getDigest()            );
+        contentMetadata.set( NutchWax.FILENAME_KEY,       meta.getArcFile().getName() );
+        contentMetadata.set( NutchWax.FILEOFFSET_KEY,     String.valueOf( record.getHeader().getOffset( ) ) );
 
+        contentMetadata.set( NutchWax.COLLECTION_KEY,     collectionName     );
+        contentMetadata.set( NutchWax.DATE_KEY,           meta.getDate()     );
+        contentMetadata.set( NutchWax.DIGEST_KEY,         meta.getDigest()   );
+        contentMetadata.set( NutchWax.CONTENT_TYPE_KEY,   meta.getMimetype() );
+        contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) );
+
         Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() );
 
         output( output, new Text( key  ), content );

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java	2008-07-24 23:31:54 UTC (rev 2490)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java	2008-07-24 23:34:46 UTC (rev 2491)
@@ -22,12 +22,13 @@
 
 public class NutchWax
 {
-  public static final String URL_KEY          = "url";
-  public static final String ORIG_KEY         = "orig";
-  public static final String FILENAME_KEY     = "filename";
-  public static final String FILEOFFSET_KEY   = "fileoffset";
-  public static final String COLLECTION_KEY   = "collection";
-  public static final String CONTENT_TYPE_KEY = "type";
-  public static final String DATE_KEY         = "date";
-  public static final String DIGEST_KEY       = "digest";
+  public static final String URL_KEY            = "url";
+  public static final String ORIG_KEY           = "orig";
+  public static final String FILENAME_KEY       = "filename";
+  public static final String FILEOFFSET_KEY     = "fileoffset";
+  public static final String COLLECTION_KEY     = "collection";
+  public static final String DATE_KEY           = "date";
+  public static final String DIGEST_KEY         = "digest";
+  public static final String CONTENT_TYPE_KEY   = "type";
+  public static final String CONTENT_LENGTH_KEY = "length";
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2490] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/XSLTFilter.java

From: <bi...@us...> - 2008-07-24 23:31:45

Revision: 2490
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2490&view=rev
Author:   binzino
Date:     2008-07-24 23:31:54 +0000 (Thu, 24 Jul 2008)

Log Message:
-----------
Initial check-in of servlet filter which applies an XSLT transform to the
output of the filter chain.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java	2008-07-24 23:31:54 UTC (rev 2490)
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2008 Internet Archive.
+ * 
+ * This file is part of the archive-access tools project
+ * (http://sourceforge.net/projects/archive-access).
+ * 
+ * The archive-access tools are free software; you can redistribute them and/or
+ * modify them under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or any
+ * later version.
+ * 
+ * The archive-access tools are distributed in the hope that they will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+ * Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser Public License along with
+ * the archive-access tools; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.nutchwax;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.CharArrayWriter;
+
+import javax.servlet.Filter;
+import javax.servlet.FilterChain;
+import javax.servlet.FilterConfig;
+import javax.servlet.ServletContext;
+import javax.servlet.ServletException;
+import javax.servlet.ServletOutputStream;
+import javax.servlet.ServletRequest;
+import javax.servlet.ServletResponse;
+import javax.servlet.ServletResponseWrapper;
+import javax.servlet.http.*;
+
+import javax.xml.transform.Source;
+import javax.xml.transform.stream.StreamSource;
+import javax.xml.transform.Templates;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+
+public class XSLTFilter implements Filter
+{
+  private String xsltUrl;
+  private String contentType;
+
+  public void init( FilterConfig config )
+    throws ServletException
+  {
+    ServletContext app = config.getServletContext( );
+
+    this.xsltUrl = config.getInitParameter( "xsltUrl" );
+
+    if ( this.xsltUrl != null )
+      {
+        this.xsltUrl = this.xsltUrl.trim( );
+        
+        if ( this.xsltUrl.length( ) == 0 )
+          {
+            this.xsltUrl = null;
+          }
+      }
+
+    this.contentType = config.getInitParameter( "contentType" );
+
+    if ( this.contentType != null )
+      {
+        this.contentType = this.contentType.trim( );
+        
+        if ( this.contentType.length( ) == 0 )
+          {
+            this.contentType = null;
+          }
+      }
+
+    if ( this.contentType == null )
+      {
+        this.contentType = "application/xml";
+      }
+  }
+
+  public void doFilter( ServletRequest request, ServletResponse response, FilterChain chain )
+    throws IOException, ServletException 
+  {
+    if ( this.xsltUrl != null )
+      {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream( 8 * 1024 );
+
+        HttpServletResponseInterceptor capturedResponse = new HttpServletResponseInterceptor( (HttpServletResponse) response, baos );
+        
+        chain.doFilter( request, capturedResponse );
+        
+        byte output[] = baos.toByteArray( );
+        
+        try
+          {
+            Source      xsltSource    = new StreamSource( xsltUrl );
+            Templates   xsltTemplates = TransformerFactory.newInstance( ).newTemplates( xsltSource );
+            Transformer transformer   = xsltTemplates.newTransformer( );
+            
+            StreamSource source = new StreamSource( new ByteArrayInputStream( output ) );
+            StreamResult result = new StreamResult( response.getOutputStream( ) );
+            
+            // Enforce XML content-type in the response.
+            response.setContentType( this.contentType );
+            
+            transformer.transform( source, result );
+          }
+        catch ( javax.xml.transform.TransformerConfigurationException tce )
+          {
+          }
+        catch( javax.xml.transform.TransformerException te )
+          {
+          }
+      }
+    else
+      {
+        chain.doFilter( request, response );
+      }
+  }
+
+  public void destroy()
+  {
+
+  }
+
+}
+
+
+class HttpServletResponseInterceptor extends HttpServletResponseWrapper
+{
+  private OutputStream os;
+
+  HttpServletResponseInterceptor( HttpServletResponse response, OutputStream os )
+  {
+    super( response );
+    
+    this.os = os;
+  }
+
+  public ServletOutputStream getOutputStream() 
+  {
+    ServletOutputStream sos = new ServletOutputStream( )
+      {
+        public void write( int b )
+          throws java.io.IOException
+        {
+          HttpServletResponseInterceptor.this.os.write( b );
+        }
+      };
+    
+    return sos;
+  }
+
+  public PrintWriter getWriter( )
+  {
+    PrintWriter pw = new PrintWriter( this.os );
+
+    return pw;
+  }
+
+}
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2489] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters

From: <bra...@us...> - 2008-07-23 01:10:19

Revision: 2489
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2489&view=rev
Author:   bradtofel
Date:     2008-07-23 01:10:27 +0000 (Wed, 23 Jul 2008)

Log Message:
-----------
TWEAK: changing from ObjectFilter to Adapter...

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java	2008-07-23 01:07:34 UTC (rev 2488)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java	2008-07-23 01:10:27 UTC (rev 2489)
@@ -1,74 +0,0 @@
-/* LegacyToIdentityFilter
- *
- * $Id$
- *
- * Created on 11:48:56 AM Jul 10, 2008.
- *
- * Copyright (C) 2008 Internet Archive.
- *
- * This file is part of wayback.
- *
- * wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.resourceindex.adapters;
-
-import org.archive.wayback.core.CaptureSearchResult;
-import org.archive.wayback.util.ObjectFilter;
-import org.archive.wayback.util.url.UrlOperations;
-
-/**
- * CaptureSearchResult ObjectFilter which passes through all inputs, modifying
- * each to construct a corrected original URL to comply with new Identity 
- * format.
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> {
-	private final static String DEFAULT_SCHEME = "http://";
-	
-	private int getEndOfHostIndex(String url) {
-		int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR);
-		int pathIdx = url.indexOf(UrlOperations.PATH_START);
-		if(portIdx == -1 && pathIdx == -1) {
-			return url.length();
-		}
-		if(portIdx == -1) {
-			return pathIdx;
-		}
-		if(pathIdx == -1) {
-			return portIdx;
-		}
-		if(pathIdx > portIdx) {
-			return portIdx;
-		} else {
-			return pathIdx;
-		}
-	}
-	
-	/* (non-Javadoc)
-	 * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object)
-	 */
-	public int filterObject(CaptureSearchResult o) {
-		String urlKey = o.getUrlKey();
-		StringBuilder sb = new StringBuilder(urlKey.length());
-		sb.append(DEFAULT_SCHEME);
-		sb.append(o.getOriginalUrl());
-		sb.append(urlKey.substring(getEndOfHostIndex(urlKey)));
-		o.setOriginalUrl(sb.toString());
-		return FILTER_INCLUDE;
-	}
-
-}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java (from rev 2488, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java	2008-07-23 01:10:27 UTC (rev 2489)
@@ -0,0 +1,74 @@
+/* LegacyToIdentityFilter
+ *
+ * $Id$
+ *
+ * Created on 11:48:56 AM Jul 10, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.resourceindex.adapters;
+
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.util.ObjectFilter;
+import org.archive.wayback.util.url.UrlOperations;
+
+/**
+ * CaptureSearchResult ObjectFilter which passes through all inputs, modifying
+ * each to construct a corrected original URL to comply with new Identity 
+ * format.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class LegacyToIdentitySearchResultAdapter implements ObjectFilter<CaptureSearchResult> {
+	private final static String DEFAULT_SCHEME = "http://";
+	
+	private int getEndOfHostIndex(String url) {
+		int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR);
+		int pathIdx = url.indexOf(UrlOperations.PATH_START);
+		if(portIdx == -1 && pathIdx == -1) {
+			return url.length();
+		}
+		if(portIdx == -1) {
+			return pathIdx;
+		}
+		if(pathIdx == -1) {
+			return portIdx;
+		}
+		if(pathIdx > portIdx) {
+			return portIdx;
+		} else {
+			return pathIdx;
+		}
+	}
+	
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object)
+	 */
+	public int filterObject(CaptureSearchResult o) {
+		String urlKey = o.getUrlKey();
+		StringBuilder sb = new StringBuilder(urlKey.length());
+		sb.append(DEFAULT_SCHEME);
+		sb.append(o.getOriginalUrl());
+		sb.append(urlKey.substring(getEndOfHostIndex(urlKey)));
+		o.setOriginalUrl(sb.toString());
+		return FILTER_INCLUDE;
+	}
+
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2488] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex

From: <bra...@us...> - 2008-07-23 01:07:26

Revision: 2488
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2488&view=rev
Author:   bradtofel
Date:     2008-07-23 01:07:34 +0000 (Wed, 23 Jul 2008)

Log Message:
-----------
REFACTOR: moved various Adapter<*SearchResult> into org.archive.wayback.resourceindex.adapters

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java (from rev 2483, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java	2008-07-23 01:07:34 UTC (rev 2488)
@@ -0,0 +1,74 @@
+/* LegacyToIdentityFilter
+ *
+ * $Id$
+ *
+ * Created on 11:48:56 AM Jul 10, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.resourceindex.adapters;
+
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.util.ObjectFilter;
+import org.archive.wayback.util.url.UrlOperations;
+
+/**
+ * CaptureSearchResult ObjectFilter which passes through all inputs, modifying
+ * each to construct a corrected original URL to comply with new Identity 
+ * format.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> {
+	private final static String DEFAULT_SCHEME = "http://";
+	
+	private int getEndOfHostIndex(String url) {
+		int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR);
+		int pathIdx = url.indexOf(UrlOperations.PATH_START);
+		if(portIdx == -1 && pathIdx == -1) {
+			return url.length();
+		}
+		if(portIdx == -1) {
+			return pathIdx;
+		}
+		if(pathIdx == -1) {
+			return portIdx;
+		}
+		if(pathIdx > portIdx) {
+			return portIdx;
+		} else {
+			return pathIdx;
+		}
+	}
+	
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object)
+	 */
+	public int filterObject(CaptureSearchResult o) {
+		String urlKey = o.getUrlKey();
+		StringBuilder sb = new StringBuilder(urlKey.length());
+		sb.append(DEFAULT_SCHEME);
+		sb.append(o.getOriginalUrl());
+		sb.append(urlKey.substring(getEndOfHostIndex(urlKey)));
+		o.setOriginalUrl(sb.toString());
+		return FILTER_INCLUDE;
+	}
+
+}

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java	2008-07-23 01:06:29 UTC (rev 2487)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java	2008-07-23 01:07:34 UTC (rev 2488)
@@ -1,74 +0,0 @@
-/* LegacyToIdentityFilter
- *
- * $Id$
- *
- * Created on 11:48:56 AM Jul 10, 2008.
- *
- * Copyright (C) 2008 Internet Archive.
- *
- * This file is part of wayback.
- *
- * wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.resourceindex.filters;
-
-import org.archive.wayback.core.CaptureSearchResult;
-import org.archive.wayback.util.ObjectFilter;
-import org.archive.wayback.util.url.UrlOperations;
-
-/**
- * CaptureSearchResult ObjectFilter which passes through all inputs, modifying
- * each to construct a corrected original URL to comply with new Identity 
- * format.
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> {
-	private final static String DEFAULT_SCHEME = "http://";
-	
-	private int getEndOfHostIndex(String url) {
-		int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR);
-		int pathIdx = url.indexOf(UrlOperations.PATH_START);
-		if(portIdx == -1 && pathIdx == -1) {
-			return url.length();
-		}
-		if(portIdx == -1) {
-			return pathIdx;
-		}
-		if(pathIdx == -1) {
-			return portIdx;
-		}
-		if(pathIdx > portIdx) {
-			return portIdx;
-		} else {
-			return pathIdx;
-		}
-	}
-	
-	/* (non-Javadoc)
-	 * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object)
-	 */
-	public int filterObject(CaptureSearchResult o) {
-		String urlKey = o.getUrlKey();
-		StringBuilder sb = new StringBuilder(urlKey.length());
-		sb.append(DEFAULT_SCHEME);
-		sb.append(o.getOriginalUrl());
-		sb.append(urlKey.substring(getEndOfHostIndex(urlKey)));
-		o.setOriginalUrl(sb.toString());
-		return FILTER_INCLUDE;
-	}
-
-}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2487] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback

From: <bra...@us...> - 2008-07-23 01:06:22

Revision: 2487
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2487&view=rev
Author:   bradtofel
Date:     2008-07-23 01:06:29 +0000 (Wed, 23 Jul 2008)

Log Message:
-----------
REFACTOR: moved various Adapter<*SearchResult> into org.archive.wayback.resourceindex.adapters

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java	2008-07-23 01:04:09 UTC (rev 2486)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java	2008-07-23 01:06:29 UTC (rev 2487)
@@ -1,113 +0,0 @@
-/* CaptureToUrlSearchResultAdapter
- *
- * $Id$
- *
- * Created on 4:45:55 PM Jun 28, 2008.
- *
- * Copyright (C) 2008 Internet Archive.
- *
- * This file is part of wayback.
- *
- * wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.core;
-
-import java.util.HashMap;
-
-import org.archive.wayback.util.AdaptedIterator;
-import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.CloseableIterator;
-
-/**
- *
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class CaptureToUrlSearchResultAdapter
-	implements Adapter<CaptureSearchResult, UrlSearchResult> {
-
-	private String currentUrl;
-	private String originalUrl;
-	private String firstCapture;
-	private String lastCapture;
-	private int numCaptures;
-	private HashMap<String,Object> digests;
-	private UrlSearchResult resultRef = null;
-	public CaptureToUrlSearchResultAdapter() {
-		
-	}
-	private UrlSearchResult makeUrlSearchResult(CaptureSearchResult result) {
-		currentUrl = result.getUrlKey();
-		originalUrl = result.getOriginalUrl();
-		firstCapture = result.getCaptureTimestamp();
-		lastCapture = firstCapture;
-		digests = new HashMap<String,Object>();
-		digests.put(result.getDigest(),null);
-		numCaptures = 1;
-
-		resultRef = new UrlSearchResult();
-		resultRef.setUrlKey(currentUrl);
-		resultRef.setOriginalUrl(originalUrl);
-		resultRef.setFirstCapture(firstCapture);
-		resultRef.setLastCapture(lastCapture);
-		resultRef.setNumCaptures(1);
-		resultRef.setNumVersions(1);
-		return resultRef;
-	}
-
-	/* (non-Javadoc)
-	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
-	 */
-	public UrlSearchResult adapt(CaptureSearchResult c) {
-		String urlKey = c.getUrlKey();
-		if(resultRef == null || !currentUrl.equals(urlKey)) {
-			return makeUrlSearchResult(c);
-		}
-
-		// same url -- accumulate into the last one we returned:
-		String captureDate = c.getCaptureTimestamp();
-		if(captureDate.compareTo(firstCapture) < 0) {
-			firstCapture = captureDate;
-			resultRef.setFirstCapture(firstCapture);
-		}
-		if(captureDate.compareTo(lastCapture) > 0) {
-			lastCapture = captureDate;
-			resultRef.setLastCapture(lastCapture);
-		}
-		numCaptures++;
-		digests.put(c.getDigest(), null);
-		resultRef.setNumCaptures(numCaptures);
-		resultRef.setNumVersions(digests.size());
-		return null;
-	}
-	public static CloseableIterator<UrlSearchResult> adaptCaptureIterator(
-			CloseableIterator<CaptureSearchResult> itr) {
-
-		// HACKHACK: this is pretty lame. We return an UrlSearchResult the
-		// first time we see a new urlKey, and cache a reference to the returned
-		// UrlSearchResult, updating it as we see subsequent CaptureSearchResult
-		// objects with the same urlKey.
-		// This means that users of the returned UrlSearchResult need to wait
-		// until they've got the *next* returned UrlSearchResult before using
-		// the *previous* UrlSearchResult.
-		// At the moment, this all happens inside a LocalResourceIndex, so
-		// none of the UrlSearchResult objects should be seen/used in any 
-		// significant way before they've all be accumulated into an 
-		// UrlSearchResults object..
-		return new AdaptedIterator<CaptureSearchResult,UrlSearchResult>(itr,
-				new CaptureToUrlSearchResultAdapter());
-	}
-}

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java	2008-07-23 01:04:09 UTC (rev 2486)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java	2008-07-23 01:06:29 UTC (rev 2487)
@@ -1,63 +0,0 @@
-package org.archive.wayback.resourceindex;
-
-import java.util.HashMap;
-
-import org.archive.wayback.core.CaptureSearchResult;
-import org.archive.wayback.util.Adapter;
-
-/**
- * Adapter class that observes a stream of SearchResults tracking for each
- * complete record, a mapping of that records digest to:
- *   Arc/Warc Filename
- * 	 Arc/Warc offset
- *   HTTP Response
- *   MIME-Type
- *   Redirect URL
- *   
- * If subsequent SearchResults are missing these fields ("-") and the Digest 
- * field has been seen, then the subsequent SearchResults are updated with the 
- * values from the kept copy matching that digest, and an additional annotation
- * field is added.
- * 
- * 
- * @author brad
- * @version $Date$, $Revision$
- */
-public class DeduplicationSearchResultAnnotationAdapter 
-implements Adapter<CaptureSearchResult,CaptureSearchResult> {
-	private final static String EMPTY_VALUE = "-";
-
-	private HashMap<String,CaptureSearchResult> memory = null;
-
-	public DeduplicationSearchResultAnnotationAdapter() {
-		memory = new HashMap<String,CaptureSearchResult>();
-	}
-
-	private CaptureSearchResult annotate(CaptureSearchResult o) {
-		String thisDigest = o.getDigest();
-		CaptureSearchResult last = memory.get(thisDigest);
-		if(last == null) {
-			// TODO: log missing record digest reference
-			return null;
-		}
-		o.setFile(last.getFile());
-		o.setOffset(last.getOffset());
-		o.setHttpCode(last.getHttpCode());
-		o.setMimeType(last.getMimeType());
-		o.setRedirectUrl(last.getRedirectUrl());
-		o.flagDuplicateDigest(last.getCaptureTimestamp());
-		return o;
-	}
-
-	private CaptureSearchResult remember(CaptureSearchResult o) {
-		memory.put(o.getDigest(),o);
-		return o;
-	}
-
-	public CaptureSearchResult adapt(CaptureSearchResult o) {
-		if(o.getFile().equals(EMPTY_VALUE)) {
-			return annotate(o);
-		}
-		return remember(o);
-	}
-}
\ No newline at end of file

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java	2008-07-23 01:04:09 UTC (rev 2486)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java	2008-07-23 01:06:29 UTC (rev 2487)
@@ -34,7 +34,6 @@
 import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.core.CaptureSearchResult;
 import org.archive.wayback.core.CaptureSearchResults;
-import org.archive.wayback.core.CaptureToUrlSearchResultAdapter;
 import org.archive.wayback.core.SearchResult;
 import org.archive.wayback.core.SearchResults;
 import org.archive.wayback.core.UrlSearchResult;
@@ -44,6 +43,8 @@
 import org.archive.wayback.exception.BadQueryException;
 import org.archive.wayback.exception.ResourceIndexNotAvailableException;
 import org.archive.wayback.exception.ResourceNotInArchiveException;
+import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultAdapter;
+import org.archive.wayback.resourceindex.adapters.DeduplicationSearchResultAnnotationAdapter;
 import org.archive.wayback.resourceindex.filters.CounterFilter;
 import org.archive.wayback.resourceindex.filters.DateRangeFilter;
 import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter;

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java (from rev 2448, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java	2008-07-23 01:06:29 UTC (rev 2487)
@@ -0,0 +1,115 @@
+/* CaptureToUrlSearchResultAdapter
+ *
+ * $Id$
+ *
+ * Created on 4:45:55 PM Jun 28, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.resourceindex.adapters;
+
+import java.util.HashMap;
+
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.core.UrlSearchResult;
+import org.archive.wayback.util.AdaptedIterator;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.CloseableIterator;
+
+/**
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class CaptureToUrlSearchResultAdapter
+	implements Adapter<CaptureSearchResult, UrlSearchResult> {
+
+	private String currentUrl;
+	private String originalUrl;
+	private String firstCapture;
+	private String lastCapture;
+	private int numCaptures;
+	private HashMap<String,Object> digests;
+	private UrlSearchResult resultRef = null;
+	public CaptureToUrlSearchResultAdapter() {
+		
+	}
+	private UrlSearchResult makeUrlSearchResult(CaptureSearchResult result) {
+		currentUrl = result.getUrlKey();
+		originalUrl = result.getOriginalUrl();
+		firstCapture = result.getCaptureTimestamp();
+		lastCapture = firstCapture;
+		digests = new HashMap<String,Object>();
+		digests.put(result.getDigest(),null);
+		numCaptures = 1;
+
+		resultRef = new UrlSearchResult();
+		resultRef.setUrlKey(currentUrl);
+		resultRef.setOriginalUrl(originalUrl);
+		resultRef.setFirstCapture(firstCapture);
+		resultRef.setLastCapture(lastCapture);
+		resultRef.setNumCaptures(1);
+		resultRef.setNumVersions(1);
+		return resultRef;
+	}
+
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+	 */
+	public UrlSearchResult adapt(CaptureSearchResult c) {
+		String urlKey = c.getUrlKey();
+		if(resultRef == null || !currentUrl.equals(urlKey)) {
+			return makeUrlSearchResult(c);
+		}
+
+		// same url -- accumulate into the last one we returned:
+		String captureDate = c.getCaptureTimestamp();
+		if(captureDate.compareTo(firstCapture) < 0) {
+			firstCapture = captureDate;
+			resultRef.setFirstCapture(firstCapture);
+		}
+		if(captureDate.compareTo(lastCapture) > 0) {
+			lastCapture = captureDate;
+			resultRef.setLastCapture(lastCapture);
+		}
+		numCaptures++;
+		digests.put(c.getDigest(), null);
+		resultRef.setNumCaptures(numCaptures);
+		resultRef.setNumVersions(digests.size());
+		return null;
+	}
+	public static CloseableIterator<UrlSearchResult> adaptCaptureIterator(
+			CloseableIterator<CaptureSearchResult> itr) {
+
+		// HACKHACK: this is pretty lame. We return an UrlSearchResult the
+		// first time we see a new urlKey, and cache a reference to the returned
+		// UrlSearchResult, updating it as we see subsequent CaptureSearchResult
+		// objects with the same urlKey.
+		// This means that users of the returned UrlSearchResult need to wait
+		// until they've got the *next* returned UrlSearchResult before using
+		// the *previous* UrlSearchResult.
+		// At the moment, this all happens inside a LocalResourceIndex, so
+		// none of the UrlSearchResult objects should be seen/used in any 
+		// significant way before they've all been accumulated into an 
+		// UrlSearchResults object.
+		return new AdaptedIterator<CaptureSearchResult,UrlSearchResult>(itr,
+				new CaptureToUrlSearchResultAdapter());
+	}
+}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java (from rev 2448, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java	2008-07-23 01:06:29 UTC (rev 2487)
@@ -0,0 +1,63 @@
+package org.archive.wayback.resourceindex.adapters;
+
+import java.util.HashMap;
+
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.util.Adapter;
+
+/**
+ * Adapter class that observes a stream of SearchResults tracking for each
+ * complete record, a mapping of that record's digest to:
+ *   Arc/Warc Filename
+ * 	 Arc/Warc offset
+ *   HTTP Response
+ *   MIME-Type
+ *   Redirect URL
+ *   
+ * If subsequent SearchResults are missing these fields ("-") and the Digest 
+ * field has been seen, then the subsequent SearchResults are updated with the 
+ * values from the kept copy matching that digest, and an additional annotation
+ * field is added.
+ * 
+ * 
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class DeduplicationSearchResultAnnotationAdapter 
+implements Adapter<CaptureSearchResult,CaptureSearchResult> {
+	private final static String EMPTY_VALUE = "-";
+
+	private HashMap<String,CaptureSearchResult> memory = null;
+
+	public DeduplicationSearchResultAnnotationAdapter() {
+		memory = new HashMap<String,CaptureSearchResult>();
+	}
+
+	private CaptureSearchResult annotate(CaptureSearchResult o) {
+		String thisDigest = o.getDigest();
+		CaptureSearchResult last = memory.get(thisDigest);
+		if(last == null) {
+			// TODO: log missing record digest reference
+			return null;
+		}
+		o.setFile(last.getFile());
+		o.setOffset(last.getOffset());
+		o.setHttpCode(last.getHttpCode());
+		o.setMimeType(last.getMimeType());
+		o.setRedirectUrl(last.getRedirectUrl());
+		o.flagDuplicateDigest(last.getCaptureTimestamp());
+		return o;
+	}
+
+	private CaptureSearchResult remember(CaptureSearchResult o) {
+		memory.put(o.getDigest(),o);
+		return o;
+	}
+
+	public CaptureSearchResult adapt(CaptureSearchResult o) {
+		if(o.getFile().equals(EMPTY_VALUE)) {
+			return annotate(o);
+		}
+		return remember(o);
+	}
+}
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2486] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/

From: <bra...@us...> - 2008-07-23 01:04:00

Revision: 2486
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2486&view=rev
Author:   bradtofel
Date:     2008-07-23 01:04:09 +0000 (Wed, 23 Jul 2008)

Log Message:
-----------
package for various Adapter<*SearchResult>

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2485] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback

From: <bra...@us...> - 2008-07-23 00:42:39

Revision: 2485
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2485&view=rev
Author:   bradtofel
Date:     2008-07-23 00:42:47 +0000 (Wed, 23 Jul 2008)

Log Message:
-----------
REFACTOR: moved WaybackRequest.fixup() out of all RequestParser implementations, and into AccessPoint. This allows the AccessPoint to set the reference to itself before fixup() allowing:
FEATURE: Added get/setLocale() to AccessPoint, allowing explicit configuration of the Locale to use for that AccessPoint. If none is specified, then the Locale of the HttpServletRequest is used, as before.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyRequestParser.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/FormRequestParser.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/OpenSearchRequestParser.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java	2008-07-22 02:59:11 UTC (rev 2484)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java	2008-07-23 00:42:47 UTC (rev 2485)
@@ -720,10 +720,14 @@
 				String.valueOf(httpRequest.getLocalPort()));
 		putUnlessNull(REQUEST_WAYBACK_CONTEXT, httpRequest.getContextPath());
 
-		Locale l = httpRequest.getLocale();
-		ResourceBundle b = ResourceBundle.getBundle(UI_RESOURCE_BUNDLE_NAME,
-				httpRequest.getLocale());
-		formatter = new StringFormatter(b,l);
+		Locale l = null;
+		if(accessPoint != null) {
+			l = accessPoint.getLocale();
+		}
+		if(l == null) {
+			l = httpRequest.getLocale();
+		}
+		setLocale(l);
 		putUnlessNull(REQUEST_LOCALE_LANG,l.getDisplayLanguage());
 
 		Cookie[] cookies = httpRequest.getCookies();

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyRequestParser.java	2008-07-22 02:59:11 UTC (rev 2484)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyRequestParser.java	2008-07-23 00:42:47 UTC (rev 2485)
@@ -74,7 +74,6 @@
 	            String replayDateStr = BDBMap.getTimestampForId(
 	            		httpRequest.getContextPath(), id); 
 	            wbRequest.setReplayTimestamp(replayDateStr);
-	            wbRequest.fixup(httpRequest);
 	    }
 	    return wbRequest;
 	}

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/FormRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/FormRequestParser.java	2008-07-22 02:59:11 UTC (rev 2484)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/FormRequestParser.java	2008-07-23 00:42:47 UTC (rev 2485)
@@ -102,10 +102,6 @@
 				}
 			}
 		}
-		if(wbRequest != null) {
-			wbRequest.fixup(httpRequest);
-		}
-
 		return wbRequest;
 	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/OpenSearchRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/OpenSearchRequestParser.java	2008-07-22 02:59:11 UTC (rev 2484)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/OpenSearchRequestParser.java	2008-07-23 00:42:47 UTC (rev 2485)
@@ -146,7 +146,6 @@
 		if(wbRequest.getEndTimestamp() == null) {
 			wbRequest.setEndTimestamp(getLatestTimestamp());
 		}
-		wbRequest.fixup(httpRequest);
 
 		return wbRequest;
 	}

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java	2008-07-22 02:59:11 UTC (rev 2484)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java	2008-07-23 00:42:47 UTC (rev 2485)
@@ -67,7 +67,6 @@
 		
 		WaybackRequest wbRequest = parse(requestPath);
 		if(wbRequest != null) {
-			wbRequest.fixup(httpRequest);
 			wbRequest.setResultsPerPage(maxRecords);
 		}
 

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java	2008-07-22 02:59:11 UTC (rev 2484)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java	2008-07-23 00:42:47 UTC (rev 2485)
@@ -25,6 +25,7 @@
 package org.archive.wayback.webapp;
 
 import java.io.IOException;
+import java.util.Locale;
 import java.util.Properties;
 import java.util.logging.Logger;
 
@@ -89,7 +90,17 @@
 	private Properties configs = null;
 	private ExclusionFilterFactory exclusionFactory = null;
 	private BooleanOperator<WaybackRequest> authentication = null;
+	private String urlRoot = null;
+	private Locale locale = null;
 
+	public Locale getLocale() {
+		return locale;
+	}
+
+	public void setLocale(Locale locale) {
+		this.locale = locale;
+	}
+
 	/**
 	 * 
 	 */
@@ -224,6 +235,9 @@
 	 * Canonical server and port information.
 	 */
 	public String getAbsoluteLocalPrefix(HttpServletRequest httpRequest) {
+		if(urlRoot != null) {
+			return urlRoot;
+		}
 		return getAbsoluteContextPrefix(httpRequest, useServerName);
 	}
 
@@ -236,7 +250,7 @@
 		WaybackRequest wbRequest = new WaybackRequest();
 		wbRequest.setContextPrefix(getAbsoluteLocalPrefix(httpRequest));
 		wbRequest.setAccessPoint(this);
-
+		wbRequest.fixup(httpRequest);
 		UIResults uiResults = new UIResults(wbRequest,uriConverter);
 		try {
 			uiResults.forward(httpRequest, httpResponse, translated);
@@ -278,9 +292,10 @@
 			wbRequest = parser.parse(httpRequest, this);
 
 			if(wbRequest != null) {
+				handled = true;
 				wbRequest.setAccessPoint(this);
-				handled = true;
 				wbRequest.setContextPrefix(getAbsoluteLocalPrefix(httpRequest));
+				wbRequest.fixup(httpRequest);
 				if(authentication != null) {
 					if(!authentication.isTrue(wbRequest)) {
 						throw new AuthenticationControlException("Not authorized");
@@ -485,4 +500,12 @@
 	public void setException(ExceptionRenderer exception) {
 		this.exception = exception;
 	}
+
+	public String getUrlRoot() {
+		return urlRoot;
+	}
+
+	public void setUrlRoot(String urlRoot) {
+		this.urlRoot = urlRoot;
+	}
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2484] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp

From: <bra...@us...> - 2008-07-22 02:59:02

Revision: 2484
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2484&view=rev
Author:   bradtofel
Date:     2008-07-22 02:59:11 +0000 (Tue, 22 Jul 2008)

Log Message:
-----------
MOVED: Redirect.jsp up a level to the /webapp/jsp/ directory, where it is externally accessible.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Redirect.jsp

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp

Deleted: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp	2008-07-22 02:56:29 UTC (rev 2483)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp	2008-07-22 02:59:11 UTC (rev 2484)
@@ -1,15 +0,0 @@
-<%@ page import="org.archive.wayback.util.bdb.BDBMap" %>
-
-<%
- String url = request.getParameter("url");
- String time = request.getParameter("time");
-  
- // Put time-mapping for this id, or if no id, the ip-addr.
- String id = request.getHeader("Proxy-Id");
- if(id == null)	id = request.getRemoteAddr();
- BDBMap.addTimestampForId(request.getContextPath(),id, time);
- 
- // Now redirect to the page the user wanted.
- response.sendRedirect(url);
-%>
-anchored date!

Copied: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Redirect.jsp (from rev 2465, trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Redirect.jsp	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Redirect.jsp	2008-07-22 02:59:11 UTC (rev 2484)
@@ -0,0 +1,15 @@
+<%@ page import="org.archive.wayback.util.bdb.BDBMap" %>
+
+<%
+ String url = request.getParameter("url");
+ String time = request.getParameter("time");
+  
+ // Put time-mapping for this id, or if no id, the ip-addr.
+ String id = request.getHeader("Proxy-Id");
+ if(id == null)	id = request.getRemoteAddr();
+ BDBMap.addTimestampForId(request.getContextPath(),id, time);
+ 
+ // Now redirect to the page the user wanted.
+ response.sendRedirect(url);
+%>
+anchored date!


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2483] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ LegacyToIdentityFilter.java

From: <bra...@us...> - 2008-07-22 02:56:20

Revision: 2483
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2483&view=rev
Author:   bradtofel
Date:     2008-07-22 02:56:29 +0000 (Tue, 22 Jul 2008)

Log Message:
-----------
INITIAL-REV: ObjectFilter which converts legacy CaptureSearchResults objects into the new form, meaning fabricating the "Original URL" field from the URL key and the Original Host.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java	2008-07-22 02:56:29 UTC (rev 2483)
@@ -0,0 +1,74 @@
+/* LegacyToIdentityFilter
+ *
+ * $Id$
+ *
+ * Created on 11:48:56 AM Jul 10, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.resourceindex.filters;
+
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.util.ObjectFilter;
+import org.archive.wayback.util.url.UrlOperations;
+
+/**
+ * ObjectFilter over CaptureSearchResult objects which never excludes a record.
+ * Every record passing through has its original URL rebuilt so that it
+ * complies with the new Identity format: the default scheme, then the value
+ * currently stored as the record's original URL, then whatever follows the
+ * host portion of the record's URL key.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> {
+	private final static String DEFAULT_SCHEME = "http://";
+
+	/**
+	 * Finds where the host portion of a URL key stops.
+	 *
+	 * @param url URL key to scan
+	 * @return index of the earliest port separator or path start found in url,
+	 *         or the length of url when it contains neither
+	 */
+	private int getEndOfHostIndex(String url) {
+		int firstPort = url.indexOf(UrlOperations.PORT_SEPARATOR);
+		int firstPath = url.indexOf(UrlOperations.PATH_START);
+		if(firstPort == -1) {
+			// no port: the host runs up to the path, or to the end of the key
+			return (firstPath == -1) ? url.length() : firstPath;
+		}
+		if(firstPath == -1) {
+			return firstPort;
+		}
+		// both present: the host stops at whichever comes first
+		return Math.min(firstPort, firstPath);
+	}
+
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object)
+	 */
+	public int filterObject(CaptureSearchResult o) {
+		String key = o.getUrlKey();
+		StringBuilder rebuilt = new StringBuilder(key.length());
+		rebuilt.append(DEFAULT_SCHEME).append(o.getOriginalUrl());
+		int hostEnd = getEndOfHostIndex(key);
+		rebuilt.append(key.substring(hostEnd));
+		o.setOriginalUrl(rebuilt.toString());
+		return FILTER_INCLUDE;
+	}
+
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2482] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/exception

From: <bra...@us...> - 2008-07-22 02:53:41

Revision: 2482
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2482&view=rev
Author:   bradtofel
Date:     2008-07-22 02:53:50 +0000 (Tue, 22 Jul 2008)

Log Message:
-----------
INITIAL-REV: two custom ExceptionRenderer implementations which provide the ability to override the standard .jsp handlers in specific cases, via a list of special-case hosts, or by consulting an external Oracle. The AnnotationExceptionRenderer is highly experimental.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/AnnotationExceptionRenderer.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/CustomNotInArchiveExceptionRenderer.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/AnnotationExceptionRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/AnnotationExceptionRenderer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/AnnotationExceptionRenderer.java	2008-07-22 02:53:50 UTC (rev 2482)
@@ -0,0 +1,130 @@
+/* AnnotationExceptionRenderer
+ *
+ * $Id$
+ *
+ * Created on 7:17:24 PM Jun 10, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.exception;
+
+import java.util.Date;
+
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.archive.accesscontrol.AccessControlClient;
+import org.archive.accesscontrol.RuleOracleUnavailableException;
+import org.archive.accesscontrol.model.Rule;
+import org.archive.wayback.core.WaybackRequest;
+
+/**
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class AnnotationExceptionRenderer extends BaseExceptionRenderer {
+	private AccessControlClient client = null;
+	private String oracleUrl = null;
+	private String who = null;
+	public void init() {
+		client = new AccessControlClient(oracleUrl);
+	}
+	public String getExceptionHandler(HttpServletRequest httpRequest,
+			HttpServletResponse httpResponse, WaybackRequest wbRequest,
+			WaybackException exception) {
+		// the "standard HTML" response handler:
+		String jspPath = getCustomHandler(exception,wbRequest);
+		if(jspPath == null) {
+			jspPath = super.getExceptionHandler(httpRequest, httpResponse,
+					wbRequest, exception);
+		}
+		return jspPath;
+	}
+
+	private String getCustomHandler(WaybackException e, WaybackRequest wbRequest) {
+		String jspPath = null;
+		if((e instanceof ResourceNotInArchiveException)
+				&& wbRequest.isReplayRequest()) {
+			String url = wbRequest.getRequestUrl();
+			Date captureDate = wbRequest.getReplayDate();
+			try {
+				Rule rule = client.getRule(url,captureDate,new Date(),who);
+				jspPath = ruleToJspPath(rule);
+			} catch (RuleOracleUnavailableException e1) {
+				e1.printStackTrace();
+			}
+		}
+		return jspPath;
+	}
+
+	private String ruleToJspPath(Rule rule) {
+		if(rule != null) {
+			String pc = rule.getPublicComment();
+			if(pc.startsWith("/")) {
+				return pc;
+			}
+		}
+		return null;
+	}
+	/**
+	 * @return the client
+	 */
+	public AccessControlClient getClient() {
+		return client;
+	}
+
+	/**
+	 * @param client the client to set
+	 */
+	public void setClient(AccessControlClient client) {
+		client.setRobotLookupsEnabled(false);
+		this.client = client;
+	}
+
+	/**
+	 * @return the oracleUrl
+	 */
+	public String getOracleUrl() {
+		return oracleUrl;
+	}
+
+	/**
+	 * @param oracleUrl the oracleUrl to set
+	 */
+	public void setOracleUrl(String oracleUrl) {
+		this.oracleUrl = oracleUrl;
+		setClient(new AccessControlClient(oracleUrl));
+	}
+
+	/**
+	 * @return the who
+	 */
+	public String getWho() {
+		return who;
+	}
+
+	/**
+	 * @param who the who to set
+	 */
+	public void setWho(String who) {
+		this.who = who;
+	}
+}

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/CustomNotInArchiveExceptionRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/CustomNotInArchiveExceptionRenderer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/CustomNotInArchiveExceptionRenderer.java	2008-07-22 02:53:50 UTC (rev 2482)
@@ -0,0 +1,95 @@
+/* CustomNotInArchiveExceptionRenderer
+ *
+ * $Id$
+ *
+ * Created on 1:21:49 PM Jul 8, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.exception;
+
+import java.util.HashMap;
+import java.util.List;
+
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.archive.wayback.core.WaybackRequest;
+import org.archive.wayback.util.url.UrlOperations;
+
+/**
+ * ExceptionRenderer which substitutes a single configured .jsp handler for
+ * replay requests that fail with a ResourceNotInArchiveException, when the
+ * host of the requested URL is one of a configured list of special-case
+ * hosts. In every other case the handler chosen by BaseExceptionRenderer is
+ * used.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class CustomNotInArchiveExceptionRenderer extends BaseExceptionRenderer  {
+	// keys are the special-case hosts; the values are never read
+	private HashMap<String,Object> hosts = null;
+	private String jspHandler = null;
+
+
+	/**
+	 * @param httpRequest the incoming request
+	 * @param httpResponse the outgoing response
+	 * @param wbRequest the parsed Wayback request
+	 * @param exception the exception to be rendered
+	 * @return the configured jspHandler when the request is a special case,
+	 *         otherwise the handler selected by BaseExceptionRenderer
+	 */
+	public String getExceptionHandler(HttpServletRequest httpRequest,
+			HttpServletResponse httpResponse, WaybackRequest wbRequest,
+			WaybackException exception) {
+		String jspPath = getCustomHandler(exception,wbRequest);
+		if(jspPath == null) {
+			jspPath = super.getExceptionHandler(httpRequest, httpResponse,
+					wbRequest, exception);
+		}
+		return jspPath;
+	}
+
+
+	/**
+	 * @param exception the exception to be rendered
+	 * @param wbRequest the parsed Wayback request
+	 * @return jspHandler when exception is a ResourceNotInArchiveException for
+	 *         a replay request whose URL host is a configured host; null
+	 *         otherwise, including when no hosts have been configured
+	 */
+	private String getCustomHandler(WaybackException exception,
+			WaybackRequest wbRequest) {
+		if(hosts == null) {
+			// setHosts() was never called: there are no special cases
+			return null;
+		}
+		if((exception instanceof ResourceNotInArchiveException)
+				&& wbRequest.isReplayRequest()) {
+			String url = wbRequest.getRequestUrl();
+			String host = UrlOperations.urlToHost(url);
+			if(hosts.containsKey(host)) {
+				return jspHandler;
+			}
+		}
+		return null;
+	}
+
+
+	/**
+	 * @return path of the .jsp used for the special-case hosts
+	 */
+	public String getJspHandler() {
+		return jspHandler;
+	}
+
+
+	/**
+	 * @param jspHandler path of the .jsp to use for the special-case hosts
+	 */
+	public void setJspHandler(String jspHandler) {
+		this.jspHandler = jspHandler;
+	}
+
+	/**
+	 * @return a new List holding the configured special-case hosts, in the
+	 *         order they were supplied; empty when none have been configured
+	 */
+	public List<String> getHosts() {
+		if(hosts == null) {
+			return new java.util.ArrayList<String>();
+		}
+		return new java.util.ArrayList<String>(hosts.keySet());
+	}
+
+	/**
+	 * Replaces the set of special-case hosts.
+	 *
+	 * @param hosts hosts which should be rendered with jspHandler; null is
+	 *        treated as an empty list
+	 */
+	public void setHosts(List<String> hosts) {
+		// LinkedHashMap keeps the supplied order so getHosts() can return it
+		HashMap<String,Object> configured =
+			new java.util.LinkedHashMap<String,Object>();
+		if(hosts != null) {
+			for(String host : hosts) {
+				configured.put(host, null);
+			}
+		}
+		this.hosts = configured;
+	}
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2481] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/WEB-INF

From: <bra...@us...> - 2008-07-22 02:48:06

Revision: 2481
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2481&view=rev
Author:   bradtofel
Date:     2008-07-22 02:48:14 +0000 (Tue, 22 Jul 2008)

Log Message:
-----------
REFACTOR: moved all the cumbersome Spring wiring code for the various Replay modes into separate files, which are now imported into the main wayback.xml file.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml	2008-07-22 02:48:14 UTC (rev 2481)
@@ -0,0 +1,105 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+           http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
+
+  <bean id="archivalurlhttpheaderprocessor" class="org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor" />
+
+	<bean id="archivaldateredirectingreplayrenderer" class="org.archive.wayback.replay.DateRedirectReplayRenderer" />
+  <bean id="archivalcssreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlCSSReplayRenderer">
+    <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg>
+  </bean>
+  <bean id="archivalasxreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlASXReplayRenderer">
+    <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg>
+  </bean>
+  <bean id="archivaltransparentreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer">
+    <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg>
+  </bean>
+
+  <bean id="archivalserversidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ServerSideHTMLReplayRenderer">
+    <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg>
+    <property name="jspInserts">
+      <list>
+        <value>/WEB-INF/replay/ArchiveComment.jsp</value>
+<!--
+        <value>/WEB-INF/replay/JSLessTimeline.jsp</value>
+-->
+      </list>
+    </property>
+  </bean>
+
+  <bean id="archivalclientsidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ClientSideHTMLReplayRenderer">
+    <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg>
+    <property name="jspInserts">
+      <list>
+	      <value>/WEB-INF/replay/ArchiveComment.jsp</value>
+	      <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value>
+	      <value>/WEB-INF/replay/DebugBanner.jsp</value>
+<!-- 
+        <value>/WEB-INF/replay/Disclaimer.jsp</value>
+        <value>/WEB-INF/replay/Timeline.jsp</value>
+-->
+      </list>
+    </property>
+  </bean>
+	
+  <bean id="archivalurlreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher">
+    <property name="selectors">
+      <list>
+
+        <!-- REDIRECT IF NOT EXACT DATE -->
+	      <bean class="org.archive.wayback.replay.selector.DateMismatchSelector">
+	        <property name="renderer" ref="archivaldateredirectingreplayrenderer"/>
+	      </bean>
+
+        <!-- HTML REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.MimeTypeSelector">
+          <property name="mimeContains">
+            <list>
+              <value>text/html</value>
+              <value>application/xhtml</value>
+            </list>
+          </property>
+          <property name="renderer" ref="archivalclientsidehtmlreplayrenderer"/>
+        </bean>
+
+        <!-- CSS REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.MimeTypeSelector">
+          <property name="mimeContains">
+            <list>
+              <value>text/css</value>
+            </list>
+          </property>
+          <property name="renderer" ref="archivalcssreplayrenderer"/>
+        </bean>
+
+        <!-- ASX-MIME REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.MimeTypeSelector">
+          <property name="mimeContains">
+            <list>
+              <value>video/x-ms-asf</value>
+            </list>
+          </property>
+          <property name="renderer" ref="archivalasxreplayrenderer"/>
+        </bean>
+
+        <!-- ASX-PATH REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.PathMatchSelector">
+          <property name="pathContains">
+            <list>
+              <value>.asx</value>
+            </list>
+          </property>
+          <property name="renderer" ref="archivalasxreplayrenderer"/>
+        </bean>
+
+        <!-- DEFAULT-TRANSPARENT REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.AlwaysMatchSelector">
+          <property name="renderer" ref="archivaltransparentreplayrenderer"/>
+        </bean>
+
+      </list>
+    </property>
+  </bean>
+</beans>
\ No newline at end of file

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml	2008-07-22 02:48:14 UTC (rev 2481)
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+           http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
+
+  <bean id="domainprefixhttpheaderprocessor" class="org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor" />
+
+  <bean id="domainprefixdateredirectingreplayrenderer" class="org.archive.wayback.replay.DateRedirectReplayRenderer" />
+  <bean id="domainprefixtransparentreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer">
+    <constructor-arg><ref bean="domainprefixhttpheaderprocessor"/></constructor-arg>
+  </bean>
+
+  <bean id="domainprefixtextreplayrenderer" class="org.archive.wayback.domainprefix.DomainPrefixTextReplayRenderer">
+    <constructor-arg><ref bean="domainprefixhttpheaderprocessor"/></constructor-arg>
+    <property name="jspInserts">
+      <list>
+        <value>/WEB-INF/replay/ArchiveComment.jsp</value>
+        <value>/WEB-INF/replay/DebugBanner.jsp</value>
+<!--
+        <value>/WEB-INF/replay/JSLessTimeline.jsp</value>
+-->
+      </list>
+    </property>
+  </bean>
+
+  <bean id="domainprefixreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher">
+    <property name="selectors">
+      <list>
+
+        <!-- REDIRECT IF NOT EXACT DATE -->
+        <bean class="org.archive.wayback.replay.selector.DateMismatchSelector">
+          <property name="renderer" ref="domainprefixdateredirectingreplayrenderer"/>
+        </bean>
+
+        <!-- HTML REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.MimeTypeSelector">
+          <property name="mimeContains">
+            <list>
+              <value>text/html</value>
+              <value>application/xhtml</value>
+            </list>
+          </property>
+          <property name="renderer" ref="domainprefixtextreplayrenderer"/>
+        </bean>
+        <!-- DEFAULT-TRANSPARENT REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.AlwaysMatchSelector">
+          <property name="renderer" ref="domainprefixtransparentreplayrenderer"/>
+        </bean>
+
+      </list>
+    </property>
+  </bean>
+</beans>
\ No newline at end of file

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml	2008-07-22 02:48:14 UTC (rev 2481)
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+           http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
+
+  <bean id="identityhttpheaderprocessor" class="org.archive.wayback.replay.IdentityHttpHeaderProcessor" />
+
+  <bean id="proxytransparentreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer">
+    <constructor-arg><ref bean="identityhttpheaderprocessor"/></constructor-arg>
+  </bean>
+  <bean id="proxymarkupreplayrenderer" class="org.archive.wayback.proxy.ProxyHTMLMarkupReplayRenderer">
+    <constructor-arg><ref bean="identityhttpheaderprocessor"/></constructor-arg>
+    <property name="jspInserts">
+      <list>
+        <value>/WEB-INF/replay/ArchiveComment.jsp</value>
+        <value>/WEB-INF/replay/Disclaimer.jsp</value>
+<!--
+        <value>/replay/JSLessTimeline.jsp</value>
+-->
+      </list>
+    </property>
+  </bean>
+
+  <bean id="proxyreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher">
+    <property name="selectors">
+      <list>
+
+        <!-- HTML REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.MimeTypeSelector">
+          <property name="mimeContains">
+            <list>
+              <value>text/html</value>
+              <value>application/xhtml</value>
+            </list>
+          </property>
+          <property name="renderer" ref="proxymarkupreplayrenderer"/>
+        </bean>
+
+        <!-- DEFAULT-TRANSPARENT REPLAY -->
+        <bean class="org.archive.wayback.replay.selector.AlwaysMatchSelector">
+          <property name="renderer" ref="proxytransparentreplayrenderer"/>
+        </bean>
+
+      </list>
+    </property>
+  </bean>
+</beans>
\ No newline at end of file

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml	2008-07-22 02:45:34 UTC (rev 2480)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml	2008-07-22 02:48:14 UTC (rev 2481)
@@ -1,6 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd">
-<beans>
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+           http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
 
 <!--
     The following 3 beans are required when using the ArcProxy for providing 
@@ -8,19 +10,6 @@
     or directories.
 -->
 <!-- 
-	<bean id="filelocationdb" class="org.archive.wayback.resourcestore.http.FileLocationDB"
-		init-method="init">
-		<property name="bdbPath" value="/tmp/wayback/arc-db" />
-		<property name="bdbName" value="DB1" />
-		<property name="logPath" value="/tmp/wayback/arc-db.log" />
-	</bean>
-
-	<bean name="8080:arcproxy" class="org.archive.wayback.resourcestore.http.ArcProxyServlet">
-		<property name="locationDB" ref="filelocationdb" />
-	</bean>
-	<bean name="8080:locationdb" class="org.archive.wayback.resourcestore.http.FileLocationDBServlet">
-		<property name="locationDB" ref="filelocationdb" />
-	</bean>
 -->
 
 
@@ -34,7 +23,6 @@
       <property name="arcCacheDir">
         <bean class="org.archive.wayback.liveweb.ARCCacheDirectory"
           init-method="init">
-          
           <property name="arcDir" value="/tmp/wayback/liveweb/arcs/" />
           <property name="arcPrefix" value="live" />
         </bean>
@@ -67,19 +55,21 @@
       <property name="webCache" ref="livewebcache" />
     </bean>
 -->
+
 <!--
-	The following bean is an example using the Access Control Oracle, thanks Alex Osborne.
-	Currently this is pretty undocumented, but here is a place to get started:
+	The following bean is an example using the Access Control Oracle, thanks 
+	Alex Osborne and NLA. Currently this is pretty undocumented, but here is a
+	place to get started:
 
 		http://webteam.archive.org/confluence/display/wayback/Exclusions+API
 -->
-<!--
+
     <bean id="excluder-factory-oracle" class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory">
-      <property name="oracleUrl" value="http://192.168.1.11:8080/oracle-0.0.1-SNAPSHOT/" />
+      <property name="oracleUrl" value="http://localhost:8180/oracle/" />
       <property name="accessGroup" value="ia_archiver" />
     </bean>
--->
 
+<!-- 
   <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection">
 
     <property name="resourceStore">
@@ -125,8 +115,123 @@
       </bean>
     </property>
   </bean>
+-->
 
+
+  
+
 <!--
+      <property name="annotater">
+        <bean class="org.archive.wayback.resourceindex.filters.OracleAnnotationFilter">
+          <property name="oracleUrl" value="http://localhost:8180/oracle/" />
+          <property name="who" value="annotation" />
+        </bean>
+      </property>
+-->
+  
+  <bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.BDBResourceFileLocationDB"
+    init-method="init">
+    <property name="bdbPath" value="/tmp/wayback/file-db" />
+    <property name="bdbName" value="DB1" />
+    <property name="logPath" value="/tmp/wayback/file-db.log" />
+  </bean>
+  <bean name="8080:locationdb" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBServlet">
+    <property name="locationDB" ref="resourcefilelocationdb" />
+  </bean>
+  <bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet">
+    <property name="locationDB" ref="resourcefilelocationdb" />
+  </bean>
+  <bean id="indexqueue" class="org.archive.wayback.resourcestore.indexer.DirectoryIndexQueue">
+    <property name="path" value="/tmp/wayback/indexer-queue" />
+  </bean>
+
+      <bean id="localbdbresourceindex" class="org.archive.wayback.resourceindex.LocalResourceIndex">
+        <property name="source">
+          <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex"
+            init-method="init">
+            <property name="bdbName" value="DB1" />
+            <property name="bdbPath" value="/tmp/wayback/index/" />
+          </bean>
+        </property>
+        <property name="maxRecords" value="10000" />
+      </bean>
+
+  <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection">
+
+    <property name="resourceStore">
+		  <bean id="localresourcestore" class="org.archive.wayback.resourcestore.LocalResourceFileResourceStore">
+		    <property name="db" ref="resourcefilelocationdb" />
+		  </bean>
+    </property>
+
+    <property name="resourceIndex" ref="localbdbresourceindex"/>
+
+    <property name="shutdownables">
+      <list>
+         <!-- This thread notices new files appearing in your resourcefilesources -->
+         <bean id="resourcefilesourceupdater" class="org.archive.wayback.resourcestore.resourcefile.ResourceFileSourceUpdater"
+          init-method="init">
+          <property name="target" value="/tmp/wayback/file-db-incoming" />
+          <property name="interval" value="100000" />
+          <property name="sources">
+            <list>
+              <bean id="resourcefilesource" class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource">
+                <property name="name" value="braddir1" />
+                <property name="prefix" value="/tmp/wayback/files1/" />
+              </bean>
+            </list>
+          </property>
+        </bean>
+      
+        <!-- This thread updates the location db with updates from resourcefilesourceupdater -->
+        <bean id="resourcefilelocationdbupdater" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBUpdater"
+          init-method="init">
+          <property name="interval" value="100000" />
+          <property name="db" ref="resourcefilelocationdb" />
+          <property name="incomingDir" value="/tmp/wayback/file-db-incoming" />
+          <property name="stateDir" value="/tmp/wayback/file-db-state" />
+        </bean>
+        
+        <!-- This thread notices new files arriving in the resourcefilelocationdb and queues them for indexing -->
+        <bean id="indexqueueupdater" class="org.archive.wayback.resourcestore.indexer.IndexQueueUpdater"
+          init-method="init">
+          <property name="db" ref="resourcefilelocationdb" />
+          <property name="queue" ref="indexqueue" />
+          <property name="interval" value="1000" />
+          <property name="lastMark" value="/tmp/wayback/index-queue.mark" />
+        </bean>
+      
+        <!-- This thread checks the to-be-indexed queue for files needing indexing, indexes them, and hands off the results for merging with the ResourceIndex -->
+        <bean id="indexworker" class="org.archive.wayback.resourcestore.indexer.IndexWorker"
+          init-method="init">
+          <property name="db" ref="resourcefilelocationdb" />
+          <property name="queue" ref="indexqueue" />
+          <property name="interval" value="1000" />
+          <property name="target">
+            <bean class="org.archive.wayback.resourceindex.updater.IndexClient">
+              <property name="tmpDir" value="/tmp/wayback/index-data/tmp/" />
+              <property name="target" value="/tmp/wayback/index-data/incoming/" />
+            </bean>
+          </property>
+        </bean>
+        
+        <!-- This thread merges updates from the indexworker into the ResourceIndex -->
+        <bean class="org.archive.wayback.resourceindex.updater.LocalResourceIndexUpdater"
+				  init-method="init">
+				  
+				  <property name="index" ref="localbdbresourceindex" />
+				  <property name="incoming" value="/tmp/wayback/index-data/incoming/" />
+				  <property name="failed" value="/tmp/wayback/index-data/failed/" />
+				  <property name="merged" value="/tmp/wayback/index-data/merged/" />
+				  <property name="runInterval" value="10000" />
+				</bean>
+      </list>
+    </property>
+  </bean>
+
+
+
+<!--
     The following WaybackCollection bean template is required when using a
     manually built local CDX index.
 -->
@@ -157,8 +262,8 @@
 <!--
     The following WaybackCollection bean template is required when using a
     remote ResourceIndex and ResourceStore implementation. This will also 
-    required setting up an arcproxy and locationdb on the host specified by
-    the resourceStore:urlPrefix configuration, and an addition AccessPoint
+    require setting up an arcproxy and locationdb on the host specified by
+    the resourceStore:urlPrefix configuration, and an additional AccessPoint
     on the host specified by the resourceIndex:searchUrlBase configuration.
 -->
 <!--
@@ -176,8 +281,6 @@
 		    <property name="searchUrlBase" value="http://indexhost:8080/index/xmlquery" />
 		  </bean>
     </property>
-
-    
   </bean>
 -->
 
@@ -188,23 +291,28 @@
     installation. You may also need to ensure that the maxRecords on your RequestParser is
     not greater than the maxRecords configured on the RemoteNutchResourceIndex.
 -->
-<!--
+
   <bean id="remotenutchcollection" class="org.archive.wayback.webapp.WaybackCollection">
 
     <property name="resourceStore">
       <bean class="org.archive.wayback.resourcestore.Http11ResourceStore">
-        <property name="urlPrefix" value="http://localhost:8080/arcproxy/" />
+<!-- 
+        <property name="urlPrefix" value="http://crawling11.us.archive.org/arcproxy/" />
+-->
+         <property name="urlPrefix" value="http://webapp100.us.archive.org/arcproxy/" />
       </bean>
     </property>
 
     <property name="resourceIndex">
       <bean class="org.archive.wayback.resourceindex.NutchResourceIndex" init-method="init">
+<!-- 
         <property name="searchUrlBase" value="http://webteam-ws.us.archive.org:8080/katrina/opensearch" />
+ -->
+        <property name="searchUrlBase" value="http://192.168.1.208:9090/nutch-1.0-dev/opensearch" />
         <property name="maxRecords" value="100" />
       </bean>
     </property>
   </bean>
--->
 
 <!--
     This is the only AccessPoint defined by default within this wayback.xml
@@ -216,51 +324,32 @@
     running Tomcat. To provide external access, replace "localhost" with your
     fully qualified hostname of the computer running Tomcat.
 -->
+  <import resource="ArchivalUrlReplay.xml"/>
 	<bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint">
-
+	<!--
+    <property name="exclusionFactory" ref="excluder-factory-oracle" />
+  -->
 		<property name="collection" ref="localbdbcollection" />
+		<property name="configs">
+		    <props>
+		        <prop key="inst">foo</prop>
+		        <prop key="coll">supreme court</prop>
+		    </props>
+		</property>
 
     <property name="uriConverter">
       <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter">
-        <property name="replayURIPrefix" value="http://localhost:8080/wayback/" />
+        <property name="replayURIPrefix" value="http://localhost:8080/wayback/"/>
       </bean>
     </property>
 
 		<property name="query">
 			<bean class="org.archive.wayback.query.Renderer">
-				<property name="captureJsp" value="/jsp/HTMLResults.jsp" />
+				<property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" />
 			</bean>
 		</property>
 
-    <property name="replay">
-      <bean class="org.archive.wayback.archivalurl.ArchivalUrlReplayDispatcher">
-        <property name="serverSideRendering" value="false" />
-        <property name="jspInserts">
-          <list>
-            <value>/replay/ArchiveComment.jsp</value>
-            <value>/replay/ClientSideJSInsert.jsp</value>
-<!--
-  The following 2 .jsp include values will produce in-page elements within
-  replayed HTML pages. Both require client-side Javascript.
--->
-<!-- 
-            <value>/replay/Disclaimer.jsp</value>
-            <value>/replay/Timeline.jsp</value>
--->
-<!--
-  The following .jsp include value will produce a timeline within *all* replayed
-  pages, including all subframes within a frameset, but requires no client side
-  Javascript. It is intended for use in deployments which use:
-  
-      serverSideRendering=true
--->
-<!--  
-            <value>/replay/JSLessTimeline.jsp</value>
--->
-          </list>
-        </property>
-      </bean>
-    </property>
+    <property name="replay" ref="archivalurlreplay" />
 
 		<property name="parser">
 			<bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser"
@@ -271,20 +360,68 @@
 		</property>
 	</bean>
 
+  <bean name="8080:rwayback" parent="8080:wayback">
+    <property name="parser">
+      <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser"
+        init-method="init">
+        <property name="maxRecords" value="100" />
+        <property name="earliestTimestamp" value="1996" />
+      </bean>
+    </property>
+    <property name="exception">
+      <bean class="org.archive.wayback.exception.CustomNotInArchiveExceptionRenderer">
+        <property name="hosts">
+          <list>
+            <value>www.aladems.org</value>
+          </list>
+        </property>
+        <property name="jspHandler" value="/exception/GrayBlank.jsp"/>
+      </bean>
+    </property>
+    <property name="uriConverter">
+      <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter">
+        <property name="replayURIPrefix" value="http://localhost:8080/rwayback/" />
+      </bean>
+    </property>
+    <property name="collection" ref="remotenutchcollection">
+    <!--
+		  <bean class="org.archive.wayback.webapp.WaybackCollection">
+		    <property name="resourceStore" ref="fancyresourcestore" />
+		    <property name="resourceIndex">
+		      <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex"
+		        init-method="init">
+		        <property name="searchUrlBase" value="http://localhost:8080/wayback/xmlquery" />
+		      </bean>
+		    </property>
+		  </bean>
+	  -->
+    </property>
+  </bean>
+
 <!--
     The following AccessPoint inherits all configuration from the 8080:wayback
     AccessPoint, but only allows access from the specified IP network.
 -->
 <!--
 	<bean name="8080:netsecure" parent="8080:wayback">
+	
 		<property name="authentication">
-			<bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator">
-				<property name="allowedRanges">
-					<list>
-						<value>192.168.1.16/24</value>
-					</list>
-				</property>
-			</bean>
+      <bean class="org.archive.wayback.authenticationcontrol.AccessControlSettingOperation">
+        <property name="operator">
+          <bean class="org.archive.wayback.util.operator.NotBooleanOperator">
+            <property name="operand">
+							<bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator">
+								<property name="allowedRanges">
+									<list>
+										<value>192.168.1.16/24</value>
+									</list>
+								</property>
+							</bean>
+					  </property>
+					</bean>
+			  </property>
+			  <property name="factory" ref="excluder-factory-robot"/>
+      </bean>
 		</property>
     <property name="uriConverter">
       <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter">
@@ -293,7 +430,6 @@
     </property>
 	</bean>
 -->
-
 <!--
     The following AccessPoint inherits all configuration from the 8080:wayback
     AccessPoint, but checks live web robots.txt documents to determine if 
@@ -313,7 +449,24 @@
   </bean>
 -->
 
+<import resource="DomainPrefixReplay.xml"/>
+<bean name="8081" parent="8080:wayback">
+  <property name="useServerName" value="true" />
+  <property name="replay" ref="domainprefixreplay" />
+  <property name="uriConverter">
+    <bean class="org.archive.wayback.domainprefix.DomainPrefixResultURIConverter">
+      <property name="hostPort" value="localhost.archive.org:8081" />
+    </bean>
+  </property>
+  <property name="parser">
+    <bean class="org.archive.wayback.domainprefix.DomainPrefixCompositeRequestParser" init-method="init">
+      <property name="hostPort" value="localhost.archive.org:8081" />
+      <property name="maxRecords" value="1000" />
+    </bean>
+  </property>
+</bean>
 
+
 <!--
     The following AccessPoint inherits all configuration from the 8080:wayback
     AccessPoint, but provides a Proxy Replay UI to the same collection. These
@@ -323,27 +476,24 @@
     Note: using this AccessPoint requires adding a "Connector" on port 8090
          in your Tomcat's server.xml file.
  -->
-<!--
+  <import resource="ProxyReplay.xml"/>
 <bean name="8090" parent="8080:wayback">
   <property name="useServerName" value="true" />
-	<property name="replay">
-	  <bean class="org.archive.wayback.proxy.ProxyReplayDispatcher" />
-	</property>
+	<property name="replay" ref="proxyreplay" />
 	<property name="uriConverter">
 		<bean class="org.archive.wayback.proxy.RedirectResultURIConverter">
-		  <property name="redirectURI" value="http://foo.archive.org:8090/jsp/Redirect.jsp" />
+		  <property name="redirectURI" value="http://brad.archive.org/jsp/Redirect.jsp" />
 		</bean>
 	</property>
 	<property name="parser">
 		<bean class="org.archive.wayback.proxy.ProxyRequestParser" init-method="init">
 			<property name="localhostNames">
 				<list>
-				  <value>foo.archive.org</value>
+				  <value>brad.archive.org</value>
 				</list>
 			</property>
 			<property name="maxRecords" value="1000" />
 		</bean>
 	</property>
 </bean>
--->
 </beans>
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

Flat | Threaded

<< < 1 .. 51 52 53 54 55 .. 171 > >> (Page 53 of 171)