You can subscribe to this list here.
| 2005 | Jan | Feb | Mar | Apr | May | Jun | Jul (1) | Aug (10) | Sep (36) | Oct (339) | Nov (103) | Dec (152) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2006 | Jan (141) | Feb (102) | Mar (125) | Apr (203) | May (57) | Jun (30) | Jul (139) | Aug (46) | Sep (64) | Oct (105) | Nov (34) | Dec (162) |
| 2007 | Jan (81) | Feb (57) | Mar (141) | Apr (72) | May (9) | Jun (1) | Jul (144) | Aug (88) | Sep (40) | Oct (43) | Nov (34) | Dec (20) |
| 2008 | Jan (44) | Feb (45) | Mar (16) | Apr (36) | May (8) | Jun (77) | Jul (177) | Aug (66) | Sep (8) | Oct (33) | Nov (13) | Dec (37) |
| 2009 | Jan (2) | Feb (5) | Mar (8) | Apr | May (36) | Jun (19) | Jul (46) | Aug (8) | Sep (1) | Oct (66) | Nov (61) | Dec (10) |
| 2010 | Jan (13) | Feb (16) | Mar (38) | Apr (76) | May (47) | Jun (32) | Jul (35) | Aug (45) | Sep (20) | Oct (61) | Nov (24) | Dec (16) |
| 2011 | Jan (22) | Feb (34) | Mar (11) | Apr (8) | May (24) | Jun (23) | Jul (11) | Aug (42) | Sep (81) | Oct (48) | Nov (21) | Dec (20) |
| 2012 | Jan (30) | Feb (25) | Mar (4) | Apr (6) | May (1) | Jun (5) | Jul (5) | Aug (8) | Sep (6) | Oct (6) | Nov | Dec |
From: <bi...@us...> - 2008-07-28 19:29:07
|
Revision: 2505 http://archive-access.svn.sourceforge.net/archive-access/?rev=2505&view=rev Author: binzino Date: 2008-07-28 19:29:16 +0000 (Mon, 28 Jul 2008) Log Message: ----------- Added length metadata field to list of indexed fields. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-26 15:47:56 UTC (rev 2504) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-28 19:29:16 UTC (rev 2505) @@ -52,6 +52,7 @@ collection date type + length </value> </property> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2504 http://archive-access.svn.sourceforge.net/archive-access/?rev=2504&view=rev Author: miklosh Date: 2008-07-26 15:47:56 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Made modifications needed for using Hadoop 0.17. Display image metadata in search results. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-26 15:46:55 UTC (rev 2503) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearcherBean.java 2008-07-26 15:47:56 UTC (rev 2504) @@ -25,6 +25,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; @@ -40,13 +41,16 @@ import org.apache.lucene.store.FSDirectory; import org.apache.nutch.indexer.FsDirectory; import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; -public class ImageSearcherBean { +public class ImageSearcherBean implements ImageLoader { public static final Log LOG = LogFactory.getLog(ImageSearcherBean.class); private IndexReader reader; + private ImageDataReader imageReader; private Path baseDir; private Configuration conf; @@ -66,18 +70,9 @@ Path indexesDir = new Path(baseDir, "indexes"); if (this.fs.exists(indexesDir)) { Vector<Path> doneDirs = new Vector<Path>(); - Path[] dirs = fs.listPaths(indexesDir, new PathFilter() 
{ - - public boolean accept(Path f) { - try { - if (fs.isDirectory(f)) { - return true; - } - } catch (IOException ioe) { - } - return false; - } - }); + FileStatus[] fstats = fs.listStatus(indexesDir, + HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path[] dirs = HadoopFSUtil.getPaths(fstats); for (Path dir : dirs) { Path indexdone = new Path(dir, Indexer.DONE_NAME); if (fs.isFile(indexdone)) { @@ -95,6 +90,8 @@ Path[] indexDir = {new Path(baseDir, "index")}; init(indexDir); } + this.imageReader = new ImageDataReader(FileSystem.get(conf), + new Path(baseDir, "segments").toString(), conf); } /** Init given a set of indexes or just one index. */ @@ -124,12 +121,19 @@ if (reader != null) { reader.close(); } + if (imageReader != null) { + imageReader.close(); + } } public IndexReader getReader() { return reader; } + public ImageWritable getImage(String id) throws IOException { + return imageReader.getImage(id); + } + /** * Calculate the score for an image hit. * @param hit found hit @@ -245,6 +249,9 @@ nextDist = imgIndex < numDocImages-1 ? Math.abs(imagePositions[imgIndex+1] - pos) + (end-pos) : Integer.MAX_VALUE; } + if (imgIndex >= numDocImages) { + continue; + } // Check if this image is in the allowed proximity of the span if (dist > distThreshold) { if (LOG.isDebugEnabled()) { @@ -261,6 +268,7 @@ ImageHit newHit = new ImageHit(imageIds[imgIndex], imageUrls[imgIndex], currentDoc); newHit.docSim = docSim; newHit.docScore = docBoost; + newHit.parentUrl = doc.get("url"); newHit.proximity = Math.min(1.0f, 1.0f-((float)dist/maxDist)); newHit.score = scoreHit(newHit, doc); @@ -338,6 +346,10 @@ hits.getTotal() >= maxHits ? 
maxHits : (int)hits.getTotal()); for (ImageHit hit : top) { System.out.println(hit.score + " " + hit.url + " " + hit.imageId); + ImageWritable imageData = isb.getImage(hit.imageId); + if (imageData != null) { + System.out.println("[ " + imageData.getMetadata() + "]"); + } } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2503 http://archive-access.svn.sourceforge.net/archive-access/?rev=2503&view=rev Author: miklosh Date: 2008-07-26 15:46:55 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Normalize extracted image URLs. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java 2008-07-26 15:42:26 UTC (rev 2502) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java 2008-07-26 15:46:55 UTC (rev 2503) @@ -33,6 +33,7 @@ import org.apache.nutch.analysis.AnalyzerFactory; import org.apache.nutch.analysis.NutchAnalyzer; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NodeWalker; import org.archive.nutchwax.imagesearch.ImageSearch; import org.w3c.dom.*; @@ -42,7 +43,8 @@ public static final Log LOG = LogFactory.getLog(ImageParseFilter.class); - + private URLNormalizers normalizers; + private void findImages(Node doc, URL base, ParseData parentData, ParseResult parseResult) { // Get language @@ -87,8 +89,20 @@ try { imgSrc = new URL(base, attr.getValue()); imgUrl = imgSrc.toString(); + // Normalize it + // Replace spaces with %20 + imgUrl = imgUrl.replaceAll("\\s", "%20"); + imgUrl = normalizers.normalize(imgUrl, + URLNormalizers.SCOPE_FETCHER); + // TODO: apply NutchWAX specific URL canonicalization } catch (MalformedURLException mue) { - skipNode = true; + if (imgUrl != null) { + if (LOG.isInfoEnabled()) { + LOG.info("MalformedURL: " + imgUrl); + } + } else { + skipNode = true; + } } 
} else if ("alt".equalsIgnoreCase(attr.getName())) { altText = attr.getValue(); @@ -162,6 +176,7 @@ public void setConf(Configuration conf) { this.conf = conf; + this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER); } public Configuration getConf() { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mi...@us...> - 2008-07-26 15:42:17
|
Revision: 2502 http://archive-access.svn.sourceforge.net/archive-access/?rev=2502&view=rev Author: miklosh Date: 2008-07-26 15:42:26 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Added basic JSP-based user interface. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/build.xml Added Paths: ----------- trunk/archive-access/projects/nutchwax/imagesearch/src/web/ trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/ trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/images/ trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/images/logo.jpg trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/thumb.jsp trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/ trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/ trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/ trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/ trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch.properties trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch_en.properties trunk/archive-access/projects/nutchwax/imagesearch/src/web/web.xml Modified: trunk/archive-access/projects/nutchwax/imagesearch/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/build.xml 2008-07-26 15:37:14 UTC (rev 2501) +++ trunk/archive-access/projects/nutchwax/imagesearch/build.xml 2008-07-26 15:42:26 UTC (rev 2502) @@ -22,6 +22,7 @@ <property name="src.dir" value="src" /> <property name="lib.dir" value="lib" /> <property name="build.dir" value="${nutch.dir}/build" /> + <property name="build.plugins" value="${nutch.dir}/build/plugins" /> <!-- HACK: Need to import default.properties like Nutch does --> <property name="dist.dir" 
value="${build.dir}/nutch-1.0-dev" /> @@ -135,4 +136,56 @@ </target> + <!-- + generate the servlet context file (nutch.xml) + --> + <target name="generate-context"> + <!-- xmlcatalog definition for xslt task --> + <xmlcatalog id="docDTDs"> + <dtd publicId="-//W3C//DTD XHTML 1.0 Transitional//EN" + location="${xmlcatalog.dir}/xhtml1-transitional.dtd"/> + </xmlcatalog> + <xslt in="${nutch.dir}/conf/nutch-default.xml" + out="${build.dir}/nutch.xml" + style="${nutch.dir}/conf/context.xsl"> + <xmlcatalog refid="docDTDs"/> + <outputproperty name="indent" value="yes"/> + </xslt> + </target> + + <target name="imagesearch-war" depends="generate-context, jar"> + <war destfile="${build.dir}/imagesearch.war" + webxml="${src.dir}/web/web.xml"> + <fileset dir="${src.dir}/web/jsp"/> + <lib dir="${nutch.dir}/lib"> + <include name="lucene*.jar"/> + <include name="taglibs-*.jar"/> + <include name="hadoop-*.jar"/> + <include name="dom4j-*.jar"/> + <include name="xerces-*.jar"/> + <include name="tika-*.jar"/> + <include name="commons-cli-*.jar"/> + <include name="commons-lang-*.jar"/> + <include name="commons-logging-*.jar"/> + <include name="log4j-*.jar"/> + </lib> + <lib dir="${build.dir}"> + <include name="nutch-*.jar"/> + </lib> + + <zipfileset prefix="WEB-INF/classes" dir="${build.dir}/classes"/> + <classes dir="${nutch.dir}/conf" excludes="**/*.template"/> + <classes dir="${src.dir}/web/locale"/> + <zipfileset prefix="WEB-INF/classes/plugins" dir="${build.plugins}"> + <exclude name="parse-*/**"/> + <exclude name="protocol-*/**"/> + <exclude name="urlfilter-*/**"/> + </zipfileset> + + <webinf dir="${nutch.dir}/lib"> + <include name="taglibs-*.tld"/> + </webinf> + </war> + </target> + </project> Property changes on: trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/images/logo.jpg ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: 
trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp 2008-07-26 15:42:26 UTC (rev 2502) @@ -0,0 +1,226 @@ +<%@ page +session="false" +contentType="text/html; charset=UTF-8" +pageEncoding="UTF-8" + +import="java.io.*" +import="java.util.*" +import="java.net.*" + +import="org.archive.nutchwax.imagesearch.*" +import="org.apache.hadoop.conf.*" +import="org.apache.nutch.html.Entities" +import="org.apache.nutch.searcher.*" +import="org.apache.nutch.metadata.*" +import="org.apache.nutch.util.NutchConfiguration" +import="org.apache.nutch.plugin.*" +import="org.apache.hadoop.fs.Path" +import="org.apache.lucene.index.*" +import="org.apache.lucene.document.*" + +%> +<% +Configuration nutchConf = NutchConfiguration.get(application); + +// Get query from request +boolean haveQuery = true; +request.setCharacterEncoding("UTF-8"); +String queryString = request.getParameter("query"); +String mainTitle = "Internet Archive Image Search"; +if (queryString == null) { + queryString = ""; + haveQuery = false; +} else { + mainTitle = queryString + " - " + mainTitle; +} +String htmlQueryString = Entities.encode(queryString); + +int start = 0; +String startString = request.getParameter("start"); +if (startString != null) { + start = Integer.parseInt(startString); +} + +int hitsPerPage = 20; // number of hits to display per page +int rowLength = 5; // number of images to display per row + +int end = start+hitsPerPage; + +%><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><% +// To prevent the character encoding declared with 'contentType' page +// directive from being overriden by JSTL (apache i18n), we freeze it +// by flushing the output buffer. 
+// see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/ +out.flush(); + +String language = ResourceBundle.getBundle("org.archive.jsp.imagesearch", request.getLocale()) + .getLocale().getLanguage(); +%> +<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %> +<i18n:bundle baseName="org.archive.jsp.imagesearch"/> +<html> +<head> +<title><%=mainTitle%></title> + +<style type="text/css"><!-- +.xs {font-family:verdana,arial,helvetica,sans-serif;font-size: x-small} +body,div,span,th,td,input,.s {font-family: arial, helvetica, sans-serif; font-size: small} +.m {font-size: medium} +.lbl,a.lbl:link,a.lbl:visited,a.lbl:active,a.lbl:hover{color:#444;font-weight:bold;font-family:verdana,arial,helvetica,sans-serif;;text-decoration:none} + +.resItem {font-family: arial, helvetica, sans-serif; font-size: small} + +.site{color:#55AA55} +a:link,a:active{color:#2249cc} a:visited{color:#573875} a{text-decoration:underline} +a.res{color:#2249cc; font-size:1.3em } + +#res_pages {text-align: center; font-size:medium; font-weight:bold;} +.nav_curr {font-size:1.3em;} +#results {margin-right: 240px; text-align: justify;} +.ellipsis {font-weight: bold;} +.highlight{font-weight: bold;} + +form{margin-bottom:0px} + +--> +</style> + +</head> +<body> + +<table width="100%" border=0 cellspacing=0 cellpadding=0> +<tr><td class=xs> </td></tr> +<tr><td width=50><img src="images/logo.jpg"></td> +<td><table border=0 cellspacing=0 cellpadding=1> + <tr><td width=10><form action="results.jsp"> </td><td> + <input type=text name=query size=20 maxlength=500 value="<%=queryString%>" style='width:230px' /> + </td> + <td> <input type=submit value="<i18n:message key="search"/>" class=btn /></td> + <td> </form> + </td></tr></table> +</td></tr> +<tr><td class=xs> </td></tr> +</table> + +<% +ImageHits hitList = null; +ImageSearcherBean bean = new ImageSearcherBean(nutchConf); + +if (haveQuery) { + hitList = bean.search(queryString, end); + + long numHits = 
hitList.getTotal(); + if (end >= numHits) { + end = (int)numHits; + } + + // Output info about the results + if (numHits > 0) { + %><hr size=1 /> + <div align=left class=lbl style="padding: 0px 0px 0px"> + <i18n:message key="showingHits"> + <i18n:messageArg value="<%=new Long(start+1)%>"/> + <i18n:messageArg value="<%=new Long(end)%>"/> + <i18n:messageArg value="<%=numHits%>"/> + </i18n:message> + </div> + + <table border=0 cellpadding=20 cellspacing=0 align=center width=700> + <% + } + // Get hits + ImageHit[] hits = null; + System.out.println(numHits); + if (numHits > 0) { + hits = hitList.getHits(start, (end-start) >= hitsPerPage ? hitsPerPage : + (end-start)); + } else { + hits = new ImageHit[0]; + } + + // Output results + for (int i = 0; i < hits.length; i++) { + ImageHit hit = hits[i]; + String url = hit.url; + String fileName = null; + int slashIndex = url.lastIndexOf("/"); + if (slashIndex > 0) { + fileName = url.substring(slashIndex + 1); + } else { + fileName = url; + } + String parentUrl = hit.parentUrl; + String host = null; + String thumbnailUrl = "thumb.jsp?id=" + hit.imageId; + try { + URL u; + u = new URL(url); + host = u.getHost(); + } catch (MalformedURLException e) { + host = ""; + } + + // Load metadata + String size = null; + String dimensions = null; + String imgWidth = null; + ImageWritable image = bean.getImage(hit.imageId); + if (image != null) { + Metadata meta = image.getMetadata(); + int sizeInt = Integer.parseInt(meta.get(ImageSearch.SIZE_KEY)); + size = Integer.toString((int)Math.round(sizeInt / 1024.0)) + "k"; + dimensions = meta.get("width") + "x" + meta.get("height"); + imgWidth = ""; + } else { + size = "??k"; + dimensions = "??x??"; + // Have the downscaled original image displayed by the browser + thumbnailUrl = url; + int maxSize = nutchConf.getInt("imagesearcher.thumbnail.maxSize", 150); + imgWidth = "width=" + maxSize; + } + + if (i % rowLength == 0) { + if (i > 0) { + %></tr><% + } +%> <tr class=resultGroup><% + } +%> <td 
class=resItem width=200 valign=top align=center> + <br/><a href="<%=parentUrl%>"><img src="<%=thumbnailUrl%>" border=0 <%=imgWidth%>/></a> + <br/><%=fileName%><br/> + <span class=resMeta><%=dimensions%> - <%=size%><br/></span> + <span class=site><%=host%></span> + </td><% + } + if (hits.length == 0) { + %><i18n:message key="noMatch"><i18n:messageArg value="<%=queryString%>"/></i18n:message><% + } else { // Draw paging information + %> + </table> + <div id=res_pages> + <br/><% + int currentPage = (start - (start % hitsPerPage)) / hitsPerPage; + int currentLoc = currentPage*hitsPerPage; + int pageCounter = 0; + String encodedQuery = URLEncoder.encode(queryString, "UTF-8"); + + // Prev + if (currentPage > 0) { + int prevPageLoc = (currentPage-1)*hitsPerPage; + String prevUrl = "./results.jsp?query=" + encodedQuery + "&start=" + + prevPageLoc; + %><a class=res href="<%=prevUrl%>"><< <i18n:message key="prev"/></a> <% + } + // Next + if (currentPage*hitsPerPage + hitsPerPage < numHits) { + int nextPageLoc = (currentPage+1)*hitsPerPage; + String nextUrl = "./results.jsp?query=" + encodedQuery + "&start=" + + nextPageLoc; + %> <a class=res href="<%=nextUrl%>"><i18n:message key="next"/> >></a><% + } + } + bean.close(); +}%> + </div> +</body> \ No newline at end of file Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/thumb.jsp =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/thumb.jsp (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/thumb.jsp 2008-07-26 15:42:26 UTC (rev 2502) @@ -0,0 +1,33 @@ +<%@ page +import="java.io.*" +import="java.util.*" + +import="org.archive.nutchwax.imagesearch.*" +import="org.apache.hadoop.conf.*" +import="org.apache.nutch.util.NutchConfiguration" +import="org.apache.hadoop.fs.Path" + +%><% +Configuration nutchConf = NutchConfiguration.get(application); + +// Get id from request 
+request.setCharacterEncoding("UTF-8"); +String idString = request.getParameter("id"); +if (idString == null) { + response.sendRedirect("./results.jsp"); + return; +} + +ImageSearcherBean bean = new ImageSearcherBean(nutchConf); +ImageWritable imageData = bean.getImage(idString); +if (imageData != null) { + StoredImage thumb = imageData.getThumbnail(); + response.setContentType("image/jpg"); + OutputStream os = response.getOutputStream(); + os.write(thumb.getData()); + os.close(); +} else { + response.sendRedirect("./results.jsp"); +} +bean.close(); +%> \ No newline at end of file Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch.properties =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch.properties (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch.properties 2008-07-26 15:42:26 UTC (rev 2502) @@ -0,0 +1,5 @@ +search = Search Images +showingHits = Results <b>{0}-{1}</b> out of about <b>{2}</b>. +next = Next +prev = Previous +noMatch = Your query (<b>{0}</b>) did not match any documents.<br/>Please try different keywords. Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch_en.properties =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch_en.properties (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/locale/org/archive/jsp/imagesearch_en.properties 2008-07-26 15:42:26 UTC (rev 2502) @@ -0,0 +1,5 @@ +search = Search Images +showingHits = Results <b>{0}-{1}</b> out of about <b>{2}</b>. +next = Next +prev = Previous +noMatch = Your query (<b>{0}</b>) did not match any documents.<br/>Please try different keywords. 
Added: trunk/archive-access/projects/nutchwax/imagesearch/src/web/web.xml =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/web/web.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/web.xml 2008-07-26 15:42:26 UTC (rev 2502) @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="ISO-8859-1"?> +<!DOCTYPE web-app + PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN" + "http://java.sun.com/dtd/web-app_2_3.dtd"> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<web-app> + +<!-- order is very important here --> + +<welcome-file-list> + <welcome-file>results.jsp</welcome-file> + <welcome-file>index.html</welcome-file> + <welcome-file>index.jsp</welcome-file> +</welcome-file-list> + +<taglib> + <taglib-uri>http://jakarta.apache.org/taglibs/i18n</taglib-uri> + <taglib-location>/WEB-INF/taglibs-i18n.tld</taglib-location> + </taglib> + +</web-app> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mi...@us...> - 2008-07-26 15:37:06
|
Revision: 2501 http://archive-access.svn.sourceforge.net/archive-access/?rev=2501&view=rev Author: miklosh Date: 2008-07-26 15:37:14 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Added ImageDataReader for image metadata and thumbnail retrieval. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java Added Paths: ----------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageDataReader.java 2008-07-26 15:37:14 UTC (rev 2501) @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.imagesearch; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.nutch.util.HadoopFSUtil; + +/** Retrieves image thumbnails and metadata from segments. */ +public class ImageDataReader implements ImageLoader { + + private HashMap<String, MapFile.Reader[]> segments = new HashMap<String, MapFile.Reader[]>(); + + /** Construct given a directory containing segments. */ + ImageDataReader(FileSystem fs, String segmentsDir, Configuration conf) throws IOException { + FileStatus[] fstats = fs.listStatus(new Path(segmentsDir), + HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path[] segmentDirs = HadoopFSUtil.getPaths(fstats); + + if (segmentDirs != null) { + for (Path segmentDir : segmentDirs) { + MapFile.Reader[] readers = MapFileOutputFormat. + getReaders(fs, new Path(segmentDir, ImageWritable.IMAGE_DATA_DIR), conf); + if (readers != null) { + segments.put(segmentDir.getName(), readers); + } + } + } + } + + /** + * Loads the stored ImageWritable from disk. 
+ * @param id identifier of the image to retrieve + */ + public ImageWritable getImage(String id) throws IOException { + // TODO: try the segment in which the parent doc resides first + Text key = new Text(id); + ImageWritable holder = new ImageWritable(); + Iterator<MapFile.Reader[]> it = segments.values().iterator(); + while (it.hasNext()) { + MapFile.Reader[] readers = it.next(); + for (MapFile.Reader reader : readers) { + ImageWritable result = (ImageWritable) reader.get(key, holder); + if (result != null) { + return result; + } + } + } + return null; + } + + public void close() throws IOException { + Iterator<MapFile.Reader[]> it = segments.values().iterator(); + while (it.hasNext()) { + MapFile.Reader[] readers = it.next(); + for (MapFile.Reader reader : readers) { + reader.close(); + } + } + } +} \ No newline at end of file Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java 2008-07-26 15:35:27 UTC (rev 2500) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageHit.java 2008-07-26 15:37:14 UTC (rev 2501) @@ -23,6 +23,7 @@ public String imageId; public String url; + public String parentUrl; public float docSim; public float proximity; Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageLoader.java 2008-07-26 15:37:14 UTC (rev 2501) @@ -0,0 +1,25 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.imagesearch; + +import java.io.IOException; + +/** Interface for loading image data from disk. */ +public interface ImageLoader { + public ImageWritable getImage(String id) throws IOException; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mi...@us...> - 2008-07-26 15:35:18
|
Revision: 2500 http://archive-access.svn.sourceforge.net/archive-access/?rev=2500&view=rev Author: miklosh Date: 2008-07-26 15:35:27 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Updated ImageProcessor to store size of original image and deduplicate thumbnails based on digest. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-26 15:33:48 UTC (rev 2499) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageProcessor.java 2008-07-26 15:35:27 UTC (rev 2500) @@ -18,6 +18,7 @@ package org.archive.nutchwax.imagesearch; import java.io.IOException; +import java.util.Iterator; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -29,18 +30,21 @@ import org.apache.hadoop.mapred.MapFileOutputFormat; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; public class ImageProcessor extends Configured implements Tool, - Mapper<Text, 
Content, Text, ImageWritable> { + Mapper<Text, Content, Text, ImageWritable>, + Reducer<Text, ImageWritable, Text, ImageWritable> { private static final Log LOG = LogFactory.getLog(ImageProcessor.class); @@ -56,25 +60,44 @@ OutputCollector<Text, ImageWritable> output, Reporter reporter) throws IOException { - Metadata metadata = new Metadata(); // Check content type if (!content.getContentType().contains("image/")) { return; } // Generate thumbnail + Metadata metadata = new Metadata(); byte[] data = content.getContent(); StoredImage thumb = ThumbnailGenerator.generateThumbnail(data, thumbMaxSize, thumbMaxSize, thumbQuality, metadata); // Create and setup an ImageWritable ImageWritable image = new ImageWritable(key.toString()); + metadata.set(ImageSearch.SIZE_KEY, Integer.toString(data.length)); image.setMetadata(metadata); image.setThumbnail(thumb); - output.collect(key, image); + // Get digest of image content + Metadata contentMeta = content.getMetadata(); + String digest = contentMeta.get("digest"); + if (digest == null) { + digest = contentMeta.get(Nutch.SIGNATURE_KEY); + } + + output.collect(new Text(digest), image); } + + public void reduce(Text key, Iterator<ImageWritable> values, + OutputCollector<Text, ImageWritable> output, Reporter reporter) + throws IOException { + if (values.hasNext()) { + // Save only one instance + output.collect(key, values.next()); + return; + } + } + public void processImageContent(Path segment) throws IOException { @@ -88,6 +111,7 @@ job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(ImageProcessor.class); + job.setReducerClass(ImageProcessor.class); job.setOutputPath(new Path(segment, ImageWritable.IMAGE_DATA_DIR)); job.setOutputFormat(MapFileOutputFormat.class); Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java =================================================================== --- 
trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-26 15:33:48 UTC (rev 2499) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-07-26 15:35:27 UTC (rev 2500) @@ -20,9 +20,10 @@ public class ImageSearch { public static final String PARENT_URL_KEY = "parent_url"; public static final String ALT_TEXT_KEY = "alt"; - + public static final String IMAGE_IDS_KEY = "image_ids"; public static final String IMAGE_POS_KEY = "image_pos"; public static final String IMAGE_URLS_KEY = "image_urls"; public static final String HAS_IMAGE_KEY = "has_image"; + public static final String SIZE_KEY = "size"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mi...@us...> - 2008-07-26 15:33:40
|
Revision: 2499 http://archive-access.svn.sourceforge.net/archive-access/?rev=2499&view=rev Author: miklosh Date: 2008-07-26 15:33:48 +0000 (Sat, 26 Jul 2008) Log Message: ----------- Added DocIndexer for correctly indexing image digest information. Added Paths: ----------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java Added: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java (rev 0) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java 2008-07-26 15:33:48 UTC (rev 2499) @@ -0,0 +1,483 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.nutchwax.imagesearch; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.analysis.*; + +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.CrawlDb; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.crawl.LinkDb; +import org.apache.nutch.crawl.NutchWritable; + +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilters; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.protocol.Content; + +/** Create indexes for segments suitable for image search. */ +public class DocIndexer extends Configured implements Tool, + Reducer<Text, NutchWritable, Text, Writable>, + Mapper<Text, Writable, Text, NutchWritable> { + + public static final String DONE_NAME = "index.done"; + public static final Log LOG = LogFactory.getLog(DocIndexer.class); + + /** A utility class used to pass a lucene document from Indexer.reduce + * to Indexer.OutputFormat. + * Note: Despite its name, it can't properly wrap a lucene document - it + * doesn't know how to serialize/deserialize a lucene document. 
+ */ + private static class LuceneDocumentWrapper implements Writable { + + private Document doc; + + public LuceneDocumentWrapper(Document doc) { + this.doc = doc; + } + + public Document get() { + return doc; + } + + public void readFields(DataInput in) throws IOException { + // intentionally left blank + } + + public void write(DataOutput out) throws IOException { + // intentionally left blank + } + } + + /** Unwrap Lucene Documents created by reduce and add them to an index. */ + public static class OutputFormat + extends org.apache.hadoop.mapred.OutputFormatBase<WritableComparable, LuceneDocumentWrapper> { + + public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job, + String name, final Progressable progress) throws IOException { + final Path perm = new Path(job.getOutputPath(), name); + final Path temp = + job.getLocalPath("index/_" + Integer.toString(new Random().nextInt())); + + fs.delete(perm); // delete old, if any + + final AnalyzerFactory factory = new AnalyzerFactory(job); + final IndexWriter writer = // build locally first + new IndexWriter(fs.startLocalOutput(perm, temp).toString(), + new NutchDocumentAnalyzer(job), true); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000)); + writer.setInfoStream(LogUtil.getInfoStream(LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() { + + boolean closed; + + public void write(WritableComparable key, LuceneDocumentWrapper value) + throws IOException { // unwrap & index doc + Document doc = value.get(); + NutchAnalyzer analyzer = 
factory.get(doc.get("lang")); + if (LOG.isInfoEnabled()) { + LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" + + " with analyzer " + analyzer + + " (" + doc.get("lang") + ")"); + } + writer.addDocument(doc, analyzer); + progress.progress(); + } + + public void close(final Reporter reporter) throws IOException { + // spawn a thread to give progress heartbeats + Thread prog = new Thread() { + + public void run() { + while (!closed) { + try { + reporter.setStatus("closing"); + Thread.sleep(1000); + } catch (InterruptedException e) { + continue; + } catch (Throwable e) { + return; + } + } + } + }; + + try { + prog.start(); + if (LOG.isInfoEnabled()) { + LOG.info("Optimizing index."); + } + // optimize & close index + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, DONE_NAME)); + } finally { + closed = true; + } + } + }; + } + } + private IndexingFilters filters; + private ScoringFilters scfilters; + + public DocIndexer() { + + } + + public DocIndexer(Configuration conf) { + setConf(conf); + } + + public void configure(JobConf job) { + setConf(job); + this.filters = new IndexingFilters(getConf()); + this.scfilters = new ScoringFilters(getConf()); + } + + public void close() { + } + + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, Writable> output, Reporter reporter) + throws IOException { + Inlinks inlinks = null; + CrawlDatum dbDatum = null; + CrawlDatum fetchDatum = null; + ParseData parseData = null; + ParseText parseText = null; + Metadata imageUrlMapping = new Metadata(); + while (values.hasNext()) { + Writable value = values.next().get(); // unwrap + if (value instanceof Inlinks) { + inlinks = (Inlinks) value; + } else if (value instanceof CrawlDatum) { + CrawlDatum datum = (CrawlDatum) value; + if (CrawlDatum.hasDbStatus(datum)) { + dbDatum = datum; + } else if (CrawlDatum.hasFetchStatus(datum)) { + // don't index unmodified (empty) 
pages + if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) { + fetchDatum = datum; + } + } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() || + CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) { + continue; + } else { + throw new RuntimeException("Unexpected status: " + datum.getStatus()); + } + } else if (value instanceof ParseData) { + parseData = (ParseData) value; + } else if (value instanceof ParseText) { + parseText = (ParseText) value; + } else if (value instanceof Metadata) { + // Add image URL->digest mapping + Metadata mapping = (Metadata) value; + String[] imageUrls = mapping.names(); + for (String imageUrl : imageUrls) { + if (imageUrlMapping.get(imageUrl) == null) { + imageUrlMapping.add(imageUrl, mapping.get(imageUrl)); + } + } + } else if (LOG.isWarnEnabled()) { + LOG.warn("Unrecognized type: " + value.getClass()); + } + } + + if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) { + return; // only have inlinks + } + + if (!parseData.getStatus().isSuccess() || + fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) { + return; + } + + Document doc = new Document(); + Metadata metadata = parseData.getContentMeta(); + + // replace image ids with proper ones + if (imageUrlMapping.size() > 0) { + Metadata parseMeta = parseData.getParseMeta(); + parseMeta.remove(ImageSearch.IMAGE_IDS_KEY); + String[] imageUrls = parseMeta.getValues(ImageSearch.IMAGE_URLS_KEY); + for (String imageUrl : imageUrls) { + String mappedTo = imageUrlMapping.get(imageUrl); + if (mappedTo == null) { + if (LOG.isInfoEnabled()) { + LOG.info("No digest information for " + imageUrl); + } + parseMeta.add(ImageSearch.IMAGE_IDS_KEY, "-"); + continue; + } + parseMeta.add(ImageSearch.IMAGE_IDS_KEY, mappedTo); + } + } + + // add segment, used to map from merged index back to segment files + doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY), + Field.Store.YES, Field.Index.NO)); + + // add digest, used by dedup + 
doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY), + Field.Store.YES, Field.Index.NO)); + + Parse parse = new ParseImpl(parseText, parseData); + try { + // extract information from dbDatum and pass it to + // fetchDatum so that indexing filters can use it + Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + if (url != null) { + fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url); + } + // run indexing filters + doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks); + } catch (IndexingException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error indexing " + key + ": " + e); + } + return; + } + + // skip documents discarded by indexing filters + if (doc == null) { + return; + } + + float boost = 1.0f; + // run scoring filters + try { + boost = this.scfilters.indexerScore((Text) key, doc, dbDatum, + fetchDatum, parse, inlinks, boost); + } catch (ScoringFilterException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Error calculating score " + key + ": " + e); + } + return; + } + // apply boost to all indexed fields. + doc.setBoost(boost); + // store boost for use by explain and dedup + doc.add(new Field("boost", Float.toString(boost), + Field.Store.YES, Field.Index.NO)); + + output.collect(key, new LuceneDocumentWrapper(doc)); + } + + /** + * Emits image URLs as keys and their URL+digest as values. 
+ */ + public static class ImageUrlEmitter + implements Mapper<Text, Writable, Text, Text>, + Reducer<Text, Text, Text, Metadata> { + + public void map(Text key, Writable value, + OutputCollector<Text, Text> output, Reporter reporter) + throws IOException { + + if (value instanceof ParseData) { + ParseData parseData = (ParseData) value; + Metadata parseMeta = parseData.getParseMeta(); + String[] imageUrls = parseMeta.getValues(ImageSearch.IMAGE_URLS_KEY); + if (imageUrls.length > 0) { + for (String url : imageUrls) { + output.collect(new Text(url), key); + } + } + } else if (value instanceof Content) { + Content content = (Content) value; + if (content.getContentType().contains("image/")) { + Metadata meta = content.getMetadata(); + // Using NutchWax.DIGEST_KEY here + String digest = meta.get("digest"); + if (digest == null) { + digest = meta.get(Metadata.SIGNATURE_KEY); + } + output.collect(new Text(content.getUrl()), new Text(digest)); + } + } + } + + public void reduce(Text key, Iterator<Text> values, + OutputCollector<Text, Metadata> output, Reporter reporter) + throws IOException { + + Vector<Text> parents = new Vector<Text>(); + String imageUrl = key.toString(); + String imageDigest = null; + while (values.hasNext()) { + Text data = values.next(); + String value = data.toString(); + // Determine type of value + if (value.contains("/")) { + // This value is a parent's key + parents.add(data); + } else { + // This value is a digest + imageDigest = value.toString(); + } + } + if (imageDigest != null) { + Metadata meta = new Metadata(); + meta.add(imageUrl, imageDigest); + Iterator<Text> it = parents.iterator(); + while (it.hasNext()) { + Text parentKey = it.next(); + output.collect(parentKey, meta); + } + } + } + + public void configure(JobConf job) {} + public void close() {} + } + + public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) + throws IOException { + + if (LOG.isInfoEnabled()) { + LOG.info("DocIndexer: starting"); + 
LOG.info("DocIndexer: linkdb: " + linkDb); + } + + /* + * First phase: determining image keys + */ + Path outDir = new Path("imgkeys-"+ + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + JobConf job = new NutchJob(getConf()); + job.setJobName("imagekeys " + indexDir); + for (int i = 0; i < segments.length; i++) { + job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); + job.addInputPath(new Path(segments[i], Content.DIR_NAME)); + } + + job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(ImageUrlEmitter.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(Text.class); + job.setReducerClass(ImageUrlEmitter.class); + + job.setOutputPath(outDir); + job.setOutputFormat(SequenceFileOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Metadata.class); + + JobClient.runJob(job); + + /* + * Second phase: creating Lucene index + */ + job = new NutchJob(getConf()); + job.setJobName("index " + indexDir); + + for (int i = 0; i < segments.length; i++) { + if (LOG.isInfoEnabled()) { + LOG.info("DocIndexer: adding segment: " + segments[i]); + } + job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME)); + job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME)); + job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); + job.addInputPath(new Path(segments[i], ParseText.DIR_NAME)); + } + + job.addInputPath(outDir); + job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME)); + job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME)); + job.setInputFormat(SequenceFileInputFormat.class); + + job.setMapperClass(DocIndexer.class); + job.setReducerClass(DocIndexer.class); + + job.setOutputPath(indexDir); + job.setOutputFormat(OutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(NutchWritable.class); + + JobClient.runJob(job); + + FileSystem fs = FileSystem.get(getConf()); + fs.delete(outDir); + + if (LOG.isInfoEnabled()) { + 
LOG.info("DocIndexer: done"); + } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new DocIndexer(), args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + + if (args.length < 4) { + System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ..."); + return -1; + } + + Path[] segments = new Path[args.length - 3]; + for (int i = 3; i < args.length; i++) { + segments[i - 3] = new Path(args[i]); + } + + try { + index(new Path(args[0]), new Path(args[1]), new Path(args[2]), + segments); + return 0; + } catch (Exception e) { + LOG.fatal("DocIndexer: " + StringUtils.stringifyException(e)); + return -1; + } + } + + public void map(Text key, Writable value, + OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { + output.collect(key, new NutchWritable(value)); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2498 http://archive-access.svn.sourceforge.net/archive-access/?rev=2498&view=rev Author: bradtofel Date: 2008-07-26 01:38:07 +0000 (Sat, 26 Jul 2008) Log Message: ----------- TWEAK: removed unneeded suppressWarnings Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java 2008-07-26 01:37:41 UTC (rev 2497) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java 2008-07-26 01:38:07 UTC (rev 2498) @@ -152,7 +152,6 @@ } } - @SuppressWarnings("unchecked") protected RangeGroup getRangeGroupForRequest(WaybackRequest wbRequest) throws BadQueryException, ResourceIndexNotAvailableException { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2497 http://archive-access.svn.sourceforge.net/archive-access/?rev=2497&view=rev Author: bradtofel Date: 2008-07-26 01:37:41 +0000 (Sat, 26 Jul 2008) Log Message: ----------- TWEAK: removed unneeded suppressWarnings Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java 2008-07-25 21:26:03 UTC (rev 2496) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java 2008-07-26 01:37:41 UTC (rev 2497) @@ -45,7 +45,6 @@ * @throws IOException * @throws UnsupportedOperationException */ - @SuppressWarnings("unchecked") public void addSearchResult(CaptureSearchResult result) throws UnsupportedOperationException, IOException { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-25 21:25:54
|
Revision: 2496 http://archive-access.svn.sourceforge.net/archive-access/?rev=2496&view=rev Author: bradtofel Date: 2008-07-25 21:26:03 +0000 (Fri, 25 Jul 2008) Log Message: ----------- TWEAK: added beanshell dependency Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/pom.xml Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2008-07-25 20:33:59 UTC (rev 2495) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2008-07-25 21:26:03 UTC (rev 2496) @@ -79,6 +79,11 @@ <artifactId>spring-beans</artifactId> <version>2.5.1</version> </dependency> + <dependency> + <groupId>org.beanshell</groupId> + <artifactId>bsh</artifactId> + <version>2.0b4</version> + </dependency> <!-- Doh... I'm not sure what package is configuring org.apache.commons-logging to use log4j, but it's breaking some command line tools. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-25 20:33:50
|
Revision: 2495 http://archive-access.svn.sourceforge.net/archive-access/?rev=2495&view=rev Author: binzino Date: 2008-07-25 20:33:59 +0000 (Fri, 25 Jul 2008) Log Message: ----------- Changed "none" to "unknown" for HTTPStatusCodeFilter to avoid confusion over whether "none" means "nothing is allowed at all" vs. "no code for this record". Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-25 20:24:53 UTC (rev 2494) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-25 20:33:59 UTC (rev 2495) @@ -715,10 +715,10 @@ { Range range = new Range( ); - // Special handling for "none" where an ARCRecord doesn't have + // Special handling for "unknown" where an ARCRecord doesn't have // an HTTP status code. The ARCRecord.getStatusCode() returns // -1 in that case, so we make a range for it. - if ( value.toLowerCase( ).equals( "none" ) ) + if ( value.toLowerCase( ).equals( "unknown" ) ) { range.lower = -1; range.upper = -1; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-25 20:24:45
|
Revision: 2494 http://archive-access.svn.sourceforge.net/archive-access/?rev=2494&view=rev Author: binzino Date: 2008-07-25 20:24:53 +0000 (Fri, 25 Jul 2008) Log Message: ----------- Added HTTPStatusCodeFilter and configuration thereof in nutch-site.xml. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-25 02:46:16 UTC (rev 2493) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-25 20:24:53 UTC (rev 2494) @@ -32,7 +32,7 @@ </property> <property> - <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the ArcsToSegment are mapped to the Lucene documents during indexing. + <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing. 
The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key" Where the only required part is the "src-key", the rest will assume the following defaults: lowercase = true @@ -111,9 +111,16 @@ <property> <name>nutchwax.urlfilter.wayback.canonicalizer</name> <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value> - <description></description> + <description>Implementation of URL canonicalizer to use.</description> </property> +<property> + <name>nutchwax.filter.http.status</name> + <value> + 200-299 + </value> +</property> + <!-- Similar to Nutch's file.content.limit http.content.limit Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-25 02:46:16 UTC (rev 2493) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-25 20:24:53 UTC (rev 2494) @@ -20,6 +20,8 @@ import java.net.MalformedURLException; import java.util.Map.Entry; import java.util.Iterator; +import java.util.List; +import java.util.ArrayList; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -106,12 +108,8 @@ private ParseUtil parseUtil; private URLNormalizers normalizers; private int interval; + private HTTPStatusCodeFilter httpStatusCodeFilter; - private long numSkipped; - private long numImported; - private long bytesSkipped; - private long bytesImported; - /** * ?: Is this necessary? 
*/ @@ -146,6 +144,8 @@ this.parseUtil = new ParseUtil ( jobConf ); this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER ); this.interval = jobConf.getInt( "db.fetch.interval.default", 2592000 ); + + this.httpStatusCodeFilter = new HTTPStatusCodeFilter( jobConf.get( "nutchwax.filter.http.status" ) ); } /** @@ -233,6 +233,13 @@ if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); + if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() ); + + return false; + } + try { // Skip the HTTP headers in the response body, so that the @@ -313,6 +320,7 @@ contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) ); + contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY, String.valueOf( record.getStatusCode() ) ); Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); @@ -677,3 +685,96 @@ } } + + +/** + * This should all be moved into some sort of filtering plugin. + * Unfortunately the URLFilter plugin interface isn't adequate as it + * only looks at a URL string. Rather than jamming a response code + * through that interface, we do a one-off filter class here. + * + * A long-term solution would be to create a new Nutch extension point + * interface that takes an ARCRecord rather than a URL string. That + * way we can write filters that can operate on any part of an + * ARCRecord, not just the URL. 
+ */ +class HTTPStatusCodeFilter +{ + List<Range> ranges = new ArrayList<Range>( ); + + public HTTPStatusCodeFilter( String configuration ) + { + if ( configuration == null ) + { + return ; + } + + configuration = configuration.trim( ); + + for ( String value : configuration.split( "\\s+" ) ) + { + Range range = new Range( ); + + // Special handling for "none" where an ARCRecord doesn't have + // an HTTP status code. The ARCRecord.getStatusCode() returns + // -1 in that case, so we make a range for it. + if ( value.toLowerCase( ).equals( "none" ) ) + { + range.lower = -1; + range.upper = -1; + + this.ranges.add( range ); + + continue; + } + + String values[] = value.split( "[-]" ); + + try + { + switch ( values.length ) + { + case 2: + // It's a range, N-M + range.lower = Integer.parseInt( values[0] ); + range.upper = Integer.parseInt( values[1] ); + break; + + case 1: + // It's a single value, convert to a single-value range + range.lower = Integer.parseInt( values[0] ); + range.upper = range.lower; + break; + + default: + // Bad format + Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + range ); + continue ; + } + + this.ranges.add( range ); + } + catch ( NumberFormatException nfe ) + { + Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + range, nfe ); + } + } + + } + + public boolean isAllowed( int code ) + { + for ( Range r : this.ranges ) + { + return ( r.lower <= code && code <= r.upper ); + } + + return false; + } + + static class Range + { + int lower; + int upper; + } +} Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-25 02:46:16 UTC (rev 2493) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-25 20:24:53 UTC (rev 2494) @@ -31,4 +31,5 @@ 
public static final String DIGEST_KEY = "digest"; public static final String CONTENT_TYPE_KEY = "type"; public static final String CONTENT_LENGTH_KEY = "length"; + public static final String HTTP_RESPONSE_KEY = "http_response_code"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-25 02:46:07
|
Revision: 2493 http://archive-access.svn.sourceforge.net/archive-access/?rev=2493&view=rev Author: binzino Date: 2008-07-25 02:46:16 +0000 (Fri, 25 Jul 2008) Log Message: ----------- Integrated into Hadoop framework via Tool interface and Configured superclass. This enables us to read Nutch(WAX) configuration properties, in particular the url canonicalizer implementation to use. Fix JIRA: WAX-6. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-07-24 23:35:54 UTC (rev 2492) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-07-25 02:46:16 UTC (rev 2493) @@ -38,8 +38,14 @@ import org.apache.lucene.document.Field; import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import org.apache.nutch.util.NutchConfiguration; + import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; import org.archive.nutchwax.NutchWax; @@ -48,10 +54,9 @@ * Reads series of (digest+URL,date) lines, finds corresponding * document in index, and adds the date to it. 
*/ -public class DateAdder +public class DateAdder extends Configured implements Tool { - public static void main(String[] args) - throws Exception + public int run( String[] args ) throws Exception { if ( args.length < 4 ) { @@ -111,7 +116,7 @@ IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true ); - UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer( ); + UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) ); for ( int i = 0 ; i < reader.numDocs( ) ; i++ ) { @@ -155,6 +160,47 @@ reader.close( ); writer.close( ); + + return 0; } + + /** + * Utility function to instantiate a UrlCanonicalizer based on an + * implementation specified in the configuration. + */ + public static UrlCanonicalizer getCanonicalizer( Configuration conf ) + { + // Which Wayback canonicalizer to use: Aggressive, Identity, etc. + String canonicalizerClassName = conf.get( "nutchwax.urlfilter.wayback.canonicalizer" ); + + if ( canonicalizerClassName == null || canonicalizerClassName.trim().length() == 0 ) + { + throw new RuntimeException( "Missing value for property: nutchwax.urlfilter.wayback.canonicalizer" ); + } + + try + { + UrlCanonicalizer canonicalizer = (UrlCanonicalizer) Class.forName( canonicalizerClassName ).newInstance( ); + + return canonicalizer; + } + catch ( Exception e ) + { + // If we can't instantiate it, there's not much else we can do + // other than just throw the Exception. + throw new RuntimeException( e ); + } + } + + /** + * Command-line driver. Runs the Importer as a Hadoop job. + */ + public static void main( String args[] ) throws Exception + { + int result = ToolRunner.run( NutchConfiguration.create(), new DateAdder(), args ); + + System.exit( result ); + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-24 23:35:44
|
Revision: 2492 http://archive-access.svn.sourceforge.net/archive-access/?rev=2492&view=rev Author: binzino Date: 2008-07-24 23:35:54 +0000 (Thu, 24 Jul 2008) Log Message: ----------- Fix bug in org.archive.net.RsyncURLHandler where only part of URL was being used in call to 'rsync' command. Commit updated version of library. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/lib/commons-2.0.1-SNAPSHOT.jar This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-24 23:34:37
|
Revision: 2491 http://archive-access.svn.sourceforge.net/archive-access/?rev=2491&view=rev Author: binzino Date: 2008-07-24 23:34:46 +0000 (Thu, 24 Jul 2008) Log Message: ----------- Add content-length to metadata stored for imported document. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-24 23:31:54 UTC (rev 2490) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-24 23:34:46 UTC (rev 2491) @@ -231,7 +231,7 @@ { ARCRecordMetaData meta = record.getMetaData(); - if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ")" ); + if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); try { @@ -302,16 +302,18 @@ // We store both the normal URL and the URL+digest key for // later retrieval by the indexing plugin(s). 
- contentMetadata.set( NutchWax.URL_KEY, url ); - contentMetadata.set( NutchWax.ORIG_KEY, key ); + contentMetadata.set( NutchWax.URL_KEY, url ); + contentMetadata.set( NutchWax.ORIG_KEY, key ); - contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); - contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); - contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); - contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); - contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); - contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); + contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); + contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); + contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); + contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); + contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); + contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) ); + Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); output( output, new Text( key ), content ); Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-24 23:31:54 UTC (rev 2490) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-24 23:34:46 UTC (rev 2491) @@ -22,12 +22,13 @@ public class NutchWax { - public static final String URL_KEY = "url"; - public static final String ORIG_KEY = "orig"; - public static final String FILENAME_KEY = "filename"; - public static final String FILEOFFSET_KEY = "fileoffset"; - public 
static final String COLLECTION_KEY = "collection"; - public static final String CONTENT_TYPE_KEY = "type"; - public static final String DATE_KEY = "date"; - public static final String DIGEST_KEY = "digest"; + public static final String URL_KEY = "url"; + public static final String ORIG_KEY = "orig"; + public static final String FILENAME_KEY = "filename"; + public static final String FILEOFFSET_KEY = "fileoffset"; + public static final String COLLECTION_KEY = "collection"; + public static final String DATE_KEY = "date"; + public static final String DIGEST_KEY = "digest"; + public static final String CONTENT_TYPE_KEY = "type"; + public static final String CONTENT_LENGTH_KEY = "length"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-07-24 23:31:45
|
Revision: 2490 http://archive-access.svn.sourceforge.net/archive-access/?rev=2490&view=rev Author: binzino Date: 2008-07-24 23:31:54 +0000 (Thu, 24 Jul 2008) Log Message: ----------- Initial check of servlet filter which applies an XSLT transform to the output of the filter chain. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java 2008-07-24 23:31:54 UTC (rev 2490) @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.CharArrayWriter; + +import javax.servlet.Filter; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ServletContext; +import javax.servlet.ServletException; +import javax.servlet.ServletOutputStream; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.ServletResponseWrapper; +import javax.servlet.http.*; + +import javax.xml.transform.Source; +import javax.xml.transform.stream.StreamSource; +import javax.xml.transform.Templates; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + + +public class XSLTFilter implements Filter +{ + private String xsltUrl; + private String contentType; + + public void init( FilterConfig config ) + throws ServletException + { + ServletContext app = config.getServletContext( ); + + this.xsltUrl = config.getInitParameter( "xsltUrl" ); + + if ( this.xsltUrl != null ) + { + this.xsltUrl = this.xsltUrl.trim( ); + + if ( this.xsltUrl.length( ) == 0 ) + { + this.xsltUrl = null; + } + } + + this.contentType = config.getInitParameter( "contentType" ); + + if ( this.contentType != null ) + { + this.contentType = this.contentType.trim( ); + + if ( this.contentType.length( ) == 0 ) + { + this.contentType = null; + } + } + + if ( this.contentType == null ) + { + this.contentType = "application/xml"; + } + } + + public void doFilter( ServletRequest request, ServletResponse response, FilterChain chain 
) + throws IOException, ServletException + { + if ( this.xsltUrl != null ) + { + ByteArrayOutputStream baos = new ByteArrayOutputStream( 8 * 1024 ); + + HttpServletResponseInterceptor capturedResponse = new HttpServletResponseInterceptor( (HttpServletResponse) response, baos ); + + chain.doFilter( request, capturedResponse ); + + byte output[] = baos.toByteArray( ); + + try + { + Source xsltSource = new StreamSource( xsltUrl ); + Templates xsltTemplates = TransformerFactory.newInstance( ).newTemplates( xsltSource ); + Transformer transformer = xsltTemplates.newTransformer( ); + + StreamSource source = new StreamSource( new ByteArrayInputStream( output ) ); + StreamResult result = new StreamResult( response.getOutputStream( ) ); + + // Enforce XML content-type in the response. + response.setContentType( this.contentType ); + + transformer.transform( source, result ); + } + catch ( javax.xml.transform.TransformerConfigurationException tce ) + { + } + catch( javax.xml.transform.TransformerException te ) + { + } + } + else + { + chain.doFilter( request, response ); + } + } + + public void destroy() + { + + } + +} + + +class HttpServletResponseInterceptor extends HttpServletResponseWrapper +{ + private OutputStream os; + + HttpServletResponseInterceptor( HttpServletResponse response, OutputStream os ) + { + super( response ); + + this.os = os; + } + + public ServletOutputStream getOutputStream() + { + ServletOutputStream sos = new ServletOutputStream( ) + { + public void write( int b ) + throws java.io.IOException + { + HttpServletResponseInterceptor.this.os.write( b ); + } + }; + + return sos; + } + + public PrintWriter getWriter( ) + { + PrintWriter pw = new PrintWriter( this.os ); + + return pw; + } + +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-23 01:10:19
|
Revision: 2489 http://archive-access.svn.sourceforge.net/archive-access/?rev=2489&view=rev Author: bradtofel Date: 2008-07-23 01:10:27 +0000 (Wed, 23 Jul 2008) Log Message: ----------- TWEAK: changing from ObjectFilter to Adapter... Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java 2008-07-23 01:07:34 UTC (rev 2488) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java 2008-07-23 01:10:27 UTC (rev 2489) @@ -1,74 +0,0 @@ -/* LegacyToIdentityFilter - * - * $Id$ - * - * Created on 11:48:56 AM Jul 10, 2008. - * - * Copyright (C) 2008 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.adapters; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.ObjectFilter; -import org.archive.wayback.util.url.UrlOperations; - -/** - * CaptureSearchResult ObjectFilter which passes through all inputs, modifying - * each to construct a corrected original URL to comply with new Identity - * format. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> { - private final static String DEFAULT_SCHEME = "http://"; - - private int getEndOfHostIndex(String url) { - int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); - int pathIdx = url.indexOf(UrlOperations.PATH_START); - if(portIdx == -1 && pathIdx == -1) { - return url.length(); - } - if(portIdx == -1) { - return pathIdx; - } - if(pathIdx == -1) { - return portIdx; - } - if(pathIdx > portIdx) { - return portIdx; - } else { - return pathIdx; - } - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) - */ - public int filterObject(CaptureSearchResult o) { - String urlKey = o.getUrlKey(); - StringBuilder sb = new StringBuilder(urlKey.length()); - sb.append(DEFAULT_SCHEME); - sb.append(o.getOriginalUrl()); - sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); - o.setOriginalUrl(sb.toString()); - return FILTER_INCLUDE; - } - -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java (from rev 2488, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java) 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java 2008-07-23 01:10:27 UTC (rev 2489) @@ -0,0 +1,74 @@ +/* LegacyToIdentityFilter + * + * $Id$ + * + * Created on 11:48:56 AM Jul 10, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.adapters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; + +/** + * CaptureSearchResult ObjectFilter which passes through all inputs, modifying + * each to construct a corrected original URL to comply with new Identity + * format. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class LegacyToIdentitySearchResultAdapter implements ObjectFilter<CaptureSearchResult> { + private final static String DEFAULT_SCHEME = "http://"; + + private int getEndOfHostIndex(String url) { + int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + int pathIdx = url.indexOf(UrlOperations.PATH_START); + if(portIdx == -1 && pathIdx == -1) { + return url.length(); + } + if(portIdx == -1) { + return pathIdx; + } + if(pathIdx == -1) { + return portIdx; + } + if(pathIdx > portIdx) { + return portIdx; + } else { + return pathIdx; + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) + */ + public int filterObject(CaptureSearchResult o) { + String urlKey = o.getUrlKey(); + StringBuilder sb = new StringBuilder(urlKey.length()); + sb.append(DEFAULT_SCHEME); + sb.append(o.getOriginalUrl()); + sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); + o.setOriginalUrl(sb.toString()); + return FILTER_INCLUDE; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-23 01:07:26
|
Revision: 2488 http://archive-access.svn.sourceforge.net/archive-access/?rev=2488&view=rev Author: bradtofel Date: 2008-07-23 01:07:34 +0000 (Wed, 23 Jul 2008) Log Message: ----------- REFACTOR: moved various Adapter<*SearchResult> into org.archive.wayback.resourceindex.adapters Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java (from rev 2483, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java 2008-07-23 01:07:34 UTC (rev 2488) @@ -0,0 +1,74 @@ +/* LegacyToIdentityFilter + * + * $Id$ + * + * Created on 11:48:56 AM Jul 10, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.adapters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; + +/** + * CaptureSearchResult ObjectFilter which passes through all inputs, modifying + * each to construct a corrected original URL to comply with new Identity + * format. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> { + private final static String DEFAULT_SCHEME = "http://"; + + private int getEndOfHostIndex(String url) { + int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + int pathIdx = url.indexOf(UrlOperations.PATH_START); + if(portIdx == -1 && pathIdx == -1) { + return url.length(); + } + if(portIdx == -1) { + return pathIdx; + } + if(pathIdx == -1) { + return portIdx; + } + if(pathIdx > portIdx) { + return portIdx; + } else { + return pathIdx; + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) + */ + public int filterObject(CaptureSearchResult o) { + String urlKey = o.getUrlKey(); + StringBuilder sb = new StringBuilder(urlKey.length()); + sb.append(DEFAULT_SCHEME); + sb.append(o.getOriginalUrl()); + sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); + o.setOriginalUrl(sb.toString()); + return FILTER_INCLUDE; + } + +} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java 2008-07-23 01:06:29 
UTC (rev 2487) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java 2008-07-23 01:07:34 UTC (rev 2488) @@ -1,74 +0,0 @@ -/* LegacyToIdentityFilter - * - * $Id$ - * - * Created on 11:48:56 AM Jul 10, 2008. - * - * Copyright (C) 2008 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.filters; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.ObjectFilter; -import org.archive.wayback.util.url.UrlOperations; - -/** - * CaptureSearchResult ObjectFilter which passes through all inputs, modifying - * each to construct a corrected original URL to comply with new Identity - * format. 
- * - * @author brad - * @version $Date$, $Revision$ - */ -public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> { - private final static String DEFAULT_SCHEME = "http://"; - - private int getEndOfHostIndex(String url) { - int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); - int pathIdx = url.indexOf(UrlOperations.PATH_START); - if(portIdx == -1 && pathIdx == -1) { - return url.length(); - } - if(portIdx == -1) { - return pathIdx; - } - if(pathIdx == -1) { - return portIdx; - } - if(pathIdx > portIdx) { - return portIdx; - } else { - return pathIdx; - } - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) - */ - public int filterObject(CaptureSearchResult o) { - String urlKey = o.getUrlKey(); - StringBuilder sb = new StringBuilder(urlKey.length()); - sb.append(DEFAULT_SCHEME); - sb.append(o.getOriginalUrl()); - sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); - o.setOriginalUrl(sb.toString()); - return FILTER_INCLUDE; - } - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-23 01:06:22
|
Revision: 2487 http://archive-access.svn.sourceforge.net/archive-access/?rev=2487&view=rev Author: bradtofel Date: 2008-07-23 01:06:29 +0000 (Wed, 23 Jul 2008) Log Message: ----------- REFACTOR: moved various Adapter<*SearchResult> into org.archive.wayback.resourceindex.adapters Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java 2008-07-23 01:04:09 UTC (rev 2486) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -1,113 +0,0 @@ -/* CaptureToUrlSearchResultAdapter - * - * $Id$ - * - * Created on 4:45:55 PM Jun 28, 2008. - * - * Copyright (C) 2008 Internet Archive. - * - * This file is part of wayback. 
- * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.core; - -import java.util.HashMap; - -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class CaptureToUrlSearchResultAdapter - implements Adapter<CaptureSearchResult, UrlSearchResult> { - - private String currentUrl; - private String originalUrl; - private String firstCapture; - private String lastCapture; - private int numCaptures; - private HashMap<String,Object> digests; - private UrlSearchResult resultRef = null; - public CaptureToUrlSearchResultAdapter() { - - } - private UrlSearchResult makeUrlSearchResult(CaptureSearchResult result) { - currentUrl = result.getUrlKey(); - originalUrl = result.getOriginalUrl(); - firstCapture = result.getCaptureTimestamp(); - lastCapture = firstCapture; - digests = new HashMap<String,Object>(); - digests.put(result.getDigest(),null); - numCaptures = 1; - - resultRef = new UrlSearchResult(); - resultRef.setUrlKey(currentUrl); - resultRef.setOriginalUrl(originalUrl); - resultRef.setFirstCapture(firstCapture); - resultRef.setLastCapture(lastCapture); - resultRef.setNumCaptures(1); - resultRef.setNumVersions(1); - return resultRef; - } - - /* 
(non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public UrlSearchResult adapt(CaptureSearchResult c) { - String urlKey = c.getUrlKey(); - if(resultRef == null || !currentUrl.equals(urlKey)) { - return makeUrlSearchResult(c); - } - - // same url -- accumulate into the last one we returned: - String captureDate = c.getCaptureTimestamp(); - if(captureDate.compareTo(firstCapture) < 0) { - firstCapture = captureDate; - resultRef.setFirstCapture(firstCapture); - } - if(captureDate.compareTo(lastCapture) > 0) { - lastCapture = captureDate; - resultRef.setLastCapture(lastCapture); - } - numCaptures++; - digests.put(c.getDigest(), null); - resultRef.setNumCaptures(numCaptures); - resultRef.setNumVersions(digests.size()); - return null; - } - public static CloseableIterator<UrlSearchResult> adaptCaptureIterator( - CloseableIterator<CaptureSearchResult> itr) { - - // HACKHACK: this is pretty lame. We return an UrlSearchResult the - // first time we see a new urlKey, and cache a reference to the returned - // UrlSearchResult, updating it as we see subsequent CaptureSearchResult - // objects with the same urlKey. - // This means that users of the returned UrlSearchResult need to wait - // until they've got the *next* returned UrlSearchResult before using - // the *previous* UrlSearchResult. - // At the moment, this all happens inside a LocalResourceIndex, so - // none of the UrlSearchResult objects should be seen/used in any - // significant way before they've all be accumulated into an - // UrlSearchResults object.. 
- return new AdaptedIterator<CaptureSearchResult,UrlSearchResult>(itr, - new CaptureToUrlSearchResultAdapter()); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-07-23 01:04:09 UTC (rev 2486) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -1,63 +0,0 @@ -package org.archive.wayback.resourceindex; - -import java.util.HashMap; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.Adapter; - -/** - * Adapter class that observes a stream of SearchResults tracking for each - * complete record, a mapping of that records digest to: - * Arc/Warc Filename - * Arc/Warc offset - * HTTP Response - * MIME-Type - * Redirect URL - * - * If subsequent SearchResults are missing these fields ("-") and the Digest - * field has been seen, then the subsequent SearchResults are updated with the - * values from the kept copy matching that digest, and an additional annotation - * field is added. 
- * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class DeduplicationSearchResultAnnotationAdapter -implements Adapter<CaptureSearchResult,CaptureSearchResult> { - private final static String EMPTY_VALUE = "-"; - - private HashMap<String,CaptureSearchResult> memory = null; - - public DeduplicationSearchResultAnnotationAdapter() { - memory = new HashMap<String,CaptureSearchResult>(); - } - - private CaptureSearchResult annotate(CaptureSearchResult o) { - String thisDigest = o.getDigest(); - CaptureSearchResult last = memory.get(thisDigest); - if(last == null) { - // TODO: log missing record digest reference - return null; - } - o.setFile(last.getFile()); - o.setOffset(last.getOffset()); - o.setHttpCode(last.getHttpCode()); - o.setMimeType(last.getMimeType()); - o.setRedirectUrl(last.getRedirectUrl()); - o.flagDuplicateDigest(last.getCaptureTimestamp()); - return o; - } - - private CaptureSearchResult remember(CaptureSearchResult o) { - memory.put(o.getDigest(),o); - return o; - } - - public CaptureSearchResult adapt(CaptureSearchResult o) { - if(o.getFile().equals(EMPTY_VALUE)) { - return annotate(o); - } - return remember(o); - } -} \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-07-23 01:04:09 UTC (rev 2486) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -34,7 +34,6 @@ import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; -import org.archive.wayback.core.CaptureToUrlSearchResultAdapter; import 
org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; import org.archive.wayback.core.UrlSearchResult; @@ -44,6 +43,8 @@ import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultAdapter; +import org.archive.wayback.resourceindex.adapters.DeduplicationSearchResultAnnotationAdapter; import org.archive.wayback.resourceindex.filters.CounterFilter; import org.archive.wayback.resourceindex.filters.DateRangeFilter; import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java (from rev 2448, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -0,0 +1,115 @@ +/* CaptureToUrlSearchResultAdapter + * + * $Id$ + * + * Created on 4:45:55 PM Jun 28, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.adapters; + +import java.util.HashMap; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.UrlSearchResult; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class CaptureToUrlSearchResultAdapter + implements Adapter<CaptureSearchResult, UrlSearchResult> { + + private String currentUrl; + private String originalUrl; + private String firstCapture; + private String lastCapture; + private int numCaptures; + private HashMap<String,Object> digests; + private UrlSearchResult resultRef = null; + public CaptureToUrlSearchResultAdapter() { + + } + private UrlSearchResult makeUrlSearchResult(CaptureSearchResult result) { + currentUrl = result.getUrlKey(); + originalUrl = result.getOriginalUrl(); + firstCapture = result.getCaptureTimestamp(); + lastCapture = firstCapture; + digests = new HashMap<String,Object>(); + digests.put(result.getDigest(),null); + numCaptures = 1; + + resultRef = new UrlSearchResult(); + resultRef.setUrlKey(currentUrl); + resultRef.setOriginalUrl(originalUrl); + resultRef.setFirstCapture(firstCapture); + resultRef.setLastCapture(lastCapture); + resultRef.setNumCaptures(1); + resultRef.setNumVersions(1); + return resultRef; + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public UrlSearchResult 
adapt(CaptureSearchResult c) { + String urlKey = c.getUrlKey(); + if(resultRef == null || !currentUrl.equals(urlKey)) { + return makeUrlSearchResult(c); + } + + // same url -- accumulate into the last one we returned: + String captureDate = c.getCaptureTimestamp(); + if(captureDate.compareTo(firstCapture) < 0) { + firstCapture = captureDate; + resultRef.setFirstCapture(firstCapture); + } + if(captureDate.compareTo(lastCapture) > 0) { + lastCapture = captureDate; + resultRef.setLastCapture(lastCapture); + } + numCaptures++; + digests.put(c.getDigest(), null); + resultRef.setNumCaptures(numCaptures); + resultRef.setNumVersions(digests.size()); + return null; + } + public static CloseableIterator<UrlSearchResult> adaptCaptureIterator( + CloseableIterator<CaptureSearchResult> itr) { + + // HACKHACK: this is pretty lame. We return an UrlSearchResult the + // first time we see a new urlKey, and cache a reference to the returned + // UrlSearchResult, updating it as we see subsequent CaptureSearchResult + // objects with the same urlKey. + // This means that users of the returned UrlSearchResult need to wait + // until they've got the *next* returned UrlSearchResult before using + // the *previous* UrlSearchResult. + // At the moment, this all happens inside a LocalResourceIndex, so + // none of the UrlSearchResult objects should be seen/used in any + // significant way before they've all be accumulated into an + // UrlSearchResults object.. 
+ return new AdaptedIterator<CaptureSearchResult,UrlSearchResult>(itr, + new CaptureToUrlSearchResultAdapter()); + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java (from rev 2448, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -0,0 +1,63 @@ +package org.archive.wayback.resourceindex.adapters; + +import java.util.HashMap; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.Adapter; + +/** + * Adapter class that observes a stream of SearchResults tracking for each + * complete record, a mapping of that records digest to: + * Arc/Warc Filename + * Arc/Warc offset + * HTTP Response + * MIME-Type + * Redirect URL + * + * If subsequent SearchResults are missing these fields ("-") and the Digest + * field has been seen, then the subsequent SearchResults are updated with the + * values from the kept copy matching that digest, and an additional annotation + * field is added. 
+ * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class DeduplicationSearchResultAnnotationAdapter +implements Adapter<CaptureSearchResult,CaptureSearchResult> { + private final static String EMPTY_VALUE = "-"; + + private HashMap<String,CaptureSearchResult> memory = null; + + public DeduplicationSearchResultAnnotationAdapter() { + memory = new HashMap<String,CaptureSearchResult>(); + } + + private CaptureSearchResult annotate(CaptureSearchResult o) { + String thisDigest = o.getDigest(); + CaptureSearchResult last = memory.get(thisDigest); + if(last == null) { + // TODO: log missing record digest reference + return null; + } + o.setFile(last.getFile()); + o.setOffset(last.getOffset()); + o.setHttpCode(last.getHttpCode()); + o.setMimeType(last.getMimeType()); + o.setRedirectUrl(last.getRedirectUrl()); + o.flagDuplicateDigest(last.getCaptureTimestamp()); + return o; + } + + private CaptureSearchResult remember(CaptureSearchResult o) { + memory.put(o.getDigest(),o); + return o; + } + + public CaptureSearchResult adapt(CaptureSearchResult o) { + if(o.getFile().equals(EMPTY_VALUE)) { + return annotate(o); + } + return remember(o); + } +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-23 01:04:00
|
Revision: 2486 http://archive-access.svn.sourceforge.net/archive-access/?rev=2486&view=rev Author: bradtofel Date: 2008-07-23 01:04:09 +0000 (Wed, 23 Jul 2008) Log Message: ----------- package for various Adapter<*SearchResult> Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-23 00:42:39
|
Revision: 2485 http://archive-access.svn.sourceforge.net/archive-access/?rev=2485&view=rev Author: bradtofel Date: 2008-07-23 00:42:47 +0000 (Wed, 23 Jul 2008) Log Message: ----------- REFACTOR: moved WaybackRequest.fixup() out of all RequestParser implementations, and into AccessPoint. This allows the AccessPoint to set the reference to itself before fixup() allowing: FEATURE: Added get/setLocale() to AccessPoint, allowing explicit configuration of the Locale to use for that AccessPoint. If none is specified, then the Locale of the HttpServletRequest is used, as before. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/FormRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/OpenSearchRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java 2008-07-22 02:59:11 UTC (rev 2484) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java 2008-07-23 00:42:47 UTC (rev 2485) @@ -720,10 +720,14 @@ String.valueOf(httpRequest.getLocalPort())); putUnlessNull(REQUEST_WAYBACK_CONTEXT, httpRequest.getContextPath()); - Locale l = httpRequest.getLocale(); - ResourceBundle b = 
ResourceBundle.getBundle(UI_RESOURCE_BUNDLE_NAME, - httpRequest.getLocale()); - formatter = new StringFormatter(b,l); + Locale l = null; + if(accessPoint != null) { + l = accessPoint.getLocale(); + } + if(l == null) { + l = httpRequest.getLocale(); + } + setLocale(l); putUnlessNull(REQUEST_LOCALE_LANG,l.getDisplayLanguage()); Cookie[] cookies = httpRequest.getCookies(); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyRequestParser.java 2008-07-22 02:59:11 UTC (rev 2484) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/proxy/ProxyRequestParser.java 2008-07-23 00:42:47 UTC (rev 2485) @@ -74,7 +74,6 @@ String replayDateStr = BDBMap.getTimestampForId( httpRequest.getContextPath(), id); wbRequest.setReplayTimestamp(replayDateStr); - wbRequest.fixup(httpRequest); } return wbRequest; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/FormRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/FormRequestParser.java 2008-07-22 02:59:11 UTC (rev 2484) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/FormRequestParser.java 2008-07-23 00:42:47 UTC (rev 2485) @@ -102,10 +102,6 @@ } } } - if(wbRequest != null) { - wbRequest.fixup(httpRequest); - } - return wbRequest; } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/OpenSearchRequestParser.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/OpenSearchRequestParser.java 2008-07-22 02:59:11 UTC (rev 2484) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/OpenSearchRequestParser.java 2008-07-23 00:42:47 UTC (rev 2485) @@ -146,7 +146,6 @@ if(wbRequest.getEndTimestamp() == null) { wbRequest.setEndTimestamp(getLatestTimestamp()); } - wbRequest.fixup(httpRequest); return wbRequest; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java 2008-07-22 02:59:11 UTC (rev 2484) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java 2008-07-23 00:42:47 UTC (rev 2485) @@ -67,7 +67,6 @@ WaybackRequest wbRequest = parse(requestPath); if(wbRequest != null) { - wbRequest.fixup(httpRequest); wbRequest.setResultsPerPage(maxRecords); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2008-07-22 02:59:11 UTC (rev 2484) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2008-07-23 00:42:47 UTC (rev 2485) @@ -25,6 +25,7 @@ package org.archive.wayback.webapp; import java.io.IOException; +import java.util.Locale; import java.util.Properties; import java.util.logging.Logger; @@ -89,7 +90,17 @@ private Properties configs = null; private ExclusionFilterFactory exclusionFactory = null; private BooleanOperator<WaybackRequest> authentication 
= null; + private String urlRoot = null; + private Locale locale = null; + public Locale getLocale() { + return locale; + } + + public void setLocale(Locale locale) { + this.locale = locale; + } + /** * */ @@ -224,6 +235,9 @@ * Canonical server and port information. */ public String getAbsoluteLocalPrefix(HttpServletRequest httpRequest) { + if(urlRoot != null) { + return urlRoot; + } return getAbsoluteContextPrefix(httpRequest, useServerName); } @@ -236,7 +250,7 @@ WaybackRequest wbRequest = new WaybackRequest(); wbRequest.setContextPrefix(getAbsoluteLocalPrefix(httpRequest)); wbRequest.setAccessPoint(this); - + wbRequest.fixup(httpRequest); UIResults uiResults = new UIResults(wbRequest,uriConverter); try { uiResults.forward(httpRequest, httpResponse, translated); @@ -278,9 +292,10 @@ wbRequest = parser.parse(httpRequest, this); if(wbRequest != null) { + handled = true; wbRequest.setAccessPoint(this); - handled = true; wbRequest.setContextPrefix(getAbsoluteLocalPrefix(httpRequest)); + wbRequest.fixup(httpRequest); if(authentication != null) { if(!authentication.isTrue(wbRequest)) { throw new AuthenticationControlException("Not authorized"); @@ -485,4 +500,12 @@ public void setException(ExceptionRenderer exception) { this.exception = exception; } + + public String getUrlRoot() { + return urlRoot; + } + + public void setUrlRoot(String urlRoot) { + this.urlRoot = urlRoot; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-22 02:59:02
|
Revision: 2484 http://archive-access.svn.sourceforge.net/archive-access/?rev=2484&view=rev Author: bradtofel Date: 2008-07-22 02:59:11 +0000 (Tue, 22 Jul 2008) Log Message: ----------- MOVED: Redirect.jsp up a level to the /webapp/jsp/ directory, where it is externally accessible. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Redirect.jsp Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp Deleted: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp 2008-07-22 02:56:29 UTC (rev 2483) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp 2008-07-22 02:59:11 UTC (rev 2484) @@ -1,15 +0,0 @@ -<%@ page import="org.archive.wayback.util.bdb.BDBMap" %> - -<% - String url = request.getParameter("url"); - String time = request.getParameter("time"); - - // Put time-mapping for this id, or if no id, the ip-addr. - String id = request.getHeader("Proxy-Id"); - if(id == null) id = request.getRemoteAddr(); - BDBMap.addTimestampForId(request.getContextPath(),id, time); - - // Now redirect to the page the user wanted. - response.sendRedirect(url); -%> -anchored date! 
Copied: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Redirect.jsp (from rev 2465, trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Redirect.jsp) =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Redirect.jsp (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Redirect.jsp 2008-07-22 02:59:11 UTC (rev 2484) @@ -0,0 +1,15 @@ +<%@ page import="org.archive.wayback.util.bdb.BDBMap" %> + +<% + String url = request.getParameter("url"); + String time = request.getParameter("time"); + + // Put time-mapping for this id, or if no id, the ip-addr. + String id = request.getHeader("Proxy-Id"); + if(id == null) id = request.getRemoteAddr(); + BDBMap.addTimestampForId(request.getContextPath(),id, time); + + // Now redirect to the page the user wanted. + response.sendRedirect(url); +%> +anchored date! This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2483 http://archive-access.svn.sourceforge.net/archive-access/?rev=2483&view=rev Author: bradtofel Date: 2008-07-22 02:56:29 +0000 (Tue, 22 Jul 2008) Log Message: ----------- INITIAL-REV: ObjectFilter which converts legacy CaptureSearchResults objects into the new form, meaning fabricating the "Original URL" field from the URL key and the Original Host. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java 2008-07-22 02:56:29 UTC (rev 2483) @@ -0,0 +1,74 @@ +/* LegacyToIdentityFilter + * + * $Id$ + * + * Created on 11:48:56 AM Jul 10, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; + +/** + * CaptureSearchResult ObjectFilter which passes through all inputs, modifying + * each to construct a corrected original URL to comply with new Identity + * format. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> { + private final static String DEFAULT_SCHEME = "http://"; + + private int getEndOfHostIndex(String url) { + int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + int pathIdx = url.indexOf(UrlOperations.PATH_START); + if(portIdx == -1 && pathIdx == -1) { + return url.length(); + } + if(portIdx == -1) { + return pathIdx; + } + if(pathIdx == -1) { + return portIdx; + } + if(pathIdx > portIdx) { + return portIdx; + } else { + return pathIdx; + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) + */ + public int filterObject(CaptureSearchResult o) { + String urlKey = o.getUrlKey(); + StringBuilder sb = new StringBuilder(urlKey.length()); + sb.append(DEFAULT_SCHEME); + sb.append(o.getOriginalUrl()); + sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); + o.setOriginalUrl(sb.toString()); + return FILTER_INCLUDE; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-22 02:53:41
|
Revision: 2482 http://archive-access.svn.sourceforge.net/archive-access/?rev=2482&view=rev Author: bradtofel Date: 2008-07-22 02:53:50 +0000 (Tue, 22 Jul 2008) Log Message: ----------- INITIAL-REV: two custom ExceptionRenderer implementations which provide the ability to override specific standard .jsp handlers, via a list of special-case hosts, or by consulting an external Oracle. The AnnotationExceptionRenderer is highly experimental. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/AnnotationExceptionRenderer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/CustomNotInArchiveExceptionRenderer.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/AnnotationExceptionRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/AnnotationExceptionRenderer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/AnnotationExceptionRenderer.java 2008-07-22 02:53:50 UTC (rev 2482) @@ -0,0 +1,130 @@ +/* AnnotationExceptionRenderer + * + * $Id$ + * + * Created on 7:17:24 PM Jun 10, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.exception; + +import java.util.Date; + +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.accesscontrol.AccessControlClient; +import org.archive.accesscontrol.RuleOracleUnavailableException; +import org.archive.accesscontrol.model.Rule; +import org.archive.wayback.core.WaybackRequest; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class AnnotationExceptionRenderer extends BaseExceptionRenderer { + private AccessControlClient client = null; + private String oracleUrl = null; + private String who = null; + public void init() { + client = new AccessControlClient(oracleUrl); + } + public String getExceptionHandler(HttpServletRequest httpRequest, + HttpServletResponse httpResponse, WaybackRequest wbRequest, + WaybackException exception) { + // the "standard HTML" response handler: + String jspPath = getCustomHandler(exception,wbRequest); + if(jspPath == null) { + jspPath = super.getExceptionHandler(httpRequest, httpResponse, + wbRequest, exception); + } + return jspPath; + } + + private String getCustomHandler(WaybackException e, WaybackRequest wbRequest) { + String jspPath = null; + if((e instanceof ResourceNotInArchiveException) + && wbRequest.isReplayRequest()) { + String url = wbRequest.getRequestUrl(); + Date captureDate = wbRequest.getReplayDate(); + try { + Rule rule = client.getRule(url,captureDate,new Date(),who); + jspPath = ruleToJspPath(rule); + } catch (RuleOracleUnavailableException e1) { + e1.printStackTrace(); + } + } + return jspPath; + } + + private String ruleToJspPath(Rule rule) { + if(rule != null) { + String pc = rule.getPublicComment(); + if(pc.startsWith("/")) { + return pc; + } + } + return null; + } + /** + * 
@return the client + */ + public AccessControlClient getClient() { + return client; + } + + /** + * @param client the client to set + */ + public void setClient(AccessControlClient client) { + client.setRobotLookupsEnabled(false); + this.client = client; + } + + /** + * @return the oracleUrl + */ + public String getOracleUrl() { + return oracleUrl; + } + + /** + * @param oracleUrl the oracleUrl to set + */ + public void setOracleUrl(String oracleUrl) { + this.oracleUrl = oracleUrl; + setClient(new AccessControlClient(oracleUrl)); + } + + /** + * @return the who + */ + public String getWho() { + return who; + } + + /** + * @param who the who to set + */ + public void setWho(String who) { + this.who = who; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/CustomNotInArchiveExceptionRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/CustomNotInArchiveExceptionRenderer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/CustomNotInArchiveExceptionRenderer.java 2008-07-22 02:53:50 UTC (rev 2482) @@ -0,0 +1,95 @@ +/* CustomNotInArchiveExceptionRenderer + * + * $Id$ + * + * Created on 1:21:49 PM Jul 8, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.exception; + +import java.util.HashMap; +import java.util.List; + +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.util.url.UrlOperations; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class CustomNotInArchiveExceptionRenderer extends BaseExceptionRenderer { + private HashMap<String,Object> hosts = null; + private String jspHandler = null; + + + public String getExceptionHandler(HttpServletRequest httpRequest, + HttpServletResponse httpResponse, WaybackRequest wbRequest, + WaybackException exception) { + String jspPath = getCustomHandler(exception,wbRequest); + if(jspPath == null) { + jspPath = super.getExceptionHandler(httpRequest, httpResponse, + wbRequest, exception); + } + return jspPath; + } + + + /** + * @param exception + * @param wbRequest + * @return + */ + private String getCustomHandler(WaybackException exception, + WaybackRequest wbRequest) { + if((exception instanceof ResourceNotInArchiveException) + && wbRequest.isReplayRequest()) { + String url = wbRequest.getRequestUrl(); + String host = UrlOperations.urlToHost(url); + if(hosts.containsKey(host)) { + return jspHandler; + } + } + return null; + } + + + public String getJspHandler() { + return jspHandler; + } + + + public void setJspHandler(String jspHandler) { + this.jspHandler = jspHandler; + } + public List<String> getHosts() { + return null; + } + public void setHosts(List<String> hosts) { + this.hosts = new HashMap<String,Object>(); + for(String host : hosts) { + this.hosts.put(host, null); + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source 
development site. |
From: <bra...@us...> - 2008-07-22 02:48:06
|
Revision: 2481 http://archive-access.svn.sourceforge.net/archive-access/?rev=2481&view=rev Author: bradtofel Date: 2008-07-22 02:48:14 +0000 (Tue, 22 Jul 2008) Log Message: ----------- REFACTOR: moved all the cumbersome wiring Spring code for the various Replay modes into separate files, which are now imported into the main wayback.xml file. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2008-07-22 02:48:14 UTC (rev 2481) @@ -0,0 +1,105 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd"> + + <bean id="archivalurlhttpheaderprocessor" class="org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor" /> + + <bean id="archivaldateredirectingreplayrenderer" class="org.archive.wayback.replay.DateRedirectReplayRenderer" /> + <bean id="archivalcssreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlCSSReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + </bean> + <bean id="archivalasxreplayrenderer" 
class="org.archive.wayback.archivalurl.ArchivalUrlASXReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + </bean> + <bean id="archivaltransparentreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + </bean> + + <bean id="archivalserversidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ServerSideHTMLReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveComment.jsp</value> +<!-- + <value>/WEB-INF/replay/JSLessTimeline.jsp</value> +--> + </list> + </property> + </bean> + + <bean id="archivalclientsidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ClientSideHTMLReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveComment.jsp</value> + <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> + <value>/WEB-INF/replay/DebugBanner.jsp</value> +<!-- + <value>/WEB-INF/replay/Disclaimer.jsp</value> + <value>/WEB-INF/replay/Timeline.jsp</value> +--> + </list> + </property> + </bean> + + <bean id="archivalurlreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher"> + <property name="selectors"> + <list> + + <!-- REDIRECT IF NOT EXACT DATE --> + <bean class="org.archive.wayback.replay.selector.DateMismatchSelector"> + <property name="renderer" ref="archivaldateredirectingreplayrenderer"/> + </bean> + + <!-- HTML REPLAY --> + <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>text/html</value> + <value>application/xhtml</value> + </list> + </property> + <property name="renderer" ref="archivalclientsidehtmlreplayrenderer"/> + </bean> + + <!-- CSS REPLAY --> + <bean 
class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>text/css</value> + </list> + </property> + <property name="renderer" ref="archivalcssreplayrenderer"/> + </bean> + + <!-- ASX-MIME REPLAY --> + <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>video/x-ms-asf</value> + </list> + </property> + <property name="renderer" ref="archivalasxreplayrenderer"/> + </bean> + + <!-- ASX-PATH REPLAY --> + <bean class="org.archive.wayback.replay.selector.PathMatchSelector"> + <property name="pathContains"> + <list> + <value>.asx</value> + </list> + </property> + <property name="renderer" ref="archivalasxreplayrenderer"/> + </bean> + + <!-- DEFAULT-TRANSPARENT REPLAY --> + <bean class="org.archive.wayback.replay.selector.AlwaysMatchSelector"> + <property name="renderer" ref="archivaltransparentreplayrenderer"/> + </bean> + + </list> + </property> + </bean> +</beans> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml 2008-07-22 02:48:14 UTC (rev 2481) @@ -0,0 +1,54 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd"> + + <bean id="domainprefixhttpheaderprocessor" class="org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor" /> + + <bean id="domainprefixdateredirectingreplayrenderer" class="org.archive.wayback.replay.DateRedirectReplayRenderer" /> 
+ <bean id="domainprefixtransparentreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer"> + <constructor-arg><ref bean="domainprefixhttpheaderprocessor"/></constructor-arg> + </bean> + + <bean id="domainprefixtextreplayrenderer" class="org.archive.wayback.domainprefix.DomainPrefixTextReplayRenderer"> + <constructor-arg><ref bean="domainprefixhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveComment.jsp</value> + <value>/WEB-INF/replay/DebugBanner.jsp</value> +<!-- + <value>/WEB-INF/replay/JSLessTimeline.jsp</value> +--> + </list> + </property> + </bean> + + <bean id="domainprefixreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher"> + <property name="selectors"> + <list> + + <!-- REDIRECT IF NOT EXACT DATE --> + <bean class="org.archive.wayback.replay.selector.DateMismatchSelector"> + <property name="renderer" ref="domainprefixdateredirectingreplayrenderer"/> + </bean> + + <!-- HTML REPLAY --> + <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>text/html</value> + <value>application/xhtml</value> + </list> + </property> + <property name="renderer" ref="domainprefixtextreplayrenderer"/> + </bean> + <!-- DEFAULT-TRANSPARENT REPLAY --> + <bean class="org.archive.wayback.replay.selector.AlwaysMatchSelector"> + <property name="renderer" ref="domainprefixtransparentreplayrenderer"/> + </bean> + + </list> + </property> + </bean> +</beans> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml 2008-07-22 02:48:14 UTC (rev 2481) @@ -0,0 +1,48 @@ +<?xml version="1.0" 
encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd"> + + <bean id="identityhttpheaderprocessor" class="org.archive.wayback.replay.IdentityHttpHeaderProcessor" /> + + <bean id="proxytransparentreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer"> + <constructor-arg><ref bean="identityhttpheaderprocessor"/></constructor-arg> + </bean> + <bean id="proxymarkupreplayrenderer" class="org.archive.wayback.proxy.ProxyHTMLMarkupReplayRenderer"> + <constructor-arg><ref bean="identityhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveComment.jsp</value> + <value>/WEB-INF/replay/Disclaimer.jsp</value> +<!-- + <value>/replay/JSLessTimeline.jsp</value> +--> + </list> + </property> + </bean> + + <bean id="proxyreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher"> + <property name="selectors"> + <list> + + <!-- HTML REPLAY --> + <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>text/html</value> + <value>application/xhtml</value> + </list> + </property> + <property name="renderer" ref="proxymarkupreplayrenderer"/> + </bean> + + <!-- DEFAULT-TRANSPARENT REPLAY --> + <bean class="org.archive.wayback.replay.selector.AlwaysMatchSelector"> + <property name="renderer" ref="proxytransparentreplayrenderer"/> + </bean> + + </list> + </property> + </bean> +</beans> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-07-22 02:45:34 UTC (rev 2480) +++ 
trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-07-22 02:48:14 UTC (rev 2481) @@ -1,6 +1,8 @@ <?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd"> -<beans> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd"> <!-- The following 3 beans are required when using the ArcProxy for providing @@ -8,19 +10,6 @@ or directories. --> <!-- - <bean id="filelocationdb" class="org.archive.wayback.resourcestore.http.FileLocationDB" - init-method="init"> - <property name="bdbPath" value="/tmp/wayback/arc-db" /> - <property name="bdbName" value="DB1" /> - <property name="logPath" value="/tmp/wayback/arc-db.log" /> - </bean> - - <bean name="8080:arcproxy" class="org.archive.wayback.resourcestore.http.ArcProxyServlet"> - <property name="locationDB" ref="filelocationdb" /> - </bean> - <bean name="8080:locationdb" class="org.archive.wayback.resourcestore.http.FileLocationDBServlet"> - <property name="locationDB" ref="filelocationdb" /> - </bean> --> @@ -34,7 +23,6 @@ <property name="arcCacheDir"> <bean class="org.archive.wayback.liveweb.ARCCacheDirectory" init-method="init"> - <property name="arcDir" value="/tmp/wayback/liveweb/arcs/" /> <property name="arcPrefix" value="live" /> </bean> @@ -67,19 +55,21 @@ <property name="webCache" ref="livewebcache" /> </bean> --> + <!-- - The following bean is an example using the Access Control Oracle, thanks Alex Osborne. - Currently this is pretty undocumented, but here is a place to get started: + The following bean is an example using the Access Control Oracle, thanks + Alex Osborne and NLA. 
Currently this is pretty undocumented, but here is a + place to get started: http://webteam.archive.org/confluence/display/wayback/Exclusions+API --> -<!-- + <bean id="excluder-factory-oracle" class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> - <property name="oracleUrl" value="http://192.168.1.11:8080/oracle-0.0.1-SNAPSHOT/" /> + <property name="oracleUrl" value="http://localhost:8180/oracle/" /> <property name="accessGroup" value="ia_archiver" /> </bean> ---> +<!-- <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore"> @@ -125,8 +115,123 @@ </bean> </property> </bean> +--> + + + <!-- + <property name="annotater"> + <bean class="org.archive.wayback.resourceindex.filters.OracleAnnotationFilter"> + <property name="oracleUrl" value="http://localhost:8180/oracle/" /> + <property name="who" value="annotation" /> + </bean> + </property> +--> + + <bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.BDBResourceFileLocationDB" + init-method="init"> + <property name="bdbPath" value="/tmp/wayback/file-db" /> + <property name="bdbName" value="DB1" /> + <property name="logPath" value="/tmp/wayback/file-db.log" /> + </bean> + <bean name="8080:locationdb" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBServlet"> + <property name="locationDB" ref="resourcefilelocationdb" /> + </bean> + <bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet"> + <property name="locationDB" ref="resourcefilelocationdb" /> + </bean> + <bean id="indexqueue" class="org.archive.wayback.resourcestore.indexer.DirectoryIndexQueue"> + <property name="path" value="/tmp/wayback/indexer-queue" /> + </bean> + + <bean id="localbdbresourceindex" class="org.archive.wayback.resourceindex.LocalResourceIndex"> + <property name="source"> + <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex" + init-method="init"> + 
<property name="bdbName" value="DB1" /> + <property name="bdbPath" value="/tmp/wayback/index/" /> + </bean> + </property> + <property name="maxRecords" value="10000" /> + </bean> + + <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean id="localresourcestore" class="org.archive.wayback.resourcestore.LocalResourceFileResourceStore"> + <property name="db" ref="resourcefilelocationdb" /> + </bean> + </property> + + <property name="resourceIndex" ref="localbdbresourceindex"/> + + <property name="shutdownables"> + <list> + <!-- This thread notices new files appearing in your resourcefilesources --> + <bean id="resourcefilesourceupdater" class="org.archive.wayback.resourcestore.resourcefile.ResourceFileSourceUpdater" + init-method="init"> + <property name="target" value="/tmp/wayback/file-db-incoming" /> + <property name="interval" value="100000" /> + <property name="sources"> + <list> + <bean id="resourcefilesource" class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> + <property name="name" value="braddir1" /> + <property name="prefix" value="/tmp/wayback/files1/" /> + </bean> + </list> + </property> + </bean> + + <!-- This thread updates the location db with updates from resourcefilesourceupdater --> + <bean id="resourcefilelocationdbupdater" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBUpdater" + init-method="init"> + <property name="interval" value="100000" /> + <property name="db" ref="resourcefilelocationdb" /> + <property name="incomingDir" value="/tmp/wayback/file-db-incoming" /> + <property name="stateDir" value="/tmp/wayback/file-db-state" /> + </bean> + + <!-- This thread notices new files arriving in the filelocationdb, and queues them for indexing --> + <bean id="indexqueueupdater" class="org.archive.wayback.resourcestore.indexer.IndexQueueUpdater" + init-method="init"> + <property name="db" ref="resourcefilelocationdb" /> + 
<property name="queue" ref="indexqueue" /> + <property name="interval" value="1000" /> + <property name="lastMark" value="/tmp/wayback/index-queue.mark" /> + </bean> + + <!-- This thread checks the to-be-indexed queue for files needing indexing, indexes them, and hands off the results for merging with the ResourceIndex --> + <bean id="indexworker" class="org.archive.wayback.resourcestore.indexer.IndexWorker" + init-method="init"> + <property name="db" ref="resourcefilelocationdb" /> + <property name="queue" ref="indexqueue" /> + <property name="interval" value="1000" /> + <property name="target"> + <bean class="org.archive.wayback.resourceindex.updater.IndexClient"> + <property name="tmpDir" value="/tmp/wayback/index-data/tmp/" /> + <property name="target" value="/tmp/wayback/index-data/incoming/" /> + </bean> + </property> + </bean> + + <!-- This thread merges updates from the indexworker into the ResourceIndex --> + <bean class="org.archive.wayback.resourceindex.updater.LocalResourceIndexUpdater" + init-method="init"> + + <property name="index" ref="localbdbresourceindex" /> + <property name="incoming" value="/tmp/wayback/index-data/incoming/" /> + <property name="failed" value="/tmp/wayback/index-data/failed/" /> + <property name="merged" value="/tmp/wayback/index-data/merged/" /> + <property name="runInterval" value="10000" /> + </bean> + </list> + </property> + </bean> + + + +<!-- The following WaybackCollection bean template is required when using a manually built local CDX index. --> @@ -157,8 +262,8 @@ <!-- The following WaybackCollection bean template is required when using a remote ResourceIndex and ResourceStore implementation. 
This will also - required setting up an arcproxy and locationdb on the host specified by - the resourceStore:urlPrefix configuration, and an addition AccessPoint + require setting up an arcproxy and locationdb on the host specified by + the resourceStore:urlPrefix configuration, and an additional AccessPoint on the host specified by the resourceIndex:searchUrlBase configuration. --> <!-- @@ -176,8 +281,6 @@ <property name="searchUrlBase" value="http://indexhost:8080/index/xmlquery" /> </bean> </property> - - </bean> --> @@ -188,23 +291,28 @@ installation. You may also need to ensure that the maxRecords on your RequestParser is not greater than the maxRecords configured on the RemoteNutchResourceIndex. --> -<!-- + <bean id="remotenutchcollection" class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore"> <bean class="org.archive.wayback.resourcestore.Http11ResourceStore"> - <property name="urlPrefix" value="http://localhost:8080/arcproxy/" /> +<!-- + <property name="urlPrefix" value="http://crawling11.us.archive.org/arcproxy/" /> +--> + <property name="urlPrefix" value="http://webapp100.us.archive.org/arcproxy/" /> </bean> </property> <property name="resourceIndex"> <bean class="org.archive.wayback.resourceindex.NutchResourceIndex" init-method="init"> +<!-- <property name="searchUrlBase" value="http://webteam-ws.us.archive.org:8080/katrina/opensearch" /> + --> + <property name="searchUrlBase" value="http://192.168.1.208:9090/nutch-1.0-dev/opensearch" /> <property name="maxRecords" value="100" /> </bean> </property> </bean> ---> <!-- This is the only AccessPoint defined by default within this wayback.xml @@ -216,51 +324,32 @@ running Tomcat. To provide external access, replace "localhost" with your fully qualified hostname of the computer running Tomcat. 
--> + <import resource="ArchivalUrlReplay.xml"/> <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> - + <!-- + <property name="exclusionFactory" ref="excluder-factory-oracle" /> + --> <property name="collection" ref="localbdbcollection" /> + <property name="configs"> + <props> + <prop key="inst">foo</prop> + <prop key="coll">supreme court</prop> + </props> + </property> <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost:8080/wayback/" /> + <property name="replayURIPrefix" value="http://localhost:8080/wayback/"/> </bean> </property> <property name="query"> <bean class="org.archive.wayback.query.Renderer"> - <property name="captureJsp" value="/jsp/HTMLResults.jsp" /> + <property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" /> </bean> </property> - <property name="replay"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlReplayDispatcher"> - <property name="serverSideRendering" value="false" /> - <property name="jspInserts"> - <list> - <value>/replay/ArchiveComment.jsp</value> - <value>/replay/ClientSideJSInsert.jsp</value> -<!-- - The following 2 .jsp include values will produce in-page elements within - replayed HTML pages. Both require client-side Javascript. ---> -<!-- - <value>/replay/Disclaimer.jsp</value> - <value>/replay/Timeline.jsp</value> ---> -<!-- - The following .jsp include value will produce a timeline within *all* replayed - pages, including all subframes within a frameset, but requires no client side - Javascript. 
It is intended for use in deployments which use: - - serverSideRendering=true ---> -<!-- - <value>/replay/JSLessTimeline.jsp</value> ---> - </list> - </property> - </bean> - </property> + <property name="replay" ref="archivalurlreplay" /> <property name="parser"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" @@ -271,20 +360,68 @@ </property> </bean> + <bean name="8080:rwayback" parent="8080:wayback"> + <property name="parser"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" + init-method="init"> + <property name="maxRecords" value="100" /> + <property name="earliestTimestamp" value="1996" /> + </bean> + </property> + <property name="exception"> + <bean class="org.archive.wayback.exception.CustomNotInArchiveExceptionRenderer"> + <property name="hosts"> + <list> + <value>www.aladems.org</value> + </list> + </property> + <property name="jspHandler" value="/exception/GrayBlank.jsp"/> + </bean> + </property> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost:8080/rwayback/" /> + </bean> + </property> + <property name="collection" ref="remotenutchcollection"> + <!-- + <bean class="org.archive.wayback.webapp.WaybackCollection"> + <property name="resourceStore" ref="fancyresourcestore" /> + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex" + init-method="init"> + <property name="searchUrlBase" value="http://localhost:8080/wayback/xmlquery" /> + </bean> + </property> + </bean> + --> + </property> + </bean> + <!-- The following AccessPoint inherits all configuration from the 8080:wayback AccessPoint, but only allows access from the specified IP network. 
--> <!-- <bean name="8080:netsecure" parent="8080:wayback"> + <property name="authentication"> - <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> - <property name="allowedRanges"> - <list> - <value>192.168.1.16/24</value> - </list> - </property> - </bean> + <bean class="org.archive.wayback.authenticationcontrol.AccessControlSettingOperation"> + <property name="operator"> + <bean class="org.archive.wayback.util.operator.NotBooleanOperator"> + <property name="operand"> + <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> + <property name="allowedRanges"> + <list> + <value>192.168.1.16/24</value> + </list> + </property> + </bean> + </property> + </bean> + </property> + <property name="factory" ref="excluder-factory-robot"/> + </bean> </property> <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> @@ -293,7 +430,6 @@ </property> </bean> --> - <!-- The following AccessPoint inherits all configuration from the 8080:wayback AccessPoint, but checks live web robots.txt documents to determine if @@ -313,7 +449,24 @@ </bean> --> +<import resource="DomainPrefixReplay.xml"/> +<bean name="8081" parent="8080:wayback"> + <property name="useServerName" value="true" /> + <property name="replay" ref="domainprefixreplay" /> + <property name="uriConverter"> + <bean class="org.archive.wayback.domainprefix.DomainPrefixResultURIConverter"> + <property name="hostPort" value="localhost.archive.org:8081" /> + </bean> + </property> + <property name="parser"> + <bean class="org.archive.wayback.domainprefix.DomainPrefixCompositeRequestParser" init-method="init"> + <property name="hostPort" value="localhost.archive.org:8081" /> + <property name="maxRecords" value="1000" /> + </bean> + </property> +</bean> + <!-- The following AccessPoint inherits all configuration from the 8080:wayback AccessPoint, but provides a Proxy Replay UI to the same collection. 
These @@ -323,27 +476,24 @@ Note: using this AccessPoint requires adding a "Connector" on port 8090 in your Tomcat's server.xml file. --> -<!-- + <import resource="ProxyReplay.xml"/> <bean name="8090" parent="8080:wayback"> <property name="useServerName" value="true" /> - <property name="replay"> - <bean class="org.archive.wayback.proxy.ProxyReplayDispatcher" /> - </property> + <property name="replay" ref="proxyreplay" /> <property name="uriConverter"> <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> - <property name="redirectURI" value="http://foo.archive.org:8090/jsp/Redirect.jsp" /> + <property name="redirectURI" value="http://brad.archive.org/jsp/Redirect.jsp" /> </bean> </property> <property name="parser"> <bean class="org.archive.wayback.proxy.ProxyRequestParser" init-method="init"> <property name="localhostNames"> <list> - <value>foo.archive.org</value> + <value>brad.archive.org</value> </list> </property> <property name="maxRecords" value="1000" /> </bean> </property> </bean> ---> </beans> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |