You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2010-09-28 21:17:55
|
Revision: 3255 http://archive-access.svn.sourceforge.net/archive-access/?rev=3255&view=rev Author: bradtofel Date: 2010-09-28 21:17:49 +0000 (Tue, 28 Sep 2010) Log Message: ----------- Now only accepts GET & HEAD Dates are in GMT Changes format and content of Link headers Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp 2010-09-28 21:07:22 UTC (rev 3254) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp 2010-09-28 21:17:49 UTC (rev 3255) @@ -11,75 +11,153 @@ <%@ page import="java.util.List" %> <%@ page import="java.util.ArrayList" %> <%@ page import="org.archive.wayback.ResultURIConverter" %> +<%@ page import="java.util.TimeZone" %> +<%@ page import="java.io.PrintWriter"%> <%@ page import="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter" %> <% + //timegate implementation + String method = request.getMethod(); + // may be there is better place to put this piece of code? 
+ if( !(method.equals("GET")||method.equals("HEAD")) ) { + response.setStatus(405); + response.setHeader("Allow", "GET, HEAD"); + return; + } + response.setHeader("Vary", "negotiate,accept-datetime"); SimpleDateFormat httpformatterl = new SimpleDateFormat( "E, dd MMM yyyy HH:mm:ss z"); + TimeZone tzo = TimeZone.getTimeZone("GMT"); + httpformatterl.setTimeZone(tzo); + SimpleDateFormat formatterk = new SimpleDateFormat("yyyyMMddHHmmss"); + formatterk.setTimeZone(tzo); Date now = new Date(); UIResults results = UIResults.extractCaptureQuery(request);//nuzno potom perepisat' WaybackRequest wbRequest = results.getWbRequest(); - //String p_url = wbRequest.getContextPrefix(); String u = wbRequest.getRequestUrl(); - // String agguri = p_url.replace("memento","ore") +"timebundle/" + u; - // String ad = wbRequest.getStartTimestamp(); - // Date sdate = wbRequest.getStartDate(); - //Date pdate = wbRequest.getAnchorDate(); + String dtdate = wbRequest.get("dtconneg"); - Date dt = now; - if (dtdate != null) { - dt = httpformatterl.parse(dtdate); - } CaptureSearchResults cResults = results.getCaptureResults(); CaptureSearchResult res = cResults.getClosest(wbRequest, true); Date closestDate = res.getCaptureDate(); - //String url = res.getRedirectUrl(); + String agguri = results.getContextConfig("aggregationPrefix") + "timebundle/" + u; String timemap = " , <" + results.getContextConfig("aggregationPrefix") + "timemap/link/" + u + ">;rel=\"timemap\"; type=\"text/csv\""; + String origlink = ", <" + u + ">;rel=\"original\""; ArchivalUrlResultURIConverter uriconverter = (ArchivalUrlResultURIConverter) results .getURIConverter(); String uriPrefix = uriconverter.getReplayURIPrefix(); String replayUrl = results.resultToReplayUrl(res); - //alternates header - String qvalue = "1.0"; //just example + StringBuffer sb = new StringBuffer(); - // sb.append("{"); - //sb.append("\"" + u +"\" "+qvalue); - //sb.append(" {dt original}},"); - //calculate X-Archive-Interval - + + String memento = ",<" 
+ replayUrl + + ">;rel=\"memento\";datetime=\"" + + httpformatterl.format(closestDate) + "\""; StringFormatter fmt = results.getWbRequest().getFormatter(); Date f = cResults.getFirstResultDate(); Date l = cResults.getLastResultDate(); - SimpleDateFormat formatterk = new SimpleDateFormat("yyyyMMddHHmmss"); - //sb.append("{\"" + uriPrefix +formatterk.format(f)+"/" +u +"\" " +qvalue); - //sb.append(" {dt " + "\""+httpformatterl.format(f) +"\" first}}"); - sb.append(", <" + uriPrefix + formatterk.format(f) + "/" + u + + String mfl = null; + if ( (closestDate.equals(f)) && closestDate.equals(l)) { + + mfl = ", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento memento last-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + + } else if (closestDate.equals(f)){ + + mfl = ", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento memento\"; datetime=\"" + + httpformatterl.format(f) + "\"" + + ", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento\"; datetime=\"" + + httpformatterl.format(l) + "\""; + + } else if (closestDate.equals(l)) { + + mfl = ", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento memento\"; datetime=\"" + + httpformatterl.format(l) + "\"" + + ", <" + uriPrefix + formatterk.format(f) + "/" + u + ">;rel=\"first-memento\"; datetime=\"" - + httpformatterl.format(f) + "\""); - if (!f.equals(l)) { + + httpformatterl.format(f) + "\""; - // sb.append(","); - // sb.append("{\"" + uriPrefix +formatterk.format(l)+"/" +u +"\" " +qvalue); - // sb.append(" {dt " + "\""+httpformatterl.format(l) +"\" last}}"); - sb.append(", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" - + httpformatterl.format(f) + "\""); + } else { + + mfl = memento + + ", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento\"; datetime=\"" + + httpformatterl.format(l) + "\"" + + ", <" + uriPrefix + formatterk.format(f) + "/" + u + + 
">;rel=\"first-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; } - - //response.setHeader("X-Archive-Interval","{"+httpformatterl.format(f)+"} - {"+httpformatterl.format(l)+"}"); - + + sb = new StringBuffer(mfl); + + if (dtdate==null) dtdate=""; + + + //special handling date unparsable case + if (dtdate.equals("unparsable")) { + String fl= null; + if (f.equals(l)) { + fl=", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"last-memento first-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + + } else { + fl = ", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento\"; datetime=\"" + + httpformatterl.format(l) + "\""; + fl =fl +", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + } + response.setHeader("TCN", "list"); + response.setStatus(400); + response.setHeader("Link", "<" + agguri + + ">;rel=\"timebundle\"" + origlink + fl + + timemap); + + + + StringBuffer sberr = new StringBuffer(); + sberr.append("<html><head><title>400 Bad Request</title></head><body>" ); + sberr.append("<center><table width='800px'><tr><td><div style='background-color: #e0e0e0; padding: 10px;'><br/>"); + sberr.append("<center><b>Error: 400</b><center>" ); + sberr.append("<center><p>Bad Date Request.</p>" ); + sberr.append("However, we found archived resources available in the following time-range: " ); + sberr.append("<i><blockquote><ul> " ); + + sberr.append("<li>Very first available Memento "+ " at "+ uriPrefix + formatterk.format(f) + "/" + u +"</BR>\n" ); + sberr.append("<li>Most recent available Memento " + " at " + uriPrefix + formatterk.format(f) + "/" + u +"</BR>\n" ); + + + sberr.append("</ul> </blockquote></i>" ); + sberr.append("<br/></div></td></tr>"); + sberr.append("</table>"); + sberr.append("</body></html>"); + PrintWriter pw = response.getWriter(); + response.setContentType("text/html"); + pw.print(sberr.toString()); + pw.flush(); + 
pw.close(); + return; + } + + // calculate closest values for alternates - CaptureSearchResult closestleft = null; CaptureSearchResult closestright = null; long rclosestDistance = 0; @@ -88,16 +166,14 @@ String anchorDate = null; long maxWindow = -1; - //long wantTime = wbRequest.getReplayDate().getTime(); long wantTime = closestDate.getTime(); - + Iterator<CaptureSearchResult> itr = cResults.iterator(); while (itr.hasNext()) { cur = itr.next(); cur.getCaptureDate(); - //long curDistance = Math.abs(wantTime - cur.getCaptureDate().getTime()); - long curDistance = wantTime - cur.getCaptureDate().getTime(); - // == 0 propuskaem + long curDistance = cur.getCaptureDate().getTime()-wantTime; + // == 0 skip if (curDistance > 0) { if ((closestright == null) || (Math.abs(curDistance) < Math @@ -118,75 +194,38 @@ } - if ((dt.before(f)) || dt.after(now)) { - //if ((pdate.before(f))||pdate.after(now)) { - response.setHeader("TCN", "list"); - response.setStatus(406); - // response.setHeader("Link","<"+agguri+">;rel=\"aggregation\""); - // sb.append("}"); - // response.setHeader("Alternates",sb.toString()); - } else { - // SimpleDateFormat formatterk = new SimpleDateFormat("yyyyMMddHHmmss"); - - // StringBuffer sb = new StringBuffer(); - - // List list = new ArrayList(); - if (closestleft != null) { - if (!closestleft.getCaptureDate().equals(f)) { - // sb.append(","); - // sb.append("{\"" + uriPrefix +formatterk.format(closestleft.getCaptureDate())+"/" +u +"\" "+qvalue); - // sb.append(" {dt " +"\""+httpformatterl.format(closestleft.getCaptureDate()) +"\" prev} {type " + closestleft.getMimeType() +"}}"); - sb.append(", <" + uriPrefix + formatterk.format(f) - + "/" + u - + ">;rel=\"prev-memento\"; datetime=\"" - + httpformatterl.format(f) + "\""); - // list.add(closestleft); - } + + + if (closestleft != null) { + if (!(closestleft.getCaptureDate().equals(f))) { + sb.append(", <" + uriPrefix + formatterk.format(closestleft.getCaptureDate()) + + "/" + u + + ">;rel=\"prev-memento\"; 
datetime=\"" + + httpformatterl.format(closestleft.getCaptureDate()) + "\""); + } else { + int m_index = sb.lastIndexOf("\"first-memento\""); + sb.insert(m_index + 1, "prev-memento "); } - if (closestright != null) { - if (!closestright.getCaptureDate().equals(l)) { - // sb.append(","); - // sb.append("{\"" + uriPrefix +formatterk.format(closestright.getCaptureDate())+"/" +u +"\" " +qvalue); - //sb.append(" {dt " +"\""+httpformatterl.format(closestright.getCaptureDate()) +"\" next} {type " + closestright.getMimeType() +"}}"); - sb.append(", <" + uriPrefix + formatterk.format(f) - + "/" + u - + ">;rel=\"next-memento\"; datetime=\"" - + httpformatterl.format(f) + "\""); - } - - // list.add(closestright); + } + if (closestright != null) { + if (!(closestright.getCaptureDate().equals(l))) { + sb.append(", <" + uriPrefix + formatterk.format(closestright.getCaptureDate()) + + "/" + u + + ">;rel=\"next-memento\"; datetime=\"" + + httpformatterl.format(closestright.getCaptureDate()) + "\""); + } else { + int m_index = sb.lastIndexOf("\"last-memento\""); + sb.insert(m_index + 1, "next-memento "); } + } + + + response.setHeader("Link", "<" + agguri + + ">;rel=\"timebundle\"" + origlink + sb.toString() + + timemap); //added timemap + + response.setHeader("TCN", "choice"); + response.setHeader("Location", replayUrl); + response.sendError(302, "Found"); - // Iterator it = list.iterator(); - //int count=0; - //while (it.hasNext()) { - - // count++; - //CaptureSearchResult alt = (CaptureSearchResult) it.next(); - - // sb.append("{"); - //sb.append("\"" + uriPrefix +formatterk.format(alt.getCaptureDate())+"/" +u +"\" "); - //sb.append("{dt " + httpformatterl.format(alt.getCaptureDate()) +"} {type " + alt.getMimeType() +"}"); - - //sb.append("}"); - //if (count!=list.size()) { - // sb.append(","); } - - //} - - // sb.append("}"); - String origlink = ", <" + u + ">;rel=\"original\""; - String memento = ",<" + replayUrl - + ">;rel=\"memento\";datetime=\"" - + 
httpformatterl.format(closestDate) + "\""; - response.setHeader("Link", "<" + agguri - + ">;rel=\"timebundle\"" + origlink + sb.toString() - + memento + timemap); //added timemap - // response.setHeader("Alternates",sb.toString()); - response.setHeader("TCN", "choice"); - response.setHeader("Location", replayUrl); - // response.setStatus(302,"Found"); //does'not work - response.sendError(302, "Found"); - - } %> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-28 21:07:28
|
Revision: 3254 http://archive-access.svn.sourceforge.net/archive-access/?rev=3254&view=rev Author: bradtofel Date: 2010-09-28 21:07:22 +0000 (Tue, 28 Sep 2010) Log Message: ----------- Dates now in GMT Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-09-28 21:06:39 UTC (rev 3253) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-09-28 21:07:22 UTC (rev 3254) @@ -24,12 +24,16 @@ <%@ page import="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter" %> <%@page import="org.dspace.foresite.Triple"%> <%@page import="org.dspace.foresite.jena.TripleJena"%> -<%@page import="java.util.UUID"%> +<%@page import="java.util.UUID"%> +<%@ page import="java.util.TimeZone" %> <%@page import="java.util.Calendar"%> <% UIResults results = UIResults.extractCaptureQuery(request);//nuzno potom perepisat' SimpleDateFormat httpformatterl = new SimpleDateFormat( "E, dd MMM yyyy HH:mm:ss z"); + TimeZone tzo = TimeZone.getTimeZone("GMT"); + httpformatterl.setTimeZone(tzo); + WaybackRequest wbRequest = results.getWbRequest(); CaptureSearchResults cResults = results.getCaptureResults(); CaptureSearchResult res = cResults.getClosest(wbRequest, true); @@ -77,7 +81,7 @@ Iterator<CaptureSearchResult> itr = cResults.iterator(); SimpleDateFormat formatterk = new SimpleDateFormat( "yyyyMMddHHmmss"); - + formatterk.setTimeZone(tzo); Date f = cResults.getFirstResultDate(); Date l = cResults.getLastResultDate(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-28 21:06:58
|
Revision: 3253 http://archive-access.svn.sourceforge.net/archive-access/?rev=3253&view=rev Author: bradtofel Date: 2010-09-28 21:06:39 +0000 (Tue, 28 Sep 2010) Log Message: ----------- Dates now in GMT, HTTP header changed from "Content-datetime" to "Memento-datetime", also changed Link header format and contents.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/MementoValidity.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/MementoValidity.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/MementoValidity.jsp 2010-09-28 21:02:16 UTC (rev 3252) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/MementoValidity.jsp 2010-09-28 21:06:39 UTC (rev 3253) @@ -5,6 +5,7 @@ <%@ page import="org.archive.wayback.core.WaybackRequest" %> <%@ page import="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter" %> <%@ page import="java.text.SimpleDateFormat" %> +<%@ page import="java.util.TimeZone" %> <%@ page import="java.util.Iterator" %> <% UIResults results = UIResults.extractCaptureQuery(request); @@ -14,6 +15,13 @@ String u = wbRequest.getRequestUrl(); SimpleDateFormat httpformatterl = new SimpleDateFormat( "E, dd MMM yyyy HH:mm:ss z"); + TimeZone tzo = TimeZone.getTimeZone("GMT"); + httpformatterl.setTimeZone(tzo); + SimpleDateFormat formatterk = new SimpleDateFormat("yyyyMMddHHmmss"); + formatterk.setTimeZone(tzo); + + + Date closestDate = res.getCaptureDate(); String agguri = results.getContextConfig("aggregationPrefix") + "timebundle/" + u; String timemap = " , <" @@ -27,23 +35,51 @@ Date f = cResults.getFirstResultDate(); Date l = cResults.getLastResultDate(); - String qvalue = "1.0"; //just example + StringBuffer sb = new StringBuffer(); - response.setHeader("Content-Datetime", httpformatterl.format(res 
+ + response.setHeader("Memento-Datetime", httpformatterl.format(res .getCaptureDate())); - SimpleDateFormat formatterk = new SimpleDateFormat("yyyyMMddHHmmss"); - sb.append(", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento\"; datetime=\"" - + httpformatterl.format(f) + "\""); - if (!f.equals(l)) { - - sb.append(", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" - + httpformatterl.format(f) + "\""); - } - - // calculate closest values for alternates - Date closestDate = res.getCaptureDate(); + + String memento = ",<" + uriPrefix + formatterk.format(closestDate) + "/" + u + + ">;rel=\"memento\";datetime=\"" + + httpformatterl.format(closestDate) + "\""; + String mfl = null; + if ( (closestDate.equals(f)) && closestDate.equals(l)) { + mfl = ", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento memento last-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + } + else if (closestDate.equals(f)){ + mfl=", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento memento\"; datetime=\"" + + httpformatterl.format(f) + "\"" ; + mfl = mfl+", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento\"; datetime=\"" + + httpformatterl.format(l) + "\""; + + } + else if (closestDate.equals(l)) { + mfl=", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento memento\"; datetime=\"" + + httpformatterl.format(l) + "\""; + mfl = mfl +", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + } + else { + mfl = memento ; + + mfl = mfl+", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento\"; datetime=\"" + + httpformatterl.format(l) + "\""; + mfl =mfl +", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + } + + sb = new StringBuffer(mfl); + 
CaptureSearchResult closestleft = null; CaptureSearchResult closestright = null; long rclosestDistance = 0; @@ -53,13 +89,13 @@ long maxWindow = -1; long wantTime = closestDate.getTime(); - + Iterator<CaptureSearchResult> itr = cResults.iterator(); while (itr.hasNext()) { cur = itr.next(); cur.getCaptureDate(); - long curDistance = wantTime - cur.getCaptureDate().getTime(); - // == 0 propuskaem + long curDistance = cur.getCaptureDate().getTime()-wantTime; + // == 0 skip if (curDistance > 0) { if ((closestright == null) || (Math.abs(curDistance) < Math @@ -81,18 +117,29 @@ } if (closestleft != null) { - if (!closestleft.getCaptureDate().equals(f)) { - sb.append(", <" + uriPrefix + formatterk.format(f) + "/" + if (!(closestleft.getCaptureDate().equals(f))) { + + sb.append(", <" + uriPrefix + formatterk.format(closestleft.getCaptureDate()) + "/" + u + ">;rel=\"prev-memento\"; datetime=\"" - + httpformatterl.format(f) + "\""); + + httpformatterl.format(closestleft.getCaptureDate()) + "\""); } + else { + int m_index = sb.lastIndexOf("\"first-memento\""); + sb.insert(m_index + 1, "prev-memento "); + + } } if (closestright != null) { - if (!closestright.getCaptureDate().equals(l)) { - sb.append(", <" + uriPrefix + formatterk.format(f) + "/" + if (!(closestright.getCaptureDate().equals(l))) { + sb.append(", <" + uriPrefix + formatterk.format(closestright.getCaptureDate()) + "/" + u + ">;rel=\"next-memento\"; datetime=\"" - + httpformatterl.format(f) + "\""); + + httpformatterl.format(closestright.getCaptureDate()) + "\""); } + else { + int m_index = sb.lastIndexOf("\"last-memento\""); + sb.insert(m_index + 1, "next-memento "); + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-28 21:02:22
|
Revision: 3252 http://archive-access.svn.sourceforge.net/archive-access/?rev=3252&view=rev Author: bradtofel Date: 2010-09-28 21:02:16 +0000 (Tue, 28 Sep 2010) Log Message: ----------- Adding special exception for Timegate errors Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/memento.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/memento.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/memento.xml 2010-09-07 22:39:44 UTC (rev 3251) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/memento.xml 2010-09-28 21:02:16 UTC (rev 3252) @@ -133,6 +133,11 @@ <property name="earliestTimestamp" value="1996" /> </bean> </property> + <property name="exception"> + <bean class="org.archive.wayback.exception.BaseExceptionRenderer"> + <property name="errorJsp" value="/WEB-INF/exception/TimegateError.jsp" /> + </bean> + </property> </bean> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-07 22:39:51
|
Revision: 3251 http://archive-access.svn.sourceforge.net/archive-access/?rev=3251&view=rev Author: bradtofel Date: 2010-09-07 22:39:44 +0000 (Tue, 07 Sep 2010) Log Message: ----------- TWEAK: fixed &laquo; and &raquo; HTML entity references... Somehow they had an extra 'S'. Also changed some to the &#NNN; format, as the Toolbar.jsp is upper casing some of the results. &LAQUO; is NOT the same thing as &laquo;... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties 2010-09-07 22:17:11 UTC (rev 3250) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties 2010-09-07 22:39:44 UTC (rev 3251) @@ -143,15 +143,15 @@ GraphTimeline.searchButtonText=Go graph.title=Jump to first record in {0,date,yyyy} : ({0,date,MMM d}) -graph.prevYear= ‹‹‹‹Year -graph.prevMonth=‹‹‹Month -graph.prevDay=‹‹Day -graph.prevCapture=‹Prev +graph.prevYear= ‹«««Year +graph.prevMonth=«««Month +graph.prevDay=««Day +graph.prevCapture=«Prev graph.current=See All Versions -graph.nextCapture=Next› -graph.nextDay=Day›› -graph.nextMonth=Month››› -graph.nextYear=Year›››› +graph.nextCapture=Next» +graph.nextDay=Day»» +graph.nextMonth=Month»»» +graph.nextYear=Year»»»» ToolBar.closeTitle=Close the toolbar @@ -172,25 +172,25 @@ ToolBar.noPrevMonthText={0,date,MMM} ToolBar.prevMonthTitle={0,date,d MMM yyyy} -ToolBar.prevMonthText=‹{0,date,MMM} +ToolBar.prevMonthText=«{0,date,MMM} ToolBar.curMonthTitle=You are here: {0,date,H:mm:ss MMM d, yyyy} ToolBar.curMonthText={0,date,MMM} ToolBar.noNextMonthText={0,date,MMM} ToolBar.nextMonthTitle={0,date,d MMM yyyy} -ToolBar.nextMonthText={0,date,MMM} › 
+ToolBar.nextMonthText={0,date,MMM} » ToolBar.noPrevYearText={0,date,yyyy} ToolBar.prevYearTitle={0,date,d MMM yyyy} -ToolBar.prevYearText=‹{0,date,yyyy} +ToolBar.prevYearText=«{0,date,yyyy} ToolBar.curYearTitle=You are here: {0,date,H:mm:ss MMM d, yyyy} ToolBar.curYearText={0,date,yyyy} ToolBar.noNextYearText={0,date,yyyy} ToolBar.nextYearTitle={0,date,d MMM yyyy} -ToolBar.nextYearText={0,date,yyyy} › +ToolBar.nextYearText={0,date,yyyy} » -3 April 2010 +#3 April 2010 # 20 May 2010 14:09:56 PartitionSize.dateHeader.yearGraphLabel={0,date,yyyy} PartitionSize.dateHeader.monthGraphLabel={0,date,MMM} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-07 22:17:17
|
Revision: 3250 http://archive-access.svn.sourceforge.net/archive-access/?rev=3250&view=rev Author: bradtofel Date: 2010-09-07 22:17:11 +0000 (Tue, 07 Sep 2010) Log Message: ----------- TWEAK: setting default pool wait to 300 MS. Heritrix dropped use of the commons-pool, and in the new code this is no longer a timeout, but a default wait before even checking if more pool members are needed. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2010-09-07 22:14:41 UTC (rev 3249) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2010-09-07 22:17:11 UTC (rev 3250) @@ -55,7 +55,7 @@ ARCCacheDirectory.class.getName()); private int poolWriters = 5; - private int maxPoolWait = 5 * 1000; + private int maxPoolWait = 300; private long maxARCSize = ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE; private String arcPrefix = "wayback-live"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-07 22:14:48
|
Revision: 3249 http://archive-access.svn.sourceforge.net/archive-access/?rev=3249&view=rev Author: bradtofel Date: 2010-09-07 22:14:41 +0000 (Tue, 07 Sep 2010) Log Message: ----------- BUGFIX(unreported) last checkin left in "debug" code which dumped original content to STDOUT... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2010-09-07 22:12:17 UTC (rev 3248) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2010-09-07 22:14:41 UTC (rev 3249) @@ -138,8 +138,7 @@ getMethod.setRequestHeader("User-Agent", userAgent); int code = client.executeMethod(getMethod); LOGGER.info("URL(" + url + ") HTTP:" + code); -// ByteOp.discardStream(getMethod.getResponseBodyAsStream()); - ByteOp.copyStream(getMethod.getResponseBodyAsStream(), System.out); + ByteOp.discardStream(getMethod.getResponseBodyAsStream()); getMethod.releaseConnection(); gotUrl = true; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3248 http://archive-access.svn.sourceforge.net/archive-access/?rev=3248&view=rev Author: bradtofel Date: 2010-09-07 22:12:17 +0000 (Tue, 07 Sep 2010) Log Message: ----------- BUGFIX(unreported) added deprecated BGSOUND html HEAD tag. Back in the bad-old-days, this was used in IE to play a sound when browsers loaded a page. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2010-09-03 23:19:28 UTC (rev 3247) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2010-09-07 22:12:17 UTC (rev 3248) @@ -67,7 +67,7 @@ private final String[] okHeadTags = { "![CDATA[*", "![CDATA[", "?", "!DOCTYPE", "HTML", "HEAD", "BASE", "LINK", "META", "TITLE", - "STYLE", "SCRIPT" }; + "STYLE", "SCRIPT" , "BGSOUND"}; private HashMap<String, Object> okHeadTagMap = null; private final static String FRAMESET_TAG = "FRAMESET"; private final static String BODY_TAG = "BODY"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3247 http://archive-access.svn.sourceforge.net/archive-access/?rev=3247&view=rev Author: bradtofel Date: 2010-09-03 23:19:28 +0000 (Fri, 03 Sep 2010) Log Message: ----------- FEATURE: explicitly allowing download of /robots.txt paths, without consulting the robots.txt file. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-09-03 22:32:51 UTC (rev 3246) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-09-03 23:19:28 UTC (rev 3247) @@ -236,7 +236,9 @@ URL url; try { url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL)); - if(!rules.blocksPathForUA(url.getPath(), userAgent)) { + String path = url.getPath(); + if(path.equals(ROBOT_SUFFIX) || + !rules.blocksPathForUA(path, userAgent)) { if(!notifiedPassed) { if(filterGroup != null) { filterGroup.setPassedRobots(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-03 22:32:57
|
Revision: 3246 http://archive-access.svn.sourceforge.net/archive-access/?rev=3246&view=rev Author: bradtofel Date: 2010-09-03 22:32:51 +0000 (Fri, 03 Sep 2010) Log Message: ----------- hadoop jar-with-dependencies manifest Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-hadoop/src/ trunk/archive-access/projects/wayback/wayback-hadoop/src/main/ trunk/archive-access/projects/wayback/wayback-hadoop/src/main/archive/ trunk/archive-access/projects/wayback/wayback-hadoop/src/main/archive/MANIFEST.MF Added: trunk/archive-access/projects/wayback/wayback-hadoop/src/main/archive/MANIFEST.MF =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop/src/main/archive/MANIFEST.MF (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop/src/main/archive/MANIFEST.MF 2010-09-03 22:32:51 UTC (rev 3246) @@ -0,0 +1,7 @@ +Manifest-Version: 1.0 +Archiver-Version: Plexus Archiver +Created-By: Apache Maven +Built-By: brad +Build-Jdk: 1.6.0_16 +Main-Class: org.archive.wayback.hadoop.SortDriver +Class-Path: hadoop-0.19.1-core.jar lib/commons-cli-2.0-SNAPSHOT.jar lib/commons-codec-1.3.jar lib/commons-httpclient-3.0.1.jar lib/commons-logging-1.0.4.jar lib/commons-logging-api-1.0.4.jar lib/commons-net-1.4.1.jar lib/hsqldb-1.8.0.10.jar lib/jets3t-0.6.1.jar lib/jetty-5.1.4.jar lib/junit-3.8.1.jar lib/kfs-0.2.0.jar lib/log4j-1.2.15.jar lib/oro-2.0.8.jar lib/servlet-api.jar lib/slf4j-api-1.4.3.jar lib/slf4j-log4j12-1.4.3.jar lib/xmlenc-0.52.jar lib/jetty-ext/commons-el.jar lib/jetty-ext/jasper-compiler.jar lib/jetty-ext/jasper-runtime.jar lib/jetty-ext/jsp-api.jar This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-03 22:31:59
|
Revision: 3245 http://archive-access.svn.sourceforge.net/archive-access/?rev=3245&view=rev Author: bradtofel Date: 2010-09-03 22:31:52 +0000 (Fri, 03 Sep 2010) Log Message: ----------- New version of the Wayback hadoop cdx sorting code, integrated with hadoop 20.2, and more flexible Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXReducer.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingInputFormat.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java 2010-09-03 22:30:36 UTC (rev 3244) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java 2010-09-03 22:31:52 UTC (rev 3245) @@ -25,7 +25,6 @@ package org.archive.wayback.hadoop; import java.io.BufferedReader; -import java.io.File; import java.io.IOException; import 
java.io.InputStreamReader; import java.net.URI; @@ -33,14 +32,12 @@ import java.util.ArrayList; import java.util.Arrays; -import org.apache.hadoop.filecache.DistributedCache; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Partitioner; +import org.apache.hadoop.mapreduce.Partitioner; /** * @@ -48,104 +45,65 @@ * @author brad * @version $Date$, $Revision$ */ -public class AlphaPartitioner implements Partitioner<Text, Text> { +public class AlphaPartitioner extends Partitioner<Text, Text> implements + Configurable { + private static String CONFIG_SPLIT_PATH_NAME = "alphapartitioner.path"; private String boundaries[] = new String[0]; - public static final String DEFAULT_PATH = "_split.txt"; - public static final String SPLIT_PATH_NAME = "alpha-partition.txt"; - public static final String SPLIT_CACHE_NAME = "alpha-partition-cache.txt"; - public static final String CACHE_SPLIT_URI_CONFIG = - "alphapartitioner.cachesplituri"; - public static final String CACHE_SPLIT_PATH_CONFIG = - "alphapartitioner.cachesplitpath"; - public static final String CACHE_SPLIT_STAMP_CONFIG = - "alphapartitioner.cachesplitstamp"; - /** - * Called by client prior to launching the job. The File argument is a - * split file, which is to be pushed into the FileSystem, and into the - * DistributedCache from there, for use by the Map tasks. 
- * @throws URISyntaxException - */ - public static void setPartitionFile(JobConf conf, File f) - throws IOException, URISyntaxException { - - FileSystem fs = FileSystem.get(conf); + Configuration conf; - Path fsSplitPath = new Path(SPLIT_PATH_NAME); - fs.copyFromLocalFile(new Path(f.getAbsolutePath()), fsSplitPath); - - String cacheURIString = SPLIT_PATH_NAME + "#" + SPLIT_CACHE_NAME; - DistributedCache.addCacheFile(new URI(cacheURIString), conf); - - FileStatus fsStat = fs.getFileStatus(fsSplitPath); - String mtime = String.valueOf(fsStat.getModificationTime()); - System.err.println("Files mtime(" + mtime + ")"); - conf.set(AlphaPartitioner.CACHE_SPLIT_URI_CONFIG,cacheURIString); - conf.set(AlphaPartitioner.CACHE_SPLIT_PATH_CONFIG,SPLIT_CACHE_NAME); - conf.set(AlphaPartitioner.CACHE_SPLIT_STAMP_CONFIG,mtime); + @Override + public int getPartition(Text key, Text value, int numPartitions) { + String keyS = key.toString(); + int loc = Arrays.binarySearch(boundaries, keyS); + if (loc < 0) { + loc = (loc * -1) - 2; + if (loc < 0) { + loc = 0; + } + } + return loc; } - public static void setPartitionFileBad(JobConf conf, File f) - throws IOException, URISyntaxException { - - FileSystem fs = FileSystem.get(conf); - - Path fsSplitPath = new Path(SPLIT_PATH_NAME); - fs.copyFromLocalFile(new Path(f.getAbsolutePath()), fsSplitPath); - - String cacheURIString = SPLIT_PATH_NAME + "#" + SPLIT_CACHE_NAME; - DistributedCache.addCacheFile(new URI(cacheURIString), conf); - - FileStatus fsStat = fs.getFileStatus(fsSplitPath); - String mtime = String.valueOf(fsStat.getModificationTime()); - System.err.println("Files mtime(" + mtime + ")"); - conf.set(AlphaPartitioner.CACHE_SPLIT_URI_CONFIG,cacheURIString); - conf.set(AlphaPartitioner.CACHE_SPLIT_PATH_CONFIG,SPLIT_CACHE_NAME); - conf.set(AlphaPartitioner.CACHE_SPLIT_STAMP_CONFIG,mtime); - } - - /** - * Get a BufferedReader on the alphabetic split file stored in the - * DistributedCache - * @throws IOException - * @throws 
URISyntaxException - */ - private static BufferedReader getPartitionFile(JobConf conf) - throws IOException, URISyntaxException { - - System.err.println("Loading split partition file..."); - FileSystem fs = FileSystem.getLocal(conf); - Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf); - - -// System.err.println("Local FS:"+fs.toString()); -// URI cacheURI = new URI(conf.get(CACHE_SPLIT_URI_CONFIG)); -// System.err.println("CacheURI: " + cacheURI.toString()); -// long mtime = Long.valueOf(conf.get(CACHE_SPLIT_STAMP_CONFIG)); -// System.err.println("Cache split timestamp: " + mtime); -// Path localSplitPath = DistributedCache.getLocalCache(cacheURI, conf, -// conf.getLocalPath(conf.getJobLocalDir()), false, mtime, -// conf.getWorkingDirectory()); -// System.err.println("LocalSplitPath: " + localSplitPath.toString()); -// FSDataInputStream in = fs.open(localSplitPath); - FSDataInputStream in = fs.open(cacheFiles[0]); - InputStreamReader is = new InputStreamReader(in); - return new BufferedReader(is); + public Configuration getConf() { + return conf; } - public void configure(JobConf conf) { + public void setConf(Configuration conf) { + this.conf = conf; + String partitionPath = getPartitionPath(conf); + String numReduceTasks = conf.get("mapred.reduce.tasks"); + System.err.println("Num configured reduce tasks:" + numReduceTasks); try { - System.err.println("Loading split file from cache..."); - loadBoundaries(getPartitionFile(conf)); - System.err.println("Loaded and Sorted split file"); + URI uri = new URI(partitionPath); + FileSystem fs = FileSystem.get(uri, conf); + Path p = new Path(partitionPath); + loadBoundaries(new BufferedReader(new InputStreamReader(fs.open(p)))); } catch (IOException e) { - throw new RuntimeException(e); + // TODO: ugh. how to handle? 
+ e.printStackTrace(); } catch (URISyntaxException e) { - throw new RuntimeException(e); + e.printStackTrace(); } } - public void loadBoundaries(BufferedReader bis) throws IOException { + /** + * @param conf Configuration for the Job + * @param path hdfs:// URI pointing to the split file + */ + public static void setPartitionPath(Configuration conf, String path) { + conf.set(CONFIG_SPLIT_PATH_NAME, path); + } + + /** + * @param conf Configuration for the Job + * @return the hdfs:// URI for the split file configured for this job + */ + public static String getPartitionPath(Configuration conf) { + return conf.get(CONFIG_SPLIT_PATH_NAME); + } + + private void loadBoundaries(BufferedReader bis) throws IOException { ArrayList<String> l = new ArrayList<String>(); while (true) { String line = bis.readLine(); @@ -157,30 +115,4 @@ boundaries = l.toArray(boundaries); Arrays.sort(boundaries); } - - /** - * @return the number of partitions in the configuration file. This is also - * the number of reduce tasks in the job. 
- */ - public int getNumPartitions() { - return boundaries.length; - } - - /** - * @param key - * @param value - * @param numReduceTasks - * @return int partition index for key - */ - public int getPartition(Text key, Text value, int numPartitions) { - String keyS = key.toString(); - int loc = Arrays.binarySearch(boundaries, keyS); - if (loc < 0) { - loc = (loc * -1) - 2; - if (loc < 0) { - loc = 0; - } - } - return loc; - } } Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java 2010-09-03 22:31:52 UTC (rev 3245) @@ -0,0 +1,152 @@ +package org.archive.wayback.hadoop; + +import java.io.IOException; + +import org.apache.commons.httpclient.URIException; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Mapper; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +/** + * @author brad + * + */ +public class CDXCanonicalizingMapper extends Mapper<Object, Text, Text, Text> +implements Configurable { + + private static String MODE_CONFIG_NAME = "cdx.map.mode"; + public static int MODE_GLOBAL = 0; + public static int MODE_FULL = 1; + + + private Configuration conf; + private int mode = MODE_GLOBAL; + private Text key = new Text(); + private Text remainder = new Text(); + private String delim = " "; + StringBuilder sb = new StringBuilder(); + + public void map(Object y, Text value, Context context) + throws IOException, InterruptedException { + if(mode == MODE_GLOBAL) { + mapGlobal(y,value,context); + } else { + 
mapFull(y,value,context); + } + } + + private static int SHA1_DIGITS = 3; + AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + private StringBuilder ksb = new StringBuilder(); + private StringBuilder vsb = new StringBuilder(); + private int i1 = 0; + private int i2 = 0; + private int i3 = 0; + private int i4 = 0; + + private void mapGlobal(Object y, Text value, Context context) + throws IOException, InterruptedException { + String s = value.toString(); + + String parts[] = s.split(delim); + if(parts.length == 10) { + if(!parts[9].contains("A")) { + ksb.setLength(0); + vsb.setLength(0); + try { + ksb.append(canonicalizer.urlStringToKey(parts[0])).append(" "); + ksb.append(parts[1]); // date + vsb.append(parts[0]).append(delim); // orig_url + vsb.append(parts[3]).append(delim); // MIME + vsb.append(parts[4]).append(delim); // HTTP_CODE + vsb.append(parts[5].substring(0, SHA1_DIGITS)).append(" "); // SHA1 + vsb.append(parts[6]).append(delim); // redirect + vsb.append(parts[7]).append(delim); // start_offset + vsb.append(parts[8]).append(".arc.gz"); // arc_prefix + key.set(ksb.toString()); + remainder.set(vsb.toString()); + context.write(key, remainder); + } catch (URIException e) { + System.err.println("Failed Canonicalize:("+parts[0]+ + ") in ("+parts[8]+"):("+parts[7]+")"); + } + } + } else { + System.err.println("Funky: Problem with line("+s+")"); + } + + } + private void mapFull(Object y, Text value, Context context) + throws IOException, InterruptedException { + String s = value.toString(); + + boolean problems = true; + i1 = s.indexOf(delim); + if(i1 > 0) { + i2 = s.indexOf(delim, i1 + 1); + if(i2 > 0) { + i3 = s.indexOf(delim, i2 + 1); + if(i3 > 0) { + i4 = s.lastIndexOf(delim); + if(i4 > i3) { + try { + ksb.setLength(0); + ksb.append(canonicalizer.urlStringToKey(s.substring(i2 + 1, i3))); + ksb.append(s.substring(i1,i4)); + key.set(ksb.toString()); + remainder.set(s.substring(i4+1)); + context.write(key, remainder); + problems = 
false; + } catch(URIException e) { + // just eat it.. problems will be true. + } + } + } + } + } + if(problems) { + System.err.println("CDX-Can: Problem with line("+s+")"); + } + } + +// private void mapOld(Object y, Text value, Context context) +// throws IOException, InterruptedException { +// String parts[] = value.toString().split(delim); +// // lets assume key is field 1-2: +// sb.setLength(0); +// sb.append(parts[0]).append(delim).append(parts[1]); +// key.set(sb.toString()); +// remainder.set(join(delim,parts,2)); +// context.write(key, remainder); +// } +// +// private String join(String delim, String parts[], int start) { +// sb.setLength(0); +// int count = parts.length -1; +// for(int i = start; i < count; i++) { +// sb.append(parts[i]).append(delim); +// } +// sb.append(parts[count]); +// return sb.toString(); +// } + + /** + * @param conf Configuration for the Job + * @param mode String mode to use, one of MODE_GLOBAL, MODE_FULL + */ + public static void setMapMode(Configuration conf, int mode) { + conf.setInt(MODE_CONFIG_NAME, mode); + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + mode = conf.getInt(MODE_CONFIG_NAME, MODE_FULL); + delim = conf.get(CDXSortDriver.TEXT_OUTPUT_DELIM_CONFIG,delim); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXReducer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXReducer.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXReducer.java 2010-09-03 22:31:52 UTC (rev 3245) @@ -0,0 +1,19 @@ +package org.archive.wayback.hadoop; + +import java.io.IOException; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Reducer; + +/** + * @author brad + * + */ +public class CDXReducer extends Reducer<Text, Text, Text, Text> { + public void reduce(Text key, Iterable<Text> values, Context context) + throws IOException, InterruptedException { + for(Text value : values) { + context.write(key, value); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXReducer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java 2010-09-03 22:30:36 UTC (rev 3244) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java 2010-09-03 22:31:52 UTC (rev 3245) @@ -4,12 +4,19 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; import java.util.ArrayList; import java.util.Date; import java.util.List; +import java.util.zip.GZIPInputStream; +import org.apache.commons.httpclient.URIException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; @@ -55,6 +62,7 @@ 
boolean compressOutput = false; boolean dereferenceInputs = false; boolean canonicalize = false; + boolean funkyInput = false; JobConf jobConf = new JobConf(getConf(), CDXSort.class); jobConf.setJobName("cdxsort"); @@ -73,6 +81,8 @@ jobConf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("--compress-output".equals(args[i])) { compressOutput = true; + } else if ("--funky-input".equals(args[i])) { + funkyInput = true; } else if ("--dereference-inputs".equals(args[i])) { dereferenceInputs = true; } else if ("--canonicalize".equals(args[i])) { @@ -107,32 +117,33 @@ File localSplitFile = new File(splitPath); FileReader is = new FileReader(localSplitFile); BufferedReader bis = new BufferedReader(is); - try { - partitioner.loadBoundaries(bis); - } catch (IOException except) { - System.err.println("ERROR: Problem loading file " + splitPath); - return printUsage(); // exits - } - jobConf.setNumReduceTasks(partitioner.getNumPartitions()); +// try { +// partitioner.loadBoundaries(bis); +// } catch (IOException except) { +// System.err.println("ERROR: Problem loading file " + splitPath); +// return printUsage(); // exits +// } +// jobConf.setNumReduceTasks(partitioner.getNumPartitions()); +// +// // copy the split file into the FS, add to the DistributedCache: +//// AlphaPartitioner.setPartitionFile(jobConf, localSplitFile); +// AlphaPartitioner.setSplitCache(jobConf, localSplitFile); +// System.err.println("uploaded split file to FS and DistributedCache"); +// +// // Set job configs: +// jobConf.setInputFormat(TextInputFormat.class); +// +// jobConf.setOutputFormat(TextOutputFormat.class); +// if (canonicalize) { +// jobConf.setMapperClass(CDXCanonicalizerMapClass.class); +// } else { +// jobConf.setMapperClass(CDXMapClass.class); +// } +// jobConf.setOutputKeyClass(Text.class); +// jobConf.setOutputValueClass(Text.class); +// jobConf.set("mapred.textoutputformat.separator", " "); +// jobConf.setPartitionerClass(AlphaPartitioner.class); - // copy the split file into 
the FS, add to the DistributedCache: - AlphaPartitioner.setPartitionFile(jobConf, localSplitFile); - System.err.println("uploaded split file to FS and DistributedCache"); - - // Set job configs: - jobConf.setInputFormat(TextInputFormat.class); - - jobConf.setOutputFormat(TextOutputFormat.class); - if (canonicalize) { - jobConf.setMapperClass(CDXCanonicalizerMapClass.class); - } else { - jobConf.setMapperClass(CDXMapClass.class); - } - jobConf.setOutputKeyClass(Text.class); - jobConf.setOutputValueClass(Text.class); - jobConf.set("mapred.textoutputformat.separator", " "); - jobConf.setPartitionerClass(AlphaPartitioner.class); - int inputCount = 0; // Set job input: if (dereferenceInputs) { @@ -150,23 +161,35 @@ // System.err.println("Added path(" + inputCount + "): " + line); // } - FileReader is2 = new FileReader(new File(inputPath)); - BufferedReader bis2 = new BufferedReader(is2); - ArrayList<String> list = new ArrayList<String>(); - while (true) { - String line = bis2.readLine(); - if (line == null) { - break; - } - list.add(line); - inputCount++; + + // PASS 2: +// FileReader is2 = new FileReader(new File(inputPath)); +// BufferedReader bis2 = new BufferedReader(is2); +// ArrayList<String> list = new ArrayList<String>(); +// +// while (true) { +// String line = bis2.readLine(); +// if (line == null) { +// break; +// } +// list.add(line); +// inputCount++; +// } +// Path arr[] = new Path[list.size()]; +// for(int i=0; i < list.size(); i++) { +// arr[i] = new Path(list.get(i)); +// } +// FileInputFormat.setInputPaths(jobConf, arr); + + // PASS 3: + if(funkyInput) { + jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class); + } else { + jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class); } - Path arr[] = new Path[list.size()]; - for(int i=0; i < list.size(); i++) { - arr[i] = new Path(list.get(i)); - } - FileInputFormat.setInputPaths(jobConf, arr); + FileInputFormat.setInputPaths(jobConf, new Path(inputPath)); + inputCount = 1; } else { 
@@ -182,10 +205,10 @@ FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class); } - System.out.println("Running on " + cluster.getTaskTrackers() - + " nodes, processing " + inputCount + " files/directories" - + " into " + outputPath + " with " - + partitioner.getNumPartitions() + " reduces."); +// System.out.println("Running on " + cluster.getTaskTrackers() +// + " nodes, processing " + inputCount + " files/directories" +// + " into " + outputPath + " with " +// + partitioner.getNumPartitions() + " reduces."); Date startTime = new Date(); System.out.println("Job started: " + startTime); jobResult = JobClient.runJob(jobConf); @@ -228,7 +251,70 @@ // reporter.setStatus("Running"); } } + public static class FunkyDeReffingCDXCanonicalizerMapClass extends DeReffingCDXCanonicalizerMapClass { + protected Mapper<LongWritable, Text, Text, Text> getInner() { + return new FunkyCDXCanonicalizerMapClass(); + } + } + public static class DeReffingCDXCanonicalizerMapClass extends MapReduceBase + implements Mapper<LongWritable, Text, Text, Text> { + protected Mapper<LongWritable, Text, Text, Text> getInner() { + return new CDXCanonicalizerMapClass(); + } + /* (non-Javadoc) + * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter) + */ + public void map(LongWritable lineNo, Text urlText, + OutputCollector<Text, Text> output, Reporter reporter) + throws IOException { + LongWritable lw = new LongWritable(); + Text tmp = new Text(); +// CDXCanonicalizerMapClass inner = new CDXCanonicalizerMapClass(); + Mapper<LongWritable, Text, Text, Text> inner = getInner(); + // arg 1 is a URL + + String urlString = urlText.toString(); + InputStream is = null; + FileSystem fs = null; + if(urlString.startsWith("http://")) { + URL u = new URL(urlString.toString()); + System.err.println("Openning URL stream for:" + urlString); + is = u.openStream(); + } else { + System.err.println("Creating 
default Filesystem for:" + urlString); + + fs = FileSystem.get(new Configuration(true)); + Path p = new Path(urlString); +// FSDataInputStream fsdis = fs.open(p); + is = fs.open(p); + + } + if(urlString.endsWith(".gz")) { + is = new GZIPInputStream(is); + } + try { + BufferedReader br = new BufferedReader( + new InputStreamReader(is)); + String tmpS = null; + long line = 0; + while((tmpS = br.readLine()) != null) { + lw.set(line++); + tmp.set(tmpS); + inner.map(lw, tmp, output, reporter); + } + is.close(); + if(fs != null) { + fs.close(); + } + } catch (IOException e) { + System.err.println("IOException with url:" + urlString); + e.printStackTrace(); + throw e; + } + } + + } /** * Mapper which reads an identity CDX line, outputting: key - canonicalized * original URL + timestamp val - everything else @@ -263,23 +349,109 @@ if(i3 > 0) { i4 = s.lastIndexOf(' '); if(i4 > i3) { - ksb.setLength(0); - ksb.append(canonicalizer.urlStringToKey(s.substring(i2 + 1, i3))); - ksb.append(s.substring(i1,i4)); - outKey.set(ksb.toString()); - outValue.set(s.substring(i4+1)); - output.collect(outKey, outValue); - problems = false; + try { + ksb.setLength(0); + ksb.append(canonicalizer.urlStringToKey(s.substring(i2 + 1, i3))); + ksb.append(s.substring(i1,i4)); + outKey.set(ksb.toString()); + outValue.set(s.substring(i4+1)); + output.collect(outKey, outValue); + problems = false; + } catch(URIException e) { + // just eat it.. problems will be true. 
+ } } } } } if(problems) { - System.err.println("Problem with line("+s+")"); + System.err.println("CDX-Can: Problem with line("+s+")"); } } } + /** + * Mapper which reads an identity Funky format CDX line, outputting: + * key - canonicalized original URL + timestamp + * val - everything else + * + * input lines are a hybrid format: + * + * ORIG_URL + * DATE + * '-' (literal) + * MIME + * HTTP_CODE + * SHA1 + * REDIRECT + * START_OFFSET + * ARC_PREFIX (sans .arc.gz) + * ROBOT_FLAG (combo of AIF - no: Archive,Index,Follow, or '-' if none) + * + * Ex: + * http://www.myow.de:80/news_show.php? 20061126032815 - text/html 200 DVKFPTOJGCLT3G5GUVLCETHLFO3222JM - 91098929 foo A + * + * Need to: + * . replace col 3 with orig url + * . replace col 1 with canonicalized orig url + * . replace SHA1 with first 4 digits of SHA1 + * . append .arc.gz to ARC_PREFIX + * . omit lines with ROBOT_FLAG containing 'A' + * . remove last column + * + * @author brad + * @version $Date$, $Revision$ + */ + public static class FunkyCDXCanonicalizerMapClass extends MapReduceBase + implements Mapper<LongWritable, Text, Text, Text> { + + private static int SHA1_DIGITS = 3; + private Text outKey = new Text(); + private Text outValue = new Text(); + AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + private StringBuilder ksb = new StringBuilder(); + private StringBuilder vsb = new StringBuilder(); + + private int i1 = 0; + private int i2 = 0; + private int i3 = 0; + private int i4 = 0; + + public void map(LongWritable lineNumber, Text line, + OutputCollector<Text, Text> output, Reporter reporter) + throws IOException { + String s = line.toString(); + + String parts[] = s.split(" "); + boolean problems = true; + if(parts.length == 10) { + if(!parts[9].contains("A")) { + ksb.setLength(0); + vsb.setLength(0); + try { + ksb.append(canonicalizer.urlStringToKey(parts[0])).append(" "); + ksb.append(parts[1]); // date + vsb.append(parts[0]).append(" "); // orig_url + 
vsb.append(parts[3]).append(" "); // MIME + vsb.append(parts[4]).append(" "); // HTTP_CODE + vsb.append(parts[5].substring(0, SHA1_DIGITS)).append(" "); // SHA1 + vsb.append(parts[6]).append(" "); // redirect + vsb.append(parts[7]).append(" "); // start_offset + vsb.append(parts[8]).append(".arc.gz"); // arc_prefix + outKey.set(ksb.toString()); + outValue.set(vsb.toString()); + output.collect(outKey, outValue); + } catch (URIException e) { + System.err.println("Failed Canonicalize:("+parts[0]+ + ") in ("+parts[8]+"):("+parts[7]+")"); + } + } + } else { + System.err.println("Funky: Problem with line("+s+")"); + } + } + } + public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new CDXSort(), args); System.exit(res); Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java 2010-09-03 22:31:52 UTC (rev 3245) @@ -0,0 +1,183 @@ +package org.archive.wayback.hadoop; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.util.Tool; +import 
org.apache.hadoop.util.ToolRunner; + +/** + * @author brad + * + */ +public class CDXSortDriver implements Tool { + Configuration conf = null; + /** + * As hard-coded into the Text RecordWriter + */ + public static String TEXT_OUTPUT_DELIM_CONFIG = + "mapred.textoutputformat.separator"; + + private static int countLinesInPath(Path path, Configuration conf) + throws IOException { + FileSystem fs = path.getFileSystem(conf); + FSDataInputStream is = fs.open(path); + BufferedReader br = new BufferedReader(new InputStreamReader(is, + "utf-8")); + int lineCount = 0; + while (br.readLine() != null) { + lineCount++; + } + is.close(); + return lineCount; + } + + static int printUsage() { + System.out.println("cdxsort <split> <input> <output>"); + System.out.println("cdxsort [OPTIONS] <split> <input> <output>"); + System.out.println("\tOPTIONS can be:"); + System.out.println("\t\t-m NUM - try to run with approximately NUM map tasks"); + System.out.println("\t\t--compress-output - compress output files with GZip"); + System.out.println("\t\t--delimiter DELIM - assume DELIM delimter for input and output, instead of default <SPACE>"); + System.out.println("\t\t--map-global - use the GLOBAL CDX map function, which implies:"); + System.out.println("\t\t\t. extra trailing field indicating HTML meta NOARCHIVE data, which should be omitted, result lines do not include the last field"); + System.out.println("\t\t\t. truncating digest field to 3 digits"); + System.out.println("\t\t\t. column 0 is original URL (identity CDX files)"); + System.out.println(); +// ToolRunner.printGenericCommandUsage(System.out); + return -1; + } + + /** + * The main driver for sort program. Invoke this method to submit the + * map/reduce job. + * + * @throws IOException + * When there is communication problems with the job tracker. 
+ */ + public int run(String[] args) throws Exception { + + String delim = " "; + + long desiredMaps = 10; + boolean compressOutput = false; + List<String> otherArgs = new ArrayList<String>(); + int mapMode = CDXCanonicalizingMapper.MODE_FULL; + for (int i = 0; i < args.length; ++i) { + try { + if ("-m".equals(args[i])) { + desiredMaps = Integer.parseInt(args[++i]); + } else if ("--compress-output".equals(args[i])) { + compressOutput = true; + } else if ("--delimiter".equals(args[i])) { + delim = args[++i]; + } else if ("--map-full".equals(args[i])) { + mapMode = CDXCanonicalizingMapper.MODE_FULL; + } else if ("--map-global".equals(args[i])) { + mapMode = CDXCanonicalizingMapper.MODE_GLOBAL; + } else { + otherArgs.add(args[i]); + } + } catch (NumberFormatException except) { + System.out.println("ERROR: Integer expected instead of " + + args[i]); + return printUsage(); + } catch (ArrayIndexOutOfBoundsException except) { + System.out.println("ERROR: Required parameter missing from " + + args[i - 1]); + return printUsage(); // exits + } + } + + // Make sure there are exactly 3 parameters left: split input output + if (otherArgs.size() != 3) { + System.out.println("ERROR: Wrong number of parameters: " + + otherArgs.size() + " instead of 3."); + return printUsage(); + } + + + String splitPathString = otherArgs.get(0); + String inputPathString = otherArgs.get(1); + String outputPathString = otherArgs.get(2); + + Path splitPath = new Path(splitPathString); + Path inputPath = new Path(inputPathString); + Path outputPath = new Path(outputPathString); + + Job job = new Job(getConf(), "cdx-sort"); + Configuration conf = job.getConfiguration(); + job.setJarByClass(CDXSortDriver.class); + + job.setMapperClass(CDXCanonicalizingMapper.class); + + job.setReducerClass(CDXReducer.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Text.class); + + // configure the "map mode" + CDXCanonicalizingMapper.setMapMode(conf, mapMode); + + // set up the delimter: + 
conf.set(TEXT_OUTPUT_DELIM_CONFIG, delim); + + if (compressOutput) { + FileOutputFormat.setCompressOutput(job, true); + FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); + } + + // set up the Partitioner, including number of reduce tasks: + FileSystem fs = inputPath.getFileSystem(conf); + + int splitCount = countLinesInPath(splitPath, conf); + System.err.println("Split/Reduce count:" + splitCount); + job.setNumReduceTasks(splitCount); + + AlphaPartitioner.setPartitionPath(conf, splitPathString); + job.setPartitionerClass(AlphaPartitioner.class); + + // calculate the byte size to get the correct number of map tasks: + FileStatus inputStatus = fs.getFileStatus(inputPath); + long inputLen = inputStatus.getLen(); + long bytesPerMap = (int) inputLen / desiredMaps; + + FileInputFormat.addInputPath(job, inputPath); + FileInputFormat.setMaxInputSplitSize(job, bytesPerMap); + job.setInputFormatClass(LineDereferencingInputFormat.class); + + FileOutputFormat.setOutputPath(job, outputPath); + + return (job.waitForCompletion(true) ? 
0 : 1); + } + + /** + * @param args + * @throws Exception + */ + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(new Configuration(), new CDXSortDriver(), args); + System.exit(res); + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingInputFormat.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingInputFormat.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingInputFormat.java 2010-09-03 22:31:52 UTC (rev 3245) @@ -0,0 +1,38 @@ +package org.archive.wayback.hadoop; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; + +/** + * FileInputFormat subclass which assumes the configured input files are lines + * containing hdfs:// pointers to the actual Text data. 
+ * + * @author brad + * + */ +public class LineDereferencingInputFormat extends FileInputFormat<Text, Text>{ + TextInputFormat tif = null; + + @Override + public List<InputSplit> getSplits(JobContext context) throws IOException { + if(tif == null) { + tif = new TextInputFormat(); + } + return tif.getSplits(context); + } + + @Override + public RecordReader<Text, Text> createRecordReader(InputSplit split, + TaskAttemptContext context) throws IOException, + InterruptedException { + return new LineDereferencingRecordReader(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingInputFormat.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java 2010-09-03 22:31:52 UTC (rev 3245) @@ -0,0 +1,105 @@ +package org.archive.wayback.hadoop; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.zip.GZIPInputStream; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; + +/** + * RecordReader which reads pointers to actual 
files from an internal + * LineRecordReader, producing a LineRecordReader for the files pointed to by + * the actual input. + * + * @author brad + * + */ +public class LineDereferencingRecordReader extends RecordReader<Text, Text>{ + LineRecordReader internal = new LineRecordReader(); + + FileSystem fileSystem = null; + Text key = null; + Text value = null; + BufferedReader curReader = null; + String curFile = null; + long curLine = 0; + float progress = 0.0f; + @Override + public void initialize(InputSplit split, TaskAttemptContext context) + throws IOException, InterruptedException { + FileSplit fileSplit = (FileSplit) split; + fileSystem = fileSplit.getPath().getFileSystem(context.getConfiguration()); + internal.initialize(split, context); + } + + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + if(key == null) { + key = new Text(); + } + if(value == null) { + value = new Text(); + } + while(true) { + if(curReader == null) { + // are there more? + if(internal.nextKeyValue()) { + progress = internal.getProgress(); + curFile = internal.getCurrentValue().toString(); + Path path = new Path(curFile); + InputStream is = fileSystem.open(path); + // TODO: use the real Codec stuff.. 
+ if(curFile.endsWith(".gz")) { + is = new GZIPInputStream(is); + } + curReader = new BufferedReader(new InputStreamReader(is)); + + } else { + // all done: + return false; + } + } + // try to read another line: + String nextLine = curReader.readLine(); + if(nextLine != null) { + key.set(curFile+":"+curLine); + value.set(nextLine); + curLine++; + return true; + } + curReader = null; + curFile = null; + curLine = 0; + } + } + + @Override + public Text getCurrentKey() throws IOException, + InterruptedException { + return key; + } + + @Override + public Text getCurrentValue() throws IOException, InterruptedException { + return value; + } + + @Override + public float getProgress() throws IOException, InterruptedException { + return progress; + } + + @Override + public void close() throws IOException { + internal.close(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java 2010-09-03 22:30:36 UTC (rev 3244) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java 2010-09-03 22:31:52 UTC (rev 3245) @@ -11,8 +11,8 @@ int exitCode = -1; ProgramDriver pgd = new ProgramDriver(); try { - pgd.addClass("cdxsort", CDXSort.class, - "A map/reduce program that counts the words in the input files."); + pgd.addClass("cdxsort", CDXSortDriver.class, + "A map/reduce program that canonicalizes and provides a total order sort into multiple CDX files"); pgd.driver(args); // Success exitCode = 0; This 
was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-09-03 22:30:43
|
Revision: 3244 http://archive-access.svn.sourceforge.net/archive-access/?rev=3244&view=rev Author: bradtofel Date: 2010-09-03 22:30:36 +0000 (Fri, 03 Sep 2010) Log Message: ----------- POM updates, newer version of hadoop, including some httpclient exclusions to prevent jar resolution problem with heritrix httpclient overrides... Modified Paths: -------------- trunk/archive-access/projects/wayback/pom.xml trunk/archive-access/projects/wayback/wayback-core/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml Modified: trunk/archive-access/projects/wayback/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2010-08-27 23:35:14 UTC (rev 3243) +++ trunk/archive-access/projects/wayback/pom.xml 2010-09-03 22:30:36 UTC (rev 3244) @@ -93,7 +93,7 @@ </releases> <snapshots> <enabled>true</enabled> - <updatePolicy>never</updatePolicy> + <updatePolicy>daily</updatePolicy> <checksumPolicy>fail</checksumPolicy> </snapshots> <id>internetarchive</id> @@ -101,6 +101,27 @@ <url>http://builds.archive.org:8080/maven2</url> <layout>default</layout> </repository> + +<!-- + <repository> + <releases> + <enabled>true</enabled> + <updatePolicy>daily</updatePolicy> + <checksumPolicy>warn</checksumPolicy> + </releases> + <snapshots> + <enabled>true</enabled> + <updatePolicy>daily</updatePolicy> + <checksumPolicy>fail</checksumPolicy> + </snapshots> + <id>dspace</id> + <name>DSpace Maven Repository</name> + <url>http://maven.dspace.org/</url> + <layout>default</layout> + </repository> + --> + + </repositories> <pluginRepositories> @@ -181,7 +202,7 @@ <id>website</id> <name>Website</name> <!--Pass as command-line system property to maven--> - <url>${website.url}/projects/${pom.artifactId}</url> + <url>${website.url}/projects/${project.artifactId}</url> </site> </distributionManagement> Modified: 
trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-08-27 23:35:14 UTC (rev 3243) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-09-03 22:30:36 UTC (rev 3244) @@ -102,11 +102,11 @@ <!-- Doh... I'm not sure what package is configuring org.apache.commons-logging to use log4j, but it's breaking some command line tools. - --> <dependency> <groupId>log4j</groupId> <artifactId>log4j</artifactId> <version>1.2.14</version> </dependency> + --> </dependencies> </project> Modified: trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2010-08-27 23:35:14 UTC (rev 3243) +++ trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2010-09-03 22:30:36 UTC (rev 3244) @@ -35,6 +35,18 @@ <descriptorRef>jar-with-dependencies</descriptorRef> </descriptorRefs> <finalName>wayback-hadoop</finalName> + <archive> + <manifestFile>src/main/archive/MANIFEST.MF</manifestFile> +<!-- + <manifest> + <mainClass>org.archive.wayback.hadoop.SortDriver</mainClass> + </manifest> + <manifestEntries> + <Class-Path>hadoop-0.19.1-core.jar lib/commons-cli-2.0-SNAPSHOT.jar lib/commons-codec-1.3.jar lib/commons-httpclient-3.0.1.jar lib/commons-logging-1.0.4.jar lib/commons-logging-api-1.0.4.jar lib/commons-net-1.4.1.jar lib/hsqldb-1.8.0.10.jar lib/jets3t-0.6.1.jar lib/jetty-5.1.4.jar lib/junit-3.8.1.jar lib/kfs-0.2.0.jar lib/log4j-1.2.15.jar lib/oro-2.0.8.jar lib/servlet-api.jar lib/slf4j-api-1.4.3.jar lib/slf4j-log4j12-1.4.3.jar lib/xmlenc-0.52.jar lib/jetty-ext/commons-el.jar lib/jetty-ext/jasper-compiler.jar lib/jetty-ext/jasper-runtime.jar lib/jetty-ext/jsp-api.jar</Class-Path> + </manifestEntries> +--> + </archive> + </configuration> <executions> <execution> Modified: 
trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2010-08-27 23:35:14 UTC (rev 3243) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2010-09-03 22:30:36 UTC (rev 3244) @@ -1,4 +1,5 @@ -<?xml version="1.0" encoding="UTF-8"?><project> +<?xml version="1.0" encoding="UTF-8"?> +<project> <parent> <artifactId>wayback</artifactId> <groupId>org.archive</groupId> @@ -19,11 +20,18 @@ <version>3.8.1</version> <scope>test</scope> </dependency> + <dependency> - <groupId>org.apache.mahout.hadoop</groupId> <artifactId>hadoop-core</artifactId> - <scope>provided</scope> - <version>0.19.1</version> + <groupId>org.apache.hadoop</groupId> + <version>0.20.2</version> + <scope>compile</scope> + <exclusions> + <exclusion> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> <groupId>org.archive.wayback</groupId> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-27 23:35:22
|
Revision: 3243 http://archive-access.svn.sourceforge.net/archive-access/?rev=3243&view=rev Author: bradtofel Date: 2010-08-27 23:35:14 +0000 (Fri, 27 Aug 2010) Log Message: ----------- LOGGING: changed all log4j references to java.util.logging TWEAK: reorganized many import stanzas Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/remote/RemoteExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/authenticationcontrol/IPMatchesBooleanOperator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeGateRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/RemoteResourceIndex.java 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/dynamic/DynamicCDXIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/GuardRailFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/IndexClient.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/FileProxyServlet.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/RemoteResourceFileLocationDB.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/partition/Partitioner.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestMapper.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/SpringReader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/StaticFileRequestHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/ServerRelativeArchivalRedirect.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/remote/RemoteExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/remote/RemoteExclusionFilter.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/remote/RemoteExclusionFilter.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -31,7 +31,7 @@ import java.net.URL; import java.net.URLEncoder; -import org.apache.log4j.Logger; +import java.util.logging.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.filters.ExclusionFilter; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java 2010-08-26 23:43:06 UTC (rev 
3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -34,7 +34,7 @@ import java.util.LinkedList; import java.util.List; -import org.apache.log4j.Logger; +import java.util.logging.Logger; /** * Class which parses a robots.txt file, storing the rules contained therein, @@ -112,7 +112,7 @@ current = new ArrayList<String>(); } rules.put(ua, current); - LOGGER.trace("Found User-agent(" + ua + ") rules..."); + LOGGER.fine("Found User-agent(" + ua + ") rules..."); continue; } if (read.matches("(?i)Disallow:.*")) { @@ -145,7 +145,7 @@ return false; } else { - LOGGER.trace("UA(" + curUA + ") has (" + LOGGER.fine("UA(" + curUA + ") has (" + disallowedPath + ") blocked...(" + disallowedPath.length() + ")"); if (disallowedPath.equals("/") || path.startsWith(disallowedPath)) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -25,10 +25,9 @@ package org.archive.wayback.accesscontrol.staticmap; import java.util.Map; +import java.util.logging.Logger; - import org.apache.commons.httpclient.URIException; -import org.apache.log4j.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.surt.SURTTokenizer; @@ -64,7 +63,7 @@ if(nextSearch == null) { break; } - LOGGER.trace("EXCLUSION-MAP:Checking " + nextSearch); + LOGGER.fine("EXCLUSION-MAP:Checking " + 
nextSearch); if(exclusionMap.containsKey(nextSearch)) { LOGGER.info("EXCLUSION-MAP: EXCLUDED: \"" + nextSearch + "\" (" + url +")"); return true; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -28,9 +28,8 @@ import java.io.IOException; import java.util.HashMap; import java.util.Map; +import java.util.logging.Logger; - -import org.apache.log4j.Logger; import org.archive.wayback.accesscontrol.ExclusionFilterFactory; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.surt.SURTTokenizer; @@ -72,7 +71,7 @@ long currentMod = file.lastModified(); if(currentMod == lastUpdated) { if(currentMod == 0) { - LOGGER.error("No exclude file at " + file.getAbsolutePath()); + LOGGER.severe("No exclude file at " + file.getAbsolutePath()); } return; } @@ -85,7 +84,7 @@ lastUpdated = -1; currentMap = null; e.printStackTrace(); - LOGGER.error("Reload " + file.getAbsolutePath() + " FAILED:" + + LOGGER.severe("Reload " + file.getAbsolutePath() + " FAILED:" + e.getLocalizedMessage()); } } @@ -101,7 +100,7 @@ } String surt = line.startsWith("(") ? 
line : SURTTokenizer.prefixKey(line); - LOGGER.trace("EXCLUSION-MAP: adding " + surt); + LOGGER.fine("EXCLUSION-MAP: adding " + surt); newMap.put(surt, null); } itr.close(); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/authenticationcontrol/IPMatchesBooleanOperator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/authenticationcontrol/IPMatchesBooleanOperator.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/authenticationcontrol/IPMatchesBooleanOperator.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -26,8 +26,8 @@ import java.util.ArrayList; import java.util.List; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.util.IPRange; import org.archive.wayback.util.operator.BooleanOperator; @@ -62,7 +62,7 @@ if(range.setRange(ip)) { this.allowedRanges.add(range); } else { - LOGGER.error("Unable to parse range (" + ip + ")"); + LOGGER.severe("Unable to parse range (" + ip + ")"); } } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -28,8 +28,8 @@ import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.io.ArchiveFileConstants; import org.archive.io.WriterPoolSettings; import 
org.archive.io.arc.ARCConstants; @@ -55,10 +55,17 @@ ARCCacheDirectory.class.getName()); private int poolWriters = 5; - private int maxPoolWait = 60 * 1000; + private int maxPoolWait = 5 * 1000; private long maxARCSize = ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE; private String arcPrefix = "wayback-live"; + /** + * template string used to configure the ARC writer pool + */ + public static String LIVE_WAYBACK_TEMPLATE = + "${prefix}-${timestamp17}-${serialno}"; + + private File arcDir = null; private ARCWriterPool pool = null; @@ -131,19 +138,11 @@ private WriterPoolSettings getSettings(final boolean isCompressed, final String prefix, final File[] arcDirs) { return new WriterPoolSettings() { - public long getMaxSize() { - return maxARCSize; - } - public List<File> getOutputDirs() { return Arrays.asList(arcDirs); } - public boolean isCompressed() { - return isCompressed; - } - - @SuppressWarnings("unchecked") + @SuppressWarnings({ "unchecked", "rawtypes" }) public List getMetadata() { return null; } @@ -152,9 +151,19 @@ return prefix; } - public String getSuffix() { - return null; + public boolean getCompress() { + // TODO Auto-generated method stub + return isCompressed; } + + public long getMaxFileSizeBytes() { + // TODO Auto-generated method stub + return maxARCSize; + } + + public String getTemplate() { + return LIVE_WAYBACK_TEMPLATE; + } }; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -26,20 +26,22 @@ package org.archive.wayback.liveweb; import java.io.IOException; +import java.util.logging.Logger; import 
javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.apache.commons.httpclient.URIException; -import org.apache.log4j.Logger; import org.archive.wayback.util.webapp.AbstractRequestHandler; +import org.archive.wayback.util.webapp.ShutdownListener; /** * @author brad * */ -public class ARCRecordingProxy extends AbstractRequestHandler { +public class ARCRecordingProxy extends AbstractRequestHandler +implements ShutdownListener { private final static String EXPIRES_HEADER = "Expires"; private long expiresMS = 60 * 60 * 1000; @@ -142,4 +144,9 @@ public void setFakeExpiresMS(long fakeExpiresMS) { this.fakeExpiresMS = fakeExpiresMS; } + + public void shutdown() { + arcCacheDir.shutdown(); + + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -28,6 +28,7 @@ import java.io.IOException; import java.util.Iterator; import java.util.Map; +import java.util.logging.Logger; import java.util.zip.GZIPInputStream; import javax.servlet.ServletException; @@ -39,7 +40,6 @@ import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.methods.GetMethod; -import org.apache.log4j.Logger; import org.archive.io.arc.ARCRecord; import org.archive.wayback.core.Resource; import org.archive.wayback.exception.ResourceNotAvailableException; @@ -97,7 +97,7 @@ try { res = ResourceFactory.ARCArchiveRecordToResource(r, null); } catch (ResourceNotAvailableException e) { - 
LOGGER.error(e); + LOGGER.severe(e.getMessage()); throw new IOException(e); } httpResponse.setStatus(res.getStatusCode()); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -29,8 +29,10 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.net.ConnectException; import java.net.UnknownHostException; import java.util.Date; +import java.util.logging.Logger; import org.apache.commons.httpclient.ConnectTimeoutException; import org.apache.commons.httpclient.Header; @@ -42,8 +44,6 @@ import org.apache.commons.httpclient.SimpleHttpConnectionManager; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.cookie.CookiePolicy; -import org.apache.commons.httpclient.params.HttpClientParams; -import org.apache.log4j.Logger; import org.archive.httpclient.HttpRecorderGetMethod; import org.archive.io.RecordingInputStream; import org.archive.io.arc.ARCWriter; @@ -78,6 +78,8 @@ private int socketTimeoutMS = 10000; private int outBufferSize = 1024 * 100; private int inBufferSize = 1024 * 100; +// private int outBufferSize = 10; +// private int inBufferSize = 100; private final ThreadLocal<HttpClient> tl = new ThreadLocal<HttpClient>() { @@ -87,9 +89,6 @@ manager.getParams().setConnectionTimeout(connectionTimeoutMS); manager.getParams().setSoTimeout(socketTimeoutMS); http.setHttpConnectionManager(manager); - HttpClientParams clientParams = new HttpClientParams(); -// LOGGER.warn("Setting HTTP UserAgent to " + userAgent); -// clientParams.setParameter("http.useragent", 
userAgent); return http; } }; @@ -139,17 +138,21 @@ getMethod.setRequestHeader("User-Agent", userAgent); int code = client.executeMethod(getMethod); LOGGER.info("URL(" + url + ") HTTP:" + code); - ByteOp.discardStream(getMethod.getResponseBodyAsStream()); +// ByteOp.discardStream(getMethod.getResponseBodyAsStream()); + ByteOp.copyStream(getMethod.getResponseBodyAsStream(), System.out); getMethod.releaseConnection(); gotUrl = true; } catch (URIException e) { e.printStackTrace(); } catch (UnknownHostException e) { - LOGGER.warn("Unknown host for " + url); + LOGGER.warning("Unknown host for " + url); } catch (ConnectTimeoutException e) { // TODO: should we act like it's a full block? - LOGGER.warn("Timeout out connecting to " + url); + LOGGER.warning("Timeout out connecting to " + url); + } catch (ConnectException e) { + LOGGER.warning("ConnectionRefused to " + url); + } catch (HttpException e) { e.printStackTrace(); // we have to let IOExceptions out, problems caused by local disk Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -1,8 +1,9 @@ package org.archive.wayback.memento; +import java.util.logging.Logger; + import javax.servlet.http.HttpServletRequest; -import org.apache.log4j.Logger; import org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; @@ -37,7 +38,7 @@ BetterRequestException { String requestPath = accessPoint.translateRequestPathQuery(httpRequest); - 
LOGGER.trace("requestpath:" + requestPath); + LOGGER.fine("requestpath:" + requestPath); if (requestPath.startsWith("timebundle")) { @@ -72,9 +73,9 @@ .substring(requestPath.indexOf("/") + 1); String format = urlStrplus.substring(0, urlStrplus.indexOf("/")); - LOGGER.trace("format:" + format); + LOGGER.fine("format:" + format); String urlStr = urlStrplus.substring(urlStrplus.indexOf("/") + 1); - LOGGER.trace("id:" + urlStr); + LOGGER.fine("id:" + urlStr); WaybackRequest wbRequest = new WaybackRequest(); if (wbRequest.getStartTimestamp() == null) { wbRequest.setStartTimestamp(getEarliestTimestamp()); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeGateRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeGateRequestParser.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeGateRequestParser.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -5,10 +5,10 @@ import java.util.Date; import java.util.Iterator; import java.util.List; +import java.util.logging.Logger; import javax.servlet.http.HttpServletRequest; -import org.apache.log4j.Logger; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.BetterRequestException; @@ -55,7 +55,7 @@ String base = accessPoint.translateRequestPath(httpRequest); String requestPath = accessPoint.translateRequestPathQuery(httpRequest); - LOGGER.trace("requestPath:" + requestPath); + LOGGER.fine("requestPath:" + requestPath); if (base.startsWith(MEMENTO_BASE)) { // strip leading "timegate/": Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -30,12 +30,12 @@ import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; +import java.util.logging.Logger; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import org.apache.log4j.Logger; import org.archive.wayback.ResourceIndex; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/RemoteResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/RemoteResourceIndex.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/RemoteResourceIndex.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -28,12 +28,12 @@ import java.io.IOException; import java.net.URL; import java.net.URLConnection; +import java.util.logging.Logger; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import org.apache.log4j.Logger; import org.archive.wayback.ResourceIndex; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; @@ -96,7 +96,7 @@ if(factory != null) { builder = factory.newDocumentBuilder(); if (!builder.isNamespaceAware()) { - LOGGER.error("Builder is not 
namespace aware."); + LOGGER.severe("Builder is not namespace aware."); } } } catch (ParserConfigurationException e) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -29,8 +29,8 @@ import java.util.HashMap; import java.util.Iterator; import java.util.NoSuchElementException; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.UrlSearchResult; import org.archive.wayback.util.CloseableIterator; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -24,8 +24,9 @@ */ package org.archive.wayback.resourceindex.bdb; +import java.util.logging.Logger; + import org.apache.commons.httpclient.URIException; -import org.apache.log4j.Logger; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; @@ -74,7 +75,7 @@ urlKey = 
canonicalizer.urlStringToKey(origUrl); } catch (URIException e) { // e.printStackTrace(); - LOGGER.warn("FAILED canonicalize(" + origUrl +")"); + LOGGER.warning("FAILED canonicalize(" + origUrl +")"); urlKey = origUrl; } keySB.append(urlKey); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -25,7 +25,8 @@ package org.archive.wayback.resourceindex.cdx; -import org.apache.log4j.Logger; +import java.util.logging.Logger; + import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.cdx.format.CDXFormat; import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; @@ -44,7 +45,7 @@ try { return cdx.parseResult(line); } catch (CDXFormatException e) { - LOGGER.warn("CDXFormat(" + line + "):"+e.getLocalizedMessage()); + LOGGER.warning("CDXFormat(" + line + "):"+e.getLocalizedMessage()); } return null; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -25,7 +25,8 @@ package 
org.archive.wayback.resourceindex.cdx; -import org.apache.log4j.Logger; +import java.util.logging.Logger; + import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.url.UrlOperations; @@ -109,7 +110,7 @@ try { compressedOffset = Long.parseLong(tokens[nextToken]); } catch (NumberFormatException e) { - LOGGER.warn("Bad compressed Offset field("+nextToken+") in (" + + LOGGER.warning("Bad compressed Offset field("+nextToken+") in (" + line +")"); return null; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/dynamic/DynamicCDXIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/dynamic/DynamicCDXIndex.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/dynamic/DynamicCDXIndex.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -32,18 +32,15 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Set; +import java.util.logging.Logger; import java.util.regex.Pattern; -import org.apache.log4j.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceIndexNotAvailableException; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.FileDownloader; import org.archive.wayback.resourceindex.CompositeSearchResultSource; import org.archive.wayback.resourceindex.cdx.CDXIndex; -import org.archive.wayback.resourceindex.cdx.dynamic.CDXDefinitionFile; -import org.archive.wayback.resourceindex.cdx.dynamic.MD5LocationFile; -import org.archive.wayback.resourceindex.cdx.dynamic.RangeAssignmentFile; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.FileDownloader; /** * A CompositeSearchResultSource that 
autmatically manages it's list of sources @@ -274,7 +271,7 @@ break; } else { tmpTarget.delete(); - LOGGER.warn("Bad file contents. Location(" + + LOGGER.warning("Bad file contents. Location(" + loc +") should have MD5(" + neededMD5 + ") but has MD5(" + gotMD5 +")"); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -31,9 +31,9 @@ import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; +import java.util.logging.Logger; import org.apache.commons.httpclient.URIException; -import org.apache.log4j.Logger; import org.archive.wayback.ResourceIndex; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.SearchResults; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/GuardRailFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/GuardRailFilter.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/GuardRailFilter.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -24,7 +24,8 @@ */ package org.archive.wayback.resourceindex.filters; -import org.apache.log4j.Logger; +import java.util.logging.Logger; + import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.ObjectFilter; @@ -55,7 +56,7 @@ public 
int filterObject(CaptureSearchResult r) { recordsScanned++; if(recordsScanned > maxRecordsToScan) { - LOGGER.warn("Hit max requests on " + r.getUrlKey() + " " + LOGGER.warning("Hit max requests on " + r.getUrlKey() + " " + r.getCaptureTimestamp()); return FILTER_ABORT; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/IndexClient.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/IndexClient.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/IndexClient.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -24,20 +24,20 @@ */ package org.archive.wayback.resourceindex.updater; +import java.io.BufferedOutputStream; import java.io.File; -import java.io.BufferedOutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.util.Iterator; +import java.util.logging.Logger; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.InputStreamRequestEntity; import org.apache.commons.httpclient.methods.PutMethod; -import org.apache.log4j.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; @@ -114,7 +114,7 @@ } File toBeMergedFile = new File(toBeMergedDir,base); if(toBeMergedFile.exists()) { - LOGGER.warn("WARNING: "+toBeMergedFile.getAbsolutePath() + + LOGGER.warning("WARNING: "+toBeMergedFile.getAbsolutePath() + "already exists!"); } else { if(cdx.renameTo(toBeMergedFile)) { @@ -122,7 +122,7 @@ " for merging."); added = true; } else { - 
LOGGER.error("FAILED rename("+cdx.getAbsolutePath()+ + LOGGER.severe("FAILED rename("+cdx.getAbsolutePath()+ ") to ("+toBeMergedFile.getAbsolutePath()+")"); } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -26,8 +26,8 @@ import java.io.File; import java.io.IOException; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ConfigurationException; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -30,10 +30,9 @@ import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; +import java.util.logging.Logger; import java.util.zip.GZIPInputStream; -import org.apache.log4j.Logger; - /** * @author brad * @@ -74,7 +73,7 @@ StringBuilder sb = new StringBuilder(16); sb.append(BYTES_HEADER).append(offset).append(BYTES_MINUS); sb.append((offset + BLOCK_SIZE)-1); - LOGGER.trace("Reading block:" + urlOrPath + 
"("+sb.toString()+")"); + LOGGER.fine("Reading block:" + urlOrPath + "("+sb.toString()+")"); // TODO: timeouts URL u = new URL(urlOrPath); URLConnection uc = u.openConnection(); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -29,19 +29,15 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import java.io.InputStreamReader; import java.io.RandomAccessFile; import java.util.Iterator; import java.util.List; -import java.util.RandomAccess; +import java.util.logging.Logger; import java.util.zip.GZIPInputStream; -import org.apache.log4j.Logger; -import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.RuntimeIOException; import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.webapp.AccessPoint; /** * @author brad Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -30,8 +30,8 @@ import java.util.ArrayList; 
import java.util.HashMap; import java.util.Iterator; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.resourceindex.SearchResultSource; @@ -97,7 +97,7 @@ String line = lines.next(); String[] parts = line.split("\\s"); if(parts.length != 2) { - LOGGER.error("Bad line(" + line +") in (" + + LOGGER.severe("Bad line(" + line +") in (" + chunkMapPath + ")"); throw new IOException("Bad line(" + line +") in (" + chunkMapPath + ")"); @@ -151,7 +151,7 @@ numBlocks++; String parts[] = blockDescriptor.split("\t"); if(parts.length != 3) { - LOGGER.error("Bad line(" + blockDescriptor +") in (" + + LOGGER.severe("Bad line(" + blockDescriptor +") in (" + chunkMapPath + ")"); throw new ResourceIndexNotAvailableException("Bad line(" + blockDescriptor + ")"); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -25,11 +25,11 @@ package org.archive.wayback.resourcestore; import java.io.IOException; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.ResourceStore; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.Resource; -import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import 
org.archive.wayback.resourcestore.resourcefile.ResourceFactory; @@ -85,7 +85,7 @@ // which means we've already read some } catch (IOException e) { - LOGGER.warn("Unable to retrieve resource from " + url); + LOGGER.warning("Unable to retrieve resource from " + url); } if(r != null) { break; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -27,14 +27,14 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.util.logging.Logger; import org.apache.commons.httpclient.Header; -import org.apache.log4j.Logger; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.htmllex.ContextAwareLexer; +import org.archive.wayback.util.htmllex.ParseContext; import org.archive.wayback.util.htmllex.ParseEventDelegator; -import org.archive.wayback.util.htmllex.ParseContext; import org.archive.wayback.util.url.UrlOperations; import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; @@ -156,13 +156,13 @@ } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); - LOGGER.warn(fileContext + " " + e.getLocalizedMessage()); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); - LOGGER.warn(fileContext + " " + e.getLocalizedMessage()); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); } catch (IOException e) { - 
LOGGER.warn(fileContext + " " + e.getLocalizedMessage()); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); } } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -29,8 +29,8 @@ import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import org.archive.wayback.util.CloseableIterator; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -28,8 +28,8 @@ import java.io.IOException; import java.io.PrintWriter; import java.util.Iterator; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; @@ -103,7 +103,7 @@ try { pathsOrUrls = db.nameToUrls(name); } catch(IOException e) { - LOGGER.error("FAILED TO LOOKUP(" + name + ")" + + 
LOGGER.severe("FAILED TO LOOKUP(" + name + ")" + e.getLocalizedMessage()); return false; } @@ -118,7 +118,7 @@ } } } catch(IOException e) { - LOGGER.error("FAILED to index or upload (" + name + ")"); + LOGGER.severe("FAILED to index or upload (" + name + ")"); e.printStackTrace(); } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -26,13 +26,13 @@ import java.io.File; import java.io.IOException; +import java.util.logging.Logger; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpParser; import org.apache.commons.httpclient.StatusLine; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.EncodingUtil; -import org.apache.log4j.Logger; import org.archive.io.ArchiveRecordHeader; import org.archive.io.RecoverableIOException; import org.archive.io.arc.ARCConstants; @@ -166,7 +166,7 @@ String urlKey = canonicalizer.urlStringToKey(origUrl); result.setUrlKey(urlKey); } catch (URIException e) { - LOGGER.warn("FAILED canonicalize(" + origUrl + "):" + + LOGGER.warning("FAILED canonicalize(" + origUrl + "):" + file + " " + offset); result.setUrlKey(origUrl); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/FileProxyServlet.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/FileProxyServlet.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/FileProxyServlet.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -33,6 +33,7 @@ import java.net.InetSocketAddress; import java.net.Socket; import java.net.URL; +import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -41,7 +42,6 @@ import javax.servlet.http.HttpServletResponse; import org.apache.commons.httpclient.ChunkedInputStream; -import org.apache.log4j.Logger; import org.archive.util.anvl.ANVLRecord; import org.archive.wayback.util.http.HttpRequestMessage; import org.archive.wayback.util.http.HttpResponse; @@ -96,7 +96,7 @@ if(urls == null || urls.length == 0) { - LOGGER.warn("No locations for " + location.getName()); + LOGGER.warning("No locations for " + location.getName()); httpResponse.sendError(HttpServletResponse.SC_NOT_FOUND, "Unable to locate("+ location.getName() +")"); } else { @@ -110,12 +110,12 @@ break; } } catch(IOException e) { - LOGGER.warn("failed proxy of " + urlString + " " + + LOGGER.warning("failed proxy of " + urlString + " " + e.getLocalizedMessage()); } } if(ds == null) { - LOGGER.warn("No successful locations for " + + LOGGER.warning("No successful locations for " + location.getName()); httpResponse.sendError(HttpServletResponse.SC_BAD_GATEWAY, "failed proxy of ("+ location.getName() +")"); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/RemoteResourceFileLocationDB.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/RemoteResourceFileLocationDB.java 2010-08-26 23:43:06 UTC (rev 3242) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/RemoteResourceFileLocationDB.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -31,6 +31,7 @@ import java.io.InputStreamReader; import java.util.Arrays; import java.util.Iterator; +import java.util.logging.Logger; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpStatus; @@ -38,8 +39,6 @@ import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.PostMethod; import org.apache.commons.httpclient.util.ParameterFormatter; -import org.apache.log4j.Logger; -import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBServlet; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.WrappedCloseableIterator; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBUpdater.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBUpdater.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -27,8 +27,8 @@ import java.io.File; import java.io.IOException; import java.util.Iterator; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.resourcestore.resourcefile.ResourceFileList; import org.archive.wayback.resourcestore.resourcefile.ResourceFileLocation; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java =================================================================== 
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -29,10 +29,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.logging.Logger; -import org.apache.log4j.Logger; - /** * Local directory tree holding ARC and WARC files. * @@ -87,7 +86,7 @@ } } } else { - LOGGER.warn(root.getAbsolutePath() + " is not a directory."); + LOGGER.warning(root.getAbsolutePath() + " is not a directory."); return; } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -28,8 +28,8 @@ import java.io.IOException; import java.util.HashMap; import java.util.Iterator; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; @@ -80,7 +80,7 @@ if(location != null) { list.add(location); } else { - LOGGER.warn("Bad parse of line(" + line + ") in (" + + LOGGER.warning("Bad parse of line(" + line + ") in (" + source.getAbsolutePath() + ")"); } } Modified: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -27,8 +27,8 @@ import java.io.File; import java.io.IOException; import java.util.List; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBUpdater; import org.archive.wayback.util.DirMaker; @@ -87,7 +87,7 @@ LOGGER.info("Synchronized " + name); } catch (IOException e) { e.printStackTrace(); - LOGGER.warn("FAILED Synchronize " + name + e.getMessage()); + LOGGER.warning("FAILED Synchronize " + name + e.getMessage()); } } @@ -120,7 +120,7 @@ if(sleepInterval > 0) { sleep(sleepInterval); } else { - LOGGER.warn("Last Synchronize took " + syncDuration + + LOGGER.warning("Last Synchronize took " + syncDuration + " where interval is " + interval + ". 
Not sleeping."); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -32,8 +32,8 @@ import java.util.Arrays; import java.util.HashMap; import java.util.concurrent.atomic.AtomicInteger; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.io.arc.ARCConstants; import org.archive.io.arc.ARCWriter; import org.archive.util.ArchiveUtils; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/partition/Partitioner.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/partition/Partitioner.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/partition/Partitioner.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -7,8 +7,8 @@ import java.util.Iterator; import java.util.List; import java.util.TimeZone; +import java.util.logging.Logger; -import org.apache.log4j.Logger; import org.archive.wayback.util.partition.size.DayPartitionSize; import org.archive.wayback.util.partition.size.HourPartitionSize; import org.archive.wayback.util.partition.size.MonthPartitionSize; @@ -200,7 +200,7 @@ } if(itr.hasNext()) { // eew... Likely bad usage. is this an error? 
- LOGGER.warn("Not all elements fit in partitions!"); + LOGGER.warning("Not all elements fit in partitions!"); } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -389,6 +389,11 @@ line + ") skipped (" + parts[column] + ")"); e.printStackTrace(); continue; + } catch (StringIndexOutOfBoundsException e) { + System.err.println("Invalid URL in line " + lineNumber + " (" + + line + ") skipped (" + parts[column] + ")"); + e.printStackTrace(); + continue; } } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-08-26 23:43:06 UTC (rev 3242) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-08-27 23:35:14 UTC (rev 3243) @@ -24,11 +24,11 @@ */ package org.archive.wayback.util.url; +import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.httpclient.URIException; -import org.apache.log4j.Logger; import org.archive.net.UURI; import org.archive.net.UURIFactory; @@ -162,7 +162,7 @@ try { return UURIFactory.getInstance(url).getEscapedURI(); } catch (URIException e) { - LOGGER.warn(e.getLocalizedMessage() + ": " + url); + LOGGER.warning(e.getLocalizedMessage() + ": " + url); // can't let a space exist... 
send back close to whatever came // in... return url.replace(" ", "%20"); @@ -175,7 +175,7 @@ absBaseURI = UURIFactory.getInstance(baseUrl); resolvedURI = UURIFactory.getInstance(absBaseURI, url); } catch (URIException e) { - LOGGER.warn(e.getLocalizedMessage() + ": " + url); + LOGGER.warning(e.getLocalizedMessage(... [truncated message content] |
Revision: 3242 http://archive-access.svn.sourceforge.net/archive-access/?rev=3242&view=rev Author: bradtofel Date: 2010-08-26 23:43:06 +0000 (Thu, 26 Aug 2010) Log Message: ----------- BUGFIX(unreported): the underlying database was not being shut down. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java 2010-08-17 01:01:08 UTC (rev 3241) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java 2010-08-26 23:43:06 UTC (rev 3242) @@ -101,13 +101,19 @@ * @see org.archive.wayback.ResourceStore#shutdown() */ public void shutdown() throws IOException { - // NOOP + db.shutdown(); } + /** + * @return the ResourceFileLocationDB used by this ResourceStore + */ public ResourceFileLocationDB getDb() { return db; } + /** + * @param db the ResourceFileLocationDB to use with this ResourceStore + */ public void setDb(ResourceFileLocationDB db) { this.db = db; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 01:01:14
|
Revision: 3241 http://archive-access.svn.sourceforge.net/archive-access/?rev=3241&view=rev Author: bradtofel Date: 2010-08-17 01:01:08 +0000 (Tue, 17 Aug 2010) Log Message: ----------- added exclusion for conflicting heritrix-commons version transitive dependency from access-control Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/pom.xml Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-08-17 00:58:56 UTC (rev 3240) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-08-17 01:01:08 UTC (rev 3241) @@ -68,6 +68,10 @@ <groupId>commons-pool</groupId> <artifactId>commons-pool</artifactId> </exclusion> + <exclusion> + <groupId>org.archive.heritrix</groupId> + <artifactId>commons</artifactId> + </exclusion> </exclusions> </dependency> <dependency> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:59:02
|
Revision: 3240 http://archive-access.svn.sourceforge.net/archive-access/?rev=3240&view=rev Author: bradtofel Date: 2010-08-17 00:58:56 +0000 (Tue, 17 Aug 2010) Log Message: ----------- added dspace support Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/pom.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2010-08-17 00:58:03 UTC (rev 3239) +++ trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2010-08-17 00:58:56 UTC (rev 3240) @@ -60,6 +60,11 @@ <artifactId>standard</artifactId> <version>1.1.2</version> </dependency> + <dependency> + <groupId>org.dspace</groupId> + <artifactId>foresite</artifactId> + <version>SNAPSHOT</version> + </dependency> </dependencies> </project> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3239 http://archive-access.svn.sourceforge.net/archive-access/?rev=3239&view=rev Author: bradtofel Date: 2010-08-17 00:58:03 +0000 (Tue, 17 Aug 2010) Log Message: ----------- FEATURE: BetterRequestException now allows the HTTP response code to be specified Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/BetterRequestException.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/BetterRequestException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/BetterRequestException.java 2010-08-17 00:56:49 UTC (rev 3238) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/BetterRequestException.java 2010-08-17 00:58:03 UTC (rev 3239) @@ -24,6 +24,8 @@ */ package org.archive.wayback.exception; +import javax.servlet.http.HttpServletResponse; + /** * Exception class for queries which can be better expressed as another URL, or * should, for one reason or another, be requested at a different URL. 
Likely @@ -42,12 +44,26 @@ private static final long serialVersionUID = 1L; protected static final String ID = "betterRequest"; private String betterURI; + private int status = HttpServletResponse.SC_FOUND; + /** * Constructor * @param betterURI + * @param status * */ + public BetterRequestException(String betterURI, int status) { + super("Better URI for query"); + this.betterURI = betterURI; + this.status = status; + id = ID; + } + /** + * Constructor + * @param betterURI + * + */ public BetterRequestException(String betterURI) { super("Better URI for query"); this.betterURI = betterURI; @@ -60,4 +76,8 @@ public String getBetterURI() { return betterURI; } + public int getStatus() { + return status; + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:56:55
|
Revision: 3238 http://archive-access.svn.sourceforge.net/archive-access/?rev=3238&view=rev Author: bradtofel Date: 2010-08-17 00:56:49 +0000 (Tue, 17 Aug 2010) Log Message: ----------- TWEAK: RequestFilter is now under ...wayback.utils package Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/web.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/web.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/web.xml 2010-08-17 00:55:09 UTC (rev 3237) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/web.xml 2010-08-17 00:56:49 UTC (rev 3238) @@ -11,10 +11,11 @@ <context-param> <param-name>config-path</param-name> <param-value>WEB-INF/wayback.xml</param-value> +<!-- <param-value>WEB-INF/memento.xml</param-value> --> </context-param> <filter> <filter-name>RequestFilter</filter-name> - <filter-class>org.archive.wayback.webapp.RequestFilter</filter-class> + <filter-class>org.archive.wayback.util.webapp.RequestFilter</filter-class> </filter> <filter-mapping> <filter-name>RequestFilter</filter-name> @@ -51,12 +52,12 @@ <auth-method>BASIC</auth-method> <realm-name>Secured-Wayback</realm-name> </login-config> ---> <error-page> <exception-type>java.lang.Exception</exception-type> <location>/WEB-INF/exception/HTMLError.jsp</location> </error-page> +--> </web-app> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:55:15
|
Revision: 3237 http://archive-access.svn.sourceforge.net/archive-access/?rev=3237&view=rev Author: bradtofel Date: 2010-08-17 00:55:09 +0000 (Tue, 17 Aug 2010) Log Message: ----------- INITIAL-REV: likely temporary top level Spring Config file, until we've confirmed memento functionality. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/memento.xml Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/memento.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/memento.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/memento.xml 2010-08-17 00:55:09 UTC (rev 3237) @@ -0,0 +1,163 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + Macro-like substitutions for the overall file: + wayback.basedir: default top level directory for all index, state, + locationdb storage. +--> + + <bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer"> + <property name="properties"> + <value> + wayback.basedir=/tmp/wayback + </value> + </property> + </bean> + + + <bean id="waybackCanonicalizer" class="org.archive.wayback.util.url.AggressiveUrlCanonicalizer" /> + +<!-- + The ResourceFileLocationDB implementation to use for mapping ARC/WARC names + to absolute paths/URLs via a BDBJE database. 
+--> + + <bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.BDBResourceFileLocationDB"> + <property name="bdbPath" value="${wayback.basedir}/file-db/db/" /> + <property name="bdbName" value="DB1" /> + <property name="logPath" value="${wayback.basedir}/file-db/db.log" /> + </bean> + +<!-- + To enable manual management of, or remote access to the above locationDB, + uncomment the following bean. +--> +<!-- + <bean name="8080:locationdb" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBServlet"> + <property name="locationDB" ref="resourcefilelocationdb" /> + </bean> +--> + +<!-- + The FileProxyServlet uses a ResourceFileLocationDB to make all ARC/WARC + files appear to reside within a single HTTP 1.1 exported directory. + Required when using the SimpleResourceStore to access distributed ARC/WARC + files over HTTP through a single reverse proxy. +--> +<!-- + <bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet"> + <property name="locationDB" ref="resourcefilelocationdb" /> + </bean> +--> + + +<!-- + The XML files indicated in the following import tags contain alternate + example implementations of WaybackCollections. To specify where your + ARC/WARC files are located, see the file BDBCollection.xml. +--> + <import resource="BDBCollection.xml"/> +<!-- + <import resource="NutchCollection.xml"/> + <import resource="CDXCollection.xml"/> + <import resource="RemoteCollection.xml"/> +--> + + +<!-- + This is the only AccessPoint defined by default within this wayback.xml + Spring configuration file, providing an ArchivalURL Replay UI to the + "localbdbcollection", defined in "BDBCollection.xml" by providing + ArchivalURL-specific implementations of the replay, parser, and + uriConverter. + + This AccessPoint currently will provide access only from the machine + running Tomcat. 
To provide external access, replace "localhost.archive.org" + with your fully qualified hostname of the computer running Tomcat. +--> + <import resource="ArchivalUrlReplay.xml"/> + <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> + <property name="staticPrefix" value="http://localhost.archive.org:8080/wayback/" /> + + <property name="collection" ref="localbdbcollection" /> + <property name="replay" ref="archivalurlreplay" /> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" /> + </bean> + </property> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/wayback/"/> + </bean> + </property> + + <property name="parser"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser"> + <property name="maxRecords" value="1000" /> + <property name="earliestTimestamp" value="1996" /> + </bean> + </property> + + </bean> + + <import resource="MementoReplay.xml"/> + <bean name="8080:memento" parent="8080:wayback"> + <property name="staticPrefix" value="http://localhost.archive.org:8080/memento/" /> + <property name="configs"> + <props> + <prop key="aggregationPrefix">http://localhost.archive.org:8080/ore/</prop> + </props> + </property> + + <property name="replay" ref="mementoreplay" /> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/Memento.jsp" /> + </bean> + </property> + + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/memento/"/> + </bean> + </property> + <property name="parser"> + <bean class="org.archive.wayback.memento.MementoRequestParser"> + <property name="maxRecords" value="1000" 
/> + <property name="earliestTimestamp" value="1996" /> + </bean> + </property> + + </bean> + + + <bean name="8080:ore" parent="8080:memento"> + <property name="staticPrefix" value="http://localhost.archive.org:8080/ore/" /> + <property name="configs"> + <props> + <prop key="Prefix">http://localhost.archive.org:8080/memento/</prop> + </props> + </property> + + <property name="replay" ref="archivalurlreplay" /> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/ORE.jsp" /> + </bean> + </property> + + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/ore/"/> + </bean> + </property> + + </bean> + +</beans> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:50:18
|
Revision: 3236 http://archive-access.svn.sourceforge.net/archive-access/?rev=3236&view=rev Author: bradtofel Date: 2010-08-17 00:50:12 +0000 (Tue, 17 Aug 2010) Log Message: ----------- INITIAL REV: Memento-specific replay configuration. Seems likely it will require some CSS and JS tweaking down the line.. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/MementoReplay.xml Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/MementoReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/MementoReplay.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/MementoReplay.xml 2010-08-17 00:50:12 UTC (rev 3236) @@ -0,0 +1,105 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd"> + +<!-- + + This file depends on beans defined into ArchivalUrlReplay.xml. 
+ + --> + + + <bean id="mementoclientsidehtmlreplayrenderer" class="org.archive.wayback.memento.MementoHTMLReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveComment.jsp</value> + <value>/WEB-INF/replay/Disclaimer.jsp</value> + <value>/WEB-INF/replay/MementoValidity.jsp</value> + </list> + </property> + </bean> + + <bean id="mementoreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher"> + <property name="selectors"> + <list> + + <!-- REDIRECT IF NOT EXACT DATE --> + <bean class="org.archive.wayback.replay.selector.DateMismatchSelector"> + <property name="renderer"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlDateRedirectReplayRenderer" /> + </property> + </bean> + + <!-- HTML REPLAY --> + <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>text/html</value> + <value>application/xhtml</value> + </list> + </property> + <property name="renderer" ref="mementoclientsidehtmlreplayrenderer"/> + </bean> + + <!-- CSS REPLAY --> + <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>text/css</value> + </list> + </property> + <property name="renderer" ref="archivalcssreplayrenderer"/> + </bean> + + <!-- ASX-MIME REPLAY --> + <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>video/x-ms-asf</value> + </list> + </property> + <property name="renderer" ref="archivalasxreplayrenderer"/> + </bean> + + <!-- ASX-PATH REPLAY --> + <bean class="org.archive.wayback.replay.selector.PathMatchSelector"> + <property name="pathContains"> + <list> + <value>.asx</value> + </list> + </property> + <property name="renderer" ref="archivalasxreplayrenderer"/> + </bean> + + <!-- DEFAULT-TRANSPARENT REPLAY --> + <bean 
class="org.archive.wayback.replay.selector.AlwaysMatchSelector"> + <property name="renderer" ref="archivaltransparentreplayrenderer"/> + </bean> + + </list> + </property> + </bean> + + +<!-- + This bean is unused. May be useful to continue down the server-side rewrite + path if we run into too many client-side rewrite problems. + --> + <bean id="mementosaxreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlSAXRewriteReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="charsetDetector"> + <bean class="org.archive.wayback.replay.charset.RotatingCharsetDetector"/> + </property> + <property name="delegator"> + <bean id="fastArchivalSAXDelegator" class="org.archive.wayback.archivalurl.FastArchivalUrlReplayParseEventHandler" > + <property name="jspInsertPath" value="/WEB-INF/replay/MementoAggregate.jsp"/> + </bean> + </property> + </bean> + + + +</beans> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:48:38
|
Revision: 3235 http://archive-access.svn.sourceforge.net/archive-access/?rev=3235&view=rev Author: bradtofel Date: 2010-08-17 00:48:32 +0000 (Tue, 17 Aug 2010) Log Message: ----------- CLEANUP: mostly comment changes, hopefully they are now more useful, also moved some old stanzas "below the fold" Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2010-08-17 00:45:52 UTC (rev 3234) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2010-08-17 00:48:32 UTC (rev 3235) @@ -17,8 +17,8 @@ </bean> <!-- - Renderer for both CSS and Javascript, causing a comment containing - archive inforation to be inserted in the returned documents. + Renderer for both CSS and JavaScript, causing a comment containing + archive information to be inserted in the returned documents. --> <bean id="archivalcssreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlCSSReplayRenderer"> <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> @@ -45,47 +45,46 @@ <!-- Renderer which returns documents as-is. Suitable for images, binary formats, - and anything else Wayback doesn't know how to handle yet. + and anything else Wayback doesn't know how to handle yet. This still + rewrites HTTP headers. --> <bean id="archivaltransparentreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer"> <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> </bean> + +<!-- + Renderer which returns documents as-is, including HTTP headers. 
+ --> <bean id="identityreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer"> <constructor-arg> <bean id="identityhttpheaderprocessor" class="org.archive.wayback.replay.IdentityHttpHeaderProcessor"/> </constructor-arg> </bean> -<!-- - The following bean is an example of the experimental Regex-Based - server-side HTML rewriting Renderer - --> - <bean id="archivalserversidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ServerSideHTMLReplayRenderer"> - <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> - <property name="jspInserts"> - <list> - <value>/WEB-INF/replay/ArchiveComment.jsp</value> + <!-- - <value>/WEB-INF/replay/JSLessTimeline.jsp</value> ---> - </list> - </property> - </bean> - -<!-- - ArchivalUrlSaxReplay.xml is a template for defining custom rules for - rewriting HTML content using a SAX parser, allowing fine-tuned and - flexible server-side rewriting. Defines a bean with id - "archivalSAXDelegator", which can be substituted with the - "fastArchivalSAXDelegator" below. + Current production-standard HTML rewriting is performed entirely on the + server, improving rewrite accuracy over the old client-side JavaScript-based + rewriting. This also reduces live-web request leaks, which improves end-user + privacy. + + SAX events can be handled by the "best practices, mean and lean" + FastArchivalUrlReplayParseEventHandler, which covers the current known set + of rewrite instructions required. + + There is also a configurable SAX event handler. ArchivalUrlSaxReplay.xml + is a template for defining custom rules for rewriting HTML content using + a SAX parser, allowing fine-tuned and flexible server-side rewriting. + ArchivalUrlSaxReplay.xml Defines a bean with id "archivalSAXDelegator", + which can be substituted with the "fastArchivalSAXDelegator" below. 
<import resource="ArchivalUrlSaxReplay.xml"/> --> <bean id="fastArchivalSAXDelegator" class="org.archive.wayback.archivalurl.FastArchivalUrlReplayParseEventHandler" > <property name="jspInsertPath" value="/WEB-INF/replay/DisclaimChooser.jsp"/> </bean> - + <!-- - The following bean is an example of the new SAX based rewriting renderer. It + The following bean defines the SAX based rewriting renderer. It also uses a pluggable character encoding detector, which could allow clients to issue special requests to Wayback to alter the detection strategy. --> @@ -97,30 +96,13 @@ <property name="delegator" ref="fastArchivalSAXDelegator"/> </bean> -<!-- - The following bean is an example of the "classic" or most mature ArchivalUrl - Replay system - it uses a combination of server-side regex rewriting and - a client-side javascript insert to rewite links within an HTML page. ---> - <bean id="archivalclientsidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ClientSideHTMLReplayRenderer"> - <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> - <property name="jspInserts"> - <list> - <value>/WEB-INF/replay/Timeline.jsp</value> <!-- - <value>/WEB-INF/replay/ArchiveComment.jsp</value> - <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> - <value>/WEB-INF/replay/Disclaimer.jsp</value> - <value>/WEB-INF/replay/DebugBanner.jsp</value> ---> - </list> - </property> - </bean> - - -<!-- The main Archival URL replay dispatcher. It uses a list of Selectors to determine which ReplayRenderer should be used for each document. + + Each Selector specified is attempted in the order defined here. When a + Selector indicates it can handle the document, it's renderer is used to + return the document to the user. 
--> <bean id="archivalurlreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher"> <property name="selectors"> @@ -171,9 +153,6 @@ </list> </property> <property name="renderer" ref="archivalsaxreplayrenderer"/> -<!-- - <property name="renderer" ref="archivalclientsidehtmlreplayrenderer"/> - --> </bean> @@ -227,4 +206,46 @@ </list> </property> </bean> + +<!-- + BELOW ARE OUT-MODED + --> + + +<!-- + The following bean is an example of the experimental Regex-Based + server-side HTML rewriting Renderer + --> + <bean id="archivalserversidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ServerSideHTMLReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveComment.jsp</value> +<!-- + <value>/WEB-INF/replay/JSLessTimeline.jsp</value> +--> + </list> + </property> + </bean> + +<!-- + The following bean is an example of the "classic" or most mature ArchivalUrl + Replay system - it uses a combination of server-side regex rewriting and + a client-side javascript insert to rewite links within an HTML page. +--> + <bean id="archivalclientsidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ClientSideHTMLReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveComment.jsp</value> + <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> + <value>/WEB-INF/replay/Disclaimer.jsp</value> +<!-- + <value>/WEB-INF/replay/DebugBanner.jsp</value> +--> + </list> + </property> + </bean> + + </beans> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:45:59
|
Revision: 3234 http://archive-access.svn.sourceforge.net/archive-access/?rev=3234&view=rev Author: bradtofel Date: 2010-08-17 00:45:52 +0000 (Tue, 17 Aug 2010) Log Message: ----------- TWEAK: added placeholder for recurse option, upped maxrecords to something absurdly high. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml 2010-08-17 00:43:40 UTC (rev 3233) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml 2010-08-17 00:45:52 UTC (rev 3234) @@ -42,10 +42,12 @@ <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> <property name="name" value="files1" /> <property name="prefix" value="${wayback.basedir}/files1/" /> + <property name="recurse" value="false" /> </bean> <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> <property name="name" value="files2" /> <property name="prefix" value="${wayback.basedir}/files2/" /> + <property name="recurse" value="false" /> </bean> </list> </property> @@ -62,7 +64,7 @@ <property name="bdbPath" value="${wayback.basedir}/index/" /> </bean> </property> - <property name="maxRecords" value="10000" /> + <property name="maxRecords" value="100000000" /> </bean> <!-- This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:43:47
|
Revision: 3233 http://archive-access.svn.sourceforge.net/archive-access/?rev=3233&view=rev Author: bradtofel Date: 2010-08-17 00:43:40 +0000 (Tue, 17 Aug 2010) Log Message: ----------- FEATURE: One step closer to "native" Memento support. Still need a clean-up pass over the .jsp, once functionality is confirmed. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoHTMLReplayRenderer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoHTMLReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoHTMLReplayRenderer.java 2010-08-17 00:07:39 UTC (rev 3232) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoHTMLReplayRenderer.java 2010-08-17 00:43:40 UTC (rev 3233) @@ -49,7 +49,7 @@ */ public class MementoHTMLReplayRenderer extends TextReplayRenderer { /** - * @param httpHeaderProcessor + * @param httpHeaderProcessor to use */ public MementoHTMLReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { super(httpHeaderProcessor); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoRequestParser.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoRequestParser.java 2010-08-17 00:07:39 UTC (rev 3232) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoRequestParser.java 2010-08-17 00:43:40 UTC (rev 3233) @@ -42,6 +42,8 @@ */ public class MementoRequestParser extends ArchivalUrlRequestParser { protected RequestParser[] getRequestParsers() { + // all the usual ArchivalURL RequestParsers, plus the memento-specific + // ones: RequestParser[] theParsers = { new ReplayRequestParser(this), new TimeGateRequestParser(this), Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java 2010-08-17 00:07:39 UTC (rev 3232) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java 2010-08-17 00:43:40 UTC (rev 3233) @@ -3,6 +3,7 @@ import javax.servlet.http.HttpServletRequest; import org.apache.log4j.Logger; +import org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.BetterRequestException; @@ -51,11 +52,18 @@ wbRequest.setCaptureQueryRequest(); wbRequest.setRequestUrl(urlStr); + ArchivalUrlResultURIConverter conv = + (ArchivalUrlResultURIConverter) accessPoint.getUriConverter(); + + String uriPrefix = conv.getReplayURIPrefix(); + String betterUrl = uriPrefix + "timemap/rdf/" + urlStr; + + throw new BetterRequestException(betterUrl, 303); // TODO: is it critical to return a 303 code, or will a 302 do? // if so, this and ORE.jsp can be simplified by throwing a // BetterRequestException here. 
- wbRequest.put("redirect", "true"); - return wbRequest; +// wbRequest.put("redirect", "true"); +// return wbRequest; } if (requestPath.startsWith("timemap")) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2010-08-17 00:07:39 UTC (rev 3232) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2010-08-17 00:43:40 UTC (rev 3233) @@ -234,7 +234,10 @@ } } catch(BetterRequestException e) { - httpResponse.sendRedirect(e.getBetterURI()); + + httpResponse.setStatus(e.getStatus()); + httpResponse.setHeader("Location", e.getBetterURI()); +// httpResponse.sendRedirect(e.getBetterURI()); handled = true; } catch(WaybackException e) { Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-08-17 00:07:39 UTC (rev 3232) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-08-17 00:43:40 UTC (rev 3233) @@ -44,277 +44,263 @@ //String remuri = p_url +"timemap/" + u; //System.out.println(agguri); //System.out.println(remuri); - String redirection = null; - if (wbRequest.containsKey("redirect")) { - redirection = wbRequest.get("redirect"); - } - if (redirection != null) { - //default poka - //skip content negotiation + String format = wbRequest.get("format"); + // System.out.println("here"); + Aggregation agg = OREFactory.createAggregation(new URI(agguri)); + //System.out.println("here"); + ResourceMap rem = agg.createResourceMap(new URI(uriPrefix + + "timemap/" + format + "/" + u)); - // TODO: see comment in 
TimeBundleParser - could be handled elsewhere - response.setStatus(303); - response.setHeader("Location", uriPrefix + "timemap/rdf/" + u); - //response.sendRedirect(p_url +"timemap/rdf/" + u); - } else { - String format = wbRequest.get("format"); - // System.out.println("here"); - Aggregation agg = OREFactory.createAggregation(new URI(agguri)); - //System.out.println("here"); - ResourceMap rem = agg.createResourceMap(new URI(uriPrefix - + "timemap/" + format + "/" + u)); + //SimpleDateFormat formatter_utc = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); - //SimpleDateFormat formatter_utc = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); + Date now = new Date(); - Date now = new Date(); + rem.setCreated(now); + Predicate pr_type = new Predicate(); + pr_type.setURI(new URI( + "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")); - rem.setCreated(now); - Predicate pr_type = new Predicate(); - pr_type.setURI(new URI( - "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")); + rem.setModified(now); + rem.createTriple(pr_type, new URI( + "http://www.mementoweb.org/terms/tb/TimeMap")); + //rem.addType(new URI("http://www.mementoweb.org/terms/tb/TimeMap")); + Agent creator = OREFactory.createAgent(); + creator.addName("Foresite Toolkit (Java)"); + //creator.addMbox(new URI("for...@go...")); - rem.setModified(now); - rem.createTriple(pr_type, new URI( - "http://www.mementoweb.org/terms/tb/TimeMap")); - //rem.addType(new URI("http://www.mementoweb.org/terms/tb/TimeMap")); - Agent creator = OREFactory.createAgent(); - creator.addName("Foresite Toolkit (Java)"); - //creator.addMbox(new URI("for...@go...")); + //rem.addAgent(new URI("http://foresite-toolkit.googlecode.com/#javaAgent"),creator); + rem.addCreator(creator); + agg.addTitle("Memento Time Bundle for " + u); - //rem.addAgent(new URI("http://foresite-toolkit.googlecode.com/#javaAgent"),creator); - rem.addCreator(creator); - agg.addTitle("Memento Time Bundle for " + u); + //CaptureSearchResults cResults = 
results.getCaptureResults(); + //CaptureSearchResult res = cResults.getClosest(wbRequest,true); + Iterator<CaptureSearchResult> itr = cResults.iterator(); + SimpleDateFormat formatterk = new SimpleDateFormat( + "yyyyMMddHHmmss"); - //CaptureSearchResults cResults = results.getCaptureResults(); - //CaptureSearchResult res = cResults.getClosest(wbRequest,true); - Iterator<CaptureSearchResult> itr = cResults.iterator(); - SimpleDateFormat formatterk = new SimpleDateFormat( - "yyyyMMddHHmmss"); + Date f = cResults.getFirstResultDate(); + Date l = cResults.getLastResultDate(); - Date f = cResults.getFirstResultDate(); - Date l = cResults.getLastResultDate(); + String ArchiveInterval = formatterk.format(f) + " - " + + formatterk.format(l); - String ArchiveInterval = formatterk.format(f) + " - " - + formatterk.format(l); + //agg.createTriple(new URI("http://www.mementoweb.org/ns/archiveInterval"),ArchiveInterval); + agg.addType(new URI( + "http://www.mementoweb.org/terms/tb/TimeBundle")); + //String mementourl =p_url.replace("ore","memento"); + //include original into aggregation - //agg.createTriple(new URI("http://www.mementoweb.org/ns/archiveInterval"),ArchiveInterval); - agg.addType(new URI( - "http://www.mementoweb.org/terms/tb/TimeBundle")); - //String mementourl =p_url.replace("ore","memento"); - //include original into aggregation + AggregatedResource ar_o = agg.createAggregatedResource(new URI( + u)); + ar_o.createTriple(pr_type, new URI( + "http://www.mementoweb.org/terms/tb/OriginalResource")); + //include timegate into aggregation + AggregatedResource ar_tg = agg + .createAggregatedResource(new URI(results + .getContextConfig("Prefix") + + "timegate/" + u)); + Predicate pr_format = new Predicate(); + pr_format.setURI(new URI( + "http://purl.org/dc/elements/1.1/format")); + ar_tg.createTriple(pr_format, new URI(u)); + ar_tg.createTriple(pr_type, new URI( + "http://www.mementoweb.org/terms/tb/TimeGate")); - AggregatedResource ar_o = 
agg.createAggregatedResource(new URI( - u)); - ar_o.createTriple(pr_type, new URI( - "http://www.mementoweb.org/terms/tb/OriginalResource")); - //include timegate into aggregation - AggregatedResource ar_tg = agg - .createAggregatedResource(new URI(results - .getContextConfig("Prefix") - + "timegate/" + u)); - Predicate pr_format = new Predicate(); - pr_format.setURI(new URI( - "http://purl.org/dc/elements/1.1/format")); - ar_tg.createTriple(pr_format, new URI(u)); - ar_tg.createTriple(pr_type, new URI( - "http://www.mementoweb.org/terms/tb/TimeGate")); + String previos_digest = null; + List<String> previos_blancs = new ArrayList<String>(); - String previos_digest = null; - List<String> previos_blancs = new ArrayList<String>(); + Predicate pr = new Predicate(); + pr.setURI(new URI("http://www.mementoweb.org/terms/tb/start")); + Predicate pre = new Predicate(); + pre.setURI(new URI("http://www.mementoweb.org/terms/tb/end")); + Calendar cal = Calendar.getInstance(); + AggregatedResource ar = null; - Predicate pr = new Predicate(); - pr.setURI(new URI("http://www.mementoweb.org/terms/tb/start")); - Predicate pre = new Predicate(); - pre.setURI(new URI("http://www.mementoweb.org/terms/tb/end")); - Calendar cal = Calendar.getInstance(); - AggregatedResource ar = null; + Date enddate = null; - Date enddate = null; + // String buffer for special link serialization format + StringBuffer linkbf = new StringBuffer(); - // String buffer for special link serialization format - StringBuffer linkbf = new StringBuffer(); + linkbf.append("<" + u + ">;rel=\"original\"\n"); + linkbf.append(",<" + agguri + ">;rel=\"timebundle\"\n"); + String firstmemento = null; - linkbf.append("<" + u + ">;rel=\"original\"\n"); - linkbf.append(",<" + agguri + ">;rel=\"timebundle\"\n"); - String firstmemento = null; + while (itr.hasNext()) { + CaptureSearchResult cur = itr.next(); + //I am not deduping urls (by digest) for the rdf serialization running out of time, extra efforts for me now ;) - while 
(itr.hasNext()) { - CaptureSearchResult cur = itr.next(); - //I am not deduping urls (by digest) for the rdf serialization running out of time, extra efforts for me now ;) + String resurl = results.getContextConfig("Prefix") + + formatterk.format(cur.getCaptureDate()) + "/" + u; - String resurl = results.getContextConfig("Prefix") - + formatterk.format(cur.getCaptureDate()) + "/" + u; + String digest = cur.getDigest(); + if (previos_digest == null) { + previos_digest = digest; + } - String digest = cur.getDigest(); - if (previos_digest == null) { - previos_digest = digest; - } + ar = agg.createAggregatedResource(new URI(resurl)); + ar.createTriple(pr_format, cur.getMimeType()); - ar = agg.createAggregatedResource(new URI(resurl)); - ar.createTriple(pr_format, cur.getMimeType()); + Predicate pr_1 = new Predicate(); + pr_1.setURI(new URI( + "http://www.mementoweb.org/terms/tb/mementoFor")); + ar.createTriple(pr_1, new URI(u)); + ar.createTriple(pr_type, new URI( + "http://www.mementoweb.org/terms/tb/Memento")); - Predicate pr_1 = new Predicate(); - pr_1.setURI(new URI( - "http://www.mementoweb.org/terms/tb/mementoFor")); - ar.createTriple(pr_1, new URI(u)); - ar.createTriple(pr_type, new URI( - "http://www.mementoweb.org/terms/tb/Memento")); + Date startdate = cur.getDuplicateDigestStoredDate(); + //System.out.println("start:"+startdate); + enddate = cur.getCaptureDate(); + //System.out.println("end:"+enddate); - Date startdate = cur.getDuplicateDigestStoredDate(); - //System.out.println("start:"+startdate); - enddate = cur.getCaptureDate(); - //System.out.println("end:"+enddate); + // serialiase it in links format only for unique digest - // serialiase it in links format only for unique digest - - if (startdate == null) { - if (firstmemento == null) { - linkbf.append(",<" + resurl - + ">;rel=\"first-memento\";datetime=\"" - + httpformatterl.format(enddate) + "\"\n"); - firstmemento = "firstmemento"; - } else { - linkbf.append(",<" + resurl - + 
">;rel=\"memento\";datetime=\"" - + httpformatterl.format(enddate) + "\"\n"); - } + if (startdate == null) { + if (firstmemento == null) { + linkbf.append(",<" + resurl + + ">;rel=\"first-memento\";datetime=\"" + + httpformatterl.format(enddate) + "\"\n"); + firstmemento = "firstmemento"; + } else { + linkbf.append(",<" + resurl + + ">;rel=\"memento\";datetime=\"" + + httpformatterl.format(enddate) + "\"\n"); } + } - // Adding blanc node - Triple triple = new TripleJena(); - triple.initialise(new URI(resurl)); - Predicate pred = new Predicate(); - UUID a = UUID.randomUUID(); - String blanc = "urn:uuid:" + a.toString(); + // Adding blanc node + Triple triple = new TripleJena(); + triple.initialise(new URI(resurl)); + Predicate pred = new Predicate(); + UUID a = UUID.randomUUID(); + String blanc = "urn:uuid:" + a.toString(); - //System.out.println(blanc); - pred.setURI(new URI( - "http://www.mementoweb.org/terms/tb/validOver")); - triple.relate(pred, new URI(blanc)); - Triple tr = new TripleJena(); - tr.initialise(new URI(blanc)); + //System.out.println(blanc); + pred.setURI(new URI( + "http://www.mementoweb.org/terms/tb/validOver")); + triple.relate(pred, new URI(blanc)); + Triple tr = new TripleJena(); + tr.initialise(new URI(blanc)); - tr.relate(pr_type, new URI( - "http://www.mementoweb.org/terms/tb/Period")); + tr.relate(pr_type, new URI( + "http://www.mementoweb.org/terms/tb/Period")); - //period difined by [ [ interval [ date first digest recorded and date of next digest recorded [ + //period difined by [ [ interval [ date first digest recorded and date of next digest recorded [ - String start = null; - Triple trd = new TripleJena(); - trd.initialise(new URI(blanc)); - //Calendar cal = Calendar.getInstance(); + String start = null; + Triple trd = new TripleJena(); + trd.initialise(new URI(blanc)); + //Calendar cal = Calendar.getInstance(); - if (startdate != null) { + if (startdate != null) { - cal.setTime(startdate); - trd.relate(pr, cal); - start = 
httpformatterl.format(startdate); - } else { - cal.setTime(enddate); - trd.relate(pr, cal); - start = httpformatterl.format(enddate); - } + cal.setTime(startdate); + trd.relate(pr, cal); + start = httpformatterl.format(startdate); + } else { + cal.setTime(enddate); + trd.relate(pr, cal); + start = httpformatterl.format(enddate); + } - //System.out.println("type" +trd.getLiteralType()); + //System.out.println("type" +trd.getLiteralType()); - ar.addTriple(triple); - ar.addTriple(tr); - ar.addTriple(trd); + ar.addTriple(triple); + ar.addTriple(tr); + ar.addTriple(trd); - if (!digest.equals("previos_digest")) { + if (!digest.equals("previos_digest")) { - Iterator<String> it = previos_blancs.iterator(); - while (it.hasNext()) { - String blanc_ = (String) it.next(); - Triple tre = new TripleJena(); - tre.initialise(new URI(blanc_)); + Iterator<String> it = previos_blancs.iterator(); + while (it.hasNext()) { + String blanc_ = (String) it.next(); + Triple tre = new TripleJena(); + tre.initialise(new URI(blanc_)); - //Calendar cal = Calendar.getInstance(); - cal.setTime(enddate); - tre.relate(pre, cal); - ar.addTriple(tre); - } - - previos_blancs.clear(); - previos_digest = digest; + //Calendar cal = Calendar.getInstance(); + cal.setTime(enddate); + tre.relate(pre, cal); + ar.addTriple(tre); } - previos_blancs.add(blanc); - + previos_blancs.clear(); + previos_digest = digest; } - Iterator it = previos_blancs.iterator(); - while (it.hasNext()) { - String blanc_ = (String) it.next(); - Triple tre = new TripleJena(); - tre.initialise(new URI(blanc_)); + previos_blancs.add(blanc); - cal.setTime(now); //or date of archive stop archiving - tre.relate(pre, cal); + } - ar.addTriple(tre); - } + Iterator it = previos_blancs.iterator(); + while (it.hasNext()) { + String blanc_ = (String) it.next(); + Triple tre = new TripleJena(); + tre.initialise(new URI(blanc_)); - // additional logic for link format - int m_index = linkbf.lastIndexOf("\"memento\""); - //System.out.println(m_index); 
- linkbf.insert(m_index + 1, "last-"); - //System.out.println("here"); + cal.setTime(now); //or date of archive stop archiving + tre.relate(pre, cal); - //String format = wbRequest.get("format"); - ORESerialiser serial = null; - if (format.equals("rdf")) { - serial = ORESerialiserFactory.getInstance("RDF/XML"); - response.setContentType("application/rdf+xml"); - } - //else if (format.equals("atom")) { - // serial = ORESerialiserFactory.getInstance("ATOM-1.0"); - //} - //else if (format.equals ("html")) { - // serial = ORESerialiserFactory.getInstance("RDFa"); - //} - //removed n3 because serialization of the date to the String type - //else if (format.equals("n3")) { - //serial = ORESerialiserFactory.getInstance("N3"); + ar.addTriple(tre); + } - //response.setContentType("text/n3"); - //} - else if (format.equals("link")) { - PrintWriter pw = response.getWriter(); - //System.out.println(linkbf.toString()); - - // TODO: are we sure this is right? We want to flush *before* - // setting content-type? - pw.print(linkbf.toString()); - pw.flush(); - response.setContentType("text/csv"); - } else { - // response.setStatus(404); - // TODO: this should be handled in TimeBundleParser to allow - // usual Exception rendering to happen. - response.sendError(404, "Unknown TimeMap serialization"); - } - if (serial != null) { - ResourceMapDocument doc = serial.serialise(rem); - // TODO: this could get really big. Any way to stream the data out - // so we don't need another copy beyond the ResourceMap, - // and other helper objects? - String serialisation = doc.toString(); - if (format.equals("rdf")) { - //bug in jena? 
did not serialise date to date type but to string type // stupid fix will need investigate it - serialisation = serialisation - .replaceAll( - "end rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string", - "end rdf:datatype=\"http://www.w3.org/2001/XMLSchema#dateTime"); - serialisation = serialisation - .replaceAll( - "start rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string", - "start rdf:datatype=\"http://www.w3.org/2001/XMLSchema#dateTime"); - } - PrintWriter pw = response.getWriter(); - pw.print(serialisation); - pw.flush(); - } + // additional logic for link format + int m_index = linkbf.lastIndexOf("\"memento\""); + //System.out.println(m_index); + linkbf.insert(m_index + 1, "last-"); + //System.out.println("here"); + //String format = wbRequest.get("format"); + ORESerialiser serial = null; + if (format.equals("rdf")) { + serial = ORESerialiserFactory.getInstance("RDF/XML"); + response.setContentType("application/rdf+xml"); } + //else if (format.equals("atom")) { + // serial = ORESerialiserFactory.getInstance("ATOM-1.0"); + //} + //else if (format.equals ("html")) { + // serial = ORESerialiserFactory.getInstance("RDFa"); + //} + //removed n3 because serialization of the date to the String type + //else if (format.equals("n3")) { + //serial = ORESerialiserFactory.getInstance("N3"); + + //response.setContentType("text/n3"); + //} + else if (format.equals("link")) { + PrintWriter pw = response.getWriter(); + //System.out.println(linkbf.toString()); + + // TODO: are we sure this is right? We want to flush *before* + // setting content-type? + pw.print(linkbf.toString()); + pw.flush(); + response.setContentType("text/csv"); + } else { + // response.setStatus(404); + // TODO: this should be handled in TimeBundleParser to allow + // usual Exception rendering to happen. + response.sendError(404, "Unknown TimeMap serialization"); + } + if (serial != null) { + ResourceMapDocument doc = serial.serialise(rem); + // TODO: this could get really big. 
Any way to stream the data out + // so we don't need another copy beyond the ResourceMap, + // and other helper objects? + String serialisation = doc.toString(); + if (format.equals("rdf")) { + //bug in jena? did not serialise date to date type but to string type // stupid fix will need investigate it + serialisation = serialisation + .replaceAll( + "end rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string", + "end rdf:datatype=\"http://www.w3.org/2001/XMLSchema#dateTime"); + serialisation = serialisation + .replaceAll( + "start rdf:datatype=\"http://www.w3.org/2001/XMLSchema#string", + "start rdf:datatype=\"http://www.w3.org/2001/XMLSchema#dateTime"); + } + PrintWriter pw = response.getWriter(); + pw.print(serialisation); + pw.flush(); + } + %> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3232 http://archive-access.svn.sourceforge.net/archive-access/?rev=3232&view=rev Author: bradtofel Date: 2010-08-17 00:07:39 +0000 (Tue, 17 Aug 2010) Log Message: ----------- FEATURE: new test verifies correct replacement of partial date. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/ArchivalUrlTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/ArchivalUrlTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/ArchivalUrlTest.java 2010-08-17 00:06:02 UTC (rev 3231) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/ArchivalUrlTest.java 2010-08-17 00:07:39 UTC (rev 3232) @@ -84,9 +84,29 @@ return new ArchivalUrl(parse(path)); } - public void trt(String want, String src) throws BetterRequestException, BadQueryException { - assertEquals(want,parseAU(src).toString()); + private void trt(String want, String src) { + try { + assertEquals(want,parseAU(src).toString()); + } catch (BetterRequestException e) { + e.printStackTrace(); + fail(e.getMessage()); + } catch (BadQueryException e) { + e.printStackTrace(); + fail(e.getMessage()); + } } + + private void trtBetterExcept(String want, String src) { + try { + String foo = parseAU(src).toString(); + fail("should have thrown BetterRequestException"); + } catch (BetterRequestException e) { + assertEquals(want,e.getBetterURI()); + } catch (BadQueryException e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } /** * Test method for {@link org.archive.wayback.archivalurl.ArchivalUrl#toString()}. 
@@ -115,6 +135,10 @@ trt( "20010101000000im_/http://www.yahoo.com/", "20010101000000im_/www.yahoo.com:80/"); + + trt( + "20010101235959im_/http://www.yahoo.com/", + "20010101im_/www.yahoo.com:80/"); } /** This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:06:09
|
Revision: 3231 http://archive-access.svn.sourceforge.net/archive-access/?rev=3231&view=rev Author: bradtofel Date: 2010-08-17 00:06:02 +0000 (Tue, 17 Aug 2010) Log Message: ----------- BUGFIX: (ARI-2509) - now rewriting absolute URLs in javascript strings, with escaped '/'s. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java 2010-08-16 23:00:36 UTC (rev 3230) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java 2010-08-17 00:06:02 UTC (rev 3231) @@ -38,7 +38,7 @@ */ public class JSStringTransformer implements StringTransformer { private final static Pattern httpPattern = Pattern - .compile("(http://[A-Za-z0-9:_@.-]+)"); + .compile("(http:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"); public String transform(ReplayParseContext context, String input) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java 2010-08-16 23:00:36 UTC (rev 3230) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java 2010-08-17 00:06:02 UTC (rev
3231) @@ -57,8 +57,15 @@ jst.transform(rc, input); assertEquals(1,rc.got.size()); assertEquals("http://www.gavelgrab.org",rc.got.get(0)); + + input = "onloadRegister(function (){window.location.href=\"http:\\/\\/www.facebook.com\\/barrettforwisconsin?v=info\";});"; + rc = new RecordingReplayParseContext(null, new URL("http://foo.com/"), null); + jst.transform(rc, input); + assertEquals(1,rc.got.size()); + assertEquals("http:\\/\\/www.facebook.com",rc.got.get(0)); } + public class RecordingReplayParseContext extends ReplayParseContext { ArrayList<String> got = null; /** This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |