You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
Revision: 3453 http://archive-access.svn.sourceforge.net/archive-access/?rev=3453&view=rev Author: bradtofel Date: 2011-05-25 01:40:30 +0000 (Wed, 25 May 2011) Log Message: ----------- OPTIMIZ: now uses UrlOperations.getUrlPath() instead of constructing a URL object when determining if URLs are /robots.txt Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2011-05-25 01:37:48 UTC (rev 3452) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2011-05-25 01:40:30 UTC (rev 3453) @@ -39,6 +39,7 @@ import org.archive.wayback.liveweb.LiveWebCache; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; /** * CaptureSearchResult Filter that uses a LiveWebCache to retrieve robots.txt @@ -230,6 +231,17 @@ } notifiedSeen = true; } + String resultURL = r.getOriginalUrl(); + String path = UrlOperations.getURLPath(resultURL); + if(path.equals(ROBOT_SUFFIX)) { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } int filterResult = ObjectFilter.FILTER_EXCLUDE; RobotRules rules = getRules(r); if(rules == null) { @@ -237,26 +249,17 @@ return ObjectFilter.FILTER_ABORT; } } else { - String resultURL = r.getOriginalUrl(); - URL url; - try { - url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL)); - String path = url.getPath(); - if(path.equals(ROBOT_SUFFIX) || - !rules.blocksPathForUA(path, userAgent)) { - if(!notifiedPassed) { - if(filterGroup != null) { - filterGroup.setPassedRobots(); - } - notifiedPassed = true; + if(!rules.blocksPathForUA(path, userAgent)) { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedRobots(); } - filterResult = ObjectFilter.FILTER_INCLUDE; - LOGGER.fine("ROBOT: ALLOWED("+resultURL+")"); - } else { - LOGGER.info("ROBOT: BLOCKED("+resultURL+")"); + notifiedPassed = true; } - } catch (MalformedURLException e) { - e.printStackTrace(); + filterResult = ObjectFilter.FILTER_INCLUDE; + LOGGER.fine("ROBOT: ALLOWED("+resultURL+")"); + } else { + LOGGER.info("ROBOT: BLOCKED("+resultURL+")"); } } return filterResult; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:37:54
|
Revision: 3452 http://archive-access.svn.sourceforge.net/archive-access/?rev=3452&view=rev Author: bradtofel Date: 2011-05-25 01:37:48 +0000 (Wed, 25 May 2011) Log Message: ----------- REFACTOR: moved flag assignment and parsing code into ArchivalUrl Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrl.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlDateRedirectReplayRenderer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrl.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrl.java 2011-05-25 01:37:10 UTC (rev 3451) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrl.java 2011-05-25 01:37:48 UTC (rev 3452) @@ -61,37 +61,106 @@ public String toReplayString(String url) { return toString(wbRequest.getReplayTimestamp(),url); } + public String getDateSpec() { + return getDateSpec(wbRequest.getReplayTimestamp()); + } - public String toString(String datespec, String url) { + /** + * Given a date, create a new datespec + flags + * which represent the same options as requested by the WaybackRequest + * @param timestamp the 14-digit timestamp to use + * @return a String representing the flags on the WaybackRequest for the + * specified date + */ + public String getDateSpec(String datespec) { int dateLen = 0; if(datespec != null) { dateLen = datespec.length(); } StringBuilder sb = - new StringBuilder(url.length() + dateLen +10); + new StringBuilder(dateLen +10); if(dateLen > 0) { sb.append(datespec); } + if(wbRequest.isCSSContext()) { sb.append(ArchivalUrlRequestParser.CSS_CONTEXT); sb.append(ArchivalUrlRequestParser.FLAG_DELIM); + dateLen++; } if(wbRequest.isJSContext()) { sb.append(ArchivalUrlRequestParser.JS_CONTEXT); sb.append(ArchivalUrlRequestParser.FLAG_DELIM); + dateLen++; } if(wbRequest.isIMGContext()) { sb.append(ArchivalUrlRequestParser.IMG_CONTEXT); sb.append(ArchivalUrlRequestParser.FLAG_DELIM); + dateLen++; } if(wbRequest.isIdentityContext()) { sb.append(ArchivalUrlRequestParser.IDENTITY_CONTEXT); sb.append(ArchivalUrlRequestParser.FLAG_DELIM); + dateLen++; } - if(dateLen > 0) { + if(wbRequest.isIFrameWrapperContext()) { + sb.append(ArchivalUrlRequestParser.IFRAME_WRAPPED_CONTEXT); + sb.append(ArchivalUrlRequestParser.FLAG_DELIM); + dateLen++; + } + if(wbRequest.isFrameWrapperContext()) { + sb.append(ArchivalUrlRequestParser.FRAME_WRAPPED_CONTEXT); + sb.append(ArchivalUrlRequestParser.FLAG_DELIM); + dateLen++; + } + return sb.toString(); + } + + public String toString(String datespec, String url) { + int dateLen = 0; + if(datespec != null) { + dateLen = datespec.length(); + } + StringBuilder sb = + new StringBuilder(url.length() + dateLen +10); + String dateSpec = getDateSpec(datespec); + sb.append(dateSpec); + if(dateSpec.length() > 0) { sb.append("/"); } sb.append(UrlOperations.stripDefaultPortFromUrl(url)); return sb.toString(); } + + /** + * @param wbRequest + * @param flagsStr : "js_", "", "cs_", "cs_js_" + */ + public static void assignFlags(WaybackRequest wbRequest, String flagsStr) { + if(flagsStr != null) { + String[] flags = flagsStr.split( + ArchivalUrlRequestParser.FLAG_DELIM); + for(String flag: flags) { + if(flag.equals(ArchivalUrlRequestParser.CSS_CONTEXT)) { + wbRequest.setCSSContext(true); + } else if(flag.equals(ArchivalUrlRequestParser.JS_CONTEXT)) { + wbRequest.setJSContext(true); + } else if(flag.equals(ArchivalUrlRequestParser.IMG_CONTEXT)) { + wbRequest.setIMGContext(true); + } else if(flag.equals(ArchivalUrlRequestParser.IDENTITY_CONTEXT)) { + wbRequest.setIdentityContext(true); + } else if(flag.equals(ArchivalUrlRequestParser.FRAME_WRAPPED_CONTEXT)) { + wbRequest.setFrameWrapperContext(true); + } else if(flag.equals(ArchivalUrlRequestParser.IFRAME_WRAPPED_CONTEXT)) { + wbRequest.setIFrameWrapperContext(true); + } else if(flag.startsWith(ArchivalUrlRequestParser.CHARSET_MODE)) { + String modeString = flag.substring( + ArchivalUrlRequestParser.CHARSET_MODE.length()); + int mode = Integer.parseInt(modeString); + wbRequest.setCharsetMode(mode); + } + } + } + } + } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlDateRedirectReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlDateRedirectReplayRenderer.java 2011-05-25 01:37:10 UTC (rev 3451) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlDateRedirectReplayRenderer.java 2011-05-25 01:37:48 UTC (rev 3452) @@ -52,41 +52,9 @@ // redirect to the better version: String url = result.getOriginalUrl(); - String captureDate = makeFlagDateSpec( - result.getCaptureTimestamp(),wbRequest); - - String betterURI = uriConverter.makeReplayURI(captureDate,url); + ArchivalUrl aUrl = new ArchivalUrl(wbRequest); + String dateSpec = aUrl.getDateSpec(result.getCaptureTimestamp()); + String betterURI = uriConverter.makeReplayURI(dateSpec,url); httpResponse.sendRedirect(betterURI); } - - /** - * Given a date, and a WaybackRequest object, create a new datespec + flags - * which represent the same options as requested by the WaybackRequest - * @param timestamp the 14-digit timestamp to use - * @param request the WaybackRequest from which o get extra request option - * flags - * @return a String representing the flags on the WaybackRequest for the - * specified date - */ - public static String makeFlagDateSpec(String timestamp, WaybackRequest request) { - StringBuilder sb = new StringBuilder(); - sb.append(timestamp); - if(request.isCSSContext()) { - sb.append(ArchivalUrlRequestParser.CSS_CONTEXT); - sb.append(ArchivalUrlRequestParser.FLAG_DELIM); - } - if(request.isJSContext()) { - sb.append(ArchivalUrlRequestParser.JS_CONTEXT); - sb.append(ArchivalUrlRequestParser.FLAG_DELIM); - } - if(request.isIMGContext()) { - sb.append(ArchivalUrlRequestParser.IMG_CONTEXT); - sb.append(ArchivalUrlRequestParser.FLAG_DELIM); - } - if(request.isIdentityContext()) { - sb.append(ArchivalUrlRequestParser.IDENTITY_CONTEXT); - sb.append(ArchivalUrlRequestParser.FLAG_DELIM); - } - return sb.toString(); - } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlRequestParser.java 2011-05-25 01:37:10 UTC (rev 3451) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlRequestParser.java 2011-05-25 01:37:48 UTC (rev 3452) @@ -60,6 +60,14 @@ */ public final static String IDENTITY_CONTEXT = "id"; /** + * frame-wrapper context + */ + public final static String FRAME_WRAPPED_CONTEXT = "fw"; + /** + * iframe-wrapped context + */ + public final static String IFRAME_WRAPPED_CONTEXT = "if"; + /** * Charset detection strategy context - should be followed by an integer * indicating which strategy to use */ Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2011-05-25 01:37:10 UTC (rev 3451) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2011-05-25 01:37:48 UTC (rev 3452) @@ -22,6 +22,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.archive.wayback.archivalurl.ArchivalUrl; import org.archive.wayback.archivalurl.ArchivalUrlRequestParser; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BetterRequestException; @@ -62,7 +63,7 @@ String dateStr = matcher.group(1); urlStr = matcher.group(4); String flags = matcher.group(2); - assignFlags(wbRequest,flags); + ArchivalUrl.assignFlags(wbRequest,flags); // The logic of the classic WM wrt timestamp bounding: // if 14-digits are specified, assume min-max range boundaries @@ -134,30 +135,5 @@ return wbRequest; } - /** - * @param wbRequest - * @param flagsStr : "js_", "", "cs_", "cs_js_" - */ - private void assignFlags(WaybackRequest wbRequest, String flagsStr) { - if(flagsStr != null) { - String[] flags = flagsStr.split( - ArchivalUrlRequestParser.FLAG_DELIM); - for(String flag: flags) { - if(flag.equals(ArchivalUrlRequestParser.CSS_CONTEXT)) { - wbRequest.setCSSContext(true); - } else if(flag.equals(ArchivalUrlRequestParser.JS_CONTEXT)) { - wbRequest.setJSContext(true); - } else if(flag.equals(ArchivalUrlRequestParser.IMG_CONTEXT)) { - wbRequest.setIMGContext(true); - } else if(flag.equals(ArchivalUrlRequestParser.IDENTITY_CONTEXT)) { - wbRequest.setIdentityContext(true); - } else if(flag.startsWith(ArchivalUrlRequestParser.CHARSET_MODE)) { - String modeString = flag.substring( - ArchivalUrlRequestParser.CHARSET_MODE.length()); - int mode = Integer.parseInt(modeString); - wbRequest.setCharsetMode(mode); - } - } - } - } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3451 http://archive-access.svn.sourceforge.net/archive-access/?rev=3451&view=rev Author: bradtofel Date: 2011-05-25 01:37:10 +0000 (Wed, 25 May 2011) Log Message: ----------- FEATURE: added FrameWrapped state tracking code Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2011-05-25 01:36:49 UTC (rev 3450) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2011-05-25 01:37:10 UTC (rev 3451) @@ -19,10 +19,12 @@ */ package org.archive.wayback.archivalurl; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.nio.charset.Charset; import java.util.Map; import javax.servlet.ServletException; @@ -39,15 +41,19 @@ import org.archive.wayback.replay.HttpHeaderOperation; import org.archive.wayback.replay.HttpHeaderProcessor; import org.archive.wayback.replay.JSPExecutor; +import org.archive.wayback.replay.TagMagix; import org.archive.wayback.replay.TextReplayRenderer; import org.archive.wayback.replay.charset.CharsetDetector; import org.archive.wayback.replay.charset.StandardCharsetDetector; import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.htmllex.ContextAwareLexer; import org.archive.wayback.util.htmllex.ParseEventHandler; import org.htmlparser.Node; +import org.htmlparser.lexer.InputStreamSource; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; +import org.htmlparser.lexer.Source; import org.htmlparser.util.ParserException; /** @@ -62,7 +68,16 @@ private HttpHeaderProcessor httpHeaderProcessor; private CharsetDetector charsetDetector = new StandardCharsetDetector(); private final static String OUTPUT_CHARSET = "utf-8"; + private static int FRAMESET_SCAN_BUFFER_SIZE = 16 * 1024; + private static ReplayRenderer frameWrappingRenderer = null; + public static ReplayRenderer getFrameWrappingRenderer() { + return frameWrappingRenderer; + } + public static void setFrameWrappingRenderer(ReplayRenderer frameWrappingRenderer) { + ArchivalUrlSAXRewriteReplayRenderer.frameWrappingRenderer = frameWrappingRenderer; + } + /** * @param httpHeaderProcessor which should process HTTP headers */ @@ -77,6 +92,51 @@ ResultURIConverter uriConverter, CaptureSearchResults results) throws ServletException, IOException, WaybackException { + // The URL of the page, for resolving in-page relative URLs: + URL url = null; + try { + url = new URL(result.getOriginalUrl()); + } catch (MalformedURLException e1) { + // TODO: this shouldn't happen... + e1.printStackTrace(); + throw new IOException(e1.getMessage()); + } + // determine the character set used to encode the document bytes: + String charSet = charsetDetector.getCharset(resource, wbRequest); + + ArchivalUrlContextResultURIConverterFactory fact = + new ArchivalUrlContextResultURIConverterFactory( + (ArchivalUrlResultURIConverter) uriConverter); + // set up the context: + ReplayParseContext context = + new ReplayParseContext(fact,url,result.getCaptureTimestamp()); + + if(!wbRequest.isFrameWrapperContext()) { + // in case this is an HTML page with FRAMEs, peek ahead an look: + // TODO: make ThreadLocal: + byte buffer[] = new byte[FRAMESET_SCAN_BUFFER_SIZE]; + + resource.mark(FRAMESET_SCAN_BUFFER_SIZE); + int amtRead = resource.read(buffer); + resource.reset(); + + if(amtRead > 0) { + StringBuilder foo = new StringBuilder(new String(buffer,charSet)); + int frameIdx = TagMagix.getEndOfFirstTag(foo, "FRAMESET"); + if(frameIdx != -1) { + // insert flag so we don't add FRAMESET: + context.putData(FastArchivalUrlReplayParseEventHandler.FERRET_DONE_KEY,""); + +// // top-level Frameset: Draw the frame wrapper thingy: +// frameWrappingRenderer.renderResource(httpRequest, +// httpResponse, wbRequest, result, resource, +// uriConverter, results); +// return; + } + } + } + + // copy the HTTP response code: HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse); @@ -90,31 +150,14 @@ JSPExecutor jspExec = new JSPExecutor(uriConverter, httpRequest, httpResponse, wbRequest, results, result, resource); - // The URL of the page, for resolving in-page relative URLs: - URL url = null; - try { - url = new URL(result.getOriginalUrl()); - } catch (MalformedURLException e1) { - // TODO: this shouldn't happen... - e1.printStackTrace(); - throw new IOException(e1.getMessage()); - } // To make sure we get the length, we have to buffer it all up... ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ArchivalUrlContextResultURIConverterFactory fact = - new ArchivalUrlContextResultURIConverterFactory( - (ArchivalUrlResultURIConverter) uriConverter); - // set up the context: - ReplayParseContext context = - new ReplayParseContext(fact,url,result.getCaptureTimestamp()); context.setOutputCharset(OUTPUT_CHARSET); context.setOutputStream(baos); context.setJspExec(jspExec); - // determine the character set used to encode the document bytes: - String charSet = charsetDetector.getCharset(resource, wbRequest); // and finally, parse, using the special lexer that knows how to // handle javascript blocks containing unescaped HTML entities: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:36:55
|
Revision: 3450 http://archive-access.svn.sourceforge.net/archive-access/?rev=3450&view=rev Author: bradtofel Date: 2011-05-25 01:36:49 +0000 (Wed, 25 May 2011) Log Message: ----------- FEATURE: added FrameWrapped state tracking code Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/FrameEmbeddedRequestSelector.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/IFrameEmbeddedRequestSelector.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2011-05-25 01:33:03 UTC (rev 3449) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2011-05-25 01:36:49 UTC (rev 3450) @@ -53,7 +53,7 @@ public class FastArchivalUrlReplayParseEventHandler implements ParseEventHandler { - private final static String FERRET_DONE_KEY = + public final static String FERRET_DONE_KEY = FastArchivalUrlReplayParseEventHandler.class.toString(); private String jspInsertPath = "/WEB-INF/replay/DisclaimChooser.jsp"; @@ -79,6 +79,8 @@ anchorUrlTrans = new URLStringTransformer(); anchorUrlTrans.setJsTransformer(jsBlockTrans); } + private static URLStringTransformer framesetUrlTrans = + new URLStringTransformer("fw_"); private static URLStringTransformer cssUrlTrans = new URLStringTransformer("cs_"); private static URLStringTransformer jsUrlTrans = @@ -227,7 +229,7 @@ transformAttr(context, tagNode, "ACTION", anchorUrlTrans); } else if(tagName.equals("FRAME")) { - transformAttr(context, tagNode, "SRC", anchorUrlTrans); + transformAttr(context, tagNode, "SRC", framesetUrlTrans); } else if(tagName.equals("LINK")) { if(transformAttrWhere(context, tagNode, "REL", "STYLESHEET", Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/FrameEmbeddedRequestSelector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/FrameEmbeddedRequestSelector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/FrameEmbeddedRequestSelector.java 2011-05-25 01:36:49 UTC (rev 3450) @@ -0,0 +1,18 @@ +package org.archive.wayback.replay.selector; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; + +public class FrameEmbeddedRequestSelector extends BaseReplayRendererSelector { + + /* (non-Javadoc) + * @see org.archive.wayback.replay.selector.BaseReplayRendererSelector#canHandle(org.archive.wayback.core.WaybackRequest, org.archive.wayback.core.CaptureSearchResult, org.archive.wayback.core.Resource) + */ + @Override + public boolean canHandle(WaybackRequest wbRequest, + CaptureSearchResult result, Resource resource) { + return wbRequest.isFrameWrapperContext(); + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/IFrameEmbeddedRequestSelector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/IFrameEmbeddedRequestSelector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/IFrameEmbeddedRequestSelector.java 2011-05-25 01:36:49 UTC (rev 3450) @@ -0,0 +1,18 @@ +package org.archive.wayback.replay.selector; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; + +public class IFrameEmbeddedRequestSelector extends BaseReplayRendererSelector { + + /* (non-Javadoc) + * @see org.archive.wayback.replay.selector.BaseReplayRendererSelector#canHandle(org.archive.wayback.core.WaybackRequest, org.archive.wayback.core.CaptureSearchResult, org.archive.wayback.core.Resource) + */ + @Override + public boolean canHandle(WaybackRequest wbRequest, + CaptureSearchResult result, Resource resource) { + return wbRequest.isIFrameWrapperContext(); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:33:09
|
Revision: 3449 http://archive-access.svn.sourceforge.net/archive-access/?rev=3449&view=rev Author: bradtofel Date: 2011-05-25 01:33:03 +0000 (Wed, 25 May 2011) Log Message: ----------- FEATURE: added FrameWrapped state tracking code Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java 2011-05-25 01:30:18 UTC (rev 3448) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java 2011-05-25 01:33:03 UTC (rev 3449) @@ -266,6 +266,17 @@ */ public static final String REQUEST_IDENTITY_CONTEXT = "identitycontext"; + /** + * Request: Content should be wrapped in a frame + */ + public static final String REQUEST_FRAME_WRAPPER_CONTEXT = + "framewrappercontext"; + + /** + * Request: Display context for embedded metadata in an IFrame + */ + public static final String REQUEST_IFRAME_WRAPPER_CONTEXT = + "iframewrappercontext"; /** * Request: Charset detection mode @@ -784,6 +795,20 @@ return getBoolean(REQUEST_IDENTITY_CONTEXT); } + public void setFrameWrapperContext(boolean isFrameWrapperContext) { + setBoolean(REQUEST_FRAME_WRAPPER_CONTEXT,isFrameWrapperContext); + } + public boolean isFrameWrapperContext() { + return getBoolean(REQUEST_FRAME_WRAPPER_CONTEXT); + } + + public void setIFrameWrapperContext(boolean isIFrameWrapperContext) { + setBoolean(REQUEST_IFRAME_WRAPPER_CONTEXT,isIFrameWrapperContext); + } + public boolean isIFrameWrapperContext() { + return getBoolean(REQUEST_IFRAME_WRAPPER_CONTEXT); + } + public void setCharsetMode(int mode) { setInt(REQUEST_CHARSET_MODE,mode); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:30:25
|
Revision: 3448 http://archive-access.svn.sourceforge.net/archive-access/?rev=3448&view=rev Author: bradtofel Date: 2011-05-25 01:30:18 +0000 (Wed, 25 May 2011) Log Message: ----------- BUGFIX: returns "", not "/", for getXXXPrefix() when no accessPoint is available, allowing deployment at non-ROOT context Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UIResults.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UIResults.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UIResults.java 2011-05-25 01:26:48 UTC (rev 3447) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UIResults.java 2011-05-25 01:30:18 UTC (rev 3448) @@ -299,7 +299,7 @@ return wbRequest.getAccessPoint().getStaticPrefix(); } } - return "/"; + return ""; } /** @@ -311,7 +311,7 @@ return wbRequest.getAccessPoint().getQueryPrefix(); } } - return "/"; + return ""; } /** @@ -323,7 +323,7 @@ return wbRequest.getAccessPoint().getReplayPrefix(); } } - return "/"; + return ""; } /* This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:26:55
|
Revision: 3447 http://archive-access.svn.sourceforge.net/archive-access/?rev=3447&view=rev Author: bradtofel Date: 2011-05-25 01:26:48 +0000 (Wed, 25 May 2011) Log Message: ----------- TWEAK: removed unneeded TagMagix class identifier Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2011-05-25 01:21:30 UTC (rev 3446) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2011-05-25 01:26:48 UTC (rev 3447) @@ -333,7 +333,7 @@ public static String getTagAttrWhere(StringBuilder page, final String tag, final String findAttr, final String whereAttr, final String whereVal) { - Pattern tagPattern = TagMagix.getWholeTagPattern(tag); + Pattern tagPattern = getWholeTagPattern(tag); Pattern findAttrPattern = getAttrPattern(findAttr); Pattern whereAttrPattern = getAttrPattern(whereAttr); Matcher tagMatcher = tagPattern.matcher(page); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3446 http://archive-access.svn.sourceforge.net/archive-access/?rev=3446&view=rev Author: bradtofel Date: 2011-05-25 01:21:30 +0000 (Wed, 25 May 2011) Log Message: ----------- FEATURE: added "-blockDump" argument, which only produces a list of matching block-offset tuples to STDOUT Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-05-25 01:20:09 UTC (rev 3445) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-05-25 01:21:30 UTC (rev 3446) @@ -276,6 +276,7 @@ ZiplinesSearchResultSource zl = new ZiplinesSearchResultSource(format); PrintWriter pw = new PrintWriter(System.out); int idx; + boolean blockDump = false; for(idx = 0; idx < args.length; idx++) { if(args[idx].equals("-format")) { idx++; @@ -288,6 +289,8 @@ e1.printStackTrace(); System.exit(1); } + } else if(args[idx].equals("-blockDump")) { + blockDump = true; } else if(args[idx].equals("-max")) { idx++; if(idx >= args.length) { @@ -322,15 +325,25 @@ try { zl.init(); - Iterator<String> itr = zl.getStringPrefixIterator(key); - boolean truncated = ((StringPrefixIterator)itr).isTruncated(); - while(itr.hasNext()) { - pw.println(itr.next()); + if(blockDump) { + + ArrayList<ZiplinedBlock> blocks = zl.getBlockListForPrefix(key); + for(ZiplinedBlock block : blocks) { + pw.format("%s\t%s\n", block.urlOrPath, block.offset); + } + pw.close(); + + } else { + Iterator<String> itr = zl.getStringPrefixIterator(key); + boolean truncated = ((StringPrefixIterator)itr).isTruncated(); + while(itr.hasNext()) { + pw.println(itr.next()); + } + pw.close(); + if(truncated) { + System.err.println("Note that results are truncated..."); + } } - pw.close(); - if(truncated) { - System.err.println("Note that results are truncated..."); - } } catch (ResourceIndexNotAvailableException e) { // TODO Auto-generated catch block e.printStackTrace(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3445 http://archive-access.svn.sourceforge.net/archive-access/?rev=3445&view=rev Author: bradtofel Date: 2011-05-25 01:20:09 +0000 (Wed, 25 May 2011) Log Message: ----------- BUGFIX(unreported) NPEs Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 2011-05-25 01:19:41 UTC (rev 3444) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 2011-05-25 01:20:09 UTC (rev 3445) @@ -73,6 +73,9 @@ } public String transformHTTPMime(String input) { + if(input == null) { + return null; + } int semiIdx = input.indexOf(";"); if(semiIdx > 0) { return escapeSpaces(input.substring(0,semiIdx).trim()); @@ -121,6 +124,9 @@ // nothing present in the HTTP headers.. Use the WARC field: mimeType = transformHTTPMime(mimeGuess); } + if(mimeType == null) { + mimeType = "unknown"; + } result.setMimeType(mimeType); // Now the sticky part: If it looks like an HTML document, look for // robot meta tags: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3444 http://archive-access.svn.sourceforge.net/archive-access/?rev=3444&view=rev Author: bradtofel Date: 2011-05-25 01:19:41 +0000 (Wed, 25 May 2011) Log Message: ----------- BUGFIX(unreported) NPEs Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2011-05-25 01:16:36 UTC (rev 3443) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2011-05-25 01:19:41 UTC (rev 3444) @@ -99,7 +99,7 @@ if(type.equals(WARCConstants.RESPONSE)) { String mime = annotater.transformHTTPMime(header.getMimetype()); - if(mime.equals("text/dns")) { + if(mime != null && mime.equals("text/dns")) { // close to complete reading, then the digest is legit // TODO: DO we want to use the WARC header digest for this? rec.close(); @@ -272,7 +272,8 @@ int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException("Failed to read http status where one " + - " was expected: " + new String(statusBytes)); + " was expected: " + + ((statusBytes == null) ? "(null)" : new String(statusBytes))); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:16:43
|
Revision: 3443 http://archive-access.svn.sourceforge.net/archive-access/?rev=3443&view=rev Author: bradtofel Date: 2011-05-25 01:16:36 +0000 (Wed, 25 May 2011) Log Message: ----------- OPTIMIZ: removed all internal references to Calendars, now depending mostly on ArchiveUtils date parsing - this was causing lots of lock contention in related TimeZone code. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Timestamp.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Timestamp.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Timestamp.java 2011-05-25 01:08:38 UTC (rev 3442) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Timestamp.java 2011-05-25 01:16:36 UTC (rev 3443) @@ -19,13 +19,16 @@ */ package org.archive.wayback.util; +import java.text.ParseException; import java.util.Calendar; import java.util.Date; import java.util.GregorianCalendar; import java.util.SimpleTimeZone; import java.util.TimeZone; +import org.archive.util.ArchiveUtils; + /** * Represents a moment in time as a 14-digit string, and interally as a Date. * @@ -54,6 +57,32 @@ private final static String[] months = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; + private final static String DAYS_IN_MONTH[][]; + private final static int DIM_START_YEAR = 1972; + private final static int DIM_END_YEAR = 2032; + static { + Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT")); + cal.clear(); + int years = DIM_END_YEAR - DIM_START_YEAR; + DAYS_IN_MONTH = new String[years][12]; + for(int y = 0; y < years; y++) { + for(int m = 0; m < 12; m++) { + cal.set(Calendar.YEAR,DIM_START_YEAR + y); + cal.set(Calendar.MONTH,m); + cal.set(Calendar.DAY_OF_MONTH,1); + int calV = cal.getActualMaximum(Calendar.DAY_OF_MONTH); + String maxDayOfMonth = String.valueOf(calV); + if(maxDayOfMonth.length() == 1) { + maxDayOfMonth = "0" + maxDayOfMonth; + } + DAYS_IN_MONTH[y][m] = maxDayOfMonth; + } + } + } + private static String getDaysInMonthBound(int year, int month) { + return DAYS_IN_MONTH[year - DIM_START_YEAR][month]; + } + private String dateStr = null; private Date date = null; @@ -61,50 +90,42 @@ * Constructor */ public Timestamp() { - super(); } /** * Construct and initialize structure from a 14-digit String timestamp. If * the argument is too short, or specifies an invalid timestamp, cleanup * will be attempted to create the earliest legal timestamp given the input. - * @param dateStr + * @param dateStr from which to set date */ public Timestamp(final String dateStr) { - super(); - - Calendar cal = dateStrToCalendar(dateStr); - setDate(cal.getTime()); + setDate(dateStrToDate(dateStr)); } /** * Construct and initialize structure from an integer number of seconds * since the epoch. - * @param sse + * @param sse SecondsSinceEpoch */ public Timestamp(final int sse) { - super(); setSse(sse); } /** * Construct and initialize structure from an Date - * @param date + * @param date from which date should be set */ public Timestamp(final Date date) { - super(); setDate(date); } /** * set internal structure using Date argument - * @param date + * @param date from which date should be set */ public void setDate(final Date date) { this.date = (Date) date.clone(); - Calendar cal = getCalendar(); - cal.setTime(this.date); - dateStr = calendarToDateStr(cal); + dateStr = ArchiveUtils.get14DigitDate(date); } @@ -117,7 +138,7 @@ /** * set internal structure using seconds since the epoch integer argument - * @param sse + * @param sse SecondsSinceEpoch */ public void setSse(final int sse) { setDate(new Date(((long)sse) * 1000)); @@ -128,11 +149,10 @@ * argument. Will clean up timestamp as needed to yield the ealiest * possible timestamp given the possible partial or wrong argument. * - * @param dateStr + * @param dateStr containing the timestamp */ public void setDateStr(String dateStr) { - Calendar cal = dateStrToCalendar(dateStr); - setDate(cal.getTime()); + setDate(dateStrToDate(dateStr)); } /** @@ -156,7 +176,7 @@ * timeStamp and the arguments timeStamp. result is the absolute number of * seconds difference. * - * @param otherTimeStamp + * @param otherTimeStamp to compare * @return int absolute seconds between the argument and this records * timestamp. */ @@ -170,7 +190,7 @@ * timeStamp is less than the argument, positive if it is greater, and 0 if * the same. * - * @param otherTimeStamp + * @param otherTimeStamp to compare * @return int milliseconds */ public int distanceFromTimestamp(final Timestamp otherTimeStamp) { @@ -238,18 +258,6 @@ * */ - private static String frontZeroPad(final String input, final int digits) { - int missing = digits - input.length(); - String padded = ""; - for(int i = 0; i < missing; i++) { - padded += "0"; - } - padded += input; - return padded; - } - private static String frontZeroPad(final int input, final int digits) { - return frontZeroPad(String.valueOf(input) ,digits); - } private static Calendar getCalendar() { String[] ids = TimeZone.getAvailableIDs(0); @@ -259,19 +267,15 @@ TimeZone gmt = new SimpleTimeZone(0, ids[0]); return new GregorianCalendar(gmt); } - + /** - * cleanup the dateStr argument assuming earliest values, and return a - * GMT calendar set to the time described by the dateStr. - * - * @param dateStr - * @return Calendar + * @param dateStr up to 14 digit String representing date + * @return a GMT Calendar object, set to the date represented */ public static Calendar dateStrToCalendar(final String dateStr) { - + Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT")); String paddedDateStr = padStartDateStr(dateStr); - Calendar cal = getCalendar(); int iYear = Integer.parseInt(paddedDateStr.substring(0,4)); int iMonth = Integer.parseInt(paddedDateStr.substring(4,6)); int iDay = Integer.parseInt(paddedDateStr.substring(6,8)); @@ -289,13 +293,24 @@ return cal; } - private static String calendarToDateStr(Calendar cal) { - return frontZeroPad(cal.get(Calendar.YEAR),4) + - frontZeroPad(cal.get(Calendar.MONTH) + 1 ,2) + - frontZeroPad(cal.get(Calendar.DAY_OF_MONTH),2) + - frontZeroPad(cal.get(Calendar.HOUR_OF_DAY),2) + - frontZeroPad(cal.get(Calendar.MINUTE),2) + - frontZeroPad(cal.get(Calendar.SECOND),2); + /** + * cleanup the dateStr argument assuming earliest values, and return a + * GMT calendar set to the time described by the dateStr. + * + * @param dateStr from which to create Calendar + * @return Calendar + */ + public static Date dateStrToDate(final String dateStr) { + + String paddedDateStr = padStartDateStr(dateStr); + try { + return ArchiveUtils.parse14DigitDate(paddedDateStr); + } catch (ParseException e) { + e.printStackTrace(); + // TODO: This is certainly not the right thing, but padStartDateStr + // should ensure we *never* get here.. + return new Date(SSE_1996); + } } @@ -333,18 +348,16 @@ } return test; } - + // check each of YEAR, MONTH, DAY, HOUR, MINUTE, SECOND to make sure they // are not too large or too small, factoring in the month, leap years, etc. + // BUGBUG: Leap second bug here.. How long till someone notices? private static String boundTimestamp(String input) { String boundTimestamp = ""; if(input == null) { input = ""; } // MAKE SURE THE YEAR IS WITHIN LEGAL BOUNDARIES: - Calendar tmpCal = getCalendar(); - tmpCal.setTime(new Date()); - boundTimestamp = boundDigits(input.substring(0,4), YEAR_LOWER_LIMIT,YEAR_UPPER_LIMIT); @@ -354,18 +367,10 @@ // NOW DEPENDING ON THE YEAR + MONTH, MAKE SURE THE DAY OF MONTH IS // WITHIN LEGAL BOUNDARIES: - Calendar cal = getCalendar(); - cal.clear(); int iYear = Integer.parseInt(boundTimestamp.substring(0,4)); int iMonth = Integer.parseInt(boundTimestamp.substring(4,6)); - cal.set(Calendar.YEAR,iYear); - cal.set(Calendar.MONTH,iMonth - 1); - cal.set(Calendar.DAY_OF_MONTH,1); + String maxDayOfMonth = getDaysInMonthBound(iYear, iMonth-1); - String maxDayOfMonth = String.valueOf(cal.getActualMaximum(Calendar.DAY_OF_MONTH)); - if(maxDayOfMonth.length() == 1) { - maxDayOfMonth = "0" + maxDayOfMonth; - } boundTimestamp += boundDigits(input.substring(6,8), DAY_LOWER_LIMIT,maxDayOfMonth); @@ -383,7 +388,7 @@ return boundTimestamp; } - + /** * clean up timestamp argument assuming latest possible values for missing * or bogus digits. @@ -407,7 +412,7 @@ } /** - * @param dateStr + * @param dateStr containing timestamp * @return Timestamp object representing the earliest date represented by * the (possibly) partial digit-string argument. */ @@ -416,7 +421,7 @@ } /** - * @param dateStr + * @param dateStr containing timestamp * @return Timestamp object representing the latest date represented by the * (possibly) partial digit-string argument. */ @@ -425,7 +430,7 @@ } /** - * @param sse + * @param sse SecondsSinceEpoch * @return Timestamp object representing the seconds since epoch argument. */ public static Timestamp fromSse(final int sse) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:08:44
|
Revision: 3442 http://archive-access.svn.sourceforge.net/archive-access/?rev=3442&view=rev Author: bradtofel Date: 2011-05-25 01:08:38 +0000 (Wed, 25 May 2011) Log Message: ----------- BUGFIX: urlToPath was not handling ports correctly Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2011-05-25 01:04:11 UTC (rev 3441) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2011-05-25 01:08:38 UTC (rev 3442) @@ -220,25 +220,28 @@ * @return the path component of the URL, or "" if it contains no path. */ public static String getURLPath(String url) { - int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + url = stripURLScheme(url); int pathIdx = url.indexOf(UrlOperations.PATH_START); - if(portIdx == -1 && pathIdx == -1) { - return ""; - } - if(portIdx == -1) { - return url.substring(pathIdx); - } if(pathIdx == -1) { - return url.substring(portIdx); + return "/"; } - if(pathIdx > portIdx) { - return url.substring(portIdx); - } else { - return url.substring(pathIdx); - } + return url.substring(pathIdx); } - /** + * Attempt to extract the path component of a url String argument. + * @param url the URL which may contain a path, sans scheme. + * @return the path component of the URL, or "" if it contains no path. + */ + public static String stripURLScheme(String url) { + String lcUrl = url.toLowerCase(); + for(String scheme : ALL_SCHEMES) { + if(lcUrl.startsWith(scheme)) { + return url.substring(scheme.length()); + } + } + return url; + } + /** * Attempt to strip default ports out of URL strings. * @param url the original URL possibly including a port * @return the URL sans port, if the scheme was recognized and the default @@ -279,6 +282,11 @@ return sb.toString(); } + /** + * @param orig String containing a URL, possibly beginning with "http:/". + * @return original string if orig begins with "http://", or a new String + * with the extra slash, if orig only had one slash. + */ public static String fixupHTTPUrlWithOneSlash(String orig) { if(orig.startsWith("http:/") && ! orig.startsWith(HTTP_SCHEME)) { // very likely the IE "you must have meant 1 slash, not 2 bug: Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2011-05-25 01:04:11 UTC (rev 3441) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2011-05-25 01:08:38 UTC (rev 3442) @@ -205,7 +205,29 @@ } + public void testUrlPath() { + assertEquals("/",UrlOperations.getURLPath("http://foo.com")); + assertEquals("/",UrlOperations.getURLPath("http://foo.com/")); + assertEquals("/",UrlOperations.getURLPath("http://foo.com:80/")); + assertEquals("/blue",UrlOperations.getURLPath("http://foo.com:80/blue")); + assertEquals("/blue/red",UrlOperations.getURLPath("http://foo.com:80/blue/red")); + assertEquals("/blue/red:colon",UrlOperations.getURLPath("http://foo.com:80/blue/red:colon")); + assertEquals("/",UrlOperations.getURLPath("foo.com")); + assertEquals("/",UrlOperations.getURLPath("foo.com:80")); + assertEquals("/",UrlOperations.getURLPath("foo.com:8080")); + assertEquals("/",UrlOperations.getURLPath("foo.com/")); + assertEquals("/",UrlOperations.getURLPath("foo.com:80/")); + assertEquals("/",UrlOperations.getURLPath("foo.com:8080/")); + assertEquals("/bar",UrlOperations.getURLPath("foo.com/bar")); + assertEquals("/bar",UrlOperations.getURLPath("foo.com:80/bar")); + assertEquals("/bar",UrlOperations.getURLPath("foo.com:8080/bar")); + + assertEquals("/bar/baz",UrlOperations.getURLPath("foo.com/bar/baz")); + assertEquals("/bar/baz",UrlOperations.getURLPath("foo.com:80/bar/baz")); + assertEquals("/bar/baz",UrlOperations.getURLPath("foo.com:8080/bar/baz")); + + } public void testStripDefaultPort() { assertSDP("http://foo.com/","http://foo.com/"); assertSDP("http://foo.com","http://foo.com"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:04:17
|
Revision: 3441 http://archive-access.svn.sourceforge.net/archive-access/?rev=3441&view=rev Author: bradtofel Date: 2011-05-25 01:04:11 +0000 (Wed, 25 May 2011) Log Message: ----------- TWEAK: live web AccessPoint no longer attempts to retrieve non HTTP documents. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-05-25 01:03:17 UTC (rev 3440) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-05-25 01:04:11 UTC (rev 3441) @@ -77,6 +77,9 @@ wbRequest.setRequestUrl(urlString); URL url = null; try { + if(!urlString.startsWith(UrlOperations.HTTP_SCHEME)) { + throw new ResourceNotInArchiveException(urlString); + } Thread.currentThread().setName("Thread " + Thread.currentThread().getId() + " " + getBeanName() + " handling: " + urlString); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:03:23
|
Revision: 3440 http://archive-access.svn.sourceforge.net/archive-access/?rev=3440&view=rev Author: bradtofel Date: 2011-05-25 01:03:17 +0000 (Wed, 25 May 2011) Log Message: ----------- JAVADOC Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-05-25 01:02:18 UTC (rev 3439) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-05-25 01:03:17 UTC (rev 3440) @@ -702,7 +702,7 @@ } /** - * @param filePrefixes List of String file prefixes that will be matched + * @param fileIncludePrefixes List of String file prefixes that will be matched * when querying the ResourceIndex - only SearchResults from files * with a prefix matching one of those in this List will be returned. */ @@ -719,7 +719,7 @@ } /** - * @param filePrefixes List of String file prefixes that will be matched + * @param fileExcludePrefixes List of String file prefixes that will be matched * when querying the ResourceIndex - only SearchResults from files * with a prefix matching one of those in this List will be returned. */ @@ -893,9 +893,15 @@ this.bounceToQueryPrefix = bounceToQueryPrefix; } + /** + * @return the configured number of MS for min age to return from the index + */ public long getEmbargoMS() { return embargoMS; } + /** + * @param ms minimum number of MS age for content to be served from the index + */ public void setEmbargoMS(long ms) { this.embargoMS = ms; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:02:24
|
Revision: 3439 http://archive-access.svn.sourceforge.net/archive-access/?rev=3439&view=rev Author: bradtofel Date: 2011-05-25 01:02:18 +0000 (Wed, 25 May 2011) Log Message: ----------- BUGFIX: needed to add extra slash to pathPrefix for non-ROOT deployment to function. FEATURE: re-enabled production of correct AccessPoint lists, when clients fail to specify the final path component of an AccessPoint Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java 2011-05-17 17:41:50 UTC (rev 3438) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java 2011-05-25 01:02:18 UTC (rev 3439) @@ -19,6 +19,7 @@ */ package org.archive.wayback.util.webapp; +import java.util.ArrayList; import java.util.HashMap; import java.util.logging.Logger; @@ -130,9 +131,9 @@ String host = requestToHost(request); String contextPath = request.getContextPath(); StringBuilder pathPrefix = new StringBuilder(contextPath); - if(contextPath.length() == 0) { +// if(contextPath.length() == 0) { pathPrefix.append("/"); - } +// } String firstPath = requestToFirstPath(request); RequestHandler handler = pathMap.get(hostPathToKey(host,firstPath)); if(handler != null) { @@ -152,6 +153,20 @@ if(handler != null) { return new RequestHandlerContext(handler,contextPath); } + // Nothing matching this port:host:path. Try to help get user back on + // track. Note this won't help with hostname mismatches at the moment: + ArrayList<String> paths = new ArrayList<String>(); + for(String tmp : pathMap.keySet()) { + // slice off last chunk: + int idx = tmp.lastIndexOf('/'); + if(idx != -1) { + String path = tmp.substring(idx+1); + paths.add(path); + } + } + if(paths.size() > 0) { + request.setAttribute("AccessPointNames", paths); + } return null; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-17 17:41:56
|
Revision: 3438 http://archive-access.svn.sourceforge.net/archive-access/?rev=3438&view=rev Author: bradtofel Date: 2011-05-17 17:41:50 +0000 (Tue, 17 May 2011) Log Message: ----------- BUGFIX(unreported): No longer relies on Heritrix ArchiveReader to perform seek() operation, as this behavior seems to have been broken. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java 2011-04-19 22:41:25 UTC (rev 3437) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java 2011-05-17 17:41:50 UTC (rev 3438) @@ -28,8 +28,8 @@ import java.util.Set; import org.apache.commons.httpclient.Header; +import org.archive.io.ArchiveReader; import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCRecord; import org.archive.wayback.core.Resource; import org.archive.wayback.replay.HttpHeaderOperation; @@ -54,7 +54,7 @@ * object for ARCReader -- need to hold on to this in order to call close() * to release filehandle after completing access to this record. optional */ - ARCReader arcReader = null; + ArchiveReader arcReader = null; /** * flag to indicate if the ARCRecord skipHTTPHeader() has been called */ @@ -71,7 +71,7 @@ * @param rec * @param reader */ - public ArcResource(final ARCRecord rec,final ARCReader reader) { + public ArcResource(final ARCRecord rec,final ArchiveReader reader) { super(); arcRecord = rec; arcReader = reader; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2011-04-19 22:41:25 UTC (rev 3437) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2011-05-17 17:41:50 UTC (rev 3438) @@ -20,7 +20,10 @@ package org.archive.wayback.resourcestore.resourcefile; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.RandomAccessFile; import java.net.URL; import org.archive.io.ArchiveReader; @@ -62,15 +65,18 @@ name = name.substring(0, name.length() - ArcWarcFilenameFilter.OPEN_SUFFIX.length()); } + RandomAccessFile raf = new RandomAccessFile(file, "r"); + raf.seek(offset); + InputStream is = new FileInputStream(raf.getFD()); + String fPath = file.getAbsolutePath(); if (isArc(name)) { + ArchiveReader reader = ARCReaderFactory.get(fPath, is, false); + r = ARCArchiveRecordToResource(reader.get(), reader); - ARCReader reader = ARCReaderFactory.get(file,offset); - r = ARCArchiveRecordToResource(reader.get(),reader); - } else if (isWarc(name)) { - WARCReader reader = WARCReaderFactory.get(file,offset); - r = WARCArchiveRecordToResource(reader.get(),reader); + ArchiveReader reader = WARCReaderFactory.get(fPath, is, false); + r = WARCArchiveRecordToResource(reader.get(), reader); } else { throw new ResourceNotAvailableException("Unknown extension"); @@ -78,7 +84,6 @@ return r; } - public static Resource getResource(URL url, long offset) throws IOException, ResourceNotAvailableException { @@ -114,7 +119,7 @@ } public static Resource ARCArchiveRecordToResource(ArchiveRecord rec, - ARCReader reader) throws ResourceNotAvailableException, IOException { + ArchiveReader reader) throws ResourceNotAvailableException, IOException { if (!(rec instanceof ARCRecord)) { throw new ResourceNotAvailableException("Bad ARCRecord format"); @@ -125,7 +130,7 @@ } public static Resource WARCArchiveRecordToResource(ArchiveRecord rec, - WARCReader reader) throws ResourceNotAvailableException, IOException { + ArchiveReader reader) throws ResourceNotAvailableException, IOException { if (!(rec instanceof WARCRecord)) { throw new ResourceNotAvailableException("Bad WARCRecord format"); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java 2011-04-19 22:41:25 UTC (rev 3437) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java 2011-05-17 17:41:50 UTC (rev 3438) @@ -27,21 +27,21 @@ import org.apache.commons.httpclient.HttpParser; import org.apache.commons.httpclient.StatusLine; import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.ArchiveReader; import org.archive.io.RecoverableIOException; import org.archive.io.arc.ARCConstants; -import org.archive.io.warc.WARCReader; import org.archive.io.warc.WARCRecord; import org.archive.wayback.core.Resource; import org.archive.wayback.replay.HttpHeaderOperation; public class WarcResource extends Resource { private WARCRecord rec = null; - private WARCReader reader = null; + private ArchiveReader reader = null; private Map<String, String> headers = null; private long length = 0; private int status = 0; private boolean parsedHeaders = false; - public WarcResource(WARCRecord rec, WARCReader reader) { + public WarcResource(WARCRecord rec, ArchiveReader reader) { this.rec = rec; this.reader = reader; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-04-19 22:41:32
|
Revision: 3437 http://archive-access.svn.sourceforge.net/archive-access/?rev=3437&view=rev Author: bradtofel Date: 2011-04-19 22:41:25 +0000 (Tue, 19 Apr 2011) Log Message: ----------- BUGFIX + 1.6.1 RELEASE CANDIDATE: Modified Paths: -------------- branches/wayback-1_6_1/dist/pom.xml branches/wayback-1_6_1/pom.xml branches/wayback-1_6_1/wayback-core/pom.xml branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java branches/wayback-1_6_1/wayback-hadoop/pom.xml branches/wayback-1_6_1/wayback-hadoop-java/pom.xml branches/wayback-1_6_1/wayback-webapp/pom.xml Modified: branches/wayback-1_6_1/dist/pom.xml =================================================================== --- branches/wayback-1_6_1/dist/pom.xml 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/dist/pom.xml 2011-04-19 22:41:25 UTC (rev 3437) @@ -7,7 +7,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.6.0</version> + <version>1.6.1</version> </parent> <artifactId>dist</artifactId> Modified: branches/wayback-1_6_1/pom.xml =================================================================== --- branches/wayback-1_6_1/pom.xml 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/pom.xml 2011-04-19 22:41:25 UTC (rev 3437) @@ -7,7 +7,7 @@ <groupId>org.archive.wayback</groupId> <artifactId>wayback</artifactId> <packaging>pom</packaging> - <version>1.6.0</version> + <version>1.6.1</version> <name>Wayback</name> <modules> @@ -262,7 +262,7 @@ <dependency> <groupId>org.archive.heritrix</groupId> <artifactId>heritrix-commons</artifactId> - <version>3.1.1-SNAPSHOT</version> + <version>3.0.1-SNAPSHOT</version> </dependency> <dependency> <groupId>org.archive.access-control</groupId> Modified: branches/wayback-1_6_1/wayback-core/pom.xml =================================================================== --- branches/wayback-1_6_1/wayback-core/pom.xml 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-core/pom.xml 2011-04-19 22:41:25 UTC (rev 3437) @@ -8,7 +8,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.6.0</version> + <version>1.6.1</version> </parent> <artifactId>wayback-core</artifactId> Modified: branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java =================================================================== --- branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2011-04-19 22:41:25 UTC (rev 3437) @@ -159,6 +159,15 @@ public String getTemplate() { return LIVE_WAYBACK_TEMPLATE; } + + public boolean getFrequentFlushes() { + // TODO Auto-generated method stub + return false; + } + + public int getWriteBufferSize() { + return 4096; + } }; } Modified: branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java =================================================================== --- branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java 2011-04-19 22:41:25 UTC (rev 3437) @@ -28,6 +28,7 @@ import java.util.Set; import org.apache.commons.httpclient.Header; +import org.archive.io.ArchiveReader; import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCRecord; @@ -54,7 +55,7 @@ * object for ARCReader -- need to hold on to this in order to call close() * to release filehandle after completing access to this record. optional */ - ARCReader arcReader = null; + ArchiveReader arcReader = null; /** * flag to indicate if the ARCRecord skipHTTPHeader() has been called */ @@ -71,7 +72,7 @@ * @param rec * @param reader */ - public ArcResource(final ARCRecord rec,final ARCReader reader) { + public ArcResource(final ARCRecord rec,final ArchiveReader reader) { super(); arcRecord = rec; arcReader = reader; Modified: branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java =================================================================== --- branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2011-04-19 22:41:25 UTC (rev 3437) @@ -20,7 +20,10 @@ package org.archive.wayback.resourcestore.resourcefile; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.RandomAccessFile; import java.net.URL; import org.archive.io.ArchiveReader; @@ -62,14 +65,19 @@ name = name.substring(0, name.length() - ArcWarcFilenameFilter.OPEN_SUFFIX.length()); } + RandomAccessFile raf = new RandomAccessFile(file, "r"); + raf.seek(offset); + InputStream is = new FileInputStream(raf.getFD()); + String fPath = file.getAbsolutePath(); if (isArc(name)) { - - ARCReader reader = ARCReaderFactory.get(file,offset); +// ARCReader reader = ARCReaderFactory.get(file,offset); + ArchiveReader reader = ARCReaderFactory.get(fPath,is,false); r = ARCArchiveRecordToResource(reader.get(),reader); } else if (isWarc(name)) { - WARCReader reader = WARCReaderFactory.get(file,offset); + ArchiveReader reader = WARCReaderFactory.get(fPath,is,false); +// WARCReader reader = WARCReaderFactory.get(file,offset); r = WARCArchiveRecordToResource(reader.get(),reader); } else { @@ -114,7 +122,7 @@ } public static Resource ARCArchiveRecordToResource(ArchiveRecord rec, - ARCReader reader) throws ResourceNotAvailableException, IOException { + ArchiveReader reader) throws ResourceNotAvailableException, IOException { if (!(rec instanceof ARCRecord)) { throw new ResourceNotAvailableException("Bad ARCRecord format"); @@ -125,7 +133,7 @@ } public static Resource WARCArchiveRecordToResource(ArchiveRecord rec, - WARCReader reader) throws ResourceNotAvailableException, IOException { + ArchiveReader reader) throws ResourceNotAvailableException, IOException { if (!(rec instanceof WARCRecord)) { throw new ResourceNotAvailableException("Bad WARCRecord format"); Modified: branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java =================================================================== --- branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java 2011-04-19 22:41:25 UTC (rev 3437) @@ -27,6 +27,7 @@ import org.apache.commons.httpclient.HttpParser; import org.apache.commons.httpclient.StatusLine; import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.ArchiveReader; import org.archive.io.RecoverableIOException; import org.archive.io.arc.ARCConstants; import org.archive.io.warc.WARCReader; @@ -36,12 +37,12 @@ public class WarcResource extends Resource { private WARCRecord rec = null; - private WARCReader reader = null; + private ArchiveReader reader = null; private Map<String, String> headers = null; private long length = 0; private int status = 0; private boolean parsedHeaders = false; - public WarcResource(WARCRecord rec, WARCReader reader) { + public WarcResource(WARCRecord rec, ArchiveReader reader) { this.rec = rec; this.reader = reader; } Modified: branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java =================================================================== --- branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java 2011-04-19 22:41:25 UTC (rev 3437) @@ -26,9 +26,11 @@ import java.text.ParseException; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; +import org.archive.io.WriterPoolSettings; import org.archive.io.arc.ARCConstants; import org.archive.io.arc.ARCWriter; import org.archive.util.ArchiveUtils; @@ -98,9 +100,12 @@ throws IOException { File target[] = {tgtDir}; + +// ARCWriter writer = new ARCWriter(new AtomicInteger(), +// Arrays.asList(target),prefix,true, +// ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE); ARCWriter writer = new ARCWriter(new AtomicInteger(), - Arrays.asList(target),prefix,true, - ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE); + getSettings(true,prefix,Arrays.asList(target))); File sources[] = srcDir.listFiles(); LOGGER.info("Found " + sources.length + " files in " + srcDir); for(int i = 0; i<sources.length; i++) { @@ -121,6 +126,43 @@ LOGGER.info("Closed arc file named " + writer.getFile().getAbsolutePath()); } + private WriterPoolSettings getSettings(final boolean isCompressed, + final String prefix, final List<File> arcDirs) { + return new WriterPoolSettings() { + public List<File> getOutputDirs() { + return arcDirs; + } + + @SuppressWarnings({ "unchecked", "rawtypes" }) + public List getMetadata() { + return null; + } + + public String getPrefix() { + return prefix; + } + + public boolean getCompress() { + return isCompressed; + } + + public long getMaxFileSizeBytes() { + return ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE; + } + + public String getTemplate() { + return "${prefix}-${timestamp17}-${serialno}"; + } + + public boolean getFrequentFlushes() { + return false; + } + + public int getWriteBufferSize() { + return 4096; + } + }; + } /** * @param args Modified: branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java =================================================================== --- branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java 2011-04-19 22:41:25 UTC (rev 3437) @@ -26,8 +26,14 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import org.archive.io.WriterPoolSettings; +import org.archive.io.arc.ARCConstants; import org.archive.io.warc.WARCWriter; +import org.archive.io.warc.WARCWriterPoolSettings; +import org.archive.uid.RecordIDGenerator; +import org.archive.uid.UUIDGenerator; import org.archive.util.anvl.ANVLRecord; public class WARCHeader { @@ -45,8 +51,9 @@ List<String> metadata = new ArrayList<String>(1); metadata.add(ar.toString()); - writer = new WARCWriter(null, bos, target, true, null, - metadata); +// writer = new WARCWriter(new AtomicInteger(),null, bos, target, true, null, +// metadata); + writer = new WARCWriter(new AtomicInteger(),bos,target,getSettings(true, null, null, metadata)); // Write a warcinfo record with description about how this WARC // was made. writer.writeWarcinfoRecord(target.getName(), "Made from " @@ -54,7 +61,48 @@ + this.getClass().getName()); } + private WARCWriterPoolSettings getSettings(final boolean isCompressed, + final String prefix, final List<File> arcDirs, final List metadata) { + return new WARCWriterPoolSettings() { + public List<File> getOutputDirs() { + return arcDirs; + } + @SuppressWarnings({ "unchecked", "rawtypes" }) + public List getMetadata() { + return metadata; + } + + public String getPrefix() { + return prefix; + } + + public boolean getCompress() { + return isCompressed; + } + + public long getMaxFileSizeBytes() { + return ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE; + } + + public String getTemplate() { + return "${prefix}-${timestamp17}-${serialno}"; + } + + public boolean getFrequentFlushes() { + return false; + } + + public int getWriteBufferSize() { + return 4096; + } + + public RecordIDGenerator getRecordIDGenerator() { + return new UUIDGenerator(); + } + }; + } + public static void main(String[] args) { if (args.length != 3) { System.err.println("USAGE: tgtWarc fieldsSrc id"); Modified: branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java =================================================================== --- branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java 2011-04-19 22:41:25 UTC (rev 3437) @@ -130,9 +130,9 @@ String host = requestToHost(request); String contextPath = request.getContextPath(); StringBuilder pathPrefix = new StringBuilder(contextPath); - if(contextPath.length() == 0) { +// if(contextPath.length() == 0) { pathPrefix.append("/"); - } +// } String firstPath = requestToFirstPath(request); RequestHandler handler = pathMap.get(hostPathToKey(host,firstPath)); if(handler != null) { Modified: branches/wayback-1_6_1/wayback-hadoop/pom.xml =================================================================== --- branches/wayback-1_6_1/wayback-hadoop/pom.xml 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-hadoop/pom.xml 2011-04-19 22:41:25 UTC (rev 3437) @@ -8,7 +8,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.6.0</version> + <version>1.6.1</version> </parent> <artifactId>wayback-hadoop</artifactId> Modified: branches/wayback-1_6_1/wayback-hadoop-java/pom.xml =================================================================== --- branches/wayback-1_6_1/wayback-hadoop-java/pom.xml 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-hadoop-java/pom.xml 2011-04-19 22:41:25 UTC (rev 3437) @@ -8,7 +8,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.6.0</version> + <version>1.6.1</version> </parent> <artifactId>wayback-hadoop-java</artifactId> Modified: branches/wayback-1_6_1/wayback-webapp/pom.xml =================================================================== --- branches/wayback-1_6_1/wayback-webapp/pom.xml 2011-04-16 17:37:26 UTC (rev 3436) +++ branches/wayback-1_6_1/wayback-webapp/pom.xml 2011-04-19 22:41:25 UTC (rev 3437) @@ -7,7 +7,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.6.0</version> + <version>1.6.1</version> </parent> <artifactId>wayback-webapp</artifactId> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2011-04-16 17:37:32
|
Revision: 3436 http://archive-access.svn.sourceforge.net/archive-access/?rev=3436&view=rev Author: binzino Date: 2011-04-16 17:37:26 +0000 (Sat, 16 Apr 2011) Log Message: ----------- Fix WAX-77. Import Heritrix3 commons library which handles the gzip problem. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-lang-2.3.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/guava-r08.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/heritrix-commons-3.0.1-SNAPSHOT.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/kryo-1.01.jar Removed Paths: ------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-2.0.1-SNAPSHOT.LICENSE tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-2.0.1-SNAPSHOT.jar Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/build.xml 2011-04-16 17:21:25 UTC (rev 3435) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/build.xml 2011-04-16 17:37:26 UTC (rev 3436) @@ -27,6 +27,7 @@ <property name="dist.dir" value="${build.dir}/${final.name}" /> <target name="init"> + <delete file="../../lib/commons-lang-2.1.jar" /> <exec executable="rsync"> <arg value="-vacC"/> <arg value="src/nutch/"/> Deleted: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-2.0.1-SNAPSHOT.LICENSE =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-2.0.1-SNAPSHOT.LICENSE 2011-04-16 17:21:25 UTC (rev 3435) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-2.0.1-SNAPSHOT.LICENSE 2011-04-16 17:37:26 UTC (rev 3436) @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - <one line to give the library's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - <signature of Ty Coon>, 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - - Deleted: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-2.0.1-SNAPSHOT.jar =================================================================== (Binary files differ) Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-lang-2.3.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/commons-lang-2.3.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/guava-r08.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/guava-r08.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/heritrix-commons-3.0.1-SNAPSHOT.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/heritrix-commons-3.0.1-SNAPSHOT.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/kryo-1.01.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/kryo-1.01.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2011-04-16 17:21:31
|
Revision: 3435 http://archive-access.svn.sourceforge.net/archive-access/?rev=3435&view=rev Author: binzino Date: 2011-04-16 17:21:25 +0000 (Sat, 16 Apr 2011) Log Message: ----------- Remove 'content', 'crawl_parse', and 'crawl_data' subdirs from Nutch segment. Not used for NutchWAX. Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/fetcher/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ParseOutputFormat.java Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java 2011-04-16 17:21:25 UTC (rev 3435) @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.fetcher; + +import java.io.IOException; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; + +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.OutputFormat; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.util.Progressable; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseOutputFormat; +import org.apache.nutch.protocol.Content; + +/** Splits FetcherOutput entries into multiple map files. */ +public class FetcherOutputFormat implements OutputFormat<Text, NutchWritable> { + + public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException { + Path out = FileOutputFormat.getOutputPath(job); + if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME))) + throw new IOException("Segment already fetched!"); + } + + public RecordWriter<Text, NutchWritable> getRecordWriter(final FileSystem fs, + final JobConf job, + final String name, + final Progressable progress) throws IOException { + + Path out = FileOutputFormat.getOutputPath(job); + /* + final Path fetch = + new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name); + final Path content = + new Path(new Path(out, Content.DIR_NAME), name); + */ + final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job); + + /* + final MapFile.Writer fetchOut = + new MapFile.Writer(job, fs, fetch.toString(), Text.class, CrawlDatum.class, + compType, progress); + */ + + return new RecordWriter<Text, NutchWritable>() { + //private MapFile.Writer contentOut; + private RecordWriter<Text, Parse> parseOut; + + { + /* + if (Fetcher.isStoringContent(job)) { + contentOut = new MapFile.Writer(job, fs, content.toString(), + Text.class, Content.class, + compType, progress); + } + */ + + if (Fetcher.isParsing(job)) { + parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, progress); + } + } + + public void write(Text key, NutchWritable value) + throws IOException { + + Writable w = value.get(); + + //if (w instanceof CrawlDatum) + // fetchOut.append(key, w); + //else if (w instanceof Content) + // contentOut.append(key, w); + //else if (w instanceof Parse) + // parseOut.write(key, (Parse)w); + if (w instanceof Parse) + parseOut.write(key, (Parse)w); + } + + public void close(Reporter reporter) throws IOException { + /* + if (fetchOut != null) { + fetchOut.close(); + } + if (contentOut != null) { + contentOut.close(); + } + */ + if (parseOut != null) { + parseOut.close(reporter); + } + } + + }; + + } +} + Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ParseOutputFormat.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ParseOutputFormat.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ParseOutputFormat.java 2011-04-16 17:21:25 UTC (rev 3435) @@ -0,0 +1,284 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +// Commons Logging imports +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.fetcher.Fetcher; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.mapred.*; +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.StringUtil; +import org.apache.nutch.util.URLUtil; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.*; + +import java.io.*; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Map.Entry; + +import org.apache.hadoop.util.Progressable; + +/* Parse content in a segment. */ +public class ParseOutputFormat implements OutputFormat<Text, Parse> { + private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class); + + private URLFilters filters; + private URLNormalizers normalizers; + private ScoringFilters scfilters; + + private static class SimpleEntry implements Entry<Text, CrawlDatum> { + private Text key; + private CrawlDatum value; + + public SimpleEntry(Text key, CrawlDatum value) { + this.key = key; + this.value = value; + } + + public Text getKey() { + return key; + } + + public CrawlDatum getValue() { + return value; + } + + public CrawlDatum setValue(CrawlDatum value) { + this.value = value; + return this.value; + } + } + + public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException { + Path out = FileOutputFormat.getOutputPath(job); + if (fs.exists(out)) + throw new IOException("Segment already exists:" + out); + } + + public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, + String name, Progressable progress) throws IOException { + + this.filters = new URLFilters(job); + this.normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK); + this.scfilters = new ScoringFilters(job); + final int interval = job.getInt("db.fetch.interval.default", 2592000); + final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false); + int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100); + final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE + : maxOutlinksPerPage; + final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job); + Path out = FileOutputFormat.getOutputPath(job); + + Path text = new Path(new Path(out, ParseText.DIR_NAME), name); + Path data = new Path(new Path(out, ParseData.DIR_NAME), name); + //Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name); + + final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb","").split(" *, *"); + + final MapFile.Writer textOut = + new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class, + CompressionType.RECORD, progress); + + final MapFile.Writer dataOut = + new MapFile.Writer(job, fs, data.toString(), Text.class, ParseData.class, + compType, progress); + + /* + final SequenceFile.Writer crawlOut = + SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class, + compType, progress); + */ + + return new RecordWriter<Text, Parse>() { + + + public void write(Text key, Parse parse) + throws IOException { + + String fromUrl = key.toString(); + String fromHost = null; + String toHost = null; + textOut.append(key, new ParseText(parse.getText())); + + ParseData parseData = parse.getData(); + // recover the signature prepared by Fetcher or ParseSegment + String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY); + if (sig != null) { + byte[] signature = StringUtil.fromHexString(sig); + if (signature != null) { + // append a CrawlDatum with a signature + CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0); + d.setSignature(signature); + //crawlOut.append(key, d); + } + } + + // see if the parse metadata contain things that we'd like + // to pass to the metadata of the crawlDB entry + CrawlDatum parseMDCrawlDatum = null; + for (String mdname : parseMDtoCrawlDB) { + String mdvalue = parse.getData().getParseMeta().get(mdname); + if (mdvalue != null) { + if (parseMDCrawlDatum == null) parseMDCrawlDatum = new CrawlDatum( + CrawlDatum.STATUS_PARSE_META, 0); + parseMDCrawlDatum.getMetaData().put(new Text(mdname), + new Text(mdvalue)); + } + } + // if (parseMDCrawlDatum != null) crawlOut.append(key, parseMDCrawlDatum); + + try { + ParseStatus pstatus = parseData.getStatus(); + if (pstatus != null && pstatus.isSuccess() && + pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) { + String newUrl = pstatus.getMessage(); + int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); + try { + newUrl = normalizers.normalize(newUrl, + URLNormalizers.SCOPE_FETCHER); + } catch (MalformedURLException mfue) { + newUrl = null; + } + if (newUrl != null) newUrl = filters.filter(newUrl); + String url = key.toString(); + if (newUrl != null && !newUrl.equals(url)) { + String reprUrl = + URLUtil.chooseRepr(url, newUrl, + refreshTime < Fetcher.PERM_REFRESH_TIME); + CrawlDatum newDatum = new CrawlDatum(); + newDatum.setStatus(CrawlDatum.STATUS_LINKED); + if (reprUrl != null && !reprUrl.equals(newUrl)) { + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); + } + //crawlOut.append(new Text(newUrl), newDatum); + } + } + } catch (URLFilterException e) { + // ignore + } + + // collect outlinks for subsequent db update + Outlink[] links = parseData.getOutlinks(); + int outlinksToStore = Math.min(maxOutlinks, links.length); + if (ignoreExternalLinks) { + try { + fromHost = new URL(fromUrl).getHost().toLowerCase(); + } catch (MalformedURLException e) { + fromHost = null; + } + } else { + fromHost = null; + } + + int validCount = 0; + CrawlDatum adjust = null; + List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore); + List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); + for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { + String toUrl = links[i].getToUrl(); + // ignore links to self (or anchors within the page) + if (fromUrl.equals(toUrl)) { + continue; + } + if (ignoreExternalLinks) { + try { + toHost = new URL(toUrl).getHost().toLowerCase(); + } catch (MalformedURLException e) { + toHost = null; + } + if (toHost == null || !toHost.equals(fromHost)) { // external links + continue; // skip it + } + } + try { + toUrl = normalizers.normalize(toUrl, + URLNormalizers.SCOPE_OUTLINK); // normalize the url + toUrl = filters.filter(toUrl); // filter the url + if (toUrl == null) { + continue; + } + } catch (Exception e) { + continue; + } + CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); + Text targetUrl = new Text(toUrl); + try { + scfilters.initialScore(targetUrl, target); + } catch (ScoringFilterException e) { + LOG.warn("Cannot filter init score for url " + key + + ", using default: " + e.getMessage()); + target.setScore(0.0f); + } + + targets.add(new SimpleEntry(targetUrl, target)); + outlinkList.add(links[i]); + validCount++; + } + try { + // compute score contributions and adjustment to the original score + adjust = scfilters.distributeScoreToOutlinks((Text)key, parseData, + targets, null, links.length); + } catch (ScoringFilterException e) { + LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage()); + } + for (Entry<Text, CrawlDatum> target : targets) { + // crawlOut.append(target.getKey(), target.getValue()); + } + // if (adjust != null) crawlOut.append(key, adjust); + + Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]); + parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), + filteredLinks, parseData.getContentMeta(), + parseData.getParseMeta()); + dataOut.append(key, parseData); + if (!parse.isCanonical()) { + CrawlDatum datum = new CrawlDatum(); + datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); + String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY); + try { + datum.setFetchTime(Long.parseLong(timeString)); + } catch (Exception e) { + LOG.warn("Can't read fetch time for: " + key); + datum.setFetchTime(System.currentTimeMillis()); + } + //crawlOut.append(key, datum); + } + } + + public void close(Reporter reporter) throws IOException { + textOut.close(); + dataOut.close(); + //crawlOut.close(); + } + + }; + + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3434 http://archive-access.svn.sourceforge.net/archive-access/?rev=3434&view=rev Author: binzino Date: 2011-04-16 02:38:39 +0000 (Sat, 16 Apr 2011) Log Message: ----------- Put keywords and description into content metadata, not parse metadata. Remove System.out messages. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java 2011-04-16 00:02:08 UTC (rev 3433) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java 2011-04-16 02:38:39 UTC (rev 3434) @@ -32,15 +32,13 @@ // If there are no meta-tags, just return. if ( metaTags == null ) return parseResult; - Metadata parseMeta = parseResult.get( content.getUrl( ) ).getData().getParseMeta(); + Metadata contentMeta = parseResult.get( content.getUrl( ) ).getData().getContentMeta(); for ( String key : new String[] { "description", "keywords" } ) { String value = metaTags.getGeneralTags().getProperty( key, "" ); - System.out.println( key + ": " + value ); - - parseMeta.set( key, value ); + contentMeta.set( key, value ); } return parseResult; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2011-04-16 00:02:15
|
Revision: 3433 http://archive-access.svn.sourceforge.net/archive-access/?rev=3433&view=rev Author: binzino Date: 2011-04-16 00:02:08 +0000 (Sat, 16 Apr 2011) Log Message: ----------- Fix WAX-79. HTML description and keywords meta tags are extracted. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/build.xml 2011-04-15 01:16:28 UTC (rev 3432) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/build.xml 2011-04-16 00:02:08 UTC (rev 3433) @@ -23,7 +23,7 @@ <property name="lib.dir" value="lib" /> <property name="build.dir" value="${nutch.dir}/build" /> <!-- HACK: Need to import default.properties like Nutch does --> - <property name="final.name" value="nutch-1.0" /> + <property name="final.name" value="nutch-1.1" /> <property name="dist.dir" value="${build.dir}/${final.name}" /> <target name="init"> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/build.xml 2011-04-15 01:16:28 UTC (rev 3432) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/build.xml 2011-04-16 00:02:08 UTC (rev 3433) @@ -376,6 +376,7 @@ <packageset dir="${plugins.dir}/query-nutchwax/src/java"/> <packageset dir="${plugins.dir}/scoring-nutchwax/src/java"/> <packageset dir="${plugins.dir}/urlfilter-nutchwax/src/java"/> + <packageset dir="${plugins.dir}/html-decorator/src/java"/> <link href="${javadoc.link.java}"/> <link href="${javadoc.link.lucene}"/> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2011-04-15 01:16:28 UTC (rev 3432) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2011-04-16 00:02:08 UTC (rev 3433) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(pdf2|tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(pdf2|tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax|html-decorator</value> </property> <!-- Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2011-04-15 01:16:28 UTC (rev 3432) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2011-04-16 00:02:08 UTC (rev 3433) @@ -92,6 +92,7 @@ <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> <ant dir="parse-pdf2" target="deploy" /> + <ant dir="html-decorator" target="deploy" /> </target> @@ -204,5 +205,6 @@ <ant dir="scoring-nutchwax" target="clean" /> <ant dir="urlfilter-nutchwax" target="clean" /> <ant dir="parse-pdf2" target="clean" /> + <ant dir="html-decorator" target="clean" /> </target> </project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/build.xml 2011-04-16 00:02:08 UTC (rev 3433) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="html-decorator" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/plugin.xml 2011-04-16 00:02:08 UTC (rev 3433) @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="html-decorator" + name="NutchWAX HTML Decorator" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="html-decorator.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.html.HtmlDecorator" + name="NutchWAX HTML Decorator" + point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="HtmlDecorator" + class="org.archive.nutchwax.html.HtmlDecorator" /> + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java 2011-04-16 00:02:08 UTC (rev 3433) @@ -0,0 +1,58 @@ + +package org.archive.nutchwax.html; + +import java.util.*; + +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NodeWalker; + +import org.w3c.dom.DocumentFragment; + + +public class HtmlDecorator implements HtmlParseFilter +{ + private Configuration conf; + + + public ParseResult filter( Content content, + ParseResult parseResult, + HTMLMetaTags metaTags, + DocumentFragment doc ) + { + // If no parse result, just return. + if ( parseResult == null ) return parseResult; + + // If there are no meta-tags, just return. + if ( metaTags == null ) return parseResult; + + Metadata parseMeta = parseResult.get( content.getUrl( ) ).getData().getParseMeta(); + + for ( String key : new String[] { "description", "keywords" } ) + { + String value = metaTags.getGeneralTags().getProperty( key, "" ); + + System.out.println( key + ": " + value ); + + parseMeta.set( key, value ); + } + + return parseResult; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } + + +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-04-15 01:16:34
|
Revision: 3432 http://archive-access.svn.sourceforge.net/archive-access/?rev=3432&view=rev Author: bradtofel Date: 2011-04-15 01:16:28 +0000 (Fri, 15 Apr 2011) Log Message: ----------- Added Paths: ----------- branches/wayback-1_6_1/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-04-15 01:15:49
|
Revision: 3431 http://archive-access.svn.sourceforge.net/archive-access/?rev=3431&view=rev Author: bradtofel Date: 2011-04-15 01:15:43 +0000 (Fri, 15 Apr 2011) Log Message: ----------- Removed Paths: ------------- branches/wayback-1_6_1/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-04-15 01:14:18
|
Revision: 3430 http://archive-access.svn.sourceforge.net/archive-access/?rev=3430&view=rev Author: bradtofel Date: 2011-04-15 01:14:12 +0000 (Fri, 15 Apr 2011) Log Message: ----------- 1.6.1... Added Paths: ----------- branches/wayback-1_6_1/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-03-09 05:56:23
|
Revision: 3429 http://archive-access.svn.sourceforge.net/archive-access/?rev=3429&view=rev Author: bradtofel Date: 2011-03-09 05:56:17 +0000 (Wed, 09 Mar 2011) Log Message: ----------- FEATURE: Now detects a LiveWebTimeout, or LiveRobotsTimeout request, and aborts subsequent attempts, which are also likely to timeout within this request. More of a BugFix, since this caused dramatic problems, hanging up the thread to timeout on robots request for each search result.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2011-03-09 05:53:57 UTC (rev 3428) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2011-03-09 05:56:17 UTC (rev 3429) @@ -35,6 +35,7 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.LiveDocumentNotAvailableException; import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; import org.archive.wayback.liveweb.LiveWebCache; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.util.ObjectFilter; @@ -188,7 +189,7 @@ rulesCache.put(firstUrlString,tmpRules); rules = tmpRules; LOGGER.info("ROBOT: Downloaded("+urlString+")"); - + } catch (LiveDocumentNotAvailableException e) { LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")"); @@ -201,7 +202,12 @@ return null; } catch (LiveWebCacheUnavailableException e) { LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")"); + filterGroup.setLiveWebGone(); return null; + } catch (LiveWebTimeoutException e) { + LOGGER.info("ROBOT: LiveDocumentTimedOutException("+urlString+")"); + filterGroup.setRobotTimedOut(); + return null; } } } @@ -226,7 +232,11 @@ } int filterResult = ObjectFilter.FILTER_EXCLUDE; RobotRules rules = getRules(r); - if(rules != null) { + if(rules == null) { + if(filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone()) { + return ObjectFilter.FILTER_ABORT; + } + } else { String resultURL = r.getOriginalUrl(); URL url; try { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java 2011-03-09 05:53:57 UTC (rev 3428) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java 2011-03-09 05:56:17 UTC (rev 3429) @@ -26,8 +26,11 @@ import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.AccessControlException; import org.archive.wayback.exception.AdministrativeAccessControlException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.RobotAccessControlException; +import org.archive.wayback.exception.RobotNotAvailableException; +import org.archive.wayback.exception.RobotTimedOutAccessControlException; import org.archive.wayback.resourceindex.filters.CounterFilter; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.util.ObjectFilter; @@ -41,6 +44,8 @@ String requestUrl = null; private boolean sawRobots = false; private boolean passedRobots = false; + private boolean robotTimedOut = false; + private boolean liveWebGone = false; private boolean sawAdministrative = false; private boolean passedAdministrative = false; @@ -67,7 +72,16 @@ } public void annotateResults(SearchResults results) - throws AccessControlException, ResourceNotInArchiveException { + throws AccessControlException, ResourceNotInArchiveException, + RobotNotAvailableException { + if(robotTimedOut) { + throw new RobotTimedOutAccessControlException("Unable to check" + + " robots.txt for " + requestUrl); + } + if(liveWebGone) { + throw new RobotNotAvailableException("The URL " + requestUrl + + " is blocked by the sites robots.txt file"); + } if(sawRobots && !passedRobots) { throw new RobotAccessControlException("The URL " + requestUrl + " is blocked by the sites robots.txt file"); @@ -91,4 +105,18 @@ public void setSawAdministrative() { sawAdministrative = true; } + + public void setRobotTimedOut() { + robotTimedOut = true; + } + public boolean getRobotTimedOut() { + return robotTimedOut; + } + + public void setLiveWebGone() { + liveWebGone = true; + } + public boolean getLiveWebGone() { + return liveWebGone; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |