You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
Revision: 2830 http://archive-access.svn.sourceforge.net/archive-access/?rev=2830&view=rev Author: alexoz Date: 2009-10-23 05:29:09 +0000 (Fri, 23 Oct 2009) Log Message: ----------- FEATURE: Add convenience methods for checking requests that have multiple groups. Modified Paths: -------------- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlClient.java Modified: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlClient.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlClient.java 2009-10-23 05:28:37 UTC (rev 2829) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlClient.java 2009-10-23 05:29:09 UTC (rev 2830) @@ -46,6 +46,22 @@ this(new CachingRuleDao(oracleUrl), new CachingRobotClient()); } + private String getPolicy(String url, Rule rule) + throws RobotsUnavailableException { + if (robotLookupsEnabled && rule != null && "robots".equals(rule.getPolicy())) { + try { + if (robotClient.isRobotPermitted(url, robotUserAgent)) { + return "allow"; + } else { + return "block"; + } + } catch (IOException e) { + throw new RobotsUnavailableException(e); + } + } + return rule.getPolicy(); + } + /** * Return the best-matching policy for the requested document. 
* @@ -63,24 +79,34 @@ * @throws RuleOracleUnavailableException */ public String getPolicy(String url, Date captureDate, Date retrievalDate, - String who) throws RobotsUnavailableException, RuleOracleUnavailableException { - Rule matchingRule = getRule(url, captureDate, retrievalDate, who); - - if (robotLookupsEnabled && matchingRule != null && "robots".equals(matchingRule.getPolicy())) { - try { - if (robotClient.isRobotPermitted(url, robotUserAgent)) { - return "allow"; - } else { - return "block"; - } - } catch (IOException e) { - throw new RobotsUnavailableException(e); - } - } - return matchingRule.getPolicy(); + String who) throws RobotsUnavailableException, + RuleOracleUnavailableException { + return getPolicy(url, getRule(url, captureDate, retrievalDate, who)); } /** + * Return the best-matching policy for the requested document. + * + * @param url + * URL of the requested document. + * @param captureDate + * Date the document was archived. + * @param retrievalDate + * Date of retrieval (usually now). + * @param groups + * Group names of the user accessing the document. + * @return Access-control policy that should be enforced. eg "robots", + * "block" or "allow". + * @throws RobotsUnavailableException + * @throws RuleOracleUnavailableException + */ + public String getPolicy(String url, Date captureDate, Date retrievalDate, + Collection<String> groups) throws RobotsUnavailableException, + RuleOracleUnavailableException { + return getPolicy(url, getRule(url, captureDate, retrievalDate, groups)); + } + + /** * Return the most specific matching rule for the requested document. * * @param url @@ -108,7 +134,42 @@ return matchingRule; } + /** + * Return the most specific matching rule for the requested document. + * + * @param url + * URL of the requested document. + * @param captureDate + * Date the document was archived. + * @param retrievalDate + * Date of retrieval (usually now). + * @param groups + * Group names of the user accessing the document. 
+ * @return + * @throws RuleOracleUnavailableException + */ + public Rule getRule(String url, Date captureDate, Date retrievalDate, + Collection<String> groups) + throws RuleOracleUnavailableException { + Rule bestRule = null; + for (String who: groups) { + Rule rule = getRule(url, captureDate, retrievalDate, who); + + /* We compare policies not the rules themselves as + * a user should have full access to something one of their + * groups has access to, even if another group they are + * member of does not. + */ + if (bestRule == null || + rule.getPolicyId().compareTo(bestRule.getPolicyId()) < 0) { + bestRule = rule; + } + } + return bestRule; + } + + /** * This method allows the client to prepare for lookups from a given set of * urls. This can warm up a cache and/or enable a mass data transfer to be done in This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2829 http://archive-access.svn.sourceforge.net/archive-access/?rev=2829&view=rev Author: alexoz Date: 2009-10-23 05:28:37 +0000 (Fri, 23 Oct 2009) Log Message: ----------- BUGFIX (unreported): Remove test that was failing because http://web.archive.org/robots.txt had changed. Modified Paths: -------------- trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/HttpRobotClientTest.java Modified: trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/HttpRobotClientTest.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/HttpRobotClientTest.java 2009-10-23 00:58:43 UTC (rev 2828) +++ trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/HttpRobotClientTest.java 2009-10-23 05:28:37 UTC (rev 2829) @@ -16,7 +16,6 @@ public void testBasic() throws Exception { HttpRobotClient client = new HttpRobotClient(); - assertFalse(client.isRobotPermitted("http://web.archive.org/cgi-bin/fishbowl", "wayback-access-control-test")); assertTrue(client.isRobotPermitted("http://www.archive.org/index.html", "wayback-access-control-test")); assertTrue(client.isRobotPermitted("http://google.com/fish.html", "wayback-access-control-test")); assertFalse(client.isRobotPermitted("http://google.com/news", "wayback-access-control-test")); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-23 00:58:51
|
Revision: 2828 http://archive-access.svn.sourceforge.net/archive-access/?rev=2828&view=rev Author: bradtofel Date: 2009-10-23 00:58:43 +0000 (Fri, 23 Oct 2009) Log Message: ----------- FEATURE: now parses "charset detection mode" flag ("cm#_") to specify different strategies. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlRequestParser.java 2009-10-23 00:46:31 UTC (rev 2827) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlRequestParser.java 2009-10-23 00:58:43 UTC (rev 2828) @@ -46,6 +46,7 @@ public final static String JS_CONTEXT = "js"; public final static String CSS_CONTEXT = "cs"; public final static String IMG_CONTEXT = "im"; + public final static String CHARSET_MODE = "cm"; protected RequestParser[] getRequestParsers() { RequestParser[] theParsers = { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2009-10-23 00:46:31 UTC (rev 2827) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2009-10-23 00:58:43 UTC (rev 2828) @@ -46,7 +46,7 @@ * timestamp, flags, & 
url */ public final static Pattern WB_REQUEST_REGEX = Pattern - .compile("^(\\d{1,14})(([a-z]{2}_)*)/(.*)$"); + .compile("^(\\d{1,14})(([a-z]{2}[0-9]*_)*)/(.*)$"); /** * @param wrapped @@ -124,6 +124,11 @@ wbRequest.setJSContext(true); } else if(flag.equals(ArchivalUrlRequestParser.IMG_CONTEXT)) { wbRequest.setIMGContext(true); + } else if(flag.startsWith(ArchivalUrlRequestParser.CHARSET_MODE)) { + String modeString = flag.substring( + ArchivalUrlRequestParser.CHARSET_MODE.length()); + int mode = Integer.parseInt(modeString); + wbRequest.setCharsetMode(mode); } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2827 http://archive-access.svn.sourceforge.net/archive-access/?rev=2827&view=rev Author: bradtofel Date: 2009-10-23 00:46:31 +0000 (Fri, 23 Oct 2009) Log Message: ----------- FEATURE: added closeMatches List of URLs that might satisfy a user's request Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/ResourceNotInArchiveException.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/ResourceNotInArchiveException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/ResourceNotInArchiveException.java 2009-10-23 00:44:47 UTC (rev 2826) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/ResourceNotInArchiveException.java 2009-10-23 00:46:31 UTC (rev 2827) @@ -1,5 +1,7 @@ package org.archive.wayback.exception; +import java.util.List; + import javax.servlet.http.HttpServletResponse; /** @@ -14,8 +16,11 @@ */ private static final long serialVersionUID = 1L; protected static final String ID = "resourceNotInArchive"; - /** + * List of alternate string URLs that might get the user what they want. + */ + private List<String> closeMatches = null; + /** * Constructor * * @param message @@ -40,4 +45,16 @@ public int getStatus() { return HttpServletResponse.SC_NOT_FOUND; } + /** + * @return the closeMatches + */ + public List<String> getCloseMatches() { + return closeMatches; + } + /** + * @param closeMatches the closeMatches to set + */ + public void setCloseMatches(List<String> closeMatches) { + this.closeMatches = closeMatches; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2826 http://archive-access.svn.sourceforge.net/archive-access/?rev=2826&view=rev Author: bradtofel Date: 2009-10-23 00:44:47 +0000 (Fri, 23 Oct 2009) Log Message: ----------- FEATURE: Now handles all the HTTP communications explicitly, as I couldn't find a small HttpClient lib that enabled fine grained control over the various socket operations that can timeout. So, now you can configure specific timeouts. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/FileProxyServlet.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/FileProxyServlet.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/FileProxyServlet.java 2009-10-23 00:40:41 UTC (rev 2825) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/FileProxyServlet.java 2009-10-23 00:44:47 UTC (rev 2826) @@ -29,8 +29,10 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.RandomAccessFile; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.Socket; import java.net.URL; -import java.net.URLConnection; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -39,6 +41,10 @@ import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; +import org.apache.commons.httpclient.ChunkedInputStream; +import org.archive.util.anvl.ANVLRecord; +import org.archive.wayback.util.http.HttpRequestMessage; +import org.archive.wayback.util.http.HttpResponse; import org.archive.wayback.webapp.ServletRequestContext; /** @@ -71,6 +77,10 @@ private static final long serialVersionUID = 1L; private ResourceFileLocationDB locationDB = null; + + private 
int socketTimeoutMs = 5000; + + private int connectTimeoutMs = 1000; public boolean handleRequest(HttpServletRequest httpRequest, HttpServletResponse httpResponse) throws IOException, @@ -114,6 +124,7 @@ httpResponse.setStatus(HttpServletResponse.SC_OK); // BUGBUG: this will be broken for non compressed data... httpResponse.setContentType(ds.getContentType()); + httpResponse.setBufferSize(BUF_SIZE); ds.copyTo(httpResponse.getOutputStream()); } } @@ -126,15 +137,51 @@ DataSource ds = null; if(location.startsWith("http://")) { URL url = new URL(location); - URLConnection conn = url.openConnection(); + String hostname = url.getHost(); + int port = url.getPort(); + if(port == -1) { + port = 80; + } + byte GET[] = "GET".getBytes(); + byte HTTP11[] = "HTTP/1.1".getBytes(); + InetAddress addr = InetAddress.getByName(hostname); + HttpRequestMessage requestMessage = new HttpRequestMessage( + GET,url.getFile().getBytes(),HTTP11); + ANVLRecord headers = new ANVLRecord(); + headers.addLabelValue("Host", hostname); + + if(offset != 0) { - conn.addRequestProperty(RANGE_HTTP_HEADER, + headers.addLabelValue(RANGE_HTTP_HEADER, HEADER_BYTES_PREFIX + String.valueOf(offset) + - HEADER_BYTES_SUFFIX); + HEADER_BYTES_SUFFIX); } + InetSocketAddress sockAddr = new InetSocketAddress(addr,port); + Socket socket = new Socket(); + socket.setSoTimeout(socketTimeoutMs); + socket.setReceiveBufferSize(BUF_SIZE); - ds = new URLDataSource(conn.getInputStream(),conn.getContentType()); + socket.connect(sockAddr, connectTimeoutMs); + OutputStream socketOut = socket.getOutputStream(); + InputStream socketIn = socket.getInputStream(); + socketOut.write(requestMessage.getBytes(true)); + socketOut.write(headers.getUTF8Bytes()); + socketOut.flush(); + HttpResponse response = HttpResponse.load(socketIn); + String contentType = response.getHeaders().asMap().get("Content-Type"); + if(contentType == null) { + contentType = "application/unknown"; + } + String xferEncoding = 
response.getHeaders().asMap().get("Transfer-Encoding"); + if(xferEncoding != null) { + if(xferEncoding.equals("chunked")) { + socketIn = new ChunkedInputStream(socketIn); + } + } + + ds = new URLDataSource(socketIn,contentType); + } else { // assume a local file path: File f = new File(location); @@ -204,6 +251,34 @@ this.locationDB = locationDB; } + /** + * @return the socketTimeoutMs + */ + public int getSocketTimeoutMs() { + return socketTimeoutMs; + } + + /** + * @param socketTimeoutMs the socketTimeoutMs to set + */ + public void setSocketTimeoutMs(int socketTimeoutMs) { + this.socketTimeoutMs = socketTimeoutMs; + } + + /** + * @return the connectTimeoutMs + */ + public int getConnectTimeoutMs() { + return connectTimeoutMs; + } + + /** + * @param connectTimeoutMs the connectTimeoutMs to set + */ + public void setConnectTimeoutMs(int connectTimeoutMs) { + this.connectTimeoutMs = connectTimeoutMs; + } + private class ResourceLocation { private String name = null; private long offset = 0; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2825 http://archive-access.svn.sourceforge.net/archive-access/?rev=2825&view=rev Author: bradtofel Date: 2009-10-23 00:40:41 +0000 (Fri, 23 Oct 2009) Log Message: ----------- Now explicitly adds the content-length header from the original HTTP headers, in case the configured HttpHeaderProcessor did not include it. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java 2009-10-23 00:39:07 UTC (rev 2824) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java 2009-10-23 00:40:41 UTC (rev 2825) @@ -67,6 +67,14 @@ Map<String,String> headers = HttpHeaderOperation.processHeaders( resource, result, uriConverter, httpHeaderProcessor); + + // HACKHACK: getContentLength() may not find the original content length + // if a HttpHeaderProcessor has mangled it too badly. Should this + // happen in the HttpHeaderProcessor itself? + String origLength = HttpHeaderOperation.getContentLength(headers); + if(origLength != null) { + headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, origLength); + } HttpHeaderOperation.sendHeaders(headers, httpResponse); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-23 00:39:17
|
Revision: 2824 http://archive-access.svn.sourceforge.net/archive-access/?rev=2824&view=rev Author: bradtofel Date: 2009-10-23 00:39:07 +0000 (Fri, 23 Oct 2009) Log Message: ----------- FEATURE: Now can handle an extra robot-meta tag info field. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXLineAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2009-10-23 00:37:11 UTC (rev 2823) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2009-10-23 00:39:07 UTC (rev 2824) @@ -70,8 +70,13 @@ public static CaptureSearchResult doAdapt(String line) { CaptureSearchResult result = new CaptureSearchResult(); String[] tokens = line.split(" "); + boolean hasRobotFlags = false; if (tokens.length != 9) { - return null; + if(tokens.length == 10) { + hasRobotFlags = true; + } else { + return null; + } //throw new IllegalArgumentException("Need 9 columns("+line+")"); } String urlKey = tokens[0]; @@ -91,10 +96,17 @@ String digest = tokens[5]; String redirectUrl = tokens[6]; long compressedOffset = -1; - if(!tokens[7].equals("-")) { - compressedOffset = Long.parseLong(tokens[7]); + int nextToken = 7; + if(hasRobotFlags) { + result.setRobotFlags(tokens[nextToken]); + nextToken++; } - String fileName = tokens[8]; + + if(!tokens[nextToken].equals("-")) { + compressedOffset = Long.parseLong(tokens[nextToken]); + } + nextToken++; + String fileName = 
tokens[nextToken]; result.setUrlKey(urlKey); result.setCaptureTimestamp(captureTS); result.setOriginalUrl(originalUrl); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXLineAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXLineAdapter.java 2009-10-23 00:37:11 UTC (rev 2823) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXLineAdapter.java 2009-10-23 00:39:07 UTC (rev 2824) @@ -41,7 +41,14 @@ private static int DEFAULT_CAPACITY = 120; private final static String DELIMITER = " "; + private boolean outputRobot = false; + public boolean isOutputRobot() { + return outputRobot; + } + public void setIsOutputRobot(boolean isOutputRobot) { + this.outputRobot = isOutputRobot; + } /* (non-Javadoc) * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) */ @@ -63,15 +70,29 @@ sb.append(DELIMITER); sb.append(result.getRedirectUrl()); sb.append(DELIMITER); + if(outputRobot) { + String robotFlags = result.getRobotFlags(); + if(robotFlags == null || robotFlags.equals("")) { + robotFlags = "-"; + } + sb.append(robotFlags); + sb.append(DELIMITER); + } sb.append(result.getOffset()); sb.append(DELIMITER); sb.append(result.getFile()); return sb.toString(); } - public static Iterator<String> adapt(Iterator<CaptureSearchResult> input) { - return new AdaptedIterator<CaptureSearchResult,String>(input, - new SearchResultToCDXLineAdapter()); + return adapt(input,false); } + + public static Iterator<String> adapt(Iterator<CaptureSearchResult> input, + boolean isOutputRobot) { + SearchResultToCDXLineAdapter adapter = + new SearchResultToCDXLineAdapter(); + adapter.setIsOutputRobot(isOutputRobot); + return new AdaptedIterator<CaptureSearchResult,String>(input,adapter); + } } This 
was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2823 http://archive-access.svn.sourceforge.net/archive-access/?rev=2823&view=rev Author: bradtofel Date: 2009-10-23 00:37:11 +0000 (Fri, 23 Oct 2009) Log Message: ----------- TWEAK: changed scope of adaptIterator() to allow subclasses to use it Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXIndex.java 2009-10-23 00:35:10 UTC (rev 2822) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXIndex.java 2009-10-23 00:37:11 UTC (rev 2823) @@ -49,7 +49,8 @@ */ private static final long serialVersionUID = 1L; - private CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr) { + protected CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr) + throws IOException { return new AdaptedIterator<String,CaptureSearchResult>(itr, new CDXLineToSearchResultAdapter()); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-23 00:35:20
|
Revision: 2822 http://archive-access.svn.sourceforge.net/archive-access/?rev=2822&view=rev Author: bradtofel Date: 2009-10-23 00:35:10 +0000 (Fri, 23 Oct 2009) Log Message: ----------- REFACTOR: moved functionality from adapters to filters. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java 2009-10-23 00:34:19 UTC (rev 2821) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -1,99 +0,0 @@ -/* ConditionalGetAnnotationSearchResultAdapter - * - * $Id$ - * - * Created on 6:09:05 PM Mar 12, 2009. - * - * Copyright (C) 2009 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. 
- * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.adapters; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.Adapter; - -/** - * WARC file allows 2 forms of deduplication. The first actually downloads - * documents and compares their digest with a database of previous values. When - * a new capture of a document exactly matches the previous digest, an - * abbreviated record is stored in the WARC file. The second form uses an HTTP - * conditional GET request, sending previous values returned for a given URL - * (etag, last-modified, etc). In this case, the remote server either sends a - * new document (200) which is stored normally, or the server will return a - * 304 (Not Modified) response, which is stored in the WARC file. - * - * For the first record type, the wayback indexer will output a placeholder - * record that includes the digest of the last-stored record. For 304 responses, - * the indexer outputs a normal looking record, but the record will have a - * SHA1 digest which is easily distinguishable as an "empty" document. The SHA1 - * is always: - * - * 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - * - * This class will observe a stream of SearchResults, storing the values for - * the last seen non-empty SHA1 field. Any subsequent SearchResults with an - * empty SHA1 will be annotated, copying the values from the last non-empty - * record. - * - * This is highly experimental. 
- * - * @author brad - * @version $Date$, $Revision$ - */ - -public class ConditionalGetAnnotationSearchResultAdapter -implements Adapter<CaptureSearchResult,CaptureSearchResult> { - - private final static String EMPTY_VALUE = "-"; - private final static String EMPTY_SHA1 = "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"; - - private CaptureSearchResult lastSeen = null; - - public ConditionalGetAnnotationSearchResultAdapter() { - } - - private CaptureSearchResult annotate(CaptureSearchResult o) { - if(lastSeen == null) { - // TODO: log missing record digest reference - return null; - } - o.setFile(lastSeen.getFile()); - o.setOffset(lastSeen.getOffset()); - o.setDigest(lastSeen.getDigest()); - o.setHttpCode(lastSeen.getHttpCode()); - o.setMimeType(lastSeen.getMimeType()); - o.setRedirectUrl(lastSeen.getRedirectUrl()); - o.flagDuplicateHTTP(lastSeen.getCaptureTimestamp()); - return o; - } - - private CaptureSearchResult remember(CaptureSearchResult o) { - lastSeen = o; - return o; - } - - public CaptureSearchResult adapt(CaptureSearchResult o) { - if(o.getFile().equals(EMPTY_VALUE)) { - if(o.getDigest().equals(EMPTY_SHA1)) { - return annotate(o); - } - return o; - } - return remember(o); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java 2009-10-23 00:34:19 UTC (rev 2821) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -1,65 +0,0 @@ -package org.archive.wayback.resourceindex.adapters; - -import java.util.HashMap; - -import org.archive.wayback.core.CaptureSearchResult; -import 
org.archive.wayback.util.Adapter; - -/** - * Adapter class that observes a stream of SearchResults tracking for each - * complete record, a mapping of that records digest to: - * Arc/Warc Filename - * Arc/Warc offset - * HTTP Response - * MIME-Type - * Redirect URL - * - * If subsequent SearchResults are missing these fields ("-") and the Digest - * field has been seen, then the subsequent SearchResults are updated with the - * values from the kept copy matching that digest, and an additional annotation - * field is added. - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class DeduplicationSearchResultAnnotationAdapter -implements Adapter<CaptureSearchResult,CaptureSearchResult> { - private final static String EMPTY_VALUE = "-"; - private final static String REVISIT_VALUE = "warc/revisit"; - - private HashMap<String,CaptureSearchResult> memory = null; - - public DeduplicationSearchResultAnnotationAdapter() { - memory = new HashMap<String,CaptureSearchResult>(); - } - - private CaptureSearchResult annotate(CaptureSearchResult o) { - String thisDigest = o.getDigest(); - CaptureSearchResult last = memory.get(thisDigest); - if(last == null) { - // TODO: log missing record digest reference - return null; - } - o.setFile(last.getFile()); - o.setOffset(last.getOffset()); - o.setHttpCode(last.getHttpCode()); - o.setMimeType(last.getMimeType()); - o.setRedirectUrl(last.getRedirectUrl()); - o.flagDuplicateDigest(last.getCaptureTimestamp()); - return o; - } - - private CaptureSearchResult remember(CaptureSearchResult o) { - memory.put(o.getDigest(),o); - return o; - } - - public CaptureSearchResult adapt(CaptureSearchResult o) { - if(o.getFile().equals(EMPTY_VALUE) - || o.getMimeType().equals(REVISIT_VALUE)) { - return annotate(o); - } - return remember(o); - } -} \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -0,0 +1,72 @@ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +/** + * WARC file allows 2 forms of deduplication. The first actually downloads + * documents and compares their digest with a database of previous values. When + * a new capture of a document exactly matches the previous digest, an + * abbreviated record is stored in the WARC file. The second form uses an HTTP + * conditional GET request, sending previous values returned for a given URL + * (etag, last-modified, etc). In this case, the remote server either sends a + * new document (200) which is stored normally, or the server will return a + * 304 (Not Modified) response, which is stored in the WARC file. + * + * For the first record type, the wayback indexer will output a placeholder + * record that includes the digest of the last-stored record. For 304 responses, + * the indexer outputs a normal looking record, but the record will have a + * SHA1 digest which is easily distinguishable as an "empty" document. The SHA1 + * is always: + * + * 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ + * + * This class will observe a stream of SearchResults, storing the values for + * the last seen non-empty SHA1 field. Any subsequent SearchResults with an + * empty SHA1 will be annotated, copying the values from the last non-empty + * record. + * + * This is highly experimental. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class ConditionalGetAnnotationFilter +implements ObjectFilter<CaptureSearchResult> { + + private final static String EMPTY_VALUE = "-"; + private final static String EMPTY_SHA1 = "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"; + + private CaptureSearchResult lastSeen = null; + + private int annotate(CaptureSearchResult o) { + if(lastSeen == null) { + // TODO: log missing record digest reference + return FILTER_EXCLUDE; + } + o.setFile(lastSeen.getFile()); + o.setOffset(lastSeen.getOffset()); + o.setDigest(lastSeen.getDigest()); + o.setHttpCode(lastSeen.getHttpCode()); + o.setMimeType(lastSeen.getMimeType()); + o.setRedirectUrl(lastSeen.getRedirectUrl()); + o.flagDuplicateHTTP(lastSeen.getCaptureTimestamp()); + return FILTER_INCLUDE; + } + + private int remember(CaptureSearchResult o) { + lastSeen = o; + return FILTER_INCLUDE; + } + + public int filterObject(CaptureSearchResult o) { + if(o.getFile().equals(EMPTY_VALUE)) { + if(o.getDigest().equals(EMPTY_SHA1)) { + return annotate(o); + } + return FILTER_INCLUDE; + } + return remember(o); + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -0,0 +1,73 @@ +package org.archive.wayback.resourceindex.filters; + +import java.util.HashMap; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +/** + * Filter class that observes a stream of SearchResults tracking for each + * complete record, a mapping of that records Digest 
to: + * Arc/Warc Filename + * Arc/Warc offset + * HTTP Response + * MIME-Type + * Redirect URL + * + * If subsequent SearchResults are missing these fields ("-") and the Digest + * field is in the map, then the SearchResults missing fields are replaced with + * the values from the previously seen record with the same digest, and an + * additional annotation field is added. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class WARCRevisitAnnotationFilter +implements ObjectFilter<CaptureSearchResult> { + + private final static String EMPTY_VALUE = "-"; + private final static String REVISIT_VALUE = "warc/revisit"; + + private HashMap<String,CaptureSearchResult> memory = null; + + public WARCRevisitAnnotationFilter() { + memory = new HashMap<String,CaptureSearchResult>(); + } + + private int annotate(CaptureSearchResult o) { + String thisDigest = o.getDigest(); + CaptureSearchResult last = memory.get(thisDigest); + if(last == null) { + // TODO: log missing record digest reference? + return FILTER_EXCLUDE; + } + o.setFile(last.getFile()); + o.setOffset(last.getOffset()); + o.setHttpCode(last.getHttpCode()); + o.setMimeType(last.getMimeType()); + o.setRedirectUrl(last.getRedirectUrl()); + o.flagDuplicateDigest(last.getCaptureTimestamp()); + return FILTER_INCLUDE; + } + + private int remember(CaptureSearchResult o) { + memory.put(o.getDigest(),o); + return FILTER_INCLUDE; + } + +// public CaptureSearchResult adapt(CaptureSearchResult o) { +// if(o.getFile().equals(EMPTY_VALUE) +// || o.getMimeType().equals(REVISIT_VALUE)) { +// return annotate(o); +// } +// return remember(o); +// } + + public int filterObject(CaptureSearchResult o) { + if(o.getFile().equals(EMPTY_VALUE) + || o.getMimeType().equals(REVISIT_VALUE)) { + return annotate(o); + } + return remember(o); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2821 http://archive-access.svn.sourceforge.net/archive-access/?rev=2821&view=rev Author: bradtofel Date: 2009-10-23 00:34:19 +0000 (Fri, 23 Oct 2009) Log Message: ----------- functionality moved into CaptureSearchResult -- it attempts to create an original url from the original host and the url key in the same way this class did. Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java 2009-10-23 00:14:47 UTC (rev 2820) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentitySearchResultAdapter.java 2009-10-23 00:34:19 UTC (rev 2821) @@ -1,74 +0,0 @@ -/* LegacyToIdentityFilter - * - * $Id$ - * - * Created on 11:48:56 AM Jul 10, 2008. - * - * Copyright (C) 2008 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.adapters; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.ObjectFilter; -import org.archive.wayback.util.url.UrlOperations; - -/** - * CaptureSearchResult ObjectFilter which passes through all inputs, modifying - * each to construct a corrected original URL to comply with new Identity - * format. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class LegacyToIdentitySearchResultAdapter implements ObjectFilter<CaptureSearchResult> { - private final static String DEFAULT_SCHEME = "http://"; - - private int getEndOfHostIndex(String url) { - int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); - int pathIdx = url.indexOf(UrlOperations.PATH_START); - if(portIdx == -1 && pathIdx == -1) { - return url.length(); - } - if(portIdx == -1) { - return pathIdx; - } - if(pathIdx == -1) { - return portIdx; - } - if(pathIdx > portIdx) { - return portIdx; - } else { - return pathIdx; - } - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) - */ - public int filterObject(CaptureSearchResult o) { - String urlKey = o.getUrlKey(); - StringBuilder sb = new StringBuilder(urlKey.length()); - sb.append(DEFAULT_SCHEME); - sb.append(o.getOriginalUrl()); - sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); - o.setOriginalUrl(sb.toString()); - return FILTER_INCLUDE; - } - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2819 http://archive-access.svn.sourceforge.net/archive-access/?rev=2819&view=rev Author: bradtofel Date: 2009-10-23 00:12:40 +0000 (Fri, 23 Oct 2009) Log Message: ----------- FEATURE: now optionally annotates close matches that were excluded. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java 2009-10-22 23:44:05 UTC (rev 2818) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java 2009-10-23 00:12:40 UTC (rev 2819) @@ -25,6 +25,7 @@ package org.archive.wayback.resourceindex.filters; import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.filterfactory.QueryCaptureFilterGroup; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.url.UrlOperations; @@ -39,13 +40,22 @@ public class SchemeMatchFilter implements ObjectFilter<CaptureSearchResult> { private String scheme = null; - + private QueryCaptureFilterGroup annotationTarget = null; + /** * @param hostname String of original host to match */ public SchemeMatchFilter(final String scheme) { this.scheme = scheme; } + /** + * @param hostname String of original host to match + */ + public SchemeMatchFilter(final String scheme, + QueryCaptureFilterGroup annotationTarget) { + this.scheme = scheme; + this.annotationTarget = annotationTarget; + } /* (non-Javadoc) * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) @@ -53,8 +63,23 @@ public int filterObject(CaptureSearchResult r) { String captureScheme = 
UrlOperations.urlToScheme(r.getOriginalUrl()); if(scheme == null) { - return captureScheme == null ? FILTER_INCLUDE : FILTER_EXCLUDE; + if(captureScheme == null) { + return FILTER_INCLUDE; + } else { + annotationTarget.addCloseMatch(r.getOriginalHost(), + r.getOriginalUrl()); + return FILTER_EXCLUDE; + } } - return scheme.equals(captureScheme) ? FILTER_INCLUDE : FILTER_EXCLUDE; + + if(scheme.equals(captureScheme)) { + return FILTER_INCLUDE; + } else { + if(annotationTarget != null) { + annotationTarget.addCloseMatch(r.getOriginalHost(), + r.getOriginalUrl()); + } + return FILTER_EXCLUDE; + } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2820 http://archive-access.svn.sourceforge.net/archive-access/?rev=2820&view=rev Author: bradtofel Date: 2009-10-23 00:14:47 +0000 (Fri, 23 Oct 2009) Log Message: ----------- FEATURE: added convenience constructor which takes UrlCanonicalizer Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java 2009-10-23 00:12:40 UTC (rev 2819) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java 2009-10-23 00:14:47 UTC (rev 2820) @@ -39,10 +39,13 @@ */ public class SelfRedirectFilter implements ObjectFilter<CaptureSearchResult> { - private UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + private UrlCanonicalizer canonicalizer = null; public SelfRedirectFilter() { canonicalizer = new AggressiveUrlCanonicalizer(); } + public SelfRedirectFilter(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } /* (non-Javadoc) * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-22 23:44:12
|
Revision: 2818 http://archive-access.svn.sourceforge.net/archive-access/?rev=2818&view=rev Author: bradtofel Date: 2009-10-22 23:44:05 +0000 (Thu, 22 Oct 2009) Log Message: ----------- TWEAK: added SuppressWarnings Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/operator/Utils.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/operator/Utils.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/operator/Utils.java 2009-10-22 23:43:15 UTC (rev 2817) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/operator/Utils.java 2009-10-22 23:44:05 UTC (rev 2818) @@ -4,6 +4,7 @@ import java.util.List; public class Utils { + @SuppressWarnings("unchecked") public static <T> List<BooleanOperator<T>> getOperators(BooleanOperator<T> top) { ArrayList<BooleanOperator<T>> operators = new ArrayList<BooleanOperator<T>>(); ArrayList<BooleanOperator<T>> toInspect = new ArrayList<BooleanOperator<T>>(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-22 23:43:24
|
Revision: 2817 http://archive-access.svn.sourceforge.net/archive-access/?rev=2817&view=rev Author: bradtofel Date: 2009-10-22 23:43:15 +0000 (Thu, 22 Oct 2009) Log Message: ----------- TWEAK: made File member protected. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2009-10-22 23:42:14 UTC (rev 2816) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2009-10-22 23:43:15 UTC (rev 2817) @@ -48,7 +48,7 @@ private static final long serialVersionUID = 6174187801001601557L; private long lastMatchOffset; - private File file = null; + protected File file = null; /** * */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-22 23:42:23
|
Revision: 2816 http://archive-access.svn.sourceforge.net/archive-access/?rev=2816&view=rev Author: bradtofel Date: 2009-10-22 23:42:14 +0000 (Thu, 22 Oct 2009) Log Message: ----------- FEATURE: now attempts to dechunkify Chunk-Encoded streams. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2009-10-22 23:41:35 UTC (rev 2815) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2009-10-22 23:42:14 UTC (rev 2816) @@ -28,6 +28,8 @@ import java.io.InputStream; import java.util.Map; +import org.apache.commons.httpclient.ChunkedInputStream; + /** * Abstraction on top of a document stored in a WaybackCollection. Currently * implemented subclasses include ArcResource and WarcResource. @@ -58,6 +60,15 @@ } } /** + * indicate that there is a Transfer-Encoding: chunked header, so the input + * data should be dechunked as it is read. 
+ * @throws IOException + */ + public void setChunkedEncoding() throws IOException { + validate(); + is = new ChunkedInputStream(is); + } + /** * @return * @throws IOException * @see java.io.BufferedInputStream#available() Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java 2009-10-22 23:41:35 UTC (rev 2815) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java 2009-10-22 23:42:14 UTC (rev 2816) @@ -14,6 +14,7 @@ import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCRecord; import org.archive.wayback.core.Resource; +import org.archive.wayback.replay.HttpHeaderOperation; public class ArcResource extends Resource { /** @@ -78,6 +79,13 @@ String value = headers[i].getValue(); String name = headers[i].getName(); metaData.put(HTTP_HEADER_PREFIX + name,value); + if(name.toUpperCase().contains( + HttpHeaderOperation.HTTP_TRANSFER_ENC_HEADER)) { + if(value.toUpperCase().contains( + HttpHeaderOperation.HTTP_CHUNKED_ENCODING_HEADER)) { + setChunkedEncoding(); + } + } } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java 2009-10-22 23:41:35 UTC (rev 2815) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java 2009-10-22 23:42:14 UTC (rev 2816) @@ -13,6 +13,7 @@ import org.archive.io.warc.WARCReader; import org.archive.io.warc.WARCRecord; 
import org.archive.wayback.core.Resource; +import org.archive.wayback.replay.HttpHeaderOperation; public class WarcResource extends Resource { private WARCRecord rec = null; @@ -66,10 +67,17 @@ Header[] tmpHeaders = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); headers = new Hashtable<String,String>(); + this.setInputStream(rec); for(Header header: tmpHeaders) { headers.put(header.getName(), header.getValue()); + if(header.getName().toUpperCase().contains( + HttpHeaderOperation.HTTP_TRANSFER_ENC_HEADER)) { + if(header.getValue().toUpperCase().contains( + HttpHeaderOperation.HTTP_CHUNKED_ENCODING_HEADER)) { + setChunkedEncoding(); + } + } } - this.setInputStream(rec); parsedHeaders = true; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2815 http://archive-access.svn.sourceforge.net/archive-access/?rev=2815&view=rev Author: bradtofel Date: 2009-10-22 23:41:35 +0000 (Thu, 22 Oct 2009) Log Message: ----------- REFACTOR: generalized the header search to allow adding isChunkEncoded() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java 2009-10-22 23:39:32 UTC (rev 2814) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java 2009-10-22 23:41:35 UTC (rev 2815) @@ -42,6 +42,14 @@ * @version $Date$, $Revision$ */ public class HttpHeaderOperation { + public final static String HTTP_LENGTH_HEADER = "Content-Length"; + public final static String HTTP_LENGTH_HEADER_UP = + HTTP_LENGTH_HEADER.toUpperCase(); + public final static String HTTP_TRANSFER_ENC_HEADER = + "Transfer-Encoding".toUpperCase(); + public final static String HTTP_CHUNKED_ENCODING_HEADER = + "chunked".toUpperCase(); + /** * @param resource @@ -102,4 +110,31 @@ response.setHeader(key,value); } } + + public static String getContentLength(Map<String,String> headers) { + return getHeaderValue(headers,HTTP_LENGTH_HEADER); + } + public static boolean isChunkEncoded(Map<String,String> headers) { + String enc = getHeaderValue(headers,HTTP_TRANSFER_ENC_HEADER); + if(enc != null) { + return enc.toUpperCase().contains(HTTP_CHUNKED_ENCODING_HEADER); + } + return false; + } + public static String getHeaderValue(Map<String,String> headers, String k) { + String value = null; + Iterator<String> itr = headers.keySet().iterator(); + String keyUp = k.toUpperCase(); + 
while(itr.hasNext()) { + String key = itr.next(); + if(key != null) { + if(key.toUpperCase().contains(keyUp)) { + value = headers.get(key); + break; + } + } + } + return value; + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-22 23:39:48
|
Revision: 2814 http://archive-access.svn.sourceforge.net/archive-access/?rev=2814&view=rev Author: bradtofel Date: 2009-10-22 23:39:32 +0000 (Thu, 22 Oct 2009) Log Message: ----------- FEATURE: added closeMatches list, for results that do not match current filters, but still may be appropriate given a users request -- these closeMatches can be offered as alternative requests Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResults.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResults.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResults.java 2009-10-22 23:37:57 UTC (rev 2813) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResults.java 2009-10-22 23:39:32 UTC (rev 2814) @@ -25,6 +25,7 @@ package org.archive.wayback.core; import java.util.HashMap; +import java.util.List; import java.util.Map; /** @@ -59,7 +60,13 @@ * document returned", etc. */ private HashMap<String,String> filters = null; + /** + * List of URL strings that were not included in these results, but may be + * what the user was looking for. + */ + private List<String> closeMatches = null; + /** * Constructor */ public SearchResults() { @@ -192,4 +199,13 @@ int curPage = (int) Math.floor(firstResult/resultsPerPage) + 1; return curPage; } + + + public List<String> getCloseMatches() { + return closeMatches; + } + + public void setCloseMatches(List<String> closeMatches) { + this.closeMatches = closeMatches; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-22 23:38:11
|
Revision: 2813 http://archive-access.svn.sourceforge.net/archive-access/?rev=2813&view=rev Author: bradtofel Date: 2009-10-22 23:37:57 +0000 (Thu, 22 Oct 2009) Log Message: ----------- REFACTOR: moved request-specific resultFilter creation here from LocalResourceIndex -- these should still be moved into a "query parser" type class.. FEATURE: added charset "mode" feature, which will allow the user to specify one of several strategies on a request-by-request basis. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java 2009-10-22 23:34:57 UTC (rev 2812) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java 2009-10-22 23:37:57 UTC (rev 2813) @@ -36,7 +36,10 @@ import javax.servlet.http.HttpServletRequest; import org.archive.wayback.requestparser.OpenSearchRequestParser; +import org.archive.wayback.resourceindex.filters.HostMatchFilter; +import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; import org.archive.wayback.util.StringFormatter; import org.archive.wayback.util.Timestamp; import org.archive.wayback.util.url.UrlOperations; @@ -76,10 +79,22 @@ /** * custom CaptureSearchResult Filter to use for this specific request. Can * be null, and is sometimes useful to allow an AccessPoint to have specific - * and possibly variable filters. + * and possibly variable filters. These filters relate specifically to + * exclusion of results from the ResourceIndex. 
Compared to the + * resultFilters, if these filters redact all results, then an + * AccessControlException will be thrown. */ private ObjectFilter<CaptureSearchResult> exclusionFilter = null; + /** + * custom CaptureSearchResult Filter to use for this specific request. Can + * be null, and is sometimes useful to allow an AccessPoint to have specific + * and possibly variable filters. + */ + private ObjectFilterChain<CaptureSearchResult> resultFilters = null; + + + /** * StringFormatter object set up with the users specific Locale, and the * Wayback UI ResourceBundle prepared for use, simplifying UI generation * somewhat. @@ -245,6 +260,12 @@ * Request: IMG context requested */ public static final String REQUEST_IMAGE_CONTEXT = "imagecontext"; + + /** + * Request: Charset detection mode + */ + public static final String REQUEST_CHARSET_MODE = "charsetmode"; + /* * ******************************* * /OUTPUT TYPE CONSTANTS @@ -467,6 +488,35 @@ this.exclusionFilter = exclusionFilter; } + public ObjectFilter<CaptureSearchResult> getResultFilters() { + ObjectFilterChain<CaptureSearchResult> tmpFilters = + new ObjectFilterChain<CaptureSearchResult>(); + if(isExactHost()) { + tmpFilters.addFilter(new HostMatchFilter( + UrlOperations.urlToHost(getRequestUrl()))); + } + + if(isExactScheme()) { + tmpFilters.addFilter(new SchemeMatchFilter( + UrlOperations.urlToScheme(getRequestUrl()))); + } + if(resultFilters != null) { + tmpFilters.addFilters(resultFilters.getFilters()); + } + return tmpFilters; + } + + public void setResultFilters(ObjectFilterChain<CaptureSearchResult> resultFilters) { + this.resultFilters = resultFilters; + } + + public void addResultFilter(ObjectFilter<CaptureSearchResult> resultFilter) { + if(resultFilters == null) { + resultFilters = new ObjectFilterChain<CaptureSearchResult>(); + } + resultFilters.addFilter(resultFilter); + } + /** * @return StringFormatter based on user request info */ @@ -502,7 +552,18 @@ } else { remove(key); } + } + + private int 
getInt(String key) { + String value = get(key); + if(value == null) { + return -1; + } + return Integer.parseInt(value); } + private void setInt(String key, int value) { + put(key,String.valueOf(value)); + } private boolean getBoolean(String key) { String value = get(key); return(value != null && value.equals(REQUEST_YES)); @@ -549,7 +610,7 @@ return isRequestType(REQUEST_CAPTURE_QUERY); } /** - * marks this request as a Replay request + * marks this request as a Capture Query request */ public void setCaptureQueryRequest() { put(REQUEST_TYPE,REQUEST_CAPTURE_QUERY); @@ -561,7 +622,7 @@ return isRequestType(REQUEST_URL_QUERY); } /** - * marks this request as a Replay request + * marks this request as an Url Query request */ public void setUrlQueryRequest() { put(REQUEST_TYPE,REQUEST_URL_QUERY); @@ -711,6 +772,14 @@ return getBoolean(REQUEST_IMAGE_CONTEXT); } + public void setCharsetMode(int mode) { + setInt(REQUEST_CHARSET_MODE,mode); + } + public int getCharsetMode() { + int mode = getInt(REQUEST_CHARSET_MODE); + return (mode == -1) ? 0 : mode; + } + public String getWaybackContext() { return get(REQUEST_WAYBACK_CONTEXT); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-22 23:35:11
|
Revision: 2812 http://archive-access.svn.sourceforge.net/archive-access/?rev=2812&view=rev Author: bradtofel Date: 2009-10-22 23:34:57 +0000 (Thu, 22 Oct 2009) Log Message: ----------- REFACTOR: moved all character encoding detection into CharsetDetector interface. Two initial implementations, one which implements the previous behavior, and another which allows a user to "rotate" through different detection strategies. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java 2009-10-19 22:55:27 UTC (rev 2811) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -28,11 +28,7 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.UnsupportedEncodingException; -import java.nio.charset.Charset; -import java.nio.charset.IllegalCharsetNameException; import java.text.ParseException; -import java.util.Iterator; -import java.util.Map; import 
javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; @@ -44,7 +40,6 @@ import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.UIResults; import org.archive.wayback.core.WaybackRequest; -import org.mozilla.universalchardet.UniversalDetector; /** * Class which wraps functionality for converting a Resource(InputStream + @@ -56,13 +51,6 @@ * @version $Date$, $Revision$ */ public class TextDocument { - - // hand off this many bytes to the chardet library - private final static int MAX_CHARSET_READAHEAD = 65536; - // ...if it also includes "charset=" - private final static String CHARSET_TOKEN = "charset="; - // ...and if the chardet library fails, use the Content-Type header - private final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type"; // if documents are marked up before sending to clients, the data is // decoded into a String in chunks. This is how big a chunk to decode with. private final static int C_BUFFER_SIZE = 4096; @@ -89,155 +77,8 @@ this.uriConverter = uriConverter; } - private boolean isCharsetSupported(String charsetName) { - // can you believe that this throws a runtime? Just asking if it's - // supported!!?! They coulda just said "no"... - if(charsetName == null) { - return false; - } - try { - return Charset.isSupported(charsetName); - } catch(IllegalCharsetNameException e) { - return false; - } - } - - private String contentTypeToCharset(final String contentType) { - int offset = - contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase()); - - if (offset != -1) { - String cs = contentType.substring(offset + CHARSET_TOKEN.length()); - if(isCharsetSupported(cs)) { - return cs; - } - // test for extra spaces... there's at least one page out there that - // indicates it's charset with: -// <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1"> - - // bad web page! 
- if(isCharsetSupported(cs.replace(" ", ""))) { - return cs.replace(" ", ""); - } - } - return null; - } - /** - * Attempt to divine the character encoding of the document from the - * Content-Type HTTP header (with a "charset=") - * - * @param resource - * @return String character set found or null if the header was not present - * @throws IOException - */ - protected String getCharsetFromHeaders(Resource resource) - throws IOException { - - String charsetName = null; - - Map<String,String> httpHeaders = resource.getHttpHeaders(); - Iterator<String> keys = httpHeaders.keySet().iterator(); - String ctype = null; - while(keys.hasNext()) { - String headerKey = keys.next(); - String keyCmp = headerKey.toUpperCase().trim(); - if(keyCmp.equals(HTTP_CONTENT_TYPE_HEADER.toUpperCase())) { - ctype = httpHeaders.get(headerKey); - break; - } - } - if (ctype != null) { - charsetName = contentTypeToCharset(ctype); - } - return charsetName; - } - - /** - * Attempt to find a META tag in the HTML that hints at the character set - * used to write the document. - * - * @param resource - * @return String character set found from META tags in the HTML - * @throws IOException - */ - protected String getCharsetFromMeta(Resource resource) throws IOException { - String charsetName = null; - - byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; - resource.mark(MAX_CHARSET_READAHEAD); - resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); - resource.reset(); - // convert to UTF-8 String -- which hopefully will not mess up the - // characters we're interested in... - StringBuilder sb = new StringBuilder(new String(bbuffer,"UTF-8")); - String metaContentType = TagMagix.getTagAttrWhere(sb, "META", - "content", "http-equiv", "Content-Type"); - if(metaContentType != null) { - charsetName = contentTypeToCharset(metaContentType); - } - return charsetName; - } - - /** - * Attempts to figure out the character set of the document using - * the excellent juniversalchardet library. 
- * - * @param resource - * @return String character encoding found, or null if nothing looked good. - * @throws IOException - */ - protected String getCharsetFromBytes(Resource resource) throws IOException { - String charsetName = null; - - byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; - // (1) - UniversalDetector detector = new UniversalDetector(null); - - // (2) - resource.mark(MAX_CHARSET_READAHEAD); - int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); - resource.reset(); - detector.handleData(bbuffer, 0, len); - // (3) - detector.dataEnd(); - // (4) - charsetName = detector.getDetectedCharset(); - - // (5) - detector.reset(); - if(isCharsetSupported(charsetName)) { - return charsetName; - } - return null; - } - - /** - * Use META tags, byte-character-detection, HTTP headers, hope, and prayer - * to figure out what character encoding is being used for the document. - * If nothing else works, assumes UTF-8 for now. - * - * @param resource - * @return String charset for Resource - * @throws IOException - */ - protected String guessCharset() throws IOException { - - String charSet = getCharsetFromHeaders(resource); - if(charSet == null) { - charSet = getCharsetFromBytes(resource); - if(charSet == null) { - charSet = getCharsetFromMeta(resource); - if(charSet == null) { - charSet = "UTF-8"; - } - } - } - return charSet; - } - - /** * Update URLs inside the page, so those URLs which must be correct at * page load time resolve correctly to absolute URLs. 
* @@ -346,9 +187,6 @@ * @throws IOException */ public void readFully(String charSet) throws IOException { - if(charSet == null) { - charSet = guessCharset(); - } this.charSet = charSet; int recordLength = (int) resource.getRecordLength(); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java 2009-10-19 22:55:27 UTC (rev 2811) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -39,6 +39,8 @@ import org.archive.wayback.core.Resource; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadContentException; +import org.archive.wayback.replay.charset.CharsetDetector; +import org.archive.wayback.replay.charset.StandardCharsetDetector; /** * @@ -48,12 +50,9 @@ */ public abstract class TextReplayRenderer implements ReplayRenderer { - public final static String HTTP_LENGTH_HEADER = "Content-Length"; - public final static String HTTP_LENGTH_HEADER_UP = - HTTP_LENGTH_HEADER.toUpperCase(); - private List<String> jspInserts = null; private HttpHeaderProcessor httpHeaderProcessor; + private CharsetDetector charsetDetector = new StandardCharsetDetector(); public TextReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { this.httpHeaderProcessor = httpHeaderProcessor; @@ -80,16 +79,17 @@ Map<String,String> headers = HttpHeaderOperation.processHeaders( resource, result, uriConverter, httpHeaderProcessor); + String charSet = charsetDetector.getCharset(resource, wbRequest); // Load content into an HTML page, and resolve load-time URLs: TextDocument page = new TextDocument(resource,result,uriConverter); - page.readFully(); + page.readFully(charSet); 
updatePage(page,httpRequest,httpResponse,wbRequest,result,resource, uriConverter,results); // set the corrected length: int bytes = page.getBytes().length; - headers.put(HTTP_LENGTH_HEADER, String.valueOf(bytes)); + headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, String.valueOf(bytes)); // Tomcat will always send a charset... It's trying to be smarter than // we are. If the original page didn't include a "charset" as part of // the "Content-Type" HTTP header, then Tomcat will use the default.. @@ -117,4 +117,18 @@ public void setJspInserts(List<String> jspInserts) { this.jspInserts = jspInserts; } + + /** + * @return the charsetDetector + */ + public CharsetDetector getCharsetDetector() { + return charsetDetector; + } + + /** + * @param charsetDetector the charsetDetector to set + */ + public void setCharsetDetector(CharsetDetector charsetDetector) { + this.charsetDetector = charsetDetector; + } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -0,0 +1,148 @@ +package org.archive.wayback.replay.charset; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.util.Iterator; +import java.util.Map; + +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.replay.TagMagix; +import org.mozilla.universalchardet.UniversalDetector; + +public abstract class CharsetDetector { + // hand off this many bytes to the chardet library + protected final static int MAX_CHARSET_READAHEAD = 65536; + // ...if 
it also includes "charset=" + protected final static String CHARSET_TOKEN = "charset="; + // ...and if the chardet library fails, use the Content-Type header + protected final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type"; + public final static String DEFAULT_CHARSET = "UTF-8"; + + protected boolean isCharsetSupported(String charsetName) { + // can you believe that this throws a runtime? Just asking if it's + // supported!!?! They coulda just said "no"... + if(charsetName == null) { + return false; + } + try { + return Charset.isSupported(charsetName); + } catch(IllegalCharsetNameException e) { + return false; + } + } + + protected String contentTypeToCharset(final String contentType) { + int offset = + contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase()); + + if (offset != -1) { + String cs = contentType.substring(offset + CHARSET_TOKEN.length()); + if(isCharsetSupported(cs)) { + return cs; + } + // test for extra spaces... there's at least one page out there that + // indicates it's charset with: + +// <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1"> + + // bad web page! 
+ if(isCharsetSupported(cs.replace(" ", ""))) { + return cs.replace(" ", ""); + } + } + return null; + } + + /** + * Attempt to divine the character encoding of the document from the + * Content-Type HTTP header (with a "charset=") + * + * @param resource + * @return String character set found or null if the header was not present + * @throws IOException + */ + protected String getCharsetFromHeaders(Resource resource) + throws IOException { + + String charsetName = null; + + Map<String,String> httpHeaders = resource.getHttpHeaders(); + Iterator<String> keys = httpHeaders.keySet().iterator(); + String ctype = null; + while(keys.hasNext()) { + String headerKey = keys.next(); + String keyCmp = headerKey.toUpperCase().trim(); + if(keyCmp.equals(HTTP_CONTENT_TYPE_HEADER.toUpperCase())) { + ctype = httpHeaders.get(headerKey); + break; + } + } + if (ctype != null) { + charsetName = contentTypeToCharset(ctype); + } + return charsetName; + } + + /** + * Attempt to find a META tag in the HTML that hints at the character set + * used to write the document. + * + * @param resource + * @return String character set found from META tags in the HTML + * @throws IOException + */ + protected String getCharsetFromMeta(Resource resource) throws IOException { + String charsetName = null; + + byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; + resource.mark(MAX_CHARSET_READAHEAD); + resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); + resource.reset(); + // convert to UTF-8 String -- which hopefully will not mess up the + // characters we're interested in... + StringBuilder sb = new StringBuilder(new String(bbuffer,DEFAULT_CHARSET)); + String metaContentType = TagMagix.getTagAttrWhere(sb, "META", + "content", "http-equiv", "Content-Type"); + if(metaContentType != null) { + charsetName = contentTypeToCharset(metaContentType); + } + return charsetName; + } + + /** + * Attempts to figure out the character set of the document using + * the excellent juniversalchardet library. 
+ * + * @param resource + * @return String character encoding found, or null if nothing looked good. + * @throws IOException + */ + protected String getCharsetFromBytes(Resource resource) throws IOException { + String charsetName = null; + + byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; + // (1) + UniversalDetector detector = new UniversalDetector(null); + + // (2) + resource.mark(MAX_CHARSET_READAHEAD); + int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); + resource.reset(); + detector.handleData(bbuffer, 0, len); + // (3) + detector.dataEnd(); + // (4) + charsetName = detector.getDetectedCharset(); + + // (5) + detector.reset(); + if(isCharsetSupported(charsetName)) { + return charsetName; + } + return null; + } + public abstract String getCharset(Resource resource, WaybackRequest request) + throws IOException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -0,0 +1,63 @@ +package org.archive.wayback.replay.charset; + +import java.io.IOException; + +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; + +/** + * @author brad + * + * Provides a way to rotate through several detection schemes + */ +public class RotatingCharsetDetector extends CharsetDetector { + public final static int MODES[][] = { + {0,1,2}, + {0,2,1}, + {1,0,2}, + {1,2,0}, + {2,1,0}, + {2,0,1} + }; + public final static int MODE_COUNT = 6; + public final static int GUESS_TYPES = 3; + + public int nextMode(int curMode) { + if(curMode >= MODE_COUNT - 1) { + return 0; + } + return 
curMode + 1; + } + public String getCharsetType(Resource resource, int type) throws IOException { + if(type == 0) { + return getCharsetFromHeaders(resource); + } else if(type == 1) { + return getCharsetFromMeta(resource); + } else if(type == 2) { + return getCharsetFromBytes(resource); + } + return null; + } + public String getCharset(Resource resource, int mode) throws IOException { + String charset = null; + if(mode >= MODE_COUNT) { + mode = 0; + } + for(int type = 0; type < GUESS_TYPES; type++) { + charset = getCharsetType(resource,MODES[mode][type]); + if(charset != null) { + break; + } + } + if(charset == null) { + charset = DEFAULT_CHARSET; + } + return charset; + } + @Override + public String getCharset(Resource resource, WaybackRequest request) + throws IOException { + int mode = request.getCharsetMode(); + return getCharset(resource,mode); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -0,0 +1,25 @@ +package org.archive.wayback.replay.charset; + +import java.io.IOException; + +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; + +public class StandardCharsetDetector extends CharsetDetector { + + @Override + public String getCharset(Resource resource, WaybackRequest request) + throws IOException { + String charSet = getCharsetFromHeaders(resource); + if(charSet == null) { + charSet = getCharsetFromMeta(resource); + if(charSet == null) { + charSet = getCharsetFromBytes(resource); + if(charSet == null) { + charSet = DEFAULT_CHARSET; + } + } + } + return 
charSet; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2811 http://archive-access.svn.sourceforge.net/archive-access/?rev=2811&view=rev Author: bradtofel Date: 2009-10-19 22:55:27 +0000 (Mon, 19 Oct 2009) Log Message: ----------- BUGFIX(ACC-73): comparison of server name is now case-insensitive Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixRequestParser.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixRequestParser.java 2009-10-19 22:52:08 UTC (rev 2810) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixRequestParser.java 2009-10-19 22:55:27 UTC (rev 2811) @@ -84,7 +84,7 @@ WaybackRequest wbRequest = null; String server = httpRequest.getServerName() + ":" + httpRequest.getServerPort(); - if(server.endsWith(hostPort)) { + if(server.toLowerCase().endsWith(hostPort.toLowerCase())) { int length = server.length() - hostPort.length(); if(server.length() > hostPort.length()) { String prefix = server.substring(0,length - 1); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2810 http://archive-access.svn.sourceforge.net/archive-access/?rev=2810&view=rev Author: bradtofel Date: 2009-10-19 22:52:08 +0000 (Mon, 19 Oct 2009) Log Message: ----------- BUGFIX (unreported): now explicitly redirect to longer date if requested date is shorter than date from index Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/DateMismatchSelector.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/DateMismatchSelector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/DateMismatchSelector.java 2009-10-15 22:51:23 UTC (rev 2809) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/selector/DateMismatchSelector.java 2009-10-19 22:52:08 UTC (rev 2810) @@ -42,11 +42,17 @@ @Override public boolean canHandle(WaybackRequest wbRequest, CaptureSearchResult result, Resource resource) { + + String reqDateStr = wbRequest.getReplayTimestamp(); + String resDateStr = result.getCaptureTimestamp(); + // if the request date is shorter than the result date, always redirect: + if(reqDateStr.length() < resDateStr.length()) { + return true; + } + // if the result is not for the exact date requested, redirect to the // exact date. some capture dates are not 14 digits, only compare as // many digits as are in the result date: - String reqDateStr = wbRequest.getReplayTimestamp(); - String resDateStr = result.getCaptureTimestamp(); return !resDateStr.equals(reqDateStr.substring(0, resDateStr.length())); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-15 22:51:36
|
Revision: 2809 http://archive-access.svn.sourceforge.net/archive-access/?rev=2809&view=rev Author: bradtofel Date: 2009-10-15 22:51:23 +0000 (Thu, 15 Oct 2009) Log Message: ----------- REFACTOR: moved parsing of path from a URL String here... it should be further refactored into URL, or UURI... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2009-10-15 22:44:10 UTC (rev 2808) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2009-10-15 22:51:23 UTC (rev 2809) @@ -22,6 +22,9 @@ public final static String FTP_SCHEME = "ftp://"; public final static String MMS_SCHEME = "mms://"; public final static String RTSP_SCHEME = "rtsp://"; + + public final static String DEFAULT_SCHEME = HTTP_SCHEME; + // go brewster public final static String WAIS_SCHEME = "wais://"; @@ -132,6 +135,25 @@ } return -1; } + + public static String getURLPath(String url) { + int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + int pathIdx = url.indexOf(UrlOperations.PATH_START); + if(portIdx == -1 && pathIdx == -1) { + return ""; + } + if(portIdx == -1) { + return url.substring(pathIdx); + } + if(pathIdx == -1) { + return url.substring(portIdx); + } + if(pathIdx > portIdx) { + return url.substring(portIdx); + } else { + return url.substring(pathIdx); + } + } public static String urlToHost(String url) { if(url.startsWith("dns:")) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-15 22:44:24
|
Revision: 2808 http://archive-access.svn.sourceforge.net/archive-access/?rev=2808&view=rev Author: bradtofel Date: 2009-10-15 22:44:10 +0000 (Thu, 15 Oct 2009) Log Message: ----------- FEATURE: added endOffset and robotFlags accessors. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java 2009-10-15 22:33:53 UTC (rev 2807) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java 2009-10-15 22:44:10 UTC (rev 2808) @@ -37,9 +37,11 @@ public class CaptureSearchResult extends SearchResult { private long cachedOffset = -1; + private long cachedEndOffset = -1; private long cachedDate = -1; public static final String CAPTURE_ORIGINAL_URL = "url"; + public static final String CAPTURE_ORIGINAL_HOST = "host"; /** * Result: canonicalized(lookup key) form of URL of captured document @@ -52,19 +54,19 @@ public static final String CAPTURE_CAPTURE_TIMESTAMP = "capturedate"; /** - * Result: basename of ARC file containing this document. + * Result: basename of ARC/WARC file containing this document. */ public static final String CAPTURE_FILE = "file"; /** - * Result: compressed byte offset within ARC file where this document's + * Result: compressed byte offset within ARC/WARC file where this document's * gzip envelope begins. */ public static final String CAPTURE_OFFSET = "compressedoffset"; /** - * Result: compressed byte offset within ARC file where this document's - * gzip envelope Ends. + * Result: compressed byte offset within ARC/WARC file where this document's + * gzip envelope Ends. 
*/ public static final String CAPTURE_END_OFFSET = "compressedendoffset"; @@ -93,6 +95,20 @@ public static final String CAPTURE_REDIRECT_URL = "redirecturl"; /** + * Result: String flags which indicate robot instructions found in an HTML + * page. Currently one or more of: + * <li>"A" - noarchive</li> + * <li>"F" - nofollow</li> + * <li>"I" - noindex</li> + * @see http://noarchive.net/ + */ + public static final String CAPTURE_ROBOT_FLAGS = "robotflags"; + + public static final String CAPTURE_ROBOT_NOARCHIVE = "A"; + public static final String CAPTURE_ROBOT_NOFOLLOW = "F"; + public static final String CAPTURE_ROBOT_NOINDEX = "I"; + + /** * Result: flag within a SearchResult that indicates this is the closest to * a particular requested date. */ @@ -127,14 +143,36 @@ */ public static final String CAPTURE_DUPLICATE_HTTP = "http"; public String getOriginalUrl() { - return get(CAPTURE_ORIGINAL_URL); + String url = get(CAPTURE_ORIGINAL_URL); + if(url == null) { + // convert from ORIG_HOST to ORIG_URL here: + url = getUrlKey(); + String host = get(CAPTURE_ORIGINAL_HOST); + if(url != null && host != null) { + StringBuilder sb = new StringBuilder(url.length()); + sb.append(UrlOperations.DEFAULT_SCHEME); + sb.append(host); + sb.append(UrlOperations.getURLPath(url)); + url = sb.toString(); + // cache it for next time...? 
+ setOriginalUrl(url); + } + } + return url; } public void setOriginalUrl(String originalUrl) { put(CAPTURE_ORIGINAL_URL,originalUrl); } public String getOriginalHost() { - return UrlOperations.urlToHost(getOriginalUrl()); + String host = get(CAPTURE_ORIGINAL_HOST); + if(host == null) { + host = UrlOperations.urlToHost(getOriginalUrl()); + } + return host; } + public void setOriginalHost(String originalHost) { + put(CAPTURE_ORIGINAL_HOST,originalHost); + } public String getUrlKey() { return get(CAPTURE_URL_KEY); } @@ -173,6 +211,16 @@ cachedOffset = offset; put(CAPTURE_OFFSET,String.valueOf(offset)); } + public long getEndOffset() { + if(cachedEndOffset == -1) { + cachedEndOffset = Long.parseLong(get(CAPTURE_END_OFFSET)); + } + return cachedEndOffset; + } + public void setEndOffset(long offset) { + cachedEndOffset = offset; + put(CAPTURE_END_OFFSET,String.valueOf(offset)); + } public String getMimeType() { return get(CAPTURE_MIME_TYPE); } @@ -253,4 +301,46 @@ } return null; } + public String getRobotFlags() { + return get(CAPTURE_ROBOT_FLAGS); + } + public void setRobotFlags(String robotFlags) { + put(CAPTURE_ROBOT_FLAGS,robotFlags); + } + public void setRobotFlag(String flag) { + String flags = get(CAPTURE_ROBOT_FLAGS); + if(flags == null) { + flags = ""; + } + if(!flags.contains(flag)) { + flags = flags + flag; + } + put(CAPTURE_ROBOT_FLAGS,flags); + } + public boolean isRobotFlagSet(String flag) { + String flags = get(CAPTURE_ROBOT_FLAGS); + if(flags == null) { + return false; + } + return flags.contains(flag); + } + + public boolean isRobotNoArchive() { + return isRobotFlagSet(CAPTURE_ROBOT_NOARCHIVE); + } + public boolean isRobotNoIndex() { + return isRobotFlagSet(CAPTURE_ROBOT_NOINDEX); + } + public boolean isRobotNoFollow() { + return isRobotFlagSet(CAPTURE_ROBOT_NOFOLLOW); + } + public void setRobotNoArchive() { + setRobotFlag(CAPTURE_ROBOT_NOARCHIVE); + } + public void setRobotNoIndex() { + setRobotFlag(CAPTURE_ROBOT_NOARCHIVE); + } + public void 
setRobotNoFollow() { + setRobotFlag(CAPTURE_ROBOT_NOARCHIVE); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-15 22:34:03
|
Revision: 2807 http://archive-access.svn.sourceforge.net/archive-access/?rev=2807&view=rev Author: bradtofel Date: 2009-10-15 22:33:53 +0000 (Thu, 15 Oct 2009) Log Message: ----------- TWEAK: removed unused import. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java 2009-10-15 22:32:38 UTC (rev 2806) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java 2009-10-15 22:33:53 UTC (rev 2807) @@ -25,7 +25,6 @@ package org.archive.wayback.webapp; import java.io.IOException; -import java.util.TimeZone; import java.util.logging.Logger; import javax.servlet.Filter; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-15 22:32:45
|
Revision: 2806 http://archive-access.svn.sourceforge.net/archive-access/?rev=2806&view=rev Author: bradtofel Date: 2009-10-15 22:32:38 +0000 (Thu, 15 Oct 2009) Log Message: ----------- BUGFIX(ACC-70): No longer explicitly set timezone for entire JVM to GMT, now GMT is specified at Calendar construction. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/resultspartitioner/ResultsPartitioner.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Timestamp.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/resultspartitioner/ResultsPartitioner.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/resultspartitioner/ResultsPartitioner.java 2009-10-15 22:27:18 UTC (rev 2805) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/resultspartitioner/ResultsPartitioner.java 2009-10-15 22:32:38 UTC (rev 2806) @@ -41,12 +41,7 @@ public abstract class ResultsPartitioner { protected Calendar getCalendar() { - String[] ids = TimeZone.getAvailableIDs(0); - if (ids.length < 1) { - return null; - } - TimeZone gmt = new SimpleTimeZone(0, ids[0]); - return new GregorianCalendar(gmt); + return Calendar.getInstance(TimeZone.getTimeZone("GMT")); } protected Calendar dateStrToCalendar(String dateStr) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Timestamp.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Timestamp.java 2009-10-15 22:27:18 UTC (rev 2805) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Timestamp.java 2009-10-15 22:32:38 UTC (rev 2806) @@ -42,7 +42,7 @@ private final static String UPPER_TIMESTAMP_LIMIT = "29991939295959"; private final static String YEAR_LOWER_LIMIT = "1996"; private final static String YEAR_UPPER_LIMIT = - String.valueOf(Calendar.getInstance().get(Calendar.YEAR)); + String.valueOf(Calendar.getInstance(TimeZone.getTimeZone("GMT")).get(Calendar.YEAR)); private final static String MONTH_LOWER_LIMIT = "01"; private final static String MONTH_UPPER_LIMIT = "12"; private final static String DAY_LOWER_LIMIT = "01"; @@ -327,14 +327,14 @@ return finalDigits; } - private static String boundDigits(String input, String min, String max) { - String bounded = input; - if(input.compareTo(min) < 0) { - bounded = min; - } else if(input.compareTo(max) > 0) { - bounded = max; + private static String boundDigits(final String test, final String min, + final String max) { + if(test.compareTo(min) < 0) { + return min; + } else if(test.compareTo(max) > 0) { + return max; } - return bounded; + return test; } // check each of YEAR, MONTH, DAY, HOUR, MINUTE, SECOND to make sure they Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java 2009-10-15 22:27:18 UTC (rev 2805) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java 2009-10-15 22:32:38 UTC (rev 2806) @@ -60,7 +60,6 @@ public void init(FilterConfig config) throws ServletException { LOGGER.info("Wayback Filter initializing..."); - TimeZone.setDefault(TimeZone.getTimeZone("GMT")); try { mapper = new RequestMapper(config.getServletContext()); } catch (ConfigurationException e) { This was sent by the 
SourceForge.net collaborative development platform, the world's largest Open Source development site. |