You can subscribe to this list here.
2005 | Jan | Feb | Mar | Apr | May | Jun | Jul (1) | Aug (10) | Sep (36) | Oct (339) | Nov (103) | Dec (152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 | Jan (141) | Feb (102) | Mar (125) | Apr (203) | May (57) | Jun (30) | Jul (139) | Aug (46) | Sep (64) | Oct (105) | Nov (34) | Dec (162) |
2007 | Jan (81) | Feb (57) | Mar (141) | Apr (72) | May (9) | Jun (1) | Jul (144) | Aug (88) | Sep (40) | Oct (43) | Nov (34) | Dec (20) |
2008 | Jan (44) | Feb (45) | Mar (16) | Apr (36) | May (8) | Jun (77) | Jul (177) | Aug (66) | Sep (8) | Oct (33) | Nov (13) | Dec (37) |
2009 | Jan (2) | Feb (5) | Mar (8) | Apr | May (36) | Jun (19) | Jul (46) | Aug (8) | Sep (1) | Oct (66) | Nov (61) | Dec (10) |
2010 | Jan (13) | Feb (16) | Mar (38) | Apr (76) | May (47) | Jun (32) | Jul (35) | Aug (45) | Sep (20) | Oct (61) | Nov (24) | Dec (16) |
2011 | Jan (22) | Feb (34) | Mar (11) | Apr (8) | May (24) | Jun (23) | Jul (11) | Aug (42) | Sep (81) | Oct (48) | Nov (21) | Dec (20) |
2012 | Jan (30) | Feb (25) | Mar (4) | Apr (6) | May (1) | Jun (5) | Jul (5) | Aug (8) | Sep (6) | Oct (6) | Nov | Dec |
From: <bra...@us...> - 2008-01-15 02:17:06
|
Revision: 2130 http://archive-access.svn.sourceforge.net/archive-access/?rev=2130&view=rev Author: bradtofel Date: 2008-01-14 18:17:09 -0800 (Mon, 14 Jan 2008) Log Message: ----------- REFACTOR: moved UrlCanonicalizer to org.archive.wayback.util.url Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2008-01-15 01:46:54 UTC (rev 2129) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2008-01-15 02:17:09 UTC (rev 2130) @@ -1,391 +0,0 @@ -/* UrlCanonicalizer - * - * $Id$ - * - * Created on 2:08:07 PM Oct 11, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.util; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.httpclient.URIException; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; - -/** - * Class that performs the standard Heritrix URL canonicalization. Eventually, - * this should all be configurable, or perhaps be able to read the settings - * used within a Heritrix crawler... or even multiple crawlers... this is hard. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class UrlCanonicalizer { - - - private static final String CDX_PREFIX = " CDX "; - /** - * Strip leading 'www.' - */ - private static final Pattern STRIP_WWW_REGEX = - Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$"); - /** - * Strip leading 'www44.', 'www3.', etc. - */ - private static final Pattern STRIP_WWWN_REGEX = - Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$"); - /** - * Strip userinfo. - */ - private static final Pattern STRIP_USERINFO_REGEX = - Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$", - Pattern.CASE_INSENSITIVE); - - /** - * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A. - * Example: PHPSESSID=9682993c8daa2c5497996114facdc805. - */ - private static final Pattern STRIP_SESSION_ID_REGEX = - Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" + - "[0-9a-zA-Z]{32})(?:&(.*))?$", - Pattern.CASE_INSENSITIVE); - - /** - * Example: sid=9682993c8daa2c5497996114facdc805. - * 'sid=' can be tricky but all sid= followed by 32 byte string - * so far seen have been session ids. 
Sid is a 32 byte string - * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid' - * so have to have it run after the phpsessid elimination. - */ - private static final Pattern STRIP_SID_REGEX = - Pattern.compile("^(.+)" + - "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE); - - /** - * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM. - */ - private static final Pattern STRIP_ASPSESSION_REGEX = - Pattern.compile("^(.+)" + - "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", - Pattern.CASE_INSENSITIVE); - - /** - * Examples: - * - * (.NET 2.0) - * http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx - * => http://legislature.mi.gov/mileg.aspx - * - * (.NET 1.0/1.1) - * http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx - * => http://legislature.mi.gov/mileg.aspx - * - * For more info, see: - * http://msdn2.microsoft.com/en-us/library/aa479315.aspx - * - */ - private static final Pattern STRIP_ASPSESSION2_REGEX = - Pattern.compile("^([^\\?]+/)" + - "(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$", - Pattern.CASE_INSENSITIVE); - - /** - * Examples: - * - * (.NET 2.0) - * http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules - * => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules - * - * For more info, see: - * http://msdn2.microsoft.com/en-us/library/aa479315.aspx - * - */ - - private static final Pattern STRIP_ASPSESSION3_REGEX = - Pattern.compile("^([^\\?]+/" + - "\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" + - "((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$", - Pattern.CASE_INSENSITIVE); - - /** - * Strip ColdFusion session IDs. 
Remove sessionids that look like the - * following: - * CFID=12412453&CFTOKEN=15501799 - * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A - */ - private static final Pattern STRIP_CFSESSION_REGEX = - Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" + - "(?:&(.*))?$",Pattern.CASE_INSENSITIVE); - - /** - * Run a regex that strips elements of a string. - * - * Assumes the regex has a form that wants to strip elements of the passed - * string. Assumes that if a match, appending group 1 - * and group 2 yields desired result. - * @param url Url to search in. - * @param matcher Matcher whose form yields a group 1 and group 2 if a - * match (non-null. - * @return Original <code>url</code> else concatenization of group 1 - * and group 2. - */ - protected String doStripRegexMatch(String url, Matcher matcher) { - return (matcher != null && matcher.matches())? - checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)): - url; - } - - /** - * @param string String to check. - * @return <code>string</code> if non-null, else empty string (""). - */ - private String checkForNull(String string) { - return (string != null)? string: ""; - } - - /** - * return the canonical string key for the URL argument. - * - * @param urlString - * @return String lookup key for URL argument. - * @throws URIException - */ - public String urlStringToKey(final String urlString) throws URIException { - - String searchUrl = canonicalize(urlString); - - // TODO: force https into http for the moment... - if(searchUrl.startsWith("https://")) { - searchUrl = searchUrl.substring(8); - } - - // TODO: this will only work with http:// scheme. should work with all? 
- // force add of scheme and possible add '/' with empty path: - if (searchUrl.startsWith("http://")) { - if (-1 == searchUrl.indexOf('/', 8)) { - searchUrl = searchUrl + "/"; - } - } else { - if (-1 == searchUrl.indexOf("/")) { - searchUrl = searchUrl + "/"; - } - searchUrl = "http://" + searchUrl; - } - - // unescape anythying that can be: - UURI tmpURI = UURIFactory.getInstance(searchUrl); - tmpURI.setPath(tmpURI.getPath()); - - - // convert to UURI to perform require URI fixup: - UURI searchURI = UURIFactory.getInstance(tmpURI.getURI()); - - - - - // replace ' ' with '+' (this is only to match Alexa's canonicalization) - String newPath = searchURI.getEscapedPath().replace("%20","+"); -// String newPath = searchURI.getPath().replace(' ','+'); - - // replace multiple consecutive '/'s in the path. - while(newPath.contains("//")) { - newPath = newPath.replace("//","/"); - } - - // this would remove trailing a '/' character, unless the path is empty - // but we're not going to do this just yet.. -// if((newPath.length() > 1) && newPath.endsWith("/")) { -// newPath = newPath.substring(0,newPath.length()-1); -// } -// searchURI.setEscapedPath(newPath); -// searchURI.setRawPath(newPath.toCharArray()); -// String query = searchURI.getEscapedQuery(); - - // TODO: handle non HTTP port stripping, too. 
-// String portStr = ""; -// if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { -// portStr = ":" + searchURI.getPort(); -// } -// return searchURI.getHostBasename() + portStr + -// searchURI.getEscapedPathQuery(); - - StringBuilder sb = new StringBuilder(searchUrl.length()); - sb.append(searchURI.getHostBasename()); - if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { - sb.append(":").append(searchURI.getPort()); - } - sb.append(newPath); - if(searchURI.getEscapedQuery() != null) { - sb.append("?").append(searchURI.getEscapedQuery()); - } - - - return sb.toString(); - } - - - /** - * Idempotent operation that will determine the 'fuzziest' - * form of the url argument. This operation is done prior to adding records - * to the ResourceIndex, and prior to lookup. Current version is exactly - * the default found in Heritrix. When the configuration system for - * Heritrix stabilizes, hopefully this can use the system directly within - * Heritrix. - * - * @param url to be canonicalized. - * @return canonicalized version of url argument. - */ - public String canonicalize(String url) { - url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url)); - url = url.toLowerCase(); - if (url == null || url.length() <= 0) { - return url; - } - - int index = url.lastIndexOf('?'); - if (index > 0) { - if (index == (url.length() - 1)) { - // '?' is last char in url. Strip it. 
- url = url.substring(0, url.length() - 1); - } else if (url.charAt(index + 1) == '&') { - // Next char is '&'. Strip it. - if (url.length() == (index + 2)) { - // Then url ends with '?&'. Strip them. - url = url.substring(0, url.length() - 2); - } else { - // The '&' is redundant. Strip it. - url = url.substring(0, index + 1) + - url.substring(index + 2); - } - } else if (url.charAt(url.length() - 1) == '&') { - // If we have a lone '&' on end of query str, - // strip it. - url = url.substring(0, url.length() - 1); - } - } - return url; - } - - private static void USAGE() { - System.err.println("Usage: [-f FIELD] [-d DELIM]"); - System.exit(3); - } - /** - * @param args - */ - public static void main(String[] args) { - UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); - int n = 0; - int i = 0; - ArrayList<Integer> columns = new ArrayList<Integer>(); - - long lineNumber = 0; - boolean cdxPassThru = false; - String delimiter = " "; - while(n < args.length) { - String arg = args[n]; - if(arg.compareTo("-cdx") == 0) { - cdxPassThru = true; - n++; - continue; - } - if(n == (args.length -1)) { - USAGE(); - } - String val = args[n+1]; - if(arg.compareTo("-f") == 0) { - columns.add(new Integer(val)); - } else if(arg.compareTo("-d") == 0) { - delimiter = val; - } else { - USAGE(); - } - n += 2; - } - // place default '0' in case none specified: - if(columns.size() == 0) { - columns.add(new Integer(1)); - } - - // convert to int[]: - int[] cols = new int[columns.size()]; - for(int idx = 0; idx < columns.size(); idx++) { - cols[idx] = columns.get(idx).intValue() - 1; - } - BufferedReader r = new BufferedReader(new InputStreamReader(System.in)); - StringBuilder sb = new StringBuilder(); - String line = null; - - while(true) { - try { - line = r.readLine(); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } - if(line == null) { - break; - } - lineNumber++; - if(cdxPassThru && line.startsWith(CDX_PREFIX)) { - System.out.println(line); - continue; - 
} - String parts[] = line.split(delimiter); - for(int column : cols) { - if(column >= parts.length) { - System.err.println("Invalid line " + lineNumber + " (" + - line + ") skipped"); - } else { - try { - parts[column] = canonicalizer.urlStringToKey(parts[column]); - } catch (URIException e) { - System.err.println("Invalid URL in line " + lineNumber + " (" + - line + ") skipped (" + parts[column] + ")"); - e.printStackTrace(); - continue; - } - } - } - sb.setLength(0); - for(i = 0; i < parts.length; i++) { - sb.append(parts[i]); - if(i < (parts.length-1)) { - sb.append(delimiter); - } - } - System.out.println(sb.toString()); - } - } -} \ No newline at end of file Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java (from rev 2128, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java 2008-01-15 02:17:09 UTC (rev 2130) @@ -0,0 +1,391 @@ +/* UrlCanonicalizer + * + * $Id$ + * + * Created on 2:08:07 PM Oct 11, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.url; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.httpclient.URIException; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; + +/** + * Class that performs the standard Heritrix URL canonicalization. Eventually, + * this should all be configurable, or perhaps be able to read the settings + * used within a Heritrix crawler... or even multiple crawlers... this is hard. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class UrlCanonicalizer { + + + private static final String CDX_PREFIX = " CDX "; + /** + * Strip leading 'www.' + */ + private static final Pattern STRIP_WWW_REGEX = + Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$"); + /** + * Strip leading 'www44.', 'www3.', etc. + */ + private static final Pattern STRIP_WWWN_REGEX = + Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$"); + /** + * Strip userinfo. + */ + private static final Pattern STRIP_USERINFO_REGEX = + Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$", + Pattern.CASE_INSENSITIVE); + + /** + * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A. + * Example: PHPSESSID=9682993c8daa2c5497996114facdc805. + */ + private static final Pattern STRIP_SESSION_ID_REGEX = + Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" + + "[0-9a-zA-Z]{32})(?:&(.*))?$", + Pattern.CASE_INSENSITIVE); + + /** + * Example: sid=9682993c8daa2c5497996114facdc805. + * 'sid=' can be tricky but all sid= followed by 32 byte string + * so far seen have been session ids. 
Sid is a 32 byte string + * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid' + * so have to have it run after the phpsessid elimination. + */ + private static final Pattern STRIP_SID_REGEX = + Pattern.compile("^(.+)" + + "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE); + + /** + * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM. + */ + private static final Pattern STRIP_ASPSESSION_REGEX = + Pattern.compile("^(.+)" + + "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", + Pattern.CASE_INSENSITIVE); + + /** + * Examples: + * + * (.NET 2.0) + * http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx + * => http://legislature.mi.gov/mileg.aspx + * + * (.NET 1.0/1.1) + * http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx + * => http://legislature.mi.gov/mileg.aspx + * + * For more info, see: + * http://msdn2.microsoft.com/en-us/library/aa479315.aspx + * + */ + private static final Pattern STRIP_ASPSESSION2_REGEX = + Pattern.compile("^([^\\?]+/)" + + "(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$", + Pattern.CASE_INSENSITIVE); + + /** + * Examples: + * + * (.NET 2.0) + * http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules + * => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules + * + * For more info, see: + * http://msdn2.microsoft.com/en-us/library/aa479315.aspx + * + */ + + private static final Pattern STRIP_ASPSESSION3_REGEX = + Pattern.compile("^([^\\?]+/" + + "\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" + + "((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$", + Pattern.CASE_INSENSITIVE); + + /** + * Strip ColdFusion session IDs. 
Remove sessionids that look like the + * following: + * CFID=12412453&CFTOKEN=15501799 + * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A + */ + private static final Pattern STRIP_CFSESSION_REGEX = + Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" + + "(?:&(.*))?$",Pattern.CASE_INSENSITIVE); + + /** + * Run a regex that strips elements of a string. + * + * Assumes the regex has a form that wants to strip elements of the passed + * string. Assumes that if a match, appending group 1 + * and group 2 yields desired result. + * @param url Url to search in. + * @param matcher Matcher whose form yields a group 1 and group 2 if a + * match (non-null. + * @return Original <code>url</code> else concatenization of group 1 + * and group 2. + */ + protected String doStripRegexMatch(String url, Matcher matcher) { + return (matcher != null && matcher.matches())? + checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)): + url; + } + + /** + * @param string String to check. + * @return <code>string</code> if non-null, else empty string (""). + */ + private String checkForNull(String string) { + return (string != null)? string: ""; + } + + /** + * return the canonical string key for the URL argument. + * + * @param urlString + * @return String lookup key for URL argument. + * @throws URIException + */ + public String urlStringToKey(final String urlString) throws URIException { + + String searchUrl = canonicalize(urlString); + + // TODO: force https into http for the moment... + if(searchUrl.startsWith("https://")) { + searchUrl = searchUrl.substring(8); + } + + // TODO: this will only work with http:// scheme. should work with all? 
+ // force add of scheme and possible add '/' with empty path: + if (searchUrl.startsWith("http://")) { + if (-1 == searchUrl.indexOf('/', 8)) { + searchUrl = searchUrl + "/"; + } + } else { + if (-1 == searchUrl.indexOf("/")) { + searchUrl = searchUrl + "/"; + } + searchUrl = "http://" + searchUrl; + } + + // unescape anythying that can be: + UURI tmpURI = UURIFactory.getInstance(searchUrl); + tmpURI.setPath(tmpURI.getPath()); + + + // convert to UURI to perform require URI fixup: + UURI searchURI = UURIFactory.getInstance(tmpURI.getURI()); + + + + + // replace ' ' with '+' (this is only to match Alexa's canonicalization) + String newPath = searchURI.getEscapedPath().replace("%20","+"); +// String newPath = searchURI.getPath().replace(' ','+'); + + // replace multiple consecutive '/'s in the path. + while(newPath.contains("//")) { + newPath = newPath.replace("//","/"); + } + + // this would remove trailing a '/' character, unless the path is empty + // but we're not going to do this just yet.. +// if((newPath.length() > 1) && newPath.endsWith("/")) { +// newPath = newPath.substring(0,newPath.length()-1); +// } +// searchURI.setEscapedPath(newPath); +// searchURI.setRawPath(newPath.toCharArray()); +// String query = searchURI.getEscapedQuery(); + + // TODO: handle non HTTP port stripping, too. 
+// String portStr = ""; +// if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { +// portStr = ":" + searchURI.getPort(); +// } +// return searchURI.getHostBasename() + portStr + +// searchURI.getEscapedPathQuery(); + + StringBuilder sb = new StringBuilder(searchUrl.length()); + sb.append(searchURI.getHostBasename()); + if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { + sb.append(":").append(searchURI.getPort()); + } + sb.append(newPath); + if(searchURI.getEscapedQuery() != null) { + sb.append("?").append(searchURI.getEscapedQuery()); + } + + + return sb.toString(); + } + + + /** + * Idempotent operation that will determine the 'fuzziest' + * form of the url argument. This operation is done prior to adding records + * to the ResourceIndex, and prior to lookup. Current version is exactly + * the default found in Heritrix. When the configuration system for + * Heritrix stabilizes, hopefully this can use the system directly within + * Heritrix. + * + * @param url to be canonicalized. + * @return canonicalized version of url argument. + */ + public String canonicalize(String url) { + url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url)); + url = url.toLowerCase(); + if (url == null || url.length() <= 0) { + return url; + } + + int index = url.lastIndexOf('?'); + if (index > 0) { + if (index == (url.length() - 1)) { + // '?' is last char in url. Strip it. 
+ url = url.substring(0, url.length() - 1); + } else if (url.charAt(index + 1) == '&') { + // Next char is '&'. Strip it. + if (url.length() == (index + 2)) { + // Then url ends with '?&'. Strip them. + url = url.substring(0, url.length() - 2); + } else { + // The '&' is redundant. Strip it. + url = url.substring(0, index + 1) + + url.substring(index + 2); + } + } else if (url.charAt(url.length() - 1) == '&') { + // If we have a lone '&' on end of query str, + // strip it. + url = url.substring(0, url.length() - 1); + } + } + return url; + } + + private static void USAGE() { + System.err.println("Usage: [-f FIELD] [-d DELIM]"); + System.exit(3); + } + /** + * @param args + */ + public static void main(String[] args) { + UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + int n = 0; + int i = 0; + ArrayList<Integer> columns = new ArrayList<Integer>(); + + long lineNumber = 0; + boolean cdxPassThru = false; + String delimiter = " "; + while(n < args.length) { + String arg = args[n]; + if(arg.compareTo("-cdx") == 0) { + cdxPassThru = true; + n++; + continue; + } + if(n == (args.length -1)) { + USAGE(); + } + String val = args[n+1]; + if(arg.compareTo("-f") == 0) { + columns.add(new Integer(val)); + } else if(arg.compareTo("-d") == 0) { + delimiter = val; + } else { + USAGE(); + } + n += 2; + } + // place default '0' in case none specified: + if(columns.size() == 0) { + columns.add(new Integer(1)); + } + + // convert to int[]: + int[] cols = new int[columns.size()]; + for(int idx = 0; idx < columns.size(); idx++) { + cols[idx] = columns.get(idx).intValue() - 1; + } + BufferedReader r = new BufferedReader(new InputStreamReader(System.in)); + StringBuilder sb = new StringBuilder(); + String line = null; + + while(true) { + try { + line = r.readLine(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + if(line == null) { + break; + } + lineNumber++; + if(cdxPassThru && line.startsWith(CDX_PREFIX)) { + System.out.println(line); + continue; + 
} + String parts[] = line.split(delimiter); + for(int column : cols) { + if(column >= parts.length) { + System.err.println("Invalid line " + lineNumber + " (" + + line + ") skipped"); + } else { + try { + parts[column] = canonicalizer.urlStringToKey(parts[column]); + } catch (URIException e) { + System.err.println("Invalid URL in line " + lineNumber + " (" + + line + ") skipped (" + parts[column] + ")"); + e.printStackTrace(); + continue; + } + } + } + sb.setLength(0); + for(i = 0; i < parts.length; i++) { + sb.append(parts[i]); + if(i < (parts.length-1)) { + sb.append(delimiter); + } + } + System.out.println(sb.toString()); + } + } +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2129 http://archive-access.svn.sourceforge.net/archive-access/?rev=2129&view=rev Author: bradtofel Date: 2008-01-14 17:46:54 -0800 (Mon, 14 Jan 2008) Log Message: ----------- BUGFIX: was not correctly resolving URLs in Location: HTTP headers. BUGFIX: was not using WARC payload digest... which should be trusted. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-01-15 01:43:29 UTC (rev 2128) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-01-15 01:46:54 UTC (rev 2129) @@ -121,7 +121,7 @@ return result; } - private void addUrlDataToSearchResult(SearchResult result, String urlStr) + private UURI addUrlDataToSearchResult(SearchResult result, String urlStr) throws IOException { result.put(WaybackConstants.RESULT_URL, urlStr); @@ -141,6 +141,8 @@ String urlKey = canonicalizer.urlStringToKey(urlStr); result.put(WaybackConstants.RESULT_URL_KEY, urlKey); + + return uri; } private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) @@ -218,7 +220,7 @@ String.valueOf(header.getOffset())); String origUrl = header.getUrl(); - addUrlDataToSearchResult(result,origUrl); + UURI uri = addUrlDataToSearchResult(result,origUrl); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. 
@@ -243,7 +245,9 @@ ARCConstants.DEFAULT_ENCODING); rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + result.put(WaybackConstants.RESULT_MD5_DIGEST, + transformDigest(header.getHeaderValue( + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); if (headers != null) { @@ -263,7 +267,7 @@ // should we prefer one over the other? // right now, we're ignoring "Content-Location" try { - UURI uriRedirect = UURIFactory.getInstance(origUrl, + UURI uriRedirect = UURIFactory.getInstance(uri, locationStr); result.put(WaybackConstants.RESULT_REDIRECT_URL, uriRedirect.getEscapedURI()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-15 01:43:31
|
Revision: 2128 http://archive-access.svn.sourceforge.net/archive-access/?rev=2128&view=rev Author: bradtofel Date: 2008-01-14 17:43:29 -0800 (Mon, 14 Jan 2008) Log Message: ----------- REFACTOR: removed isAuthority() and resolveUrl() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2008-01-15 01:41:32 UTC (rev 2127) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2008-01-15 01:43:29 UTC (rev 2128) @@ -44,34 +44,8 @@ * @version $Date$, $Revision$ */ public class UrlCanonicalizer { - - private static final String CC_TLDS = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq" + - "|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs" + - "|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx" + - "|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo" + - "|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk" + - "|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg" + - "|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma" + - "|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz" + - "|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm" + - "|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj" + - "|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn" + - "|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu" + - "|wf|ws|ye|yt|yu|za|zm|zw"; - - private static final String GEN_TLDS = "aero|biz|cat|com|coop|edu|gov" + - "|info|int|jobs|mil|mobi|museum|name|net|org|pro|travel"; - - - private static final 
String ALL_TLD_PATTERN = CC_TLDS + "|" + GEN_TLDS; - private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+"; - private static final Pattern AUTHORITY_REGEX = - Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" + - "(" + IP_PATTERN + ")"); - - private static final String CDX_PREFIX = " CDX "; /** * Strip leading 'www.' @@ -323,38 +297,6 @@ return url; } - /** - * @param urlPart - * @return boolean indicating whether urlPart might be an Authority. - */ - public boolean isAuthority(String urlPart) { - Matcher m = AUTHORITY_REGEX.matcher(urlPart); - - return (m != null) && m.matches(); - } - - /** - * @param baseUrl - * @param url - * @return url resolved against baseUrl, unless it is absolute already - */ - public static String resolveUrl(String baseUrl, String url) { - // TODO: this only works for http:// - if(url.startsWith("http://")) { - return url; - } - UURI absBaseURI; - UURI resolvedURI = null; - try { - absBaseURI = UURIFactory.getInstance(baseUrl); - resolvedURI = UURIFactory.getInstance(absBaseURI, url); - } catch (URIException e) { - e.printStackTrace(); - return url; - } - return resolvedURI.getEscapedURI(); - } - private static void USAGE() { System.err.println("Usage: [-f FIELD] [-d DELIM]"); System.exit(3); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2127 http://archive-access.svn.sourceforge.net/archive-access/?rev=2127&view=rev Author: bradtofel Date: 2008-01-14 17:41:32 -0800 (Mon, 14 Jan 2008) Log Message: ----------- INITIAL REV: Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/IdentityUrlCanonicalizer.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/IdentityUrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/IdentityUrlCanonicalizer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/IdentityUrlCanonicalizer.java 2008-01-15 01:41:32 UTC (rev 2127) @@ -0,0 +1,20 @@ +package org.archive.wayback.util.url; + +import org.apache.commons.httpclient.URIException; +import org.archive.wayback.UrlCanonicalizer; + +/** + * Identity UrlCanonicalizer implementation, passing through urls as-is. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class IdentityUrlCanonicalizer implements UrlCanonicalizer { + + /* (non-Javadoc) + * @see org.archive.wayback.UrlCanonicalizer#urlStringToKey(java.lang.String) + */ + public String urlStringToKey(String url) throws URIException { + return url; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-15 01:39:59
|
Revision: 2126 http://archive-access.svn.sourceforge.net/archive-access/?rev=2126&view=rev Author: bradtofel Date: 2008-01-14 17:39:58 -0800 (Mon, 14 Jan 2008) Log Message: ----------- REFACTOR: moved isAuthority() and resolveUrl() from the generic UrlCanonicalizer class, in preparation for making UrlCanonicalizers configurable. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2008-01-15 01:39:58 UTC (rev 2126) @@ -0,0 +1,76 @@ +package org.archive.wayback.util.url; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.httpclient.URIException; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; + +/** + * Class containing common static URL methods. Primarily resolveUrl() and + * the (currently) unused isAuthority(). 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class UrlOperations { + + private static final String CC_TLDS = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq" + + "|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs" + + "|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx" + + "|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo" + + "|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk" + + "|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg" + + "|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma" + + "|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz" + + "|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm" + + "|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj" + + "|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn" + + "|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu" + + "|wf|ws|ye|yt|yu|za|zm|zw"; + + private static final String GEN_TLDS = "aero|biz|cat|com|coop|edu|gov" + + "|info|int|jobs|mil|mobi|museum|name|net|org|pro|travel"; + + + private static final String ALL_TLD_PATTERN = CC_TLDS + "|" + GEN_TLDS; + + private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+"; + + private static final Pattern AUTHORITY_REGEX = + Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" + + "(" + IP_PATTERN + ")"); + + /** + * @param urlPart + * @return boolean indicating whether urlPart might be an Authority. 
+ */ + public static boolean isAuthority(String urlPart) { + Matcher m = AUTHORITY_REGEX.matcher(urlPart); + + return (m != null) && m.matches(); + } + + /** + * @param baseUrl + * @param url + * @return url resolved against baseUrl, unless it is absolute already + */ + public static String resolveUrl(String baseUrl, String url) { + // TODO: this only works for http:// + if(url.startsWith("http://")) { + return url; + } + UURI absBaseURI; + UURI resolvedURI = null; + try { + absBaseURI = UURIFactory.getInstance(baseUrl); + resolvedURI = UURIFactory.getInstance(absBaseURI, url); + } catch (URIException e) { + e.printStackTrace(); + return url; + } + return resolvedURI.getEscapedURI(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-15 01:38:38
|
Revision: 2125 http://archive-access.svn.sourceforge.net/archive-access/?rev=2125&view=rev Author: bradtofel Date: 2008-01-14 17:38:36 -0800 (Mon, 14 Jan 2008) Log Message: ----------- FEATURE: added new constant for SearchResult value annotation indicating the timestamp that was actually stored for captures that were not actually stored due to deduplication. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java 2008-01-15 01:36:58 UTC (rev 2124) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java 2008-01-15 01:38:36 UTC (rev 2125) @@ -328,6 +328,13 @@ public static final String RESULT_DUPLICATE_ANNOTATION = "duplicate"; /** + * Result: this key is present when the RESULT_DUPLICATE_ANNOTATION is also + * present, with the value indicating the last date that was actually + * stored for this duplicate. + */ + public static final String RESULT_DUPLICATE_STORED_DATE = "duplicate-date"; + + /** * flag indicates that this document was downloaded and verified as * identical to a previous capture by digest. */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-15 01:36:56
|
Revision: 2124 http://archive-access.svn.sourceforge.net/archive-access/?rev=2124&view=rev Author: bradtofel Date: 2008-01-14 17:36:58 -0800 (Mon, 14 Jan 2008) Log Message: ----------- INITIAL REV: new interface to abstract various canonicalization strategies. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/UrlCanonicalizer.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/UrlCanonicalizer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/UrlCanonicalizer.java 2008-01-15 01:36:58 UTC (rev 2124) @@ -0,0 +1,7 @@ +package org.archive.wayback; + +import org.apache.commons.httpclient.URIException; + +public interface UrlCanonicalizer { + public String urlStringToKey(String url) throws URIException; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-07 23:08:52
|
Revision: 2123 http://archive-access.svn.sourceforge.net/archive-access/?rev=2123&view=rev Author: bradtofel Date: 2008-01-07 15:08:58 -0800 (Mon, 07 Jan 2008) Log Message: ----------- 2008 new max year. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/core/TimestampTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/core/TimestampTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/core/TimestampTest.java 2008-01-07 22:53:03 UTC (rev 2122) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/core/TimestampTest.java 2008-01-07 23:08:58 UTC (rev 2123) @@ -43,8 +43,8 @@ assertEquals("padStart '1'","19960101000000",Timestamp.padStartDateStr("1")); assertEquals("padEnd '1'","19991231235959",Timestamp.padEndDateStr("1")); assertEquals("padStart '2'","20000101000000",Timestamp.padStartDateStr("2")); - assertEquals("padEnd","20071231235959",Timestamp.padEndDateStr("2")); - assertEquals("padEnd","20071231235959",Timestamp.padEndDateStr("3")); + assertEquals("padEnd","20081231235959",Timestamp.padEndDateStr("2")); + assertEquals("padEnd","20081231235959",Timestamp.padEndDateStr("3")); assertEquals("padEnd","20061231235959",Timestamp.padEndDateStr("2006")); assertEquals("padEnd","20061231235959",Timestamp.padEndDateStr("200613")); assertEquals("padEnd","20071231235959",Timestamp.padEndDateStr("2007")); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2122 http://archive-access.svn.sourceforge.net/archive-access/?rev=2122&view=rev Author: bradtofel Date: 2008-01-07 14:53:03 -0800 (Mon, 07 Jan 2008) Log Message: ----------- BUGFIX: corrected ARCReaderFactory/WARCReaderFactory usage to include offset with initial reader construction. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2008-01-07 22:51:40 UTC (rev 2121) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2008-01-07 22:53:03 UTC (rev 2122) @@ -34,13 +34,13 @@ } if (isArc(name)) { - ARCReader reader = ARCReaderFactory.get(file); - r = ARCArchiveRecordToResource(reader.get(offset),reader); + ARCReader reader = ARCReaderFactory.get(file,offset); + r = ARCArchiveRecordToResource(reader.get(),reader); } else if (isWarc(name)) { - WARCReader reader = WARCReaderFactory.get(file); - r = WARCArchiveRecordToResource(reader.get(offset),reader); + WARCReader reader = WARCReaderFactory.get(file,offset); + r = WARCArchiveRecordToResource(reader.get(),reader); } else { throw new ResourceNotAvailableException("Unknown extension"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2121 http://archive-access.svn.sourceforge.net/archive-access/?rev=2121&view=rev Author: bradtofel Date: 2008-01-07 14:51:40 -0800 (Mon, 07 Jan 2008) Log Message: ----------- FEATURE: Now assumes that ARC/WARC filename entries in a CDX/BDB without a trailing .arc.gz, .arc, .warc.gz, or .warc are .arc.gz. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-01-07 22:48:40 UTC (rev 2120) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-01-07 22:51:40 UTC (rev 2121) @@ -63,7 +63,13 @@ throw new IOException("No ARC/WARC offset in search result..."); } final long offset = Long.parseLong(offsetString); - + if(!fileName.endsWith(LocalResourceStore.ARC_EXTENSION) + && !fileName.endsWith(LocalResourceStore.ARC_GZ_EXTENSION) + && !fileName.endsWith(LocalResourceStore.WARC_EXTENSION) + && !fileName.endsWith(LocalResourceStore.WARC_GZ_EXTENSION)) { + fileName = fileName + LocalResourceStore.ARC_GZ_EXTENSION; + } + String fileUrl = urlPrefix + fileName; Resource r = null; try { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-07 22:48:37
|
Revision: 2120 http://archive-access.svn.sourceforge.net/archive-access/?rev=2120&view=rev Author: bradtofel Date: 2008-01-07 14:48:40 -0800 (Mon, 07 Jan 2008) Log Message: ----------- BUGFIX/HACKHACK: upped max year to 2008. This should be fixed before 2009... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Timestamp.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Timestamp.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Timestamp.java 2007-12-22 02:19:35 UTC (rev 2119) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Timestamp.java 2008-01-07 22:48:40 UTC (rev 2120) @@ -43,7 +43,7 @@ private final static String LOWER_TIMESTAMP_LIMIT = "10000000000000"; private final static String UPPER_TIMESTAMP_LIMIT = "29991939295959"; private final static String YEAR_LOWER_LIMIT = "1996"; - private final static String YEAR_UPPER_LIMIT = "2007"; + private final static String YEAR_UPPER_LIMIT = "2008"; private final static String MONTH_LOWER_LIMIT = "01"; private final static String MONTH_UPPER_LIMIT = "12"; private final static String DAY_LOWER_LIMIT = "01"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-22 02:19:30
|
Revision: 2119 http://archive-access.svn.sourceforge.net/archive-access/?rev=2119&view=rev Author: bradtofel Date: 2007-12-21 18:19:35 -0800 (Fri, 21 Dec 2007) Log Message: ----------- FEATURE: added get/setResult() to UIQueryResults. This is set by HTMLPage when including JSP files in replayed documents via ArchivalURL mode Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/UIQueryResults.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java 2007-12-22 00:59:25 UTC (rev 2118) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java 2007-12-22 02:19:35 UTC (rev 2119) @@ -98,7 +98,7 @@ Iterator<String> itr = jspInserts.iterator(); while(itr.hasNext()) { toInsert.append(page.includeJspString(itr.next(), httpRequest, - httpResponse, wbRequest, results)); + httpResponse, wbRequest, results, result)); } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/UIQueryResults.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/UIQueryResults.java 2007-12-22 00:59:25 UTC (rev 2118) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/UIQueryResults.java 2007-12-22 02:19:35 UTC (rev 2119) @@ -74,6 +74,7 @@ private int 
curPage; private SearchResults results; + private SearchResult result; private ResultURIConverter uriConverter; /** @@ -312,4 +313,12 @@ public Timestamp getExactRequestedTimestamp() { return exactRequestedTimestamp; } + + public SearchResult getResult() { + return result; + } + + public void setResult(SearchResult result) { + this.result = result; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2007-12-22 00:59:25 UTC (rev 2118) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2007-12-22 02:19:35 UTC (rev 2119) @@ -383,11 +383,12 @@ */ public String includeJspString(String jspPath, HttpServletRequest httpRequest, HttpServletResponse httpResponse, - WaybackRequest wbRequest, SearchResults results) + WaybackRequest wbRequest, SearchResults results, SearchResult result) throws ServletException, IOException { UIQueryResults uiResults = new UIQueryResults(httpRequest, wbRequest, results, uriConverter); + uiResults.setResult(result); StringHttpServletResponseWrapper wrappedResponse = new StringHttpServletResponseWrapper(httpResponse); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-22 00:59:21
|
Revision: 2118 http://archive-access.svn.sourceforge.net/archive-access/?rev=2118&view=rev Author: bradtofel Date: 2007-12-21 16:59:25 -0800 (Fri, 21 Dec 2007) Log Message: ----------- FIX: problem with new apache httpclient.URI where "+"s now get escaped.. Also added a couple more tests. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-12-15 02:02:43 UTC (rev 2117) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-12-22 00:59:25 UTC (rev 2118) @@ -221,11 +221,20 @@ searchUrl = "http://" + searchUrl; } + // unescape anythying that can be: + UURI tmpURI = UURIFactory.getInstance(searchUrl); + tmpURI.setPath(tmpURI.getPath()); + + // convert to UURI to perform require URI fixup: - UURI searchURI = UURIFactory.getInstance(searchUrl); + UURI searchURI = UURIFactory.getInstance(tmpURI.getURI()); + + + // replace ' ' with '+' (this is only to match Alexa's canonicalization) - String newPath = searchURI.getPath().replace(' ','+'); + String newPath = searchURI.getEscapedPath().replace("%20","+"); +// String newPath = searchURI.getPath().replace(' ','+'); // replace multiple consecutive '/'s in the path. 
while(newPath.contains("//")) { @@ -237,15 +246,30 @@ // if((newPath.length() > 1) && newPath.endsWith("/")) { // newPath = newPath.substring(0,newPath.length()-1); // } - searchURI.setPath(newPath); +// searchURI.setEscapedPath(newPath); +// searchURI.setRawPath(newPath.toCharArray()); +// String query = searchURI.getEscapedQuery(); // TODO: handle non HTTP port stripping, too. - String portStr = ""; +// String portStr = ""; +// if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { +// portStr = ":" + searchURI.getPort(); +// } +// return searchURI.getHostBasename() + portStr + +// searchURI.getEscapedPathQuery(); + + StringBuilder sb = new StringBuilder(searchUrl.length()); + sb.append(searchURI.getHostBasename()); if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { - portStr = ":" + searchURI.getPort(); + sb.append(":").append(searchURI.getPort()); } - return searchURI.getHostBasename() + portStr + - searchURI.getEscapedPathQuery(); + sb.append(newPath); + if(searchURI.getEscapedQuery() != null) { + sb.append("?").append(searchURI.getEscapedQuery()); + } + + + return sb.toString(); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java 2007-12-15 02:02:43 UTC (rev 2117) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java 2007-12-22 00:59:25 UTC (rev 2118) @@ -88,12 +88,17 @@ // do not add trailing '/' non-empty path and without protocol checkCanonicalization("foo.com/boo","foo.com/boo"); + + // TEST + // replace escaped ' ' with '+' in path plus keep trailing slash and query + checkCanonicalization("foo.com/pa%20th?a=b","foo.com/pa+th?a=b"); + // replace escaped ' ' with '+' in path 
checkCanonicalization("foo.com/pa%20th","foo.com/pa+th"); - // replace escaped ' ' with '+' in path plus kill trailing slash -// checkCanonicalization("foo.com/pa%20th/","foo.com/pa+th"); + // replace escaped ' ' with '+' in path plus leave trailing slash + checkCanonicalization("foo.com/pa%20th/","foo.com/pa+th/"); // replace multiple consecutive /'s in path checkCanonicalization("foo.com//goo","foo.com/goo"); @@ -104,11 +109,11 @@ // leave alone consecutive /'s after ? checkCanonicalization("foo.com/b?jar=//goo","foo.com/b?jar=//goo"); - // replace multiple consecutive /'s in path, plus kill trailing / -// checkCanonicalization("foo.com///goo/","foo.com/goo"); + // replace multiple consecutive /'s in path, plus leave trailing / + checkCanonicalization("foo.com///goo/","foo.com/goo/"); // replace escaped ' ' with '+' in path plus keep trailing slash and query - checkCanonicalization("foo.com/pa%20th?a=b","foo.com/pa+th?a=b"); + checkCanonicalization("foo.com/pa%20th/?a=b","foo.com/pa+th/?a=b"); // replace escaped ' ' with '+' in path but not in query key @@ -117,6 +122,23 @@ // replace escaped ' ' with '+' in path but not in query value checkCanonicalization("foo.com/pa%20th?a=b%20b","foo.com/pa+th?a=b%20b"); + + // no change in '!' escaping + checkCanonicalization("foo.com/pa!th","foo.com/pa!th"); + + // no change in '+' escaping + checkCanonicalization("foo.com/pa+th","foo.com/pa+th"); + + // unescape legal escaped '!' (%21) + checkCanonicalization("foo.com/pa%21th","foo.com/pa!th"); + + // leave '%' (%25) + checkCanonicalization("foo.com/pa%th","foo.com/pa%th"); + + // unescape '%' (%25) + checkCanonicalization("foo.com/pa%25th","foo.com/pa%th"); + + // replace escaped ' ' with '+' in path, unescape legal '!' in path // no change in query escaping checkCanonicalization("foo.com/pa%20t%21h?a%20a=b","foo.com/pa+t!h?a%20a=b"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2117 http://archive-access.svn.sourceforge.net/archive-access/?rev=2117&view=rev Author: bradtofel Date: 2007-12-14 18:02:43 -0800 (Fri, 14 Dec 2007) Log Message: ----------- BUGFIX: was accepting File objects to set/getDataDir() method, not String objects. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2007-12-15 02:01:51 UTC (rev 2116) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2007-12-15 02:02:43 UTC (rev 2117) @@ -13,6 +13,7 @@ import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.util.DirMaker; /** * Class which implements a local ARC, WARC, ARC.gz, WARC.gz, ResourceStore @@ -112,12 +113,12 @@ return null; } - public File getDataDir() { - return dataDir; + public String getDataDir() { + return DirMaker.getAbsolutePath(dataDir); } - public void setDataDir(File dataDir) { - this.dataDir = dataDir; + public void setDataDir(String dataDir) throws IOException { + this.dataDir = DirMaker.ensureDir(dataDir); } private class ArcWarcFilenameFilter implements FilenameFilter { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-15 02:01:46
|
Revision: 2116 http://archive-access.svn.sourceforge.net/archive-access/?rev=2116&view=rev Author: bradtofel Date: 2007-12-14 18:01:51 -0800 (Fri, 14 Dec 2007) Log Message: ----------- FEATURE: added DuplicateRecordFilter to standard filter chains, to omit identical records from result stream. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/DuplicateRecordFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2007-12-15 02:00:51 UTC (rev 2115) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2007-12-15 02:01:51 UTC (rev 2116) @@ -34,6 +34,7 @@ import org.archive.wayback.resourceindex.filters.CaptureToUrlResultFilter; import org.archive.wayback.resourceindex.filters.CounterFilter; import org.archive.wayback.resourceindex.filters.DateRangeFilter; +import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; import org.archive.wayback.resourceindex.filters.EndDateFilter; import org.archive.wayback.resourceindex.filters.GuardRailFilter; import org.archive.wayback.resourceindex.filters.HostMatchFilter; @@ -224,7 +225,11 @@ // use the same guardrail for both: forwardFilters.addFilter(guardrail); reverseFilters.addFilter(guardrail); - + + // BUGBUG: won't work when closest is a dupe! 
+ forwardFilters.addFilter(new DuplicateRecordFilter()); + reverseFilters.addFilter(new DuplicateRecordFilter()); + // match URL key: forwardFilters.addFilter(new UrlMatchFilter(keyUrl)); reverseFilters.addFilter(new UrlMatchFilter(keyUrl)); @@ -298,6 +303,7 @@ ObjectFilterChain<SearchResult> filters = new ObjectFilterChain<SearchResult>(); filters.addFilter(guardrail); + filters.addFilter(new DuplicateRecordFilter()); filters.addFilter(new UrlMatchFilter(keyUrl)); if(hostMatchFilter != null) { @@ -331,6 +337,7 @@ ObjectFilterChain<SearchResult> filters = new ObjectFilterChain<SearchResult>(); filters.addFilter(guardrail); + filters.addFilter(new DuplicateRecordFilter()); filters.addFilter(new UrlPrefixMatchFilter(keyUrl)); if(hostMatchFilter != null) { Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/DuplicateRecordFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/DuplicateRecordFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/DuplicateRecordFilter.java 2007-12-15 02:01:51 UTC (rev 2116) @@ -0,0 +1,30 @@ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.ObjectFilter; + +/** + * ObjectFilter which omits exact duplicate URL+date records from a stream + * of SearchResults. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class DuplicateRecordFilter implements ObjectFilter<SearchResult> { + private String lastUrl = null; + private String lastDate = null; + + public int filterObject(SearchResult o) { + String thisUrl = o.getUrl(); + String thisDate = o.getCaptureDate(); + int result = ObjectFilter.FILTER_INCLUDE; + if(lastUrl != null) { + if(lastUrl.equals(thisUrl) && thisDate.equals(lastDate)) { + result = FILTER_EXCLUDE; + } + } + lastUrl = thisUrl; + lastDate = thisDate; + return result; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-15 02:00:47
|
Revision: 2115 http://archive-access.svn.sourceforge.net/archive-access/?rev=2115&view=rev Author: bradtofel Date: 2007-12-14 18:00:51 -0800 (Fri, 14 Dec 2007) Log Message: ----------- BUGFIX: (unreported) regex was not finding simple tags (ex: "<head>")... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2007-12-12 03:34:13 UTC (rev 2114) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2007-12-15 02:00:51 UTC (rev 2115) @@ -100,7 +100,7 @@ Pattern pc = wholeTagPatterns.get(tagName); if (pc == null) { - String tagPatString = "<\\s*" + tagName + "\\s+[^>]+>"; + String tagPatString = "<\\s*" + tagName + "((>)|(\\s+[^>]*>))"; pc = Pattern.compile(tagPatString, Pattern.CASE_INSENSITIVE); wholeTagPatterns.put(tagName, pc); Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java 2007-12-12 03:34:13 UTC (rev 2114) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java 2007-12-15 02:00:51 UTC (rev 2115) @@ -98,7 +98,24 @@ "author2","Bakri Abubakr http://bayanit.com/"); } + public void testFindEndOfFirst() { + findEndOf("<head>","head",6); + findEndOf("<html><head><body>","head",12); + findEndOf("<html><head 
goo=bar><body>","head",20); + findEndOf("<html><head goo=bar><body>full","body",26); + findEndOf("<html><head goo=bar><body >full","body",27); + findEndOf("<html><head goo=bar><body >full","body",27); + findEndOf("<html><head goo=bar><body yar=bam>full","body",34); + findEndOf("<html><head goo=bar><body yar='bam'>full","body",36); + findEndOf("<html><head goo=bar><body yar=\"bam\">full","body",36); + } + public void findEndOf(String page, String tag, int offset) { + StringBuilder sb = new StringBuilder(page); + int found = TagMagix.getEndOfFirstTag(sb,tag); + assertEquals("FAILED find end of " +tag+ " in ("+page+")",offset,found); + } + /** * Test method for 'org.archive.wayback.archivalurl.TagMagix.markupTag(StringBuffer, String, String, String, String, String)' */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-12 03:34:08
|
Revision: 2114 http://archive-access.svn.sourceforge.net/archive-access/?rev=2114&view=rev Author: bradtofel Date: 2007-12-11 19:34:13 -0800 (Tue, 11 Dec 2007) Log Message: ----------- FEATURE: added constants for flagging duplicate records extracted from a ResourceIndex, where one or more of a series of records may not have been saved. These flags indicate to the UI that data for a SearchResult has been annotated from another duplicate version of the same document. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java 2007-12-12 03:08:58 UTC (rev 2113) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java 2007-12-12 03:34:13 UTC (rev 2114) @@ -318,7 +318,29 @@ */ public static final String RESULT_CLOSEST_INDICATOR = "closest"; public static final String RESULT_CLOSEST_VALUE = "true"; + + /** + * Result: this key being present indicates that this particular capture + * was not actually stored, and that other values within this SearchResult + * are actually values from a different record which *should* be identical + * to this capture, had it been stored. + */ + public static final String RESULT_DUPLICATE_ANNOTATION = "duplicate"; + + /** + * flag indicates that this document was downloaded and verified as + * identical to a previous capture by digest. + */ + public static final String RESULT_DUPLICATE_DIGEST = "digest"; + + /** + * flag indicates that this document was NOT downloaded, but that the + * origin server indicated that the document had not changed, based on + * If-Modified HTTP request headers. 
+ */ + public static final String RESULT_DUPLICATE_HTTP = "http"; + /** * Name of configuration in web.xml for maximum number of results to return * in index searches. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2113 http://archive-access.svn.sourceforge.net/archive-access/?rev=2113&view=rev Author: bradtofel Date: 2007-12-11 19:08:58 -0800 (Tue, 11 Dec 2007) Log Message: ----------- REMOVE: all this code has been refactored into the new ReplayRendererDispatcher, and the various ReplayRenderer implementations. Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/BaseReplayRenderer.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/BaseReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/BaseReplayRenderer.java 2007-12-12 03:06:53 UTC (rev 2112) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/BaseReplayRenderer.java 2007-12-12 03:08:58 UTC (rev 2113) @@ -1,524 +0,0 @@ -/* BaseReplayRenderer - * - * $Id$ - * - * Created on 12:35:07 PM Apr 24, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.replay; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.util.Iterator; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import javax.servlet.RequestDispatcher; -import javax.servlet.ServletException; -import javax.servlet.ServletOutputStream; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpServletResponse; - -import org.archive.wayback.ReplayRenderer; -import org.archive.wayback.ResultURIConverter; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.Resource; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.core.SearchResults; -import org.archive.wayback.core.UIResults; -import org.archive.wayback.core.WaybackRequest; -import org.archive.wayback.exception.WaybackException; -import org.mozilla.universalchardet.UniversalDetector; - - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class BaseReplayRenderer implements ReplayRenderer { - - // in several places, this class defers generation of client responses - // to a .jsp file, once the business logic of replaying is done. - - private String errorJsp = "/jsp/HTMLError.jsp"; - private String imageErrorJsp = "/jsp/HTMLError.jsp"; - private String javascriptErrorJsp = "/jsp/JavaScriptError.jsp"; - private String cssErrorJsp = "/jsp/CSSError.jsp"; - - // if documents are marked up before sending to clients, the data is - // decoded into a String in chunks. This is how big a chunk to decode with. 
- private final static int C_BUFFER_SIZE = 4096; - - // hand off this many bytes to the chardet library - private final static int MAX_CHARSET_READAHEAD = 65536; - - // ...and if the chardet library fails, use the Content-Type header - private final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type"; - - // ...if it also includes "charset=" - private final static String CHARSET_TOKEN = "charset="; - - private final static int BYTE_BUFFER_SIZE = 4096; - - protected final Pattern IMAGE_REGEX = Pattern - .compile(".*\\.(jpg|jpeg|gif|png|bmp|tiff|tif)$"); - - /* ERROR HANDLING RESPONSES: */ - - private boolean requestIsEmbedded(HttpServletRequest httpRequest, - WaybackRequest wbRequest) { - // without a wbRequest, assume it is not embedded: send back HTML - if(wbRequest == null) { - return false; - } - String referer = wbRequest.get(WaybackConstants.REQUEST_REFERER_URL); - return (referer != null && referer.length() > 0); - } - - private boolean requestIsImage(HttpServletRequest httpRequest, - WaybackRequest wbRequest) { - String requestUrl = wbRequest.get(WaybackConstants.REQUEST_URL); - if (requestUrl == null) - return false; - Matcher matcher = IMAGE_REGEX.matcher(requestUrl); - return (matcher != null && matcher.matches()); - } - - private boolean requestIsJavascript(HttpServletRequest httpRequest, - WaybackRequest wbRequest) { - - String requestUrl = wbRequest.get(WaybackConstants.REQUEST_URL); - return (requestUrl != null) && requestUrl.endsWith(".js"); - } - - private boolean requestIsCSS(HttpServletRequest httpRequest, - WaybackRequest wbRequest) { - - String requestUrl = wbRequest.get(WaybackConstants.REQUEST_URL); - return (requestUrl != null) && requestUrl.endsWith(".css"); - } - - /** - * @param httpRequest - * @param httpResponse - * @param wbRequest - * @param exception - * @throws ServletException - * @throws IOException - */ - public void renderException(HttpServletRequest httpRequest, - HttpServletResponse httpResponse, WaybackRequest wbRequest, - 
WaybackException exception) throws ServletException, IOException { - - // the "standard HTML" response handler: - String finalJspPath = errorJsp; - - // try to not cause client errors by sending the HTML response if - // this request is ebedded, and is obviously one of the special types: - if (requestIsEmbedded(httpRequest, wbRequest)) { - - if (requestIsJavascript(httpRequest, wbRequest)) { - - finalJspPath = javascriptErrorJsp; - - } else if (requestIsCSS(httpRequest, wbRequest)) { - - finalJspPath = cssErrorJsp; - - } else if (requestIsImage(httpRequest, wbRequest)) { - - finalJspPath = imageErrorJsp; - - } - } - - httpRequest.setAttribute("exception", exception); - UIResults uiResults = new UIResults(wbRequest); - uiResults.storeInRequest(httpRequest, finalJspPath); - - RequestDispatcher dispatcher = httpRequest - .getRequestDispatcher(finalJspPath); - - dispatcher.forward(httpRequest, httpResponse); - } - - /* GENERIC RESPONSE HELPER METHODS: */ - - /** - * Send the raw bytes from is (presumably the Resource/ARCRecord) to - * os (presumably the clients/HTTPResponse's OutputStream) with no - * decoding. Send them all as-is. - * - * @param is - * @param os - * @throws IOException - */ - protected void copy(InputStream is, OutputStream os) throws IOException { - byte[] buffer = new byte[BYTE_BUFFER_SIZE]; - for (int r = -1; (r = is.read(buffer, 0, BYTE_BUFFER_SIZE)) != -1;) { - os.write(buffer, 0, r); - } - } - - protected boolean isExactVersionRequested(WaybackRequest wbRequest, - SearchResult result) { - - String reqDateStr = wbRequest.get(WaybackConstants.REQUEST_EXACT_DATE); - String resDateStr = result.get(WaybackConstants.RESULT_CAPTURE_DATE); - - // some capture dates are not 14 digits, only compare as many - // digits as are in the result date: - return resDateStr.equals(reqDateStr.substring(0, resDateStr.length())); - } - - /** - * test if the Resource and SearchResult should be replayed raw, without - * any markup. 
- * - * This version always indicates that the document should be returned raw, - * but is intended to be overriden. - * - * @param resource - * @param result - * @return boolean, true if the document should be returned raw. - */ - protected boolean isRawReplayResult(Resource resource, - SearchResult result) { - return true; - } - - /** - * callback function for each HTTP header. If null is returned, header is - * omitted from final response to client, otherwise, the possibly modified - * http header value is returned to the client. - * - * This version just hands back all headers transparently, but is intended - * to be overriden. - * - * @param key - * @param value - * @param uriConverter - * @param result - * @return String - */ - protected String filterHeader(final String key, final String value, - final ResultURIConverter uriConverter, SearchResult result) { - return value; - } - - /** - * Iterate over all HTTP headers in resource, possibly sending them on - * to the client. The determination as to omit, send as-is, or send modified - * is handled thru the overridable filterHeader() method. - * - * @param response - * @param resource - * @param uriConverter - * @param result - * @throws IOException - */ - protected void copyRecordHttpHeader(HttpServletResponse response, - Resource resource, ResultURIConverter uriConverter, - SearchResult result) throws IOException { - Map<String,String> headers = resource.getHttpHeaders(); - int code = resource.getStatusCode(); - // Only return legit status codes -- don't return any minus - // codes, etc. 
- if (code <= HttpServletResponse.SC_CONTINUE) { - String identifier = ""; - response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, - "Bad status code " + code + " (" + identifier + ")."); - return; - } - response.setStatus(code); - if (headers != null) { - Iterator<String> itr = headers.keySet().iterator(); - while(itr.hasNext()) { - String key = itr.next(); - String value = headers.get(key); - String finalValue = value; - if (value != null) { - finalValue = filterHeader(key, value, uriConverter, result); - if (finalValue == null) { - continue; - } - } - response.setHeader(key, (finalValue == null) ? "" : finalValue); - } - } - } - - private String contentTypeToCharset(final String contentType) { - int offset = contentType.indexOf(CHARSET_TOKEN); - if (offset != -1) { - return contentType.substring(offset + CHARSET_TOKEN.length()); - } - return null; - } - - /** - * Attempt to divine the character encoding of the document from the - * Content-Type HTTP header (with a "charset=") - * - * @param resource - * @return String character set found or null if the header was not present - * @throws IOException - */ - protected String getCharsetFromHeaders(Resource resource) - throws IOException { - - String charsetName = null; - - Map<String,String> httpHeaders = resource.getHttpHeaders(); - String ctype = httpHeaders.get(HTTP_CONTENT_TYPE_HEADER); - if (ctype != null) { - charsetName = contentTypeToCharset(ctype); - } - return charsetName; - } - - /** - * Attempt to find a META tag in the HTML that hints at the character set - * used to write the document. 
- * - * @param resource - * @return String character set found from META tags in the HTML - * @throws IOException - */ - protected String getCharsetFromMeta(Resource resource) throws IOException { - String charsetName = null; - - byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; - resource.mark(MAX_CHARSET_READAHEAD); - resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); - resource.reset(); - // convert to UTF-8 String -- which hopefully will not mess up the - // characters we're interested in... - StringBuilder sb = new StringBuilder(new String(bbuffer,"UTF-8")); - String metaContentType = TagMagix.getTagAttrWhere(sb, "META", - "content", "http-equiv", "Content-Type"); - if(metaContentType != null) { - charsetName = contentTypeToCharset(metaContentType); - } - return charsetName; - } - - /** - * Attempts to figure out the character set of the document using - * the excellent juniversalchardet library. - * - * @param resource - * @return String character encoding found, or null if nothing looked good. - * @throws IOException - */ - protected String getCharsetFromBytes(Resource resource) throws IOException { - String charsetName = null; - - byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; - // (1) - UniversalDetector detector = new UniversalDetector(null); - - // (2) - resource.mark(MAX_CHARSET_READAHEAD); - int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); - resource.reset(); - detector.handleData(bbuffer, 0, len); - // (3) - detector.dataEnd(); - // (4) - charsetName = detector.getDetectedCharset(); - - // (5) - detector.reset(); - - return charsetName; - } - - /** - * Use META tags, byte-character-detection, HTTP headers, hope, and prayer - * to figure out what character encoding is being used for the document. - * If nothing else works, assumes UTF-8 for now. 
- * - * @param resource - * @return String charset for Resource - * @throws IOException - */ - protected String getCharset(Resource resource) throws IOException { - - String charSet = getCharsetFromMeta(resource); - if(charSet == null) { - charSet = getCharsetFromBytes(resource); - if(charSet == null) { - charSet = getCharsetFromHeaders(resource); - if(charSet == null) { - charSet = "UTF-8"; - } - } - } - return charSet; - } - - /** - * Do "stuff" to the StringBuilder page argument. - * - * This version does nothing at all, but is intended to be overridden. - * - * @param page - * @param httpRequest - * @param httpResponse - * @param wbRequest - * @param result - * @param resource - * @param uriConverter - */ - protected void markUpPage(StringBuilder page, - HttpServletRequest httpRequest, HttpServletResponse httpResponse, - WaybackRequest wbRequest, SearchResult result, Resource resource, - ResultURIConverter uriConverter) { - - } - - - /* (non-Javadoc) - * @see org.archive.wayback.ReplayRenderer#renderResource(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse, org.archive.wayback.core.WaybackRequest, org.archive.wayback.core.SearchResult, org.archive.wayback.core.Resource, org.archive.wayback.ResultURIConverter) - */ - public void renderResource(HttpServletRequest httpRequest, - HttpServletResponse httpResponse, WaybackRequest wbRequest, - SearchResult result, Resource resource, - ResultURIConverter uriConverter, SearchResults results) throws ServletException, - IOException { - - if (resource == null) { - throw new IllegalArgumentException("No resource"); - } - if (result == null) { - throw new IllegalArgumentException("No result"); - } - - if (isRawReplayResult(resource,result)) { - - resource.parseHeaders(); - copyRecordHttpHeader(httpResponse, resource, uriConverter, result); - copy(resource, httpResponse.getOutputStream()); - - } else { - - // We're going to do some markup on the page. 
- // first we'll need to convert the bytes to a String, which - // includes character encoding detection, then we'll call into the - // overridable markUpPage(), then we'll convert back to bytes and - // return them to the client - - resource.parseHeaders(); - copyRecordHttpHeader(httpResponse, resource, uriConverter, result); - - int recordLength = (int) resource.getRecordLength(); - - // get the charset: - String charSet = getCharset(resource); - - // convert bytes to characters for charset: - InputStreamReader isr = new InputStreamReader(resource, charSet); - - char[] cbuffer = new char[C_BUFFER_SIZE]; - - // slurp the whole thing into RAM: - StringBuilder sbuffer = new StringBuilder(recordLength); - for (int r = -1; (r = isr.read(cbuffer, 0, C_BUFFER_SIZE)) != -1;) { - sbuffer.append(cbuffer, 0, r); - } - - // do the "usual" markup: - markUpPage(sbuffer, httpRequest, httpResponse, wbRequest, result, - resource, uriConverter); - - // back to bytes... - byte[] ba = sbuffer.toString().getBytes(charSet); - - // inform browser how much is coming back: - httpResponse.setHeader("Content-Length", String.valueOf(ba.length)); - - // and send it out the door... 
- ServletOutputStream out = httpResponse.getOutputStream(); - out.write(ba); - } - } - - /** - * @return the errorJsp - */ - public String getErrorJsp() { - return errorJsp; - } - - /** - * @param errorJsp the errorJsp to set - */ - public void setErrorJsp(String errorJsp) { - this.errorJsp = errorJsp; - } - - /** - * @return the imageErrorJsp - */ - public String getImageErrorJsp() { - return imageErrorJsp; - } - - /** - * @param imageErrorJsp the imageErrorJsp to set - */ - public void setImageErrorJsp(String imageErrorJsp) { - this.imageErrorJsp = imageErrorJsp; - } - - /** - * @return the javascriptErrorJsp - */ - public String getJavascriptErrorJsp() { - return javascriptErrorJsp; - } - - /** - * @param javascriptErrorJsp the javascriptErrorJsp to set - */ - public void setJavascriptErrorJsp(String javascriptErrorJsp) { - this.javascriptErrorJsp = javascriptErrorJsp; - } - - /** - * @return the cssErrorJsp - */ - public String getCssErrorJsp() { - return cssErrorJsp; - } - - /** - * @param cssErrorJsp the cssErrorJsp to set - */ - public void setCssErrorJsp(String cssErrorJsp) { - this.cssErrorJsp = cssErrorJsp; - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2112 http://archive-access.svn.sourceforge.net/archive-access/?rev=2112&view=rev Author: bradtofel Date: 2007-12-11 19:06:53 -0800 (Tue, 11 Dec 2007) Log Message: ----------- REFACTOR: moved HTTP header constants into this class. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderProcessor.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderProcessor.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderProcessor.java 2007-12-12 03:06:10 UTC (rev 2111) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderProcessor.java 2007-12-12 03:06:53 UTC (rev 2112) @@ -36,6 +36,18 @@ * @version $Date$, $Revision$ */ public interface HttpHeaderProcessor { + + public final static String HTTP_LENGTH_HEADER = "Content-Length"; + public final static String HTTP_LENGTH_HEADER_UP = + HTTP_LENGTH_HEADER.toUpperCase(); + + public final static String HTTP_LOCATION_HEADER = "Location"; + public final static String HTTP_LOCATION_HEADER_UP = + HTTP_LOCATION_HEADER.toUpperCase(); + + public final static String HTTP_CONTENT_BASE_HEADER = "Content-Base"; + public final static String HTTP_CONTENT_BASE_HEADER_UP = + HTTP_CONTENT_BASE_HEADER.toUpperCase(); /** * optionally add header key:value to output for later returning to client This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-12 03:06:08
|
Revision: 2111 http://archive-access.svn.sourceforge.net/archive-access/?rev=2111&view=rev Author: bradtofel Date: 2007-12-11 19:06:10 -0800 (Tue, 11 Dec 2007) Log Message: ----------- BUGFIX: now rewriting Location and Content-Base headers in non HTML documents FEATURE: Added Server-Side rendering capability to normal ArchivalUrl mode. Now JS inserts are all handled through .jsp inserts, which include page specific variables, and reference to the common .js file which uses those variables. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayDispatcher.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlTransparentReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayDispatcher.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayDispatcher.java 2007-12-12 02:19:12 UTC (rev 2110) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayDispatcher.java 2007-12-12 03:06:10 UTC (rev 2111) @@ -33,7 +33,6 @@ import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.replay.BaseReplayDispatcher; import org.archive.wayback.replay.DateRedirectReplayRenderer; -import org.archive.wayback.replay.TransparentReplayRenderer; /** * @@ -54,7 +53,9 @@ // TODO: make this configurable private final static long MAX_HTML_MARKUP_LENGTH = 1024 * 1024 * 5; - private ReplayRenderer transparent = new TransparentReplayRenderer(); + private ReplayRenderer transparent = + new 
ArchivalUrlTransparentReplayRenderer(); + private ReplayRenderer redirect = new DateRedirectReplayRenderer(); private ArchivalUrlReplayRenderer archivalHTML = new ArchivalUrlReplayRenderer(); @@ -75,6 +76,8 @@ return redirect; } + // TODO: handle .css docs -- embedded URLs there need to be fixed + // HTML and XHTML docs smaller than some size get marked up as HTML if (resource.getRecordLength() < MAX_HTML_MARKUP_LENGTH) { @@ -123,4 +126,20 @@ public void setJspInserts(List<String> jspInserts) { archivalHTML.setJspInserts(jspInserts); } + + /** + * @return + * @see org.archive.wayback.archivalurl.ArchivalUrlReplayRenderer#isServerSideRendering() + */ + public boolean isServerSideRendering() { + return archivalHTML.isServerSideRendering(); + } + + /** + * @param isServerSideRendering + * @see org.archive.wayback.archivalurl.ArchivalUrlReplayRenderer#setServerSideRendering(boolean) + */ + public void setServerSideRendering(boolean isServerSideRendering) { + archivalHTML.setServerSideRendering(isServerSideRendering); + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java 2007-12-12 02:19:12 UTC (rev 2110) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java 2007-12-12 03:06:10 UTC (rev 2111) @@ -25,7 +25,6 @@ package org.archive.wayback.archivalurl; import java.io.IOException; -import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -36,17 +35,14 @@ import org.archive.wayback.ReplayRenderer; import org.archive.wayback.ResultURIConverter; -import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.Resource; import 
org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; -import org.archive.wayback.core.Timestamp; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadContentException; import org.archive.wayback.replay.HTMLPage; import org.archive.wayback.replay.HttpHeaderProcessor; import org.archive.wayback.replay.HttpHeaderOperation; -import org.archive.wayback.util.StringFormatter; import org.archive.wayback.util.UrlCanonicalizer; /** @@ -61,20 +57,10 @@ */ public class ArchivalUrlReplayRenderer implements ReplayRenderer, HttpHeaderProcessor { - private final static String HTTP_LENGTH_HEADER = "Content-Length"; - private final static String HTTP_LENGTH_HEADER_UP = - HTTP_LENGTH_HEADER.toUpperCase(); - private final static String HTTP_LOCATION_HEADER = "Location"; - private final static String HTTP_LOCATION_HEADER_UP = - HTTP_LOCATION_HEADER.toUpperCase(); - - private final static String HTTP_CONTENT_BASE_HEADER = "Content-Length"; - private final static String HTTP_CONTENT_BASE_HEADER_UP = - HTTP_CONTENT_BASE_HEADER.toUpperCase(); - private List<String> jsInserts = null; private List<String> jspInserts = null; + private boolean serverSideRendering = false; /* (non-Javadoc) * @see org.archive.wayback.ReplayRenderer#renderResource(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse, org.archive.wayback.core.WaybackRequest, org.archive.wayback.core.SearchResult, org.archive.wayback.core.Resource, org.archive.wayback.ResultURIConverter, org.archive.wayback.core.SearchResults) @@ -85,41 +71,23 @@ ResultURIConverter uriConverter, SearchResults results) throws ServletException, IOException, BadContentException { - resource.parseHeaders(); - + StringBuilder toInsert = new StringBuilder(300); + HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse); - + Map<String,String> headers = HttpHeaderOperation.processHeaders( resource, result, uriConverter, this); + // Load content into an 
HTML page, and resolve load-time URLs: HTMLPage page = new HTMLPage(resource,result,uriConverter); page.readFully(); - page.resolvePageUrls(); - // generate JS insert: - StringFormatter fmt = wbRequest.getFormatter(); - - String resourceTS = result.getCaptureDate(); - String resourceUrl = result.get(WaybackConstants.RESULT_URL); - Timestamp captureTS = Timestamp.parseBefore(resourceTS); - Date captureDate = captureTS.getDate(); - String contextPath = uriConverter.makeReplayURI(resourceTS, ""); - - - StringBuilder toInsert = new StringBuilder(300); - - toInsert.append("<script type=\"text/javascript\">\n\n"); - toInsert.append(fmt.format("ReplayView.javaScriptComment", captureDate, - new Date())); - String wmNotice = fmt.format("ReplayView.banner", resourceUrl, - captureDate); - String wmHideNotice = fmt.format("ReplayView.bannerHideLink"); - toInsert.append("var sWayBackCGI = \"" + contextPath + "\";\n"); - toInsert.append("var wmNotice = \"" + wmNotice + "\";\n"); - toInsert.append("var wmHideNotice = \"" + wmHideNotice + "\";\n"); - toInsert.append("</script>\n"); - + if(serverSideRendering) { + page.resolveAllPageUrls(); + } else { + page.resolvePageUrls(); + } if(jsInserts != null) { Iterator<String> itr = jsInserts.iterator(); while(itr.hasNext()) { @@ -134,16 +102,20 @@ } } - // add the javascript, and dump the result out to the client: - page.insertAtEndOfBody(toInsert.toString()); + // insert the new content: + if(serverSideRendering) { + page.insertAtStartOfBody(toInsert.toString()); + } else { + page.insertAtEndOfBody(toInsert.toString()); + } + + // set the corrected length: + int bytes = page.getBytes().length; + headers.put(HTTP_LENGTH_HEADER, String.valueOf(bytes)); // send back the headers: HttpHeaderOperation.sendHeaders(headers, httpResponse); - // plus the corrected length: - int bytes = page.getBytes().length; - headers.put(HTTP_LENGTH_HEADER, String.valueOf(bytes)); - page.writeToOutputStream(httpResponse.getOutputStream()); } @@ -205,4 +177,18 
@@ public void setJspInserts(List<String> jspInserts) { this.jspInserts = jspInserts; } + + /** + * @return the isServerSideRendering + */ + public boolean isServerSideRendering() { + return serverSideRendering; + } + + /** + * @param isServerSideRendering the isServerSideRendering to set + */ + public void setServerSideRendering(boolean serverSideRendering) { + this.serverSideRendering = serverSideRendering; + } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlTransparentReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlTransparentReplayRenderer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlTransparentReplayRenderer.java 2007-12-12 03:06:10 UTC (rev 2111) @@ -0,0 +1,45 @@ +package org.archive.wayback.archivalurl; + +import java.util.Map; + +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.replay.TransparentReplayRenderer; +import org.archive.wayback.util.UrlCanonicalizer; + +/** + * Slight extension to TransparentReplayRenderer, which rewrites Location and + * Content-Base HTTP headers as they go out. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class ArchivalUrlTransparentReplayRenderer +extends TransparentReplayRenderer { + + /* (non-Javadoc) + * @see org.archive.wayback.replay.HeaderFilter#filter(java.util.Map, java.lang.String, java.lang.String, org.archive.wayback.ResultURIConverter, org.archive.wayback.core.SearchResult) + */ + public void filter(Map<String, String> output, String key, String value, + ResultURIConverter uriConverter, SearchResult result) { + + String keyUp = key.toUpperCase(); + + // rewrite Location header URLs + if (keyUp.startsWith(HTTP_LOCATION_HEADER_UP) || + keyUp.startsWith(HTTP_CONTENT_BASE_HEADER_UP)) { + + String baseUrl = result.getAbsoluteUrl(); + String cd = result.getCaptureDate(); + // by the spec, these should be absolute already, but just in case: + String u = UrlCanonicalizer.resolveUrl(baseUrl, value); + + output.put(key, uriConverter.makeReplayURI(cd,u)); + + } else { + // others go out as-is: + + output.put(key, value); + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-12 02:19:07
|
Revision: 2110 http://archive-access.svn.sourceforge.net/archive-access/?rev=2110&view=rev Author: bradtofel Date: 2007-12-11 18:19:12 -0800 (Tue, 11 Dec 2007) Log Message: ----------- BUGFIX: (unreported) no longer rewrite mailto: and javascript: URLs in full server-side rewrite mode Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2007-12-12 02:15:09 UTC (rev 2109) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2007-12-12 02:19:12 UTC (rev 2110) @@ -214,39 +214,64 @@ String captureDate = result.getCaptureDate(); String existingBaseHref = TagMagix.getBaseHref(sb); - if (existingBaseHref != null) { + if (existingBaseHref == null) { + insertAtStartOfHead("<base href=\"" + pageUrl + "\" />"); + } else { pageUrl = existingBaseHref; } - TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, - "FRAME", "SRC"); -// TagMagix.markupTagREURIC(page, uriConverter, captureDate, pageUrl, -// "IFRAME", "SRC"); - TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, - "META", "URL"); - TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, - "LINK", "HREF"); + String markups[][] = { + {"FRAME","SRC"}, + {"META","URL"}, + {"LINK","HREF"}, + {"SCRIPT","SRC"} + }; // TODO: The classic WM added a js_ to the datespec, so NotInArchives // can return an valid javascript doc, and not cause Javascript errors. 
- TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, - "SCRIPT", "SRC"); - - if (existingBaseHref == null) { - String baseTag = "<base href=\"" + pageUrl + "\" />"; - int insertPoint = sb.indexOf("<head>"); - if (-1 == insertPoint) { - insertPoint = sb.indexOf("<HEAD>"); - } - if (-1 == insertPoint) { - insertPoint = 0; - } else { - insertPoint += 6; // just after the tag - } - sb.insert(insertPoint, baseTag); + for(String tagAttr[] : markups) { + TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, + tagAttr[0], tagAttr[1]); } } + /** + * Update all URLs inside the page, so they resolve correctly to absolute + * URLs within the Wayback service. + */ + public void resolveAllPageUrls() { + // TODO: get url from Resource instead of SearchResult? + String pageUrl = result.getAbsoluteUrl(); + String captureDate = result.getCaptureDate(); + + String existingBaseHref = TagMagix.getBaseHref(sb); + if (existingBaseHref != null) { + pageUrl = existingBaseHref; + } + ResultURIConverter ruc = new SpecialResultURIConverter(uriConverter); + + // TODO: forms...? 
+ String markups[][] = { + {"FRAME","SRC"}, + {"META","URL"}, + {"LINK","HREF"}, + {"SCRIPT","SRC"}, + {"IMG","SRC"}, + {"A","HREF"}, + {"AREA","HREF"}, + {"OBJECT","CODEBASE"}, + {"OBJECT","CDATA"}, + {"APPLET","CODEBASE"}, + {"APPLET","ARCHIVE"}, + {"EMBED","SRC"}, + {"IFRAME","SRC"}, + {"BODY","BACKGROUND"}, + }; + for(String tagAttr[] : markups) { + TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, + tagAttr[0], tagAttr[1]); + } + } /** * @param charSet * @throws IOException @@ -310,9 +335,20 @@ } os.write(b); } - + /** * @param toInsert + */ + public void insertAtStartOfHead(String toInsert) { + int insertPoint = TagMagix.getEndOfFirstTag(sb,"head"); + if (-1 == insertPoint) { + insertPoint = 0; + } + sb.insert(insertPoint,toInsert); + } + + /** + * @param toInsert */ public void insertAtEndOfBody(String toInsert) { int insertPoint = sb.lastIndexOf("</body>"); @@ -325,6 +361,16 @@ sb.insert(insertPoint,toInsert); } /** + * @param toInsert + */ + public void insertAtStartOfBody(String toInsert) { + int insertPoint = TagMagix.getEndOfFirstTag(sb,"body"); + if (-1 == insertPoint) { + insertPoint = 0; + } + sb.insert(insertPoint,toInsert); + } + /** * @param jspPath * @param httpRequest * @param httpResponse @@ -373,4 +419,22 @@ public void setCharSet(String charSet) { this.charSet = charSet; } + + private class SpecialResultURIConverter implements ResultURIConverter { + private static final String EMAIL_PROTOCOL_PREFIX = "mailto:"; + private static final String JAVASCRIPT_PROTOCOL_PREFIX = "javascript:"; + private ResultURIConverter base = null; + public SpecialResultURIConverter(ResultURIConverter base) { + this.base = base; + } + public String makeReplayURI(String datespec, String url) { + if(url.startsWith(EMAIL_PROTOCOL_PREFIX)) { + return url; + } + if(url.startsWith(JAVASCRIPT_PROTOCOL_PREFIX)) { + return url; + } + return base.makeReplayURI(datespec, url); + } + } } This was sent by the SourceForge.net collaborative development platform, the world's 
largest Open Source development site. |
From: <bra...@us...> - 2007-12-12 02:15:05
|
Revision: 2109 http://archive-access.svn.sourceforge.net/archive-access/?rev=2109&view=rev Author: bradtofel Date: 2007-12-11 18:15:09 -0800 (Tue, 11 Dec 2007) Log Message: ----------- FEATURE: added static getEndOfFirstTag() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2007-12-12 02:12:48 UTC (rev 2108) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2007-12-12 02:15:09 UTC (rev 2109) @@ -283,4 +283,14 @@ public static String getBaseHref(StringBuilder page) { return getTagAttr(page, "BASE", "HREF"); } + + public static int getEndOfFirstTag(StringBuilder page, String tag) { + Pattern tagPattern = getWholeTagPattern(tag); + Matcher tagMatcher = tagPattern.matcher(page); + int offset = -1; + if(tagMatcher.find()) { + offset = tagMatcher.end(); + } + return offset; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2108 http://archive-access.svn.sourceforge.net/archive-access/?rev=2108&view=rev Author: bradtofel Date: 2007-12-11 18:12:48 -0800 (Tue, 11 Dec 2007) Log Message: ----------- TWEAK: comment + variable name changes Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2007-12-11 22:31:26 UTC (rev 2107) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2007-12-12 02:12:48 UTC (rev 2108) @@ -35,10 +35,10 @@ /** - * Implements ResourceStore where ARCs are accessed via HTTP 1.1 range requests. - * All ARC files are assumed to be "rooted" at a particular HTTP URL, within - * a single directory, implying an ARC file reverse-proxy to connect through - * to actual HTTP ARC locations. + * Implements ResourceStore where ARC/WARCs are accessed via HTTP 1.1 range + * requests. All files are assumed to be "rooted" at a particular HTTP URL, + * within a single directory, implying a file reverse-proxy to connect through + * to actual HTTP ARC/WARC locations. 
* * @author brad * @version $Date$, $Revision$ @@ -51,24 +51,24 @@ public Resource retrieveResource(SearchResult result) throws IOException, ResourceNotAvailableException { - // extract ARC filename + add .arc.gz if it is not present - String arcName = result.get(WaybackConstants.RESULT_ARC_FILE); - if(arcName == null || arcName.length() < 1) { + // extract ARC filename + String fileName = result.get(WaybackConstants.RESULT_ARC_FILE); + if(fileName == null || fileName.length() < 1) { throw new IOException("No ARC/WARC name in search result..."); } - // extract ARC offset + convert to long + // extract offset + convert to long final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); if(offsetString == null || offsetString.length() < 1) { throw new IOException("No ARC/WARC offset in search result..."); } final long offset = Long.parseLong(offsetString); - String arcUrl = urlPrefix + arcName; + String fileUrl = urlPrefix + fileName; Resource r = null; try { - r = ResourceFactory.getResource(new URL(arcUrl), offset); + r = ResourceFactory.getResource(new URL(fileUrl), offset); } catch (IOException e) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-11 22:31:22
|
Revision: 2107 http://archive-access.svn.sourceforge.net/archive-access/?rev=2107&view=rev Author: bradtofel Date: 2007-12-11 14:31:26 -0800 (Tue, 11 Dec 2007) Log Message: ----------- RENAME: HttpARCResourceStore => Http11ResourceStore Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java (from rev 2105, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2007-12-11 22:31:26 UTC (rev 2107) @@ -0,0 +1,95 @@ +/* HttpARCResourceStore + * + * $Id$ + * + * Created on 5:29:56 PM Oct 12, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore; + +import java.io.IOException; +import java.net.URL; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ResourceNotAvailableException; + + +/** + * Implements ResourceStore where ARCs are accessed via HTTP 1.1 range requests. + * All ARC files are assumed to be "rooted" at a particular HTTP URL, within + * a single directory, implying an ARC file reverse-proxy to connect through + * to actual HTTP ARC locations. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class Http11ResourceStore implements ResourceStore { + + private String urlPrefix = null; + + + public Resource retrieveResource(SearchResult result) throws IOException, + ResourceNotAvailableException { + + // extract ARC filename + add .arc.gz if it is not present + String arcName = result.get(WaybackConstants.RESULT_ARC_FILE); + if(arcName == null || arcName.length() < 1) { + throw new IOException("No ARC/WARC name in search result..."); + } + + // extract ARC offset + convert to long + final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); + if(offsetString == null || offsetString.length() < 1) { + throw new IOException("No ARC/WARC offset in search result..."); + } + final long offset = Long.parseLong(offsetString); + + String arcUrl = urlPrefix + arcName; + Resource r = null; + try { + + r = ResourceFactory.getResource(new URL(arcUrl), offset); + + } catch (IOException e) { + + e.printStackTrace(); + throw new ResourceNotAvailableException("Unable to retrieve", + e.getLocalizedMessage()); + } + return r; + } + + /** + * @return the urlPrefix + */ + 
public String getUrlPrefix() { + return urlPrefix; + } + + /** + * @param urlPrefix the urlPrefix to set + */ + public void setUrlPrefix(String urlPrefix) { + this.urlPrefix = urlPrefix; + } +} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-12-11 22:28:41 UTC (rev 2106) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-12-11 22:31:26 UTC (rev 2107) @@ -1,95 +0,0 @@ -/* HttpARCResourceStore - * - * $Id$ - * - * Created on 5:29:56 PM Oct 12, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.net.URL; - -import org.archive.wayback.ResourceStore; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.Resource; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.exception.ResourceNotAvailableException; - - -/** - * Implements ResourceStore where ARCs are accessed via HTTP 1.1 range requests. - * All ARC files are assumed to be "rooted" at a particular HTTP URL, within - * a single directory, implying an ARC file reverse-proxy to connect through - * to actual HTTP ARC locations. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class HttpARCResourceStore implements ResourceStore { - - private String urlPrefix = null; - - - public Resource retrieveResource(SearchResult result) throws IOException, - ResourceNotAvailableException { - - // extract ARC filename + add .arc.gz if it is not present - String arcName = result.get(WaybackConstants.RESULT_ARC_FILE); - if(arcName == null || arcName.length() < 1) { - throw new IOException("No ARC/WARC name in search result..."); - } - - // extract ARC offset + convert to long - final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); - if(offsetString == null || offsetString.length() < 1) { - throw new IOException("No ARC/WARC offset in search result..."); - } - final long offset = Long.parseLong(offsetString); - - String arcUrl = urlPrefix + arcName; - Resource r = null; - try { - - r = ResourceFactory.getResource(new URL(arcUrl), offset); - - } catch (IOException e) { - - e.printStackTrace(); - throw new ResourceNotAvailableException("Unable to retrieve", - e.getLocalizedMessage()); - } - return r; - } - - /** - * @return the urlPrefix - */ - 
public String getUrlPrefix() { - return urlPrefix; - } - - /** - * @param urlPrefix the urlPrefix to set - */ - public void setUrlPrefix(String urlPrefix) { - this.urlPrefix = urlPrefix; - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2106 http://archive-access.svn.sourceforge.net/archive-access/?rev=2106&view=rev Author: bradtofel Date: 2007-12-11 14:28:41 -0800 (Tue, 11 Dec 2007) Log Message: ----------- FEATURE: Now can create ARC/WARC Resources from URL + offset REFACTOR: moved common code into private methods Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2007-12-11 22:27:08 UTC (rev 2105) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2007-12-11 22:28:41 UTC (rev 2106) @@ -2,6 +2,7 @@ import java.io.File; import java.io.IOException; +import java.net.URL; import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCReader; @@ -13,42 +14,92 @@ import org.archive.wayback.core.Resource; import org.archive.wayback.exception.ResourceNotAvailableException; +/** + * Static factory class for constructing ARC/WARC Resources from + * File/URL + offset. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ public class ResourceFactory { - - public static Resource getResource(File file, long offset) - throws IOException, ResourceNotAvailableException { + public static Resource getResource(File file, long offset) + throws IOException, ResourceNotAvailableException { + Resource r = null; String name = file.getName(); - if(name.endsWith(LocalResourceStore.OPEN_EXTENSION)) { - name = name.substring(0, name.length() - - LocalResourceStore.OPEN_EXTENSION.length()); + if (name.endsWith(LocalResourceStore.OPEN_EXTENSION)) { + name = name.substring(0, name.length() + - LocalResourceStore.OPEN_EXTENSION.length()); } - if(name.endsWith(LocalResourceStore.ARC_EXTENSION) || - name.endsWith(LocalResourceStore.ARC_GZ_EXTENSION)) { + if (isArc(name)) { ARCReader reader = ARCReaderFactory.get(file); - ArchiveRecord rec = reader.get(offset); - if(!(rec instanceof ARCRecord)) { - throw new ResourceNotAvailableException("Bad ARCRecord format"); - } - ArcResource ar = new ArcResource((ARCRecord) rec, reader); - ar.parseHeaders(); - r = ar; + r = ARCArchiveRecordToResource(reader.get(offset),reader); - } else if(name.endsWith(LocalResourceStore.WARC_EXTENSION) || - name.endsWith(LocalResourceStore.WARC_GZ_EXTENSION)) { + } else if (isWarc(name)) { WARCReader reader = WARCReaderFactory.get(file); - ArchiveRecord rec = reader.get(offset); - if(!(rec instanceof WARCRecord)) { - throw new ResourceNotAvailableException("Bad WARCRecord format"); - } - WarcResource wr = new WarcResource((WARCRecord) rec, reader); - wr.parseHeaders(); - r = wr; - } - + r = WARCArchiveRecordToResource(reader.get(offset),reader); + + } else { + throw new ResourceNotAvailableException("Unknown extension"); + } + return r; } + + public static Resource getResource(URL url, long offset) + throws IOException, ResourceNotAvailableException { + Resource r = null; + String name = url.getFile(); + if (isArc(name)) { + + ARCReader reader = ARCReaderFactory.get(url, 
offset); + r = ARCArchiveRecordToResource(reader.get(),reader); + + } else if (isWarc(name)) { + + WARCReader reader = WARCReaderFactory.get(url, offset); + r = WARCArchiveRecordToResource(reader.get(),reader); + + } else { + throw new ResourceNotAvailableException("Unknown extension"); + } + return r; + } + + private static boolean isArc(final String name) { + + return (name.endsWith(LocalResourceStore.ARC_EXTENSION) + || name.endsWith(LocalResourceStore.ARC_GZ_EXTENSION)); + } + + private static boolean isWarc(final String name) { + + return (name.endsWith(LocalResourceStore.WARC_EXTENSION) + || name.endsWith(LocalResourceStore.WARC_GZ_EXTENSION)); + } + + private static Resource ARCArchiveRecordToResource(ArchiveRecord rec, + ARCReader reader) throws ResourceNotAvailableException, IOException { + + if (!(rec instanceof ARCRecord)) { + throw new ResourceNotAvailableException("Bad ARCRecord format"); + } + ArcResource ar = new ArcResource((ARCRecord) rec, reader); + ar.parseHeaders(); + return ar; + } + + private static Resource WARCArchiveRecordToResource(ArchiveRecord rec, + WARCReader reader) throws ResourceNotAvailableException, IOException { + + if (!(rec instanceof WARCRecord)) { + throw new ResourceNotAvailableException("Bad WARCRecord format"); + } + WarcResource wr = new WarcResource((WARCRecord) rec, reader); + wr.parseHeaders(); + return wr; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |