You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2011-06-16 17:26:38
|
Revision: 3478 http://archive-access.svn.sourceforge.net/archive-access/?rev=3478&view=rev Author: bradtofel Date: 2011-06-16 17:26:31 +0000 (Thu, 16 Jun 2011) Log Message: ----------- FEATURE: abstracted out fetching of byte chunks from local/remote files moved current code into Http11BlockLoader, which now uses a multithreaded HTTP connection manager to reuse connections implemented an HDFS BlockLoader Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLoader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/RemoteHttp11BlockLoader.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLoader.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -0,0 +1,19 @@ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; + +public interface BlockLoader { + /** + * Fetch a range of bytes from a particular URL. 
Note that the bytes are + * read into memory all at once, so care should be taken with the length + * argument. + * + * @param url String URL to fetch + * @param offset byte start offset of the desired range + * @param length number of octets to fetch + * @return a new byte[] containing the octets fetched + * @throws IOException on Network and protocol failures, as well as Timeouts + */ + public byte[] getBlock(String url, long offset, int length) + throws IOException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -0,0 +1,46 @@ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +public class HDFSBlockLoader implements BlockLoader { + FileSystem fs = null; + String defaultFSURI = null; + public HDFSBlockLoader(String defaultFSURI) { + this.defaultFSURI = defaultFSURI; + } + public void init() throws IOException, URISyntaxException { + Configuration c = new Configuration(); + c.set("fs.default.name",defaultFSURI); + fs = FileSystem.get(new URI(defaultFSURI),c); + } + + public byte[] getBlock(String url, long offset, int length) + throws IOException { + Path path = new Path(url); + FSDataInputStream s = fs.open(path); + byte buffer[] = new byte[length]; + s.readFully(offset, buffer); + return buffer; + } + + /** + * @return the defaultFSURI + 
*/ + public String getDefaultFSURI() { + return defaultFSURI; + } + + /** + * @param defaultFSURI the defaultFSURI to set + */ + public void setDefaultFSURI(String defaultFSURI) { + this.defaultFSURI = defaultFSURI; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -0,0 +1,164 @@ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.io.InputStream; +import java.util.logging.Logger; +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.params.HttpClientParams; +import org.archive.wayback.webapp.PerformanceLogger; + +import com.google.common.io.ByteStreams; + +/** + * Class which wraps most of the complexity of an apache commons httpclient + * MultiThreaderHttpConnectionManager, exposing common configuration elements + * to Spring configuration. + * + * This class is a near direct copy of RemoteLiveWebCache: refactoring needed. 
+ * + * @author brad + * + */ +public class Http11BlockLoader implements BlockLoader { + private static final Logger LOGGER = Logger.getLogger( + Http11BlockLoader.class.getName()); + + private MultiThreadedHttpConnectionManager connectionManager = null; + private HostConfiguration hostConfiguration = null; + private HttpClient http = null; + + /** + * + */ + public Http11BlockLoader() { + connectionManager = new MultiThreadedHttpConnectionManager(); + hostConfiguration = new HostConfiguration(); + HttpClientParams params = new HttpClientParams(); +// params.setParameter(HttpClientParams.RETRY_HANDLER, new NoRetryHandler()); + http = new HttpClient(params,connectionManager); + http.setHostConfiguration(hostConfiguration); + } + + /** + * Fetch a range of bytes from a particular URL. Note that the bytes are + * read into memory all at once, so care should be taken with the length + * argument. + * + * @param url String URL to fetch + * @param offset byte start offset of the desired range + * @param length number of octets to fetch + * @return a new byte[] containing the octets fetched + * @throws IOException on HTTP and Socket failures, as well as Timeouts + */ + public byte[] getBlock(String url, long offset, int length) + throws IOException { + + HttpMethod method = null; + try { + method = new GetMethod(url); + } catch(IllegalArgumentException e) { + LOGGER.warning("Bad URL for live web fetch:" + url); + throw new IOException("Url:" + url + " does not look like an URL?"); + } + StringBuilder sb = new StringBuilder(16); + sb.append(ZiplinedBlock.BYTES_HEADER).append(offset); + sb.append(ZiplinedBlock.BYTES_MINUS).append((offset + length)-1); + String rangeHeader = sb.toString(); + method.addRequestHeader(ZiplinedBlock.RANGE_HEADER, rangeHeader); + //uc.setRequestProperty(RANGE_HEADER, sb.toString()); + long start = System.currentTimeMillis(); + try { + LOGGER.fine("Reading block:" + url + "("+rangeHeader+")"); + int status = http.executeMethod(method); + 
if((status == 200) || (status == 206)) { + InputStream is = method.getResponseBodyAsStream(); + byte[] block = new byte[length]; + ByteStreams.readFully(is, block); + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("CDXBlockLoad",elapsed,url); + return block; + + } else { + throw new IOException("Bad status for " + url); + } + } finally { + method.releaseConnection(); + } + } + + /** + * @param hostPort to proxy requests through - ex. "localhost:3128" + */ + public void setProxyHostPort(String hostPort) { + int colonIdx = hostPort.indexOf(':'); + if(colonIdx > 0) { + String host = hostPort.substring(0,colonIdx); + int port = Integer.valueOf(hostPort.substring(colonIdx+1)); + + hostConfiguration.setProxy(host, port); + } + } + + /** + * @param maxTotalConnections the HttpConnectionManagerParams config + */ + public void setMaxTotalConnections(int maxTotalConnections) { + connectionManager.getParams(). + setMaxTotalConnections(maxTotalConnections); + } + + /** + * @return the HttpConnectionManagerParams maxTotalConnections config + */ + public int getMaxTotalConnections() { + return connectionManager.getParams().getMaxTotalConnections(); + } + + /** + * @param maxHostConnections the HttpConnectionManagerParams config + */ + public void setMaxHostConnections(int maxHostConnections) { + connectionManager.getParams(). + setMaxConnectionsPerHost(hostConfiguration, maxHostConnections); + } + + /** + * @return the HttpConnectionManagerParams maxHostConnections config + */ + public int getMaxHostConnections() { + return connectionManager.getParams(). 
+ getMaxConnectionsPerHost(hostConfiguration); + } + + /** + * @return the connectionTimeoutMS + */ + public int getConnectionTimeoutMS() { + return connectionManager.getParams().getConnectionTimeout(); + } + + /** + * @param connectionTimeoutMS the connectionTimeoutMS to set + */ + public void setConnectionTimeoutMS(int connectionTimeoutMS) { + connectionManager.getParams().setConnectionTimeout(connectionTimeoutMS); + } + + /** + * @return the socketTimeoutMS + */ + public int getSocketTimeoutMS() { + return connectionManager.getParams().getSoTimeout(); + } + + /** + * @param socketTimeoutMS the socketTimeoutMS to set + */ + public void setSocketTimeoutMS(int socketTimeoutMS) { + connectionManager.getParams().setSoTimeout(socketTimeoutMS); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/RemoteHttp11BlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/RemoteHttp11BlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/RemoteHttp11BlockLoader.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -0,0 +1,9 @@ +package org.archive.wayback.resourceindex.ziplines; + +/** + * @author brad + * @deprecated use Http11BlockLoader + */ +public class RemoteHttp11BlockLoader extends Http11BlockLoader { + +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2011-06-16 17:23:08 UTC (rev 3477) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -20,6 +20,7 @@ package org.archive.wayback.resourceindex.ziplines; import java.io.BufferedReader; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; @@ -37,13 +38,14 @@ private static final Logger LOGGER = Logger.getLogger( ZiplinedBlock.class.getName()); + BlockLoader loader = null; String urlOrPath = null; long offset = -1; int count = 0; public final static int BLOCK_SIZE = 128 * 1024; - private final static String RANGE_HEADER = "Range"; - private final static String BYTES_HEADER = "bytes="; - private final static String BYTES_MINUS = "-"; + public final static String RANGE_HEADER = "Range"; + public final static String BYTES_HEADER = "bytes="; + public final static String BYTES_MINUS = "-"; /** * @param urlOrPath URL where this file can be downloaded * @param offset start of 128K block boundary. 
@@ -62,10 +64,29 @@ this.count = count; } /** + * @param loader the RemoteHttp11BlockLoader to use when fetching this block + */ + public void setLoader(BlockLoader loader) { + this.loader = loader; + } + /** * @return a BufferedReader of the underlying compressed data in this block * @throws IOException for usual reasons */ public BufferedReader readBlock() throws IOException { + if(loader != null) { + return readBlockEfficiently(loader); + } + return readBlockInefficiently(); + } + private BufferedReader readBlockEfficiently(BlockLoader remote) + throws IOException { + byte bytes[] = remote.getBlock(urlOrPath, offset, BLOCK_SIZE); + return new BufferedReader(new InputStreamReader( + new GZIPInputStream(new ByteArrayInputStream(bytes)), + ByteOp.UTF8)); + } + private BufferedReader readBlockInefficiently() throws IOException { StringBuilder sb = new StringBuilder(16); sb.append(BYTES_HEADER).append(offset).append(BYTES_MINUS); sb.append((offset + BLOCK_SIZE)-1); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-06-16 17:23:08 UTC (rev 3477) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.PrintWriter; +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -78,6 +79,7 @@ private HashMap<String,String> chunkMap = null; private CDXFormat format = null; private int maxBlocks = 1000; + private BlockLoader blockLoader = null; public ZiplinesSearchResultSource() { } @@ -165,7 +167,9 @@ String url = 
chunkMap.get(parts[1]); long offset = Long.parseLong(parts[2]); LOGGER.info("Adding block source(" + parts[1] + "):" + offset); - blocks.add(new ZiplinedBlock(url, offset)); + ZiplinedBlock block = new ZiplinedBlock(url, offset); + block.setLoader(blockLoader); + blocks.add(block); } } finally { if(itr != null) { @@ -245,8 +249,22 @@ */ public void setMaxBlocks(int maxBlocks) { this.maxBlocks = maxBlocks; - } + } + /** + * @return the blockLoader + */ + public BlockLoader getBlockLoader() { + return blockLoader; + } + + /** + * @param blockLoader the blockLoader to set + */ + public void setBlockLoader(BlockLoader blockLoader) { + this.blockLoader = blockLoader; + } + private static void USAGE() { System.err.println("USAGE:"); System.err.println(""); @@ -267,6 +285,7 @@ // String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC; String cdxSpec = " CDX N b a m s k r V g"; CDXFormat format = null; + BlockLoader blockLoader = new Http11BlockLoader(); try { format = new CDXFormat(cdxSpec); } catch (CDXFormatException e1) { @@ -291,6 +310,23 @@ } } else if(args[idx].equals("-blockDump")) { blockDump = true; + } else if(args[idx].equals("-hdfs")) { + idx++; + if(idx >= args.length) { + USAGE(); + } + blockLoader = new HDFSBlockLoader(args[idx]); + try { + ((HDFSBlockLoader)blockLoader).init(); + } catch (IOException e) { + e.printStackTrace(); + USAGE(); + System.exit(1); + } catch (URISyntaxException e) { + e.printStackTrace(); + USAGE(); + System.exit(1); + } } else if(args[idx].equals("-max")) { idx++; if(idx >= args.length) { @@ -319,6 +355,7 @@ USAGE(); } // first is summary path, then location path, then search key: + zl.setBlockLoader(blockLoader); zl.setChunkIndexPath(args[idx++]); zl.setChunkMapPath(args[idx++]); String key = args[idx++]; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 17:23:14
|
Revision: 3477 http://archive-access.svn.sourceforge.net/archive-access/?rev=3477&view=rev Author: bradtofel Date: 2011-06-16 17:23:08 +0000 (Thu, 16 Jun 2011) Log Message: ----------- Performance logging, plus now throwing correct LiveDocumentNotAvailable exception, rather than NIA Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-06-16 17:22:14 UTC (rev 3476) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-06-16 17:23:08 UTC (rev 3477) @@ -36,6 +36,7 @@ import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.AdministrativeAccessControlException; import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.LiveDocumentNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.RobotAccessControlException; import org.archive.wayback.exception.WaybackException; @@ -113,7 +114,10 @@ } } // no robots check, or robots.txt says GO: + long start = System.currentTimeMillis(); ArcResource r = (ArcResource) cache.getCachedResource(url, maxCacheMS , false); + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("LiveWebRequest",elapsed,urlString); ARCRecord ar = (ARCRecord) r.getArcRecord(); int status = ar.getStatusCode(); if((status == 200) || ((status >= 300) && (status < 400))) { @@ -128,7 +132,7 @@ httpRequest, httpResponse, wbRequest, result, r, inner.getUriConverter(), results); } else { - throw new 
ResourceNotInArchiveException("Not In Archive - Not on Live web"); + throw new LiveDocumentNotAvailableException(urlString); } } catch(WaybackException e) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3476 http://archive-access.svn.sourceforge.net/archive-access/?rev=3476&view=rev Author: bradtofel Date: 2011-06-16 17:22:14 +0000 (Thu, 16 Jun 2011) Log Message: ----------- HACK allow redirection of URLs without timestamp to the current timestamp Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/ServerRelativeArchivalRedirect.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/ServerRelativeArchivalRedirect.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/ServerRelativeArchivalRedirect.java 2011-06-16 17:19:34 UTC (rev 3475) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/ServerRelativeArchivalRedirect.java 2011-06-16 17:22:14 UTC (rev 3476) @@ -29,6 +29,7 @@ import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.util.ArchiveUtils; +import org.archive.wayback.util.Timestamp; import org.archive.wayback.util.url.UrlOperations; import org.archive.wayback.util.webapp.AbstractRequestHandler; @@ -42,6 +43,7 @@ boolean useCollection = false; private String matchHost = null; private int matchPort = -1; + private String replayPrefix; private boolean handleRequestWithCollection(HttpServletRequest httpRequest, HttpServletResponse httpResponse) throws ServletException, @@ -154,9 +156,28 @@ return false; } } - return useCollection ? + boolean handled = useCollection ? handleRequestWithCollection(httpRequest, httpResponse): handleRequestWithoutCollection(httpRequest, httpResponse); + if(!handled) { + if(replayPrefix != null) { + String thisPath = httpRequest.getRequestURI(); + String queryString = httpRequest.getQueryString(); + if (queryString != null) { + thisPath += "?" 
+ queryString; + } + if(thisPath.startsWith("/http://")) { + // assume a replay request: + StringBuilder sb = new StringBuilder(thisPath.length() + replayPrefix.length() + 16); + sb.append(replayPrefix); + sb.append(Timestamp.currentTimestamp().getDateStr()); + sb.append(thisPath); + httpResponse.sendRedirect(sb.toString()); + handled = true; + } + } + } + return handled; } /** @@ -195,4 +216,18 @@ public void setMatchPort(int matchPort) { this.matchPort = matchPort; } + + /** + * @return the replayPrefix + */ + public String getReplayPrefix() { + return replayPrefix; + } + + /** + * @param replayPrefix the replayPrefix to set + */ + public void setReplayPrefix(String replayPrefix) { + this.replayPrefix = replayPrefix; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 17:19:39
|
Revision: 3475 http://archive-access.svn.sourceforge.net/archive-access/?rev=3475&view=rev Author: bradtofel Date: 2011-06-16 17:19:34 +0000 (Thu, 16 Jun 2011) Log Message: ----------- LOGGING Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-06-16 17:18:09 UTC (rev 3474) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-06-16 17:19:34 UTC (rev 3475) @@ -26,6 +26,7 @@ import java.util.List; import java.util.Locale; import java.util.Properties; +import java.util.logging.Level; import java.util.logging.Logger; import javax.servlet.ServletException; @@ -133,6 +134,9 @@ protected boolean dispatchLocal(HttpServletRequest httpRequest, HttpServletResponse httpResponse) throws ServletException, IOException { + if(LOGGER.isLoggable(Level.FINE)) { + LOGGER.fine("Local dispatch /" + translateRequestPath(httpRequest)); + } if(!serveStatic) { return false; } @@ -179,7 +183,7 @@ Thread.currentThread().setName("Thread " + Thread.currentThread().getId() + " " + getBeanName() + " handling: " + inputPath); - + LOGGER.fine("Handling translated: " + inputPath); wbRequest = getParser().parse(httpRequest, this); if(wbRequest != null) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 17:18:16
|
Revision: 3474 http://archive-access.svn.sourceforge.net/archive-access/?rev=3474&view=rev Author: bradtofel Date: 2011-06-16 17:18:09 +0000 (Thu, 16 Jun 2011) Log Message: ----------- INITIAL REV: new classes, as yet unused, for dealing with Iterators Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/AbstractPeekableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/IPeekableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/PeekableIteratorComparator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/SortedCompositeIterator.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/AbstractPeekableIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/AbstractPeekableIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/AbstractPeekableIterator.java 2011-06-16 17:18:09 UTC (rev 3474) @@ -0,0 +1,93 @@ +package org.archive.wayback.util.iterator; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.Comparator; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +public abstract class AbstractPeekableIterator<E> implements IPeekableIterator<E> { + private E cachedNext = null; + private boolean done = false; + + // returns next E, or null if hasNext() would return false; + public abstract E getNextInner(); + + public boolean hasNext() { + if(cachedNext != null) { + return true; + } + if(done) { + return false; + } + cachedNext 
= getNextInner(); + return (cachedNext != null); + } + + public E next() { + if(cachedNext == null) { + if(!hasNext()) { + throw new NoSuchElementException("Call hasNext!"); + } + } + E tmp = cachedNext; + cachedNext = null; + return tmp; + } + + public void remove() { + throw new UnsupportedOperationException("No remove"); + } + + public E peek() { + if(cachedNext == null) { + if(!hasNext()) { + throw new NoSuchElementException("Call hasNext!"); + } + } + return cachedNext; + } + public static <T> IPeekableIterator<T> wrap(Iterator<T> itr) { + return new IteratorWrappedPeekableIterator<T>(itr); + } + public static IPeekableIterator<String> wrapReader(BufferedReader reader) { + return new BufferedReaderPeekableIterator(reader); + } + + private static class IteratorWrappedPeekableIterator<C> extends AbstractPeekableIterator<C> { + private Iterator<C> wrapped = null; + public IteratorWrappedPeekableIterator(Iterator<C> wrapped) { + this.wrapped = wrapped; + } + @Override + public C getNextInner() { + C next = null; + if(wrapped != null) { + if(wrapped.hasNext()) { + next = wrapped.next(); + } + } + return next; + } + } + private static class BufferedReaderPeekableIterator extends AbstractPeekableIterator<String> { + private BufferedReader reader = null; + public BufferedReaderPeekableIterator(BufferedReader reader) { + this.reader = reader; + } + @Override + public String getNextInner() { + String next = null; + if(reader != null) { + try { + next = reader.readLine(); + } catch (IOException e) { + e.printStackTrace(); + } + } + return next; + } + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/AbstractPeekableIterator.java ___________________________________________________________________ Added: svn:executable + * Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/IPeekableIterator.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/IPeekableIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/IPeekableIterator.java 2011-06-16 17:18:09 UTC (rev 3474) @@ -0,0 +1,7 @@ +package org.archive.wayback.util.iterator; + +import java.util.Iterator; + +public interface IPeekableIterator<E> extends Iterator<E> { + public E peek(); +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/IPeekableIterator.java ___________________________________________________________________ Added: svn:executable + * Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/PeekableIteratorComparator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/PeekableIteratorComparator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/PeekableIteratorComparator.java 2011-06-16 17:18:09 UTC (rev 3474) @@ -0,0 +1,44 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.util.iterator; + +import java.util.Comparator; + +/** + * @author brad + * + * @param <J> type found in Iterators + */ +public class PeekableIteratorComparator<J> implements Comparator<IPeekableIterator<J>> { + private Comparator<J> comparator = null; + /** + * @param comparator to compare the iterators + */ + public PeekableIteratorComparator(Comparator<J> comparator) { + this.comparator = comparator; + } + + public int compare(IPeekableIterator<J> o1, IPeekableIterator<J> o2) { + return comparator.compare(o1.peek(), o2.peek()); + } + public static <K> Comparator<IPeekableIterator<K>> getComparator(Comparator<K> comparator) { + return new PeekableIteratorComparator<K>(comparator); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/SortedCompositeIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/SortedCompositeIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/SortedCompositeIterator.java 2011-06-16 17:18:09 UTC (rev 3474) @@ -0,0 +1,56 @@ +package org.archive.wayback.util.iterator; + +import java.util.Collection; +import java.util.Comparator; +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.PriorityQueue; + + +public class SortedCompositeIterator<E> implements Iterator<E> { + private static final int 
DEFAULT_CAPACITY = 10; + PriorityQueue<IPeekableIterator<E>> q = null; + + public SortedCompositeIterator(Comparator<E> comparator) { + this(DEFAULT_CAPACITY,comparator); + } + public SortedCompositeIterator(int capacity, Comparator<E> comparator) { + q = new PriorityQueue<IPeekableIterator<E>>(capacity, + new PeekableIteratorComparator<E>(comparator)); + } + public void addAll(Collection<Iterator<E>> toAdd) { + for(Iterator<E> e : toAdd) { + addIterator(e); + } + } + public void addIterator(Iterator<E> itr) { + IPeekableIterator<E> i = null; + if(itr instanceof IPeekableIterator) { + i = (IPeekableIterator<E>) itr; + } else { + i = AbstractPeekableIterator.wrap(itr); + } + if(i.hasNext()) { + q.add(i); + } + } + + public boolean hasNext() { + return (q.peek() != null); + } + + public E next() { + IPeekableIterator<E> i = q.poll(); + if(i == null) { + throw new NoSuchElementException("Call hasNext!"); + } + E tmp = i.next(); + if(i.hasNext()) { + q.add(i); + } + return tmp; + } + public void remove() { + throw new UnsupportedOperationException("No remove"); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/iterator/SortedCompositeIterator.java ___________________________________________________________________ Added: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 17:06:02
|
Revision: 3473 http://archive-access.svn.sourceforge.net/archive-access/?rev=3473&view=rev Author: bradtofel Date: 2011-06-16 17:05:56 +0000 (Thu, 16 Jun 2011) Log Message: ----------- TWEAK: now stores the original string used to compose the IPRange, and allows access via a Getter Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/IPRange.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/IPRange.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/IPRange.java 2011-06-16 17:03:49 UTC (rev 3472) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/IPRange.java 2011-06-16 17:05:56 UTC (rev 3473) @@ -46,6 +46,7 @@ // INSTANCE MEMBERS: private byte[] ip = null; private byte[] mask = null; + private String original = null; // INSTANCE METHODS: public byte[] getIp() { @@ -72,11 +73,17 @@ public String getRangeString() { return null; } + public void setRangeString(String range) { setRange(range); } + public String getOriginal() { + return original; + } + public boolean setRange(String range) { + original = range; Matcher m = IP_MASK_PATTERN.matcher(range); if(m != null) { if(m.matches()) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3472 http://archive-access.svn.sourceforge.net/archive-access/?rev=3472&view=rev Author: bradtofel Date: 2011-06-16 17:03:49 +0000 (Thu, 16 Jun 2011) Log Message: ----------- LOGGING Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/authenticationcontrol/IPMatchesBooleanOperator.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/authenticationcontrol/IPMatchesBooleanOperator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/authenticationcontrol/IPMatchesBooleanOperator.java 2011-06-16 17:03:23 UTC (rev 3471) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/authenticationcontrol/IPMatchesBooleanOperator.java 2011-06-16 17:03:49 UTC (rev 3472) @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.logging.Level; import java.util.logging.Logger; import org.archive.wayback.core.WaybackRequest; @@ -71,9 +72,22 @@ return false; } byte[] ip = IPRange.matchIP(ipString); - for(IPRange range : allowedRanges) { - if(range.contains(ip)) { - return true; + if(ip == null) { + LOGGER.severe("Unable to parse remote IP address("+ipString+")"); + } else { + for(IPRange range : allowedRanges) { + if(range.contains(ip)) { + if(LOGGER.isLoggable(Level.FINE)){ + LOGGER.fine(String.format("Range(%s) matched(%s)", + range.getOriginal(),ipString)); + } + return true; + } else { + if(LOGGER.isLoggable(Level.FINE)){ + LOGGER.fine(String.format("Range(%s) NO match(%s)", + range.getOriginal(),ipString)); + } + } } } return false; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3471 http://archive-access.svn.sourceforge.net/archive-access/?rev=3471&view=rev Author: bradtofel Date: 2011-06-16 17:03:23 +0000 (Thu, 16 Jun 2011) Log Message: ----------- BUGFIX: was reporting LiveWebCacheUnavailable when it should have been a timeout. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java 2011-06-16 17:01:54 UTC (rev 3470) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java 2011-06-16 17:03:23 UTC (rev 3471) @@ -22,6 +22,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.ConnectException; +import java.net.SocketException; import java.net.SocketTimeoutException; import java.net.URL; import java.util.logging.Logger; @@ -32,6 +33,7 @@ import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; +import org.apache.commons.httpclient.NoHttpResponseException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpClientParams; import org.archive.io.arc.ARCRecord; @@ -99,17 +101,26 @@ } else { throw new LiveWebCacheUnavailableException(urlString); } + } catch (ResourceNotAvailableException e) { throw new LiveDocumentNotAvailableException(urlString); + } catch (NoHttpResponseException e) { + + throw new LiveWebCacheUnavailableException("No Http Response for " + + urlString); + } catch (ConnectException e) { throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + " : " + 
urlString); + } catch (SocketException e) { + throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + + " : " + urlString); } catch (SocketTimeoutException e) { throw new LiveWebTimeoutException(e.getLocalizedMessage() + " : " + urlString); } catch(ConnectTimeoutException e) { - throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + throw new LiveWebTimeoutException(e.getLocalizedMessage() + " : " + urlString); } finally { method.releaseConnection(); @@ -145,13 +156,29 @@ setMaxTotalConnections(maxTotalConnections); } /** + * @return the HttpConnectionManagerParams maxTotalConnections config + */ + public int getMaxTotalConnections() { + return connectionManager.getParams().getMaxTotalConnections(); + } + + /** * @param maxHostConnections the HttpConnectionManagerParams config */ public void setMaxHostConnections(int maxHostConnections) { connectionManager.getParams(). setMaxConnectionsPerHost(hostConfiguration, maxHostConnections); } - /** + + /** + * @return the HttpConnectionManagerParams maxHostConnections config + */ + public int getMaxHostConnections() { + return connectionManager.getParams(). + getMaxConnectionsPerHost(hostConfiguration); + } + + /** * @return the connectionTimeoutMS */ public int getConnectionTimeoutMS() { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 17:02:01
|
Revision: 3470 http://archive-access.svn.sourceforge.net/archive-access/?rev=3470&view=rev Author: bradtofel Date: 2011-06-16 17:01:54 +0000 (Thu, 16 Jun 2011) Log Message: ----------- INITIAL REV: a generic RegEx pattern replacement StringTransformer, and a compounder of StringTransformer, allowing multiple RegEx transformers to be "stacked" Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MultiRegexReplaceStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/RegexReplaceStringTransformer.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MultiRegexReplaceStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MultiRegexReplaceStringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MultiRegexReplaceStringTransformer.java 2011-06-16 17:01:54 UTC (rev 3470) @@ -0,0 +1,51 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.replay.html.transformer; + +import java.util.List; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; + +public class MultiRegexReplaceStringTransformer implements StringTransformer { + List<StringTransformer> transformers; + public String transform(ReplayParseContext context, String input) { + if(transformers == null) { + return input; + } + for(StringTransformer t : transformers) { + input = t.transform(context, input); + } + return input; + } + /** + * @return the transformers + */ + public List<StringTransformer> getTransformers() { + return transformers; + } + /** + * @param transformers the transformers to set + */ + public void setTransformers(List<StringTransformer> transformers) { + this.transformers = transformers; + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/RegexReplaceStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/RegexReplaceStringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/RegexReplaceStringTransformer.java 2011-06-16 17:01:54 UTC (rev 3470) @@ -0,0 +1,69 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.replay.html.transformer; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; + +public class RegexReplaceStringTransformer implements StringTransformer { + private String regex = ""; + private String replacement = ""; + private Pattern pattern = null; + + public String transform(ReplayParseContext context, String input) { + if(pattern == null) { + return input; + } + Matcher m = pattern.matcher(input); + return m.replaceAll(replacement); + } + + /** + * @return the regex + */ + public String getRegex() { + return regex; + } + + /** + * @param regex the regex to set + */ + public void setRegex(String regex) { + this.regex = regex; + pattern = Pattern.compile(regex); + } + + /** + * @return the replacement + */ + public String getReplacement() { + return replacement; + } + + /** + * @param replacement the replacement to set + */ + public void setReplacement(String replacement) { + this.replacement = replacement; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3469 http://archive-access.svn.sourceforge.net/archive-access/?rev=3469&view=rev Author: bradtofel Date: 2011-06-16 16:51:59 +0000 (Thu, 16 Jun 2011) Log Message: ----------- INTERFACE: now allow any old StringTransformer to provide for javascript: urls.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/URLStringTransformer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/URLStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/URLStringTransformer.java 2011-06-16 16:50:12 UTC (rev 3468) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/URLStringTransformer.java 2011-06-16 16:51:59 UTC (rev 3469) @@ -28,7 +28,7 @@ */ public class URLStringTransformer implements StringTransformer { private String flags; - private JSStringTransformer jsTransformer = null; + private StringTransformer jsTransformer = null; /** Default constructor */ public URLStringTransformer() {} /** @@ -64,10 +64,10 @@ this.flags = flags; } - public JSStringTransformer getJsTransformer() { + public StringTransformer getJsTransformer() { return jsTransformer; } - public void setJsTransformer(JSStringTransformer jsTransformer) { + public void setJsTransformer(StringTransformer jsTransformer) { this.jsTransformer = jsTransformer; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 16:50:19
|
Revision: 3468 http://archive-access.svn.sourceforge.net/archive-access/?rev=3468&view=rev Author: bradtofel Date: 2011-06-16 16:50:12 +0000 (Thu, 16 Jun 2011) Log Message: ----------- BUGFIX: the closest tracking filter was part of the QueryCaptureFilterGroup, forcing this group to be last. In fact, we definitely want to do date and URL filtering before exclusions, and probably want to do it as early as possible. Moved ClosestTrackingFilter into its own FilterGroup, which now is installed last. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroupFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-06-16 16:41:19 UTC (rev 3467) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-06-16 16:50:12 UTC (rev 3468) @@ -42,6 +42,7 @@ import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultIterator; import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup; +import
org.archive.wayback.resourceindex.filterfactory.ClosestTrackingCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.FilterGroupFactory; @@ -118,8 +119,9 @@ fgFactories = new ArrayList<FilterGroupFactory>(); fgFactories.add(new CoreCaptureFilterGroupFactory()); fgFactories.add(new AccessPointCaptureFilterGroupFactory()); + fgFactories.add(new QueryCaptureFilterGroupFactory()); fgFactories.add(new ExclusionCaptureFilterGroupFactory()); - fgFactories.add(new QueryCaptureFilterGroupFactory()); + fgFactories.add(new ClosestTrackingCaptureFilterGroupFactory()); } private void cleanupIterator(CloseableIterator<? extends SearchResult> itr) Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroup.java 2011-06-16 16:50:12 UTC (rev 3468) @@ -0,0 +1,45 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import 
org.archive.wayback.resourceindex.filters.ClosestResultTrackingFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class ClosestTrackingCaptureFilterGroup implements CaptureFilterGroup { + private ObjectFilterChain<CaptureSearchResult> chain = null; + private ClosestResultTrackingFilter closestTracker = null; + public ClosestTrackingCaptureFilterGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + if(request.isCaptureQueryRequest() || + request.isReplayRequest()) { + closestTracker = + new ClosestResultTrackingFilter(request.getReplayDate().getTime()); + chain.addFilter(closestTracker); + } + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException { + if(closestTracker != null) { + if(results instanceof CaptureSearchResults) { + CaptureSearchResults cResults = (CaptureSearchResults) results; + cResults.setClosest(closestTracker.getClosest()); + } + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroupFactory.java 2011-06-16 16:50:12 UTC (rev 3468) @@ -0,0 +1,16 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import 
org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class ClosestTrackingCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new ClosestTrackingCaptureFilterGroup(request,canonicalizer); + } + +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2011-06-16 16:41:19 UTC (rev 3467) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2011-06-16 16:50:12 UTC (rev 3468) @@ -27,11 +27,9 @@ import org.apache.commons.httpclient.URIException; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.SearchResults; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; -import org.archive.wayback.resourceindex.filters.ClosestResultTrackingFilter; import org.archive.wayback.resourceindex.filters.DateRangeFilter; import org.archive.wayback.resourceindex.filters.HostMatchFilter; import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; @@ -44,12 +42,7 @@ import org.archive.wayback.util.url.UrlOperations; public class QueryCaptureFilterGroup implements CaptureFilterGroup { -// private ObjectFilter<CaptureSearchResult> prefixFilter = null; -// private ObjectFilter<CaptureSearchResult> dateFilter = null; -// private ObjectFilter<CaptureSearchResult> 
selfRedirectFilter = null; -// private ObjectFilter<CaptureSearchResult> exactHost = null; -// private ObjectFilter<CaptureSearchResult> exactScheme = null; - private ClosestResultTrackingFilter closestTracker = null; + private ObjectFilterChain<CaptureSearchResult> chain = null; private String requestType = null; private String keyUrl = null; @@ -94,14 +87,9 @@ Timestamp.parseBefore(anchorTS).getDate().getTime(); } } - - closestTracker = new ClosestResultTrackingFilter( - request.getReplayDate().getTime()); } else if(request.isCaptureQueryRequest()) { chain.addFilter(new UrlMatchFilter(keyUrl)); - closestTracker = new ClosestResultTrackingFilter( - request.getReplayDate().getTime()); } else if(request.isUrlQueryRequest()) { chain.addFilter(new UrlPrefixMatchFilter(keyUrl)); } @@ -130,9 +118,6 @@ chain.addFilter(new SchemeMatchFilter( UrlOperations.urlToScheme(request.getRequestUrl()),this)); } - if(closestTracker != null) { - chain.addFilter(closestTracker); - } } public List<ObjectFilter<CaptureSearchResult>> getFilters() { @@ -152,12 +137,6 @@ if(!closeMatches.isEmpty()) { results.setCloseMatches(new ArrayList<String>(closeMatches.values())); } - if(closestTracker != null) { - if(results instanceof CaptureSearchResults) { - CaptureSearchResults cResults = (CaptureSearchResults) results; - cResults.setClosest(closestTracker.getClosest()); - } - } } public void addCloseMatch(String host, String closeMatch) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3467 http://archive-access.svn.sourceforge.net/archive-access/?rev=3467&view=rev Author: bradtofel Date: 2011-06-16 16:41:19 +0000 (Thu, 16 Jun 2011) Log Message: ----------- FEATURE: added performance logging of HTTP 1.1 resource requests Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2011-06-16 16:39:40 UTC (rev 3466) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2011-06-16 16:41:19 UTC (rev 3467) @@ -36,6 +36,7 @@ import org.archive.io.warc.WARCRecord; import org.archive.wayback.core.Resource; import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.webapp.PerformanceLogger; /** * Static factory class for constructing ARC/WARC Resources from @@ -89,6 +90,7 @@ Resource r = null; // TODO: allow configuration of timeouts -- now using defaults.. + long start = System.currentTimeMillis(); TimeoutArchiveReaderFactory tarf = new TimeoutArchiveReaderFactory(); ArchiveReader reader = tarf.getArchiveReader(url,offset); if(reader instanceof ARCReader) { @@ -102,6 +104,8 @@ } else { throw new ResourceNotAvailableException("Unknown ArchiveReader"); } + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("Http11Resource", elapsed, url.toExternalForm()); return r; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 16:39:46
|
Revision: 3466 http://archive-access.svn.sourceforge.net/archive-access/?rev=3466&view=rev Author: bradtofel Date: 2011-06-16 16:39:40 +0000 (Thu, 16 Jun 2011) Log Message: ----------- FEATURE: added new global method noteElapsed() to simplify configuration of performance logging for various points in the code. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java 2011-06-16 16:36:14 UTC (rev 3465) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java 2011-06-16 16:39:40 UTC (rev 3466) @@ -19,6 +19,7 @@ */ package org.archive.wayback.webapp; +import java.util.logging.Level; import java.util.logging.Logger; /** @@ -87,4 +88,18 @@ sb.append(info); LOGGER.finer(sb.toString()); } + public static void noteElapsed(String message, long elapsed, String note) { + if(LOGGER.isLoggable(Level.INFO)) { + StringBuilder sb = new StringBuilder(); + sb.append("WB-PERF\t").append(message).append("\t").append(elapsed); + if(note != null) { + sb.append("\t").append(note); + } + LOGGER.info(sb.toString()); + } + } + + public static void noteElapsed(String message, long elapsed) { + noteElapsed(message,elapsed,null); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 16:36:20
|
Revision: 3465 http://archive-access.svn.sourceforge.net/archive-access/?rev=3465&view=rev Author: bradtofel Date: 2011-06-16 16:36:14 +0000 (Thu, 16 Jun 2011) Log Message: ----------- LOGGING Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java 2011-06-16 16:34:59 UTC (rev 3464) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/PortMapper.java 2011-06-16 16:36:14 UTC (rev 3465) @@ -135,22 +135,36 @@ pathPrefix.append("/"); // } String firstPath = requestToFirstPath(request); - RequestHandler handler = pathMap.get(hostPathToKey(host,firstPath)); + String key = hostPathToKey(host,firstPath); + RequestHandler handler = pathMap.get(key); if(handler != null) { + LOGGER.fine("Mapped to RequestHandler with " + key); return new RequestHandlerContext(handler, pathPrefix.append(firstPath).toString()); + } else { + LOGGER.finer("No mapping for " + key); } - handler = pathMap.get(hostPathToKey(host,null)); + key = hostPathToKey(host,null); + handler = pathMap.get(key); if(handler != null) { + LOGGER.fine("Mapped to RequestHandler with " + key); return new RequestHandlerContext(handler,contextPath); + } else { + LOGGER.finer("No mapping for " + key); } - handler = pathMap.get(hostPathToKey(null,firstPath)); + key = hostPathToKey(null,firstPath); + handler = pathMap.get(key); if(handler != null) { + LOGGER.fine("Mapped to RequestHandler with " + key); + return new RequestHandlerContext(handler, pathPrefix.append(firstPath).toString()); + } else { + LOGGER.finer("No mapping for " + key); } handler = pathMap.get(null); if(handler != null) 
{ + LOGGER.fine("Mapped to RequestHandler with null"); return new RequestHandlerContext(handler,contextPath); } // Nothing matching this port:host:path. Try to help get user back on This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 16:35:06
|
Revision: 3464 http://archive-access.svn.sourceforge.net/archive-access/?rev=3464&view=rev Author: bradtofel Date: 2011-06-16 16:34:59 +0000 (Thu, 16 Jun 2011) Log Message: ----------- LOGGING Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestMapper.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestMapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestMapper.java 2011-06-13 18:20:21 UTC (rev 3463) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestMapper.java 2011-06-16 16:34:59 UTC (rev 3464) @@ -149,6 +149,8 @@ PortMapper portMapper = portMap.get(portInt); if(portMapper != null) { handlerContext = portMapper.getRequestHandlerContext(request); + } else { + LOGGER.warning("No PortMapper for port " + port); } return handlerContext; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2011-06-13 18:20:28
|
Revision: 3463 http://archive-access.svn.sourceforge.net/archive-access/?rev=3463&view=rev Author: binzino Date: 2011-06-13 18:20:21 +0000 (Mon, 13 Jun 2011) Log Message: ----------- Added custom reducer to allow for multiple values for the same key. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2011-06-10 02:24:23 UTC (rev 3462) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2011-06-13 18:20:21 UTC (rev 3463) @@ -36,6 +36,7 @@ import org.apache.hadoop.mapred.JobStatus; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.RunningJob; import org.apache.hadoop.mapred.TextInputFormat; @@ -94,7 +95,7 @@ * to the importing of ARC files. I've noted those details with * comments prefaced with "?:". 
*/ -public class Importer extends Configured implements Tool, Mapper<WritableComparable, Writable, Text, NutchWritable> +public class Importer extends Configured implements Tool, Mapper<WritableComparable, Writable, Text, NutchWritable>, Reducer<WritableComparable,Writable,WritableComparable,Writable> { public static final Log LOG = LogFactory.getLog( Importer.class ); @@ -154,6 +155,23 @@ } + public void reduce( WritableComparable key, + Iterator<Writable> values, + OutputCollector<WritableComparable, Writable> output, + Reporter reporter + ) + throws IOException + { + + while ( values.hasNext( ) ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "Reduce: key = " + key.toString() ); + + output.collect( key, values.next( ) ); + } + } + + /** * <p>Runs the Map job to import records from an archive file into a * Nutch segment.</p> @@ -588,7 +606,11 @@ if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score, url = " + key, e ); } - output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); + String parsedText = parse.getText(); + + // TODO: Limit size of parsedText. + + output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parsedText ), parse.getData(), parse.isCanonical() ) ) ); } } } @@ -719,7 +741,8 @@ FileInputFormat.addInputPath( job, manifestPath ); job.setInputFormat( TextInputFormat.class ); - job.setMapperClass( Importer.class ); + job.setMapperClass ( Importer.class ); + job.setReducerClass( Importer.class ); //job.setOutputPath ( segmentPath ); FileOutputFormat.setOutputPath( job, segmentPath ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3462 http://archive-access.svn.sourceforge.net/archive-access/?rev=3462&view=rev Author: bradtofel Date: 2011-06-10 02:24:23 +0000 (Fri, 10 Jun 2011) Log Message: ----------- Added 3 example tests.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2011-05-26 23:18:59 UTC (rev 3461) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2011-06-10 02:24:23 UTC (rev 3462) @@ -64,7 +64,10 @@ // strip leading 'www##.' with no protocol checkCanonicalization("www12.foo.com/","foo.com/"); - + checkCanonicalization("http://www.example.com/","example.com/"); + checkCanonicalization("http://www.example.com","example.com/"); + checkCanonicalization("http://www.example.com/index.html","example.com/index.html"); + // leave alone an url with no protocol but non-empty path checkCanonicalization("foo.com/","foo.com/"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-26 23:19:05
|
Revision: 3461 http://archive-access.svn.sourceforge.net/archive-access/?rev=3461&view=rev Author: bradtofel Date: 2011-05-26 23:18:59 +0000 (Thu, 26 May 2011) Log Message: ----------- JAVADOC Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2011-05-25 19:40:28 UTC (rev 3460) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2011-05-26 23:18:59 UTC (rev 3461) @@ -50,6 +50,8 @@ * @throws LiveWebCacheUnavailableException if there was a problem either * accessing the live web, in proxying to the live web, or in * maintaining the cache for the live web + * @throws LiveWebTimeoutException if there is no response from the live + * web cache before a timeout occurred. * @throws IOException for the usual reasons */ public Resource getCachedResource(URL url, long maxCacheMS, This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 19:40:35
|
Revision: 3460 http://archive-access.svn.sourceforge.net/archive-access/?rev=3460&view=rev Author: bradtofel Date: 2011-05-25 19:40:28 +0000 (Wed, 25 May 2011) Log Message: ----------- FEATURE: added code to allow parseStart and parseComplete event handlers Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseStartHandler.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2011-05-25 01:51:34 UTC (rev 3459) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2011-05-25 19:40:28 UTC (rev 3460) @@ -167,6 +167,7 @@ ContextAwareLexer lex = new ContextAwareLexer(lexer, context); Node node; try { + delegator.handleParseStart(context); while((node = lex.nextNode()) != null) { delegator.handleNode(context, node); } Modified: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2011-05-25 01:51:34 UTC (rev 3459) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2011-05-25 19:40:28 UTC (rev 3460) @@ -57,7 +57,8 @@ FastArchivalUrlReplayParseEventHandler.class.toString(); private String jspInsertPath = "/WEB-INF/replay/DisclaimChooser.jsp"; - private String commentJsp = "/WEB-INF/replay/ArchiveComment.jsp"; + private String endJsp = "/WEB-INF/replay/ArchiveComment.jsp"; + private String startJsp = null; private final String[] okHeadTags = { "![CDATA[*", "![CDATA[", "?", "!DOCTYPE", "HTML", "HEAD", "BASE", "LINK", "META", "TITLE", @@ -332,13 +333,14 @@ } return false; } + public void handleParseComplete(ParseContext pContext) throws IOException { - if(commentJsp != null) { + if(endJsp != null) { ReplayParseContext context = (ReplayParseContext) pContext; OutputStream out = context.getOutputStream(); String tmp = null; try { - tmp = context.getJspExec().jspToString(commentJsp); + tmp = context.getJspExec().jspToString(endJsp); } catch (ServletException e) { e.printStackTrace(); } @@ -350,6 +352,24 @@ } } + public void handleParseStart(ParseContext pContext) throws IOException { + if(startJsp != null) { + ReplayParseContext context = (ReplayParseContext) pContext; + OutputStream out = context.getOutputStream(); + String tmp = null; + try { + tmp = context.getJspExec().jspToString(startJsp); + } catch (ServletException e) { + e.printStackTrace(); + } + if(tmp != null) { +// Charset charset = Charset.forName(context.getOutputCharset()); + String charset = context.getOutputCharset(); + 
out.write(tmp.getBytes(charset)); + } + } + } + /** * @return the jspInsertPath */ @@ -366,15 +386,47 @@ /** * @return the commentJsp + * @deprecated use getEndJsp() */ public String getCommentJsp() { - return commentJsp; + return getEndJsp(); } /** * @param commentJsp the commentJsp to set + * @deprecated use setEndJsp() */ public void setCommentJsp(String commentJsp) { - this.commentJsp = commentJsp; + setEndJsp(commentJsp); } + /** + * @return the path to the JSP to execute and include at the start of the + * document + */ + public String getStartsp() { + return startJsp; + } + + /** + * @param endJsp the path to the JSP to execute and include at the start + * of the document + */ + public void setStartJsp(String startJsp) { + this.startJsp = startJsp; + } + /** + * @return the path to the JSP to execute and include at the end of the + * document + */ + public String getEndJsp() { + return endJsp; + } + + /** + * @param endJsp the path to the JSP to execute and include at the end + * of the document + */ + public void setEndJsp(String endJsp) { + this.endJsp = endJsp; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java 2011-05-25 01:51:34 UTC (rev 3459) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java 2011-05-25 19:40:28 UTC (rev 3460) @@ -84,6 +84,12 @@ } + public void handleParseStart(ParseContext context) throws IOException { + preModifyDelegator.handleParseStart(context); + modifyDelegator.handleParseStart(context); + postModifyDelegator.handleParseStart(context); + } + public void handleParseComplete(ParseContext context) throws IOException { 
preModifyDelegator.handleParseComplete(context); modifyDelegator.handleParseComplete(context); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java 2011-05-25 01:51:34 UTC (rev 3459) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java 2011-05-25 19:40:28 UTC (rev 3460) @@ -31,6 +31,7 @@ import org.archive.wayback.util.htmllex.handlers.JSTextHandler; import org.archive.wayback.util.htmllex.handlers.OpenTagHandler; import org.archive.wayback.util.htmllex.handlers.ParseCompleteHandler; +import org.archive.wayback.util.htmllex.handlers.ParseStartHandler; import org.archive.wayback.util.htmllex.handlers.RemarkTextHandler; import org.htmlparser.Node; import org.htmlparser.nodes.RemarkNode; @@ -71,6 +72,7 @@ private List<RemarkTextHandler> remarkTextHandler = null; private List<ContentTextHandler> contentTextHandler = null; private List<ParseCompleteHandler> parseCompleteHandlers = null; + private List<ParseStartHandler> parseStartHandlers = null; private List<ParseEventDelegatorVisitor> parserVisitors = null; @@ -234,6 +236,20 @@ } } + public void addParseStartHandler(ParseStartHandler v) { + if(parseStartHandlers == null) { + parseStartHandlers = new ArrayList<ParseStartHandler>(); + } + parseStartHandlers.add(v); + } + public void handleParseStart(ParseContext context) throws IOException { + if(parseStartHandlers != null) { + for(ParseStartHandler v : parseStartHandlers) { + v.handleParseStart(context); + } + } + } + /** * @return the parserVisitors */ Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java 2011-05-25 01:51:34 UTC (rev 3459) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java 2011-05-25 19:40:28 UTC (rev 3460) @@ -32,6 +32,8 @@ * */ public interface ParseEventHandler { + + public void handleParseStart(ParseContext context) throws IOException; public void handleNode(ParseContext context, Node node) throws IOException; public void handleParseComplete(ParseContext context) throws IOException; Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseStartHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseStartHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseStartHandler.java 2011-05-25 19:40:28 UTC (rev 3460) @@ -0,0 +1,10 @@ +package org.archive.wayback.util.htmllex.handlers; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseContext; + +public interface ParseStartHandler { + public void handleParseStart(ParseContext context) + throws IOException; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:51:40
|
Revision: 3459 http://archive-access.svn.sourceforge.net/archive-access/?rev=3459&view=rev Author: bradtofel Date: 2011-05-25 01:51:34 +0000 (Wed, 25 May 2011) Log Message: ----------- OPTIMIZ: now use static reference to ByteOp.UTF8 Charset object. Previously, it was either being "assumed" as default, as in, not specified, or referenced by name, causing a lookup of the Charset object, which was causing lock contention Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java 2011-05-25 01:50:17 UTC (rev 3458) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java 2011-05-25 01:51:34 UTC (rev 3459) @@ -21,6 +21,7 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; @@ -58,6 +59,7 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; public class CDXSort extends Configured implements Tool { @@ -134,8 +136,9 @@ // load the split file, find and set the number of reduces AlphaPartitioner 
partitioner = new AlphaPartitioner(); File localSplitFile = new File(splitPath); - FileReader is = new FileReader(localSplitFile); - BufferedReader bis = new BufferedReader(is); + FileInputStream fis = new FileInputStream(localSplitFile); + InputStreamReader isr = new InputStreamReader(fis,ByteOp.UTF8); + BufferedReader bis = new BufferedReader(isr); // try { // partitioner.loadBoundaries(bis); // } catch (IOException except) { @@ -314,7 +317,7 @@ } try { BufferedReader br = new BufferedReader( - new InputStreamReader(is)); + new InputStreamReader(is,ByteOp.UTF8)); String tmpS = null; long line = 0; while((tmpS = br.readLine()) != null) { Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java 2011-05-25 01:50:17 UTC (rev 3458) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java 2011-05-25 01:51:34 UTC (rev 3459) @@ -37,6 +37,7 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.archive.wayback.util.ByteOp; /** * @author brad @@ -54,8 +55,7 @@ throws IOException { FileSystem fs = path.getFileSystem(conf); FSDataInputStream is = fs.open(path); - BufferedReader br = new BufferedReader(new InputStreamReader(is, - "utf-8")); + BufferedReader br = new BufferedReader(new InputStreamReader(is, ByteOp.UTF8)); int lineCount = 0; while (br.readLine() != null) { lineCount++; Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java 2011-05-25 01:50:17 UTC (rev 3458) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java 2011-05-25 01:51:34 UTC (rev 3459) @@ -10,6 +10,7 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; +import org.archive.wayback.util.ByteOp; public class GZIPRangeLineDereferencingRecordReader extends LineDereferencingRecordReader{ String curInputLine = null; @@ -61,7 +62,7 @@ // the whole chunk is now in buffer: InputStream is = new GZIPInputStream(new ByteArrayInputStream(buffer,0,length)); - curReader = new BufferedReader(new InputStreamReader(is)); + curReader = new BufferedReader(new InputStreamReader(is,ByteOp.UTF8)); curLine = 0; } else { Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java 2011-05-25 01:50:17 UTC (rev 3458) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java 2011-05-25 01:51:34 UTC (rev 3459) @@ -34,6 +34,7 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; +import org.archive.wayback.util.ByteOp; /** * RecordReader which reads pointers to actual files from an internal @@ -92,7 +93,7 @@ // is = new GZIPInputStream(is); is = new MultiMemberGZIPInputStream(is); } - curReader = new BufferedReader(new InputStreamReader(is)); + curReader = new 
BufferedReader(new InputStreamReader(is,ByteOp.UTF8)); } else { // all done: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:50:23
|
Revision: 3458 http://archive-access.svn.sourceforge.net/archive-access/?rev=3458&view=rev Author: bradtofel Date: 2011-05-25 01:50:17 +0000 (Wed, 25 May 2011) Log Message: ----------- TWEAK: robot and live web exception text Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties 2011-05-25 01:49:36 UTC (rev 3457) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties 2011-05-25 01:50:17 UTC (rev 3458) @@ -4,6 +4,10 @@ Exception.accessControl.message=Access to this content has been blocked. {0} Exception.accessRobots.title=Blocked Content Exception.accessRobots.message=Access to this content has been blocked by the sites robots.txt document. +Exception.accessRobotTimeout.title=We were unable to get the robots.txt document to display this page. +Exception.accessRobotTimeout.message=Our request Timed Out. +Exception.accessWebNotAvailable.title=We were unable to get the robots.txt document to display this page. +Exception.accessWebNotAvailable.message=The gateway to the live web is not available. Please try again later. Exception.authenticationControl.title=Authentication Control Exception Exception.authenticationControl.message=This content is not accessible as the current user or from your current location. {0} Exception.badContent.title=Bad Content Exception This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:49:42
|
Revision: 3457 http://archive-access.svn.sourceforge.net/archive-access/?rev=3457&view=rev Author: bradtofel Date: 2011-05-25 01:49:36 +0000 (Wed, 25 May 2011) Log Message: ----------- Fixed AccessPoint list urls to include the request ContextPath, allowing this to work with non-ROOT deployments Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp 2011-05-25 01:47:34 UTC (rev 3456) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp 2011-05-25 01:49:36 UTC (rev 3457) @@ -17,7 +17,7 @@ } for(String accessPoint : accessPoints) { %> - <a href="<%= accessPoint %>/"><%= accessPoint %></a><br></br> + <a href="<%= request.getContextPath() + "/" + accessPoint %>/"><%= accessPoint %></a><br></br> <% } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:47:40
|
Revision: 3456 http://archive-access.svn.sourceforge.net/archive-access/?rev=3456&view=rev Author: bradtofel Date: 2011-05-25 01:47:34 +0000 (Wed, 25 May 2011) Log Message: ----------- REFACTOR: moved flag assignment and parsing code into ArchivalUrl Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/UrlRedirectNotice.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/UrlRedirectNotice.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/UrlRedirectNotice.jsp 2011-05-25 01:46:53 UTC (rev 3455) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/UrlRedirectNotice.jsp 2011-05-25 01:47:34 UTC (rev 3456) @@ -5,6 +5,7 @@ %><%@ page import="java.lang.StringBuffer" %><%@ page import="org.archive.wayback.archivalurl.ArchivalUrlDateRedirectReplayRenderer" %><%@ page import="org.archive.wayback.ResultURIConverter" +%><%@ page import="org.archive.wayback.archivalurl.ArchivalUrl" %><%@ page import="org.archive.wayback.core.UIResults" %><%@ page import="org.archive.wayback.core.WaybackRequest" %><%@ page import="org.archive.wayback.core.CaptureSearchResult" @@ -37,8 +38,8 @@ } } // TODO: Handle replay if we still don't have a redirect.. -String dateSpec = - ArchivalUrlDateRedirectReplayRenderer.makeFlagDateSpec(captureTS, wbr); +ArchivalUrl aUrl = new ArchivalUrl(wbr); +String dateSpec = aUrl.getDateSpec(captureTS); String targetReplayUrl = uric.makeReplayURI(dateSpec,targetUrl); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:47:00
|
Revision: 3455 http://archive-access.svn.sourceforge.net/archive-access/?rev=3455&view=rev Author: bradtofel Date: 2011-05-25 01:46:53 +0000 (Wed, 25 May 2011) Log Message: ----------- OPTIMIZ: now use static reference to ByteOp.UTF8 Charset object. Previously, it was either being "assumed" as default, as in, not specified, or referenced by name, causing a lookup of the Charset object, which was causing lock contention Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/RemoteSubmitFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/BDBResourceFileLocationDB.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/RemoteResourceFileLocationDB.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBLog.java 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ByteOp.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/bdb/BDBRecordSet.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -31,6 +31,8 @@ import java.util.logging.Logger; +import org.archive.wayback.util.ByteOp; + /** * Class which parses a robots.txt file, storing the rules contained therein, * and then allows for testing if path/userAgent tuples are blocked by those @@ -80,7 +82,7 @@ public void parse(InputStream is) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader( - (InputStream) is)); + (InputStream) is,ByteOp.UTF8)); String read; ArrayList<String> current = null; while (br != null) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -34,6 +34,7 @@ import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.bdb.BDBRecord; import org.archive.wayback.util.bdb.BDBRecordSet; @@ -204,7 +205,7 @@ } else if(op.compareTo("-w") == 0) { BufferedReader br = new BufferedReader( - new InputStreamReader(System.in)); + new InputStreamReader(System.in,ByteOp.UTF8)); RecordIterator itrS = new RecordIterator(br); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBRecordToSearchResultAdapter.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBRecordToSearchResultAdapter.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -24,6 +24,7 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.bdb.BDBRecord; /** @@ -50,18 +51,13 @@ */ public CaptureSearchResult adapt(BDBRecord record) { sb.setLength(0); - try { - String key = new String(record.getKey().getData(),"UTF-8"); - int urlEnd = key.indexOf(' '); - int dateSpecEnd = key.indexOf(' ',urlEnd 
+ 1); - sb.append(key.substring(0,dateSpecEnd)); - sb.append(" "); - sb.append(new String(record.getValue().getData(),"UTF-8")); - sb.append(key.substring(dateSpecEnd)); - } catch (UnsupportedEncodingException e) { - // should not happen with UTF-8 hard-coded.. - e.printStackTrace(); - } + String key = new String(record.getKey().getData(),ByteOp.UTF8); + int urlEnd = key.indexOf(' '); + int dateSpecEnd = key.indexOf(' ',urlEnd + 1); + sb.append(key.substring(0,dateSpecEnd)); + sb.append(" "); + sb.append(new String(record.getValue().getData(),ByteOp.UTF8)); + sb.append(key.substring(dateSpecEnd)); return CDXLineToSearchResultAdapter.doAdapt(sb.toString()); } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -20,14 +20,17 @@ package org.archive.wayback.resourceindex.cdx; import java.io.BufferedReader; +import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.util.Iterator; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.cdx.format.CDXFormat; import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; public class CDXFormatIndex extends CDXIndex { @@ -44,7 +47,9 @@ try { // BUGBUG: I don't think java will let us do much better than // this... No way to stat() a filehandle, right? 
- BufferedReader fr = new BufferedReader(new FileReader(file)); + FileInputStream fis = new FileInputStream(file); + InputStreamReader isr = new InputStreamReader(fis,ByteOp.UTF8); + BufferedReader fr = new BufferedReader(isr); cdx = new CDXFormat(fr.readLine()); lastMod = nowMod; fr.close(); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/RemoteSubmitFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/RemoteSubmitFilter.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/RemoteSubmitFilter.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -40,6 +40,8 @@ import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; +import org.archive.wayback.util.ByteOp; + /** * Filter that accepts PUT HTTP requests to insert CDX files into the incoming * directory for a local BDBIndex. 
@@ -152,7 +154,7 @@ InputStream input; input = request.getInputStream(); BufferedInputStream in = new BufferedInputStream(input); - BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + BufferedReader reader = new BufferedReader(new InputStreamReader(in,ByteOp.UTF8)); FileWriter out = new FileWriter(tmpFile); while ((i = reader.read()) != -1) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -27,6 +27,8 @@ import java.util.logging.Logger; import java.util.zip.GZIPInputStream; +import org.archive.wayback.util.ByteOp; + /** * @author brad * @@ -73,6 +75,6 @@ URLConnection uc = u.openConnection(); uc.setRequestProperty(RANGE_HEADER, sb.toString()); return new BufferedReader(new InputStreamReader( - new GZIPInputStream(uc.getInputStream()))); + new GZIPInputStream(uc.getInputStream()),ByteOp.UTF8)); } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -31,6 +31,7 @@ import java.util.zip.GZIPInputStream; import org.archive.wayback.exception.RuntimeIOException; 
+import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; /** @@ -134,7 +135,7 @@ long offset = i * ZiplinedBlock.BLOCK_SIZE; raf.seek(offset); BufferedReader br = new BufferedReader(new InputStreamReader( - new GZIPInputStream(new FileInputStream(raf.getFD())))); + new GZIPInputStream(new FileInputStream(raf.getFD())),ByteOp.UTF8)); String line = br.readLine(); if(line == null) { System.err.println("Bad block at " + offset + " in " + args[0]); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -21,13 +21,16 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.logging.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.DirMaker; @@ -102,7 +105,9 @@ public long getLastMark() throws IOException { long mark = 0; if(file.isFile() && file.length() > 0) { - BufferedReader ir = new BufferedReader(new FileReader(file)); + FileInputStream fis = new FileInputStream(file); + InputStreamReader isr = new InputStreamReader(fis,ByteOp.UTF8); + BufferedReader ir = new BufferedReader(isr); String line = ir.readLine(); if(line != null) { mark = Long.parseLong(line); Modified: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/BDBResourceFileLocationDB.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/BDBResourceFileLocationDB.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/BDBResourceFileLocationDB.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -25,6 +25,7 @@ import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBLog; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.bdb.BDBRecordSet; @@ -277,7 +278,7 @@ db.setBdbName(bdbName); db.setLogPath(logPath); BufferedReader r = new BufferedReader( - new InputStreamReader(System.in)); + new InputStreamReader(System.in,ByteOp.UTF8)); String line; int exitCode = 0; try { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/RemoteResourceFileLocationDB.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/RemoteResourceFileLocationDB.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/RemoteResourceFileLocationDB.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -34,6 +34,7 @@ import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.PostMethod; import org.apache.commons.httpclient.util.ParameterFormatter; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; import 
org.archive.wayback.util.WrappedCloseableIterator; @@ -265,7 +266,7 @@ if(operation.equalsIgnoreCase("add-stream")) { BufferedReader r = new BufferedReader( - new InputStreamReader(System.in)); + new InputStreamReader(System.in,ByteOp.UTF8)); String line; try { while((line = r.readLine()) != null) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBLog.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBLog.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBLog.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -21,11 +21,14 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; +import java.io.InputStreamReader; import java.io.RandomAccessFile; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.flatfile.RecordIterator; @@ -87,7 +90,9 @@ RandomAccessFile raf = new RandomAccessFile(this, "r"); raf.seek(start); - BufferedReader is = new BufferedReader(new FileReader(raf.getFD())); + FileInputStream fis = new FileInputStream(raf.getFD()); + InputStreamReader isr = new InputStreamReader(fis,ByteOp.UTF8); + BufferedReader is = new BufferedReader(isr); return new BufferedRangeIterator(new RecordIterator(is),end - start); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -24,6 +24,8 @@ import java.io.InputStreamReader; import java.net.URL; +import org.archive.wayback.util.ByteOp; + /** * * @@ -57,7 +59,7 @@ String url = "http://localhost:8080" + jsp + "?url=" + prefix; URL u = new URL(url); InputStream is = u.openStream(); - InputStreamReader isr = new InputStreamReader(is); + InputStreamReader isr = new InputStreamReader(is,ByteOp.UTF8); StringBuilder sb = new StringBuilder(2000); int READ_SIZE = 2048; char cbuf[] = new char[READ_SIZE]; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -27,7 +27,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.archive.wayback.util.ByteOp; + /** * * @@ -61,7 +63,7 @@ public static List<String> extractLinks(final String url) throws IOException { URL u = new URL(url); InputStream is = u.openStream(); - InputStreamReader isr = new InputStreamReader(is); + InputStreamReader isr = new InputStreamReader(is,ByteOp.UTF8); StringBuilder sb = new StringBuilder(2000); int READ_SIZE = 2048; char cbuf[] = new char[READ_SIZE]; Modified: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ByteOp.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ByteOp.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ByteOp.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.Charset; /** * Byte oriented static methods. Likely a lot of overlap with apache- commons @@ -33,6 +34,7 @@ public class ByteOp { /** Default buffer size for IO ops */ public final static int BUFFER_SIZE = 4096; + public final static Charset UTF8 = Charset.forName("utf-8"); /** * Create a new byte array with contents initialized to values from the Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/bdb/BDBRecordSet.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/bdb/BDBRecordSet.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/bdb/BDBRecordSet.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -24,7 +24,9 @@ import java.io.UnsupportedEncodingException; import java.util.Iterator; +import org.archive.wayback.util.ByteOp; + import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseConfig; @@ -119,26 +121,14 @@ * @return byte array representation of String s in UTF-8 */ public static byte[] stringToBytes(String s) { - try { - return s.getBytes("UTF-8"); - } catch (UnsupportedEncodingException e) { - // no UTF-8, huh? 
- e.printStackTrace(); - return s.getBytes(); - } + return s.getBytes(ByteOp.UTF8); } /** * @param ba * @return String of UTF-8 encoded bytes ba */ public static String bytesToString(byte[] ba) { - try { - return new String(ba,"UTF-8"); - } catch (UnsupportedEncodingException e) { - // not likely.. - e.printStackTrace(); - return new String(ba); - } + return new String(ba,ByteOp.UTF8); } /** Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -21,13 +21,16 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.io.RandomAccessFile; import java.util.Comparator; import java.util.Iterator; +import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.CompositeSortedIterator; @@ -180,7 +183,9 @@ RandomAccessFile raf = new RandomAccessFile(file,"r"); long offset = findKeyOffset(raf,prefix); lastMatchOffset = offset; - BufferedReader br = new BufferedReader(new FileReader(raf.getFD())); + FileInputStream is = new FileInputStream(raf.getFD()); + InputStreamReader isr = new InputStreamReader(is, ByteOp.UTF8); + BufferedReader br = new BufferedReader(isr); itr = new RecordIterator(br); return itr; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2011-05-25 01:44:09 UTC (rev 3454) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2011-05-25 01:46:53 UTC (rev 3455) @@ -31,6 +31,7 @@ import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.util.ByteOp; /** * Class that performs the standard Heritrix URL canonicalization. Eventually, @@ -365,7 +366,7 @@ for(int idx = 0; idx < columns.size(); idx++) { cols[idx] = columns.get(idx).intValue() - 1; } - BufferedReader r = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader r = new BufferedReader(new InputStreamReader(System.in,ByteOp.UTF8)); StringBuilder sb = new StringBuilder(); String line = null; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:44:15
|
Revision: 3454 http://archive-access.svn.sourceforge.net/archive-access/?rev=3454&view=rev Author: bradtofel Date: 2011-05-25 01:44:09 +0000 (Wed, 25 May 2011) Log Message: ----------- BUGFIX: now uses current Heritrix W/ARC readers, to work around Java GZIPInputStream "bugfix" Modified Paths: -------------- trunk/archive-access/projects/wayback/pom.xml trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java Modified: trunk/archive-access/projects/wayback/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2011-05-25 01:40:30 UTC (rev 3453) +++ trunk/archive-access/projects/wayback/pom.xml 2011-05-25 01:44:09 UTC (rev 3454) @@ -262,7 +262,7 @@ <dependency> <groupId>org.archive.heritrix</groupId> <artifactId>heritrix-commons</artifactId> - <version>3.1.1-SNAPSHOT</version> + <version>3.0.1-SNAPSHOT</version> </dependency> <dependency> <groupId>org.archive.access-control</groupId> @@ -297,7 +297,7 @@ <dependency> <groupId>com.flagstone</groupId> <artifactId>transform</artifactId> - <version>3.0.1-SNAPSHOT</version> + <version>3.0.2</version> </dependency> <dependency> <artifactId>hadoop-core</artifactId> Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2011-05-25 01:40:30 UTC (rev 3453) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2011-05-25 01:44:09 UTC (rev 3454) @@ -159,6 +159,14 @@ public 
String getTemplate() { return LIVE_WAYBACK_TEMPLATE; } + + public boolean getFrequentFlushes() { + return true; + } + + public int getWriteBufferSize() { + return 4096; + } }; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java 2011-05-25 01:40:30 UTC (rev 3453) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ARCCreator.java 2011-05-25 01:44:09 UTC (rev 3454) @@ -26,9 +26,11 @@ import java.text.ParseException; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; +import org.archive.io.WriterPoolSettings; import org.archive.io.arc.ARCConstants; import org.archive.io.arc.ARCWriter; import org.archive.util.ArchiveUtils; @@ -98,9 +100,9 @@ throws IOException { File target[] = {tgtDir}; + ARCWriter writer = new ARCWriter(new AtomicInteger(), - Arrays.asList(target),prefix,true, - ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE); + getSettings(true,prefix,Arrays.asList(target))); File sources[] = srcDir.listFiles(); LOGGER.info("Found " + sources.length + " files in " + srcDir); for(int i = 0; i<sources.length; i++) { @@ -121,6 +123,43 @@ LOGGER.info("Closed arc file named " + writer.getFile().getAbsolutePath()); } + private WriterPoolSettings getSettings(final boolean isCompressed, + final String prefix, final List<File> arcDirs) { + return new WriterPoolSettings() { + public List<File> getOutputDirs() { + return arcDirs; + } + + @SuppressWarnings({ "unchecked", "rawtypes" }) + public List getMetadata() { + return null; + } + + public String getPrefix() { + return prefix; + } + + public boolean getCompress() { + return isCompressed; + } + + public long getMaxFileSizeBytes() { + return 
ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE; + } + + public String getTemplate() { + return "${prefix}-${timestamp17}-${serialno}"; + } + + public boolean getFrequentFlushes() { + return false; + } + + public int getWriteBufferSize() { + return 4096; + } + }; + } /** * @param args Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java 2011-05-25 01:40:30 UTC (rev 3453) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java 2011-05-25 01:44:09 UTC (rev 3454) @@ -26,8 +26,14 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import org.archive.io.WriterPoolSettings; +import org.archive.io.arc.ARCConstants; import org.archive.io.warc.WARCWriter; +import org.archive.io.warc.WARCWriterPoolSettings; +import org.archive.uid.RecordIDGenerator; +import org.archive.uid.UUIDGenerator; import org.archive.util.anvl.ANVLRecord; public class WARCHeader { @@ -45,8 +51,7 @@ List<String> metadata = new ArrayList<String>(1); metadata.add(ar.toString()); - writer = new WARCWriter(null, bos, target, true, null, - metadata); + writer = new WARCWriter(new AtomicInteger(),bos,target,getSettings(true, null, null, metadata)); // Write a warcinfo record with description about how this WARC // was made. 
writer.writeWarcinfoRecord(target.getName(), "Made from " @@ -54,7 +59,48 @@ + this.getClass().getName()); } + private WARCWriterPoolSettings getSettings(final boolean isCompressed, + final String prefix, final List<File> arcDirs, final List metadata) { + return new WARCWriterPoolSettings() { + public List<File> getOutputDirs() { + return arcDirs; + } + @SuppressWarnings({ "unchecked", "rawtypes" }) + public List getMetadata() { + return metadata; + } + + public String getPrefix() { + return prefix; + } + + public boolean getCompress() { + return isCompressed; + } + + public long getMaxFileSizeBytes() { + return ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE; + } + + public String getTemplate() { + return "${prefix}-${timestamp17}-${serialno}"; + } + + public boolean getFrequentFlushes() { + return false; + } + + public int getWriteBufferSize() { + return 4096; + } + + public RecordIDGenerator getRecordIDGenerator() { + return new UUIDGenerator(); + } + }; + } + public static void main(String[] args) { if (args.length != 3) { System.err.println("USAGE: tgtWarc fieldsSrc id"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |