From: <bra...@us...> - 2010-04-02 03:06:34
|
Revision: 3010 http://archive-access.svn.sourceforge.net/archive-access/?rev=3010&view=rev Author: bradtofel Date: 2010-04-02 03:06:28 +0000 (Fri, 02 Apr 2010) Log Message: ----------- Many unreported bugfixes, slight change of interface to allow grabbing an iterator of String(lines), added a main() method, and added a truncated() method to the iterators, currently not exposed enough to be useful, but potentially allowing an external user to determine if the search was cut off because too many blocks had to be searched. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2010-04-02 02:53:44 UTC (rev 3009) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2010-04-02 03:06:28 UTC (rev 3010) @@ -39,10 +39,17 @@ Iterator<String> inner = null; private String cachedNext = null; private boolean done = false; + private boolean truncated = false; public StringPrefixIterator(Iterator<String> inner, String prefix) { this.prefix = prefix; this.inner = inner; + if(inner instanceof ZiplinesChunkIterator) { + truncated = ((ZiplinesChunkIterator)inner).isTruncated(); + } } + public boolean isTruncated() { + return truncated; + } /* (non-Javadoc) * @see java.util.Iterator#hasNext() */ Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2010-04-02 02:53:44 UTC (rev 3009) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2010-04-02 03:06:28 UTC (rev 3010) @@ -32,11 +32,16 @@ import java.net.URLConnection; import java.util.zip.GZIPInputStream; +import org.apache.log4j.Logger; + /** * @author brad * */ public class ZiplinedBlock { + private static final Logger LOGGER = Logger.getLogger( + ZiplinedBlock.class.getName()); + String urlOrPath = null; long offset = -1; public final static int BLOCK_SIZE = 128 * 1024; @@ -56,11 +61,13 @@ * @throws IOException for usual reasons */ public BufferedReader readBlock() throws IOException { - URL u = new URL(urlOrPath); - URLConnection uc = u.openConnection(); StringBuilder sb = new StringBuilder(16); sb.append(BYTES_HEADER).append(offset).append(BYTES_MINUS); sb.append((offset + BLOCK_SIZE)-1); + LOGGER.trace("Reading block:" + urlOrPath + "("+sb.toString()+")"); + // TODO: timeouts + URL u = new URL(urlOrPath); + URLConnection uc = u.openConnection(); uc.setRequestProperty(RANGE_HEADER, sb.toString()); return new BufferedReader(new InputStreamReader( new GZIPInputStream(uc.getInputStream()))); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2010-04-02 02:53:44 UTC (rev 3009) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2010-04-02 03:06:28 UTC (rev 3010) @@ -37,20 +37,27 @@ import java.util.RandomAccess; import java.util.zip.GZIPInputStream; +import org.apache.log4j.Logger; import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.webapp.AccessPoint; /** * @author brad * */ public class ZiplinesChunkIterator implements CloseableIterator<String> { + private static final Logger LOGGER = Logger.getLogger( + ZiplinesChunkIterator.class.getName()); + private BufferedReader br = null; private Iterator<ZiplinedBlock> blockItr = null; private String cachedNext = null; + private boolean truncated = false; /** * @param blocks which should be fetched and unzipped, one after another */ public ZiplinesChunkIterator(List<ZiplinedBlock> blocks) { + LOGGER.info("initialized with " + blocks.size() + " blocks"); blockItr = blocks.iterator(); } /* (non-Javadoc) @@ -148,4 +155,16 @@ System.exit(1); } } + /** + * @return the truncated + */ + public boolean isTruncated() { + return truncated; + } + /** + * @param truncated the truncated to set + */ + public void setTruncated(boolean truncated) { + this.truncated = truncated; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-04-02 02:53:44 UTC (rev 3009) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-04-02 03:06:28 UTC (rev 3010) @@ -28,8 +28,10 @@ import it.unimi.dsi.mg4j.util.FrontCodedStringList; import java.io.BufferedReader; +import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; +import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -82,6 +84,7 @@ private String chunkMapPath = null; private HashMap<String,String> chunkMap = null; private CDXFormat format = null; + private int maxBlocks = 1000; public ZiplinesSearchResultSource() { } @@ -130,40 +133,51 @@ } } - public Iterator<String> getStringPrefixIterator(String prefix) throws ResourceIndexNotAvailableException, IOException { - CloseableIterator<String> itr = chunkIndex.getRecordIteratorLT(prefix); + public Iterator<String> getStringPrefixIterator(String prefix) + throws ResourceIndexNotAvailableException, IOException { + ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); boolean first = true; - while(itr.hasNext()) { - String blockDescriptor = itr.next(); - String parts[] = blockDescriptor.split("\t"); - if(parts.length != 3) { - throw new ResourceIndexNotAvailableException("Bad line(" + - blockDescriptor + ")"); + int numBlocks = 0; + boolean truncated = false; + CloseableIterator<String> itr = null; + try { + itr = chunkIndex.getRecordIteratorLT(prefix); + while(itr.hasNext()) { + if(numBlocks >= maxBlocks) { + truncated = true; + break; + } + String blockDescriptor = itr.next(); + numBlocks++; + String parts[] = blockDescriptor.split("\t"); + if(parts.length != 3) { + throw new ResourceIndexNotAvailableException("Bad line(" + + blockDescriptor + ")"); + } + // only compare the correct length: + String prefCmp = prefix; + String blockCmp = parts[0]; + if(first) { + // always add first: + first = false; + } else if(!blockCmp.startsWith(prefCmp)) { + // all done; + break; + } + // add this and keep lookin... + String url = chunkMap.get(parts[1]); + long offset = Long.parseLong(parts[2]); + blocks.add(new ZiplinedBlock(url, offset)); } - // only compare the correct length: - String prefCmp = prefix; - String blockCmp = parts[0]; -// if(prefCmp.length() < blockCmp.length()) { -// blockCmp = blockCmp.substring(0,prefCmp.length()); -// } else { -// prefCmp = prefCmp.substring(0,blockCmp.length()); -// } - if(first) { - // always add first: - first = false; -// } else if(blockCmp.compareTo(prefCmp) > 0) { - } else if(!blockCmp.startsWith(prefCmp)) { - // all done; - break; + } finally { + if(itr != null) { + itr.close(); } - // add this and keep lookin... - String url = chunkMap.get(parts[1]); - long offset = Long.parseLong(parts[2]); - blocks.add(new ZiplinedBlock(url, offset)); } - itr.close(); - return new StringPrefixIterator(new ZiplinesChunkIterator(blocks),prefix); + ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks); + zci.setTruncated(truncated); + return new StringPrefixIterator(zci,prefix); } /* (non-Javadoc) @@ -216,5 +230,103 @@ public void setChunkMapPath(String chunkMapPath) { this.chunkMapPath = chunkMapPath; } + /** + * @return the maxBlocks + */ + public int getMaxBlocks() { + return maxBlocks; + } + /** + * @param maxBlocks the maxBlocks to set + */ + public void setMaxBlocks(int maxBlocks) { + this.maxBlocks = maxBlocks; + } + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("zl-bin-search [-format FORMAT] [-max MAX_BLOCKS] SUMMARY LOCATION KEY"); + System.err.println(""); + System.err.println("Search a ziplined compressed CDX format index for key"); + System.err.println("KEY to STDOUT. SUMMARY and LOCATION are paths to the"); + System.err.println("block summary and file location files."); + System.err.println("With -format, output CDX in format FORMAT."); + System.err.println("With -max, limit search at most MAX_BLOCKS blocks."); + System.exit(1); + } + + /** + * @param args + */ + public static void main(String[] args) { +// String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC; + String cdxSpec = " CDX N b a m s k r V g"; + CDXFormat format = null; + try { + format = new CDXFormat(cdxSpec); + } catch (CDXFormatException e1) { + e1.printStackTrace(); + System.exit(1); + } + ZiplinesSearchResultSource zl = new ZiplinesSearchResultSource(format); + PrintWriter pw = new PrintWriter(System.out); + int idx; + for(idx = 0; idx < args.length; idx++) { + if(args[idx].equals("-format")) { + idx++; + if(idx >= args.length) { + USAGE(); + } + try { + zl.setFormat(new CDXFormat(args[idx])); + } catch (CDXFormatException e1) { + e1.printStackTrace(); + System.exit(1); + } + } else if(args[idx].equals("-max")) { + idx++; + if(idx >= args.length) { + USAGE(); + } + try { + zl.setMaxBlocks(Integer.parseInt(args[idx])); + } catch(NumberFormatException e) { + USAGE(); + System.exit(1); + } + + } else { + break; + } + } + if(args.length < idx + 3) { + USAGE(); + } + // first is summary path, then location path, then search key: + zl.setChunkIndexPath(args[idx++]); + zl.setChunkMapPath(args[idx++]); + String key = args[idx++]; + + try { + zl.init(); + Iterator<String> itr = zl.getStringPrefixIterator(key); + boolean truncated = ((StringPrefixIterator)itr).isTruncated(); + while(itr.hasNext()) { + pw.println(itr.next()); + } + pw.close(); + if(truncated) { + System.err.println("Note that results are truncated..."); + } + } catch (ResourceIndexNotAvailableException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + System.exit(1); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + System.exit(1); + } + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 17:26:38
|
Revision: 3478 http://archive-access.svn.sourceforge.net/archive-access/?rev=3478&view=rev Author: bradtofel Date: 2011-06-16 17:26:31 +0000 (Thu, 16 Jun 2011) Log Message: ----------- FEATURE: abstracted out fetching of byte chunks from local/remote files moved current code into Http11BlockLoader, which now uses a multithreaded HTTP connection manager to reuse connections implemented an HDFS BlockLoader Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLoader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/RemoteHttp11BlockLoader.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLoader.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -0,0 +1,19 @@ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; + +public interface BlockLoader { + /** + * Fetch a range of bytes from a particular URL. Note that the bytes are + * read into memory all at once, so care should be taken with the length + * argument. + * + * @param url String URL to fetch + * @param offset byte start offset of the desired range + * @param length number of octets to fetch + * @return a new byte[] containing the octets fetched + * @throws IOException on Network and protocol failures, as well as Timeouts + */ + public byte[] getBlock(String url, long offset, int length) + throws IOException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -0,0 +1,46 @@ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +public class HDFSBlockLoader implements BlockLoader { + FileSystem fs = null; + String defaultFSURI = null; + public HDFSBlockLoader(String defaultFSURI) { + this.defaultFSURI = defaultFSURI; + } + public void init() throws IOException, URISyntaxException { + Configuration c = new Configuration(); + c.set("fs.default.name",defaultFSURI); + fs = FileSystem.get(new URI(defaultFSURI),c); + } + + public byte[] getBlock(String url, long offset, int length) + throws IOException { + Path path = new Path(url); + FSDataInputStream s = fs.open(path); + byte buffer[] = new byte[length]; + s.readFully(offset, buffer); + return buffer; + } + + /** + * @return the defaultFSURI + */ + public String getDefaultFSURI() { + return defaultFSURI; + } + + /** + * @param defaultFSURI the defaultFSURI to set + */ + public void setDefaultFSURI(String defaultFSURI) { + this.defaultFSURI = defaultFSURI; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -0,0 +1,164 @@ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.io.InputStream; +import java.util.logging.Logger; +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.params.HttpClientParams; +import org.archive.wayback.webapp.PerformanceLogger; + +import com.google.common.io.ByteStreams; + +/** + * Class which wraps most of the complexity of an apache commons httpclient + * MultiThreaderHttpConnectionManager, exposing common configuration elements + * to Spring configuration. + * + * This class is a near direct copy of RemoteLiveWebCache: refactoring needed. + * + * @author brad + * + */ +public class Http11BlockLoader implements BlockLoader { + private static final Logger LOGGER = Logger.getLogger( + Http11BlockLoader.class.getName()); + + private MultiThreadedHttpConnectionManager connectionManager = null; + private HostConfiguration hostConfiguration = null; + private HttpClient http = null; + + /** + * + */ + public Http11BlockLoader() { + connectionManager = new MultiThreadedHttpConnectionManager(); + hostConfiguration = new HostConfiguration(); + HttpClientParams params = new HttpClientParams(); +// params.setParameter(HttpClientParams.RETRY_HANDLER, new NoRetryHandler()); + http = new HttpClient(params,connectionManager); + http.setHostConfiguration(hostConfiguration); + } + + /** + * Fetch a range of bytes from a particular URL. Note that the bytes are + * read into memory all at once, so care should be taken with the length + * argument. + * + * @param url String URL to fetch + * @param offset byte start offset of the desired range + * @param length number of octets to fetch + * @return a new byte[] containing the octets fetched + * @throws IOException on HTTP and Socket failures, as well as Timeouts + */ + public byte[] getBlock(String url, long offset, int length) + throws IOException { + + HttpMethod method = null; + try { + method = new GetMethod(url); + } catch(IllegalArgumentException e) { + LOGGER.warning("Bad URL for live web fetch:" + url); + throw new IOException("Url:" + url + " does not look like an URL?"); + } + StringBuilder sb = new StringBuilder(16); + sb.append(ZiplinedBlock.BYTES_HEADER).append(offset); + sb.append(ZiplinedBlock.BYTES_MINUS).append((offset + length)-1); + String rangeHeader = sb.toString(); + method.addRequestHeader(ZiplinedBlock.RANGE_HEADER, rangeHeader); + //uc.setRequestProperty(RANGE_HEADER, sb.toString()); + long start = System.currentTimeMillis(); + try { + LOGGER.fine("Reading block:" + url + "("+rangeHeader+")"); + int status = http.executeMethod(method); + if((status == 200) || (status == 206)) { + InputStream is = method.getResponseBodyAsStream(); + byte[] block = new byte[length]; + ByteStreams.readFully(is, block); + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("CDXBlockLoad",elapsed,url); + return block; + + } else { + throw new IOException("Bad status for " + url); + } + } finally { + method.releaseConnection(); + } + } + + /** + * @param hostPort to proxy requests through - ex. "localhost:3128" + */ + public void setProxyHostPort(String hostPort) { + int colonIdx = hostPort.indexOf(':'); + if(colonIdx > 0) { + String host = hostPort.substring(0,colonIdx); + int port = Integer.valueOf(hostPort.substring(colonIdx+1)); + + hostConfiguration.setProxy(host, port); + } + } + + /** + * @param maxTotalConnections the HttpConnectionManagerParams config + */ + public void setMaxTotalConnections(int maxTotalConnections) { + connectionManager.getParams(). + setMaxTotalConnections(maxTotalConnections); + } + + /** + * @return the HttpConnectionManagerParams maxTotalConnections config + */ + public int getMaxTotalConnections() { + return connectionManager.getParams().getMaxTotalConnections(); + } + + /** + * @param maxHostConnections the HttpConnectionManagerParams config + */ + public void setMaxHostConnections(int maxHostConnections) { + connectionManager.getParams(). + setMaxConnectionsPerHost(hostConfiguration, maxHostConnections); + } + + /** + * @return the HttpConnectionManagerParams maxHostConnections config + */ + public int getMaxHostConnections() { + return connectionManager.getParams(). + getMaxConnectionsPerHost(hostConfiguration); + } + + /** + * @return the connectionTimeoutMS + */ + public int getConnectionTimeoutMS() { + return connectionManager.getParams().getConnectionTimeout(); + } + + /** + * @param connectionTimeoutMS the connectionTimeoutMS to set + */ + public void setConnectionTimeoutMS(int connectionTimeoutMS) { + connectionManager.getParams().setConnectionTimeout(connectionTimeoutMS); + } + + /** + * @return the socketTimeoutMS + */ + public int getSocketTimeoutMS() { + return connectionManager.getParams().getSoTimeout(); + } + + /** + * @param socketTimeoutMS the socketTimeoutMS to set + */ + public void setSocketTimeoutMS(int socketTimeoutMS) { + connectionManager.getParams().setSoTimeout(socketTimeoutMS); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/RemoteHttp11BlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/RemoteHttp11BlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/RemoteHttp11BlockLoader.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -0,0 +1,9 @@ +package org.archive.wayback.resourceindex.ziplines; + +/** + * @author brad + * @deprecated use Http11BlockLoader + */ +public class RemoteHttp11BlockLoader extends Http11BlockLoader { + +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2011-06-16 17:23:08 UTC (rev 3477) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -20,6 +20,7 @@ package org.archive.wayback.resourceindex.ziplines; import java.io.BufferedReader; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; @@ -37,13 +38,14 @@ private static final Logger LOGGER = Logger.getLogger( ZiplinedBlock.class.getName()); + BlockLoader loader = null; String urlOrPath = null; long offset = -1; int count = 0; public final static int BLOCK_SIZE = 128 * 1024; - private final static String RANGE_HEADER = "Range"; - private final static String BYTES_HEADER = "bytes="; - private final static String BYTES_MINUS = "-"; + public final static String RANGE_HEADER = "Range"; + public final static String BYTES_HEADER = "bytes="; + public final static String BYTES_MINUS = "-"; /** * @param urlOrPath URL where this file can be downloaded * @param offset start of 128K block boundary. @@ -62,10 +64,29 @@ this.count = count; } /** + * @param loader the RemoteHttp11BlockLoader to use when fetching this block + */ + public void setLoader(BlockLoader loader) { + this.loader = loader; + } + /** * @return a BufferedReader of the underlying compressed data in this block * @throws IOException for usual reasons */ public BufferedReader readBlock() throws IOException { + if(loader != null) { + return readBlockEfficiently(loader); + } + return readBlockInefficiently(); + } + private BufferedReader readBlockEfficiently(BlockLoader remote) + throws IOException { + byte bytes[] = remote.getBlock(urlOrPath, offset, BLOCK_SIZE); + return new BufferedReader(new InputStreamReader( + new GZIPInputStream(new ByteArrayInputStream(bytes)), + ByteOp.UTF8)); + } + private BufferedReader readBlockInefficiently() throws IOException { StringBuilder sb = new StringBuilder(16); sb.append(BYTES_HEADER).append(offset).append(BYTES_MINUS); sb.append((offset + BLOCK_SIZE)-1); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-06-16 17:23:08 UTC (rev 3477) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-06-16 17:26:31 UTC (rev 3478) @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.PrintWriter; +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -78,6 +79,7 @@ private HashMap<String,String> chunkMap = null; private CDXFormat format = null; private int maxBlocks = 1000; + private BlockLoader blockLoader = null; public ZiplinesSearchResultSource() { } @@ -165,7 +167,9 @@ String url = chunkMap.get(parts[1]); long offset = Long.parseLong(parts[2]); LOGGER.info("Adding block source(" + parts[1] + "):" + offset); - blocks.add(new ZiplinedBlock(url, offset)); + ZiplinedBlock block = new ZiplinedBlock(url, offset); + block.setLoader(blockLoader); + blocks.add(block); } } finally { if(itr != null) { @@ -245,8 +249,22 @@ */ public void setMaxBlocks(int maxBlocks) { this.maxBlocks = maxBlocks; - } + } + /** + * @return the blockLoader + */ + public BlockLoader getBlockLoader() { + return blockLoader; + } + + /** + * @param blockLoader the blockLoader to set + */ + public void setBlockLoader(BlockLoader blockLoader) { + this.blockLoader = blockLoader; + } + private static void USAGE() { System.err.println("USAGE:"); System.err.println(""); @@ -267,6 +285,7 @@ // String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC; String cdxSpec = " CDX N b a m s k r V g"; CDXFormat format = null; + BlockLoader blockLoader = new Http11BlockLoader(); try { format = new CDXFormat(cdxSpec); } catch (CDXFormatException e1) { @@ -291,6 +310,23 @@ } } else if(args[idx].equals("-blockDump")) { blockDump = true; + } else if(args[idx].equals("-hdfs")) { + idx++; + if(idx >= args.length) { + USAGE(); + } + blockLoader = new HDFSBlockLoader(args[idx]); + try { + ((HDFSBlockLoader)blockLoader).init(); + } catch (IOException e) { + e.printStackTrace(); + USAGE(); + System.exit(1); + } catch (URISyntaxException e) { + e.printStackTrace(); + USAGE(); + System.exit(1); + } } else if(args[idx].equals("-max")) { idx++; if(idx >= args.length) { @@ -319,6 +355,7 @@ USAGE(); } // first is summary path, then location path, then search key: + zl.setBlockLoader(blockLoader); zl.setChunkIndexPath(args[idx++]); zl.setChunkMapPath(args[idx++]); String key = args[idx++]; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-09-06 03:59:30
|
Revision: 3521 http://archive-access.svn.sourceforge.net/archive-access/?rev=3521&view=rev Author: bradtofel Date: 2011-09-06 03:59:24 +0000 (Tue, 06 Sep 2011) Log Message: ----------- LICENSE Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java 2011-09-06 03:58:31 UTC (rev 3520) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java 2011-09-06 03:59:24 UTC (rev 3521) @@ -1,3 +1,22 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.archive.wayback.resourceindex.ziplines; import java.io.IOException; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java 2011-09-06 03:58:31 UTC (rev 3520) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/Http11BlockLoader.java 2011-09-06 03:59:24 UTC (rev 3521) @@ -1,3 +1,22 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.archive.wayback.resourceindex.ziplines; import java.io.IOException; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-09-06 04:06:15
|
Revision: 3524 http://archive-access.svn.sourceforge.net/archive-access/?rev=3524&view=rev Author: bradtofel Date: 2011-09-06 04:06:09 +0000 (Tue, 06 Sep 2011) Log Message: ----------- FEATURE: allows multiple possible locations for a block - attempts to read will try each in order Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLocation.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLocation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLocation.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/BlockLocation.java 2011-09-06 04:06:09 UTC (rev 3524) @@ -0,0 +1,41 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.resourceindex.ziplines; + +public class BlockLocation { + String name; + String locations[]; + public BlockLocation(String name, String locations[]) { + this.name = name; + this.locations = locations; + } + /** + * @return the name + */ + public String getName() { + return name; + } + /** + * @return the locations + */ + public String[] getLocations() { + return locations; + } +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java 2011-09-06 04:02:57 UTC (rev 3523) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java 2011-09-06 04:06:09 UTC (rev 3524) @@ -37,11 +37,11 @@ ZiplinedBlockStringSequence.class.getName()); private FlatFile chunkIndex = null; - private HashMap<String,String> chunkMap = null; + private HashMap<String,BlockLocation> chunkMap = null; private int maxBlocks = 10000; public ZiplinedBlockStringSequence(FlatFile chunkIndex, - HashMap<String,String> chunkMap) { + HashMap<String,BlockLocation> chunkMap) { this.chunkIndex = chunkIndex; this.chunkMap = chunkMap; } @@ -80,11 +80,11 @@ break; } // add this and keep lookin... - String url = chunkMap.get(parts[1]); + BlockLocation bl = chunkMap.get(parts[1]); long offset = Long.parseLong(parts[2]); int count = Integer.parseInt(parts[3]); - blocks.add(new ZiplinedBlock(url, offset, count)); + blocks.add(new ZiplinedBlock(bl.getLocations(), offset, count)); } } finally { if(itr != null) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-09-06 04:18:07
|
Revision: 3531 http://archive-access.svn.sourceforge.net/archive-access/?rev=3531&view=rev Author: bradtofel Date: 2011-09-06 04:18:00 +0000 (Tue, 06 Sep 2011) Log Message: ----------- INITIAL REV: BlockLoader which loads from Local RandomAccessFiles, and a generic BlockLoader which tries to do "the right thing" with each location, choosing the correct block loader based on the URL/String prefix Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/GenericBlockLoader.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/LocalFileBlockLoader.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/GenericBlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/GenericBlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/GenericBlockLoader.java 2011-09-06 04:18:00 UTC (rev 3531) @@ -0,0 +1,78 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.net.URISyntaxException; + +/** + * Generic BlockLoader, which may simplify configuration - inspecting each + * location to attempt to choose the correct BlockLoader: + * HDFS, HTTP, or LocalFile + * @author brad + * + */ +public class GenericBlockLoader implements BlockLoader { + Http11BlockLoader http = null; + HDFSBlockLoader hdfs = null; + LocalFileBlockLoader local = null; + private String defaultFSURI; + public GenericBlockLoader() { + http = new Http11BlockLoader(); +// hdfs = new HDFSBlockLoader(null); +// try { +// hdfs.init(); +// } catch (IOException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } catch (URISyntaxException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } + local = new LocalFileBlockLoader(); + } + public void init() { + if(defaultFSURI != null) { + hdfs = new HDFSBlockLoader(defaultFSURI); + try { + hdfs.init(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (URISyntaxException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + public byte[] getBlock(String url, long offset, int length) + throws IOException { + if(hdfs != null && url.startsWith("hdfs://")) { + return hdfs.getBlock(url, offset, length); + } else if(url.startsWith("/")) { + return local.getBlock(url, offset, length); + } + return http.getBlock(url, offset, length); + } + public void setDefaultFSURI(String uri) { + defaultFSURI = uri; +// hdfs.setDefaultFSURI(uri); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/LocalFileBlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/LocalFileBlockLoader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/LocalFileBlockLoader.java 2011-09-06 04:18:00 UTC (rev 3531) @@ -0,0 +1,47 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; + +/** + * Simple block loader which uses RandomAccessFiles to grab ranges of local + * files. + * @author brad + * + */ +public class LocalFileBlockLoader implements BlockLoader { + + public byte[] getBlock(String url, long offset, int length) + throws IOException { + File file = new File(url); + RandomAccessFile raf = new RandomAccessFile(file, "r"); + raf.seek(offset); + if(raf.getFilePointer() != offset) { + throw new IOException("Failed seek("+offset+") in ("+url+")"); + } + byte b[] = new byte[length]; + raf.readFully(b); + return b; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |