[Archive-access-cvs] SF.net SVN: archive-access:[3010] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3010
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3010&view=rev
Author:   bradtofel
Date:     2010-04-02 03:06:28 +0000 (Fri, 02 Apr 2010)

Log Message:
-----------
Many unreported bugfixes; a slight interface change to allow grabbing an iterator of Strings (lines); added a main() method; and added an isTruncated() method to the iterators. The isTruncated() method is not yet exposed enough to be useful, but it could allow an external user to determine whether the search was cut off because too many blocks had to be searched.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java
===================================================================

--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java	2010-04-02 02:53:44 UTC (rev 3009)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java	2010-04-02 03:06:28 UTC (rev 3010)
@@ -39,10 +39,17 @@
 	Iterator<String> inner = null;
 	private String cachedNext = null;
 	private boolean done = false;
+	private boolean truncated = false;
 	public StringPrefixIterator(Iterator<String> inner, String prefix) {
 		this.prefix = prefix;
 		this.inner = inner;
+		if(inner instanceof ZiplinesChunkIterator) {
+			truncated = ((ZiplinesChunkIterator)inner).isTruncated();
+		}
 	}
+	public boolean isTruncated() {
+		return truncated;
+	}
 	/* (non-Javadoc)
 	 * @see java.util.Iterator#hasNext()
 	 */

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java	2010-04-02 02:53:44 UTC (rev 3009)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java	2010-04-02 03:06:28 UTC (rev 3010)
@@ -32,11 +32,16 @@
 import java.net.URLConnection;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.log4j.Logger;
+
 /**
  * @author brad
  *
  */
 public class ZiplinedBlock {
+	private static final Logger LOGGER = Logger.getLogger(
+			ZiplinedBlock.class.getName());
+
 	String urlOrPath = null;
 	long offset = -1;
 	public final static int BLOCK_SIZE = 128 * 1024;
@@ -56,11 +61,13 @@
 	 * @throws IOException for usual reasons
 	 */
 	public BufferedReader readBlock() throws IOException {
-		URL u = new URL(urlOrPath);
-		URLConnection uc = u.openConnection();
 		StringBuilder sb = new StringBuilder(16);
 		sb.append(BYTES_HEADER).append(offset).append(BYTES_MINUS);
 		sb.append((offset + BLOCK_SIZE)-1);
+		LOGGER.trace("Reading block:" + urlOrPath + "("+sb.toString()+")");
+		// TODO: timeouts
+		URL u = new URL(urlOrPath);
+		URLConnection uc = u.openConnection();
 		uc.setRequestProperty(RANGE_HEADER, sb.toString());
 		return new BufferedReader(new InputStreamReader(
 				new GZIPInputStream(uc.getInputStream())));

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java	2010-04-02 02:53:44 UTC (rev 3009)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java	2010-04-02 03:06:28 UTC (rev 3010)
@@ -37,20 +37,27 @@
 import java.util.RandomAccess;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.log4j.Logger;
 import org.archive.wayback.util.CloseableIterator;
+import org.archive.wayback.webapp.AccessPoint;
 
 /**
  * @author brad
  *
  */
 public class ZiplinesChunkIterator implements CloseableIterator<String> {
+	private static final Logger LOGGER = Logger.getLogger(
+			ZiplinesChunkIterator.class.getName());
+
 	private BufferedReader br = null;
 	private Iterator<ZiplinedBlock> blockItr = null;
 	private String cachedNext = null;
+	private boolean truncated = false;
 	/**
 	 * @param blocks which should be fetched and unzipped, one after another
 	 */
 	public ZiplinesChunkIterator(List<ZiplinedBlock> blocks) {
+		LOGGER.info("initialized with " + blocks.size() + " blocks");
 		blockItr = blocks.iterator();
 	}
 	/* (non-Javadoc)
@@ -148,4 +155,16 @@
 			System.exit(1);
 		}
 	}
+	/**
+	 * @return the truncated
+	 */
+	public boolean isTruncated() {
+		return truncated;
+	}
+	/**
+	 * @param truncated the truncated to set
+	 */
+	public void setTruncated(boolean truncated) {
+		this.truncated = truncated;
+	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java	2010-04-02 02:53:44 UTC (rev 3009)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java	2010-04-02 03:06:28 UTC (rev 3010)
@@ -28,8 +28,10 @@
 import it.unimi.dsi.mg4j.util.FrontCodedStringList;
 
 import java.io.BufferedReader;
+import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.IOException;
+import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -82,6 +84,7 @@
 	private String chunkMapPath = null;
 	private HashMap<String,String> chunkMap = null;
 	private CDXFormat format = null;
+	private int maxBlocks = 1000;
 	
 	public ZiplinesSearchResultSource() {
 	}
@@ -130,40 +133,51 @@
 		}
 	}
 	
-	public Iterator<String> getStringPrefixIterator(String prefix) throws ResourceIndexNotAvailableException, IOException {
-		CloseableIterator<String> itr = chunkIndex.getRecordIteratorLT(prefix);
+	public Iterator<String> getStringPrefixIterator(String prefix) 
+		throws ResourceIndexNotAvailableException, IOException {
+
 		ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>();
 		boolean first = true;
-		while(itr.hasNext()) {
-			String blockDescriptor = itr.next();
-			String parts[] = blockDescriptor.split("\t");
-			if(parts.length != 3) {
-				throw new ResourceIndexNotAvailableException("Bad line(" + 
-						blockDescriptor + ")");
+		int numBlocks = 0;
+		boolean truncated = false;
+		CloseableIterator<String> itr = null;
+		try {
+			itr = chunkIndex.getRecordIteratorLT(prefix);
+			while(itr.hasNext()) {
+				if(numBlocks >= maxBlocks) {
+					truncated = true;
+					break;
+				}
+				String blockDescriptor = itr.next();
+				numBlocks++;
+				String parts[] = blockDescriptor.split("\t");
+				if(parts.length != 3) {
+					throw new ResourceIndexNotAvailableException("Bad line(" + 
+							blockDescriptor + ")");
+				}
+				// only compare the correct length:
+				String prefCmp = prefix;
+				String blockCmp = parts[0];
+				if(first) {
+					// always add first:
+					first = false;
+				} else if(!blockCmp.startsWith(prefCmp)) {
+					// all done;
+					break;
+				}
+				// add this and keep lookin...
+				String url = chunkMap.get(parts[1]);
+				long offset = Long.parseLong(parts[2]);
+				blocks.add(new ZiplinedBlock(url, offset));
 			}
-			// only compare the correct length:
-			String prefCmp = prefix;
-			String blockCmp = parts[0];
-//			if(prefCmp.length() < blockCmp.length()) {
-//				blockCmp = blockCmp.substring(0,prefCmp.length());
-//			} else {
-//				prefCmp = prefCmp.substring(0,blockCmp.length());
-//			}
-			if(first) {
-				// always add first:
-				first = false;
-//			} else if(blockCmp.compareTo(prefCmp) > 0) {
-			} else if(!blockCmp.startsWith(prefCmp)) {
-				// all done;
-				break;
+		} finally {
+			if(itr != null) {
+				itr.close();
 			}
-			// add this and keep lookin...
-			String url = chunkMap.get(parts[1]);
-			long offset = Long.parseLong(parts[2]);
-			blocks.add(new ZiplinedBlock(url, offset));
 		}
-		itr.close();
-		return new StringPrefixIterator(new ZiplinesChunkIterator(blocks),prefix);
+		ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks);
+		zci.setTruncated(truncated);
+		return new StringPrefixIterator(zci,prefix);
 	}
 
 	/* (non-Javadoc)
@@ -216,5 +230,103 @@
 	public void setChunkMapPath(String chunkMapPath) {
 		this.chunkMapPath = chunkMapPath;
 	}
+	/**
+	 * @return the maxBlocks
+	 */
+	public int getMaxBlocks() {
+		return maxBlocks;
+	}
+	/**
+	 * @param maxBlocks the maxBlocks to set
+	 */
+	public void setMaxBlocks(int maxBlocks) {
+		this.maxBlocks = maxBlocks;
+	}		
 
+	private static void USAGE() {
+		System.err.println("USAGE:");
+		System.err.println("");
+		System.err.println("zl-bin-search [-format FORMAT] [-max MAX_BLOCKS] SUMMARY LOCATION KEY");
+		System.err.println("");
+		System.err.println("Search a ziplined compressed CDX format index for key");
+		System.err.println("KEY to STDOUT. SUMMARY and LOCATION are paths to the");
+		System.err.println("block summary and file location files.");
+		System.err.println("With -format, output CDX in format FORMAT.");
+		System.err.println("With -max, limit search at most MAX_BLOCKS blocks.");
+		System.exit(1);
+	}
+	
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+//		String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC;
+		String cdxSpec = " CDX N b a m s k r V g";
+		CDXFormat format = null;
+		try {
+			format = new CDXFormat(cdxSpec);
+		} catch (CDXFormatException e1) {
+			e1.printStackTrace();
+			System.exit(1);
+		}
+		ZiplinesSearchResultSource zl = new ZiplinesSearchResultSource(format);
+		PrintWriter pw = new PrintWriter(System.out);
+		int idx;
+		for(idx = 0; idx < args.length; idx++) {
+			if(args[idx].equals("-format")) {
+				idx++;
+				if(idx >= args.length) {
+					USAGE();
+				}
+				try {
+					zl.setFormat(new CDXFormat(args[idx]));
+				} catch (CDXFormatException e1) {
+					e1.printStackTrace();
+					System.exit(1);
+				}
+			} else if(args[idx].equals("-max")) {
+				idx++;
+				if(idx >= args.length) {
+					USAGE();
+				}
+				try {
+					zl.setMaxBlocks(Integer.parseInt(args[idx]));
+				} catch(NumberFormatException e) {
+					USAGE();
+					System.exit(1);
+				}
+
+			} else {
+				break;
+			}
+		}
+		if(args.length < idx + 3) {
+			USAGE();
+		}
+		// first is summary path, then location path, then search key:
+		zl.setChunkIndexPath(args[idx++]);
+		zl.setChunkMapPath(args[idx++]);
+		String key = args[idx++];
+		
+		try {
+			zl.init();
+			Iterator<String> itr = zl.getStringPrefixIterator(key);
+			boolean truncated = ((StringPrefixIterator)itr).isTruncated();
+			while(itr.hasNext()) {
+				pw.println(itr.next());
+			}
+			pw.close();
+			if(truncated) {
+				System.err.println("Note that results are truncated...");
+			}
+		} catch (ResourceIndexNotAvailableException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			System.exit(1);
+		} catch (IOException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			System.exit(1);
+		}
+	}
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access:[3010] trunk/archive-access/projects/wayback/ wayb

[Archive-access-cvs] SF.net SVN: archive-access:[3010] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines