From: <bra...@us...> - 2009-12-09 06:50:16
|
Revision: 2938 http://archive-access.svn.sourceforge.net/archive-access/?rev=2938&view=rev Author: bradtofel Date: 2009-12-09 06:50:07 +0000 (Wed, 09 Dec 2009) Log Message: ----------- INITIAL REV: SearchResultSource composed of a series of alphabetically partitioned ziplined CDX files. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,90 @@ +/* StringPrefixIterator + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. 
+ * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.util.Iterator; + +import org.archive.wayback.util.CloseableIterator; + +/** + * @author brad + * + */ +public class StringPrefixIterator implements CloseableIterator<String> { + private String prefix = null; + Iterator<String> inner = null; + private String cachedNext = null; + private boolean done = false; + public StringPrefixIterator(Iterator<String> inner, String prefix) { + this.prefix = prefix; + this.inner = inner; + } + /* (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + public boolean hasNext() { + if(done) return false; + if(cachedNext != null) { + return true; + } + while(inner.hasNext()) { + String tmp = inner.next(); + if(tmp.startsWith(prefix)) { + cachedNext = tmp; + return true; + } else if(tmp.compareTo(prefix) > 0) { + done = true; + return false; + } + } + return false; + } + /* (non-Javadoc) + * @see java.util.Iterator#next() + */ + public String next() { + String tmp = cachedNext; + cachedNext = null; + return tmp; + } + /* (non-Javadoc) + * @see java.util.Iterator#remove() + */ + public void remove() { + // TODO Auto-generated method stub + + } + /* (non-Javadoc) + * @see java.io.Closeable#close() + */ + public void close() throws IOException { + 
if(inner instanceof CloseableIterator) { + CloseableIterator<String> toBeClosed = (CloseableIterator<String>) inner; + toBeClosed.close(); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,68 @@ +/* ZiplinedBlock + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
/**
 * Locator for a single fixed-size GZip member ("block") inside a remote
 * ziplines file: the URL of the file plus the byte offset at which the block
 * starts.
 *
 * @author brad
 */
public class ZiplinedBlock {
    String urlOrPath = null;
    long offset = -1;
    /** Size in bytes of every block, and of the byte range requested. */
    public final static int BLOCK_SIZE = 128 * 1024;
    private final static String RANGE_HEADER = "Range";
    private final static String BYTES_HEADER = "bytes=";
    private final static String BYTES_MINUS = "-";

    /**
     * @param urlOrPath URL where this file can be downloaded. Despite the
     *                  name, the value is handed to {@link URL#URL(String)},
     *                  so a bare filesystem path is rejected by readBlock().
     * @param offset start of 128K block boundary.
     */
    public ZiplinedBlock(String urlOrPath, long offset) {
        this.urlOrPath = urlOrPath;
        this.offset = offset;
    }

    /**
     * Opens the URL, requesting only the bytes
     * [offset, offset + BLOCK_SIZE - 1] via a Range request property, and
     * wraps the response in a GZip decoder.
     *
     * @return a BufferedReader over the decompressed text of this block,
     *         decoded with the platform default charset. The caller is
     *         responsible for closing it.
     * @throws IOException if the URL is malformed, the connection fails, or
     *                     the response does not start with a GZip header
     */
    public BufferedReader readBlock() throws IOException {
        URL u = new URL(urlOrPath);
        URLConnection uc = u.openConnection();
        StringBuilder sb = new StringBuilder();
        sb.append(BYTES_HEADER).append(offset).append(BYTES_MINUS);
        sb.append((offset + BLOCK_SIZE) - 1);
        uc.setRequestProperty(RANGE_HEADER, sb.toString());

        java.io.InputStream raw = uc.getInputStream();
        boolean wrapped = false;
        try {
            // GZIPInputStream reads the member header in its constructor and
            // throws if it is invalid; without the finally below the
            // connection stream would be leaked in that case.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(raw)));
            wrapped = true;
            return reader;
        } finally {
            if(!wrapped) {
                try {
                    raw.close();
                } catch (IOException ignored) {
                    // the original failure is the one worth reporting
                }
            }
        }
    }
}
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,151 @@ +/* ZiplinesChunkIterator + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.RandomAccessFile; +import java.util.Iterator; +import java.util.List; +import java.util.RandomAccess; +import java.util.zip.GZIPInputStream; + +import org.archive.wayback.util.CloseableIterator; + +/** + * @author brad + * + */ +public class ZiplinesChunkIterator implements CloseableIterator<String> { + private BufferedReader br = null; + private Iterator<ZiplinedBlock> blockItr = null; + private String cachedNext = null; + /** + * @param blocks which should be fetched and unzipped, one after another + */ + public ZiplinesChunkIterator(List<ZiplinedBlock> blocks) { + blockItr = blocks.iterator(); + } + /* (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + public boolean hasNext() { + if(cachedNext != null) { + return true; + } + while(cachedNext == null) { + if(br != null) { + // attempt to read the next line from this: + try { + cachedNext = br.readLine(); + if(cachedNext == null) { + br = null; + // next loop: + } else { + return true; + } + } catch (IOException e) { + e.printStackTrace(); + br = null; + } + } else { + // do we have more blocks to use? 
+ if(blockItr.hasNext()) { + try { + br = blockItr.next().readBlock(); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + return false; + } + } + } + + return false; + } + + /* (non-Javadoc) + * @see java.util.Iterator#next() + */ + public String next() { + String tmp = cachedNext; + cachedNext = null; + return tmp; + } + + /* (non-Javadoc) + * @see java.util.Iterator#remove() + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + /* (non-Javadoc) + * @see java.io.Closeable#close() + */ + public void close() throws IOException { + if(br != null) { + br.close(); + } + } + public static void main(String[] args) { + if(args.length != 1) { + System.err.println("Usage: ZIPLINES_PATH"); + System.exit(1); + } + File f = new File(args[0]); + long size = f.length(); + long numBlocks = (long) (size / ZiplinedBlock.BLOCK_SIZE); + long size2 = numBlocks * ZiplinedBlock.BLOCK_SIZE; + if(size != size2) { + System.err.println("File size of " + args[0] + " is not a mulitple" + + " of " + ZiplinedBlock.BLOCK_SIZE); + } + try { + RandomAccessFile raf = new RandomAccessFile(f, "r"); + for(int i = 0; i < numBlocks; i++) { + long offset = i * ZiplinedBlock.BLOCK_SIZE; + raf.seek(offset); + BufferedReader br = new BufferedReader(new InputStreamReader( + new GZIPInputStream(new FileInputStream(raf.getFD())))); + String line = br.readLine(); + if(line == null) { + System.err.println("Bad block at " + offset + " in " + args[0]); + System.exit(1); + } + System.out.println(args[0] + " " + offset + " " + line); + } + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,218 @@ +/* ZiplinesSearchResultSource + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import it.unimi.dsi.mg4j.util.FrontCodedStringList; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.resourceindex.SearchResultSource; +import org.archive.wayback.resourceindex.cdx.CDXFormatToSearchResultAdapter; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +/** + * A set of Ziplines files, which are CDX files specially compressed into a + * series of GZipMembers such that: + * + * 1) each member is exactly 128K, padded using a GZip comment header + * 2) each member contains complete lines: no line spans two GZip members + * + * If the data put into these files is sorted, then the data within the files + * can be uncompressed when needed, minimizing the total data to be uncompressed + * + * This SearchResultSource assumes a set of alphabetically partitioned Ziplined + * CDX files, so that each file is sorted, and no regions overlap. + * + * This class takes 2 files as input: + * 1) a specially constructed map of the first N bytes of data from each GZip + * member, and the filename and offset of that GZip member. 
+ * 2) a mapping of filenames to URLs + * + * Data from #1 is actually stored in a serialized + * + * + * + * @author brad + * + */ +public class ZiplinesSearchResultSource implements SearchResultSource { + + /** + * Local path containing map of URL,TIMESTAMP,CHUNK,OFFSET for each 128K chunk + */ + private String chunkIndexPath = null; + private FlatFile chunkIndex = null; + /** + * Local path containing URL for each CHUNK + */ + private String chunkMapPath = null; + private HashMap<String,String> chunkMap = null; + private CDXFormat format = null; + + public ZiplinesSearchResultSource() { + } + public ZiplinesSearchResultSource(CDXFormat format) { + this.format = format; + } + public void init() throws IOException { + chunkMap = new HashMap<String, String>(); + FlatFile ff = new FlatFile(chunkMapPath); + Iterator<String> lines = ff.getSequentialIterator(); + while(lines.hasNext()) { + String line = lines.next(); + String[] parts = line.split("\\s"); + if(parts.length != 2) { + throw new IOException("Bad line(" + line +") in (" + + chunkMapPath + ")"); + } + chunkMap.put(parts[0],parts[1]); + } + chunkIndex = new FlatFile(chunkIndexPath); + } + protected CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr) + throws IOException { + return new AdaptedIterator<String,CaptureSearchResult>(itr, + new CDXFormatToSearchResultAdapter(format)); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultSource#cleanup(org.archive.wayback.util.CloseableIterator) + */ + public void cleanup(CloseableIterator<CaptureSearchResult> c) + throws IOException { + c.close(); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultSource#getPrefixIterator(java.lang.String) + */ + public CloseableIterator<CaptureSearchResult> getPrefixIterator( + String prefix) throws ResourceIndexNotAvailableException { + try { + return adaptIterator(getStringPrefixIterator(prefix)); + } catch (IOException e) { + throw new 
ResourceIndexNotAvailableException(e.getMessage()); + } + } + + public Iterator<String> getStringPrefixIterator(String prefix) throws ResourceIndexNotAvailableException, IOException { + Iterator<String> itr = chunkIndex.getRecordIteratorLT(prefix); + ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); + boolean first = true; + while(itr.hasNext()) { + String blockDescriptor = itr.next(); + String parts[] = blockDescriptor.split("\t"); + if(parts.length != 3) { + throw new ResourceIndexNotAvailableException("Bad line(" + + blockDescriptor + ")"); + } + // only compare the correct length: + String prefCmp = prefix; + String blockCmp = parts[0]; +// if(prefCmp.length() < blockCmp.length()) { +// blockCmp = blockCmp.substring(0,prefCmp.length()); +// } else { +// prefCmp = prefCmp.substring(0,blockCmp.length()); +// } + if(first) { + // always add first: + first = false; +// } else if(blockCmp.compareTo(prefCmp) > 0) { + } else if(!blockCmp.startsWith(prefCmp)) { + // all done; + break; + } + // add this and keep lookin... + String url = chunkMap.get(parts[1]); + long offset = Long.parseLong(parts[2]); + blocks.add(new ZiplinedBlock(url, offset)); + } + return new StringPrefixIterator(new ZiplinesChunkIterator(blocks),prefix); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultSource#getPrefixReverseIterator(java.lang.String) + */ + public CloseableIterator<CaptureSearchResult> getPrefixReverseIterator( + String prefix) throws ResourceIndexNotAvailableException { + throw new ResourceIndexNotAvailableException("unsupported op"); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultSource#shutdown() + */ + public void shutdown() throws IOException { + // no-op.. 
+ } + /** + * @return the format + */ + public CDXFormat getFormat() { + return format; + } + /** + * @param format the format to set + */ + public void setFormat(CDXFormat format) { + this.format = format; + } + /** + * @return the chunkIndexPath + */ + public String getChunkIndexPath() { + return chunkIndexPath; + } + /** + * @param chunkIndexPath the chunkIndexPath to set + */ + public void setChunkIndexPath(String chunkIndexPath) { + this.chunkIndexPath = chunkIndexPath; + } + /** + * @return the chunkMapPath + */ + public String getChunkMapPath() { + return chunkMapPath; + } + /** + * @param chunkMapPath the chunkMapPath to set + */ + public void setChunkMapPath(String chunkMapPath) { + this.chunkMapPath = chunkMapPath; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,64 @@ +/* ZiplinesSearchResultSourceTest + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.util.Iterator; + +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class ZiplinesSearchResultSourceTest extends TestCase { + + /** + * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinesSearchResultSource#getPrefixIterator(java.lang.String)}. + * @throws CDXFormatException + */ + public void testGetPrefixIterator() throws Exception { + CDXFormat format = new CDXFormat(" CDX N b a m s k r M V g"); + ZiplinesSearchResultSource zsrs = new ZiplinesSearchResultSource(format); +// zsrs.setChunkIndexPath("/home/brad/zipline-test/part-00005-frag.cdx.zlm"); +// zsrs.setChunkMapPath("/home/brad/zipline-test/manifest.txt"); + zsrs.setChunkIndexPath("/home/brad/ALL.summary"); + zsrs.setChunkMapPath("/home/brad/ALL.loc"); + zsrs.init(); + Iterator<String> i = zsrs.getStringPrefixIterator("krunch.com/ "); + int max = 100; + int done = 0; + while(i.hasNext()) { + System.out.println(i.next()); + if(done++ > max) { + break; + } + } + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development 
platform, the world's largest Open Source development site. |