From: <bra...@us...> - 2010-05-18 22:44:29
|
Revision: 3103 http://archive-access.svn.sourceforge.net/archive-access/?rev=3103&view=rev Author: bradtofel Date: 2010-05-18 22:44:22 +0000 (Tue, 18 May 2010) Log Message: ----------- Experimental: changes to make a SearchResultSource that can be directly indexed by ordinal position. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,40 @@ +/* SequencedSearchResultSource + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.util.CloseableIterator; + +/** + * @author brad + * + */ +public interface SequencedSearchResultSource extends SearchResultSource { + public CloseableIterator<CaptureSearchResult> + getPrefixIterator(final String prefix, int startIdx) + throws ResourceIndexNotAvailableException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,63 @@ +/* SkippingStringPrefixIterator + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.util.Iterator; + +/** + * @author brad + * + */ +public class SkippingStringPrefixIterator extends StringPrefixIterator { + private long skipCount = 0; + private long totalMatches = -1; + + public SkippingStringPrefixIterator(Iterator<String> inner, String prefix, + long skipCount) { + super(inner,prefix); + this.skipCount = skipCount; + } + public SkippingStringPrefixIterator(Iterator<String> inner, String prefix) { + super(inner,prefix); + } + public long getTotalMatches() { + return totalMatches; + } + public void setTotalMatches(long totalMatches) { + this.totalMatches = totalMatches; + } + public boolean hasNext() { + while(skipCount > 0) { + if(super.hasNext()) { + next(); + skipCount--; + } else { + return false; + } + } + return super.hasNext(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2010-05-18 22:38:59 UTC (rev 3102) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -47,6 +47,9 @@ truncated = ((ZiplinesChunkIterator)inner).isTruncated(); } } + public long getTotalMatches() { + return 0 ; + } public boolean isTruncated() { return truncated; } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,141 @@ +/* ZiplineBlockMatches + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; + +/** + * @author brad + * + */ +public class ZiplineBlockMatches { + private ArrayList<ZiplinedBlock> blocks = null; + private String prefix = null; + private int cachedFirstCount = -1; + private int cachedLastCount = -1; + public ZiplineBlockMatches(ArrayList<ZiplinedBlock> blocks, String prefix) { + this.blocks = blocks; + this.prefix = prefix; + cachedFirstCount = -1; + cachedLastCount = -1; + } + + public StringPrefixIterator getIterator() { + ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks); + zci.setTruncated(false); + return new StringPrefixIterator(zci,prefix); + } + + public StringPrefixIterator getIteratorAt(long skip) throws IOException { + SkippingStringPrefixIterator itr = null; + ArrayList<ZiplinedBlock> matchingBlocked = + new ArrayList<ZiplinedBlock>(); + long total = getTotalMatching(); + if(skip > total) { + // TODO: should return empty itr... + return null; + } + long firstBlockMatches = + countMatchesInStartBlock(blocks.get(0), prefix); + if(skip < firstBlockMatches) { + ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks); + itr = new SkippingStringPrefixIterator(zci,prefix,skip); + itr.setTotalMatches(total); + return itr; + } + skip -= firstBlockMatches; + int size = blocks.size(); + for(int i = 1; i < size; i++) { + ZiplinedBlock block = blocks.get(i); + if(block.count > skip) { + // this is the block to start: + ZiplinesChunkIterator zci = + new ZiplinesChunkIterator(blocks.subList(i, size)); + itr = new SkippingStringPrefixIterator(zci,prefix,skip); + itr.setTotalMatches(total); + return itr; + } + skip -= block.count; + } + // should never get here... + return null; + } + + public long getTotalMatching() throws IOException { + if(blocks == null) { + return 0; + } + int size = blocks.size(); + if(size == 0) { + return 0; + } + long count = countMatchesInStartBlock(blocks.get(0),prefix); + if(size == 1) { + return count; + } + for(int i = 1; i < size-1; i++) { + count += blocks.get(i).count; + } + count += countMatchesInLastBlock(blocks.get(size-1), prefix); + return count; + } + private long countMatchesInStartBlock(ZiplinedBlock block, String prefix) + throws IOException { + if(cachedFirstCount == -1) { + BufferedReader r = block.readBlock(); + int matches = block.count; + while(true) { + String nextLine = r.readLine(); + if((nextLine == null) || nextLine.startsWith(prefix)) { + r.close(); + cachedFirstCount = matches; + break; + } + matches--; + } + } + return cachedFirstCount; + } + private long countMatchesInLastBlock(ZiplinedBlock block, String prefix) + throws IOException { + if(cachedLastCount == -1) { + BufferedReader r = block.readBlock(); + int matches = 0; + while(true) { + String nextLine = r.readLine(); + if((nextLine == null) || !nextLine.startsWith(prefix)) { + r.close(); + cachedLastCount = matches; + break; + } + matches++; + } + } + return cachedLastCount; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2010-05-18 22:38:59 UTC (rev 3102) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -44,6 +44,7 @@ String urlOrPath = null; long offset = -1; + int count = 0; public final static int BLOCK_SIZE = 128 * 1024; private final static String RANGE_HEADER = "Range"; private final static String BYTES_HEADER = "bytes="; @@ -53,8 +54,17 @@ * @param offset start of 128K block boundary. */ public ZiplinedBlock(String urlOrPath, long offset) { + this(urlOrPath,offset,0); + } + /** + * @param urlOrPath URL where this file can be downloaded + * @param offset start of 128K block boundary. + * @param count number of records in this block + */ + public ZiplinedBlock(String urlOrPath, long offset, int count) { this.urlOrPath = urlOrPath; this.offset = offset; + this.count = count; } /** * @return a BufferedReader of the underlying compressed data in this block Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,107 @@ +/* ZiplinedBlockIndex + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +/** + * @author brad + * + */ +public class ZiplinedBlockStringSequence { + private FlatFile chunkIndex = null; + private HashMap<String,String> chunkMap = null; + private int maxBlocks = 10000; + + public ZiplinedBlockStringSequence(FlatFile chunkIndex, + HashMap<String,String> chunkMap) { + this.chunkIndex = chunkIndex; + this.chunkMap = chunkMap; + } + + private ZiplineBlockMatches getBlockMatches(String prefix) + throws IOException, ResourceIndexNotAvailableException { + ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); + boolean first = true; + int numBlocks = 0; + boolean truncated = false; + CloseableIterator<String> itr = null; + try { + itr = chunkIndex.getRecordIteratorLT(prefix); + while(itr.hasNext()) { + if(numBlocks >= maxBlocks) { + truncated = true; + break; + } + String blockDescriptor = itr.next(); + numBlocks++; + String parts[] = blockDescriptor.split("\t"); + if(parts.length != 4) { + throw new ResourceIndexNotAvailableException("Bad line(" + + blockDescriptor + ")"); + } + // only compare the correct length: + String prefCmp = prefix; + String blockCmp = parts[0]; + if(first) { + // always add first: + first = false; + } else if(!blockCmp.startsWith(prefCmp)) { + // all done; + break; + } + // add this and keep lookin... + String url = chunkMap.get(parts[1]); + long offset = Long.parseLong(parts[2]); + int count = Integer.parseInt(parts[3]); + + blocks.add(new ZiplinedBlock(url, offset, count)); + } + } finally { + if(itr != null) { + itr.close(); + } + } + return new ZiplineBlockMatches(blocks,prefix); + } + + public StringPrefixIterator getIterator(String prefix, long skip) + throws ResourceIndexNotAvailableException, IOException { + ZiplineBlockMatches matches = getBlockMatches(prefix); + return matches.getIteratorAt(skip); + } + public StringPrefixIterator getIterator(String prefix) + throws ResourceIndexNotAvailableException, IOException { + ZiplineBlockMatches matches = getBlockMatches(prefix); + return matches.getIterator(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,85 @@ +/* ZiplinedBlockStringSequenceTest + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.util.HashMap; + +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class ZiplinedBlockStringSequenceTest extends TestCase { + private String indexPath = "/home/brad/os-cdx/CDX-201002-clean/ALL.count.summary"; + private String mapPath = "/home/brad/os-cdx/CDX-201002-clean/ALL.loc-workstation"; + + private ZiplinedBlockStringSequence getSequence() throws IOException { + HashMap<String, String> chunkMap = new HashMap<String, String>(); + FlatFile ff = new FlatFile(mapPath); + CloseableIterator<String> lines = ff.getSequentialIterator(); + while(lines.hasNext()) { + String line = lines.next(); + String[] parts = line.split("\\s"); + if(parts.length != 2) { + throw new IOException("Bad line(" + line +") in (" + + mapPath + ")"); + } + chunkMap.put(parts[0],parts[1]); + } + lines.close(); + FlatFile chunkIndex = new FlatFile(indexPath); + return new ZiplinedBlockStringSequence(chunkIndex, chunkMap); + } + /** + * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinedBlockStringSequence#getIterator(java.lang.String, long)}. + * @throws IOException + * @throws ResourceIndexNotAvailableException + */ + public void testGetIteratorStringLong() throws IOException, ResourceIndexNotAvailableException { + ZiplinedBlockStringSequence seq = getSequence(); + StringPrefixIterator itr = seq.getIterator("yahoo.com/", 1000000); + System.out.format("Total Matches %d\n",itr.getTotalMatches()); + for(int i = 0; i < 10; i++) { + if(itr.hasNext()) { + System.out.format("Line(%d): %s\n",i,itr.next()); + } + } + } + + /** + * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinedBlockStringSequence#getIterator(java.lang.String)}. + */ + public void testGetIteratorString() { +// fail("Not yet implemented"); + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-05-18 22:38:59 UTC (rev 3102) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -39,6 +39,7 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.resourceindex.SearchResultSource; +import org.archive.wayback.resourceindex.SequencedSearchResultSource; import org.archive.wayback.resourceindex.cdx.CDXFormatToSearchResultAdapter; import org.archive.wayback.resourceindex.cdx.format.CDXFormat; import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; @@ -132,10 +133,9 @@ throw new ResourceIndexNotAvailableException(e.getMessage()); } } - - public Iterator<String> getStringPrefixIterator(String prefix) - throws ResourceIndexNotAvailableException, IOException { + private ArrayList<ZiplinedBlock> getBlockListForPrefix(String prefix) + throws IOException, ResourceIndexNotAvailableException { ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); boolean first = true; int numBlocks = 0; @@ -175,8 +175,15 @@ itr.close(); } } + return blocks; + } + + public Iterator<String> getStringPrefixIterator(String prefix) + throws ResourceIndexNotAvailableException, IOException { + + ArrayList<ZiplinedBlock> blocks = getBlockListForPrefix(prefix); ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks); - zci.setTruncated(truncated); + zci.setTruncated(false); return new StringPrefixIterator(zci,prefix); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |