Revision: 2985 http://archive-access.svn.sourceforge.net/archive-access/?rev=2985&view=rev Author: bradtofel Date: 2010-03-20 00:59:42 +0000 (Sat, 20 Mar 2010) Log Message: ----------- BUGFIX: now closes iterators so open filehandles don't stack up. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-03-20 00:58:14 UTC (rev 2984) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-03-20 00:59:42 UTC (rev 2985) @@ -91,7 +91,7 @@ public void init() throws IOException { chunkMap = new HashMap<String, String>(); FlatFile ff = new FlatFile(chunkMapPath); - Iterator<String> lines = ff.getSequentialIterator(); + CloseableIterator<String> lines = ff.getSequentialIterator(); while(lines.hasNext()) { String line = lines.next(); String[] parts = line.split("\\s"); @@ -101,6 +101,7 @@ } chunkMap.put(parts[0],parts[1]); } + lines.close(); chunkIndex = new FlatFile(chunkIndexPath); } protected CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr) @@ -130,7 +131,7 @@ } public Iterator<String> getStringPrefixIterator(String prefix) throws ResourceIndexNotAvailableException, IOException { - Iterator<String> itr = chunkIndex.getRecordIteratorLT(prefix); + CloseableIterator<String> itr = chunkIndex.getRecordIteratorLT(prefix); ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); boolean first = true; while(itr.hasNext()) { @@ -161,6 +162,7 @@ long offset = 
Long.parseLong(parts[2]); blocks.add(new ZiplinedBlock(url, offset)); } + itr.close(); return new StringPrefixIterator(new ZiplinesChunkIterator(blocks),prefix); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3183 http://archive-access.svn.sourceforge.net/archive-access/?rev=3183&view=rev Author: bradtofel Date: 2010-07-20 23:56:53 +0000 (Tue, 20 Jul 2010) Log Message: ----------- LOGGING Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-07-20 23:54:26 UTC (rev 3182) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-07-20 23:56:53 UTC (rev 3183) @@ -25,21 +25,16 @@ package org.archive.wayback.resourceindex.ziplines; -import it.unimi.dsi.mg4j.util.FrontCodedStringList; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import org.apache.log4j.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.resourceindex.SearchResultSource; -import org.archive.wayback.resourceindex.SequencedSearchResultSource; import org.archive.wayback.resourceindex.cdx.CDXFormatToSearchResultAdapter; import org.archive.wayback.resourceindex.cdx.format.CDXFormat; import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; @@ -73,6 +68,8 @@ * */ public class ZiplinesSearchResultSource implements SearchResultSource { + private static final Logger LOGGER = Logger.getLogger( + 
ZiplinesSearchResultSource.class.getName()); /** * Local path containing map of URL,TIMESTAMP,CHUNK,OFFSET for each 128K chunk @@ -100,6 +97,8 @@ String line = lines.next(); String[] parts = line.split("\\s"); if(parts.length != 2) { + LOGGER.error("Bad line(" + line +") in (" + + chunkMapPath + ")"); throw new IOException("Bad line(" + line +") in (" + chunkMapPath + ")"); } @@ -152,6 +151,8 @@ numBlocks++; String parts[] = blockDescriptor.split("\t"); if(parts.length != 3) { + LOGGER.error("Bad line(" + blockDescriptor +") in (" + + chunkMapPath + ")"); throw new ResourceIndexNotAvailableException("Bad line(" + blockDescriptor + ")"); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3426 http://archive-access.svn.sourceforge.net/archive-access/?rev=3426&view=rev Author: bradtofel Date: 2011-03-09 05:50:11 +0000 (Wed, 09 Mar 2011) Log Message: ----------- LOGGING: added extra log message about adding a block to be searched, added -debug command line option to force-enable logging in a few related classes Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-03-09 05:48:30 UTC (rev 3425) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-03-09 05:50:11 UTC (rev 3426) @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import java.util.logging.Level; import java.util.logging.Logger; import org.archive.wayback.core.CaptureSearchResult; @@ -163,6 +164,7 @@ // add this and keep lookin... 
String url = chunkMap.get(parts[1]); long offset = Long.parseLong(parts[2]); + LOGGER.info("Adding block source(" + parts[1] + "):" + offset); blocks.add(new ZiplinedBlock(url, offset)); } } finally { @@ -298,6 +300,14 @@ System.exit(1); } + } else if(args[idx].equals("-debug")) { + Logger.getLogger( + ZiplinesSearchResultSource.class.getName()).setLevel(Level.ALL); + Logger.getLogger( + ZiplinesChunkIterator.class.getName()).setLevel(Level.ALL); + Logger.getLogger( + ZiplinedBlock.class.getName()).setLevel(Level.ALL); + } else { break; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3446 http://archive-access.svn.sourceforge.net/archive-access/?rev=3446&view=rev Author: bradtofel Date: 2011-05-25 01:21:30 +0000 (Wed, 25 May 2011) Log Message: ----------- FEATURE: added "-blockDump" argument, which only produces a list of matching block-offset tuples to STDOUT Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-05-25 01:20:09 UTC (rev 3445) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-05-25 01:21:30 UTC (rev 3446) @@ -276,6 +276,7 @@ ZiplinesSearchResultSource zl = new ZiplinesSearchResultSource(format); PrintWriter pw = new PrintWriter(System.out); int idx; + boolean blockDump = false; for(idx = 0; idx < args.length; idx++) { if(args[idx].equals("-format")) { idx++; @@ -288,6 +289,8 @@ e1.printStackTrace(); System.exit(1); } + } else if(args[idx].equals("-blockDump")) { + blockDump = true; } else if(args[idx].equals("-max")) { idx++; if(idx >= args.length) { @@ -322,15 +325,25 @@ try { zl.init(); - Iterator<String> itr = zl.getStringPrefixIterator(key); - boolean truncated = ((StringPrefixIterator)itr).isTruncated(); - while(itr.hasNext()) { - pw.println(itr.next()); + if(blockDump) { + + ArrayList<ZiplinedBlock> blocks = zl.getBlockListForPrefix(key); + for(ZiplinedBlock block : blocks) { + pw.format("%s\t%s\n", block.urlOrPath, block.offset); + } + pw.close(); + + } else { + Iterator<String> itr = zl.getStringPrefixIterator(key); + boolean truncated = 
((StringPrefixIterator)itr).isTruncated(); + while(itr.hasNext()) { + pw.println(itr.next()); + } + pw.close(); + if(truncated) { + System.err.println("Note that results are truncated..."); + } } - pw.close(); - if(truncated) { - System.err.println("Note that results are truncated..."); - } } catch (ResourceIndexNotAvailableException e) { // TODO Auto-generated catch block e.printStackTrace(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3530 http://archive-access.svn.sourceforge.net/archive-access/?rev=3530&view=rev Author: bradtofel Date: 2011-09-06 04:15:02 +0000 (Tue, 06 Sep 2011) Log Message: ----------- FEATURE: - now allows -flexFormat in command line usage - allows configuration of multiple locations of each block source file - improved logging of errors Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-09-06 04:12:58 UTC (rev 3529) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-09-06 04:15:02 UTC (rev 3530) @@ -32,6 +32,7 @@ import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.resourceindex.SearchResultSource; import org.archive.wayback.resourceindex.cdx.CDXFormatToSearchResultAdapter; +import org.archive.wayback.resourceindex.cdx.format.CDXFlexFormat; import org.archive.wayback.resourceindex.cdx.format.CDXFormat; import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; import org.archive.wayback.util.AdaptedIterator; @@ -76,7 +77,7 @@ * Local path containing URL for each CHUNK */ private String chunkMapPath = null; - private HashMap<String,String> chunkMap = null; + private HashMap<String,BlockLocation> chunkMap = null; private CDXFormat format = null; private int maxBlocks = 1000; private BlockLoader blockLoader = null; @@ -87,19 +88,25 @@ this.format = format; } public void init() throws IOException { - chunkMap = new HashMap<String, 
String>(); + chunkMap = new HashMap<String, BlockLocation>(); FlatFile ff = new FlatFile(chunkMapPath); CloseableIterator<String> lines = ff.getSequentialIterator(); while(lines.hasNext()) { String line = lines.next(); String[] parts = line.split("\\s"); - if(parts.length != 2) { + if(parts.length < 2) { LOGGER.severe("Bad line(" + line +") in (" + chunkMapPath + ")"); throw new IOException("Bad line(" + line +") in (" + chunkMapPath + ")"); } - chunkMap.put(parts[0],parts[1]); + + String locations[] = new String[parts.length - 1]; + for(int i = 1; i < parts.length; i++) { + locations[i-1] = parts[i]; + } + BlockLocation bl = new BlockLocation(parts[0], locations); + chunkMap.put(parts[0],bl); } lines.close(); chunkIndex = new FlatFile(chunkIndexPath); @@ -141,13 +148,14 @@ itr = chunkIndex.getRecordIteratorLT(prefix); while(itr.hasNext()) { if(numBlocks >= maxBlocks) { + LOGGER.warning("Truncated by blocks for " + prefix); truncated = true; break; } String blockDescriptor = itr.next(); numBlocks++; String parts[] = blockDescriptor.split("\t"); - if(parts.length != 3) { + if((parts.length < 3) || (parts.length > 4)) { LOGGER.severe("Bad line(" + blockDescriptor +") in (" + chunkMapPath + ")"); throw new ResourceIndexNotAvailableException("Bad line(" + @@ -164,10 +172,22 @@ break; } // add this and keep lookin... 
- String url = chunkMap.get(parts[1]); + BlockLocation bl = chunkMap.get(parts[1]); + if(bl == null) { + LOGGER.severe("No locations for block(" + parts[1] +")"); + throw new ResourceIndexNotAvailableException( + "No locations for block(" + parts[1] + ")"); + } long offset = Long.parseLong(parts[2]); - LOGGER.info("Adding block source(" + parts[1] + "):" + offset); - ZiplinedBlock block = new ZiplinedBlock(url, offset); + ZiplinedBlock block; + if(parts.length == 3) { + LOGGER.info("Adding block source(" + parts[1] + "):" + offset); + block = new ZiplinedBlock(bl.getLocations(), offset); + } else { + int length = Integer.parseInt(parts[3]); + LOGGER.info("Adding block source(" + parts[1] + "):" + offset + " - " + length); + block = new ZiplinedBlock(bl.getLocations(), offset, length); + } block.setLoader(blockLoader); blocks.add(block); } @@ -308,6 +328,13 @@ e1.printStackTrace(); System.exit(1); } + } else if(args[idx].equals("-flexFormat")) { + try { + zl.setFormat(new CDXFlexFormat(" CDX A")); + } catch (CDXFormatException e1) { + e1.printStackTrace(); + System.exit(1); + } } else if(args[idx].equals("-blockDump")) { blockDump = true; } else if(args[idx].equals("-hdfs")) { @@ -366,7 +393,7 @@ ArrayList<ZiplinedBlock> blocks = zl.getBlockListForPrefix(key); for(ZiplinedBlock block : blocks) { - pw.format("%s\t%s\n", block.urlOrPath, block.offset); + pw.format("%s\t%s\n", block.urlOrPaths[0], block.offset); } pw.close(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |