From: <bra...@us...> - 2007-03-01 23:20:16
|
Revision: 1536 http://archive-access.svn.sourceforge.net/archive-access/?rev=1536&view=rev Author: bradtofel Date: 2007-03-01 15:20:15 -0800 (Thu, 01 Mar 2007) Log Message: ----------- INTERFACE UPDATE: using new archive-commons jar that supports WARC format, and refactored WritePool. Modified Paths: -------------- trunk/archive-access/projects/wayback/project.properties trunk/archive-access/projects/wayback/project.xml trunk/archive-access/projects/wayback/src/java/org/archive/wayback/accesscontrol/RoboCache.java trunk/archive-access/projects/wayback/src/java/org/archive/wayback/accesscontrol/UrlCacher.java trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourceindex/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java trunk/archive-access/projects/wayback/src/java/org/archive/wayback/util/ARCCreator.java Added Paths: ----------- trunk/archive-access/projects/wayback/lib/archive-commons-1.11.0-200703012033.jar Added: trunk/archive-access/projects/wayback/lib/archive-commons-1.11.0-200703012033.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/wayback/lib/archive-commons-1.11.0-200703012033.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Modified: trunk/archive-access/projects/wayback/project.properties =================================================================== --- trunk/archive-access/projects/wayback/project.properties 2007-03-01 23:15:22 UTC (rev 1535) +++ trunk/archive-access/projects/wayback/project.properties 2007-03-01 23:20:15 UTC (rev 1536) @@ -26,7 +26,7 @@ maven.jar.commons-httpclient-local = ${basedir}/lib/commons-httpclient-3.0-rc3.jar maven.jar.commons-logging-local = 
${basedir}/lib/commons-logging-1.0.4.jar maven.jar.je = ${basedir}/lib/je-2.0.83.jar -maven.jar.archive-commons = ${basedir}/lib/archive-commons-1.9.0-200607171747.jar +maven.jar.archive-commons = ${basedir}/lib/archive-commons-1.11.0-200703012033.jar maven.jar.libidn = ${basedir}/lib/libidn-0.5.9.jar maven.jar.commons-codec = ${basedir}/lib/commons-codec-1.3.jar maven.jar.commons-pool = ${basedir}/lib/commons-pool-1.3-kb.jar Modified: trunk/archive-access/projects/wayback/project.xml =================================================================== --- trunk/archive-access/projects/wayback/project.xml 2007-03-01 23:15:22 UTC (rev 1535) +++ trunk/archive-access/projects/wayback/project.xml 2007-03-01 23:20:15 UTC (rev 1536) @@ -186,7 +186,7 @@ <dependency> <groupId>heritrix</groupId> <artifactId>archive-commons</artifactId> - <version>1.9.0-200607171747</version> + <version>1.11.0-200703012033</version> <url>http://builds.archive.org:8080/cruisecontrol</url> <properties> <war.bundle>true</war.bundle> Modified: trunk/archive-access/projects/wayback/src/java/org/archive/wayback/accesscontrol/RoboCache.java =================================================================== --- trunk/archive-access/projects/wayback/src/java/org/archive/wayback/accesscontrol/RoboCache.java 2007-03-01 23:15:22 UTC (rev 1535) +++ trunk/archive-access/projects/wayback/src/java/org/archive/wayback/accesscontrol/RoboCache.java 2007-03-01 23:20:15 UTC (rev 1536) @@ -47,7 +47,8 @@ import org.archive.io.arc.ARCRecord; import org.archive.io.arc.ARCWriter; import org.archive.io.arc.ARCWriterPool; -import org.archive.io.arc.ARCWriterSettings; +import org.archive.io.ArchiveRecord; +import org.archive.io.WriterPoolSettings; import org.archive.util.ArchiveUtils; import org.archive.wayback.exception.ConfigurationException; @@ -199,7 +200,7 @@ } File[] files = { arcDir }; boolean compress = true; - ARCWriterSettings settings = getSettings(compress, prefix, files); + WriterPoolSettings settings = 
getSettings(compress, prefix, files); pool = new ARCWriterPool(settings, MAX_POOL_WRITERS, MAX_POOL_WAIT); } @@ -419,7 +420,11 @@ ARCReader reader = ARCReaderFactory.get(new File(location.getName()), true, location.getOffset()); - ARCRecord rec = reader.get(location.getOffset()); + ArchiveRecord aRec = reader.get(location.getOffset()); + if(!(aRec instanceof ARCRecord)) { + throw new IOException("Not ARCRecord..."); + } + ARCRecord rec = (ARCRecord) aRec; rec.skipHttpHeader(); LinkedList userAgents = new LinkedList(); userAgents.add(userAgent); @@ -499,8 +504,8 @@ ARCLocation fresh; ARCWriter writer; try { - writer = pool.borrowARCWriter(); - writer.checkARCFileSize(); + writer = (ARCWriter) pool.borrowFile(); + writer.checkSize(); } catch (IOException e) { // TODO better... e.printStackTrace(); @@ -509,7 +514,7 @@ String robotUrlString = "http://" + url.getHost() + "/robots.txt"; fresh = urlCacher.cache(writer, robotUrlString); try { - pool.returnARCWriter(writer); + pool.returnFile(writer); } catch (IOException e) { // TODO better.... e.printStackTrace(); @@ -544,21 +549,13 @@ return fresh; } - private ARCWriterSettings getSettings(final boolean isCompressed, + private WriterPoolSettings getSettings(final boolean isCompressed, final String prefix, final File[] arcDirs) { - return new ARCWriterSettings() { - public int getArcMaxSize() { + return new WriterPoolSettings() { + public int getMaxSize() { return ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE; } - public String getArcPrefix() { - return prefix; - } - - public String getArcSuffix() { - return ""; - } - public List getOutputDirs() { return Arrays.asList(arcDirs); } @@ -570,6 +567,15 @@ public List getMetadata() { return null; } + + public String getPrefix() { + return prefix; + } + + public String getSuffix() { + // TODO: is correct? 
+ return ARCConstants.DOT_ARC_FILE_EXTENSION; + } }; } } Modified: trunk/archive-access/projects/wayback/src/java/org/archive/wayback/accesscontrol/UrlCacher.java =================================================================== --- trunk/archive-access/projects/wayback/src/java/org/archive/wayback/accesscontrol/UrlCacher.java 2007-03-01 23:15:22 UTC (rev 1535) +++ trunk/archive-access/projects/wayback/src/java/org/archive/wayback/accesscontrol/UrlCacher.java 2007-03-01 23:20:15 UTC (rev 1536) @@ -35,6 +35,7 @@ import java.net.URL; import java.util.Arrays; import java.util.Date; +import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; import org.apache.commons.httpclient.Header; @@ -156,8 +157,8 @@ String ip = method.getRemoteIP(); Date captureDate = method.getCaptureDate(); - writer.checkARCFileSize(); - String arcPathTmp = writer.getArcFile().getAbsolutePath(); + writer.checkSize(); + String arcPathTmp = writer.getFile().getAbsolutePath(); final long oldOffset = writer.getPosition(); writer.write(urlString,mime,ip,captureDate.getTime(),len,fis); @@ -209,8 +210,9 @@ } File [] files = {arcDir}; boolean compress = true; - ARCWriter writer = new ARCWriter(Arrays.asList(files), - "test", compress, DEFAULT_MAX_ARC_FILE_SIZE); + ARCWriter writer = new ARCWriter(new AtomicInteger(), + Arrays.asList(files), "test", compress, + DEFAULT_MAX_ARC_FILE_SIZE); for(int k = 2; k < args.length; k++) { UrlCacher uc; Modified: trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourceindex/indexer/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourceindex/indexer/ArcIndexer.java 2007-03-01 23:15:22 UTC (rev 1535) +++ trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourceindex/indexer/ArcIndexer.java 2007-03-01 23:20:15 UTC (rev 1536) @@ -171,7 +171,7 @@ // initialize with default HTTP code... 
result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); - result.put(WaybackConstants.RESULT_MD5_DIGEST, meta.getDigest()); + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); Modified: trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-03-01 23:15:22 UTC (rev 1535) +++ trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-03-01 23:20:15 UTC (rev 1536) @@ -28,6 +28,7 @@ import java.net.URL; import java.util.Properties; +import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCReaderFactory; import org.archive.io.arc.ARCRecord; @@ -86,9 +87,14 @@ Resource r = null; try { ARCReader ar = ARCReaderFactory.get(new URL(arcUrl),offset); - ARCRecord rec = ar.get(); - r = new Resource(rec,ar); + // TODO: handle other types... 
+ ArchiveRecord rec = ar.get(); + if(!(rec instanceof ARCRecord)) { + throw new ResourceNotAvailableException("Bad ARCRecord format"); + } + r = new Resource((ARCRecord) rec,ar); } catch (IOException e) { + e.printStackTrace(); throw new ResourceNotAvailableException("Unable to retrieve", e.getLocalizedMessage()); } Modified: trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java 2007-03-01 23:15:22 UTC (rev 1535) +++ trunk/archive-access/projects/wayback/src/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java 2007-03-01 23:20:15 UTC (rev 1536) @@ -32,9 +32,11 @@ import java.util.logging.Logger; import org.apache.commons.httpclient.HttpException; +import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCLocation; import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.arc.ARCRecord; import org.archive.wayback.ResourceStore; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.Resource; @@ -148,7 +150,12 @@ ARCReader reader = ARCReaderFactory.get(arcFile); - Resource r = new Resource(reader.get(location.getOffset()), reader); + ArchiveRecord rec = reader.get(location.getOffset()); + // TODO: handle other types of ArchiveRecords... 
+ if(!(rec instanceof ARCRecord)) { + throw new ResourceNotAvailableException("Bad ARCRecord format"); + } + Resource r = new Resource((ARCRecord) rec, reader); return r; } } Modified: trunk/archive-access/projects/wayback/src/java/org/archive/wayback/util/ARCCreator.java =================================================================== --- trunk/archive-access/projects/wayback/src/java/org/archive/wayback/util/ARCCreator.java 2007-03-01 23:15:22 UTC (rev 1535) +++ trunk/archive-access/projects/wayback/src/java/org/archive/wayback/util/ARCCreator.java 2007-03-01 23:20:15 UTC (rev 1536) @@ -31,12 +31,11 @@ import java.text.ParseException; import java.util.Arrays; import java.util.HashMap; -import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; import org.archive.io.arc.ARCConstants; import org.archive.io.arc.ARCWriter; -import org.archive.io.arc.ARCWriterSettings; import org.archive.util.ArchiveUtils; /** @@ -104,7 +103,9 @@ throws IOException { File target[] = {tgtDir}; - ARCWriter writer = new ARCWriter(getSettings(true,prefix,target)); + ARCWriter writer = new ARCWriter(new AtomicInteger(), + Arrays.asList(target),prefix,true, + ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE); File sources[] = srcDir.listFiles(); logger.info("Found " + sources.length + " files in " + srcDir); for(int i = 0; i<sources.length; i++) { @@ -123,7 +124,7 @@ } writer.close(); logger.info("Closed arc file named " + - writer.getArcFile().getAbsolutePath()); + writer.getFile().getAbsolutePath()); } /** @@ -232,34 +233,4 @@ } } - - private ARCWriterSettings getSettings(final boolean isCompressed, - final String prefix, final File[] arcDirs) { - - return new ARCWriterSettings() { - public int getArcMaxSize() { - return ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE; - } - - public String getArcPrefix() { - return prefix; - } - - public String getArcSuffix() { - return ""; - } - - public List getOutputDirs() { - return Arrays.asList(arcDirs); - } - - 
public boolean isCompressed() { - return isCompressed; - } - - public List getMetadata() { - return null; - } - }; - } }
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.