You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
Revision: 2105 http://archive-access.svn.sourceforge.net/archive-access/?rev=2105&view=rev Author: bradtofel Date: 2007-12-11 14:27:08 -0800 (Tue, 11 Dec 2007) Log Message: ----------- REFACTOR: now can handle ARC and WARC, but this is soon to be replaced.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-12-11 22:25:59 UTC (rev 2104) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-12-11 22:27:08 UTC (rev 2105) @@ -27,10 +27,6 @@ import java.io.IOException; import java.net.URL; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; import org.archive.wayback.ResourceStore; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.Resource; @@ -58,30 +54,24 @@ // extract ARC filename + add .arc.gz if it is not present String arcName = result.get(WaybackConstants.RESULT_ARC_FILE); if(arcName == null || arcName.length() < 1) { - throw new IOException("No ARC name in search result..."); + throw new IOException("No ARC/WARC name in search result..."); } - if (!arcName.endsWith(ARCReader.DOT_COMPRESSED_ARC_FILE_EXTENSION)) { - arcName += ARCReader.DOT_COMPRESSED_ARC_FILE_EXTENSION; - } // extract ARC offset + convert to long final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); if(offsetString == null || offsetString.length() < 1) { - throw new IOException("No ARC offset in search result..."); + throw new IOException("No ARC/WARC offset in search result..."); } final long offset = Long.parseLong(offsetString); String arcUrl = urlPrefix + arcName; Resource r = null; try { - ARCReader ar = ARCReaderFactory.get(new URL(arcUrl),offset); - // TODO: handle other types... - ArchiveRecord rec = ar.get(); - if(!(rec instanceof ARCRecord)) { - throw new ResourceNotAvailableException("Bad ARCRecord format"); - } - r = new Resource((ARCRecord) rec,ar); + + r = ResourceFactory.getResource(new URL(arcUrl), offset); + } catch (IOException e) { + e.printStackTrace(); throw new ResourceNotAvailableException("Unable to retrieve", e.getLocalizedMessage()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2104 http://archive-access.svn.sourceforge.net/archive-access/?rev=2104&view=rev Author: bradtofel Date: 2007-12-11 14:25:59 -0800 (Tue, 11 Dec 2007) Log Message: ----------- BUGFIX: made indexThread non-static REFACTOR: now explicitly creates ARCResource REFACTOR: moved all directory code to DirMaker COMMENT: removed unused code Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java 2007-12-11 02:27:58 UTC (rev 2103) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java 2007-12-11 22:25:59 UTC (rev 2104) @@ -45,6 +45,7 @@ import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourceindex.indexer.IndexClient; //import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.DirMaker; /** * Implements ResourceStore using a local directory of ARC files. @@ -59,10 +60,8 @@ private final static int DEFAULT_RUN_INTERVAL_MS = 10000; private File arcDir = null; -// private File tmpDir = null; private File workDir = null; private File queuedDir = null; -// private String indexTarget = null; private int runInterval = DEFAULT_RUN_INTERVAL_MS; private IndexClient indexClient = null; private ArcIndexer indexer = new ArcIndexer(); @@ -71,7 +70,7 @@ * Thread object of update thread -- also is flag indicating if the thread * has already been started -- static, and access to it is synchronized. */ - private static Thread indexThread = null; + private Thread indexThread = null; /** * @throws ConfigurationException @@ -111,7 +110,7 @@ if(!(rec instanceof ARCRecord)) { throw new ResourceNotAvailableException("Bad ARCRecord format"); } - Resource r = new Resource((ARCRecord) rec, reader); + Resource r = new ArcResource((ARCRecord) rec, reader); return r; } } @@ -159,40 +158,6 @@ } } -// private boolean uploadCDX(File cdxFile) { -// boolean uploaded = false; -// if(indexClient == null) { -// // assume we just need to move it to a local directory: -// File toBeMergedDir = new File(indexTarget); -// File toBeMergedFile = new File(toBeMergedDir,cdxFile.getName()); -// if(toBeMergedFile.exists()) { -// LOGGER.severe("WARNING: "+toBeMergedFile.getAbsolutePath() + -// "already exists!"); -// } else { -// if(cdxFile.renameTo(toBeMergedFile)) { -// LOGGER.info("Queued " + toBeMergedFile.getAbsolutePath() + -// " for merging."); -// uploaded = true; -// } else { -// LOGGER.severe("FAILED rename("+cdxFile.getAbsolutePath()+ -// ") to ("+toBeMergedFile.getAbsolutePath()+")"); -// } -// } -// } else { -// // use indexClient to upload: -// try { -// indexClient.uploadCDX(cdxFile); -// LOGGER.info("Uploaded " + cdxFile.getAbsolutePath()); -// uploaded = true; -// } catch (HttpException e) { -// e.printStackTrace(); -// } catch (IOException e) { -// e.printStackTrace(); -// } -// } -// return uploaded; -// } -// /** * Index up to 'max' ARC files queued for indexing, queueing the resulting * CDX files for merging with the BDBIndex. @@ -330,91 +295,34 @@ } } } - - // TODO: refactor to single location - private File ensureDir(String path) throws ConfigurationException { - if(path.length() < 1) { - throw new ConfigurationException("Empty directory path"); - } - File dir = new File(path); - if(dir.exists()) { - if(!dir.isDirectory()) { - throw new ConfigurationException("path " + path + "exists" + - "but is not a directory"); - } - } else { - if(!dir.mkdirs()) { - throw new ConfigurationException("unable to create directory" + - " at " + path); - } - } - return dir; - } -// -// /** -// * @return String path to tmpDir -// */ -// public String getTmpDir() { -// if(tmpDir == null) { -// return null; -// } -// return tmpDir.getAbsolutePath(); -// } -// /** -// * @param tmpDir the tmpDir to set -// * @throws ConfigurationException -// */ -// public void setTmpDir(String tmpDir) throws ConfigurationException { -// this.tmpDir = ensureDir(tmpDir); -// } - /** * @return String path to workDir */ public String getWorkDir() { - if(workDir == null) { - return null; - } - return workDir.getAbsolutePath(); + return DirMaker.getAbsolutePath(workDir); } /** * @param workDir the workDir to set - * @throws ConfigurationException + * @throws IOException */ - public void setWorkDir(String workDir) throws ConfigurationException { - this.workDir = ensureDir(workDir); + public void setWorkDir(String workDir) throws IOException { + this.workDir = DirMaker.ensureDir(workDir); } /** * @return String path to queuedDir */ public String getQueuedDir() { - if(queuedDir == null) { - return null; - } - return queuedDir.getAbsolutePath(); + return DirMaker.getAbsolutePath(queuedDir); } /** * @param queuedDir the queuedDir to set - * @throws ConfigurationException + * @throws IOException */ - public void setQueuedDir(String queuedDir) throws ConfigurationException { - this.queuedDir = ensureDir(queuedDir); + public void setQueuedDir(String queuedDir) throws IOException { + this.queuedDir = DirMaker.ensureDir(queuedDir); } -// -// /** -// * @return -// */ -// public String getIndexTarget() { -// return indexTarget; -// } -// /** -// * @param indexTarget the indexTarget to set -// */ -// public void setIndexTarget(String indexTarget) { -// this.indexTarget = indexTarget; -// } /** * @return integer milliseconds between polls for new ARC content. @@ -432,16 +340,14 @@ * @return the arcDir */ public String getArcDir() { - if(arcDir == null) { - return null; - } - return arcDir.getAbsolutePath(); + return DirMaker.getAbsolutePath(arcDir); } /** * @param arcDir the arcDir to set + * @throws IOException */ - public void setArcDir(String arcDir) { - this.arcDir = new File(arcDir); + public void setArcDir(String arcDir) throws IOException { + this.arcDir = DirMaker.ensureDir(arcDir); } /** This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-11 02:27:55
|
Revision: 2103 http://archive-access.svn.sourceforge.net/archive-access/?rev=2103&view=rev Author: bradtofel Date: 2007-12-10 18:27:58 -0800 (Mon, 10 Dec 2007) Log Message: ----------- FEATURE: added second version of ensureDir which does not expect a second "property name" argument, added getAbsolutePath(File) which does just that, iff File is not null. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/DirMaker.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/DirMaker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/DirMaker.java 2007-12-11 02:26:18 UTC (rev 2102) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/DirMaker.java 2007-12-11 02:27:58 UTC (rev 2103) @@ -28,8 +28,9 @@ import java.io.IOException; /** + * Lots of things need to transform Strings to Files, constructing them if + * needed. These are static methods for doing that. * - * * @author brad * @version $Date$, $Revision$ */ @@ -59,4 +60,13 @@ } return dir; } + public static File ensureDir(String path) throws IOException { + return ensureDir(path,""); + } + public static String getAbsolutePath(File file) { + if(file == null) { + return null; + } + return file.getAbsolutePath(); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-11 02:26:13
|
Revision: 2102 http://archive-access.svn.sourceforge.net/archive-access/?rev=2102&view=rev Author: bradtofel Date: 2007-12-10 18:26:18 -0800 (Mon, 10 Dec 2007) Log Message: ----------- FEATURE: Command line main() now accepts multiple fields to canonicalize in a single pass. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-12-11 02:25:10 UTC (rev 2101) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-12-11 02:26:18 UTC (rev 2102) @@ -27,6 +27,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; +import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -341,7 +342,8 @@ UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); int n = 0; int i = 0; - int column = 0; + ArrayList<Integer> columns = new ArrayList<Integer>(); + long lineNumber = 0; boolean cdxPassThru = false; String delimiter = " "; @@ -357,7 +359,7 @@ } String val = args[n+1]; if(arg.compareTo("-f") == 0) { - column = Integer.parseInt(val) - 1; + columns.add(new Integer(val)); } else if(arg.compareTo("-d") == 0) { delimiter = val; } else { @@ -365,9 +367,20 @@ } n += 2; } + // place default '0' in case none specified: + if(columns.size() == 0) { + columns.add(new Integer(1)); + } + + // convert to int[]: + int[] cols = new int[columns.size()]; + for(int idx = 0; idx < columns.size(); idx++) { + cols[idx] = columns.get(idx).intValue() - 1; + } BufferedReader r = new BufferedReader(new InputStreamReader(System.in)); StringBuilder sb = new StringBuilder(); String line = null; + while(true) { try { line = r.readLine(); @@ -384,27 +397,29 @@ continue; } String parts[] = line.split(delimiter); - if(column >= parts.length) { - System.err.println("Invalid line " + lineNumber + " (" + - line + ") skipped"); - } else { - try { - parts[column] = canonicalizer.urlStringToKey(parts[column]); - } catch (URIException e) { - System.err.println("Invalid URL in line " + lineNumber + " (" + - line + ") skipped"); - e.printStackTrace(); - continue; - } - sb.setLength(0); - for(i = 0; i < parts.length; i++) { - sb.append(parts[i]); - if(i < (parts.length-1)) { - sb.append(delimiter); + for(int column : cols) { + if(column >= parts.length) { + System.err.println("Invalid line " + lineNumber + " (" + + line + ") skipped"); + } else { + try { + parts[column] = canonicalizer.urlStringToKey(parts[column]); + } catch (URIException e) { + System.err.println("Invalid URL in line " + lineNumber + " (" + + line + ") skipped (" + parts[column] + ")"); + e.printStackTrace(); + continue; } } - System.out.println(sb.toString()); } + sb.setLength(0); + for(i = 0; i < parts.length; i++) { + sb.append(parts[i]); + if(i < (parts.length-1)) { + sb.append(delimiter); + } + } + System.out.println(sb.toString()); } } } \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-11 02:25:07
|
Revision: 2101 http://archive-access.svn.sourceforge.net/archive-access/?rev=2101&view=rev Author: bradtofel Date: 2007-12-10 18:25:10 -0800 (Mon, 10 Dec 2007) Log Message: ----------- TWEAK: no longer need to call parseHeaders() on Resource Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/UIReplayResult.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/UIReplayResult.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/UIReplayResult.java 2007-12-11 02:24:42 UTC (rev 2100) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/UIReplayResult.java 2007-12-11 02:25:10 UTC (rev 2101) @@ -71,7 +71,6 @@ this.result = result; this.resource = resource; this.uriConverter = uriConverter; - resource.parseHeaders(); } /** This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2100 http://archive-access.svn.sourceforge.net/archive-access/?rev=2100&view=rev Author: bradtofel Date: 2007-12-10 18:24:42 -0800 (Mon, 10 Dec 2007) Log Message: ----------- TWEAK: no longer need to call parseHeaders() on Resource Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java 2007-11-29 21:06:55 UTC (rev 2099) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java 2007-12-11 02:24:42 UTC (rev 2100) @@ -61,9 +61,6 @@ ResultURIConverter uriConverter, SearchResults results) throws ServletException, IOException, BadContentException { - // cause underlying resource to read thru HTTP headers: - resource.parseHeaders(); - HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse); Map<String,String> headers = HttpHeaderOperation.processHeaders( This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2099 http://archive-access.svn.sourceforge.net/archive-access/?rev=2099&view=rev Author: bradtofel Date: 2007-11-29 13:06:55 -0800 (Thu, 29 Nov 2007) Log Message: ----------- INTERFACE: no longer call parseHeaders on Resource Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2007-11-29 21:05:22 UTC (rev 2098) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2007-11-29 21:06:55 UTC (rev 2099) @@ -148,7 +148,6 @@ tmpRules = new RobotRules(); Resource resource = webCache.getCachedResource(new URL(urlString), maxCacheMS,true); - resource.parseHeaders(); tmpRules.parse(resource); rulesCache.put(firstUrlString,tmpRules); rules = tmpRules; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-29 21:05:24
|
Revision: 2098 http://archive-access.svn.sourceforge.net/archive-access/?rev=2098&view=rev Author: bradtofel Date: 2007-11-29 13:05:22 -0800 (Thu, 29 Nov 2007) Log Message: ----------- FEATURE: now store original request URL, for access within subsequent JSP processes. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UIResults.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UIResults.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UIResults.java 2007-11-29 20:39:33 UTC (rev 2097) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UIResults.java 2007-11-29 21:05:22 UTC (rev 2098) @@ -41,6 +41,7 @@ private final static String FERRET_NAME = "ui-results"; protected WaybackRequest wbRequest; private String contentJsp = null; + private String originalRequestURL = null; /** @@ -74,7 +75,8 @@ public void storeInRequest(HttpServletRequest httpRequest, String contentJsp) { this.contentJsp = contentJsp; - httpRequest.setAttribute(FERRET_NAME, this); + this.originalRequestURL = httpRequest.getRequestURL().toString(); + httpRequest.setAttribute(FERRET_NAME, this); } /** @@ -198,5 +200,8 @@ } return configValue; } + public String getOriginalRequestURL() { + return originalRequestURL; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-29 20:39:30
|
Revision: 2097 http://archive-access.svn.sourceforge.net/archive-access/?rev=2097&view=rev Author: bradtofel Date: 2007-11-29 12:39:33 -0800 (Thu, 29 Nov 2007) Log Message: ----------- FEATURE: added store() method to write Iterator<String> into target file. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2007-11-29 20:32:18 UTC (rev 2096) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2007-11-29 20:39:33 UTC (rev 2097) @@ -28,6 +28,7 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.io.PrintWriter; import java.io.RandomAccessFile; import java.util.Iterator; @@ -177,6 +178,14 @@ return itr; } + public void store(Iterator<String> itr) throws IOException { + PrintWriter pw = new PrintWriter(file); + while(itr.hasNext()) { + pw.println(file); + } + pw.close(); + } + private static void USAGE() { System.err.println("Usage: PREFIX FILE1 [FILE2] ..."); System.exit(3); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2096 http://archive-access.svn.sourceforge.net/archive-access/?rev=2096&view=rev Author: bradtofel Date: 2007-11-29 12:32:18 -0800 (Thu, 29 Nov 2007) Log Message: ----------- COMMENT: removed unused (and commented out) imports Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java 2007-11-29 20:30:29 UTC (rev 2095) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java 2007-11-29 20:32:18 UTC (rev 2096) @@ -24,16 +24,12 @@ */ package org.archive.wayback.resourceindex.indexer; -//import java.io.BufferedOutputStream; import java.io.File; -//import java.io.FileFilter; import java.io.BufferedOutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; -//import java.io.OutputStream; -//import java.io.PrintWriter; import java.util.Iterator; import java.util.logging.Logger; @@ -42,9 +38,6 @@ import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.InputStreamRequestEntity; import org.apache.commons.httpclient.methods.PutMethod; -//import org.archive.wayback.core.SearchResults; -//import org.archive.wayback.resourcestore.ArcIndexer; -//import org.archive.wayback.resourcestore.http.FileLocationDBClient; import org.archive.wayback.core.SearchResult; import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2095 http://archive-access.svn.sourceforge.net/archive-access/?rev=2095&view=rev Author: bradtofel Date: 2007-11-29 12:30:29 -0800 (Thu, 29 Nov 2007) Log Message: ----------- REFACTOR: no longer call parseHeaders() on Resource Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixReplayRenderer.java 2007-11-29 20:29:26 UTC (rev 2094) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixReplayRenderer.java 2007-11-29 20:30:29 UTC (rev 2095) @@ -76,7 +76,6 @@ SearchResult result, Resource resource, ResultURIConverter uriConverter, SearchResults results) throws ServletException, IOException, BadContentException { - resource.parseHeaders(); HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-29 20:29:22
|
Revision: 2094 http://archive-access.svn.sourceforge.net/archive-access/?rev=2094&view=rev Author: bradtofel Date: 2007-11-29 12:29:26 -0800 (Thu, 29 Nov 2007) Log Message: ----------- INTERFACE: changed to use new ArcResource Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2007-11-28 03:15:42 UTC (rev 2093) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCacheDirectory.java 2007-11-29 20:29:26 UTC (rev 2094) @@ -30,15 +30,13 @@ import java.util.List; import java.util.logging.Logger; -import org.archive.io.ArchiveRecord; import org.archive.io.WriterPoolSettings; import org.archive.io.arc.ARCConstants; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; import org.archive.io.arc.ARCWriter; import org.archive.io.arc.ARCWriterPool; import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.ResourceFactory; import org.archive.wayback.util.DirMaker; /** @@ -119,7 +117,6 @@ * @throws IOException */ public Resource getResource(String path, long offset) throws IOException { - Resource resource = null; File arc = new File(path); if(!arc.exists()) { String base = arc.getName(); @@ -132,23 +129,13 @@ } } } - LOGGER.info("Retrieving record at " + offset + " in " + - arc.getAbsolutePath()); - ARCReader reader = null; + arc.getAbsolutePath()); try { - reader = ARCReaderFactory.get(arc,true,offset); - } catch (IOException e) { - throw new RuntimeException(e); + return ResourceFactory.getResource(arc, offset); + } catch (ResourceNotAvailableException e1) { + throw new IOException(e1.getMessage()); } - - ArchiveRecord aRec = reader.get(offset); - if(!(aRec instanceof ARCRecord)) { - throw new IOException("Not ARCRecord..."); - } - ARCRecord rec = (ARCRecord) aRec; - resource = new Resource(rec,reader); - return resource; } private WriterPoolSettings getSettings(final boolean isCompressed, Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2007-11-28 03:15:42 UTC (rev 2093) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2007-11-29 20:29:26 UTC (rev 2094) @@ -43,6 +43,7 @@ import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.WaybackException; import org.archive.wayback.resourcestore.ARCRecordToSearchResultAdapter; +import org.archive.wayback.resourcestore.ArcResource; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.UrlCanonicalizer; @@ -203,15 +204,18 @@ "ARC(" + name + ") at (" + offset + ")"); resource = arcCacheDir.getResource(name, offset); // add the result to the index: - ARCRecord record = (ARCRecord) resource.getArcRecord(); + if(resource instanceof ArcResource) { + ArcResource aResource = (ArcResource) resource; + ARCRecord record = (ARCRecord) aResource.getArcRecord(); - SearchResult result = adapter.adapt(record); - index.addSearchResult(result); - LOGGER.info("Added URL(" + url.toString() + ") in " + - "ARC(" + name + ") at (" + offset + ") to LiveIndex"); + SearchResult result = adapter.adapt(record); + index.addSearchResult(result); + LOGGER.info("Added URL(" + url.toString() + ") in " + + "ARC(" + name + ") at (" + offset + ") to LiveIndex"); - // we just read thru the doc in order to index it. Reset: - resource = arcCacheDir.getResource(name, offset); + // we just read thru the doc in order to index it. Reset: + resource = arcCacheDir.getResource(name, offset); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2093 http://archive-access.svn.sourceforge.net/archive-access/?rev=2093&view=rev Author: bradtofel Date: 2007-11-27 19:15:42 -0800 (Tue, 27 Nov 2007) Log Message: ----------- BUGFIX: (ACC-9) thread variable was static, which limited total number of index merging threads to... 1. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java 2007-11-28 03:14:45 UTC (rev 2092) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java 2007-11-28 03:15:42 UTC (rev 2093) @@ -68,13 +68,12 @@ private int runInterval = DEFAULT_RUN_INTERVAL_MS; -// private ArcIndexer indexer = new ArcIndexer(); - /** * Thread object of update thread -- also is flag indicating if the thread - * has already been started -- static, and access to it is synchronized. + * has already been started. Access to it is synchronized. */ - private static Thread updateThread = null; + private Thread updateThread = null; + /** * Default constructor */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2092 http://archive-access.svn.sourceforge.net/archive-access/?rev=2092&view=rev Author: bradtofel Date: 2007-11-27 19:14:45 -0800 (Tue, 27 Nov 2007) Log Message: ----------- BUGFIX: allow parsing of '-' values for offset. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2007-11-28 03:13:41 UTC (rev 2091) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2007-11-28 03:14:45 UTC (rev 2092) @@ -61,7 +61,10 @@ String httpResponseCode = tokens[4]; String md5Fragment = tokens[5]; String redirectUrl = tokens[6]; - long compressedOffset = Long.parseLong(tokens[7]); + long compressedOffset = -1; + if(!tokens[7].equals("-")) { + compressedOffset = Long.parseLong(tokens[7]); + } String arcFileName = tokens[8]; String origUrl = url; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2091 http://archive-access.svn.sourceforge.net/archive-access/?rev=2091&view=rev Author: bradtofel Date: 2007-11-27 19:13:41 -0800 (Tue, 27 Nov 2007) Log Message: ----------- COMMENT: removed lots of unused code.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java 2007-11-28 03:13:01 UTC (rev 2090) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java 2007-11-28 03:13:41 UTC (rev 2091) @@ -60,16 +60,10 @@ private static final Logger LOGGER = Logger.getLogger(IndexClient .class.getName()); -// private final static String ARC_SUFFIX = ".arc"; -// private final static String ARC_GZ_SUFFIX = ".arc.gz"; -// private final static String CDX_SUFFIX = ".cdx"; - private String target = null; private File tmpDir = null; -// private String submitUrl = null; private HttpClient client = new HttpClient(); -// private ArcIndexer indexer = null; /** * @param cdx @@ -180,159 +174,6 @@ boolean added = addCDX(tmpFile); return added; } - -// -// /** -// * Inject File argument into the index pipeline specified for this client -// * using HTTP PUT -// * -// * @param cdx -// * @throws HttpException -// * @throws IOException -// */ -// public void uploadCDX(File cdx) throws HttpException, IOException { -// String basename = cdx.getName(); -// String finalUrl = submitUrl + "/" + basename; -// PutMethod method = new PutMethod(finalUrl); -// method.setRequestEntity(new InputStreamRequestEntity( -// new FileInputStream(cdx))); -// -// int statusCode = client.executeMethod(method); -// if (statusCode != HttpStatus.SC_OK) { -// throw new IOException("Method failed: " + method.getStatusLine() -// + " for URL " + finalUrl + " on file " -// + cdx.getAbsolutePath()); -// } -// LOGGER.info("Uploaded cdx " + cdx.getAbsolutePath()); -// } -// -// /** -// * Create a CDX file for the arc argument, and add it to the remote -// * index pipeline for this client. -// * -// * @param arc -// * @param workDir -// * @throws IOException -// */ -// public void addArcToIndex(File arc,File workDir) throws IOException { -// String arcBase = arc.getName(); -// if(arcBase.endsWith(ARC_SUFFIX)) { -// arcBase = arcBase.substring(0,arcBase.length() - -// ARC_SUFFIX.length()); -// } -// String cdxBase = arcBase + CDX_SUFFIX; -// File tmpCDX = new File(workDir,cdxBase); -// LOGGER.info("Indexing arc " + arc.getAbsolutePath()); -// SearchResults results = indexer.indexArc(arc); -// indexer.serializeResults(results, tmpCDX); -// uploadCDX(tmpCDX); -// if(!tmpCDX.delete()) { -// throw new IOException("Unable to unlink " + -// tmpCDX.getAbsolutePath()); -// } -// } -// -// /** -// * @param arc -// * @param os -// * @throws IOException -// */ -// public void dumpArcIndex(File arc, OutputStream os) throws IOException { -// BufferedOutputStream bos = new BufferedOutputStream(os); -// PrintWriter pw = new PrintWriter(bos); -// SearchResults results = indexer.indexArc(arc); -// indexer.serializeResults(results,pw); -// } -// -// /** -// * Index each ARC in directory, upload CDX to the remote pipeline, and -// * poke the remote locationDB to let it know where this ARC can be found. -// * -// * @param directory -// * @param httpPrefix -// * @param locationClient -// * @param workDir -// * @throws IOException -// */ -// public void indexDirectory(File directory, String httpPrefix, -// FileLocationDBClient locationClient, File workDir) -// throws IOException { -// if(!workDir.isDirectory()) { -// if(workDir.exists()) { -// throw new IOException("workDir path " + -// workDir.getAbsolutePath() + " exists but is not a " + -// "directory"); -// } -// if(!workDir.mkdirs()) { -// throw new IOException("Failed to mkdir(" + -// workDir.getAbsolutePath() + ")"); -// } -// } -// -// if(!httpPrefix.endsWith("/")) { -// httpPrefix += "/"; -// } -// -// FileFilter filter = new FileFilter() { -// public boolean accept(File daFile) { -// return daFile.getName().endsWith(ARC_SUFFIX); -// } -// }; -// -// File[] arcs = directory.listFiles(filter); -// if(arcs == null) { -// throw new IOException("Directory " + directory.getAbsolutePath() + -// " is not a directory or had an IO error"); -// } -// for(int i = 0; i < arcs.length; i++) { -// File arc = arcs[i]; -// String arcName = arc.getName(); -// String arcUrl = httpPrefix + arcName; -// addArcToIndex(arc,workDir); -// LOGGER.info("Adding location " + arcUrl + " for arc " + arcName); -// locationClient.addArcUrl(arcName,arcUrl); -// } -// } -// -// /** -// * @param args -// */ -// public static void main(String[] args) { -// if(args.length == 1) { -// File arc = new File(args[0]); -// ArcIndexer indexer = new ArcIndexer(); -// -// BufferedOutputStream bos = new BufferedOutputStream(System.out); -// PrintWriter pw = new PrintWriter(bos); -// SearchResults results; -// try { -// results = indexer.indexArc(arc); -// indexer.serializeResults(results,pw); -// } catch (IOException e) { -// e.printStackTrace(); -// System.exit(1); -// } -// return; -// } else if(args.length != 5) { -// System.err.println("Usage: workDir pipelineUrl locationUrl arcDir arcUrlPrefix"); -// System.err.println("Usage: arcPath"); -// return; -// } -// File workDir = new File(args[0]); -// String pipelineUrl = args[1]; -// String locationUrl = args[2]; -// File arcDir = new File(args[3]); -// String arcDirPrefix = args[4]; -// IndexClient pipeClient; -// FileLocationDBClient locClient = new FileLocationDBClient(locationUrl); -// try { -// pipeClient = new IndexClient(pipelineUrl); -// pipeClient.indexDirectory(arcDir,arcDirPrefix,locClient,workDir); -// } catch (IOException e) { -// e.printStackTrace(); -// System.exit(1); -// } -// } /** * @return the target This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-28 03:12:59
|
Revision: 2090 http://archive-access.svn.sourceforge.net/archive-access/?rev=2090&view=rev Author: bradtofel Date: 2007-11-27 19:13:01 -0800 (Tue, 27 Nov 2007) Log Message: ----------- INITIAL REV: new LocalResourceStore implementation that allows compressed or uncompressed ARCs and WARCs Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2007-11-28 03:13:01 UTC (rev 2090) @@ -0,0 +1,216 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.indexer.IndexClient; +import org.archive.wayback.util.DirMaker; + +/** + * Thread that repeatedly notices new files in the LocalResourceStore, indexes + * those files, and hands them off to a ResourceIndex via an IndexClient + * + * @author brad + * @version $Date$, $Revision$ + */ +public class AutoIndexThread extends Thread { + private static final Logger LOGGER = + Logger.getLogger(AutoIndexThread.class.getName()); + + private final static int DEFAULT_RUN_INTERVAL_MS = 10000; + private LocalResourceStore store = null; + private File workDir = null; + private File queuedDir = null; + private int runInterval = DEFAULT_RUN_INTERVAL_MS; + private IndexClient indexClient = null; + + /** + * @param store + * @param runInterval + */ + public AutoIndexThread() { + super("AutoARCIndexThread"); + super.setDaemon(true); + } + + public void run() { + LOGGER.info("AutoIndexThread is alive."); + int sleepInterval = runInterval; + if(store == null) { + throw new RuntimeException("No LocalResourceStore set"); + } + while (true) { + try { + int numIndexed = indexNewArcs(); + if (numIndexed == 0) { + sleep(sleepInterval); + sleepInterval += runInterval; + } else { + sleepInterval = runInterval; + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + /** + * Scan for new ARC files, and index any new files discovered. + * + * There are 3 main steps, which could be broken into separate threads: + * 1) detect new ARCs + * 2) create CDX files for each new ARC + * 3) upload CDX files to target (or rename to local "incoming" directory) + * + * for now these are sequential. + * + * @return number of ARC files indexed + */ + public int indexNewArcs() { + int numIndexed = 0; + try { + queueNewArcsForIndex(); + } catch (IOException e) { + e.printStackTrace(); + } + try { + numIndexed = indexArcs(10); + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return numIndexed; + } + /** + * Find any new ARC files and queue them for indexing. + * @throws IOException + */ + public void queueNewArcsForIndex() throws IOException { + + // build a HashMap of what has been queued already: + HashMap<String,String> queued = new HashMap<String, String>(); + String entries[] = queuedDir.list(); + if(entries != null) { + for (int i = 0; i < entries.length; i++) { + queued.put(entries[i], "i"); + } + } + // now scan thru arcDir, and make a flag file for anything that was not + // already there: + Iterator<String> files = store.fileNamesIterator(); + if(files != null) { + while(files.hasNext()) { + String fileName = files.next(); + if(!queued.containsKey(fileName)) { + File newQueuedFile = new File(queuedDir,fileName); + File newToBeIndexedFile = new File(workDir,fileName); + newToBeIndexedFile.createNewFile(); + newQueuedFile.createNewFile(); + } + } + } + } + + private String fileNameToBase(final String fileName) { + return fileName; + } + + /** + * Index up to 'max' ARC/WARC files queued for indexing, queueing the + * resulting CDX files for merging with the BDBIndex. + * + * @param indexer + * @param max maximum number to index in this method call, 0 for unlimited + * @return int number of ARC/WARC files indexed + * @throws MalformedURLException + * @throws IOException + */ + public int indexArcs(int max) + throws MalformedURLException, IOException { + + int numIndexed = 0; + String toBeIndexed[] = workDir.list(); + + if (toBeIndexed != null) { + for (int i = 0; i < toBeIndexed.length; i++) { + String fileName = toBeIndexed[i]; + File file = store.getLocalFile(fileName); + if(file != null) { + File workFlagFile = new File(workDir,fileName); + String cdxBase = fileNameToBase(fileName); + + try { + + LOGGER.info("Indexing " + file.getAbsolutePath()); + Iterator<SearchResult> itr = store.indexFile(file); + + if(indexClient.addSearchResults(cdxBase, itr)) { + if (!workFlagFile.delete()) { + throw new IOException("Unable to delete " + + workFlagFile.getAbsolutePath()); + } + } + numIndexed++; + } catch (IOException e) { + LOGGER.severe("FAILED index: " + file.getAbsolutePath() + + " cause: " + e.getLocalizedMessage()); + } + if(max > 0 && (numIndexed >= max)) { + break; + } + } + } + } + return numIndexed; + } + + + + public LocalResourceStore getStore() { + return store; + } + + public void setStore(LocalResourceStore store) { + this.store = store; + } + + public String getWorkDir() { + return workDir == null ? null : workDir.getAbsolutePath(); + } + + public void setWorkDir(String workDir) throws IOException { + this.workDir = DirMaker.ensureDir(workDir); + } + + public String getQueuedDir() { + return queuedDir == null ? null : queuedDir.getAbsolutePath(); + } + + public void setQueuedDir(String queuedDir) throws IOException { + this.queuedDir = DirMaker.ensureDir(queuedDir); + } + + public int getRunInterval() { + return runInterval; + } + + public void setRunInterval(int runInterval) { + this.runInterval = runInterval; + } + + public IndexClient getIndexClient() { + return indexClient; + } + + public void setIndexClient(IndexClient indexClient) { + this.indexClient = indexClient; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2007-11-28 03:13:01 UTC (rev 2090) @@ -0,0 +1,142 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ConfigurationException; +import org.archive.wayback.exception.ResourceNotAvailableException; + +/** + * Class which implements a local ARC, WARC, ARC.gz, WARC.gz, ResourceStore + * including an optional automatic indexing thread + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LocalResourceStore implements ResourceStore { + + private File dataDir = null; + private AutoIndexThread indexThread = null; + + private ArcIndexer arcIndexer = new ArcIndexer(); + private WarcIndexer warcIndexer = new WarcIndexer(); + public final static String ARC_EXTENSION = ".arc"; + public final static String ARC_GZ_EXTENSION = ".arc.gz"; + public final static String WARC_EXTENSION = ".warc"; + public final static String WARC_GZ_EXTENSION = ".warc.gz"; + public final static String OPEN_EXTENSION = ".open"; + private final static String[] SUFFIXES = { + "", ARC_EXTENSION, ARC_GZ_EXTENSION, WARC_EXTENSION, WARC_GZ_EXTENSION + }; + private FilenameFilter filter = new ArcWarcFilenameFilter(); + + public void init() throws ConfigurationException { + if(indexThread != null) { + indexThread.setStore(this); + indexThread.start(); + } + } + protected String resultToFileName(SearchResult result) { + return result.get(WaybackConstants.RESULT_ARC_FILE); + } + + protected long resultToOffset(SearchResult result) { + return Long.parseLong(result.get(WaybackConstants.RESULT_OFFSET)); + } + + public File getLocalFile(String fileName) { + // try adding suffixes: empty string is first in the list + File file = null; + for(String suffix : SUFFIXES) { + file = new File(dataDir,fileName + suffix); + if(file.exists() && file.canRead()) { + return file; + } + } + // this might work if the full path is in the index... + file = new File(fileName); + if(file.exists() && file.canRead()) { + return file; + } + // doh. + return null; + } + + public Resource retrieveResource(SearchResult result) throws IOException, + ResourceNotAvailableException { + String fileName = resultToFileName(result); + long offset = resultToOffset(result); + File file = getLocalFile(fileName); + if (file == null) { + + // TODO: this needs to be prettied up for end user consumption.. + throw new ResourceNotAvailableException("Cannot find ARC file (" + + fileName + ")"); + } else { + + Resource r = ResourceFactory.getResource(file, offset); + return r; + } + } + + public Iterator<SearchResult> indexFile(File dataFile) throws IOException { + Iterator<SearchResult> itr = null; + + String name = dataFile.getName(); + if(name.endsWith(ARC_EXTENSION)) { + itr = arcIndexer.iterator(dataFile); + } else if(name.endsWith(ARC_GZ_EXTENSION)) { + itr = arcIndexer.iterator(dataFile); + } else if(name.endsWith(WARC_EXTENSION)) { + itr = warcIndexer.iterator(dataFile); + } else if(name.endsWith(WARC_GZ_EXTENSION)) { + itr = warcIndexer.iterator(dataFile); + } + return itr; + } + + public Iterator<String> fileNamesIterator() throws IOException { + if(dataDir != null) { + String[] files = dataDir.list(filter); + List<String> l = Arrays.asList(files); + return l.iterator(); + } + return null; + } + + public File getDataDir() { + return dataDir; + } + + public void setDataDir(File dataDir) { + this.dataDir = dataDir; + } + + private class ArcWarcFilenameFilter implements FilenameFilter { + public boolean accept(File dir, String name) { + File tmp = new File(dir,name); + if(tmp.isFile() && tmp.canRead()) { + return name.endsWith(ARC_EXTENSION) || + name.endsWith(ARC_GZ_EXTENSION) || + name.endsWith(WARC_GZ_EXTENSION) || + name.endsWith(WARC_EXTENSION); + } + return false; + } + } + + public AutoIndexThread getIndexThread() { + return indexThread; + } + public void setIndexThread(AutoIndexThread indexThread) { + this.indexThread = indexThread; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2089 http://archive-access.svn.sourceforge.net/archive-access/?rev=2089&view=rev Author: bradtofel Date: 2007-11-27 19:12:07 -0800 (Tue, 27 Nov 2007) Log Message: ----------- BUGFIX: was checking if WARCRecords were of wrong type... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2007-11-28 03:11:14 UTC (rev 2088) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2007-11-28 03:12:07 UTC (rev 2089) @@ -20,6 +20,10 @@ Resource r = null; String name = file.getName(); + if(name.endsWith(LocalResourceStore.OPEN_EXTENSION)) { + name = name.substring(0, name.length() - + LocalResourceStore.OPEN_EXTENSION.length()); + } if(name.endsWith(LocalResourceStore.ARC_EXTENSION) || name.endsWith(LocalResourceStore.ARC_GZ_EXTENSION)) { @@ -37,8 +41,8 @@ WARCReader reader = WARCReaderFactory.get(file); ArchiveRecord rec = reader.get(offset); - if(!(rec instanceof ARCRecord)) { - throw new ResourceNotAvailableException("Bad ARCRecord format"); + if(!(rec instanceof WARCRecord)) { + throw new ResourceNotAvailableException("Bad WARCRecord format"); } WarcResource wr = new WarcResource((WARCRecord) rec, reader); wr.parseHeaders(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-28 03:11:09
|
Revision: 2088 http://archive-access.svn.sourceforge.net/archive-access/?rev=2088&view=rev Author: bradtofel Date: 2007-11-27 19:11:14 -0800 (Tue, 27 Nov 2007) Log Message: ----------- FEATURE: allow parsing into base of AccessPoints when trailing slash is omitted from path Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestMapper.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestMapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestMapper.java 2007-11-28 03:08:09 UTC (rev 2087) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestMapper.java 2007-11-28 03:11:14 UTC (rev 2088) @@ -24,6 +24,7 @@ */ package org.archive.wayback.webapp; +import java.util.ArrayList; import java.util.logging.Logger; import javax.servlet.ServletContext; @@ -50,6 +51,8 @@ private final static String PORT_SEPARATOR = ":"; + private final static String ACCESS_POINT_CLASSNAME = + "org.archive.wayback.webapp.AccessPoint"; private final static String CONFIG_PATH = "config-path"; // private WaybackContext defaultContext = null; @@ -87,6 +90,8 @@ if(secondSlash != -1) { collection = PORT_SEPARATOR + requestPath.substring(1,requestPath.indexOf("/",1)); + } else { + collection = PORT_SEPARATOR + requestPath.substring(1); } } return String.valueOf(request.getLocalPort()) + collection; @@ -99,14 +104,14 @@ public RequestContext mapContext(HttpServletRequest request) { RequestContext context = null; - String contextID = String.valueOf(request.getLocalPort()); - if(factory.containsBean(contextID)) { - Object o = factory.getBean(contextID); + String portStr = String.valueOf(request.getLocalPort()); + if(factory.containsBean(portStr)) { + Object o = factory.getBean(portStr); if(o instanceof RequestContext) { context = (RequestContext) o; } } else { - contextID = getContextID(request); + String contextID = getContextID(request); if(factory.containsBean(contextID)) { Object o = factory.getBean(contextID); if(o instanceof RequestContext) { @@ -114,9 +119,31 @@ } } } + if(context == null) { + ArrayList<String> names = getAccessPointNamesOnPort(portStr); + request.setAttribute("AccessPointNames", names); + } return context; } + @SuppressWarnings("unchecked") + public ArrayList<String> getAccessPointNamesOnPort(String portStr) { + ArrayList<String> names = new ArrayList<String>(); + try { + Class accessPointClass = Class.forName(ACCESS_POINT_CLASSNAME); + String[] apNames = factory.getBeanNamesForType(accessPointClass); + String portStrColon = portStr + ":"; + for(String apName : apNames) { + if(apName.startsWith(portStrColon)) { + names.add(apName.substring(portStrColon.length())); + } + } + } catch (ClassNotFoundException e) { + // boy, we're in trouble now.. + e.printStackTrace(); + } + return names; + } /** * clean up all WaybackContexts, which should release resources gracefully. */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-28 03:08:06
|
Revision: 2087 http://archive-access.svn.sourceforge.net/archive-access/?rev=2087&view=rev Author: bradtofel Date: 2007-11-27 19:08:09 -0800 (Tue, 27 Nov 2007) Log Message: ----------- FEATURE: added closest indicator for queries, allowed missing trailing '/' on requests to base of AccessPoint Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2007-11-28 02:50:25 UTC (rev 2086) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2007-11-28 03:08:09 UTC (rev 2087) @@ -137,6 +137,10 @@ } String contextPath = getContextPath(httpRequest); if (!origRequestPath.startsWith(contextPath)) { + if(contextPath.startsWith(origRequestPath)) { + // missing trailing '/', just omit: + return ""; + } return null; } return origRequestPath.substring(contextPath.length()); @@ -321,7 +325,10 @@ SearchResults results = collection.getResourceIndex().query(wbRequest); if(results.getResultsType().equals( WaybackConstants.RESULTS_TYPE_CAPTURE)) { - + CaptureSearchResults cResults = (CaptureSearchResults) results; + SearchResult closest = cResults.getClosest(wbRequest); + closest.put(WaybackConstants.RESULT_CLOSEST_INDICATOR, + WaybackConstants.RESULT_CLOSEST_VALUE); query.renderUrlResults(httpRequest,httpResponse,wbRequest, results,uriConverter); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-28 02:50:20
|
Revision: 2086 http://archive-access.svn.sourceforge.net/archive-access/?rev=2086&view=rev Author: bradtofel Date: 2007-11-27 18:50:25 -0800 (Tue, 27 Nov 2007) Log Message: ----------- INITIAL REV: command line access to org.archive.wayback.resourcestore.WarcIndexer main() Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer Added: trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer 2007-11-28 02:50:25 UTC (rev 2086) @@ -0,0 +1,82 @@ +#!/usr/bin/env sh +## +## This script creates a CDX file for all ARC files in a directory +## PUTs those CDX files into a remote pipeline, and informs a remote +## LocationDB of the locations of all the ARC files. +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## WAYBACK_HOME Pointer to your wayback install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. +## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set WAYBACK_HOME. +if [ -z "$WAYBACK_HOME" ] +then + WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + JAVACMD=$JAVA_HOME/bin/java +fi + +# Ignore previous classpath. Build one that contains heritrix jar and content +# of the lib directory into the variable CP. +for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +# Main ArcIndexer class. +if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.wayback.resourcestore.WarcIndexer' +fi + +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN $@ + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2085 http://archive-access.svn.sourceforge.net/archive-access/?rev=2085&view=rev Author: bradtofel Date: 2007-11-27 18:08:02 -0800 (Tue, 27 Nov 2007) Log Message: ----------- INITIAL REV: class to transform a WARC file into an Iterator<SearchResult>. Includes main() to support command line conversion. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2007-11-28 02:08:02 UTC (rev 2085) @@ -0,0 +1,97 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Iterator; + +import org.archive.io.ArchiveRecord; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; + +public class WarcIndexer { + + /** + * CDX Header line for these fields. not very configurable.. + */ + public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; + + /** + * @param arc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(File warc) + throws IOException { + + Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); + + Adapter<WARCRecord, SearchResult> adapter2 = new WARCRecordToSearchResultAdapter(); + WARCReader reader = WARCReaderFactory.get(warc); + + Iterator<ArchiveRecord> itr1 = reader.iterator(); + + CloseableIterator<WARCRecord> itr2 = new AdaptedIterator<ArchiveRecord, WARCRecord>( + itr1, adapter1); + + return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); + } + + private class ArchiveRecordToWARCRecordAdapter implements + Adapter<ArchiveRecord, WARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public WARCRecord adapt(ArchiveRecord o) { + WARCRecord rec = null; + if (o instanceof WARCRecord) { + rec = (WARCRecord) o; + } + return rec; + } + } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("warc-indexer WARCFILE"); + System.err.println("warc-indexer WARCFILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); + System.exit(1); + } + + /** + * @param args + */ + public static void main(String[] args) { + WarcIndexer indexer = new WarcIndexer(); + File arc = new File(args[0]); + PrintWriter pw = null; + try { + if (args.length == 1) { + // dump to STDOUT: + pw = new PrintWriter(System.out); + } else if (args.length == 2) { + pw = new PrintWriter(args[1]); + } else { + USAGE(); + } + Iterator<SearchResult> res = indexer.iterator(arc); + Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); + while (lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2084 http://archive-access.svn.sourceforge.net/archive-access/?rev=2084&view=rev Author: bradtofel Date: 2007-11-27 18:06:27 -0800 (Tue, 27 Nov 2007) Log Message: ----------- INITIAL REV: class which adapts (some) WARCRecords into SearchResult objects. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2007-11-28 02:06:27 UTC (rev 2084) @@ -0,0 +1,302 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.URIException; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; +import org.archive.io.arc.ARCConstants; +import org.archive.io.warc.WARCConstants; +import org.archive.io.warc.WARCRecord; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.UrlCanonicalizer; + +/** + * Adapts certain WARCRecords into SearchResults. DNS and response records are + * mostly straightforward, but SearchResult objects generated from revisit + * records contain lots of "placeholder" fields, which are expected to be + * understood by later processes traversing a stream of SearchResult objects. + * + * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class WARCRecordToSearchResultAdapter +implements Adapter<WARCRecord,SearchResult>{ + + private final static String DEFAULT_VALUE = "-"; + private final static String SEARCH_FIELDS[] = { + WaybackConstants.RESULT_URL, + WaybackConstants.RESULT_URL_KEY, + WaybackConstants.RESULT_ORIG_HOST, + WaybackConstants.RESULT_CAPTURE_DATE, + WaybackConstants.RESULT_MD5_DIGEST, + WaybackConstants.RESULT_MIME_TYPE, + WaybackConstants.RESULT_HTTP_CODE, + WaybackConstants.RESULT_REDIRECT_URL, + WaybackConstants.RESULT_ARC_FILE, + WaybackConstants.RESULT_OFFSET, + }; + + private static final Logger LOGGER = Logger.getLogger( + WARCRecordToSearchResultAdapter.class.getName()); + + // TODO: make this configurable based on the ResourceIndex + private static UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public SearchResult adapt(WARCRecord rec) { + try { + return adaptInner(rec); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + /* + * Transform input date to 14-digit timestamp: + * 2007-08-29T18:00:26Z => 20070829180026 + */ + private static String transformDate(final String input) { + + StringBuilder output = new StringBuilder(14); + + output.append(input.substring(0,4)); + output.append(input.substring(5,7)); + output.append(input.substring(8,10)); + output.append(input.substring(11,13)); + output.append(input.substring(14,16)); + output.append(input.substring(17,19)); + + return output.toString(); + } + + private static String transformHTTPMime(final String input) { + int semiIdx = input.indexOf(";"); + if(semiIdx > 0) { + return input.substring(0,semiIdx).trim(); + } + return input.trim(); + } + + private String transformWarcFilename(String readerIdentifier) { + String warcName = readerIdentifier; + int index = warcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < warcName.length()) { + warcName = warcName.substring(index + 1); + } + return warcName; + } + + private String transformDigest(final Object o) { + if(o == null) { + return DEFAULT_VALUE; + } + String orig = o.toString(); + if(orig.startsWith("sha1:")) { + return orig.substring(5); + } + return orig; + } + + private SearchResult getBlankSearchResult() { + SearchResult result = new SearchResult(); + for(String field : SEARCH_FIELDS) { + result.put(field, DEFAULT_VALUE); + } + return result; + } + + private void addUrlDataToSearchResult(SearchResult result, String urlStr) + throws IOException { + + result.put(WaybackConstants.RESULT_URL, urlStr); + result.put(WaybackConstants.RESULT_URL_KEY, urlStr); + + + UURI uri = UURIFactory.getInstance(urlStr); + String uriHost = uri.getHost(); + if (uriHost == null) { + + LOGGER.info("No host in " + urlStr); + + } else { + + result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); + } + + String urlKey = canonicalizer.urlStringToKey(urlStr); + result.put(WaybackConstants.RESULT_URL_KEY, urlKey); + } + + private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_ARC_FILE, + transformWarcFilename(header.getReaderIdentifier())); + result.put(WaybackConstants.RESULT_OFFSET, + String.valueOf(header.getOffset())); + + String uriStr = header.getUrl(); + + String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX + .length()); + result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype()); + + result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); + result.put(WaybackConstants.RESULT_URL, uriStr); + result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + + rec.close(); + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + + return result; + } + + private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_MD5_DIGEST, + transformDigest(header.getHeaderValue( + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + + addUrlDataToSearchResult(result,header.getUrl()); + + return result; + } + + /** + * borrowed(copied) from org.archive.io.arc.ARCRecord... + * + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_ARC_FILE, + transformWarcFilename(header.getReaderIdentifier())); + result.put(WaybackConstants.RESULT_OFFSET, + String.valueOf(header.getOffset())); + + String origUrl = header.getUrl(); + addUrlDataToSearchResult(result,origUrl); + + // need to parse the documents HTTP message and headers here: WARCReader + // does not implement this... yet.. + + byte [] statusBytes = HttpParser.readRawLine(rec); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException("Failed to read http status where one " + + " was expected: " + new String(statusBytes)); + } + String statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if ((statusLine == null) || + !StatusLine.startsWithHTTP(statusLine)) { + throw new RecoverableIOException("Failed parse of http status line."); + } + StatusLine status = new StatusLine(statusLine); + result.put(WaybackConstants.RESULT_HTTP_CODE, + String.valueOf(status.getStatusCode())); + + Header[] headers = HttpParser.parseHeaders(rec, + ARCConstants.DEFAULT_ENCODING); + + rec.close(); + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + + if (headers != null) { + + for (Header httpHeader : headers) { + if (httpHeader.getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = httpHeader.getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + try { + UURI uriRedirect = UURIFactory.getInstance(origUrl, + locationStr); + result.put(WaybackConstants.RESULT_REDIRECT_URL, + uriRedirect.getEscapedURI()); + } catch (URIException e) { + LOGGER.info("Bad Location: " + locationStr + + " for " + origUrl + " in " + + header.getReaderIdentifier() + " Skipped"); + } + } else if(httpHeader.getName().toLowerCase().equals("content-type")) { + result.put(WaybackConstants.RESULT_MIME_TYPE, + transformHTTPMime(httpHeader.getValue())); + } + } + } + return result; + } + + private SearchResult adaptInner(WARCRecord rec) throws IOException { + + SearchResult result = null; + ArchiveRecordHeader header = rec.getHeader(); + String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); + if(type.equals(WARCConstants.RESPONSE)) { + String mime = header.getMimetype(); + if(mime.equals("text/dns")) { + result = adaptDNS(header,rec); + } else { + result = adaptResponse(header,rec); + } + } else if(type.equals(WARCConstants.REVISIT)) { + result = adaptRevisit(header,rec); + } + + return result; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2083 http://archive-access.svn.sourceforge.net/archive-access/?rev=2083&view=rev Author: bradtofel Date: 2007-11-27 18:02:06 -0800 (Tue, 27 Nov 2007) Log Message: ----------- INITIAL REV: single static method to coerce a ARC/WARC file+offset into a Wayback Resource Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2007-11-28 02:02:06 UTC (rev 2083) @@ -0,0 +1,50 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; + +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.arc.ARCRecord; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.ResourceNotAvailableException; + +public class ResourceFactory { + + public static Resource getResource(File file, long offset) + throws IOException, ResourceNotAvailableException { + + Resource r = null; + String name = file.getName(); + if(name.endsWith(LocalResourceStore.ARC_EXTENSION) || + name.endsWith(LocalResourceStore.ARC_GZ_EXTENSION)) { + + ARCReader reader = ARCReaderFactory.get(file); + ArchiveRecord rec = reader.get(offset); + if(!(rec instanceof ARCRecord)) { + throw new ResourceNotAvailableException("Bad ARCRecord format"); + } + ArcResource ar = new ArcResource((ARCRecord) rec, reader); + ar.parseHeaders(); + r = ar; + + } else if(name.endsWith(LocalResourceStore.WARC_EXTENSION) || + name.endsWith(LocalResourceStore.WARC_GZ_EXTENSION)) { + + WARCReader reader = WARCReaderFactory.get(file); + ArchiveRecord rec = reader.get(offset); + if(!(rec instanceof ARCRecord)) { + throw new ResourceNotAvailableException("Bad ARCRecord format"); + } + WarcResource wr = new WarcResource((WARCRecord) rec, reader); + wr.parseHeaders(); + r = wr; + } + + return r; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-28 02:00:27
|
Revision: 2082 http://archive-access.svn.sourceforge.net/archive-access/?rev=2082&view=rev Author: bradtofel Date: 2007-11-27 18:00:31 -0800 (Tue, 27 Nov 2007) Log Message: ----------- REFACTOR/FEATURE: made Resource abstract, moved ARC-specific code to ArcResource, added WARC-specific coercion code to WarcResource. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2007-11-28 00:59:27 UTC (rev 2081) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2007-11-28 02:00:31 UTC (rev 2082) @@ -26,236 +26,118 @@ import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.Hashtable; -import java.util.Iterator; import java.util.Map; -import java.util.Set; -import java.util.logging.Logger; -import org.apache.commons.httpclient.Header; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCRecord; - /** - * Slightly more than an ARCRecord. This class is designed to be an abstraction - * to allow the Wayback to operator with non-ARC file format resources. Probably - * the interface required will end up looking very much like ARCRecord, but can - * be reimplemented to handle new ARC formats or non-ARC formats. + * Abstraction on top of a document stored in a WaybackCollection. Currently + * implemented subclasses include ArcResource and WarcResource. * * @author Brad Tofel * @version $Date$, $Revision$ */ -public class Resource extends InputStream { - /** - * Logger for this class - */ - private static final Logger LOGGER = Logger.getLogger(Resource.class - .getName()); - - /** - * String prefix for ARC file related metadata namespace of keys within - * metaData Properties bag. - */ - private static String ARC_META_PREFIX = "arcmeta."; - /** - * String prefix for HTTP Header related metadata namespace of keys within - * metaData Properties bag. - */ - private static String HTTP_HEADER_PREFIX = "httpheader."; - /** - * object for ARCRecord - */ - ARCRecord arcRecord = null; - /** - * object for ARCReader -- need to hold on to this in order to call close() - * to release filehandle after completing access to this record. optional - */ - ARCReader arcReader = null; - /** - * flag to indicate if the ARCRecord skipHTTPHeader() has been called - */ - boolean parsedHeader = false; - /** - * Expandable property bag for holding metadata associated with this - * resource - */ - Hashtable<String,String> metaData = new Hashtable<String,String>(); +public abstract class Resource extends InputStream { - private BufferedInputStream bis; - - /** - * Constructor - * - * @param rec - * @param reader - */ - public Resource(final ARCRecord rec,final ARCReader reader) { - super(); - arcRecord = rec; - arcReader = reader; - bis = new BufferedInputStream(rec); - } + private InputStream is; - /** parse the headers on the underlying ARC record, and extract all - * @throws IOException - */ - public void parseHeaders () throws IOException { - if(!parsedHeader) { - arcRecord.skipHttpHeader(); - // copy all HTTP headers to metaData, prefixing with - // HTTP_HEADER_PREFIX - Header[] headers = arcRecord.getHttpHeaders(); - if (headers != null) { - for (int i = 0; i < headers.length; i++) { - String value = headers[i].getValue(); - String name = headers[i].getName(); - metaData.put(HTTP_HEADER_PREFIX + name,value); - } - } + public abstract void close() throws IOException; + public abstract int getStatusCode(); + public abstract long getRecordLength(); + public abstract Map<String,String> getHttpHeaders(); - // copy all ARC record header fields to metaData, prefixing with - // ARC_META_PREFIX - @SuppressWarnings("unchecked") - Map<String,Object> headerMetaMap = arcRecord.getMetaData().getHeaderFields(); - Set<String> keys = headerMetaMap.keySet(); - Iterator<String> itr = keys.iterator(); - while(itr.hasNext()) { - String metaKey = itr.next(); - Object value = headerMetaMap.get(metaKey); - String metaValue = ""; - if(value != null) { - metaValue = value.toString(); - } - metaData.put(ARC_META_PREFIX + metaKey,metaValue); - } - - parsedHeader = true; + protected void setInputStream(InputStream is) { + if(is.markSupported()) { + this.is = is; + } else { + this.is = new BufferedInputStream(is); } } - /** - * @param prefix - * @return a Properties of all elements in metaData starting with 'prefix'. - * keys in the returned Properties have 'prefix' removed. + * @return + * @throws IOException + * @see java.io.BufferedInputStream#available() */ - public Map<String,String> filterMeta(String prefix) { - HashMap<String,String> matching = new HashMap<String,String>(); - for (Enumeration<String> e = metaData.keys(); e.hasMoreElements();) { - String key = e.nextElement(); - if (key.startsWith(prefix)) { - String finalKey = key.substring(prefix.length()); - String value = metaData.get(key); - matching.put(finalKey, value); - } + public int available() throws IOException { + if(is == null) { + throw new IOException("No InputStream"); } - return matching; + return is.available(); } - /** - * @return a Properties containing all HTTP header fields for this record + * @param readlimit + * @see java.io.BufferedInputStream#mark(int) */ - public Map<String,String> getHttpHeaders() { - return filterMeta(HTTP_HEADER_PREFIX); + public void mark(int readlimit) { + if(is != null) { + is.mark(readlimit); + } } - /** - * @return a Properties containing all ARC Meta fields for this record + * @return + * @see java.io.BufferedInputStream#markSupported() */ - public Map<String,String> getARCMetadata() { - return filterMeta(ARC_META_PREFIX); + public boolean markSupported() { + if(is == null) { + return false; + } + return is.markSupported(); } - /** - * (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#getStatusCode() - * @return int HTTP status code returned with this document. + * @return + * @throws IOException + * @see java.io.BufferedInputStream#read() */ - public int getStatusCode() { - return arcRecord.getStatusCode(); + public int read() throws IOException { + if(is == null) { + throw new IOException("No InputStream"); + } + return is.read(); } - /** - * @return the ARCRecord underlying this Resource. + * @param b + * @param off + * @param len + * @return + * @throws IOException + * @see java.io.BufferedInputStream#read(byte[], int, int) */ - public ArchiveRecord getArcRecord() { - return arcRecord; + public int read(byte[] b, int off, int len) throws IOException { + if(is == null) { + throw new IOException("No InputStream"); + } + return is.read(b, off, len); } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#read() + /** + * @param b + * @return + * @throws IOException + * @see java.io.FilterInputStream#read(byte[]) */ - public int read() throws IOException { - return bis.read(); - } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#read(byte[], int, int) - */ - public int read(byte[] arg0, int arg1, int arg2) throws IOException { - return bis.read(arg0, arg1, arg2); - } - - /* (non-Javadoc) - * @see java.io.InputStream#read(byte[]) - */ public int read(byte[] b) throws IOException { - return bis.read(b); + if(is == null) { + throw new IOException("No InputStream"); + } + return is.read(b); } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#skip(long) - */ - public long skip(long arg0) throws IOException { - return bis.skip(arg0); - } - - /* (non-Javadoc) - * @see java.io.BufferedInputStream#available() - */ - public int available() throws IOException { - return bis.available(); - } - - /* (non-Javadoc) - * @see java.io.BufferedInputStream#mark(int) - */ - public void mark(int readlimit) { - bis.mark(readlimit); - } - - /* (non-Javadoc) - * @see java.io.BufferedInputStream#markSupported() - */ - public boolean markSupported() { - return bis.markSupported(); - } - - /* (non-Javadoc) + /** + * @throws IOException * @see java.io.BufferedInputStream#reset() */ public void reset() throws IOException { - bis.reset(); - } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#close() - */ - public void close() throws IOException { - //LOGGER.info("About to close..("+arcReader+")"); - arcRecord.close(); - if(arcReader != null) { - arcReader.close(); - LOGGER.info("closed..("+arcReader+")"); + if(is == null) { + throw new IOException("No InputStream"); } + is.reset(); } - /** - * @return byte length claimed in ARC record metadata line. + * @param n + * @return + * @throws IOException + * @see java.io.BufferedInputStream#skip(long) */ - public long getRecordLength() { - return arcRecord.getMetaData().getLength(); + public long skip(long n) throws IOException { + if(is == null) { + throw new IOException("No InputStream"); + } + return is.skip(n); } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java 2007-11-28 02:00:31 UTC (rev 2082) @@ -0,0 +1,170 @@ +package org.archive.wayback.resourcestore; + +import java.io.IOException; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.core.Resource; + +public class ArcResource extends Resource { + /** + * Logger for this class + */ + private static final Logger LOGGER = Logger.getLogger(ArcResource.class + .getName()); + + /** + * String prefix for ARC file related metadata namespace of keys within + * metaData Properties bag. + */ + private static String ARC_META_PREFIX = "arcmeta."; + /** + * String prefix for HTTP Header related metadata namespace of keys within + * metaData Properties bag. + */ + private static String HTTP_HEADER_PREFIX = "httpheader."; + /** + * object for ARCRecord + */ + ARCRecord arcRecord = null; + /** + * object for ARCReader -- need to hold on to this in order to call close() + * to release filehandle after completing access to this record. optional + */ + ARCReader arcReader = null; + /** + * flag to indicate if the ARCRecord skipHTTPHeader() has been called + */ + boolean parsedHeader = false; + /** + * Expandable property bag for holding metadata associated with this + * resource + */ + Hashtable<String,String> metaData = new Hashtable<String,String>(); + + /** + * Constructor + * + * @param rec + * @param reader + */ + public ArcResource(final ARCRecord rec,final ARCReader reader) { + super(); + arcRecord = rec; + arcReader = reader; + setInputStream(rec); + } + + /** parse the headers on the underlying ARC record, and extract all + * @throws IOException + */ + public void parseHeaders () throws IOException { + if(!parsedHeader) { + arcRecord.skipHttpHeader(); + // copy all HTTP headers to metaData, prefixing with + // HTTP_HEADER_PREFIX + Header[] headers = arcRecord.getHttpHeaders(); + if (headers != null) { + for (int i = 0; i < headers.length; i++) { + String value = headers[i].getValue(); + String name = headers[i].getName(); + metaData.put(HTTP_HEADER_PREFIX + name,value); + } + } + + // copy all ARC record header fields to metaData, prefixing with + // ARC_META_PREFIX + @SuppressWarnings("unchecked") + Map<String,Object> headerMetaMap = arcRecord.getMetaData().getHeaderFields(); + Set<String> keys = headerMetaMap.keySet(); + Iterator<String> itr = keys.iterator(); + while(itr.hasNext()) { + String metaKey = itr.next(); + Object value = headerMetaMap.get(metaKey); + String metaValue = ""; + if(value != null) { + metaValue = value.toString(); + } + metaData.put(ARC_META_PREFIX + metaKey,metaValue); + } + + parsedHeader = true; + } + } + + /** + * @param prefix + * @return a Properties of all elements in metaData starting with 'prefix'. + * keys in the returned Properties have 'prefix' removed. + */ + public Map<String,String> filterMeta(String prefix) { + HashMap<String,String> matching = new HashMap<String,String>(); + for (Enumeration<String> e = metaData.keys(); e.hasMoreElements();) { + String key = e.nextElement(); + if (key.startsWith(prefix)) { + String finalKey = key.substring(prefix.length()); + String value = metaData.get(key); + matching.put(finalKey, value); + } + } + return matching; + } + + /** + * @return a Properties containing all HTTP header fields for this record + */ + public Map<String,String> getHttpHeaders() { + return filterMeta(HTTP_HEADER_PREFIX); + } + + /** + * @return a Properties containing all ARC Meta fields for this record + */ + public Map<String,String> getARCMetadata() { + return filterMeta(ARC_META_PREFIX); + } + + /** + * (non-Javadoc) + * @see org.archive.io.arc.ARCRecord#getStatusCode() + * @return int HTTP status code returned with this document. + */ + public int getStatusCode() { + return arcRecord.getStatusCode(); + } + + /** + * @return the ARCRecord underlying this Resource. + */ + public ArchiveRecord getArcRecord() { + return arcRecord; + } + + /* (non-Javadoc) + * @see org.archive.io.arc.ARCRecord#close() + */ + public void close() throws IOException { + //LOGGER.info("About to close..("+arcReader+")"); + arcRecord.close(); + if(arcReader != null) { + arcReader.close(); + LOGGER.info("closed..("+arcReader+")"); + } + } + + /** + * @return byte length claimed in ARC record metadata line. + */ + public long getRecordLength() { + return arcRecord.getMetaData().getLength(); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java 2007-11-28 02:00:31 UTC (rev 2082) @@ -0,0 +1,98 @@ +package org.archive.wayback.resourcestore; + +import java.io.IOException; +import java.util.Hashtable; +import java.util.Map; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.RecoverableIOException; +import org.archive.io.arc.ARCConstants; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.core.Resource; + +public class WarcResource extends Resource { + private WARCRecord rec = null; + private WARCReader reader = null; + private Map<String, String> headers = null; + private long length = 0; + private int status = 0; + private boolean parsedHeaders = false; + public WarcResource(WARCRecord rec, WARCReader reader) { + this.rec = rec; + this.reader = reader; + } + + /** + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + public void parseHeaders() throws IOException { + if(parsedHeaders) { + return; + } + + byte [] statusBytes = HttpParser.readRawLine(rec); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException("Failed to read http status where one " + + " was expected: " + new String(statusBytes)); + } + String statusLineStr = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if ((statusLineStr == null) || + !StatusLine.startsWithHTTP(statusLineStr)) { + throw new RecoverableIOException("Failed parse of http status line."); + } + StatusLine statusLine = new StatusLine(statusLineStr); + + this.status = statusLine.getStatusCode(); + + Header[] tmpHeaders = HttpParser.parseHeaders(rec, + ARCConstants.DEFAULT_ENCODING); + headers = new Hashtable<String,String>(); + for(Header header: tmpHeaders) { + headers.put(header.getName(), header.getValue()); + } + this.setInputStream(rec); + parsedHeaders = true; + } + + + @Override + public Map<String, String> getHttpHeaders() { + return headers; + } + + @Override + public long getRecordLength() { + // TODO Auto-generated method stub + return length; + } + + @Override + public int getStatusCode() { + return status; + } + + @Override + public void close() throws IOException { + rec.close(); + reader.close(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-28 00:59:29
|
Revision: 2081 http://archive-access.svn.sourceforge.net/archive-access/?rev=2081&view=rev Author: bradtofel Date: 2007-11-27 16:59:27 -0800 (Tue, 27 Nov 2007) Log Message: ----------- COMMENT: changed class comment and removed unused private class.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2007-11-28 00:57:47 UTC (rev 2080) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2007-11-28 00:59:27 UTC (rev 2081) @@ -40,8 +40,7 @@ import org.archive.wayback.util.CloseableIterator; /** - * Transforms an ARC file into SearchResults, or a serialized SearchResults - * file(CDX). + * Transforms an ARC file into Iterator<SearchResult>. * * @author brad * @version $Date$, $Revision$ @@ -92,86 +91,6 @@ return rec; } } - -// private class DurableArchiveRecordIterator -// implements Iterator<ArchiveRecord> { -// -// private long lastRestart = 0; -// private File arc = null; -// Iterator<ArchiveRecord> innerItr = null; -// ArchiveRecord cachedNext = null; -// -// public DurableArchiveRecordIterator(File arc) throws IOException { -// this.arc = arc; -// restart(0); -// } -// -// private void restart(long offset) throws IOException { -// ARCReader arcReader = ARCReaderFactory.get(arc,offset); -// arcReader.setParseHttpHeaders(true); -// innerItr = arcReader.iterator(); -// } -// -// private long parseErrorOffset(String message) { -// long found = -1; -// int idx = message.indexOf("Offset "); -// if(idx >= 0) { -// int idx2 = message.indexOf(")"); -// if(idx2 > 0) { -// String part = message.substring(idx + 7,idx2); -// System.err.println("Found(" + part +") from (" + message + ")"); -// found = Long.parseLong(part) + 100; -// } -// } -// return found; -// } -// -// public boolean hasNext() { -// if(cachedNext != null) { -// return true; -// } -// while(true) { -// try { -// if(!innerItr.hasNext()) { -// return false; -// } -// cachedNext = innerItr.next(); -// } catch (RuntimeException e) { -// long offset = parseErrorOffset(e.getMessage()); -// if(offset > 0) { -// if(lastRestart == offset) { -// return false; -// } -// lastRestart = offset; -// try { -// restart(offset); -// } catch (IOException e1) { -// throw new RuntimeException(e1); -// } -// } else { -// throw e; -// } -// } -// if(cachedNext != null) { -// break; -// } -// } -// return true; -// } -// -// public ArchiveRecord next() { -// if(cachedNext == null) { -// throw new NoSuchElementException("next() without hasNext()"); -// } -// ArchiveRecord tmp = cachedNext; -// cachedNext = null; -// return tmp; -// } -// -// public void remove() { -// throw new UnsupportedOperationException("remove unimplemented"); -// } -// } private static void USAGE() { System.err.println("USAGE:"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |