From: <bra...@us...> - 2008-11-07 22:38:52
|
Revision: 2639 http://archive-access.svn.sourceforge.net/archive-access/?rev=2639&view=rev Author: bradtofel Date: 2008-11-07 22:38:48 +0000 (Fri, 07 Nov 2008) Log Message: ----------- FEATURE: added -all option to warc-indexer command line tool, causing the tool to output records for request and metadata records as well as duplicate, capture, and dns records. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-11-07 22:35:24 UTC (rev 2638) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-11-07 22:38:48 UTC (rev 2639) @@ -2,7 +2,7 @@ import java.io.File; import java.io.IOException; -//import java.util.logging.Logger; +import java.util.logging.Logger; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpParser; @@ -33,14 +33,23 @@ */ public class WARCRecordToSearchResultAdapter implements Adapter<WARCRecord,CaptureSearchResult>{ + private static final Logger LOGGER = + Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName()); private final static String DEFAULT_VALUE = "-"; -// private static final Logger LOGGER = Logger.getLogger( -// WARCRecordToSearchResultAdapter.class.getName()); - private UrlCanonicalizer canonicalizer = null; + + private boolean processAll = false; + public boolean isProcessAll() { + return processAll; + } + + public void setProcessAll(boolean processAll) { + this.processAll = processAll; + } + public WARCRecordToSearchResultAdapter() { canonicalizer = new AggressiveUrlCanonicalizer(); } @@ -75,12 +84,19 @@ return output.toString(); } - private static String transformHTTPMime(final String input) { + private static String escapeSpaces(final String input) { + if(input.contains(" ")) { + return input.replace(" ", "%20"); + } + return input; + } + + private static String transformHTTPMime(String input) { int semiIdx = input.indexOf(";"); if(semiIdx > 0) { - return input.substring(0,semiIdx).trim(); + return escapeSpaces(input.substring(0,semiIdx).trim()); } - return input.trim(); + return escapeSpaces(input.trim()); } private String transformWarcFilename(String readerIdentifier) { @@ -148,16 +164,21 @@ return result; } - private CaptureSearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) + private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header, + WARCRecord rec, String mime) throws IOException { CaptureSearchResult result = getBlankSearchResult(); result.setCaptureTimestamp(transformDate(header.getDate())); + result.setFile(transformWarcFilename(header.getReaderIdentifier())); + result.setOffset(header.getOffset()); result.setDigest(transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); addUrlDataToSearchResult(result,header.getUrl()); + + result.setMimeType(mime); return result; } @@ -243,7 +264,7 @@ } return result; } - + private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { CaptureSearchResult result = null; @@ -257,7 +278,17 @@ result = adaptResponse(header,rec); } } else if(type.equals(WARCConstants.REVISIT)) { - result = adaptRevisit(header,rec); + result = adaptGeneric(header,rec,"warc/revisit"); + } else if(type.equals(WARCConstants.REQUEST)) { + if(processAll) { + result = adaptGeneric(header,rec,"warc/request"); + } + } else if(type.equals(WARCConstants.METADATA)) { + if(processAll) { + result = adaptGeneric(header,rec,"warc/metadata"); + } + } else { + LOGGER.info("Skipping record type : " + type); } return result; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-11-07 22:35:24 UTC (rev 2638) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-11-07 22:38:48 UTC (rev 2639) @@ -26,9 +26,19 @@ public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; private UrlCanonicalizer canonicalizer = null; + private boolean processAll = false; public WarcIndexer() { canonicalizer = new AggressiveUrlCanonicalizer(); } + + public boolean isProcessAll() { + return processAll; + } + + public void setProcessAll(boolean processAll) { + this.processAll = processAll; + } + /** * @param warc @@ -61,6 +71,7 @@ WARCRecordToSearchResultAdapter adapter2 = new WARCRecordToSearchResultAdapter(); adapter2.setCanonicalizer(canonicalizer); + adapter2.setProcessAll(processAll); ArchiveReaderCloseableIterator itr1 = new ArchiveReaderCloseableIterator(reader,reader.iterator()); @@ -82,11 +93,12 @@ private static void USAGE() { System.err.println("USAGE:"); System.err.println(""); - System.err.println("warc-indexer [-identity] WARCFILE"); - System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); + System.err.println("warc-indexer [-identity] [-all] WARCFILE"); + System.err.println("warc-indexer [-identity] [-all] WARCFILE CDXFILE"); System.err.println(""); System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); System.err.println("With -identity, perform no url canonicalization."); + System.err.println("With -all, output request and metadata records."); System.exit(1); } @@ -96,8 +108,14 @@ public static void main(String[] args) { WarcIndexer indexer = new WarcIndexer(); int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + while(args[idx] != null) { + if(args[idx].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + } else if(args[idx].equals("-all")) { + indexer.setProcessAll(true); + } else { + break; + } idx++; } File arc = new File(args[idx]); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 01:49:43
|
Revision: 2885 http://archive-access.svn.sourceforge.net/archive-access/?rev=2885&view=rev Author: bradtofel Date: 2009-11-06 01:49:32 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: moved main() from ArcIndexer and WarcIndexer into IndexWorker Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-11-06 01:42:28 UTC (rev 2884) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -25,9 +25,7 @@ package org.archive.wayback.resourcestore.indexer; import java.io.File; -import java.io.PrintWriter; import java.io.IOException; -import java.util.Iterator; import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCReader; @@ -35,12 +33,10 @@ import org.archive.io.arc.ARCRecord; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** * Transforms an ARC file into Iterator<CaptureSearchResult>. @@ -50,10 +46,6 @@ */ public class ArcIndexer { - /** - * CDX Header line for these fields. not very configurable.. - */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; private UrlCanonicalizer canonicalizer = null; public ArcIndexer() { @@ -113,51 +105,6 @@ this.canonicalizer = canonicalizer; } - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("arc-indexer [-identity] ARCFILE"); - System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); - System.err.println("With -identity, perform no url canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - ArcIndexer indexer = new ArcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if(args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if(args.length == (idx + 1)) { - pw = new PrintWriter(args[idx]); - } else { - USAGE(); - } - Iterator<CaptureSearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while(lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - private class ArchiveRecordToARCRecordAdapter implements Adapter<ArchiveRecord,ARCRecord> { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2009-11-06 01:42:28 UTC (rev 2884) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -24,16 +24,23 @@ */ package org.archive.wayback.resourcestore.indexer; +import java.io.FileNotFoundException; import java.io.IOException; +import java.io.PrintWriter; +import java.util.Iterator; import java.util.logging.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.cdx.CDXFormatIndex; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXFormatAdapter; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; import org.archive.wayback.resourceindex.updater.IndexClient; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import org.archive.wayback.util.CloseableIterator; -//import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** @@ -112,6 +119,7 @@ } } catch(IOException e) { LOGGER.severe("FAILED to index or upload (" + name + ")"); + e.printStackTrace(); } } return worked; @@ -133,7 +141,86 @@ } return itr; } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("cdx-indexer [-format FORMAT|-identity] FILE"); + System.err.println("cdx-indexer [-format FORMAT|-identity] FILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index from ARC or WARC file"); + System.err.println("FILE at CDXFILE or to STDOUT."); + System.err.println("With -identity, perform no url canonicalization."); + System.err.println("With -format, output CDX in format FORMAT."); + System.exit(1); + } + /** + * @param args + */ + public static void main(String[] args) { + String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC; + PrintWriter pw = new PrintWriter(System.out); + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + boolean setFormat = false; + boolean isIdentity = false; + String path = null; + for(int idx = 0; idx < args.length; idx++) { + if(args[idx].equals("-identity")) { + canonicalizer = new IdentityUrlCanonicalizer(); + isIdentity = true; + } else if(args[idx].equals("-format")) { + idx++; + if(idx >= args.length) { + USAGE(); + } + cdxSpec = args[idx]; + setFormat = true; + } else { + // either input filename: + if(path == null) { + path = args[idx]; + } else { + // or if that's already been specified, then target file: + if(idx+1 != args.length){ + USAGE(); + } + try { + pw = new PrintWriter(args[idx]); + } catch (FileNotFoundException e) { + e.printStackTrace(); + System.exit(1); + } + break; + } + } + } + if(!setFormat && isIdentity) { + cdxSpec = cdxSpec.replace(" N ", " a "); + } + IndexWorker worker = new IndexWorker(); + worker.canonicalizer = canonicalizer; + worker.interval = 0; + worker.init(); + try { + CloseableIterator<CaptureSearchResult> itr = worker.indexFile(path); + CDXFormat cdxFormat = new CDXFormat(cdxSpec); + Iterator<String> lines = + SearchResultToCDXFormatAdapter.adapt(itr, cdxFormat); + pw.println(cdxSpec); + while(lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } catch (CDXFormatException e) { + e.printStackTrace(); + System.exit(1); + } + + } private class WorkerThread extends Thread { private long runInterval = 120000; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-11-06 01:42:28 UTC (rev 2884) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -2,8 +2,6 @@ import java.io.File; import java.io.IOException; -import java.io.PrintWriter; -import java.util.Iterator; import org.archive.io.ArchiveRecord; import org.archive.io.warc.WARCReader; @@ -11,20 +9,13 @@ import org.archive.io.warc.WARCRecord; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; public class WarcIndexer { - /** - * CDX Header line for these fields. not very configurable.. - */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - private UrlCanonicalizer canonicalizer = null; private boolean processAll = false; public WarcIndexer() { @@ -89,60 +80,7 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("warc-indexer [-identity] [-all] WARCFILE"); - System.err.println("warc-indexer [-identity] [-all] WARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); - System.err.println("With -identity, perform no url canonicalization."); - System.err.println("With -all, output request and metadata records."); - System.exit(1); - } - /** - * @param args - */ - public static void main(String[] args) { - WarcIndexer indexer = new WarcIndexer(); - int idx = 0; - while(args[idx] != null) { - if(args[idx].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - } else if(args[idx].equals("-all")) { - indexer.setProcessAll(true); - } else { - break; - } - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if (args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if (args.length == (idx+1)) { - pw = new PrintWriter(args[1]); - } else { - USAGE(); - } - Iterator<CaptureSearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while (lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - private class ArchiveRecordToWARCRecordAdapter implements Adapter<ArchiveRecord, WARCRecord> { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 01:53:33
|
Revision: 2887 http://archive-access.svn.sourceforge.net/archive-access/?rev=2887&view=rev Author: bradtofel Date: 2009-11-06 01:53:23 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: Moved common HTTP header parsing code into HTTPRecordAnnotater FEATURE: HTML content is now parsed using the SAX parser, to search for META robots tags FEATURE: Now HTTP headers are inspected for Robot related instructions Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2009-11-06 01:50:20 UTC (rev 2886) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -36,7 +36,6 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.url.IdentityUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; /** * @@ -50,13 +49,14 @@ // private static final Logger LOGGER = Logger.getLogger( // ARCRecordToSearchResultAdapter.class.getName()); + private HTTPRecordAnnotater annotater = null; private UrlCanonicalizer canonicalizer = null; public ARCRecordToSearchResultAdapter() { canonicalizer = new IdentityUrlCanonicalizer(); + annotater = new HTTPRecordAnnotater(); } -// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) -// throws IOException, ParseException { + /* (non-Javadoc) * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) */ @@ -68,7 +68,7 @@ return null; } } - + private CaptureSearchResult adaptInner(ARCRecord rec) throws IOException { rec.close(); ARCRecordMetaData meta = rec.getMetaData(); @@ -84,12 +84,14 @@ // initialize with default HTTP code... result.setHttpCode("-"); + result.setRedirectUrl("-"); result.setDigest(rec.getDigestStr()); - result.setMimeType(meta.getMimetype()); result.setCaptureTimestamp(meta.getDate()); - String uriStr = meta.getUrl(); + result.setOriginalUrl(uriStr); + + if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { // skip filedesc record altogether... return null; @@ -97,49 +99,20 @@ if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { // skip URL + HTTP header processing for dns records... - result.setOriginalUrl(uriStr); - result.setRedirectUrl("-"); result.setUrlKey(uriStr); - + result.setMimeType("text/dns"); + result.setEndOffset(rec.compressedBytes); + } else { - result.setOriginalUrl(uriStr); + result.setUrlKey(canonicalizer.urlStringToKey(uriStr)); - String statusCode = (meta.getStatusCode() == null) ? "-" : meta .getStatusCode(); result.setHttpCode(statusCode); - String redirectUrl = "-"; Header[] headers = rec.getHttpHeaders(); - if (headers != null) { - - for (int i = 0; i < headers.length; i++) { - if (headers[i].getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = headers[i].getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - redirectUrl = UrlOperations.resolveUrl(uriStr, - locationStr); - - break; - } - } - result.setRedirectUrl(redirectUrl); - - String urlKey = canonicalizer.urlStringToKey(meta.getUrl()); - result.setUrlKey(urlKey); - } + annotater.annotateHTTPContent(result, rec, headers, meta.getMimetype()); } return result; } @@ -149,4 +122,18 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } + + /** + * @return the annotater + */ + public HTTPRecordAnnotater getAnnotater() { + return annotater; + } + + /** + * @param annotater the annotater to set + */ + public void setAnnotater(HTTPRecordAnnotater annotater) { + this.annotater = annotater; + } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,144 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.htmllex.ContextAwareLexer; +import org.archive.wayback.util.htmllex.ParseEventDelegator; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.url.UrlOperations; +import org.htmlparser.Node; +import org.htmlparser.lexer.Lexer; +import org.htmlparser.lexer.Page; +import org.htmlparser.util.ParserException; + +public class HTTPRecordAnnotater { + private RobotMetaRule rule = null; + private ParseEventDelegator rules = null; + private RobotMetaFlags robotFlags; + private static final Logger LOGGER = + Logger.getLogger(HTTPRecordAnnotater.class.getName()); + + private final static String[] mimes = { + "html" + }; + public HTTPRecordAnnotater() { + rules = new ParseEventDelegator(); + rules.init(); + rule = new RobotMetaRule(); + robotFlags = new RobotMetaFlags(); + rule.setRobotFlags(robotFlags); + rule.visit(rules); + } + public boolean isHTML(String mimeType) { + String mimeLower = mimeType.toLowerCase(); + for(String mime : mimes) { + if(mimeLower.contains(mime)) { + return true; + } + } + return false; + } + + private String escapeSpaces(final String input) { + if(input.contains(" ")) { + return input.replace(" ", "%20"); + } + return input; + } + + public String transformHTTPMime(String input) { + int semiIdx = input.indexOf(";"); + if(semiIdx > 0) { + return escapeSpaces(input.substring(0,semiIdx).trim()); + } + return escapeSpaces(input.trim()); + } + + public void annotateHTTPContent(CaptureSearchResult result, + InputStream is, Header[] headers, String mimeGuess) { + robotFlags.reset(); + String mimeType = null; + if (headers != null) { + + for (Header httpHeader : headers) { + if (httpHeader.getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = httpHeader.getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + result.setRedirectUrl( + UrlOperations.resolveUrl(result.getOriginalUrl(), + locationStr)); + + } else if(httpHeader.getName().toLowerCase().equals("content-type")) { + mimeType = transformHTTPMime(httpHeader.getValue()); + } else if(httpHeader.getName().toLowerCase().equals( + WaybackConstants.X_ROBOTS_HTTP_HEADER)) { + + robotFlags.parse(httpHeader.getValue()); + } + } + } + + // TODO: get the encoding: + String encoding = "utf-8"; + if(mimeType == null) { + // nothing present in the HTTP headers.. Use the WARC field: + mimeType = transformHTTPMime(mimeGuess); + } + result.setMimeType(mimeType); + // Now the sticky part: If it looks like an HTML document, look for + // robot meta tags: + if(isHTML(mimeType)) { + String fileContext = result.getFile() + ":" + result.getOffset(); + annotateHTMLContent(is, encoding, fileContext, result); + } + robotFlags.apply(result); + + } + + public void annotateHTMLContent(InputStream is, String charSet, String fileContext, + CaptureSearchResult result) { + + ParseContext context = new ParseContext(); + + Node node; + try { + ContextAwareLexer lex = new ContextAwareLexer( + new Lexer(new Page(is,charSet)),context); + while((node = lex.nextNode()) != null) { +// System.err.println("\nDEBUG-Node:js("+context.isInJS()+")css("+context.isInCSS()+"):"); +// System.err.println("-------------------/START"); +// System.err.println(node.toHtml(true)); +// System.err.println("-------------------/END"); + rules.handleNode(context, node); + } + rules.handleParseComplete(context); + } catch (ParserException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } catch (IOException e) { + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,44 @@ +package org.archive.wayback.resourcestore.indexer; + +import org.archive.wayback.core.CaptureSearchResult; + +public class RobotMetaFlags { + private static String NO_NOTHIN_MATCH = "NONE"; + private static String NO_FOLLOW_MATCH = "NOFOLLOW"; + private static String NO_INDEX_MATCH = "NOINDEX"; + private static String NO_ARCHIVE_MATCH = "NOARCHIVE"; + + private boolean noArchive = false; + private boolean noIndex = false; + private boolean noFollow = false; + public void reset() { + noArchive = false; + noIndex = false; + noFollow = false; + } + public void parse(String content) { + if(content == null) { + return; + } + String up = content.replaceAll("-", "").toUpperCase(); + if(up.contains(NO_FOLLOW_MATCH)) { + noFollow = true; + } + if(up.contains(NO_ARCHIVE_MATCH)) { + noArchive = true; + } + if(up.contains(NO_INDEX_MATCH)) { + noIndex = true; + } + if(up.contains(NO_NOTHIN_MATCH)) { + noFollow = true; + noArchive = true; + noIndex = true; + } + } + public void apply(CaptureSearchResult result) { + if(noFollow) result.setRobotNoFollow(); + if(noIndex) result.setRobotNoIndex(); + if(noArchive) result.setRobotNoArchive(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,47 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseEventDelegator; +import org.archive.wayback.util.htmllex.ParseEventDelegatorVisitor; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.OpenTagHandler; +import org.htmlparser.nodes.TagNode; + +public class RobotMetaRule implements ParseEventDelegatorVisitor, OpenTagHandler { + + private RobotMetaFlags robotFlags = null; + + public void visit(ParseEventDelegator rules) { + // register for <META> Start tags: + rules.addOpenTagHandler(this, "META"); + } + + public void handleOpenTagNode(ParseContext context, TagNode node) + throws IOException { + String nameVal = node.getAttribute("name"); + if(nameVal != null) { + if(nameVal.toUpperCase().equals("ROBOTS")) { + String content = node.getAttribute("content"); + if(content != null) { + robotFlags.parse(content); + } + } + } + } + + /** + * @return the robotFlags + */ + public RobotMetaFlags getRobotFlags() { + return robotFlags; + } + + /** + * @param robotFlags the robotFlags to set + */ + public void setRobotFlags(RobotMetaFlags robotFlags) { + this.robotFlags = robotFlags; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2009-11-06 01:50:20 UTC (rev 2886) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -2,23 +2,23 @@ import java.io.File; import java.io.IOException; -import java.util.logging.Logger; +//import java.util.logging.Logger; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpParser; import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.EncodingUtil; +import org.apache.log4j.Logger; import org.archive.io.ArchiveRecordHeader; import org.archive.io.RecoverableIOException; import org.archive.io.arc.ARCConstants; import org.archive.io.warc.WARCConstants; import org.archive.io.warc.WARCRecord; import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** * Adapts certain WARCRecords into SearchResults. DNS and response records are @@ -33,29 +33,23 @@ */ public class WARCRecordToSearchResultAdapter implements Adapter<WARCRecord,CaptureSearchResult>{ + private static final Logger LOGGER = Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName()); private final static String DEFAULT_VALUE = "-"; - private UrlCanonicalizer canonicalizer = null; + private HTTPRecordAnnotater annotater = null; private boolean processAll = false; - public boolean isProcessAll() { - return processAll; - } - - public void setProcessAll(boolean processAll) { - this.processAll = processAll; - } - public WARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); + canonicalizer = new IdentityUrlCanonicalizer(); + annotater = new HTTPRecordAnnotater(); } - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + /* + * This just calls adaptInner, returning null if an Exception is thrown: */ public CaptureSearchResult adapt(WARCRecord rec) { try { @@ -65,121 +59,94 @@ return null; } } - - /* - * Transform input date to 14-digit timestamp: - * 2007-08-29T18:00:26Z => 20070829180026 - */ - private static String transformDate(final String input) { + + private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { - StringBuilder output = new StringBuilder(14); - - output.append(input.substring(0,4)); - output.append(input.substring(5,7)); - output.append(input.substring(8,10)); - output.append(input.substring(11,13)); - output.append(input.substring(14,16)); - output.append(input.substring(17,19)); - - return output.toString(); - } - - private static String escapeSpaces(final String input) { - if(input.contains(" ")) { - return input.replace(" ", "%20"); - } - return input; - } - - private static String transformHTTPMime(String input) { - int semiIdx = input.indexOf(";"); - if(semiIdx > 0) { - return escapeSpaces(input.substring(0,semiIdx).trim()); - } - return escapeSpaces(input.trim()); - } + ArchiveRecordHeader header = rec.getHeader(); - private String transformWarcFilename(String readerIdentifier) { - String warcName = readerIdentifier; - int index = warcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < warcName.length()) { - warcName = warcName.substring(index + 1); + String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); + if(type.equals(WARCConstants.WARCINFO)) { + LOGGER.info("Skipping record type : " + type); + return null; } - return warcName; - } - private String transformDigest(final Object o) { - if(o == null) { - return DEFAULT_VALUE; + CaptureSearchResult result = genericResult(rec); + + if(type.equals(WARCConstants.RESPONSE)) { + String mime = annotater.transformHTTPMime(header.getMimetype()); + if(mime.equals("text/dns")) { + // close to complete reading, then the digest is legit + // TODO: DO we want to use the WARC header digest for this? + rec.close(); + result.setDigest(transformWARCDigest(rec.getDigestStr())); + result.setMimeType(mime); + } else { + result = adaptWARCHTTPResponse(result,rec); + } + } else if(type.equals(WARCConstants.REVISIT)) { + // also set the mime type: + result.setMimeType("warc/revisit"); + + } else if(type.equals(WARCConstants.REQUEST)) { + + if(processAll) { + // also set the mime type: + result.setMimeType("warc/request"); + } else { + result = null; + } + } else if(type.equals(WARCConstants.METADATA)) { + + if(processAll) { + // also set the mime type: + result.setMimeType("warc/metadata"); + } else { + result = null; + } + } else { + LOGGER.info("Skipping record type : " + type); } - String orig = o.toString(); - if(orig.startsWith("sha1:")) { - return orig.substring(5); - } - return orig; + + return result; } - private CaptureSearchResult getBlankSearchResult() { + // ALL HELPER METHODS BELOW: + + /* + * Extract all common WARC fields into a CaptureSearchResult. This is the + * same for all WARC record types: + * + * file, offset, timestamp, digest, urlKey, originalUrl + */ + private CaptureSearchResult genericResult(WARCRecord rec) { + CaptureSearchResult result = new CaptureSearchResult(); - result.setUrlKey(DEFAULT_VALUE); - result.setOriginalUrl(DEFAULT_VALUE); - result.setCaptureTimestamp(DEFAULT_VALUE); - result.setDigest(DEFAULT_VALUE); result.setMimeType(DEFAULT_VALUE); result.setHttpCode(DEFAULT_VALUE); result.setRedirectUrl(DEFAULT_VALUE); - result.setFile(DEFAULT_VALUE); - result.setOffset(0); - return result; - } - - private void addUrlDataToSearchResult(CaptureSearchResult result, String urlStr) - throws IOException { - result.setOriginalUrl(urlStr); - String urlKey = canonicalizer.urlStringToKey(urlStr); - result.setUrlKey(urlKey); - } + ArchiveRecordHeader header = rec.getHeader(); - private CaptureSearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - CaptureSearchResult result = getBlankSearchResult(); - - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); + String file = transformWARCFilename(header.getReaderIdentifier()); + long offset = header.getOffset(); - String uriStr = header.getUrl(); - - result.setMimeType(header.getMimetype()); - - result.setOriginalUrl(uriStr); - result.setUrlKey(uriStr); - - rec.close(); - result.setDigest(rec.getDigestStr()); - - return result; - } - - private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header, - WARCRecord rec, String mime) - throws IOException { - - CaptureSearchResult result = getBlankSearchResult(); - - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); - result.setDigest(transformDigest(header.getHeaderValue( + result.setCaptureTimestamp(transformWARCDate(header.getDate())); + result.setFile(file); + result.setOffset(offset); + result.setDigest(transformWARCDigest(header.getHeaderValue( WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - addUrlDataToSearchResult(result,header.getUrl()); - - result.setMimeType(mime); - + String origUrl = header.getUrl(); + result.setOriginalUrl(origUrl); + try { + String urlKey = canonicalizer.urlStringToKey(origUrl); + result.setUrlKey(urlKey); + } catch (URIException e) { + LOGGER.warn("FAILED canonicalize(" + origUrl + "):" + + file + " " + offset); + result.setUrlKey(origUrl); + } return result; } @@ -200,19 +167,55 @@ } return count; } - - private CaptureSearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - CaptureSearchResult result = getBlankSearchResult(); + private String transformWARCFilename(String readerIdentifier) { + String warcName = readerIdentifier; + int index = warcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < warcName.length()) { + warcName = warcName.substring(index + 1); + } + return warcName; + } - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); + private String transformWARCDigest(final Object o) { + if(o == null) { + return DEFAULT_VALUE; + } + String orig = o.toString(); + if(orig.startsWith("sha1:")) { + return orig.substring(5); + } + return orig; + } + + /* + * Transform input date to 14-digit timestamp: + * 2007-08-29T18:00:26Z => 20070829180026 + */ + private static String transformWARCDate(final String input) { - String origUrl = header.getUrl(); - addUrlDataToSearchResult(result,origUrl); + StringBuilder output = new StringBuilder(14); + + output.append(input.substring(0,4)); + output.append(input.substring(5,7)); + output.append(input.substring(8,10)); + output.append(input.substring(11,13)); + output.append(input.substring(14,16)); + output.append(input.substring(17,19)); + + return output.toString(); + } + /* + * Currently the WARCReader doesn't parse HTTP headers. This method parses + * them then calls the common ARC/WARC shared record parsing code, which + * addresses HTTP headers, and possibly even parses HTML content to look + * for Robot Meta tags. + */ + private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, + WARCRecord rec) throws IOException { + + ArchiveRecordHeader header = rec.getHeader(); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. @@ -234,66 +237,13 @@ Header[] headers = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); - rec.close(); - result.setDigest(transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - if (headers != null) { - - for (Header httpHeader : headers) { - if (httpHeader.getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = httpHeader.getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - result.setRedirectUrl( - UrlOperations.resolveUrl(origUrl, locationStr)); - } else if(httpHeader.getName().toLowerCase().equals("content-type")) { - result.setMimeType(transformHTTPMime(httpHeader.getValue())); - } - } - } - return result; - } - - private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { - CaptureSearchResult result = null; - ArchiveRecordHeader header = rec.getHeader(); - String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); - if(type.equals(WARCConstants.RESPONSE)) { - String mime = header.getMimetype(); - if(mime.equals("text/dns")) { - result = adaptDNS(header,rec); - } else { - result = adaptResponse(header,rec); - } - } else if(type.equals(WARCConstants.REVISIT)) { - result = adaptGeneric(header,rec,"warc/revisit"); - } else if(type.equals(WARCConstants.REQUEST)) { - if(processAll) { - result = adaptGeneric(header,rec,"warc/request"); - } - } else if(type.equals(WARCConstants.METADATA)) { - if(processAll) { - result = adaptGeneric(header,rec,"warc/metadata"); - } - } else { - LOGGER.info("Skipping record type : " + type); - } + annotater.annotateHTTPContent(result,rec,headers,header.getMimetype()); return result; } + public UrlCanonicalizer getCanonicalizer() { return canonicalizer; } @@ -301,4 +251,25 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } + + public boolean isProcessAll() { + return processAll; + } + + public void setProcessAll(boolean processAll) { + this.processAll = processAll; + } + /** + * @return the annotater + */ + public HTTPRecordAnnotater getAnnotater() { + return annotater; + } + + /** + * @param annotater the annotater to set + */ + public void setAnnotater(HTTPRecordAnnotater annotater) { + this.annotater = annotater; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-12-22 05:16:06
|
Revision: 2941 http://archive-access.svn.sourceforge.net/archive-access/?rev=2941&view=rev Author: bradtofel Date: 2009-12-22 05:15:56 +0000 (Tue, 22 Dec 2009) Log Message: ----------- Sending File not String to ArchiveReaderFactory.get() methods Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-12-18 00:34:47 UTC (rev 2940) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-12-22 05:15:56 UTC (rev 2941) @@ -69,7 +69,12 @@ */ public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl) throws IOException { - return iterator(ARCReaderFactory.get(pathOrUrl)); + File f = new File(pathOrUrl); + if(f.isFile()) { + return iterator(ARCReaderFactory.get(f)); + } else { + return iterator(ARCReaderFactory.get(pathOrUrl)); + } } /** Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-12-18 00:34:47 UTC (rev 2940) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-12-22 05:15:56 UTC (rev 2941) @@ -71,7 +71,12 @@ */ public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl) throws IOException { - return iterator(WARCReaderFactory.get(pathOrUrl)); + File f = new File(pathOrUrl); + if(f.isFile()) { + return iterator(WARCReaderFactory.get(f)); + } else { + return iterator(WARCReaderFactory.get(pathOrUrl)); + } } /** * @param arc This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 14:36:52
|
Revision: 3394 http://archive-access.svn.sourceforge.net/archive-access/?rev=3394&view=rev Author: bradtofel Date: 2011-02-06 14:36:45 +0000 (Sun, 06 Feb 2011) Log Message: ----------- Now include version info in filedesc and warcinfo records Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2011-02-06 14:35:50 UTC (rev 3393) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2011-02-06 14:36:45 UTC (rev 3394) @@ -42,7 +42,8 @@ // private static final Logger LOGGER = Logger.getLogger( // ARCRecordToSearchResultAdapter.class.getName()); - + private static final String VERSION = "0.1.0"; + private static final String ARC_FILEDESC_VERSION = "arc/filedesc" + VERSION; private HTTPRecordAnnotater annotater = null; private UrlCanonicalizer canonicalizer = null; @@ -88,10 +89,8 @@ if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { - // skip filedesc record altogether... - return null; - } - if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { + result.setMimeType(ARC_FILEDESC_VERSION); + } else if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { // skip URL + HTTP header processing for dns records... result.setUrlKey(uriStr); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2011-02-06 14:35:50 UTC (rev 3393) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2011-02-06 14:36:45 UTC (rev 3394) @@ -54,6 +54,10 @@ private static final Logger LOGGER = Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName()); + + private static final String VERSION = "0.1.0"; + private static final String WARC_FILEDESC_VERSION = + "warc/warcinfo" + VERSION; private final static String DEFAULT_VALUE = "-"; private UrlCanonicalizer canonicalizer = null; @@ -126,7 +130,7 @@ } } else if(type.equals(WARCConstants.WARCINFO)) { - result.setMimeType("warc/warcinfo"); + result.setMimeType(WARC_FILEDESC_VERSION); } else { LOGGER.info("Skipping record type : " + type); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |