From: <bra...@us...> - 2009-11-06 01:53:33
|
Revision: 2887 http://archive-access.svn.sourceforge.net/archive-access/?rev=2887&view=rev Author: bradtofel Date: 2009-11-06 01:53:23 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: Moved common HTTP header parsing code into HTTPRecordAnnotater FEATURE: HTML content is now parsed using the SAX parser, to search for META robots tags FEATURE: Now HTTP headers are inspected for Robot related instructions Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2009-11-06 01:50:20 UTC (rev 2886) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -36,7 +36,6 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.url.IdentityUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; /** * @@ -50,13 +49,14 @@ // private static final Logger LOGGER = Logger.getLogger( // ARCRecordToSearchResultAdapter.class.getName()); + private HTTPRecordAnnotater annotater = null; private UrlCanonicalizer canonicalizer = null; public ARCRecordToSearchResultAdapter() { canonicalizer = new IdentityUrlCanonicalizer(); + annotater = new HTTPRecordAnnotater(); } -// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) -// throws IOException, ParseException { + /* (non-Javadoc) * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) */ @@ -68,7 +68,7 @@ return null; } } - + private CaptureSearchResult adaptInner(ARCRecord rec) throws IOException { rec.close(); ARCRecordMetaData meta = rec.getMetaData(); @@ -84,12 +84,14 @@ // initialize with default HTTP code... result.setHttpCode("-"); + result.setRedirectUrl("-"); result.setDigest(rec.getDigestStr()); - result.setMimeType(meta.getMimetype()); result.setCaptureTimestamp(meta.getDate()); - String uriStr = meta.getUrl(); + result.setOriginalUrl(uriStr); + + if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { // skip filedesc record altogether... return null; @@ -97,49 +99,20 @@ if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { // skip URL + HTTP header processing for dns records... - result.setOriginalUrl(uriStr); - result.setRedirectUrl("-"); result.setUrlKey(uriStr); - + result.setMimeType("text/dns"); + result.setEndOffset(rec.compressedBytes); + } else { - result.setOriginalUrl(uriStr); + result.setUrlKey(canonicalizer.urlStringToKey(uriStr)); - String statusCode = (meta.getStatusCode() == null) ? "-" : meta .getStatusCode(); result.setHttpCode(statusCode); - String redirectUrl = "-"; Header[] headers = rec.getHttpHeaders(); - if (headers != null) { - - for (int i = 0; i < headers.length; i++) { - if (headers[i].getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = headers[i].getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - redirectUrl = UrlOperations.resolveUrl(uriStr, - locationStr); - - break; - } - } - result.setRedirectUrl(redirectUrl); - - String urlKey = canonicalizer.urlStringToKey(meta.getUrl()); - result.setUrlKey(urlKey); - } + annotater.annotateHTTPContent(result, rec, headers, meta.getMimetype()); } return result; } @@ -149,4 +122,18 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } + + /** + * @return the annotater + */ + public HTTPRecordAnnotater getAnnotater() { + return annotater; + } + + /** + * @param annotater the annotater to set + */ + public void setAnnotater(HTTPRecordAnnotater annotater) { + this.annotater = annotater; + } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,144 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.htmllex.ContextAwareLexer; +import org.archive.wayback.util.htmllex.ParseEventDelegator; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.url.UrlOperations; +import org.htmlparser.Node; +import org.htmlparser.lexer.Lexer; +import org.htmlparser.lexer.Page; +import org.htmlparser.util.ParserException; + +public class HTTPRecordAnnotater { + private RobotMetaRule rule = null; + private ParseEventDelegator rules = null; + private RobotMetaFlags robotFlags; + private static final Logger LOGGER = + Logger.getLogger(HTTPRecordAnnotater.class.getName()); + + private final static String[] mimes = { + "html" + }; + public HTTPRecordAnnotater() { + rules = new ParseEventDelegator(); + rules.init(); + rule = new RobotMetaRule(); + robotFlags = new RobotMetaFlags(); + rule.setRobotFlags(robotFlags); + rule.visit(rules); + } + public boolean isHTML(String mimeType) { + String mimeLower = mimeType.toLowerCase(); + for(String mime : mimes) { + if(mimeLower.contains(mime)) { + return true; + } + } + return false; + } + + private String escapeSpaces(final String input) { + if(input.contains(" ")) { + return input.replace(" ", "%20"); + } + return input; + } + + public String transformHTTPMime(String input) { + int semiIdx = input.indexOf(";"); + if(semiIdx > 0) { + return escapeSpaces(input.substring(0,semiIdx).trim()); + } + return escapeSpaces(input.trim()); + } + + public void annotateHTTPContent(CaptureSearchResult result, + InputStream is, Header[] headers, String mimeGuess) { + robotFlags.reset(); + String mimeType = null; + if (headers != null) { + + for (Header httpHeader : headers) { + if (httpHeader.getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = httpHeader.getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + result.setRedirectUrl( + UrlOperations.resolveUrl(result.getOriginalUrl(), + locationStr)); + + } else if(httpHeader.getName().toLowerCase().equals("content-type")) { + mimeType = transformHTTPMime(httpHeader.getValue()); + } else if(httpHeader.getName().toLowerCase().equals( + WaybackConstants.X_ROBOTS_HTTP_HEADER)) { + + robotFlags.parse(httpHeader.getValue()); + } + } + } + + // TODO: get the encoding: + String encoding = "utf-8"; + if(mimeType == null) { + // nothing present in the HTTP headers.. Use the WARC field: + mimeType = transformHTTPMime(mimeGuess); + } + result.setMimeType(mimeType); + // Now the sticky part: If it looks like an HTML document, look for + // robot meta tags: + if(isHTML(mimeType)) { + String fileContext = result.getFile() + ":" + result.getOffset(); + annotateHTMLContent(is, encoding, fileContext, result); + } + robotFlags.apply(result); + + } + + public void annotateHTMLContent(InputStream is, String charSet, String fileContext, + CaptureSearchResult result) { + + ParseContext context = new ParseContext(); + + Node node; + try { + ContextAwareLexer lex = new ContextAwareLexer( + new Lexer(new Page(is,charSet)),context); + while((node = lex.nextNode()) != null) { +// System.err.println("\nDEBUG-Node:js("+context.isInJS()+")css("+context.isInCSS()+"):"); +// System.err.println("-------------------/START"); +// System.err.println(node.toHtml(true)); +// System.err.println("-------------------/END"); + rules.handleNode(context, node); + } + rules.handleParseComplete(context); + } catch (ParserException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } catch (IOException e) { + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,44 @@ +package org.archive.wayback.resourcestore.indexer; + +import org.archive.wayback.core.CaptureSearchResult; + +public class RobotMetaFlags { + private static String NO_NOTHIN_MATCH = "NONE"; + private static String NO_FOLLOW_MATCH = "NOFOLLOW"; + private static String NO_INDEX_MATCH = "NOINDEX"; + private static String NO_ARCHIVE_MATCH = "NOARCHIVE"; + + private boolean noArchive = false; + private boolean noIndex = false; + private boolean noFollow = false; + public void reset() { + noArchive = false; + noIndex = false; + noFollow = false; + } + public void parse(String content) { + if(content == null) { + return; + } + String up = content.replaceAll("-", "").toUpperCase(); + if(up.contains(NO_FOLLOW_MATCH)) { + noFollow = true; + } + if(up.contains(NO_ARCHIVE_MATCH)) { + noArchive = true; + } + if(up.contains(NO_INDEX_MATCH)) { + noIndex = true; + } + if(up.contains(NO_NOTHIN_MATCH)) { + noFollow = true; + noArchive = true; + noIndex = true; + } + } + public void apply(CaptureSearchResult result) { + if(noFollow) result.setRobotNoFollow(); + if(noIndex) result.setRobotNoIndex(); + if(noArchive) result.setRobotNoArchive(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,47 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseEventDelegator; +import org.archive.wayback.util.htmllex.ParseEventDelegatorVisitor; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.OpenTagHandler; +import org.htmlparser.nodes.TagNode; + +public class RobotMetaRule implements ParseEventDelegatorVisitor, OpenTagHandler { + + private RobotMetaFlags robotFlags = null; + + public void visit(ParseEventDelegator rules) { + // register for <META> Start tags: + rules.addOpenTagHandler(this, "META"); + } + + public void handleOpenTagNode(ParseContext context, TagNode node) + throws IOException { + String nameVal = node.getAttribute("name"); + if(nameVal != null) { + if(nameVal.toUpperCase().equals("ROBOTS")) { + String content = node.getAttribute("content"); + if(content != null) { + robotFlags.parse(content); + } + } + } + } + + /** + * @return the robotFlags + */ + public RobotMetaFlags getRobotFlags() { + return robotFlags; + } + + /** + * @param robotFlags the robotFlags to set + */ + public void setRobotFlags(RobotMetaFlags robotFlags) { + this.robotFlags = robotFlags; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2009-11-06 01:50:20 UTC (rev 2886) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -2,23 +2,23 @@ import java.io.File; import java.io.IOException; -import java.util.logging.Logger; +//import java.util.logging.Logger; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpParser; import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.EncodingUtil; +import org.apache.log4j.Logger; import org.archive.io.ArchiveRecordHeader; import org.archive.io.RecoverableIOException; import org.archive.io.arc.ARCConstants; import org.archive.io.warc.WARCConstants; import org.archive.io.warc.WARCRecord; import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** * Adapts certain WARCRecords into SearchResults. DNS and response records are @@ -33,29 +33,23 @@ */ public class WARCRecordToSearchResultAdapter implements Adapter<WARCRecord,CaptureSearchResult>{ + private static final Logger LOGGER = Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName()); private final static String DEFAULT_VALUE = "-"; - private UrlCanonicalizer canonicalizer = null; + private HTTPRecordAnnotater annotater = null; private boolean processAll = false; - public boolean isProcessAll() { - return processAll; - } - - public void setProcessAll(boolean processAll) { - this.processAll = processAll; - } - public WARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); + canonicalizer = new IdentityUrlCanonicalizer(); + annotater = new HTTPRecordAnnotater(); } - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + /* + * This just calls adaptInner, returning null if an Exception is thrown: */ public CaptureSearchResult adapt(WARCRecord rec) { try { @@ -65,121 +59,94 @@ return null; } } - - /* - * Transform input date to 14-digit timestamp: - * 2007-08-29T18:00:26Z => 20070829180026 - */ - private static String transformDate(final String input) { + + private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { - StringBuilder output = new StringBuilder(14); - - output.append(input.substring(0,4)); - output.append(input.substring(5,7)); - output.append(input.substring(8,10)); - output.append(input.substring(11,13)); - output.append(input.substring(14,16)); - output.append(input.substring(17,19)); - - return output.toString(); - } - - private static String escapeSpaces(final String input) { - if(input.contains(" ")) { - return input.replace(" ", "%20"); - } - return input; - } - - private static String transformHTTPMime(String input) { - int semiIdx = input.indexOf(";"); - if(semiIdx > 0) { - return escapeSpaces(input.substring(0,semiIdx).trim()); - } - return escapeSpaces(input.trim()); - } + ArchiveRecordHeader header = rec.getHeader(); - private String transformWarcFilename(String readerIdentifier) { - String warcName = readerIdentifier; - int index = warcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < warcName.length()) { - warcName = warcName.substring(index + 1); + String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); + if(type.equals(WARCConstants.WARCINFO)) { + LOGGER.info("Skipping record type : " + type); + return null; } - return warcName; - } - private String transformDigest(final Object o) { - if(o == null) { - return DEFAULT_VALUE; + CaptureSearchResult result = genericResult(rec); + + if(type.equals(WARCConstants.RESPONSE)) { + String mime = annotater.transformHTTPMime(header.getMimetype()); + if(mime.equals("text/dns")) { + // close to complete reading, then the digest is legit + // TODO: DO we want to use the WARC header digest for this? + rec.close(); + result.setDigest(transformWARCDigest(rec.getDigestStr())); + result.setMimeType(mime); + } else { + result = adaptWARCHTTPResponse(result,rec); + } + } else if(type.equals(WARCConstants.REVISIT)) { + // also set the mime type: + result.setMimeType("warc/revisit"); + + } else if(type.equals(WARCConstants.REQUEST)) { + + if(processAll) { + // also set the mime type: + result.setMimeType("warc/request"); + } else { + result = null; + } + } else if(type.equals(WARCConstants.METADATA)) { + + if(processAll) { + // also set the mime type: + result.setMimeType("warc/metadata"); + } else { + result = null; + } + } else { + LOGGER.info("Skipping record type : " + type); } - String orig = o.toString(); - if(orig.startsWith("sha1:")) { - return orig.substring(5); - } - return orig; + + return result; } - private CaptureSearchResult getBlankSearchResult() { + // ALL HELPER METHODS BELOW: + + /* + * Extract all common WARC fields into a CaptureSearchResult. This is the + * same for all WARC record types: + * + * file, offset, timestamp, digest, urlKey, originalUrl + */ + private CaptureSearchResult genericResult(WARCRecord rec) { + CaptureSearchResult result = new CaptureSearchResult(); - result.setUrlKey(DEFAULT_VALUE); - result.setOriginalUrl(DEFAULT_VALUE); - result.setCaptureTimestamp(DEFAULT_VALUE); - result.setDigest(DEFAULT_VALUE); result.setMimeType(DEFAULT_VALUE); result.setHttpCode(DEFAULT_VALUE); result.setRedirectUrl(DEFAULT_VALUE); - result.setFile(DEFAULT_VALUE); - result.setOffset(0); - return result; - } - - private void addUrlDataToSearchResult(CaptureSearchResult result, String urlStr) - throws IOException { - result.setOriginalUrl(urlStr); - String urlKey = canonicalizer.urlStringToKey(urlStr); - result.setUrlKey(urlKey); - } + ArchiveRecordHeader header = rec.getHeader(); - private CaptureSearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - CaptureSearchResult result = getBlankSearchResult(); - - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); + String file = transformWARCFilename(header.getReaderIdentifier()); + long offset = header.getOffset(); - String uriStr = header.getUrl(); - - result.setMimeType(header.getMimetype()); - - result.setOriginalUrl(uriStr); - result.setUrlKey(uriStr); - - rec.close(); - result.setDigest(rec.getDigestStr()); - - return result; - } - - private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header, - WARCRecord rec, String mime) - throws IOException { - - CaptureSearchResult result = getBlankSearchResult(); - - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); - result.setDigest(transformDigest(header.getHeaderValue( + result.setCaptureTimestamp(transformWARCDate(header.getDate())); + result.setFile(file); + result.setOffset(offset); + result.setDigest(transformWARCDigest(header.getHeaderValue( WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - addUrlDataToSearchResult(result,header.getUrl()); - - result.setMimeType(mime); - + String origUrl = header.getUrl(); + result.setOriginalUrl(origUrl); + try { + String urlKey = canonicalizer.urlStringToKey(origUrl); + result.setUrlKey(urlKey); + } catch (URIException e) { + LOGGER.warn("FAILED canonicalize(" + origUrl + "):" + + file + " " + offset); + result.setUrlKey(origUrl); + } return result; } @@ -200,19 +167,55 @@ } return count; } - - private CaptureSearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - CaptureSearchResult result = getBlankSearchResult(); + private String transformWARCFilename(String readerIdentifier) { + String warcName = readerIdentifier; + int index = warcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < warcName.length()) { + warcName = warcName.substring(index + 1); + } + return warcName; + } - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); + private String transformWARCDigest(final Object o) { + if(o == null) { + return DEFAULT_VALUE; + } + String orig = o.toString(); + if(orig.startsWith("sha1:")) { + return orig.substring(5); + } + return orig; + } + + /* + * Transform input date to 14-digit timestamp: + * 2007-08-29T18:00:26Z => 20070829180026 + */ + private static String transformWARCDate(final String input) { - String origUrl = header.getUrl(); - addUrlDataToSearchResult(result,origUrl); + StringBuilder output = new StringBuilder(14); + + output.append(input.substring(0,4)); + output.append(input.substring(5,7)); + output.append(input.substring(8,10)); + output.append(input.substring(11,13)); + output.append(input.substring(14,16)); + output.append(input.substring(17,19)); + + return output.toString(); + } + /* + * Currently the WARCReader doesn't parse HTTP headers. This method parses + * them then calls the common ARC/WARC shared record parsing code, which + * addresses HTTP headers, and possibly even parses HTML content to look + * for Robot Meta tags. + */ + private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, + WARCRecord rec) throws IOException { + + ArchiveRecordHeader header = rec.getHeader(); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. @@ -234,66 +237,13 @@ Header[] headers = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); - rec.close(); - result.setDigest(transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - if (headers != null) { - - for (Header httpHeader : headers) { - if (httpHeader.getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = httpHeader.getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - result.setRedirectUrl( - UrlOperations.resolveUrl(origUrl, locationStr)); - } else if(httpHeader.getName().toLowerCase().equals("content-type")) { - result.setMimeType(transformHTTPMime(httpHeader.getValue())); - } - } - } - return result; - } - - private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { - CaptureSearchResult result = null; - ArchiveRecordHeader header = rec.getHeader(); - String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); - if(type.equals(WARCConstants.RESPONSE)) { - String mime = header.getMimetype(); - if(mime.equals("text/dns")) { - result = adaptDNS(header,rec); - } else { - result = adaptResponse(header,rec); - } - } else if(type.equals(WARCConstants.REVISIT)) { - result = adaptGeneric(header,rec,"warc/revisit"); - } else if(type.equals(WARCConstants.REQUEST)) { - if(processAll) { - result = adaptGeneric(header,rec,"warc/request"); - } - } else if(type.equals(WARCConstants.METADATA)) { - if(processAll) { - result = adaptGeneric(header,rec,"warc/metadata"); - } - } else { - LOGGER.info("Skipping record type : " + type); - } + annotater.annotateHTTPContent(result,rec,headers,header.getMimetype()); return result; } + public UrlCanonicalizer getCanonicalizer() { return canonicalizer; } @@ -301,4 +251,25 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } + + public boolean isProcessAll() { + return processAll; + } + + public void setProcessAll(boolean processAll) { + this.processAll = processAll; + } + /** + * @return the annotater + */ + public HTTPRecordAnnotater getAnnotater() { + return annotater; + } + + /** + * @param annotater the annotater to set + */ + public void setAnnotater(HTTPRecordAnnotater annotater) { + this.annotater = annotater; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |