Revision: 2084
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2084&view=rev
Author:   bradtofel
Date:     2007-11-27 18:06:27 -0800 (Tue, 27 Nov 2007)

Log Message:
-----------
INITIAL REV: class which adapts (some) WARCRecords into SearchResult objects.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java	(rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java	2007-11-28 02:06:27 UTC (rev 2084)
@@ -0,0 +1,302 @@
+package org.archive.wayback.resourcestore;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpParser;
+import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.URIException;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.RecoverableIOException;
+import org.archive.io.arc.ARCConstants;
+import org.archive.io.warc.WARCConstants;
+import org.archive.io.warc.WARCRecord;
+import org.archive.net.UURI;
+import org.archive.net.UURIFactory;
+import org.archive.wayback.WaybackConstants;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.UrlCanonicalizer;
+
+/**
+ * Adapts certain WARCRecords into SearchResults. DNS and response records are
+ * mostly straightforward, but SearchResult objects generated from revisit
+ * records contain lots of "placeholder" fields, which are expected to be
+ * understood by later processes traversing a stream of SearchResult objects.
+ *
+ * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class WARCRecordToSearchResultAdapter
+implements Adapter<WARCRecord,SearchResult>{
+
+    private final static String DEFAULT_VALUE = "-";
+    private final static String SEARCH_FIELDS[] = {
+        WaybackConstants.RESULT_URL,
+        WaybackConstants.RESULT_URL_KEY,
+        WaybackConstants.RESULT_ORIG_HOST,
+        WaybackConstants.RESULT_CAPTURE_DATE,
+        WaybackConstants.RESULT_MD5_DIGEST,
+        WaybackConstants.RESULT_MIME_TYPE,
+        WaybackConstants.RESULT_HTTP_CODE,
+        WaybackConstants.RESULT_REDIRECT_URL,
+        WaybackConstants.RESULT_ARC_FILE,
+        WaybackConstants.RESULT_OFFSET,
+    };
+
+    private static final Logger LOGGER = Logger.getLogger(
+            WARCRecordToSearchResultAdapter.class.getName());
+
+    // TODO: make this configurable based on the ResourceIndex
+    private static UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+
+    /* (non-Javadoc)
+     * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+     */
+    public SearchResult adapt(WARCRecord rec) {
+        try {
+            return adaptInner(rec);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    /*
+     * Transform input date to 14-digit timestamp:
+     * 2007-08-29T18:00:26Z => 20070829180026
+     */
+    private static String transformDate(final String input) {
+
+        StringBuilder output = new StringBuilder(14);
+
+        output.append(input.substring(0,4));
+        output.append(input.substring(5,7));
+        output.append(input.substring(8,10));
+        output.append(input.substring(11,13));
+        output.append(input.substring(14,16));
+        output.append(input.substring(17,19));
+
+        return output.toString();
+    }
+
+    private static String transformHTTPMime(final String input) {
+        int semiIdx = input.indexOf(";");
+        if(semiIdx > 0) {
+            return input.substring(0,semiIdx).trim();
+        }
+        return input.trim();
+    }
+
+    private String transformWarcFilename(String readerIdentifier) {
+        String warcName = readerIdentifier;
+        int index = warcName.lastIndexOf(File.separator);
+        if (index > 0 && (index + 1) < warcName.length()) {
+            warcName = warcName.substring(index + 1);
+        }
+        return warcName;
+    }
+
+    private String transformDigest(final Object o) {
+        if(o == null) {
+            return DEFAULT_VALUE;
+        }
+        String orig = o.toString();
+        if(orig.startsWith("sha1:")) {
+            return orig.substring(5);
+        }
+        return orig;
+    }
+
+    private SearchResult getBlankSearchResult() {
+        SearchResult result = new SearchResult();
+        for(String field : SEARCH_FIELDS) {
+            result.put(field, DEFAULT_VALUE);
+        }
+        return result;
+    }
+
+    private void addUrlDataToSearchResult(SearchResult result, String urlStr)
+            throws IOException {
+
+        result.put(WaybackConstants.RESULT_URL, urlStr);
+        result.put(WaybackConstants.RESULT_URL_KEY, urlStr);
+
+
+        UURI uri = UURIFactory.getInstance(urlStr);
+        String uriHost = uri.getHost();
+        if (uriHost == null) {
+
+            LOGGER.info("No host in " + urlStr);
+
+        } else {
+
+            result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
+        }
+
+        String urlKey = canonicalizer.urlStringToKey(urlStr);
+        result.put(WaybackConstants.RESULT_URL_KEY, urlKey);
+    }
+
+    private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec)
+            throws IOException {
+
+        SearchResult result = getBlankSearchResult();
+
+        result.put(WaybackConstants.RESULT_CAPTURE_DATE,
+                transformDate(header.getDate()));
+        result.put(WaybackConstants.RESULT_ARC_FILE,
+                transformWarcFilename(header.getReaderIdentifier()));
+        result.put(WaybackConstants.RESULT_OFFSET,
+                String.valueOf(header.getOffset()));
+
+        String uriStr = header.getUrl();
+
+        String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
+                .length());
+        result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype());
+
+        result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
+        result.put(WaybackConstants.RESULT_URL, uriStr);
+        result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
+
+        rec.close();
+        result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
+
+        return result;
+    }
+
+    private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec)
+            throws IOException {
+
+        SearchResult result = getBlankSearchResult();
+
+        result.put(WaybackConstants.RESULT_CAPTURE_DATE,
+                transformDate(header.getDate()));
+        result.put(WaybackConstants.RESULT_MD5_DIGEST,
+                transformDigest(header.getHeaderValue(
+                        WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
+
+        addUrlDataToSearchResult(result,header.getUrl());
+
+        return result;
+    }
+
+    /**
+     * borrowed(copied) from org.archive.io.arc.ARCRecord...
+     *
+     * @param bytes Array of bytes to examine for an EOL.
+     * @return Count of end-of-line characters or zero if none.
+     */
+    private int getEolCharsCount(byte [] bytes) {
+        int count = 0;
+        if (bytes != null && bytes.length >=1 &&
+                bytes[bytes.length - 1] == '\n') {
+            count++;
+            if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
+                count++;
+            }
+        }
+        return count;
+    }
+
+    private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec)
+            throws IOException {
+
+        SearchResult result = getBlankSearchResult();
+
+        result.put(WaybackConstants.RESULT_CAPTURE_DATE,
+                transformDate(header.getDate()));
+        result.put(WaybackConstants.RESULT_ARC_FILE,
+                transformWarcFilename(header.getReaderIdentifier()));
+        result.put(WaybackConstants.RESULT_OFFSET,
+                String.valueOf(header.getOffset()));
+
+        String origUrl = header.getUrl();
+        addUrlDataToSearchResult(result,origUrl);
+
+        // need to parse the documents HTTP message and headers here: WARCReader
+        // does not implement this... yet..
+
+        byte [] statusBytes = HttpParser.readRawLine(rec);
+        int eolCharCount = getEolCharsCount(statusBytes);
+        if (eolCharCount <= 0) {
+            throw new RecoverableIOException("Failed to read http status where one " +
+                    " was expected: " + new String(statusBytes));
+        }
+        String statusLine = EncodingUtil.getString(statusBytes, 0,
+                statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+        if ((statusLine == null) ||
+                !StatusLine.startsWithHTTP(statusLine)) {
+            throw new RecoverableIOException("Failed parse of http status line.");
+        }
+        StatusLine status = new StatusLine(statusLine);
+        result.put(WaybackConstants.RESULT_HTTP_CODE,
+                String.valueOf(status.getStatusCode()));
+
+        Header[] headers = HttpParser.parseHeaders(rec,
+                ARCConstants.DEFAULT_ENCODING);
+
+        rec.close();
+        result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
+
+        if (headers != null) {
+
+            for (Header httpHeader : headers) {
+                if (httpHeader.getName().equals(
+                        WaybackConstants.LOCATION_HTTP_HEADER)) {
+
+                    String locationStr = httpHeader.getValue();
+                    // TODO: "Location" is supposed to be absolute:
+                    // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
+                    // (section 14.30) but Content-Location can be
+                    // relative.
+                    // is it correct to resolve a relative Location, as
+                    // we are?
+                    // it's also possible to have both in the HTTP
+                    // headers...
+                    // should we prefer one over the other?
+                    // right now, we're ignoring "Content-Location"
+                    try {
+                        UURI uriRedirect = UURIFactory.getInstance(origUrl,
+                                locationStr);
+                        result.put(WaybackConstants.RESULT_REDIRECT_URL,
+                                uriRedirect.getEscapedURI());
+                    } catch (URIException e) {
+                        LOGGER.info("Bad Location: " + locationStr +
+                                " for " + origUrl + " in " +
+                                header.getReaderIdentifier() + " Skipped");
+                    }
+                } else if(httpHeader.getName().toLowerCase().equals("content-type")) {
+                    result.put(WaybackConstants.RESULT_MIME_TYPE,
+                            transformHTTPMime(httpHeader.getValue()));
+                }
+            }
+        }
+        return result;
+    }
+
+    private SearchResult adaptInner(WARCRecord rec) throws IOException {
+
+        SearchResult result = null;
+        ArchiveRecordHeader header = rec.getHeader();
+        String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
+        if(type.equals(WARCConstants.RESPONSE)) {
+            String mime = header.getMimetype();
+            if(mime.equals("text/dns")) {
+                result = adaptDNS(header,rec);
+            } else {
+                result = adaptResponse(header,rec);
+            }
+        } else if(type.equals(WARCConstants.REVISIT)) {
+            result = adaptRevisit(header,rec);
+        }
+
+        return result;
+    }
+}
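
For context, a minimal driver sketch (not part of rev 2084) showing how this adapter might be fed from a WARC file. It assumes the archive-commons WARCReaderFactory/ArchiveReader iteration API of the same era, and that SearchResult exposes a get(String) accessor mirroring the put() calls in the class above; the class name and field choices here are illustrative only.

    // Hypothetical driver, a sketch only -- not part of this commit.
    package org.archive.wayback.resourcestore;

    import java.io.File;
    import java.util.Iterator;

    import org.archive.io.ArchiveReader;
    import org.archive.io.ArchiveRecord;
    import org.archive.io.warc.WARCReaderFactory;
    import org.archive.io.warc.WARCRecord;
    import org.archive.wayback.WaybackConstants;
    import org.archive.wayback.core.SearchResult;

    public class WarcToSearchResultExample {
        public static void main(String[] args) throws Exception {
            WARCRecordToSearchResultAdapter adapter =
                    new WARCRecordToSearchResultAdapter();

            // Assumes WARCReaderFactory.get(File) opens a (possibly gzipped)
            // WARC and iterates its records in file order.
            ArchiveReader reader = WARCReaderFactory.get(new File(args[0]));
            Iterator<ArchiveRecord> itr = reader.iterator();
            while (itr.hasNext()) {
                WARCRecord record = (WARCRecord) itr.next();

                // adapt() returns null for record types the adapter ignores
                // (warcinfo, request, metadata) and when parsing fails.
                SearchResult result = adapter.adapt(record);
                if (result != null) {
                    // Assumes SearchResult.get(String) mirrors the put() calls above.
                    System.out.println(result.get(WaybackConstants.RESULT_URL_KEY)
                            + " " + result.get(WaybackConstants.RESULT_CAPTURE_DATE));
                }
            }
            reader.close();
        }
    }

Because adapt() catches IOException and returns null, a caller like this simply skips damaged records; stricter error handling would need access to adaptInner().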