From: <bra...@us...> - 2008-06-25 01:30:12
|
Revision: 2321 http://archive-access.svn.sourceforge.net/archive-access/?rev=2321&view=rev Author: bradtofel Date: 2008-06-24 18:30:18 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moved indexing related code into indexer package Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2008-06-25 00:32:57 UTC (rev 2320) 
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,173 +0,0 @@ -/* ArcRecordToSearchResultAdapter - * - * $Id$ - * - * Created on 3:27:03 PM Jul 26, 2007. - * - * Copyright (C) 2007 Internet Archive. - * - * This file is part of wayback-core. - * - * wayback-core is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback-core is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback-core; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.URIException; -import org.archive.io.arc.ARCRecord; -import org.archive.io.arc.ARCRecordMetaData; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class ARCRecordToSearchResultAdapter -implements Adapter<ARCRecord,SearchResult>{ - - private static final Logger LOGGER = Logger.getLogger( - ARCRecordToSearchResultAdapter.class.getName()); - - private 
UrlCanonicalizer canonicalizer = null; - - public ARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } -// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) -// throws IOException, ParseException { - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public SearchResult adapt(ARCRecord rec) { - try { - return adaptInner(rec); - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } - - private SearchResult adaptInner(ARCRecord rec) throws IOException { - rec.close(); - ARCRecordMetaData meta = rec.getMetaData(); - - SearchResult result = new SearchResult(); - String arcName = meta.getArc(); - int index = arcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < arcName.length()) { - arcName = arcName.substring(index + 1); - } - result.put(WaybackConstants.RESULT_ARC_FILE, arcName); - result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta - .getOffset())); - - // initialize with default HTTP code... - result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); - - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); - result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); - result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); - - String uriStr = meta.getUrl(); - if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { - // skip filedesc record altogether... - return null; - } - if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { - // skip URL + HTTP header processing for dns records... 
- - String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX - .length()); - result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); - result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); - result.put(WaybackConstants.RESULT_URL, uriStr); - result.put(WaybackConstants.RESULT_URL_KEY, uriStr); - - } else { - - UURI uri = UURIFactory.getInstance(uriStr); - result.put(WaybackConstants.RESULT_URL, uriStr); - - String uriHost = uri.getHost(); - if (uriHost == null) { - LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); - } else { - result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); - - String statusCode = (meta.getStatusCode() == null) ? "-" : meta - .getStatusCode(); - result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); - - String redirectUrl = "-"; - Header[] headers = rec.getHttpHeaders(); - if (headers != null) { - - for (int i = 0; i < headers.length; i++) { - if (headers[i].getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = headers[i].getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? 
- // right now, we're ignoring "Content-Location" - try { - UURI uriRedirect = UURIFactory.getInstance(uri, - locationStr); - redirectUrl = uriRedirect.getEscapedURI(); - - } catch (URIException e) { - LOGGER.info("Bad Location: " + locationStr - + " for " + uriStr + " in " - + meta.getArc() + " Skipped"); - } - break; - } - } - } - result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); - - String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); - result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); - } - - } - return result; - } - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,175 +0,0 @@ -/* ArcIndexer - * - * $Id$ - * - * Created on 2:33:29 PM Oct 11, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.PrintWriter; -import java.io.IOException; -import java.util.Iterator; - -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; - -/** - * Transforms an ARC file into Iterator<SearchResult>. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class ArcIndexer { - - /** - * CDX Header line for these fields. not very configurable.. 
- */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - private UrlCanonicalizer canonicalizer = null; - - public ArcIndexer() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /** - * @param arc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(File arc) - throws IOException { - return iterator(ARCReaderFactory.get(arc)); - } - - /** - * @param pathOrUrl - * @return Iterator of SearchResults for input pathOrUrl - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(String pathOrUrl) - throws IOException { - return iterator(ARCReaderFactory.get(pathOrUrl)); - } - - /** - * @param arcReader - * @return Iterator of SearchResults for input ARCReader - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(ARCReader arcReader) - throws IOException { - arcReader.setParseHttpHeaders(true); - - Adapter<ArchiveRecord,ARCRecord> adapter1 = - new ArchiveRecordToARCRecordAdapter(); - - ARCRecordToSearchResultAdapter adapter2 = - new ARCRecordToSearchResultAdapter(); - adapter2.setCanonicalizer(canonicalizer); - - ArchiveReaderCloseableIterator itr1 = - new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator()); - - CloseableIterator<ARCRecord> itr2 = - new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1); - - return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2); - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("arc-indexer [-identity] ARCFILE"); - System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); - 
System.err.println("With -identity, perform no url canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - ArcIndexer indexer = new ArcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if(args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if(args.length == (idx + 1)) { - pw = new PrintWriter(args[idx]); - } else { - USAGE(); - } - Iterator<SearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while(lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - - private class ArchiveRecordToARCRecordAdapter - implements Adapter<ArchiveRecord,ARCRecord> { - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public ARCRecord adapt(ArchiveRecord o) { - ARCRecord rec = null; - if(o instanceof ARCRecord) { - rec = (ARCRecord) o; - } - return rec; - } - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,29 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.util.Iterator; - -import org.archive.io.ArchiveReader; -import org.archive.io.ArchiveRecord; -import 
org.archive.wayback.util.CloseableIterator; - -public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> { - private ArchiveReader reader = null; - private Iterator<ArchiveRecord> itr = null; - public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) { - this.reader = reader; - this.itr = itr; - } - public boolean hasNext() { - return itr.hasNext(); - } - public ArchiveRecord next() { - return itr.next(); - } - public void remove() { - itr.remove(); - } - public void close() throws IOException { - reader.close(); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,318 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.URIException; -import org.apache.commons.httpclient.util.EncodingUtil; -import org.archive.io.ArchiveRecordHeader; -import org.archive.io.RecoverableIOException; -import org.archive.io.arc.ARCConstants; -import org.archive.io.warc.WARCConstants; -import org.archive.io.warc.WARCRecord; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.SearchResult; -import 
org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; - -/** - * Adapts certain WARCRecords into SearchResults. DNS and response records are - * mostly straightforward, but SearchResult objects generated from revisit - * records contain lots of "placeholder" fields, which are expected to be - * understood by later processes traversing a stream of SearchResult objects. - * - * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class WARCRecordToSearchResultAdapter -implements Adapter<WARCRecord,SearchResult>{ - - private final static String DEFAULT_VALUE = "-"; - private final static String SEARCH_FIELDS[] = { - WaybackConstants.RESULT_URL, - WaybackConstants.RESULT_URL_KEY, - WaybackConstants.RESULT_ORIG_HOST, - WaybackConstants.RESULT_CAPTURE_DATE, - WaybackConstants.RESULT_MD5_DIGEST, - WaybackConstants.RESULT_MIME_TYPE, - WaybackConstants.RESULT_HTTP_CODE, - WaybackConstants.RESULT_REDIRECT_URL, - WaybackConstants.RESULT_ARC_FILE, - WaybackConstants.RESULT_OFFSET, - }; - - private static final Logger LOGGER = Logger.getLogger( - WARCRecordToSearchResultAdapter.class.getName()); - - private UrlCanonicalizer canonicalizer = null; - - public WARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public SearchResult adapt(WARCRecord rec) { - try { - return adaptInner(rec); - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } - - /* - * Transform input date to 14-digit timestamp: - * 2007-08-29T18:00:26Z => 20070829180026 - */ - private static String transformDate(final String input) { - - StringBuilder output = new StringBuilder(14); - - output.append(input.substring(0,4)); - output.append(input.substring(5,7)); - output.append(input.substring(8,10)); - 
output.append(input.substring(11,13)); - output.append(input.substring(14,16)); - output.append(input.substring(17,19)); - - return output.toString(); - } - - private static String transformHTTPMime(final String input) { - int semiIdx = input.indexOf(";"); - if(semiIdx > 0) { - return input.substring(0,semiIdx).trim(); - } - return input.trim(); - } - - private String transformWarcFilename(String readerIdentifier) { - String warcName = readerIdentifier; - int index = warcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < warcName.length()) { - warcName = warcName.substring(index + 1); - } - return warcName; - } - - private String transformDigest(final Object o) { - if(o == null) { - return DEFAULT_VALUE; - } - String orig = o.toString(); - if(orig.startsWith("sha1:")) { - return orig.substring(5); - } - return orig; - } - - private SearchResult getBlankSearchResult() { - SearchResult result = new SearchResult(); - for(String field : SEARCH_FIELDS) { - result.put(field, DEFAULT_VALUE); - } - return result; - } - - private UURI addUrlDataToSearchResult(SearchResult result, String urlStr) - throws IOException { - - result.put(WaybackConstants.RESULT_URL, urlStr); - result.put(WaybackConstants.RESULT_URL_KEY, urlStr); - - - UURI uri = UURIFactory.getInstance(urlStr); - String uriHost = uri.getHost(); - if (uriHost == null) { - - LOGGER.info("No host in " + urlStr); - - } else { - - result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); - } - - String urlKey = canonicalizer.urlStringToKey(urlStr); - result.put(WaybackConstants.RESULT_URL_KEY, urlKey); - - return uri; - } - - private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_ARC_FILE, - transformWarcFilename(header.getReaderIdentifier())); - result.put(WaybackConstants.RESULT_OFFSET, - 
String.valueOf(header.getOffset())); - - String uriStr = header.getUrl(); - - String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX - .length()); - result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype()); - - result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); - result.put(WaybackConstants.RESULT_URL, uriStr); - result.put(WaybackConstants.RESULT_URL_KEY, uriStr); - - rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); - - return result; - } - - private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_MD5_DIGEST, - transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - addUrlDataToSearchResult(result,header.getUrl()); - - return result; - } - - /** - * borrowed(copied) from org.archive.io.arc.ARCRecord... - * - * @param bytes Array of bytes to examine for an EOL. - * @return Count of end-of-line characters or zero if none. 
- */ - private int getEolCharsCount(byte [] bytes) { - int count = 0; - if (bytes != null && bytes.length >=1 && - bytes[bytes.length - 1] == '\n') { - count++; - if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { - count++; - } - } - return count; - } - - private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_ARC_FILE, - transformWarcFilename(header.getReaderIdentifier())); - result.put(WaybackConstants.RESULT_OFFSET, - String.valueOf(header.getOffset())); - - String origUrl = header.getUrl(); - UURI uri = addUrlDataToSearchResult(result,origUrl); - - // need to parse the documents HTTP message and headers here: WARCReader - // does not implement this... yet.. - - byte [] statusBytes = HttpParser.readRawLine(rec); - int eolCharCount = getEolCharsCount(statusBytes); - if (eolCharCount <= 0) { - throw new RecoverableIOException("Failed to read http status where one " + - " was expected: " + new String(statusBytes)); - } - String statusLine = EncodingUtil.getString(statusBytes, 0, - statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); - if ((statusLine == null) || - !StatusLine.startsWithHTTP(statusLine)) { - throw new RecoverableIOException("Failed parse of http status line."); - } - StatusLine status = new StatusLine(statusLine); - result.put(WaybackConstants.RESULT_HTTP_CODE, - String.valueOf(status.getStatusCode())); - - Header[] headers = HttpParser.parseHeaders(rec, - ARCConstants.DEFAULT_ENCODING); - - rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, - transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - if (headers != null) { - - for (Header httpHeader : headers) { - if (httpHeader.getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = 
httpHeader.getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - try { - UURI uriRedirect = UURIFactory.getInstance(uri, - locationStr); - result.put(WaybackConstants.RESULT_REDIRECT_URL, - uriRedirect.getEscapedURI()); - } catch (URIException e) { - LOGGER.info("Bad Location: " + locationStr - + " for " + origUrl + " in " - + header.getReaderIdentifier() + " Skipped"); - } - } else if(httpHeader.getName().toLowerCase().equals("content-type")) { - result.put(WaybackConstants.RESULT_MIME_TYPE, - transformHTTPMime(httpHeader.getValue())); - } - } - } - return result; - } - - private SearchResult adaptInner(WARCRecord rec) throws IOException { - - SearchResult result = null; - ArchiveRecordHeader header = rec.getHeader(); - String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); - if(type.equals(WARCConstants.RESPONSE)) { - String mime = header.getMimetype(); - if(mime.equals("text/dns")) { - result = adaptDNS(header,rec); - } else { - result = adaptResponse(header,rec); - } - } else if(type.equals(WARCConstants.REVISIT)) { - result = adaptRevisit(header,rec); - } - - return result; - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-06-25 
00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,140 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.Iterator; - -import org.archive.io.ArchiveRecord; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCReaderFactory; -import org.archive.io.warc.WARCRecord; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; - -public class WarcIndexer { - - /** - * CDX Header line for these fields. not very configurable.. 
- */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - - private UrlCanonicalizer canonicalizer = null; - public WarcIndexer() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /** - * @param warc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(File warc) - throws IOException { - return iterator(WARCReaderFactory.get(warc)); - } - /** - * @param pathOrUrl - * @return Iterator of SearchResults for input pathOrUrl - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(String pathOrUrl) - throws IOException { - return iterator(WARCReaderFactory.get(pathOrUrl)); - } - /** - * @param arc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(WARCReader reader) - throws IOException { - - Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); - - WARCRecordToSearchResultAdapter adapter2 = - new WARCRecordToSearchResultAdapter(); - adapter2.setCanonicalizer(canonicalizer); - - ArchiveReaderCloseableIterator itr1 = - new ArchiveReaderCloseableIterator(reader,reader.iterator()); - - CloseableIterator<WARCRecord> itr2 = - new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1); - - return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("warc-indexer [-identity] WARCFILE"); - System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); - System.err.println("With -identity, perform no url 
canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - WarcIndexer indexer = new WarcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if (args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if (args.length == (idx+1)) { - pw = new PrintWriter(args[1]); - } else { - USAGE(); - } - Iterator<SearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while (lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - } - } - - private class ArchiveRecordToWARCRecordAdapter implements - Adapter<ArchiveRecord, WARCRecord> { - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public WARCRecord adapt(ArchiveRecord o) { - WARCRecord rec = null; - if (o instanceof WARCRecord) { - rec = (WARCRecord) o; - } - return rec; - } - } -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java (from rev 2138, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,173 @@ +/* ArcRecordToSearchResultAdapter + * + * $Id$ + * + * Created on 3:27:03 PM Jul 26, 2007. 
+ * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback-core. + * + * wayback-core is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback-core is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-core; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.URIException; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ARCRecordToSearchResultAdapter +implements Adapter<ARCRecord,SearchResult>{ + + private static final Logger LOGGER = Logger.getLogger( + ARCRecordToSearchResultAdapter.class.getName()); + + private UrlCanonicalizer canonicalizer = null; + + public ARCRecordToSearchResultAdapter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } +// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) +// throws IOException, ParseException { + /* (non-Javadoc) + * 
@see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public SearchResult adapt(ARCRecord rec) { + try { + return adaptInner(rec); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + private SearchResult adaptInner(ARCRecord rec) throws IOException { + rec.close(); + ARCRecordMetaData meta = rec.getMetaData(); + + SearchResult result = new SearchResult(); + String arcName = meta.getArc(); + int index = arcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < arcName.length()) { + arcName = arcName.substring(index + 1); + } + result.put(WaybackConstants.RESULT_ARC_FILE, arcName); + result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta + .getOffset())); + + // initialize with default HTTP code... + result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); + + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); + result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); + + String uriStr = meta.getUrl(); + if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { + // skip filedesc record altogether... + return null; + } + if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { + // skip URL + HTTP header processing for dns records... + + String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX + .length()); + result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); + result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); + result.put(WaybackConstants.RESULT_URL, uriStr); + result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + + } else { + + UURI uri = UURIFactory.getInstance(uriStr); + result.put(WaybackConstants.RESULT_URL, uriStr); + + String uriHost = uri.getHost(); + if (uriHost == null) { + LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); + } else { + result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); + + String statusCode = (meta.getStatusCode() == null) ? 
"-" : meta + .getStatusCode(); + result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); + + String redirectUrl = "-"; + Header[] headers = rec.getHttpHeaders(); + if (headers != null) { + + for (int i = 0; i < headers.length; i++) { + if (headers[i].getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = headers[i].getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + try { + UURI uriRedirect = UURIFactory.getInstance(uri, + locationStr); + redirectUrl = uriRedirect.getEscapedURI(); + + } catch (URIException e) { + LOGGER.info("Bad Location: " + locationStr + + " for " + uriStr + " in " + + meta.getArc() + " Skipped"); + } + break; + } + } + } + result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); + + String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); + result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); + } + + } + return result; + } + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java (from rev 2280, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321)
@@ -0,0 +1,175 @@
/* ArcIndexer
 *
 * $Id$
 *
 * Created on 2:33:29 PM Oct 11, 2006.
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of Wayback.
 *
 * Wayback is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Wayback is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Wayback; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.wayback.resourcestore.indexer;

import java.io.File;
import java.io.PrintWriter;
import java.io.IOException;
import java.util.Iterator;

import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
import org.archive.wayback.util.AdaptedIterator;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.CloseableIterator;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
import org.archive.wayback.util.url.IdentityUrlCanonicalizer;

/**
 * Transforms an ARC file into Iterator&lt;SearchResult&gt;.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class ArcIndexer {

    /**
     * CDX Header line for these fields. not very configurable..
     */
    public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";

    /** Canonicalizer handed to the record adapter; aggressive by default. */
    private UrlCanonicalizer canonicalizer = null;

    /**
     * Creates an indexer that uses an {@link AggressiveUrlCanonicalizer}
     * until {@link #setCanonicalizer(UrlCanonicalizer)} is called.
     */
    public ArcIndexer() {
        canonicalizer = new AggressiveUrlCanonicalizer();
    }

    /**
     * @param arc ARC file to index
     * @return Iterator of SearchResults for input arc File
     * @throws IOException if the ARC file cannot be opened
     */
    public CloseableIterator<SearchResult> iterator(File arc)
    throws IOException {
        return iterator(ARCReaderFactory.get(arc));
    }

    /**
     * @param pathOrUrl local path or URL of the ARC file to index
     * @return Iterator of SearchResults for input pathOrUrl
     * @throws IOException if the ARC file cannot be opened
     */
    public CloseableIterator<SearchResult> iterator(String pathOrUrl)
    throws IOException {
        return iterator(ARCReaderFactory.get(pathOrUrl));
    }

    /**
     * Chains reader -> ArchiveRecord -> ARCRecord -> SearchResult. Closing
     * the returned iterator is the caller's responsibility.
     *
     * @param arcReader reader to pull records from; HTTP header parsing is
     *        switched on for it
     * @return Iterator of SearchResults for input ARCReader
     * @throws IOException declared for callers; see the factory overloads
     */
    public CloseableIterator<SearchResult> iterator(ARCReader arcReader)
    throws IOException {
        arcReader.setParseHttpHeaders(true);

        Adapter<ArchiveRecord,ARCRecord> adapter1 =
            new ArchiveRecordToARCRecordAdapter();

        ARCRecordToSearchResultAdapter adapter2 =
            new ARCRecordToSearchResultAdapter();
        adapter2.setCanonicalizer(canonicalizer);

        ArchiveReaderCloseableIterator itr1 =
            new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator());

        CloseableIterator<ARCRecord> itr2 =
            new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1);

        return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2);
    }

    /**
     * @return the canonicalizer used to build URL keys
     */
    public UrlCanonicalizer getCanonicalizer() {
        return canonicalizer;
    }

    /**
     * @param canonicalizer the canonicalizer to use for building URL keys
     */
    public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
        this.canonicalizer = canonicalizer;
    }

    /** Prints command line help to STDERR and exits with status 1. */
    private static void USAGE() {
        System.err.println("USAGE:");
        System.err.println("");
        System.err.println("arc-indexer [-identity] ARCFILE");
        System.err.println("arc-indexer [-identity] ARCFILE CDXFILE");
        System.err.println("");
        System.err.println("Create a CDX format index at CDXFILE or to STDOUT.");
        System.err.println("With -identity, perform no url canonicalization.");
        System.exit(1);
    }

    /**
     * Command line entry point: writes one CDX line per indexed record to
     * CDXFILE, or to STDOUT when no CDXFILE is given. Exits with status 1 on
     * bad arguments or on any failure.
     *
     * @param args [-identity] ARCFILE [CDXFILE]
     */
    public static void main(String[] args) {
        ArcIndexer indexer = new ArcIndexer();
        int idx = 0;
        // check the length first: with no arguments args[0] does not exist
        if(args.length > 0 && "-identity".equals(args[0])) {
            indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
            idx++;
        }
        int remaining = args.length - idx;
        if(remaining < 1 || remaining > 2) {
            USAGE();
            return;
        }
        File arc = new File(args[idx]);
        idx++;

        boolean failed = false;
        PrintWriter pw = null;
        CloseableIterator<SearchResult> results = null;
        try {
            if(args.length == idx) {
                // dump to STDOUT:
                pw = new PrintWriter(System.out);
            } else {
                pw = new PrintWriter(args[idx]);
            }
            results = indexer.iterator(arc);
            Iterator<SearchResult> res = results;
            Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res);
            while(lines.hasNext()) {
                pw.println(lines.next());
            }
        } catch (Exception e) {
            e.printStackTrace();
            failed = true;
        } finally {
            // release the ARC reader and flush/close the output even when
            // indexing failed part way through
            if(results != null) {
                try {
                    results.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if(pw != null) {
                pw.close();
            }
        }
        if(failed) {
            System.exit(1);
        }
    }

    /**
     * Narrows ArchiveRecords to ARCRecords; anything that is not an
     * ARCRecord becomes null. Static because it needs no indexer state.
     */
    private static class ArchiveRecordToARCRecordAdapter
    implements Adapter<ArchiveRecord,ARCRecord> {

        /* (non-Javadoc)
         * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
         */
        public ARCRecord adapt(ArchiveRecord o) {
            ARCRecord rec = null;
            if(o instanceof ARCRecord) {
                rec = (ARCRecord) o;
            }
            return rec;
        }
    }
}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java (from rev 2209, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java 2008-06-25 01:30:18 UTC (rev 2321)
@@ -0,0 +1,29 @@
package 
org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; +import java.util.Iterator; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; +import org.archive.wayback.util.CloseableIterator; + +public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> { + private ArchiveReader reader = null; + private Iterator<ArchiveRecord> itr = null; + public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) { + this.reader = reader; + this.itr = itr; + } + public boolean hasNext() { + return itr.hasNext(); + } + public ArchiveRecord next() { + return itr.next(); + } + public void remove() { + itr.remove(); + } + public void close() throws IOException { + reader.close(); + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java (from rev 2138, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,318 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.URIException; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; 
+import org.archive.io.arc.ARCConstants; +import org.archive.io.warc.WARCConstants; +import org.archive.io.warc.WARCRecord; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +/** + * Adapts certain WARCRecords into SearchResults. DNS and response records are + * mostly straightforward, but SearchResult objects generated from revisit + * records contain lots of "placeholder" fields, which are expected to be + * understood by later processes traversing a stream of SearchResult objects. + * + * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class WARCRecordToSearchResultAdapter +implements Adapter<WARCRecord,SearchResult>{ + + private final static String DEFAULT_VALUE = "-"; + private final static String SEARCH_FIELDS[] = { + WaybackConstants.RESULT_URL, + WaybackConstants.RESULT_URL_KEY, + WaybackConstants.RESULT_ORIG_HOST, + WaybackConstants.RESULT_CAPTURE_DATE, + WaybackConstants.RESULT_MD5_DIGEST, + WaybackConstants.RESULT_MIME_TYPE, + WaybackConstants.RESULT_HTTP_CODE, + WaybackConstants.RESULT_REDIRECT_URL, + WaybackConstants.RESULT_ARC_FILE, + WaybackConstants.RESULT_OFFSET, + }; + + private static final Logger LOGGER = Logger.getLogger( + WARCRecordToSearchResultAdapter.class.getName()); + + private UrlCanonicalizer canonicalizer = null; + + public WARCRecordToSearchResultAdapter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public SearchResult adapt(WARCRecord rec) { + try { + return adaptInner(rec); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + /* + * Transform 
input date to 14-digit timestamp: + * 2007-08-29T18:00:26Z => 20070829180026 + */ + private static String transformDate(final String input) { + + StringBuilder output = new StringBuilder(14); + + output.append(input.substring(0,4)); + output.append(input.substring(5,7)); + output.append(input.substring(8,10)); + output.append(input.substring(11,13)); + output.append(input.substring(14,16)); + output.append(input.substring(17,19)); + + return output.toString(); + } + + private static String transformHTTPMime(final String input) { + int semiIdx = input.indexOf(";"); + if(semiIdx > 0) { + return input.substring(0,semiIdx).trim(); + } + return input.trim(); + } + + private String transformWarcFilename(String readerIdentifier) { + String warcName = readerIdentifier; + int index = warcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < warcName.length()) { + warcName = warcName.substring(index + 1); + } + return warcName; + } + + private String transformDigest(final Object o) { + if(o == null) { + return DEFAULT_VALUE; + } + String orig = o.toString(); + if(orig.startsWith("sha1:")) { + return orig.substring(5); + } + return orig; + } + + private SearchResult getBlankSearchResult() { + SearchResult result = new SearchResult(); + for(String field : SEARCH_FIELDS) { + result.put(field, DEFAULT_VALUE); + } + return result; + } + + private UURI addUrlDataToSearchResult(SearchResult result, String urlStr) + throws IOException { + + result.put(WaybackConstants.RESULT_URL, urlStr); + result.put(WaybackConstants.RESULT_URL_KEY, urlStr); + + + UURI uri = UURIFactory.getInstance(urlStr); + String uriHost = uri.getHost(); + if (uriHost == null) { + + LOGGER.info("No host in " + urlStr); + + } else { + + result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); + } + + String urlKey = canonicalizer.urlStringToKey(urlStr); + result.put(WaybackConstants.RESULT_URL_KEY, urlKey); + + return uri; + } + + private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord 
rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_ARC_FILE, + transformWarcFilename(header.getReaderIdentifier())); + result.put(WaybackConstants.RESULT_OFFSET, + String.valueOf(header.getOffset())); + + String uriStr = header.getUrl(); + + String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX + .length()); + result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype()); + + result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); + result.put(WaybackConstants.RESULT_URL, uriStr); + result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + + rec.close(); + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + + return result; + } + + private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_MD5_DIGEST, + transformDigest(header.getHeaderValue( + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + + addUrlDataToSearchResult(result,header.getUrl()); + + return result; + } + + /** + * borrowed(copied) from org.archive.io.arc.ARCRecord... + * + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. 
+ */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_ARC_FILE, + transformWarcFilename(header.getReaderIdentifier())); + result.put(WaybackConstants.RESULT_OFFSET, + String.valueOf(header.getOffset())); + + String origUrl = header.getUrl(); + UURI uri = addUrlDataToSearchResult(result,origUrl); + + // need to parse the documents HTTP message and headers here: WARCReader + // does not implement this... yet.. + + byte [] statusBytes = HttpParser.readRawLine(rec); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException("Failed to read http status where one " + + " was expected: " + new String(statusBytes)); + } + String statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if ((statusLine == null) || + !StatusLine.startsWithHTTP(statusLine)) { + throw new RecoverableIOException("Failed parse of http status line."); + } + StatusLine status = new StatusLine(statusLine); + result.put(WaybackConstants.RESULT_HTTP_CODE, + String.valueOf(status.getStatusCode())); + + Header[] headers = HttpParser.parseHeaders(rec, + ARCConstants.DEFAULT_ENCODING); + + rec.close(); + result.put(WaybackConstants.RESULT_MD5_DIGEST, + transformDigest(header.getHeaderValue( + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + + if (headers != null) { + + for (Header httpHeader : headers) { + if (httpHeader.getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = 
httpHeader.getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + try { + UURI uriRedirect = UURIFactory.getInstance(uri, + locationStr); + result.put(WaybackConstants.RESULT_REDIRECT_URL, + uriRedirect.getEscapedURI()); + } catch (URIException e) { + LOGGER.info("Bad Location: " + locationStr + + " for " + origUrl + " in " + + header.getReaderIdentifier() + " Skipped"); + } + } else if(httpHeader.getName().toLowerCase().equals("content-type")) { + result.put(WaybackConstants.RESULT_MIME_TYPE, + transformHTTPMime(httpHeader.getValue())); + } + } + } + return result; + } + + private SearchResult adaptInner(WARCRecord rec) throws IOException { + + SearchResult result = null; + ArchiveRecordHeader header = rec.getHeader(); + String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); + if(type.equals(WARCConstants.RESPONSE)) { + String mime = header.getMimetype(); + if(mime.equals("text/dns")) { + result = adaptDNS(header,rec); + } else { + result = adaptResponse(header,rec); + } + } else if(type.equals(WARCConstants.REVISIT)) { + result = adaptRevisit(header,rec); + } + + return result; + } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java (from rev 2280, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java) 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,140 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Iterator; + +import org.archive.io.ArchiveRecord; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; + +public class WarcIndexer { + + /** + * CDX Header line for these fields. not very configurable.. 
+ */ + public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; + + private UrlCanonicalizer canonicalizer = null; + public WarcIndexer() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + /** + * @param warc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(File warc) + throws IOException { + return iterator(WARCReaderFactory.get(warc)); + } + /** + * @param pathOrUrl + * @return Iterator of SearchResults for input pathOrUrl + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(String pathOrUrl) + throws IOException { + return iterator(WARCReaderFactory.get(pathOrUrl)); + } + /** + * @param arc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(WARCReader reader) + throws IOException { + + Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); + + WARCRecordToSearchResultAdapter adapter2 = + new WARCRecordToSearchResultAdapter(); + adapter2.setCanonicalizer(canonicalizer); + + ArchiveReaderCloseableIterator itr1 = + new ArchiveReaderCloseableIterator(reader,reader.iterator()); + + CloseableIterator<WARCRecord> itr2 = + new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1); + + return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); + } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("warc-indexer [-identity] WARCFILE"); + System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); + System.err.println("With -identity, perform no url 
canonicalization."); + System.exit(1); + } + + /** + * @param args + */ + public static void main(String[] args) { + WarcIndexer indexer = new WarcIndexer(); + int idx = 0; + if(args[0] != null && args[0].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + idx++; + } + File arc = new File(args[idx]); + idx++; + PrintWriter pw = null; + try { + if (args.length == idx) { + // dump to STDOUT: + pw = new PrintWriter(System.out); + } else if (args.length == (idx+1)) { + pw = new PrintWriter(args[1]); + } else { + USAGE(); + } + Iterator<SearchResult> res = indexer.iterator(arc); + Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); + while (lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private class ArchiveRecordToWARCRecordAdapter implements + Adapter<ArchiveRecord, WARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public WARCRecord adapt(ArchiveRecord o) { + WARCRecord rec = null; + if (o instanceof WARCRecord) { + rec = (WARCRecord) o; + } + return rec; + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |