From: <bra...@us...> - 2008-07-01 23:44:57
|
Revision: 2374 http://archive-access.svn.sourceforge.net/archive-access/?rev=2374&view=rev Author: bradtofel Date: 2008-07-01 16:45:04 -0700 (Tue, 01 Jul 2008) Log Message: ----------- REFACTOR: SearchResult => (Url|Capture)SearchResult this includes use of accessor methods FEATURE: IdentityUrlCanonicalizer is now default! Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2008-07-01 23:44:18 UTC (rev 2373) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2008-07-01 23:45:04 UTC (rev 2374) @@ -26,19 +26,17 @@ import java.io.File; import java.io.IOException; -import java.util.logging.Logger; +//import java.util.logging.Logger; import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.URIException; import org.archive.io.arc.ARCRecord; import org.archive.io.arc.ARCRecordMetaData; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.SearchResult; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; +import org.archive.wayback.util.url.UrlOperations; /** * @@ -47,22 +45,22 @@ * @version $Date$, $Revision$ */ public class ARCRecordToSearchResultAdapter -implements Adapter<ARCRecord,SearchResult>{ +implements Adapter<ARCRecord,CaptureSearchResult>{ - private static final Logger LOGGER = Logger.getLogger( - ARCRecordToSearchResultAdapter.class.getName()); +// private static final Logger LOGGER = Logger.getLogger( +// ARCRecordToSearchResultAdapter.class.getName()); private UrlCanonicalizer canonicalizer = null; public ARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); + canonicalizer = new IdentityUrlCanonicalizer(); } // public static SearchResult arcRecordToSearchResult(final ARCRecord rec) // throws IOException, ParseException { /* (non-Javadoc) * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) */ - public SearchResult adapt(ARCRecord rec) { + public CaptureSearchResult adapt(ARCRecord rec) { try { return adaptInner(rec); } catch (IOException e) { @@ -71,26 +69,25 @@ } } - private SearchResult adaptInner(ARCRecord rec) throws IOException { + private CaptureSearchResult adaptInner(ARCRecord rec) throws IOException { rec.close(); ARCRecordMetaData meta = rec.getMetaData(); - SearchResult result = new SearchResult(); + CaptureSearchResult result = new CaptureSearchResult(); String arcName = meta.getArc(); int index = arcName.lastIndexOf(File.separator); if (index > 0 && (index + 1) < arcName.length()) { arcName = arcName.substring(index + 1); } - result.put(WaybackConstants.RESULT_ARC_FILE, arcName); - result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta - .getOffset())); + result.setFile(arcName); + result.setOffset(meta.getOffset()); // initialize with default HTTP code... - result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); + result.setHttpCode("-"); - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); - result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); - result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); + result.setDigest(rec.getDigestStr()); + result.setMimeType(meta.getMimetype()); + result.setCaptureTimestamp(meta.getDate()); String uriStr = meta.getUrl(); if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { @@ -100,67 +97,49 @@ if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { // skip URL + HTTP header processing for dns records... - String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX - .length()); - result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); - result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); - result.put(WaybackConstants.RESULT_URL, uriStr); - result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + result.setOriginalUrl(uriStr); + result.setRedirectUrl("-"); + result.setUrlKey(uriStr); } else { - UURI uri = UURIFactory.getInstance(uriStr); - result.put(WaybackConstants.RESULT_URL, uriStr); + result.setOriginalUrl(uriStr); - String uriHost = uri.getHost(); - if (uriHost == null) { - LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); - } else { - result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); - String statusCode = (meta.getStatusCode() == null) ? "-" : meta - .getStatusCode(); - result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); - - String redirectUrl = "-"; - Header[] headers = rec.getHttpHeaders(); - if (headers != null) { - - for (int i = 0; i < headers.length; i++) { - if (headers[i].getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { + String statusCode = (meta.getStatusCode() == null) ? "-" : meta + .getStatusCode(); + result.setHttpCode(statusCode); + + String redirectUrl = "-"; + Header[] headers = rec.getHttpHeaders(); + if (headers != null) { + + for (int i = 0; i < headers.length; i++) { + if (headers[i].getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { - String locationStr = headers[i].getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - try { - UURI uriRedirect = UURIFactory.getInstance(uri, - locationStr); - redirectUrl = uriRedirect.getEscapedURI(); - - } catch (URIException e) { - LOGGER.info("Bad Location: " + locationStr - + " for " + uriStr + " in " - + meta.getArc() + " Skipped"); - } - break; - } + String locationStr = headers[i].getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + redirectUrl = UrlOperations.resolveUrl(uriStr, + locationStr); + + break; } } - result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); + result.setRedirectUrl(redirectUrl); - String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); - result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); + String urlKey = canonicalizer.urlStringToKey(meta.getUrl()); + result.setUrlKey(urlKey); } - } return result; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2008-07-01 23:44:18 UTC (rev 2373) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2008-07-01 23:45:04 UTC (rev 2374) @@ -34,7 +34,7 @@ import org.archive.io.arc.ARCReaderFactory; import org.archive.io.arc.ARCRecord; import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.core.SearchResult; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; @@ -43,7 +43,7 @@ import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** - * Transforms an ARC file into Iterator<SearchResult>. + * Transforms an ARC file into Iterator<CaptureSearchResult>. * * @author brad * @version $Date$, $Revision$ @@ -65,7 +65,7 @@ * @return Iterator of SearchResults for input arc File * @throws IOException */ - public CloseableIterator<SearchResult> iterator(File arc) + public CloseableIterator<CaptureSearchResult> iterator(File arc) throws IOException { return iterator(ARCReaderFactory.get(arc)); } @@ -75,7 +75,7 @@ * @return Iterator of SearchResults for input pathOrUrl * @throws IOException */ - public CloseableIterator<SearchResult> iterator(String pathOrUrl) + public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl) throws IOException { return iterator(ARCReaderFactory.get(pathOrUrl)); } @@ -85,7 +85,7 @@ * @return Iterator of SearchResults for input ARCReader * @throws IOException */ - public CloseableIterator<SearchResult> iterator(ARCReader arcReader) + public CloseableIterator<CaptureSearchResult> iterator(ARCReader arcReader) throws IOException { arcReader.setParseHttpHeaders(true); @@ -102,7 +102,7 @@ CloseableIterator<ARCRecord> itr2 = new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1); - return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2); + return new AdaptedIterator<ARCRecord,CaptureSearchResult>(itr2,adapter2); } public UrlCanonicalizer getCanonicalizer() { @@ -146,7 +146,7 @@ } else { USAGE(); } - Iterator<SearchResult> res = indexer.iterator(arc); + Iterator<CaptureSearchResult> res = indexer.iterator(arc); Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); while(lines.hasNext()) { pw.println(lines.next()); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-07-01 23:44:18 UTC (rev 2373) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-07-01 23:45:04 UTC (rev 2374) @@ -2,25 +2,23 @@ import java.io.File; import java.io.IOException; -import java.util.logging.Logger; +//import java.util.logging.Logger; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpParser; import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.EncodingUtil; import org.archive.io.ArchiveRecordHeader; import org.archive.io.RecoverableIOException; import org.archive.io.arc.ARCConstants; import org.archive.io.warc.WARCConstants; import org.archive.io.warc.WARCRecord; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.SearchResult; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.UrlOperations; /** * Adapts certain WARCRecords into SearchResults. DNS and response records are @@ -34,24 +32,12 @@ * @version $Date$, $Revision$ */ public class WARCRecordToSearchResultAdapter -implements Adapter<WARCRecord,SearchResult>{ +implements Adapter<WARCRecord,CaptureSearchResult>{ private final static String DEFAULT_VALUE = "-"; - private final static String SEARCH_FIELDS[] = { - WaybackConstants.RESULT_URL, - WaybackConstants.RESULT_URL_KEY, - WaybackConstants.RESULT_ORIG_HOST, - WaybackConstants.RESULT_CAPTURE_DATE, - WaybackConstants.RESULT_MD5_DIGEST, - WaybackConstants.RESULT_MIME_TYPE, - WaybackConstants.RESULT_HTTP_CODE, - WaybackConstants.RESULT_REDIRECT_URL, - WaybackConstants.RESULT_ARC_FILE, - WaybackConstants.RESULT_OFFSET, - }; - private static final Logger LOGGER = Logger.getLogger( - WARCRecordToSearchResultAdapter.class.getName()); +// private static final Logger LOGGER = Logger.getLogger( +// WARCRecordToSearchResultAdapter.class.getName()); private UrlCanonicalizer canonicalizer = null; @@ -62,7 +48,7 @@ /* (non-Javadoc) * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) */ - public SearchResult adapt(WARCRecord rec) { + public CaptureSearchResult adapt(WARCRecord rec) { try { return adaptInner(rec); } catch (IOException e) { @@ -117,75 +103,58 @@ return orig; } - private SearchResult getBlankSearchResult() { - SearchResult result = new SearchResult(); - for(String field : SEARCH_FIELDS) { - result.put(field, DEFAULT_VALUE); - } + private CaptureSearchResult getBlankSearchResult() { + CaptureSearchResult result = new CaptureSearchResult(); + + result.setUrlKey(DEFAULT_VALUE); + result.setOriginalUrl(DEFAULT_VALUE); + result.setCaptureTimestamp(DEFAULT_VALUE); + result.setDigest(DEFAULT_VALUE); + result.setMimeType(DEFAULT_VALUE); + result.setHttpCode(DEFAULT_VALUE); + result.setRedirectUrl(DEFAULT_VALUE); + result.setFile(DEFAULT_VALUE); + result.setOffset(0); return result; } - private UURI addUrlDataToSearchResult(SearchResult result, String urlStr) + private void addUrlDataToSearchResult(CaptureSearchResult result, String urlStr) throws IOException { - result.put(WaybackConstants.RESULT_URL, urlStr); - result.put(WaybackConstants.RESULT_URL_KEY, urlStr); - - - UURI uri = UURIFactory.getInstance(urlStr); - String uriHost = uri.getHost(); - if (uriHost == null) { - - LOGGER.info("No host in " + urlStr); - - } else { - - result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); - } - + result.setOriginalUrl(urlStr); String urlKey = canonicalizer.urlStringToKey(urlStr); - result.put(WaybackConstants.RESULT_URL_KEY, urlKey); - - return uri; + result.setUrlKey(urlKey); } - private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) + private CaptureSearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) throws IOException { - SearchResult result = getBlankSearchResult(); + CaptureSearchResult result = getBlankSearchResult(); - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_ARC_FILE, - transformWarcFilename(header.getReaderIdentifier())); - result.put(WaybackConstants.RESULT_OFFSET, - String.valueOf(header.getOffset())); + result.setCaptureTimestamp(transformDate(header.getDate())); + result.setFile(transformWarcFilename(header.getReaderIdentifier())); + result.setOffset(header.getOffset()); String uriStr = header.getUrl(); - String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX - .length()); - result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype()); + result.setMimeType(header.getMimetype()); - result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); - result.put(WaybackConstants.RESULT_URL, uriStr); - result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + result.setOriginalUrl(uriStr); + result.setUrlKey(uriStr); rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + result.setDigest(rec.getDigestStr()); return result; } - private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) + private CaptureSearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) throws IOException { - SearchResult result = getBlankSearchResult(); + CaptureSearchResult result = getBlankSearchResult(); - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_MD5_DIGEST, - transformDigest(header.getHeaderValue( + result.setCaptureTimestamp(transformDate(header.getDate())); + result.setDigest(transformDigest(header.getHeaderValue( WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); addUrlDataToSearchResult(result,header.getUrl()); @@ -211,20 +180,17 @@ return count; } - private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) + private CaptureSearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) throws IOException { - SearchResult result = getBlankSearchResult(); + CaptureSearchResult result = getBlankSearchResult(); - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_ARC_FILE, - transformWarcFilename(header.getReaderIdentifier())); - result.put(WaybackConstants.RESULT_OFFSET, - String.valueOf(header.getOffset())); + result.setCaptureTimestamp(transformDate(header.getDate())); + result.setFile(transformWarcFilename(header.getReaderIdentifier())); + result.setOffset(header.getOffset()); String origUrl = header.getUrl(); - UURI uri = addUrlDataToSearchResult(result,origUrl); + addUrlDataToSearchResult(result,origUrl); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. @@ -242,15 +208,13 @@ throw new RecoverableIOException("Failed parse of http status line."); } StatusLine status = new StatusLine(statusLine); - result.put(WaybackConstants.RESULT_HTTP_CODE, - String.valueOf(status.getStatusCode())); + result.setHttpCode(String.valueOf(status.getStatusCode())); Header[] headers = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, - transformDigest(header.getHeaderValue( + result.setDigest(transformDigest(header.getHeaderValue( WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); if (headers != null) { @@ -270,28 +234,19 @@ // headers... // should we prefer one over the other? // right now, we're ignoring "Content-Location" - try { - UURI uriRedirect = UURIFactory.getInstance(uri, - locationStr); - result.put(WaybackConstants.RESULT_REDIRECT_URL, - uriRedirect.getEscapedURI()); - } catch (URIException e) { - LOGGER.info("Bad Location: " + locationStr - + " for " + origUrl + " in " - + header.getReaderIdentifier() + " Skipped"); - } + result.setRedirectUrl( + UrlOperations.resolveUrl(origUrl, locationStr)); } else if(httpHeader.getName().toLowerCase().equals("content-type")) { - result.put(WaybackConstants.RESULT_MIME_TYPE, - transformHTTPMime(httpHeader.getValue())); + result.setMimeType(transformHTTPMime(httpHeader.getValue())); } } } return result; } - private SearchResult adaptInner(WARCRecord rec) throws IOException { + private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { - SearchResult result = null; + CaptureSearchResult result = null; ArchiveRecordHeader header = rec.getHeader(); String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); if(type.equals(WARCConstants.RESPONSE)) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-07-01 23:44:18 UTC (rev 2373) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-07-01 23:45:04 UTC (rev 2374) @@ -10,7 +10,7 @@ import org.archive.io.warc.WARCReaderFactory; import org.archive.io.warc.WARCRecord; import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.core.SearchResult; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; @@ -35,7 +35,7 @@ * @return Iterator of SearchResults for input arc File * @throws IOException */ - public CloseableIterator<SearchResult> iterator(File warc) + public CloseableIterator<CaptureSearchResult> iterator(File warc) throws IOException { return iterator(WARCReaderFactory.get(warc)); } @@ -44,7 +44,7 @@ * @return Iterator of SearchResults for input pathOrUrl * @throws IOException */ - public CloseableIterator<SearchResult> iterator(String pathOrUrl) + public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl) throws IOException { return iterator(WARCReaderFactory.get(pathOrUrl)); } @@ -53,7 +53,7 @@ * @return Iterator of SearchResults for input arc File * @throws IOException */ - public CloseableIterator<SearchResult> iterator(WARCReader reader) + public CloseableIterator<CaptureSearchResult> iterator(WARCReader reader) throws IOException { Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); @@ -68,7 +68,7 @@ CloseableIterator<WARCRecord> itr2 = new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1); - return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); + return new AdaptedIterator<WARCRecord, CaptureSearchResult>(itr2, adapter2); } public UrlCanonicalizer getCanonicalizer() { @@ -112,7 +112,7 @@ } else { USAGE(); } - Iterator<SearchResult> res = indexer.iterator(arc); + Iterator<CaptureSearchResult> res = indexer.iterator(arc); Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); while (lines.hasNext()) { pw.println(lines.next()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |