From: Brad <bra...@us...> - 2005-11-17 02:49:18
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11862/src/java/org/archive/wayback/cdx/indexer Modified Files: ArcIndexer.java Log Message: BUGFIX: was not using CDXRecord to "serialize" SearchResult objects BUGFIX: was not escaping Location: HTTP Header URLs FEATURE: added skipping of dns: records in ARC files FEATURE: added fixing/resolving of relative Location: HTTP headers, although this is against spec. Index: ArcIndexer.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/ArcIndexer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ArcIndexer.java 16 Nov 2005 03:11:29 -0000 1.1 --- ArcIndexer.java 17 Nov 2005 02:49:08 -0000 1.2 *************** *** 35,42 **** --- 35,45 ---- import org.archive.io.arc.ARCRecordMetaData; import org.archive.net.UURI; + import org.archive.net.UURIFactory; import org.archive.wayback.WaybackConstants; + import org.archive.wayback.cdx.CDXRecord; import org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; import org.apache.commons.httpclient.Header; + import org.apache.commons.httpclient.URIException; /** *************** *** 50,54 **** private final static String LOCATION_HTTP_HEADER = "Location"; private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; ! /** --- 53,57 ---- private final static String LOCATION_HTTP_HEADER = "Location"; private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; ! private final static String DNS_URL_PREFIX = "dns:"; /** *************** *** 113,118 **** return null; } UURI uri = new UURI(uriStr, false); ! 
result.put(WaybackConstants.RESULT_ORIG_HOST,uri.getHost()); String redirectUrl = "-"; --- 116,132 ---- return null; } + if(uriStr.startsWith(DNS_URL_PREFIX)) { + // skip dns records... + return null; + } + UURI uri = new UURI(uriStr, false); ! String uriHost = uri.getHost(); ! if(uriHost == null) { ! System.out.println("No host in " + uriStr + " in " + ! arc.getAbsolutePath()); ! return null; ! } ! result.put(WaybackConstants.RESULT_ORIG_HOST,uriHost); String redirectUrl = "-"; *************** *** 121,125 **** for (int i = 0; i < headers.length; i++) { if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { ! redirectUrl = headers[i].getValue(); break; } --- 135,155 ---- for (int i = 0; i < headers.length; i++) { if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { ! String locationStr = headers[i].getValue(); ! // TODO: "Location" is supposed to be absolute: ! // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) ! // (section 14.30) but Content-Location can be relative. ! // is it correct to resolve a relative Location, as we are? ! // it's also possible to have both in the HTTP headers... ! // should we prefer one over the other? ! // right now, we're ignoring "Content-Location" ! try { ! UURI uriRedirect = UURIFactory.getInstance(uri,locationStr); ! redirectUrl = uriRedirect.getEscapedURI(); ! ! } catch (URIException e) { ! System.out.println("Bad Location: " + locationStr + ! " for " + uriStr + " in " + ! arc.getAbsolutePath() + " Skipped"); ! } break; } *************** *** 151,159 **** FileOutputStream output = new FileOutputStream(target); output.write((CDX_HEADER_STRING + "\n").getBytes()); ! Iterator itr = results.iterator(); while (itr.hasNext()) { SearchResult result = (SearchResult) itr.next(); ! output.write((result.toString() + "\n").getBytes()); } } --- 181,190 ---- FileOutputStream output = new FileOutputStream(target); output.write((CDX_HEADER_STRING + "\n").getBytes()); ! 
CDXRecord cdxRecord = new CDXRecord(); Iterator itr = results.iterator(); while (itr.hasNext()) { SearchResult result = (SearchResult) itr.next(); ! cdxRecord.fromSearchResult(result); ! output.write((cdxRecord.toValue() + "\n").getBytes()); } } |