Revision: 2518 http://archive-access.svn.sourceforge.net/archive-access/?rev=2518&view=rev Author: bradtofel Date: 2008-07-29 02:16:35 +0000 (Tue, 29 Jul 2008) Log Message: ----------- FEATURE: added translation of old ORIGINAL-HOST field to ORIGINAL-URL field Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2008-07-29 02:15:02 UTC (rev 2517) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2008-07-29 02:16:35 UTC (rev 2518) @@ -27,6 +27,7 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.url.UrlOperations; /** * Adapter that converts a CDX record String into a CaptureSearchResult @@ -36,6 +37,43 @@ */ public class CDXLineToSearchResultAdapter implements Adapter<String,CaptureSearchResult> { + + private final static String SCHEME_STRING = "://"; + private final static String DEFAULT_SCHEME = "http://"; + + private static int getEndOfHostIndex(String url) { + int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + int pathIdx = url.indexOf(UrlOperations.PATH_START); + if(portIdx == -1 && pathIdx == -1) { + return url.length(); + } + if(portIdx == -1) { + return pathIdx; + } + if(pathIdx == -1) { + return portIdx; + } + if(pathIdx > portIdx) { + return portIdx; + } else { + return pathIdx; + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public CaptureSearchResult 
adapt(CaptureSearchResult o) { + String urlKey = o.getUrlKey(); + StringBuilder sb = new StringBuilder(urlKey.length()); + sb.append(DEFAULT_SCHEME); + sb.append(o.getOriginalUrl()); + sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); + o.setOriginalUrl(sb.toString()); + return o; + } + + public CaptureSearchResult adapt(String line) { return doAdapt(line); } @@ -53,6 +91,15 @@ String urlKey = tokens[0]; String captureTS = tokens[1]; String originalUrl = tokens[2]; + + // convert from ORIG_HOST to ORIG_URL here: + if(!originalUrl.contains(SCHEME_STRING)) { + StringBuilder sb = new StringBuilder(urlKey.length()); + sb.append(DEFAULT_SCHEME); + sb.append(originalUrl); + sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); + originalUrl = sb.toString(); + } String mimeType = tokens[3]; String httpCode = tokens[4]; String digest = tokens[5]; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2641 http://archive-access.svn.sourceforge.net/archive-access/?rev=2641&view=rev Author: bradtofel Date: 2008-11-07 22:41:50 +0000 (Fri, 07 Nov 2008) Log Message: ----------- TWEAK: removed code which looked like it did something but had wrong signature, so was unused and confusing. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2008-11-07 22:40:02 UTC (rev 2640) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2008-11-07 22:41:50 UTC (rev 2641) @@ -60,20 +60,6 @@ } } - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public CaptureSearchResult adapt(CaptureSearchResult o) { - String urlKey = o.getUrlKey(); - StringBuilder sb = new StringBuilder(urlKey.length()); - sb.append(DEFAULT_SCHEME); - sb.append(o.getOriginalUrl()); - sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); - o.setOriginalUrl(sb.toString()); - return o; - } - - public CaptureSearchResult adapt(String line) { return doAdapt(line); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3013 http://archive-access.svn.sourceforge.net/archive-access/?rev=3013&view=rev Author: bradtofel Date: 2010-04-02 03:10:33 +0000 (Fri, 02 Apr 2010) Log Message: ----------- LOG: added log warnings if it fails to convert a line because of a NumberFormatException in the offset field - which is now caught instead of throwing an exception. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2010-04-02 03:08:24 UTC (rev 3012) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2010-04-02 03:10:33 UTC (rev 3013) @@ -25,6 +25,7 @@ package org.archive.wayback.resourceindex.cdx; +import org.apache.log4j.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.url.UrlOperations; @@ -37,6 +38,8 @@ */ public class CDXLineToSearchResultAdapter implements Adapter<String,CaptureSearchResult> { + private static final Logger LOGGER = Logger.getLogger( + CDXLineToSearchResultAdapter.class.getName()); private final static String SCHEME_STRING = "://"; private final static String DEFAULT_SCHEME = "http://"; @@ -103,7 +106,13 @@ } if(!tokens[nextToken].equals("-")) { - compressedOffset = Long.parseLong(tokens[nextToken]); + try { + compressedOffset = Long.parseLong(tokens[nextToken]); + } catch (NumberFormatException e) { + LOGGER.warn("Bad compressed Offset field("+nextToken+") in (" + + line +")"); + return null; + } } nextToken++; String 
fileName = tokens[nextToken]; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3581 http://archive-access.svn.sourceforge.net/archive-access/?rev=3581&view=rev Author: bradtofel Date: 2011-12-18 03:50:55 +0000 (Sun, 18 Dec 2011) Log Message: ----------- FEATURE: now allows 11 fields - including the compressed length field. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2011-12-18 03:48:59 UTC (rev 3580) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2011-12-18 03:50:55 UTC (rev 3581) @@ -70,9 +70,12 @@ CaptureSearchResult result = new CaptureSearchResult(); String[] tokens = line.split(" "); boolean hasRobotFlags = false; + boolean hasLengthFlag = false; if (tokens.length != 9) { + hasRobotFlags = true; if(tokens.length == 10) { - hasRobotFlags = true; + } else if(tokens.length == 11) { + hasLengthFlag = true; } else { return null; } @@ -100,10 +103,19 @@ result.setRobotFlags(tokens[nextToken]); nextToken++; } + String length = "-"; + if(hasLengthFlag) { + length = tokens[nextToken]; + nextToken++; + } if(!tokens[nextToken].equals("-")) { try { compressedOffset = Long.parseLong(tokens[nextToken]); + if(!length.equals("-")) { + // try to set the endOffset: + result.setCompressedLength(Long.parseLong(length)); + } } catch (NumberFormatException e) { LOGGER.warning("Bad compressed Offset field("+nextToken+") in (" + line +")"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |