/*
 * Revision: 3526
 * http://archive-access.svn.sourceforge.net/archive-access/?rev=3526&view=rev
 * Author: bradtofel
 * Date: 2011-09-06 04:08:28 +0000 (Tue, 06 Sep 2011)
 *
 * Log Message:
 * HACK: subclass of CDXFormat - which simply does old style CDX line parsing -
 * allowing either 9 or 10 columns. This allows a CDXFormatIndex to be used
 * with a CDX file containing both 9 and 10 columns in various lines
 *
 * Added Paths:
 * trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFlexFormat.java
 *
 * This was sent by the SourceForge.net collaborative development platform,
 * the world's largest Open Source development site.
 */
package org.archive.wayback.resourceindex.cdx.format;

import java.util.logging.Logger;

import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.util.url.UrlOperations;

/**
 * {@link CDXFormat} subclass that ignores the column layout implied by the
 * CDX spec and instead parses each line positionally as an old-style,
 * space-delimited record of either 9 or 10 columns. This lets one index read
 * a CDX file in which both layouts are mixed.
 *
 * <p>Column order, as read by {@link #parseResult(String)}: URL key, capture
 * timestamp, original URL (or bare host), MIME type, HTTP code, digest,
 * redirect URL, robot flags (10-column lines only), compressed offset,
 * file name.
 */
public class CDXFlexFormat extends CDXFormat {
    private final static String SCHEME_STRING = "://";
    private final static String DEFAULT_SCHEME = "http://";

    /** Separator between columns of a CDX line. */
    private static final String FIELD_DELIMITER = " ";
    /** Placeholder a CDX line uses for an absent compressed offset. */
    private static final String EMPTY_FIELD = "-";
    /** Column count of a line without robot flags. */
    private static final int BASE_COLUMN_COUNT = 9;
    /** Column count of a line that carries robot flags. */
    private static final int ROBOT_FLAGS_COLUMN_COUNT = 10;
    /** Index of the first column whose meaning depends on the column count. */
    private static final int FIRST_VARIABLE_COLUMN = 7;
    /** Offset value reported when the offset column holds the placeholder. */
    private static final long UNKNOWN_OFFSET = -1;

    private static final Logger LOGGER =
        Logger.getLogger(CDXFlexFormat.class.getName());

    /**
     * @param cdxSpec CDX specification string, passed unchanged to the
     *        {@link CDXFormat} constructor
     * @throws CDXFormatException if the superclass constructor rejects the spec
     */
    public CDXFlexFormat(String cdxSpec) throws CDXFormatException {
        super(cdxSpec);
    }

    /**
     * Finds where the host part of {@code url} ends: the position of the
     * first port separator or path start, whichever comes first.
     *
     * @param url string to scan
     * @return index of the earliest port separator or path start, or
     *         {@code url.length()} when neither occurs
     */
    private static int getEndOfHostIndex(String url) {
        int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR);
        int pathIdx = url.indexOf(UrlOperations.PATH_START);
        if (portIdx == -1) {
            return (pathIdx == -1) ? url.length() : pathIdx;
        }
        if (pathIdx == -1) {
            return portIdx;
        }
        return Math.min(portIdx, pathIdx);
    }

    /**
     * Parses one space-delimited CDX line of 9 or 10 columns.
     *
     * <p>If the third column contains no {@code "://"}, it is treated as a
     * bare host: the result's original URL becomes {@code "http://"} + that
     * host + whatever follows the host in the URL key.
     *
     * @param line a single CDX line
     * @return the populated result, or {@code null} (after logging a warning)
     *         when the line has neither 9 nor 10 columns or its compressed
     *         offset column is neither {@code "-"} nor a valid long
     * @throws CDXFormatException declared by the overridden method; this
     *         implementation contains no explicit throw
     * @see org.archive.wayback.resourceindex.cdx.format.CDXFormat#parseResult(java.lang.String)
     */
    @Override
    public CaptureSearchResult parseResult(String line)
            throws CDXFormatException {
        String[] tokens = line.split(FIELD_DELIMITER);
        if (tokens.length != BASE_COLUMN_COUNT
                && tokens.length != ROBOT_FLAGS_COLUMN_COUNT) {
            // Report the dropped line instead of discarding it silently, the
            // same way an unparsable offset is reported below.
            LOGGER.warning("Bad column count(" + tokens.length + ") in ("
                    + line + ")");
            return null;
        }
        boolean hasRobotFlags = tokens.length == ROBOT_FLAGS_COLUMN_COUNT;

        CaptureSearchResult result = new CaptureSearchResult();

        String urlKey = tokens[0];
        String captureTS = tokens[1];
        String originalUrl = tokens[2];

        // convert from ORIG_HOST to ORIG_URL here:
        if (!originalUrl.contains(SCHEME_STRING)) {
            originalUrl = DEFAULT_SCHEME + originalUrl
                    + urlKey.substring(getEndOfHostIndex(urlKey));
        }
        String mimeType = tokens[3];
        String httpCode = tokens[4];
        String digest = tokens[5];
        String redirectUrl = tokens[6];

        // From here on the column positions shift by one when robot flags
        // are present.
        int nextToken = FIRST_VARIABLE_COLUMN;
        if (hasRobotFlags) {
            result.setRobotFlags(tokens[nextToken]);
            nextToken++;
        }

        long compressedOffset = UNKNOWN_OFFSET;
        if (!tokens[nextToken].equals(EMPTY_FIELD)) {
            try {
                compressedOffset = Long.parseLong(tokens[nextToken]);
            } catch (NumberFormatException e) {
                LOGGER.warning("Bad compressed Offset field(" + nextToken
                        + ") in (" + line + ")");
                return null;
            }
        }
        nextToken++;
        String fileName = tokens[nextToken];

        result.setUrlKey(urlKey);
        result.setCaptureTimestamp(captureTS);
        result.setOriginalUrl(originalUrl);
        result.setMimeType(mimeType);
        result.setHttpCode(httpCode);
        result.setDigest(digest);
        result.setRedirectUrl(redirectUrl);
        result.setOffset(compressedOffset);
        result.setFile(fileName);

        return result;
    }

}