From: <bra...@us...> - 2007-11-28 02:00:27
|
Revision: 2082 http://archive-access.svn.sourceforge.net/archive-access/?rev=2082&view=rev Author: bradtofel Date: 2007-11-27 18:00:31 -0800 (Tue, 27 Nov 2007) Log Message: ----------- REFACTOR/FEATURE: made Resource abstract, moved ARC-specific code to ArcResource, added WARC-specific coercion code to WarcResource. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2007-11-28 00:59:27 UTC (rev 2081) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2007-11-28 02:00:31 UTC (rev 2082) @@ -26,236 +26,118 @@ import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.Hashtable; -import java.util.Iterator; import java.util.Map; -import java.util.Set; -import java.util.logging.Logger; -import org.apache.commons.httpclient.Header; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCRecord; - /** - * Slightly more than an ARCRecord. This class is designed to be an abstraction - * to allow the Wayback to operator with non-ARC file format resources. Probably - * the interface required will end up looking very much like ARCRecord, but can - * be reimplemented to handle new ARC formats or non-ARC formats. + * Abstraction on top of a document stored in a WaybackCollection. Currently + * implemented subclasses include ArcResource and WarcResource. * * @author Brad Tofel * @version $Date$, $Revision$ */ -public class Resource extends InputStream { - /** - * Logger for this class - */ - private static final Logger LOGGER = Logger.getLogger(Resource.class - .getName()); - - /** - * String prefix for ARC file related metadata namespace of keys within - * metaData Properties bag. - */ - private static String ARC_META_PREFIX = "arcmeta."; - /** - * String prefix for HTTP Header related metadata namespace of keys within - * metaData Properties bag. - */ - private static String HTTP_HEADER_PREFIX = "httpheader."; - /** - * object for ARCRecord - */ - ARCRecord arcRecord = null; - /** - * object for ARCReader -- need to hold on to this in order to call close() - * to release filehandle after completing access to this record. optional - */ - ARCReader arcReader = null; - /** - * flag to indicate if the ARCRecord skipHTTPHeader() has been called - */ - boolean parsedHeader = false; - /** - * Expandable property bag for holding metadata associated with this - * resource - */ - Hashtable<String,String> metaData = new Hashtable<String,String>(); +public abstract class Resource extends InputStream { - private BufferedInputStream bis; - - /** - * Constructor - * - * @param rec - * @param reader - */ - public Resource(final ARCRecord rec,final ARCReader reader) { - super(); - arcRecord = rec; - arcReader = reader; - bis = new BufferedInputStream(rec); - } + private InputStream is; - /** parse the headers on the underlying ARC record, and extract all - * @throws IOException - */ - public void parseHeaders () throws IOException { - if(!parsedHeader) { - arcRecord.skipHttpHeader(); - // copy all HTTP headers to metaData, prefixing with - // HTTP_HEADER_PREFIX - Header[] headers = arcRecord.getHttpHeaders(); - if (headers != null) { - for (int i = 0; i < headers.length; i++) { - String value = headers[i].getValue(); - String name = headers[i].getName(); - metaData.put(HTTP_HEADER_PREFIX + name,value); - } - } + public abstract void close() throws IOException; + public abstract int getStatusCode(); + public abstract long getRecordLength(); + public abstract Map<String,String> getHttpHeaders(); - // copy all ARC record header fields to metaData, prefixing with - // ARC_META_PREFIX - @SuppressWarnings("unchecked") - Map<String,Object> headerMetaMap = arcRecord.getMetaData().getHeaderFields(); - Set<String> keys = headerMetaMap.keySet(); - Iterator<String> itr = keys.iterator(); - while(itr.hasNext()) { - String metaKey = itr.next(); - Object value = headerMetaMap.get(metaKey); - String metaValue = ""; - if(value != null) { - metaValue = value.toString(); - } - metaData.put(ARC_META_PREFIX + metaKey,metaValue); - } - - parsedHeader = true; + protected void setInputStream(InputStream is) { + if(is.markSupported()) { + this.is = is; + } else { + this.is = new BufferedInputStream(is); } } - /** - * @param prefix - * @return a Properties of all elements in metaData starting with 'prefix'. - * keys in the returned Properties have 'prefix' removed. + * @return + * @throws IOException + * @see java.io.BufferedInputStream#available() */ - public Map<String,String> filterMeta(String prefix) { - HashMap<String,String> matching = new HashMap<String,String>(); - for (Enumeration<String> e = metaData.keys(); e.hasMoreElements();) { - String key = e.nextElement(); - if (key.startsWith(prefix)) { - String finalKey = key.substring(prefix.length()); - String value = metaData.get(key); - matching.put(finalKey, value); - } + public int available() throws IOException { + if(is == null) { + throw new IOException("No InputStream"); } - return matching; + return is.available(); } - /** - * @return a Properties containing all HTTP header fields for this record + * @param readlimit + * @see java.io.BufferedInputStream#mark(int) */ - public Map<String,String> getHttpHeaders() { - return filterMeta(HTTP_HEADER_PREFIX); + public void mark(int readlimit) { + if(is != null) { + is.mark(readlimit); + } } - /** - * @return a Properties containing all ARC Meta fields for this record + * @return + * @see java.io.BufferedInputStream#markSupported() */ - public Map<String,String> getARCMetadata() { - return filterMeta(ARC_META_PREFIX); + public boolean markSupported() { + if(is == null) { + return false; + } + return is.markSupported(); } - /** - * (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#getStatusCode() - * @return int HTTP status code returned with this document. + * @return + * @throws IOException + * @see java.io.BufferedInputStream#read() */ - public int getStatusCode() { - return arcRecord.getStatusCode(); + public int read() throws IOException { + if(is == null) { + throw new IOException("No InputStream"); + } + return is.read(); } - /** - * @return the ARCRecord underlying this Resource. + * @param b + * @param off + * @param len + * @return + * @throws IOException + * @see java.io.BufferedInputStream#read(byte[], int, int) */ - public ArchiveRecord getArcRecord() { - return arcRecord; + public int read(byte[] b, int off, int len) throws IOException { + if(is == null) { + throw new IOException("No InputStream"); + } + return is.read(b, off, len); } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#read() + /** + * @param b + * @return + * @throws IOException + * @see java.io.FilterInputStream#read(byte[]) */ - public int read() throws IOException { - return bis.read(); - } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#read(byte[], int, int) - */ - public int read(byte[] arg0, int arg1, int arg2) throws IOException { - return bis.read(arg0, arg1, arg2); - } - - /* (non-Javadoc) - * @see java.io.InputStream#read(byte[]) - */ public int read(byte[] b) throws IOException { - return bis.read(b); + if(is == null) { + throw new IOException("No InputStream"); + } + return is.read(b); } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#skip(long) - */ - public long skip(long arg0) throws IOException { - return bis.skip(arg0); - } - - /* (non-Javadoc) - * @see java.io.BufferedInputStream#available() - */ - public int available() throws IOException { - return bis.available(); - } - - /* (non-Javadoc) - * @see java.io.BufferedInputStream#mark(int) - */ - public void mark(int readlimit) { - bis.mark(readlimit); - } - - /* (non-Javadoc) - * @see java.io.BufferedInputStream#markSupported() - */ - public boolean markSupported() { - return bis.markSupported(); - } - - /* (non-Javadoc) + /** + * @throws IOException * @see java.io.BufferedInputStream#reset() */ public void reset() throws IOException { - bis.reset(); - } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#close() - */ - public void close() throws IOException { - //LOGGER.info("About to close..("+arcReader+")"); - arcRecord.close(); - if(arcReader != null) { - arcReader.close(); - LOGGER.info("closed..("+arcReader+")"); + if(is == null) { + throw new IOException("No InputStream"); } + is.reset(); } - /** - * @return byte length claimed in ARC record metadata line. + * @param n + * @return + * @throws IOException + * @see java.io.BufferedInputStream#skip(long) */ - public long getRecordLength() { - return arcRecord.getMetaData().getLength(); + public long skip(long n) throws IOException { + if(is == null) { + throw new IOException("No InputStream"); + } + return is.skip(n); } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java 2007-11-28 02:00:31 UTC (rev 2082) @@ -0,0 +1,170 @@ +package org.archive.wayback.resourcestore; + +import java.io.IOException; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.core.Resource; + +public class ArcResource extends Resource { + /** + * Logger for this class + */ + private static final Logger LOGGER = Logger.getLogger(ArcResource.class + .getName()); + + /** + * String prefix for ARC file related metadata namespace of keys within + * metaData Properties bag. + */ + private static String ARC_META_PREFIX = "arcmeta."; + /** + * String prefix for HTTP Header related metadata namespace of keys within + * metaData Properties bag. + */ + private static String HTTP_HEADER_PREFIX = "httpheader."; + /** + * object for ARCRecord + */ + ARCRecord arcRecord = null; + /** + * object for ARCReader -- need to hold on to this in order to call close() + * to release filehandle after completing access to this record. optional + */ + ARCReader arcReader = null; + /** + * flag to indicate if the ARCRecord skipHTTPHeader() has been called + */ + boolean parsedHeader = false; + /** + * Expandable property bag for holding metadata associated with this + * resource + */ + Hashtable<String,String> metaData = new Hashtable<String,String>(); + + /** + * Constructor + * + * @param rec + * @param reader + */ + public ArcResource(final ARCRecord rec,final ARCReader reader) { + super(); + arcRecord = rec; + arcReader = reader; + setInputStream(rec); + } + + /** parse the headers on the underlying ARC record, and extract all + * @throws IOException + */ + public void parseHeaders () throws IOException { + if(!parsedHeader) { + arcRecord.skipHttpHeader(); + // copy all HTTP headers to metaData, prefixing with + // HTTP_HEADER_PREFIX + Header[] headers = arcRecord.getHttpHeaders(); + if (headers != null) { + for (int i = 0; i < headers.length; i++) { + String value = headers[i].getValue(); + String name = headers[i].getName(); + metaData.put(HTTP_HEADER_PREFIX + name,value); + } + } + + // copy all ARC record header fields to metaData, prefixing with + // ARC_META_PREFIX + @SuppressWarnings("unchecked") + Map<String,Object> headerMetaMap = arcRecord.getMetaData().getHeaderFields(); + Set<String> keys = headerMetaMap.keySet(); + Iterator<String> itr = keys.iterator(); + while(itr.hasNext()) { + String metaKey = itr.next(); + Object value = headerMetaMap.get(metaKey); + String metaValue = ""; + if(value != null) { + metaValue = value.toString(); + } + metaData.put(ARC_META_PREFIX + metaKey,metaValue); + } + + parsedHeader = true; + } + } + + /** + * @param prefix + * @return a Properties of all elements in metaData starting with 'prefix'. + * keys in the returned Properties have 'prefix' removed. + */ + public Map<String,String> filterMeta(String prefix) { + HashMap<String,String> matching = new HashMap<String,String>(); + for (Enumeration<String> e = metaData.keys(); e.hasMoreElements();) { + String key = e.nextElement(); + if (key.startsWith(prefix)) { + String finalKey = key.substring(prefix.length()); + String value = metaData.get(key); + matching.put(finalKey, value); + } + } + return matching; + } + + /** + * @return a Properties containing all HTTP header fields for this record + */ + public Map<String,String> getHttpHeaders() { + return filterMeta(HTTP_HEADER_PREFIX); + } + + /** + * @return a Properties containing all ARC Meta fields for this record + */ + public Map<String,String> getARCMetadata() { + return filterMeta(ARC_META_PREFIX); + } + + /** + * (non-Javadoc) + * @see org.archive.io.arc.ARCRecord#getStatusCode() + * @return int HTTP status code returned with this document. + */ + public int getStatusCode() { + return arcRecord.getStatusCode(); + } + + /** + * @return the ARCRecord underlying this Resource. + */ + public ArchiveRecord getArcRecord() { + return arcRecord; + } + + /* (non-Javadoc) + * @see org.archive.io.arc.ARCRecord#close() + */ + public void close() throws IOException { + //LOGGER.info("About to close..("+arcReader+")"); + arcRecord.close(); + if(arcReader != null) { + arcReader.close(); + LOGGER.info("closed..("+arcReader+")"); + } + } + + /** + * @return byte length claimed in ARC record metadata line. + */ + public long getRecordLength() { + return arcRecord.getMetaData().getLength(); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java 2007-11-28 02:00:31 UTC (rev 2082) @@ -0,0 +1,98 @@ +package org.archive.wayback.resourcestore; + +import java.io.IOException; +import java.util.Hashtable; +import java.util.Map; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.RecoverableIOException; +import org.archive.io.arc.ARCConstants; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.core.Resource; + +public class WarcResource extends Resource { + private WARCRecord rec = null; + private WARCReader reader = null; + private Map<String, String> headers = null; + private long length = 0; + private int status = 0; + private boolean parsedHeaders = false; + public WarcResource(WARCRecord rec, WARCReader reader) { + this.rec = rec; + this.reader = reader; + } + + /** + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + public void parseHeaders() throws IOException { + if(parsedHeaders) { + return; + } + + byte [] statusBytes = HttpParser.readRawLine(rec); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException("Failed to read http status where one " + + " was expected: " + new String(statusBytes)); + } + String statusLineStr = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if ((statusLineStr == null) || + !StatusLine.startsWithHTTP(statusLineStr)) { + throw new RecoverableIOException("Failed parse of http status line."); + } + StatusLine statusLine = new StatusLine(statusLineStr); + + this.status = statusLine.getStatusCode(); + + Header[] tmpHeaders = HttpParser.parseHeaders(rec, + ARCConstants.DEFAULT_ENCODING); + headers = new Hashtable<String,String>(); + for(Header header: tmpHeaders) { + headers.put(header.getName(), header.getValue()); + } + this.setInputStream(rec); + parsedHeaders = true; + } + + + @Override + public Map<String, String> getHttpHeaders() { + return headers; + } + + @Override + public long getRecordLength() { + // TODO Auto-generated method stub + return length; + } + + @Override + public int getStatusCode() { + return status; + } + + @Override + public void close() throws IOException { + rec.close(); + reader.close(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |