From: <bra...@us...> - 2011-11-18 23:15:53
|
Revision: 3565 http://archive-access.svn.sourceforge.net/archive-access/?rev=3565&view=rev Author: bradtofel Date: 2011-11-18 23:15:42 +0000 (Fri, 18 Nov 2011) Log Message: ----------- INITIAL REV- common code for GZIP, ARC, WARC, HTTP, HTML Parsing, JSON, URL canonicalization,... Added Paths: ----------- trunk/archive-access/projects/archive-commons/.classpath trunk/archive-access/projects/archive-commons/.project trunk/archive-access/projects/archive-commons/.settings/ trunk/archive-access/projects/archive-commons/.settings/org.eclipse.jdt.core.prefs trunk/archive-access/projects/archive-commons/pom.xml trunk/archive-access/projects/archive-commons/src/ trunk/archive-access/projects/archive-commons/src/main/ trunk/archive-access/projects/archive-commons/src/main/java/ trunk/archive-access/projects/archive-commons/src/main/java/org/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/RecoverableRecordFormatException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/CDXExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/DumpingExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceProducer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/FilteredExtractorOuput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/JSONViewExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ProducerUtils.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceFactoryMapper.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/WATExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ARCConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ARCFormatException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ARCMetaData.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ARCMetaDataParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/FiledescRecord.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/FiledescRecordParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/DNSParseException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/DNSRecord.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/DNSResponse.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/DNSResponseParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPDecoder.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPFExtraRecord.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPFExtraRecords.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPFooter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPFormatException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPHeader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPMemberWriter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPMemberWriterCommittedOutputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPSeriesMember.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPStaticHeader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/zipnum/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpHeader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpHeaderObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpHeaderParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpHeaders.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpMessage.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpMessageParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpParseException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequest.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequestMessage.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequestMessageObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequestMessageParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequestParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponse.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponseMessage.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponseMessageObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponseMessageParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponseParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/CompoundORJSONPathSpec.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/CrossProductOfLists.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/JSONPathSpec.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/JSONPathSpecFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/JSONUtils.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/JSONView.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/SimpleJSONPathSpec.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/charset/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/charset/CharsetDetector.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/charset/RotatingCharsetDetector.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/charset/StandardCharsetDetector.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/CDATALexer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/LexParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/NodeUtils.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/ParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/warc/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/warc/WARCConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/warc/WARCRecordWriter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ResourceContext.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ResourceInputFormat.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ResourceRecordReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/func/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/func/JSONViewEvalFunc.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/func/TupleFunc.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/func/URLResolverFunc.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/AbstractEmptyResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/AbstractResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/MetaData.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/MetaDataConstants.java-normal trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/Resource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceContainer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceParseException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceProducer.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/TransformingResourceProducer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/ARCResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/ARCResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/record/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/record/FiledescResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/record/FiledescResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/generic/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/generic/GenericResourceProducer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/generic/GenericStreamResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/gzip/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/gzip/GZIPMetaData.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/gzip/GZIPResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLMetaData.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLResourceFactory.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPHeadersResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPRequestResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPRequestResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/producer/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/producer/ARCFile.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/producer/EnvelopedResourceFile.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/producer/WARCFile.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/WARCResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/WARCResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResource.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/WARCMetaDataResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/AbstractBufferingStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/ByteArrayWrappedStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/HDFSStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/HTTP11Stream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/RandomAccessFileStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/SimpleStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/Stream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/StreamWrappedInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/CanonicalizeRules.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/CanonicalizerConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/HandyURL.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/IAURLCanonicalizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/SURT.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/SURTTokenizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLCanonicalizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLKeyMaker.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLRegexTransformer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/WaybackURLKeyMaker.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/Base32.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/ByteOp.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/CrossProduct.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/DateUtils.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/FileNameSpec.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/IAUtils.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/NestedMap.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/StreamCopy.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/StringFieldExtractor.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/StringParse.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/FileSearchTool.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/SeekableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/SeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/SortedTextFile.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/HDFSSeekableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/HDFSSeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/HTTPSeakableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/NIOSeekableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/NIOSeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/RandomAccessFileSeekableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/RandomAccessFileSeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/BytesReadObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/CRCInputStream.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/CRCOutputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/CommitedOutputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/EOFNotifyingInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/EOFObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/MultiMemberOpenJDKGZIPInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/NotifyingInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/PushBackOneByteInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/AbstractPeekableIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/BoundedStringIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/CachingStringFilter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/CloseableIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/CloseableIteratorUtil.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/FilterStringIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/PeekableIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/SortedCompositeIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/StartBoundedStringIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/StringFilter.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/StringTransformer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/TransformingPrefixStringFilter.java trunk/archive-access/projects/archive-commons/src/test/ trunk/archive-access/projects/archive-commons/src/test/java/ trunk/archive-access/projects/archive-commons/src/test/java/org/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/dns/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/dns/DNSResponseParserTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/zipnum/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/http/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/http/HttpResponseParserTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java 
trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/JSONViewTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/text/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/text/html/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/text/html/CDATALexerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/arc/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/html/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/warc/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/DefaultIAURLCanonicalizerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/GoogleURLCanonicalizerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/HandyURLTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/IAURLCanonicalizerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/URLParserTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/URLRegexTransformerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/ByteOpTest.java 
trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/CrossProductTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/StringFieldExtractorTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/TestUtils.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/binsearch/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/iterator/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/iterator/CachingStringFilterTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java trunk/archive-access/projects/archive-commons/src/test/resources/ trunk/archive-access/projects/archive-commons/src/test/resources/org/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/container/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/abcd.gz trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/double-single-inflate-error.gz trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/empty.gz trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/hi-2.gz trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/hi.gz Added: trunk/archive-access/projects/archive-commons/.classpath 
=================================================================== --- trunk/archive-access/projects/archive-commons/.classpath (rev 0) +++ trunk/archive-access/projects/archive-commons/.classpath 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,9 @@ +<?xml version="1.0" encoding="UTF-8"?> +<classpath> + <classpathentry kind="src" output="target/classes" path="src/main/java"/> + <classpathentry kind="src" output="target/test-classes" path="src/test/java"/> + <classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources"/> + <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/J2SE-1.5"/> + <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/> + <classpathentry kind="output" path="target/classes"/> +</classpath> Added: trunk/archive-access/projects/archive-commons/.project =================================================================== --- trunk/archive-access/projects/archive-commons/.project (rev 0) +++ trunk/archive-access/projects/archive-commons/.project 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>archive-commons</name> + <comment>NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse.</comment> + <projects> + <project>archive-surt</project> + </projects> + <buildSpec> + <buildCommand> + <name>org.eclipse.jdt.core.javabuilder</name> + <arguments> + </arguments> + </buildCommand> + <buildCommand> + <name>org.maven.ide.eclipse.maven2Builder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.maven.ide.eclipse.maven2Nature</nature> + <nature>org.eclipse.jdt.core.javanature</nature> + </natures> +</projectDescription> Added: trunk/archive-access/projects/archive-commons/.settings/org.eclipse.jdt.core.prefs 
=================================================================== --- trunk/archive-access/projects/archive-commons/.settings/org.eclipse.jdt.core.prefs (rev 0) +++ trunk/archive-access/projects/archive-commons/.settings/org.eclipse.jdt.core.prefs 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,5 @@ +#Thu Nov 17 17:49:12 PST 2011 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.source=1.5 +org.eclipse.jdt.core.compiler.compliance=1.5 Added: trunk/archive-access/projects/archive-commons/pom.xml =================================================================== --- trunk/archive-access/projects/archive-commons/pom.xml (rev 0) +++ trunk/archive-access/projects/archive-commons/pom.xml 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,183 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.archive</groupId> + <artifactId>archive-commons</artifactId> + <version>0.0.1-SNAPSHOT</version> + <packaging>jar</packaging> + + <name>archive-commons</name> + <url>http://maven.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>3.8.1</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>r08</version> + </dependency> + + <dependency> + <groupId>org.json</groupId> + <artifactId>json</artifactId> + <version>20090211</version> + </dependency> + <dependency> + <groupId>org.htmlparser</groupId> + <artifactId>htmlparser</artifactId> + <version>1.6</version> + </dependency> + + <dependency> + <groupId>org.mozilla</groupId> + 
<artifactId>juniversalchardet</artifactId> + <version>1.0.3</version> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-core</artifactId> + <version>0.20.2</version> + <exclusions> + <exclusion> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.apache.pig</groupId> + <artifactId>pig</artifactId> + <version>0.8.0</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + <version>2.5</version> + </dependency> + + <dependency> + <groupId>org.archive</groupId> + <artifactId>archive-surt</artifactId> + <version>1.0-SNAPSHOT</version> + <exclusions> + <exclusion> + <groupId>org.archive.heritrix</groupId> + <artifactId>heritrix-commons</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-io</artifactId> + <version>1.3.2</version> + </dependency> + + <dependency> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + <version>3.1</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.3.2</version> + <configuration> + <source>1.5</source> + <target>1.5</target> + </configuration> + </plugin> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <version>2.2-beta-1</version> + <configuration> + <descriptorRefs> + <descriptorRef>jar-with-dependencies</descriptorRef> + </descriptorRefs> + <finalName>archive-commons</finalName> + </configuration> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>attached</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + 
<artifactId>maven-antrun-plugin</artifactId> + <executions> + <execution> + <phase>generate-resources</phase> + <goals> + <goal>run</goal> + </goals> + <configuration> + <tasks> + <!-- Safety --> + <mkdir dir="${project.build.directory}"/> + + <tstamp> + <format property="last.updated" pattern="yyyyMMddhhmmss"/> + </tstamp> + <echo file="${basedir}/target/filter.properties" message="build.time=${last.updated}"/> + </tasks> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + <resources> + <resource> + <directory>src/main/resources</directory> + <filtering>true</filtering> + </resource> + </resources> + <filters> + <filter>${basedir}/target/filter.properties</filter> + </filters> + + </build> + <repositories> + <repository> + <id>internetarchive</id> + <name>Internet Archive Maven Repository</name> + <url>http://builds.archive.org:8080/maven2</url> + <layout>default</layout> + + <releases> + <enabled>true</enabled> + <updatePolicy>daily</updatePolicy> + <checksumPolicy>warn</checksumPolicy> + </releases> + <snapshots> + <enabled>true</enabled> + <updatePolicy>daily</updatePolicy> + <checksumPolicy>warn</checksumPolicy> + </snapshots> + </repository> + </repositories> + + +</project> Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/RecoverableRecordFormatException.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/RecoverableRecordFormatException.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/RecoverableRecordFormatException.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,24 @@ +package org.archive; + +import java.io.IOException; + +public class RecoverableRecordFormatException extends IOException { + + /** + * + */ + private static final long serialVersionUID = 2775048979983919630L; + public RecoverableRecordFormatException() { + super(); + } + public 
RecoverableRecordFormatException(String message) { + super(message); + } + public RecoverableRecordFormatException(Exception e) { + super(e); + } + public RecoverableRecordFormatException(String message, IOException e) { + super(message,e); + } + +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/CDXExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/CDXExtractorOutput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/CDXExtractorOutput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,60 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.List; + +import org.archive.format.json.JSONView; +import org.archive.resource.Resource; +import org.archive.util.StreamCopy; + +public class CDXExtractorOutput implements ExtractorOutput { + // CANON DATE URL MIME HTTP-CODE SHA1 REDIR OFFSET FILE + private static String URL_SPEC = "Envelope.ARC-Header-Metadata.Target-URI|Envelope.WARC-Header-Metadata.Target-URI"; + private static String DATE_SPEC = "Envelope.ARC-Header-Metadata.Date"; + private static String MIME_SPEC = "Envelope.ARC-Header-Metadata.Content-Type"; + private static String HTTP_CODE_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Response-Message.Status"; + private static String SHA1_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Entity-Digest"; + private static String REDIR_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers.Content-Location"; + private static String OFFSET_SPEC = "Container.Offset"; + private static String FILENAME_SPEC = "Container.Filename"; + private static String SPECS[] = { + URL_SPEC, DATE_SPEC, URL_SPEC, MIME_SPEC, HTTP_CODE_SPEC, SHA1_SPEC, + REDIR_SPEC, OFFSET_SPEC, FILENAME_SPEC + }; + private static char EMPTY = '-'; + private static char 
DELIM = ' '; + JSONView view; + private PrintStream out; + public CDXExtractorOutput(PrintStream out) { + view = new JSONView(SPECS); + this.out = out; + } + public void output(Resource resource) throws IOException { + StreamCopy.readToEOF(resource.getInputStream()); + List<List<String>> res = view.apply(resource.getMetaData().getTopMetaData()); + StringBuilder sb = new StringBuilder(); + for(List<String> actual : res) { + sb.setLength(0); +// boolean first = true; + for(int i = 0; i < actual.size(); i++) { +// actual.set(5, actual.get(5).substring(5)); +// for(String f : actual) { + if(i > 0) { + sb.append(DELIM); + } + String f = actual.get(i); + if((f == null) || (f.length() == 0)) { + sb.append(EMPTY); + } else { + if(i == 5) { + sb.append(f.substring(5)); + } else { + sb.append(f); + } + } + } + out.println(sb.toString()); + } + } +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/DumpingExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/DumpingExtractorOutput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/DumpingExtractorOutput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,38 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintStream; +import java.util.logging.Logger; + +import org.archive.resource.Resource; +import org.archive.util.StreamCopy; +import org.json.JSONException; + +import com.google.common.io.CountingOutputStream; +import com.google.common.io.NullOutputStream; + +public class DumpingExtractorOutput implements ExtractorOutput { + private static final Logger LOG = + Logger.getLogger(DumpingExtractorOutput.class.getName()); + + private PrintStream out; + public DumpingExtractorOutput(OutputStream out) { + this.out = new PrintStream(out); + } + + public void output(Resource 
resource) throws IOException { + NullOutputStream nullo = new NullOutputStream(); + CountingOutputStream co = new CountingOutputStream(nullo); + StreamCopy.copy(resource.getInputStream(), co); + long bytes = co.getCount(); + if(bytes > 0) { + LOG.info(bytes + " unconsumed bytes in Resource InputStream."); + } + try { + out.println(resource.getMetaData().getTopMetaData().toString(1)); + } catch (JSONException e) { + LOG.warning(e.getMessage()); + } + } +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,215 @@ +package org.archive.extract; + +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.format.arc.ARCConstants; +import org.archive.format.warc.WARCConstants; +import org.archive.resource.MetaData; +import org.archive.resource.Resource; +import org.archive.resource.ResourceFactory; +import org.archive.resource.arc.ARCResource; +import org.archive.resource.arc.record.FiledescResourceFactory; +import org.archive.resource.html.HTMLResourceFactory; +import org.archive.resource.http.HTTPHeadersResourceFactory; +import org.archive.resource.http.HTTPRequestResourceFactory; +import org.archive.resource.http.HTTPResponseResource; +import org.archive.resource.http.HTTPResponseResourceFactory; +import org.archive.resource.warc.WARCResource; +import org.archive.resource.warc.record.DNSResourceFactory; +import org.archive.resource.warc.record.WARCJSONMetaDataResourceFactory; +import org.archive.resource.warc.record.WARCMetaDataResourceFactory; +import org.json.JSONException; +import org.json.JSONObject; + +public 
class ExtractingResourceFactoryMapper implements ResourceFactoryMapper { + + private static final Logger LOG = + Logger.getLogger(ExtractingResourceFactoryMapper.class.getName()); + + private HTTPResponseResourceFactory httpResponseF = + new HTTPResponseResourceFactory(); + + private HTTPRequestResourceFactory httpRequestF = + new HTTPRequestResourceFactory(); + + private HTMLResourceFactory htmlF = new HTMLResourceFactory(); + + private HTTPHeadersResourceFactory warcinfoF = + new HTTPHeadersResourceFactory(WARCINFO_METADATA,PAYLOAD_TYPE_WARCINFO); + + private DNSResourceFactory dnsF = new DNSResourceFactory(); + + private WARCMetaDataResourceFactory warcmetaF = + new WARCMetaDataResourceFactory(); + + private WARCJSONMetaDataResourceFactory warcjsonF = + new WARCJSONMetaDataResourceFactory(); + + private FiledescResourceFactory filedescF = + new FiledescResourceFactory(); + + private String getChildField(MetaData m, String child, String key) { + try { + if(m.has(child)) { + JSONObject c = m.getJSONObject(child); + if(c.has(key)) { + return c.getString(key); + } + } + } catch (JSONException e) { + LOG.warning(e.getMessage()); + } + return null; + } + + private boolean childFieldStartsWith(MetaData m, String child, + String key, String search) { + String val = getChildField(m,child,key); + return val == null ? false : + val.toLowerCase().startsWith(search.toLowerCase()); + } + + private boolean childFieldContains(MetaData m, String child, + String key, String search) { + String val = getChildField(m,child,key); + return val == null ? false : + val.toLowerCase().contains(search.toLowerCase()); + } + + private boolean childFieldEquals(MetaData m, String child, + String key, String search) { + String val = getChildField(m,child,key); + return val == null ? 
false : + val.equals(search); + } + + private String caseInsensitiveKeyScan(MetaData m, String child, String k) { + try { + if(m.has(child)) { + String kLC = k.toLowerCase(); + JSONObject childJSObj = m.getJSONObject(child); + @SuppressWarnings("rawtypes") + Iterator i = childJSObj.keys(); + while(i.hasNext()) { + Object kObj = i.next(); + if(kObj instanceof String) { + String kString = (String) kObj; + if(kString.toLowerCase().equals(kLC)) { + return childJSObj.getString(kString); + } + } + } + } + } catch (JSONException e) { + LOG.warning(e.getMessage()); + } + return null; + } + + private boolean isFileDescARCResource(MetaData envelope) { + return childFieldStartsWith(envelope, ARC_HEADER_METADATA, + ARCConstants.URL_KEY, ARCConstants.FILEDESC_SCHEME); + } + private boolean isDNSARCResource(MetaData envelope) { + return childFieldContains(envelope, ARC_HEADER_METADATA, + ARCConstants.MIME_KEY, ARCConstants.DNS_MIME); + } + private boolean isDATARCResource(MetaData envelope) { + return childFieldContains(envelope, ARC_HEADER_METADATA, + ARCConstants.MIME_KEY, ARCConstants.ALEXA_DAT_MIME); + } + private boolean isHTTPARCResource(MetaData envelope) { + return childFieldStartsWith(envelope, ARC_HEADER_METADATA, + ARCConstants.URL_KEY, "http"); + } + + private boolean isHTMLHttpResource(MetaData m) { + String type = caseInsensitiveKeyScan(m,HTTP_HEADERS_LIST, + "Content-Type"); + return type == null ? 
false : type.toLowerCase().contains("html"); + } + + private boolean isWARCType(MetaData envelope, String type) { + return childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.HEADER_KEY_TYPE,type); + } + private boolean isWARCRevisitResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.REVISIT); + } + private boolean isWARCResponseResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.RESPONSE); + } + private boolean isWARCRequestResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.REQUEST); + } + private boolean isWARCMetaDataResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.METADATA); + } + private boolean isWARCInfoResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.WARCINFO); + } + private boolean isHTTPResponseWARCResource(MetaData envelope) { + return childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE, + WARCConstants.HTTP_RESPONSE_MIMETYPE); + } + private boolean isWARCJSONResource(MetaData envelope) { + return childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE, + "application/json"); + } + private boolean isDNSResponseWARCResource(MetaData envelope) { + return childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE,PAYLOAD_TYPE_DNS); + } + + public ResourceFactory mapResourceToFactory(Resource resource) { + + if(resource instanceof WARCResource) { + WARCResource wr = (WARCResource) resource; + MetaData envelope = wr.getEnvelopeMetaData(); + if(isWARCMetaDataResource(envelope)) { + if(isWARCJSONResource(envelope)) { + return warcjsonF; + } else { + return warcmetaF; + } + } else if(isWARCRequestResource(envelope)) { + return httpRequestF; + } else if(isWARCInfoResource(envelope)) { + return warcinfoF; + } else if(isWARCResponseResource(envelope)) { + if(isHTTPResponseWARCResource(envelope)) { + return httpResponseF; + } else if(isDNSResponseWARCResource(envelope)) { + 
return dnsF; + } + } else if(isWARCRevisitResource(envelope)) { + return httpResponseF; + } + } else if(resource instanceof ARCResource) { + ARCResource ar = (ARCResource) resource; + MetaData envelope = ar.getEnvelopeMetaData(); + if(isFileDescARCResource( envelope)) { + return filedescF; + } else if(isDNSARCResource(envelope)) { + return dnsF; + } else if(isDATARCResource(envelope)) { + // TODO: + } else if(isHTTPARCResource(envelope)) { + return httpResponseF; + } else { + // TODO: ftp? what else? + } + + } else if(resource instanceof HTTPResponseResource) { + if(isHTMLHttpResource(resource.getMetaData())) { + return htmlF; + } else { + // TODO: more formats... + } + } + return null; + } +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceProducer.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceProducer.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceProducer.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,53 @@ +package org.archive.extract; + +import java.io.IOException; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.resource.Resource; +import org.archive.resource.ResourceFactory; +import org.archive.resource.ResourceParseException; +import org.archive.resource.ResourceProducer; + +public class ExtractingResourceProducer implements ResourceProducer { + private static final Logger LOG = + Logger.getLogger(ExtractingResourceProducer.class.getName()); + private ResourceProducer producer; + private ResourceFactoryMapper mapper; + + public ExtractingResourceProducer(ResourceProducer producer, + ResourceFactoryMapper mapper) { + + this.producer = producer; + this.mapper = mapper; + } + + public Resource getNext() throws ResourceParseException, IOException { + Resource current = 
producer.getNext(); + if(current == null) { + return null; + } + while(true) { + ResourceFactory f = mapper.mapResourceToFactory(current); + if(f == null) { + return current; + } + if(LOG.isLoggable(Level.INFO)) { + LOG.info(String.format("Extracting (%s) with (%s)\n", + current.getClass().toString(), + f.getClass().toString())); + } + current = f.getResource(current.getInputStream(), + current.getMetaData(), current.getContainer()); + } + } + + public void close() throws IOException { + producer.close(); + } + + public String getContext() { + return producer.getContext(); + } + +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractorOutput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractorOutput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,9 @@ +package org.archive.extract; + +import java.io.IOException; + +import org.archive.resource.Resource; + +public interface ExtractorOutput { + public void output(Resource resource) throws IOException; +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/FilteredExtractorOuput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/FilteredExtractorOuput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/FilteredExtractorOuput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,34 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.List; + +import org.archive.format.json.JSONUtils; +import org.archive.resource.Resource; +import org.archive.util.StreamCopy; + +public class FilteredExtractorOuput implements 
ExtractorOutput { + private String filterPath; + private PrintStream out; + public FilteredExtractorOuput(PrintStream out, String filterPath) { + this.filterPath = filterPath; + this.out = out; + } + public void output(Resource resource) throws IOException { + StreamCopy.readToEOF(resource.getInputStream()); + List<String> results = JSONUtils.extractFancy(resource.getMetaData().getTopMetaData(), filterPath); + if(results != null) { + for(String result: results) { + out.println("Result: " + result); + } + } + } + public void output2(Resource resource) throws IOException { + String result = JSONUtils.extractSingle(resource.getMetaData().getTopMetaData(), filterPath); + if(result != null) { + out.println("Result:" + result); + } + } + +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/JSONViewExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/JSONViewExtractorOutput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/JSONViewExtractorOutput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,30 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintStream; +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.archive.format.json.JSONView; +import org.archive.resource.Resource; +import org.archive.util.StreamCopy; + +public class JSONViewExtractorOutput implements ExtractorOutput { + private PrintStream out; + private JSONView view; + public JSONViewExtractorOutput(OutputStream out, String filterPath) { + view = new JSONView(filterPath.split(",")); + this.out = new PrintStream(out); + } + public void output(Resource resource) throws IOException { + StreamCopy.readToEOF(resource.getInputStream()); + List<List<String>> data = + view.apply(resource.getMetaData().getTopMetaData()); + 
if(data != null) { + for(List<String> d : data) { + out.println(StringUtils.join(d,"\t")); + } + } + } +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ProducerUtils.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ProducerUtils.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ProducerUtils.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,86 @@ +package org.archive.extract; + +import java.io.File; +import java.io.IOException; +import java.net.URL; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.archive.resource.ResourceProducer; +import org.archive.resource.producer.ARCFile; +import org.archive.resource.producer.EnvelopedResourceFile; +import org.archive.resource.producer.WARCFile; + +public class ProducerUtils { + public static boolean STRICT_GZ = false; + + public static ResourceProducer getProducer(String path) throws IOException { + return getProducer(path,0); + } + public static ResourceProducer getProducer(String path, long offset) throws IOException { + ResourceProducer producer = null; + EnvelopedResourceFile ef = new EnvelopedResourceFile(null); + ef.setStrict(STRICT_GZ); + ARCFile af = new ARCFile(); + af.setStrict(STRICT_GZ); + WARCFile wf = new WARCFile(); + wf.setStrict(STRICT_GZ); + File file = new File(path); + + if(path.startsWith("hdfs://")) { + String name = file.getName(); + Path fsPath = new Path(path); + FileSystem fs = fsPath.getFileSystem(new Configuration()); + FSDataInputStream fsdis = fs.open(fsPath); + + if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) { + producer = wf.getGZResourceProducer(fsdis,name,offset); + } else if(path.endsWith(".arc.gz")) { + producer = 
af.getGZResourceProducer(fsdis,name,offset); + } else if(path.endsWith(".arc")) { + producer = af.getResourceProducer(fsdis,name,offset); + } else if(path.endsWith(".warc") || path.endsWith(".wat")) { + producer = wf.getResourceProducer(fsdis,name,offset); + } else if(path.endsWith(".gz")) { + producer = ef.getGZResourceProducer(fsdis,name,offset); + } + + } else if(path.startsWith("http://")) { + String name = file.getName(); + URL url = new URL(path); + + if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) { + producer = wf.getGZResourceProducer(url,name,offset); + } else if(path.endsWith(".arc.gz")) { + producer = af.getGZResourceProducer(url,name,offset); + } else if(path.endsWith(".arc")) { + producer = af.getResourceProducer(url,name,offset); + } else if(path.endsWith(".warc") || path.endsWith(".wat")) { + producer = wf.getResourceProducer(url,name,offset); + } else if(path.endsWith(".gz")) { + producer = ef.getGZResourceProducer(url,name,offset); + } + + } else { + + if(!(file.exists() && file.canRead())) { + System.err.println(path + " is not a readable file."); + return null; + } + if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) { + producer = wf.getGZResourceProducer(file,offset); + } else if(path.endsWith(".arc.gz")) { + producer = af.getGZResourceProducer(file,offset); + } else if(pa... [truncated message content] |