You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2011-11-18 23:15:53
|
Revision: 3565 http://archive-access.svn.sourceforge.net/archive-access/?rev=3565&view=rev Author: bradtofel Date: 2011-11-18 23:15:42 +0000 (Fri, 18 Nov 2011) Log Message: ----------- INITIAL REV - common code for GZIP, ARC, WARC, HTTP, HTML Parsing, JSON, URL canonicalization,... Added Paths: ----------- trunk/archive-access/projects/archive-commons/.classpath trunk/archive-access/projects/archive-commons/.project trunk/archive-access/projects/archive-commons/.settings/ trunk/archive-access/projects/archive-commons/.settings/org.eclipse.jdt.core.prefs trunk/archive-access/projects/archive-commons/pom.xml trunk/archive-access/projects/archive-commons/src/ trunk/archive-access/projects/archive-commons/src/main/ trunk/archive-access/projects/archive-commons/src/main/java/ trunk/archive-access/projects/archive-commons/src/main/java/org/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/RecoverableRecordFormatException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/CDXExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/DumpingExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceProducer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/FilteredExtractorOuput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/JSONViewExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ProducerUtils.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceFactoryMapper.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/WATExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ARCConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ARCFormatException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ARCMetaData.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/ARCMetaDataParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/FiledescRecord.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/arc/FiledescRecordParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/DNSParseException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/DNSRecord.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/DNSResponse.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/dns/DNSResponseParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPDecoder.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPFExtraRecord.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPFExtraRecords.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPFooter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPFormatException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPHeader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPMemberSeries.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPMemberWriter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPMemberWriterCommittedOutputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPSeriesMember.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/GZIPStaticHeader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/zipnum/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpHeader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpHeaderObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpHeaderParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpHeaders.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpMessage.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpMessageParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpParseException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequest.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequestMessage.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequestMessageObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequestMessageParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpRequestParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponse.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponseMessage.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponseMessageObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponseMessageParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/http/HttpResponseParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/CompoundORJSONPathSpec.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/CrossProductOfLists.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/JSONPathSpec.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/JSONPathSpecFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/JSONUtils.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/JSONView.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/json/SimpleJSONPathSpec.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/charset/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/charset/CharsetDetector.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/charset/RotatingCharsetDetector.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/charset/StandardCharsetDetector.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/CDATALexer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/LexParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/NodeUtils.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/text/html/ParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/warc/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/warc/WARCConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/format/warc/WARCRecordWriter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ArchiveJSONViewLoader.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ArchiveMetadataLoader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ResourceContext.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ResourceInputFormat.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/ResourceRecordReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/func/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/func/JSONViewEvalFunc.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/func/TupleFunc.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/func/URLResolverFunc.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/AbstractEmptyResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/AbstractResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/MetaData.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/MetaDataConstants.java-normal trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/Resource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceContainer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceParseException.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/ResourceProducer.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/TransformingResourceProducer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/ARCResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/ARCResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/record/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/record/FiledescResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/arc/record/FiledescResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/generic/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/generic/GenericResourceProducer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/generic/GenericStreamResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/gzip/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/gzip/GZIPMetaData.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/gzip/GZIPResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/gzip/GZIPResourceContainer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLMetaData.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLResourceFactory.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPHeadersResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPHeadersResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPRequestResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPRequestResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/producer/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/producer/ARCFile.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/producer/EnvelopedResourceFile.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/producer/WARCFile.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/WARCResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/WARCResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResource.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/WARCMetaDataResource.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/WARCMetaDataResourceFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/AbstractBufferingStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/ByteArrayWrappedStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/HDFSStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/HTTP11Stream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/RandomAccessFileStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/SimpleStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/Stream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/streamcontext/StreamWrappedInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/CanonicalizeRules.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/CanonicalizerConstants.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/HandyURL.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/IAURLCanonicalizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/SURT.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/SURTTokenizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLCanonicalizer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLKeyMaker.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLParser.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLRegexTransformer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/WaybackURLKeyMaker.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/Base32.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/ByteOp.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/CrossProduct.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/DateUtils.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/FileNameSpec.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/IAUtils.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/NestedMap.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/StreamCopy.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/StringFieldExtractor.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/StringParse.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/FileSearchTool.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/SeekableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/SeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/SortedTextFile.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/HDFSSeekableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/HDFSSeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/HTTPSeakableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/HTTPSeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/NIOSeekableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/NIOSeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/RandomAccessFileSeekableLineReader.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/binsearch/impl/RandomAccessFileSeekableLineReaderFactory.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/BytesReadObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/CRCInputStream.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/CRCOutputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/CommitedOutputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/EOFNotifyingInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/EOFObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/MultiMemberOpenJDKGZIPInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/NotifyingInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/io/PushBackOneByteInputStream.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/AbstractPeekableIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/BoundedStringIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/CachingStringFilter.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/CloseableIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/CloseableIteratorUtil.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/FilterStringIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/PeekableIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/SortedCompositeIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/StartBoundedStringIterator.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/StringFilter.java 
trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/StringTransformer.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/util/iterator/TransformingPrefixStringFilter.java trunk/archive-access/projects/archive-commons/src/test/ trunk/archive-access/projects/archive-commons/src/test/java/ trunk/archive-access/projects/archive-commons/src/test/java/org/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/dns/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/dns/DNSResponseParserTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/GZIPMemberSeriesTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/zipnum/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/gzip/zipnum/ZipNumWriterTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/http/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/http/HttpRequestMessageParserTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/http/HttpResponseParserTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/JSONPathSpecFactoryTest.java 
trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/JSONViewTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/text/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/text/html/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/format/text/html/CDATALexerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/arc/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/html/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/html/HTMLMetaDataTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/warc/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/DefaultIAURLCanonicalizerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/GoogleURLCanonicalizerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/HandyURLTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/IAURLCanonicalizerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/URLParserTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/URLRegexTransformerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/ByteOpTest.java 
trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/CrossProductTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/StringFieldExtractorTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/TestUtils.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/binsearch/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/binsearch/SortedTextFileTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/iterator/ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/iterator/CachingStringFilterTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java trunk/archive-access/projects/archive-commons/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java trunk/archive-access/projects/archive-commons/src/test/resources/ trunk/archive-access/projects/archive-commons/src/test/resources/org/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/container/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/ trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/abcd.gz trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/double-single-inflate-error.gz trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/empty.gz trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/hi-2.gz trunk/archive-access/projects/archive-commons/src/test/resources/org/archive/format/gzip/hi.gz Added: trunk/archive-access/projects/archive-commons/.classpath 
=================================================================== --- trunk/archive-access/projects/archive-commons/.classpath (rev 0) +++ trunk/archive-access/projects/archive-commons/.classpath 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,9 @@ +<?xml version="1.0" encoding="UTF-8"?> +<classpath> + <classpathentry kind="src" output="target/classes" path="src/main/java"/> + <classpathentry kind="src" output="target/test-classes" path="src/test/java"/> + <classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources"/> + <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/J2SE-1.5"/> + <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/> + <classpathentry kind="output" path="target/classes"/> +</classpath> Added: trunk/archive-access/projects/archive-commons/.project =================================================================== --- trunk/archive-access/projects/archive-commons/.project (rev 0) +++ trunk/archive-access/projects/archive-commons/.project 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>archive-commons</name> + <comment>NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse.</comment> + <projects> + <project>archive-surt</project> + </projects> + <buildSpec> + <buildCommand> + <name>org.eclipse.jdt.core.javabuilder</name> + <arguments> + </arguments> + </buildCommand> + <buildCommand> + <name>org.maven.ide.eclipse.maven2Builder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.maven.ide.eclipse.maven2Nature</nature> + <nature>org.eclipse.jdt.core.javanature</nature> + </natures> +</projectDescription> Added: trunk/archive-access/projects/archive-commons/.settings/org.eclipse.jdt.core.prefs 
=================================================================== --- trunk/archive-access/projects/archive-commons/.settings/org.eclipse.jdt.core.prefs (rev 0) +++ trunk/archive-access/projects/archive-commons/.settings/org.eclipse.jdt.core.prefs 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,5 @@ +#Thu Nov 17 17:49:12 PST 2011 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.source=1.5 +org.eclipse.jdt.core.compiler.compliance=1.5 Added: trunk/archive-access/projects/archive-commons/pom.xml =================================================================== --- trunk/archive-access/projects/archive-commons/pom.xml (rev 0) +++ trunk/archive-access/projects/archive-commons/pom.xml 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,183 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.archive</groupId> + <artifactId>archive-commons</artifactId> + <version>0.0.1-SNAPSHOT</version> + <packaging>jar</packaging> + + <name>archive-commons</name> + <url>http://maven.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>3.8.1</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>r08</version> + </dependency> + + <dependency> + <groupId>org.json</groupId> + <artifactId>json</artifactId> + <version>20090211</version> + </dependency> + <dependency> + <groupId>org.htmlparser</groupId> + <artifactId>htmlparser</artifactId> + <version>1.6</version> + </dependency> + + <dependency> + <groupId>org.mozilla</groupId> + 
<artifactId>juniversalchardet</artifactId> + <version>1.0.3</version> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-core</artifactId> + <version>0.20.2</version> + <exclusions> + <exclusion> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.apache.pig</groupId> + <artifactId>pig</artifactId> + <version>0.8.0</version> + <scope>provided</scope> + </dependency> + + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + <version>2.5</version> + </dependency> + + <dependency> + <groupId>org.archive</groupId> + <artifactId>archive-surt</artifactId> + <version>1.0-SNAPSHOT</version> + <exclusions> + <exclusion> + <groupId>org.archive.heritrix</groupId> + <artifactId>heritrix-commons</artifactId> + </exclusion> + </exclusions> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-io</artifactId> + <version>1.3.2</version> + </dependency> + + <dependency> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + <version>3.1</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.3.2</version> + <configuration> + <source>1.5</source> + <target>1.5</target> + </configuration> + </plugin> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <version>2.2-beta-1</version> + <configuration> + <descriptorRefs> + <descriptorRef>jar-with-dependencies</descriptorRef> + </descriptorRefs> + <finalName>archive-commons</finalName> + </configuration> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>attached</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + 
<artifactId>maven-antrun-plugin</artifactId> + <executions> + <execution> + <phase>generate-resources</phase> + <goals> + <goal>run</goal> + </goals> + <configuration> + <tasks> + <!-- Safety --> + <mkdir dir="${project.build.directory}"/> + + <tstamp> + <format property="last.updated" pattern="yyyyMMddhhmmss"/> + </tstamp> + <echo file="${basedir}/target/filter.properties" message="build.time=${last.updated}"/> + </tasks> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + <resources> + <resource> + <directory>src/main/resources</directory> + <filtering>true</filtering> + </resource> + </resources> + <filters> + <filter>${basedir}/target/filter.properties</filter> + </filters> + + </build> + <repositories> + <repository> + <id>internetarchive</id> + <name>Internet Archive Maven Repository</name> + <url>http://builds.archive.org:8080/maven2</url> + <layout>default</layout> + + <releases> + <enabled>true</enabled> + <updatePolicy>daily</updatePolicy> + <checksumPolicy>warn</checksumPolicy> + </releases> + <snapshots> + <enabled>true</enabled> + <updatePolicy>daily</updatePolicy> + <checksumPolicy>warn</checksumPolicy> + </snapshots> + </repository> + </repositories> + + +</project> Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/RecoverableRecordFormatException.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/RecoverableRecordFormatException.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/RecoverableRecordFormatException.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,24 @@ +package org.archive; + +import java.io.IOException; + +public class RecoverableRecordFormatException extends IOException { + + /** + * + */ + private static final long serialVersionUID = 2775048979983919630L; + public RecoverableRecordFormatException() { + super(); + } + public 
RecoverableRecordFormatException(String message) { + super(message); + } + public RecoverableRecordFormatException(Exception e) { + super(e); + } + public RecoverableRecordFormatException(String message, IOException e) { + super(message,e); + } + +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/CDXExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/CDXExtractorOutput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/CDXExtractorOutput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,60 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.List; + +import org.archive.format.json.JSONView; +import org.archive.resource.Resource; +import org.archive.util.StreamCopy; + +public class CDXExtractorOutput implements ExtractorOutput { + // CANON DATE URL MIME HTTP-CODE SHA1 REDIR OFFSET FILE + private static String URL_SPEC = "Envelope.ARC-Header-Metadata.Target-URI|Envelope.WARC-Header-Metadata.Target-URI"; + private static String DATE_SPEC = "Envelope.ARC-Header-Metadata.Date"; + private static String MIME_SPEC = "Envelope.ARC-Header-Metadata.Content-Type"; + private static String HTTP_CODE_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Response-Message.Status"; + private static String SHA1_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Entity-Digest"; + private static String REDIR_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers.Content-Location"; + private static String OFFSET_SPEC = "Container.Offset"; + private static String FILENAME_SPEC = "Container.Filename"; + private static String SPECS[] = { + URL_SPEC, DATE_SPEC, URL_SPEC, MIME_SPEC, HTTP_CODE_SPEC, SHA1_SPEC, + REDIR_SPEC, OFFSET_SPEC, FILENAME_SPEC + }; + private static char EMPTY = '-'; + private static char 
DELIM = ' '; + JSONView view; + private PrintStream out; + public CDXExtractorOutput(PrintStream out) { + view = new JSONView(SPECS); + this.out = out; + } + public void output(Resource resource) throws IOException { + StreamCopy.readToEOF(resource.getInputStream()); + List<List<String>> res = view.apply(resource.getMetaData().getTopMetaData()); + StringBuilder sb = new StringBuilder(); + for(List<String> actual : res) { + sb.setLength(0); +// boolean first = true; + for(int i = 0; i < actual.size(); i++) { +// actual.set(5, actual.get(5).substring(5)); +// for(String f : actual) { + if(i > 0) { + sb.append(DELIM); + } + String f = actual.get(i); + if((f == null) || (f.length() == 0)) { + sb.append(EMPTY); + } else { + if(i == 5) { + sb.append(f.substring(5)); + } else { + sb.append(f); + } + } + } + out.println(sb.toString()); + } + } +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/DumpingExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/DumpingExtractorOutput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/DumpingExtractorOutput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,38 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintStream; +import java.util.logging.Logger; + +import org.archive.resource.Resource; +import org.archive.util.StreamCopy; +import org.json.JSONException; + +import com.google.common.io.CountingOutputStream; +import com.google.common.io.NullOutputStream; + +public class DumpingExtractorOutput implements ExtractorOutput { + private static final Logger LOG = + Logger.getLogger(DumpingExtractorOutput.class.getName()); + + private PrintStream out; + public DumpingExtractorOutput(OutputStream out) { + this.out = new PrintStream(out); + } + + public void output(Resource 
resource) throws IOException { + NullOutputStream nullo = new NullOutputStream(); + CountingOutputStream co = new CountingOutputStream(nullo); + StreamCopy.copy(resource.getInputStream(), co); + long bytes = co.getCount(); + if(bytes > 0) { + LOG.info(bytes + " unconsumed bytes in Resource InputStream."); + } + try { + out.println(resource.getMetaData().getTopMetaData().toString(1)); + } catch (JSONException e) { + LOG.warning(e.getMessage()); + } + } +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,215 @@ +package org.archive.extract; + +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.format.arc.ARCConstants; +import org.archive.format.warc.WARCConstants; +import org.archive.resource.MetaData; +import org.archive.resource.Resource; +import org.archive.resource.ResourceFactory; +import org.archive.resource.arc.ARCResource; +import org.archive.resource.arc.record.FiledescResourceFactory; +import org.archive.resource.html.HTMLResourceFactory; +import org.archive.resource.http.HTTPHeadersResourceFactory; +import org.archive.resource.http.HTTPRequestResourceFactory; +import org.archive.resource.http.HTTPResponseResource; +import org.archive.resource.http.HTTPResponseResourceFactory; +import org.archive.resource.warc.WARCResource; +import org.archive.resource.warc.record.DNSResourceFactory; +import org.archive.resource.warc.record.WARCJSONMetaDataResourceFactory; +import org.archive.resource.warc.record.WARCMetaDataResourceFactory; +import org.json.JSONException; +import org.json.JSONObject; + +public 
class ExtractingResourceFactoryMapper implements ResourceFactoryMapper { + + private static final Logger LOG = + Logger.getLogger(ExtractingResourceFactoryMapper.class.getName()); + + private HTTPResponseResourceFactory httpResponseF = + new HTTPResponseResourceFactory(); + + private HTTPRequestResourceFactory httpRequestF = + new HTTPRequestResourceFactory(); + + private HTMLResourceFactory htmlF = new HTMLResourceFactory(); + + private HTTPHeadersResourceFactory warcinfoF = + new HTTPHeadersResourceFactory(WARCINFO_METADATA,PAYLOAD_TYPE_WARCINFO); + + private DNSResourceFactory dnsF = new DNSResourceFactory(); + + private WARCMetaDataResourceFactory warcmetaF = + new WARCMetaDataResourceFactory(); + + private WARCJSONMetaDataResourceFactory warcjsonF = + new WARCJSONMetaDataResourceFactory(); + + private FiledescResourceFactory filedescF = + new FiledescResourceFactory(); + + private String getChildField(MetaData m, String child, String key) { + try { + if(m.has(child)) { + JSONObject c = m.getJSONObject(child); + if(c.has(key)) { + return c.getString(key); + } + } + } catch (JSONException e) { + LOG.warning(e.getMessage()); + } + return null; + } + + private boolean childFieldStartsWith(MetaData m, String child, + String key, String search) { + String val = getChildField(m,child,key); + return val == null ? false : + val.toLowerCase().startsWith(search.toLowerCase()); + } + + private boolean childFieldContains(MetaData m, String child, + String key, String search) { + String val = getChildField(m,child,key); + return val == null ? false : + val.toLowerCase().contains(search.toLowerCase()); + } + + private boolean childFieldEquals(MetaData m, String child, + String key, String search) { + String val = getChildField(m,child,key); + return val == null ? 
false : + val.equals(search); + } + + private String caseInsensitiveKeyScan(MetaData m, String child, String k) { + try { + if(m.has(child)) { + String kLC = k.toLowerCase(); + JSONObject childJSObj = m.getJSONObject(child); + @SuppressWarnings("rawtypes") + Iterator i = childJSObj.keys(); + while(i.hasNext()) { + Object kObj = i.next(); + if(kObj instanceof String) { + String kString = (String) kObj; + if(kString.toLowerCase().equals(kLC)) { + return childJSObj.getString(kString); + } + } + } + } + } catch (JSONException e) { + LOG.warning(e.getMessage()); + } + return null; + } + + private boolean isFileDescARCResource(MetaData envelope) { + return childFieldStartsWith(envelope, ARC_HEADER_METADATA, + ARCConstants.URL_KEY, ARCConstants.FILEDESC_SCHEME); + } + private boolean isDNSARCResource(MetaData envelope) { + return childFieldContains(envelope, ARC_HEADER_METADATA, + ARCConstants.MIME_KEY, ARCConstants.DNS_MIME); + } + private boolean isDATARCResource(MetaData envelope) { + return childFieldContains(envelope, ARC_HEADER_METADATA, + ARCConstants.MIME_KEY, ARCConstants.ALEXA_DAT_MIME); + } + private boolean isHTTPARCResource(MetaData envelope) { + return childFieldStartsWith(envelope, ARC_HEADER_METADATA, + ARCConstants.URL_KEY, "http"); + } + + private boolean isHTMLHttpResource(MetaData m) { + String type = caseInsensitiveKeyScan(m,HTTP_HEADERS_LIST, + "Content-Type"); + return type == null ? 
false : type.toLowerCase().contains("html"); + } + + private boolean isWARCType(MetaData envelope, String type) { + return childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.HEADER_KEY_TYPE,type); + } + private boolean isWARCRevisitResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.REVISIT); + } + private boolean isWARCResponseResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.RESPONSE); + } + private boolean isWARCRequestResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.REQUEST); + } + private boolean isWARCMetaDataResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.METADATA); + } + private boolean isWARCInfoResource(MetaData envelope) { + return isWARCType(envelope, WARCConstants.WARCINFO); + } + private boolean isHTTPResponseWARCResource(MetaData envelope) { + return childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE, + WARCConstants.HTTP_RESPONSE_MIMETYPE); + } + private boolean isWARCJSONResource(MetaData envelope) { + return childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE, + "application/json"); + } + private boolean isDNSResponseWARCResource(MetaData envelope) { + return childFieldEquals(envelope,WARC_HEADER_METADATA, + WARCConstants.CONTENT_TYPE,PAYLOAD_TYPE_DNS); + } + + public ResourceFactory mapResourceToFactory(Resource resource) { + + if(resource instanceof WARCResource) { + WARCResource wr = (WARCResource) resource; + MetaData envelope = wr.getEnvelopeMetaData(); + if(isWARCMetaDataResource(envelope)) { + if(isWARCJSONResource(envelope)) { + return warcjsonF; + } else { + return warcmetaF; + } + } else if(isWARCRequestResource(envelope)) { + return httpRequestF; + } else if(isWARCInfoResource(envelope)) { + return warcinfoF; + } else if(isWARCResponseResource(envelope)) { + if(isHTTPResponseWARCResource(envelope)) { + return httpResponseF; + } else if(isDNSResponseWARCResource(envelope)) { + 
return dnsF; + } + } else if(isWARCRevisitResource(envelope)) { + return httpResponseF; + } + } else if(resource instanceof ARCResource) { + ARCResource ar = (ARCResource) resource; + MetaData envelope = ar.getEnvelopeMetaData(); + if(isFileDescARCResource( envelope)) { + return filedescF; + } else if(isDNSARCResource(envelope)) { + return dnsF; + } else if(isDATARCResource(envelope)) { + // TODO: + } else if(isHTTPARCResource(envelope)) { + return httpResponseF; + } else { + // TODO: ftp? what else? + } + + } else if(resource instanceof HTTPResponseResource) { + if(isHTMLHttpResource(resource.getMetaData())) { + return htmlF; + } else { + // TODO: more formats... + } + } + return null; + } +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceProducer.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceProducer.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractingResourceProducer.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,53 @@ +package org.archive.extract; + +import java.io.IOException; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.resource.Resource; +import org.archive.resource.ResourceFactory; +import org.archive.resource.ResourceParseException; +import org.archive.resource.ResourceProducer; + +public class ExtractingResourceProducer implements ResourceProducer { + private static final Logger LOG = + Logger.getLogger(ExtractingResourceProducer.class.getName()); + private ResourceProducer producer; + private ResourceFactoryMapper mapper; + + public ExtractingResourceProducer(ResourceProducer producer, + ResourceFactoryMapper mapper) { + + this.producer = producer; + this.mapper = mapper; + } + + public Resource getNext() throws ResourceParseException, IOException { + Resource current = 
producer.getNext(); + if(current == null) { + return null; + } + while(true) { + ResourceFactory f = mapper.mapResourceToFactory(current); + if(f == null) { + return current; + } + if(LOG.isLoggable(Level.INFO)) { + LOG.info(String.format("Extracting (%s) with (%s)\n", + current.getClass().toString(), + f.getClass().toString())); + } + current = f.getResource(current.getInputStream(), + current.getMetaData(), current.getContainer()); + } + } + + public void close() throws IOException { + producer.close(); + } + + public String getContext() { + return producer.getContext(); + } + +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractorOutput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ExtractorOutput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,9 @@ +package org.archive.extract; + +import java.io.IOException; + +import org.archive.resource.Resource; + +public interface ExtractorOutput { + public void output(Resource resource) throws IOException; +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/FilteredExtractorOuput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/FilteredExtractorOuput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/FilteredExtractorOuput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,34 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.List; + +import org.archive.format.json.JSONUtils; +import org.archive.resource.Resource; +import org.archive.util.StreamCopy; + +public class FilteredExtractorOuput implements 
ExtractorOutput { + private String filterPath; + private PrintStream out; + public FilteredExtractorOuput(PrintStream out, String filterPath) { + this.filterPath = filterPath; + this.out = out; + } + public void output(Resource resource) throws IOException { + StreamCopy.readToEOF(resource.getInputStream()); + List<String> results = JSONUtils.extractFancy(resource.getMetaData().getTopMetaData(), filterPath); + if(results != null) { + for(String result: results) { + out.println("Result: " + result); + } + } + } + public void output2(Resource resource) throws IOException { + String result = JSONUtils.extractSingle(resource.getMetaData().getTopMetaData(), filterPath); + if(result != null) { + out.println("Result:" + result); + } + } + +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/JSONViewExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/JSONViewExtractorOutput.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/JSONViewExtractorOutput.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,30 @@ +package org.archive.extract; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintStream; +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.archive.format.json.JSONView; +import org.archive.resource.Resource; +import org.archive.util.StreamCopy; + +public class JSONViewExtractorOutput implements ExtractorOutput { + private PrintStream out; + private JSONView view; + public JSONViewExtractorOutput(OutputStream out, String filterPath) { + view = new JSONView(filterPath.split(",")); + this.out = new PrintStream(out); + } + public void output(Resource resource) throws IOException { + StreamCopy.readToEOF(resource.getInputStream()); + List<List<String>> data = + view.apply(resource.getMetaData().getTopMetaData()); + 
if(data != null) { + for(List<String> d : data) { + out.println(StringUtils.join(d,"\t")); + } + } + } +} Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ProducerUtils.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ProducerUtils.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ProducerUtils.java 2011-11-18 23:15:42 UTC (rev 3565) @@ -0,0 +1,86 @@ +package org.archive.extract; + +import java.io.File; +import java.io.IOException; +import java.net.URL; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.archive.resource.ResourceProducer; +import org.archive.resource.producer.ARCFile; +import org.archive.resource.producer.EnvelopedResourceFile; +import org.archive.resource.producer.WARCFile; + +public class ProducerUtils { + public static boolean STRICT_GZ = false; + + public static ResourceProducer getProducer(String path) throws IOException { + return getProducer(path,0); + } + public static ResourceProducer getProducer(String path, long offset) throws IOException { + ResourceProducer producer = null; + EnvelopedResourceFile ef = new EnvelopedResourceFile(null); + ef.setStrict(STRICT_GZ); + ARCFile af = new ARCFile(); + af.setStrict(STRICT_GZ); + WARCFile wf = new WARCFile(); + wf.setStrict(STRICT_GZ); + File file = new File(path); + + if(path.startsWith("hdfs://")) { + String name = file.getName(); + Path fsPath = new Path(path); + FileSystem fs = fsPath.getFileSystem(new Configuration()); + FSDataInputStream fsdis = fs.open(fsPath); + + if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) { + producer = wf.getGZResourceProducer(fsdis,name,offset); + } else if(path.endsWith(".arc.gz")) { + producer = 
af.getGZResourceProducer(fsdis,name,offset); + } else if(path.endsWith(".arc")) { + producer = af.getResourceProducer(fsdis,name,offset); + } else if(path.endsWith(".warc") || path.endsWith(".wat")) { + producer = wf.getResourceProducer(fsdis,name,offset); + } else if(path.endsWith(".gz")) { + producer = ef.getGZResourceProducer(fsdis,name,offset); + } + + } else if(path.startsWith("http://")) { + String name = file.getName(); + URL url = new URL(path); + + if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) { + producer = wf.getGZResourceProducer(url,name,offset); + } else if(path.endsWith(".arc.gz")) { + producer = af.getGZResourceProducer(url,name,offset); + } else if(path.endsWith(".arc")) { + producer = af.getResourceProducer(url,name,offset); + } else if(path.endsWith(".warc") || path.endsWith(".wat")) { + producer = wf.getResourceProducer(url,name,offset); + } else if(path.endsWith(".gz")) { + producer = ef.getGZResourceProducer(url,name,offset); + } + + } else { + + if(!(file.exists() && file.canRead())) { + System.err.println(path + " is not a readable file."); + return null; + } + if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) { + producer = wf.getGZResourceProducer(file,offset); + } else if(path.endsWith(".arc.gz")) { + producer = af.getGZResourceProducer(file,offset); + } else if(pa... [truncated message content] |
From: <bra...@us...> - 2011-11-18 23:07:05
|
Revision: 3564 http://archive-access.svn.sourceforge.net/archive-access/?rev=3564&view=rev Author: bradtofel Date: 2011-11-18 23:06:59 +0000 (Fri, 18 Nov 2011) Log Message: ----------- Added Paths: ----------- trunk/archive-access/projects/archive-commons/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-11-17 18:51:21
|
Revision: 3563 http://archive-access.svn.sourceforge.net/archive-access/?rev=3563&view=rev Author: bradtofel Date: 2011-11-17 18:51:13 +0000 (Thu, 17 Nov 2011) Log Message: ----------- Initial import - Maven project which uses JARJAR to create a minimal JAR containing H3 SURTTokenizer code, plus OpenJDK7GZipInputStream Added Paths: ----------- trunk/archive-access/projects/archive-surt/.classpath trunk/archive-access/projects/archive-surt/.project trunk/archive-access/projects/archive-surt/.settings/ trunk/archive-access/projects/archive-surt/.settings/org.eclipse.jdt.core.prefs trunk/archive-access/projects/archive-surt/.settings/org.maven.ide.eclipse.prefs trunk/archive-access/projects/archive-surt/pom.xml trunk/archive-access/projects/archive-surt/src/ trunk/archive-access/projects/archive-surt/src/main/ trunk/archive-access/projects/archive-surt/src/main/java/ trunk/archive-access/projects/archive-surt/src/main/java/org/ trunk/archive-access/projects/archive-surt/src/main/java/org/archive/ trunk/archive-access/projects/archive-surt/src/test/ trunk/archive-access/projects/archive-surt/src/test/java/ trunk/archive-access/projects/archive-surt/src/test/java/org/ trunk/archive-access/projects/archive-surt/src/test/java/org/archive/ Property Changed: ---------------- trunk/archive-access/projects/archive-surt/ Property changes on: trunk/archive-access/projects/archive-surt ___________________________________________________________________ Added: svn:ignore + target Added: trunk/archive-access/projects/archive-surt/.classpath =================================================================== --- trunk/archive-access/projects/archive-surt/.classpath (rev 0) +++ trunk/archive-access/projects/archive-surt/.classpath 2011-11-17 18:51:13 UTC (rev 3563) @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<classpath> + <classpathentry kind="src" output="target/classes" path="src/main/java"/> + <classpathentry kind="src" output="target/test-classes" 
path="src/test/java"/> + <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/J2SE-1.5"/> + <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"/> + <classpathentry kind="output" path="target/classes"/> +</classpath> Added: trunk/archive-access/projects/archive-surt/.project =================================================================== --- trunk/archive-access/projects/archive-surt/.project (rev 0) +++ trunk/archive-access/projects/archive-surt/.project 2011-11-17 18:51:13 UTC (rev 3563) @@ -0,0 +1,23 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>archive-surt</name> + <comment>NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse.</comment> + <projects> + </projects> + <buildSpec> + <buildCommand> + <name>org.eclipse.jdt.core.javabuilder</name> + <arguments> + </arguments> + </buildCommand> + <buildCommand> + <name>org.maven.ide.eclipse.maven2Builder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.maven.ide.eclipse.maven2Nature</nature> + <nature>org.eclipse.jdt.core.javanature</nature> + </natures> +</projectDescription> Added: trunk/archive-access/projects/archive-surt/.settings/org.eclipse.jdt.core.prefs =================================================================== --- trunk/archive-access/projects/archive-surt/.settings/org.eclipse.jdt.core.prefs (rev 0) +++ trunk/archive-access/projects/archive-surt/.settings/org.eclipse.jdt.core.prefs 2011-11-17 18:51:13 UTC (rev 3563) @@ -0,0 +1,6 @@ +#Tue Nov 01 11:11:47 PDT 2011 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 +org.eclipse.jdt.core.compiler.compliance=1.5 +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.source=1.5 Added: 
trunk/archive-access/projects/archive-surt/.settings/org.maven.ide.eclipse.prefs =================================================================== --- trunk/archive-access/projects/archive-surt/.settings/org.maven.ide.eclipse.prefs (rev 0) +++ trunk/archive-access/projects/archive-surt/.settings/org.maven.ide.eclipse.prefs 2011-11-17 18:51:13 UTC (rev 3563) @@ -0,0 +1,9 @@ +#Tue Nov 01 11:11:46 PDT 2011 +activeProfiles= +eclipse.preferences.version=1 +fullBuildGoals=process-test-resources +includeModules=false +resolveWorkspaceProjects=true +resourceFilterGoals=process-resources resources\:testResources +skipCompilerPlugin=true +version=1 Added: trunk/archive-access/projects/archive-surt/pom.xml =================================================================== --- trunk/archive-access/projects/archive-surt/pom.xml (rev 0) +++ trunk/archive-access/projects/archive-surt/pom.xml 2011-11-17 18:51:13 UTC (rev 3563) @@ -0,0 +1,95 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.archive</groupId> + <artifactId>archive-surt</artifactId> + <version>1.0-SNAPSHOT</version> + <packaging>jar</packaging> + + <name>archive-surt</name> + <url>http://maven.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>org.archive.heritrix</groupId> + <artifactId>heritrix-commons</artifactId> + <version>3.1.0-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>3.8.1</version> + <scope>test</scope> + </dependency> + </dependencies> + <build> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.sonatype.plugins</groupId> + <artifactId>jarjar-maven-plugin</artifactId> + <version>1.5</version> + 
</plugin> + </plugins> + </pluginManagement> + <plugins> + <plugin> + <groupId>org.sonatype.plugins</groupId> + <artifactId>jarjar-maven-plugin</artifactId> + <version>1.5</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>jarjar</goal> + </goals> + <configuration> + + <rules> + <keep> + <pattern>org.archive.surt.SURTTokenizer</pattern> + </keep> + <keep> + <pattern>org.archive.util.zip.OpenJDK7GZIPInputStream</pattern> + </keep> + </rules> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.3.2</version> + <configuration> + <source>1.5</source> + <target>1.5</target> + </configuration> + </plugin> + </plugins> + </build> + <repositories> + <repository> + <id>internetarchive</id> + <name>Internet Archive Maven Repository</name> + <url>http://builds.archive.org:8080/maven2</url> + <layout>default</layout> + + <releases> + <enabled>true</enabled> + <updatePolicy>daily</updatePolicy> + <checksumPolicy>warn</checksumPolicy> + </releases> + <snapshots> + <enabled>true</enabled> + <updatePolicy>daily</updatePolicy> + <checksumPolicy>fail</checksumPolicy> + </snapshots> + </repository> + </repositories> + </project> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-11-17 18:49:20
|
Revision: 3562 http://archive-access.svn.sourceforge.net/archive-access/?rev=3562&view=rev Author: bradtofel Date: 2011-11-17 18:49:14 +0000 (Thu, 17 Nov 2011) Log Message: ----------- Initial import - Maven project which uses JARJAR to create a minimal JAR containing H3 SURTTokenizer code, plus OpenJDK7GZIPInputStream Added Paths: ----------- trunk/archive-access/projects/archive-surt/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3561 http://archive-access.svn.sourceforge.net/archive-access/?rev=3561&view=rev Author: bradtofel Date: 2011-11-16 23:17:12 +0000 (Wed, 16 Nov 2011) Log Message: ----------- BUGFIX: if there were no headers, and no content, prevent Tomcat from making something up (transparently) by setting the Content-Length to 0 Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java 2011-11-16 22:19:49 UTC (rev 3560) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TransparentReplayRenderer.java 2011-11-16 23:17:12 UTC (rev 3561) @@ -76,8 +76,16 @@ // and copy the raw byte-stream. OutputStream os = httpResponse.getOutputStream(); byte[] buffer = new byte[BUFFER_SIZE]; + long total = 0; for (int r = -1; (r = resource.read(buffer, 0, BUFFER_SIZE)) != -1;) { os.write(buffer, 0, r); + total += r; } + if(total == 0) { + if(headers.size() == 0) { + // totally empty response + httpResponse.setContentLength(0); + } + } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-11-16 22:19:55
|
Revision: 3560 http://archive-access.svn.sourceforge.net/archive-access/?rev=3560&view=rev Author: bradtofel Date: 2011-11-16 22:19:49 +0000 (Wed, 16 Nov 2011) Log Message: ----------- INITIAL REV: drop in replacement for StaticMapExclusionFilter*, which is much more performant, and has better test coverage Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,86 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.util.Map; +import java.util.TreeSet; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.URIException; +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.surt.SURTTokenizer; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +public class StaticListExclusionFilter extends ExclusionFilter { + private static final Logger LOGGER = 
Logger.getLogger( + StaticMapExclusionFilter.class.getName()); + + private String lastChecked = null; + private boolean lastCheckedExcluded = false; + private boolean notifiedSeen = false; + private boolean notifiedPassed = false; + TreeSet<String> exclusions = null; + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + /** + * @param map where each String key is a SURT that is blocked. + */ + public StaticListExclusionFilter(TreeSet<String> exclusions, UrlCanonicalizer canonicalizer) { + this.exclusions = exclusions; + this.canonicalizer = canonicalizer; + } + + protected boolean isExcluded(String surt) { + String possiblePrefix = exclusions.floor(surt); + return (possiblePrefix != null && surt.startsWith(possiblePrefix)); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult) + */ + public int filterObject(CaptureSearchResult r) { + if(!notifiedSeen) { + if(filterGroup != null) { + filterGroup.setSawAdministrative(); + } + notifiedSeen = true; + } + String surt; + try { + String url = canonicalizer.urlStringToKey(r.getOriginalUrl()); + surt = SURT.fromPlain(url); +// surt = SURTTokenizer.prefixKey(url); + } catch (URIException e) { + + //e.printStackTrace(); + return FILTER_EXCLUDE; + } + if(lastChecked != null) { + if(lastChecked.equals(surt)) { + if(lastCheckedExcluded) { + return ObjectFilter.FILTER_EXCLUDE; + } else { + // don't need to: already did last time... 
+ //filterGroup.setPassedAdministrative(); + return ObjectFilter.FILTER_INCLUDE; + } + } + } + lastChecked = surt; + lastCheckedExcluded = isExcluded(surt); + if(lastCheckedExcluded) { + return ObjectFilter.FILTER_EXCLUDE; + } else { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedAdministrative(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,186 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.TreeSet; +import java.util.logging.Logger; + +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.accesscontrol.ExclusionFilterFactory; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.surt.SURTTokenizer; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +public class StaticListExclusionFilterFactory implements ExclusionFilterFactory { + private static final Logger LOGGER = + Logger.getLogger(StaticMapExclusionFilterFactory.class.getName()); + + private int checkInterval = 0; + private TreeSet<String> excludes = null; + private File file = null; + long lastUpdated = 0; + UrlCanonicalizer 
canonicalizer = new AggressiveUrlCanonicalizer(); + + /** + * Thread object of update thread -- also is flag indicating if the thread + * has already been started -- static, and access to it is synchronized. + */ + private static Thread updateThread = null; + + /** + * load exclusion file and startup polling thread to check for updates + * @throws IOException if the exclusion file could not be read. + */ + public void init() throws IOException { + reloadFile(); + if(checkInterval > 0) { + startUpdateThread(); + } + } + + protected void reloadFile() throws IOException { + long currentMod = file.lastModified(); + if(currentMod == lastUpdated) { + if(currentMod == 0) { + LOGGER.severe("No exclude file at " + file.getAbsolutePath()); + } + return; + } + LOGGER.info("Reloading exclusion file " + file.getAbsolutePath()); + try { + excludes = loadFile(file.getAbsolutePath()); + lastUpdated = currentMod; + LOGGER.info("Reload " + file.getAbsolutePath() + " OK"); + } catch(IOException e) { + lastUpdated = -1; + excludes = null; + e.printStackTrace(); + LOGGER.severe("Reload " + file.getAbsolutePath() + " FAILED:" + + e.getLocalizedMessage()); + } + } + protected TreeSet<String> loadFile(String path) throws IOException { + TreeSet<String> excludes = new TreeSet<String>(); + FlatFile ff = new FlatFile(path); + CloseableIterator<String> itr = ff.getSequentialIterator(); + while(itr.hasNext()) { + String line = (String) itr.next(); + line = line.trim(); + if(line.length() == 0) { + continue; + } + line = canonicalizer.urlStringToKey(line); + String surt = line.startsWith("(") ? line : SURT.fromPlain(line); +// SURTTokenizer.prefixKey(line); + LOGGER.fine("EXCLUSION-MAP: adding " + surt); + excludes.add(surt); + } + itr.close(); + return excludes; + } + + /** + * @return ObjectFilter which blocks CaptureSearchResults in the + * exclusion file. 
+ */ + public ExclusionFilter get() { + if(excludes == null) { + return null; + } + return new StaticListExclusionFilter(excludes, canonicalizer); + } + + private synchronized void startUpdateThread() { + if (updateThread != null) { + return; + } + updateThread = new CacheUpdaterThread(this,checkInterval); + updateThread.start(); + } + private synchronized void stopUpdateThread() { + if (updateThread == null) { + return; + } + updateThread.interrupt(); + } + + private class CacheUpdaterThread extends Thread { + /** + * object which merges CDX files with the BDBResourceIndex + */ + private StaticListExclusionFilterFactory service = null; + + private int runInterval; + + /** + * @param service ExclusionFactory which will be reloaded + * @param runInterval int number of seconds between reloads + */ + public CacheUpdaterThread(StaticListExclusionFilterFactory service, int runInterval) { + super("CacheUpdaterThread"); + super.setDaemon(true); + this.service = service; + this.runInterval = runInterval; + LOGGER.info("CacheUpdaterThread is alive."); + } + + public void run() { + int sleepInterval = runInterval; + while (true) { + try { + try { + service.reloadFile(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + Thread.sleep(sleepInterval * 1000); + } catch (InterruptedException e) { + e.printStackTrace(); + return; + } + } + } + } + + /** + * @return the checkInterval in seconds + */ + public int getCheckInterval() { + return checkInterval; + } + + /** + * @param checkInterval the checkInterval in seconds to set + */ + public void setCheckInterval(int checkInterval) { + this.checkInterval = checkInterval; + } + + /** + * @return the path + */ + public String getFile() { + return file.getAbsolutePath(); + } + + /** + * @param path the file to set + */ + public void setFile(String path) { + this.file = new File(path); + } + + /* (non-Javadoc) + * @see org.archive.wayback.accesscontrol.ExclusionFilterFactory#shutdown() + */ + 
public void shutdown() { + stopUpdateThread(); + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,164 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.TreeSet; + +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +import junit.framework.TestCase; + +public class StaticListExclusionFilterTest extends TestCase { + File tmpFile = null; + StaticListExclusionFilterFactory factory = null; + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + + protected void setUp() throws Exception { + super.setUp(); + factory = new StaticListExclusionFilterFactory(); + tmpFile = File.createTempFile("static-map", ".tmp"); +// Properties p = new Properties(); +// p.put("resourceindex.exclusionpath", tmpFile.getAbsolutePath()); +// factory.init(p); + } + + /* + * @see TestCase#tearDown() + */ + protected void tearDown() throws Exception { + super.tearDown(); + if(tmpFile != null && tmpFile.exists()) { + tmpFile.delete(); + } + } + + /** + * @throws Exception + */ + public void testRealWorld() throws Exception { + String bases[] = { "pho-c.co.jp/~clever", + "sf.net/pop/Roger", + "www.eva-stu.vn", + "mins.com.br/", + "24.ne.jp", + "24.ne.jp/~nekko"}; +// 
setTmpContents(bases); + + + ObjectFilter<CaptureSearchResult> filter = getFilter(bases); + assertFalse("unmassaged",isBlocked(filter,"24.ne.jp.idpnt.com/robots.txt")); + assertTrue("massage",isBlocked(filter,"http://24.ne.jp:80/")); + assertTrue("unmassaged",isBlocked(filter,"http://www.pho-c.co.jp/~clever")); + assertTrue("massage",isBlocked(filter,"http://24.ne.jp")); + + + assertTrue("unmassaged",isBlocked(filter,"http://www.pho-c.co.jp/~clever")); + assertTrue("massaged",isBlocked(filter,"http://pho-c.co.jp/~clever")); + assertTrue("trailing-slash",isBlocked(filter,"http://pho-c.co.jp/~clever/")); + assertTrue("subpath",isBlocked(filter,"http://pho-c.co.jp/~clever/foo.txt")); + + assertTrue("full-port",isBlocked(filter,"http://www.mins.com.br:80")); + assertTrue("tail-slash-port",isBlocked(filter,"http://www.mins.com.br:80/")); + assertTrue("full",isBlocked(filter,"http://www.mins.com.br")); + assertTrue("tail-slash",isBlocked(filter,"http://www.mins.com.br/")); + assertTrue("full-massage",isBlocked(filter,"http://mins.com.br")); + assertTrue("tail-slash-massage",isBlocked(filter,"http://mins.com.br/")); + assertTrue("massage",isBlocked(filter,"http://mins.com.br/foo.txt")); + assertTrue("subpath",isBlocked(filter,"http://www13.mins.com.br/~clever/foo.txt")); + + assertTrue("massage",isBlocked(filter,"24.ne.jp")); + assertTrue("full",isBlocked(filter,"http://www.mins.com.br")); + assertTrue("subpath",isBlocked(filter,"www.24.ne.jp")); + assertTrue("tail-slash-massage",isBlocked(filter,"http://mins.com.br/")); + assertTrue("subpath",isBlocked(filter,"http://www.24.ne.jp:80/")); + + + + + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger//")); + assertFalse(isBlocked(filter,"http://sf.net/pop/")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/2")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/23")); + 
assertTrue(isBlocked(filter,"http://www.sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://www1.sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://www23.sf.net/pop/Roger")); + + assertTrue(isBlocked(filter,"http://www23.eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://www23.eva-stu.vn")); + assertTrue(isBlocked(filter,"http://eva-stu.vn")); + assertTrue(isBlocked(filter,"http://www.eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://www.eva-stu.vn/foo.txt")); + assertTrue(isBlocked(filter,"http://www2.eva-stu.vn/foo/bar.txt")); + assertTrue(isBlocked(filter,"http://eva-stu.vn/foo/bar.txt")); + + } + + + /** + * @throws Exception + */ + public void testBaseNoPrefix() throws Exception { + + String str = "http://peagreenboat.com/"; +// String str = "http://(com,peagreenboat"; + System.out.format("(%s) -> [%s]\n", str,SURT.prefixFromPlain(str)); + + + String bases[] = {"http://www.peagreenboat.com/", + "http://peagreenboat.com/"}; +// setTmpContents(bases); + ObjectFilter<CaptureSearchResult> filter = getFilter(bases); + assertTrue("unmassaged",isBlocked(filter,"http://www.peagreenboat.com")); + assertTrue("unmassaged",isBlocked(filter,"http://peagreenboat.com")); + assertFalse("other1",isBlocked(filter,"http://peagreenboatt.com")); + assertFalse("other2",isBlocked(filter,"http://peagreenboat.org")); + assertFalse("other3",isBlocked(filter,"http://www.peagreenboat.org")); + // there is a problem with the SURTTokenizer... deal with ports! 
+// assertFalse("other4",isBlocked(filter,"http://www.peagreenboat.com:8080")); + assertTrue("subpath",isBlocked(filter,"http://www.peagreenboat.com/foo")); + assertTrue("emptypath",isBlocked(filter,"http://www.peagreenboat.com/")); + } + + private boolean isBlocked(ObjectFilter<CaptureSearchResult> filter, String url) { + CaptureSearchResult result = new CaptureSearchResult(); + result.setOriginalUrl(url); + int filterResult = filter.filterObject(result); + if(filterResult == ObjectFilter.FILTER_EXCLUDE) { + return true; + } + return false; + } + + private ObjectFilter<CaptureSearchResult> getFilter(String lines[]) + throws IOException { + + setTmpContents(lines); + TreeSet<String> excludes = factory.loadFile(tmpFile.getAbsolutePath()); + return new StaticListExclusionFilter(excludes,canonicalizer); + } + + private void setTmpContents(String[] lines) throws IOException { + if(tmpFile != null && tmpFile.exists()) { + tmpFile.delete(); + } +// tmpFile = File.createTempFile("range-map","tmp"); + FileWriter writer = new FileWriter(tmpFile); + StringBuilder sb = new StringBuilder(); + for(int i=0; i<lines.length; i++) { + sb.append(lines[i]).append("\n"); + } + String contents = sb.toString(); + writer.write(contents); + writer.close(); + //factory.reloadFile(); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-11-16 22:18:05
|
Revision: 3559 http://archive-access.svn.sourceforge.net/archive-access/?rev=3559&view=rev Author: bradtofel Date: 2011-11-16 22:17:57 +0000 (Wed, 16 Nov 2011) Log Message: ----------- INITIAL REV: not fully tested but much improved robots.txt handling. Uses copy of current H3 robots handling - allows + disallow, more robust parsing, cleaner separation of responsibility to clean up the code Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,11 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +public class FixedRobotsDirectives extends RobotsDirectives { + private boolean result; + public FixedRobotsDirectives(boolean result) { + this.result = 
result; + } + public boolean allows(String path) { + return result; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,164 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.LiveDocumentNotAvailableException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; +import org.archive.wayback.liveweb.LiveWebCache; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.webapp.PerformanceLogger; + +public class HRobotExclusionFilter extends ExclusionFilter { + + private final static String ROBOT_SUFFIX = "/robots.txt"; + private final static Logger LOGGER = + Logger.getLogger(HRobotExclusionFilter.class.getName()); + + // TODO: this is not the right thing! 
+ private Charset cs = Charset.forName("UTF-8"); + + private RobotsDirectiveAggregation aggregation = null; + private LiveWebCache webCache = null; + + private String userAgent = null; + private boolean notifiedSeen = false; + private boolean notifiedPassed = false; + private static final FixedRobotsDirectives ALLOW_ROBOT_DIRECTIVE = + new FixedRobotsDirectives(true); + + /** + * Construct a new HRobotExclusionFilter that uses webCache to pull + * robots.txt documents. filtering is based on userAgent, and cached + * documents newer than maxCacheMS in the webCache are considered valid. + * + * @param webCache LiveWebCache from which documents can be retrieved + * @param userAgent String user agent to use for requests to the live web. + * @param maxCacheMS long number of milliseconds to cache documents in the + * LiveWebCache + */ + public HRobotExclusionFilter(LiveWebCache webCache, String userAgent, + long maxCacheMS) { + aggregation = new RobotsDirectiveAggregation(); + this.webCache = webCache; + this.userAgent = userAgent; + } + + private void updateAggregation(String host) + throws LiveWebCacheUnavailableException, + LiveWebTimeoutException, MalformedURLException, IOException { + + List<String> missing = aggregation.getMissingRobotUrls(host); + for(String robotUrl : missing) { + long start = System.currentTimeMillis(); + Resource resource; + try { + resource = webCache.getCachedResource(new URL(robotUrl), + 0,true); + if(resource.getStatusCode() != 200) { + LOGGER.info("ROBOT: Non200("+robotUrl+")"); + // consider it an allow: + aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE); + } else { + InputStreamReader isr = new InputStreamReader(resource, cs); + BufferedReader br = new BufferedReader(isr); + Robotstxt robotsTxt = new Robotstxt(br); + RobotsDirectives directives = robotsTxt.getDirectivesFor(userAgent); + aggregation.addDirectives(robotUrl, directives); + } + } catch (LiveDocumentNotAvailableException e) { + if(LOGGER.isLoggable(Level.INFO)) { + 
LOGGER.info("ROBOT: LiveDocumentNotAvailableException(" + + robotUrl + ")"); + } + // consider it an allow: + aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE); + } + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("RobotRequest", elapsed, robotUrl); + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult) + */ + public int filterObject(CaptureSearchResult r) { + if(!notifiedSeen) { + if(filterGroup != null) { + filterGroup.setSawRobots(); + } + notifiedSeen = true; + } + String originalURL = r.getOriginalUrl(); + String path = UrlOperations.getURLPath(originalURL); + if(path.equals(ROBOT_SUFFIX)) { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + String host = UrlOperations.urlToHost(originalURL); + boolean updated = false; + try { + updateAggregation(host); + if(!aggregation.isBlocked(path)) { + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")"); + } + if(LOGGER.isLoggable(Level.FINE)) { + LOGGER.finer("ROBOT: ALLOWED(" + originalURL + ")"); + } + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + +// } catch (LiveDocumentNotAvailableException e) { + } catch (LiveWebCacheUnavailableException e) { + LOGGER.severe("ROBOT: LiveWebCacheUnavailableException(" + + originalURL + ")"); + filterGroup.setLiveWebGone(); + + } catch (LiveWebTimeoutException e) { + LOGGER.severe("ROBOT: LiveDocumentTimedOutException(" + + originalURL + ")"); + filterGroup.setRobotTimedOut(); + + } catch (MalformedURLException e) { + + LOGGER.warning("ROBOT: MalformedURLException(" + + originalURL + ")"); + + } catch (IOException e) { + e.printStackTrace(); + return ObjectFilter.FILTER_EXCLUDE; + } + + 
if(filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone()) { + return ObjectFilter.FILTER_ABORT; + } + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")"); + } + return ObjectFilter.FILTER_EXCLUDE; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,111 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Class which acts as an aggregation of RobotsDirectives. + * + * If given a host String, will return a list of additional robot URLs that + * need to be added to the current aggregation. + * + * Allows a user to then add new RobotsDirectives for one or more robot URLs. + * + * Finally, allows the aggregation to be queried to see if any of the + * directives block a particular path. 
+ * + * + * @author brad + * + */ +public class RobotsDirectiveAggregation { + private final static Logger LOGGER = + Logger.getLogger(RobotsDirectiveAggregation.class.getName()); + + private final static String HTTP_PREFIX = "http://"; + private final static String ROBOT_SUFFIX = "/robots.txt"; + + private static String WWWN_REGEX = "^www[0-9]+\\."; + private final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX); + + private HashMap<String,RobotsDirectives> cache = + new HashMap<String, RobotsDirectives>(); + + private StringBuilder sb = new StringBuilder(); + + private String hostToRobotUrlString(final String host) { + sb.setLength(0); + sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX); + String robotUrl = sb.toString(); + LOGGER.fine("Adding robot URL:" + robotUrl); + return robotUrl; + } + /* + */ + /** + * @param resultHost + * @return a List of all robots.txt urls to attempt for this HOST: + * If HOST starts with "www.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://DOMAIN/robots.txt + * ] + * If HOST starts with "www[0-9]+.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://www.DOMAIN/robots.txt, + * http://DOMAIN/robots.txt + * ] + * Otherwise: + * [ + * http://HOST/robots.txt, + * http://www.HOST/robots.txt + * ] + */ + List<String> hostToRobotUrlStrings(final String resultHost) { + ArrayList<String> list = new ArrayList<String>(); + list.add(hostToRobotUrlString(resultHost)); + + if(resultHost.startsWith("www")) { + if(resultHost.startsWith("www.")) { + list.add(hostToRobotUrlString(resultHost.substring(4))); + } else { + Matcher m = WWWN_PATTERN.matcher(resultHost); + if(m.find()) { + String massagedHost = resultHost.substring(m.end()); + list.add(hostToRobotUrlString("www." + massagedHost)); + list.add(hostToRobotUrlString(massagedHost)); + } + } + } else { + list.add(hostToRobotUrlString("www." 
+ resultHost)); + } + return list; + } + + public List<String> getMissingRobotUrls(String host) { + ArrayList<String> missing = new ArrayList<String>(); + List<String> needed = hostToRobotUrlStrings(host); + for(String need : needed) { + if(!cache.containsKey(need)) { + missing.add(need); + } + } + return missing; + } + public void addDirectives(String url, RobotsDirectives directives) { + cache.put(url, directives); + } + public boolean isBlocked(String path) { + for(RobotsDirectives directives : cache.values()) { + if(!directives.allows(path)) { + return true; + } + } + return false; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,75 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.Serializable; +import java.util.concurrent.ConcurrentSkipListSet; + +/** + * Represents the directives that apply to a user-agent (or set of + * user-agents) + */ +public class RobotsDirectives implements Serializable { + private static final long serialVersionUID = 5386542759286155383L; + + ConcurrentSkipListSet<String> disallows = new ConcurrentSkipListSet<String>(); + ConcurrentSkipListSet<String> allows = new ConcurrentSkipListSet<String>(); + float crawlDelay = -1; + + public boolean allows(String path) { + return !(longestPrefixLength(disallows, path) > longestPrefixLength(allows, path)); + } + + /** + * @param prefixSet + * @param str + * @return length of longest entry in {@code prefixSet} that prefixes {@code str}, or zero + * if no entry prefixes {@code str} + */ + protected int longestPrefixLength(ConcurrentSkipListSet<String> prefixSet, + String str) { + String possiblePrefix = prefixSet.floor(str); + if (possiblePrefix != null && str.startsWith(possiblePrefix)) { + return possiblePrefix.length(); + } else { + return 0; + } + } + + public void addDisallow(String path) { + if(path.length()==0) { + // ignore empty-string disallows + // (they really mean allow, when alone) + return; + } + disallows.add(path); + } + + public void addAllow(String path) { + allows.add(path); + } + + public void setCrawlDelay(float i) { + crawlDelay=i; + } + + public float getCrawlDelay() { + return crawlDelay; + } +} \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java 
2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,234 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Serializable; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; +import org.archive.io.ReadSource; + +/** + * Utility class for parsing and representing 'robots.txt' format + * directives, into a list of named user-agents and map from user-agents + * to RobotsDirectives. 
+ */ +public class Robotstxt implements Serializable { + static final long serialVersionUID = 7025386509301303890L; + private static final Logger logger = + Logger.getLogger(Robotstxt.class.getName()); + + // all user agents contained in this robots.txt + // in order of declaration + // TODO: consider discarding irrelevant entries + LinkedList<String> namedUserAgents = new LinkedList<String>(); + // map user-agents to directives + Map<String,RobotsDirectives> agentsToDirectives = + new HashMap<String,RobotsDirectives>(); + RobotsDirectives wildcardDirectives = null; + + boolean hasErrors = false; + + static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives(); + /** empty, reusable instance for all sites providing no rules */ + public static Robotstxt NO_ROBOTS = new Robotstxt(); + + public Robotstxt() { + } + + public Robotstxt(BufferedReader reader) throws IOException { + initializeFromReader(reader); + } + + public Robotstxt(ReadSource customRobots) { + BufferedReader reader = new BufferedReader(customRobots.obtainReader()); + try { + initializeFromReader(reader); + } catch (IOException e) { + logger.log(Level.SEVERE, + "robots ReadSource problem: potential for inadvertent overcrawling", + e); + } finally { + IOUtils.closeQuietly(reader); + } + } + + protected void initializeFromReader(BufferedReader reader) throws IOException { + String read; + // current is the disallowed paths for the preceding User-Agent(s) + RobotsDirectives current = null; + // whether a non-'User-Agent' directive has been encountered + boolean hasDirectivesYet = false; + while (reader != null) { + do { + read = reader.readLine(); + // Skip comments & blanks + } while ((read != null) && ((read = read.trim()).startsWith("#") || + read.length() == 0)); + if (read == null) { + reader.close(); + reader = null; + } else { + // remove any html markup + read = read.replaceAll("<[^>]+>",""); + int commentIndex = read.indexOf("#"); + if (commentIndex > -1) { + // Strip trailing comment + read = 
read.substring(0, commentIndex); + } + read = read.trim(); + if (read.matches("(?i)^User-agent:.*")) { + String ua = read.substring(11).trim().toLowerCase(); + if (current == null || hasDirectivesYet ) { + // only create new rules-list if necessary + // otherwise share with previous user-agent + current = new RobotsDirectives(); + hasDirectivesYet = false; + } + if (ua.equals("*")) { + wildcardDirectives = current; + } else { + namedUserAgents.addLast(ua); + agentsToDirectives.put(ua, current); + } + continue; + } + if (read.matches("(?i)Disallow:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + String path = read.substring(9).trim(); + // tolerate common error of ending path with '*' character + // (not allowed by original spec; redundant but harmless with + // Google's wildcarding extensions -- which we don't yet fully + // support). + if(path.endsWith("*")) { + path = path.substring(0,path.length()-1); + } + current.addDisallow(path); + hasDirectivesYet = true; + continue; + } + if (read.matches("(?i)Crawl-delay:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + // consider a crawl-delay, even though we don't + // yet understand it, as sufficient to end a + // grouping of User-Agent lines + hasDirectivesYet = true; + String val = read.substring(12).trim(); + val = val.split("[^\\d\\.]+")[0]; + try { + current.setCrawlDelay(Float.parseFloat(val)); + } catch (NumberFormatException nfe) { + // ignore + } + continue; + } + if (read.matches("(?i)Allow:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + String path = read.substring(6).trim(); + // tolerate common error of ending path with '*' character + // (not allowed by original spec; redundant but harmless with + // Google's wildcarding extensions -- which we don't yet fully + // support). 
+ if(path.endsWith("*")) { + path = path.substring(0,path.length()-1); + } + current.addAllow(path); + hasDirectivesYet = true; + continue; + } + // unknown line; do nothing for now + } + } + } + + /** + * Does this policy effectively allow everything? (No + * disallows or timing (crawl-delay) directives?) + * @return + */ + public boolean allowsAll() { + // TODO: refine so directives that are all empty are also + // recognized as allowing all + return agentsToDirectives.isEmpty(); + } + + public List<String> getNamedUserAgents() { + return namedUserAgents; + } + + /** + * Return the RobotsDirectives, if any, appropriate for the given User-Agent + * string. If useFallbacks is true, a wildcard ('*') directives or the default + * of NO_DIRECTIVES will be returned, as appropriate, if there is no better + * match. If useFallbacks is false, a null will be returned if no declared + * directives targeted the given User-Agent. + * + * @param ua String User-Agent to lookup + * @param useFallbacks if true, fall-back to wildcard directives or + * default allow as needed + * @return directives to use, or null if useFallbacks is false and no + * non-wildcard directives match the supplied User-Agent + */ + public RobotsDirectives getDirectivesFor(String ua, boolean useFallbacks) { + // find matching ua + for(String uaListed : namedUserAgents) { + if(ua.indexOf(uaListed)>-1) { + return agentsToDirectives.get(uaListed); + } + } + if(useFallbacks==false) { + return null; + } + if (wildcardDirectives!=null) { + return wildcardDirectives; + } + // no applicable user-agents, so empty directives + return NO_DIRECTIVES; + } + + /** + * Return directives to use for the given User-Agent, resorting to wildcard + * rules or the default no-directives if necessary. 
+ * + * @param userAgent String User-Agent to lookup + * @return directives to use + */ + public RobotsDirectives getDirectivesFor(String userAgent) { + return getDirectivesFor(userAgent, true); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,109 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang.StringEscapeUtils; + +import com.google.common.base.Strings; +import com.google.common.collect.Lists; + +import junit.framework.TestCase; + +public class RobotsDirectiveAggregationTest extends TestCase { + + private String[] mapRobotUrls(String[] in ) { + String res[] = new String[in.length]; + for(int i = 0; i < in.length; i++) { + res[i] = "http://" + in[i] + "/robots.txt"; + } + return res; + } + + + /** + * + */ + public void testHostToRobotUrlStrings() { + RobotsDirectiveAggregation f = new RobotsDirectiveAggregation(); + String test1[] = {"www.foo.com","foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www.foo.com"),mapRobotUrls(test1)); + + String test2[] = {"foo.com","www.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("foo.com"),mapRobotUrls(test2)); + + String test3[] = {"fool.foo.com","www.fool.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("fool.foo.com"),mapRobotUrls(test3)); + + String test4[] = {"www4.foo.com","www.foo.com","foo.com"}; + 
compareListTo(f.hostToRobotUrlStrings("www4.foo.com"),mapRobotUrls(test4)); + + String test5[] = {"www4w.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www4w.foo.com"),mapRobotUrls(test5)); + + String test6[] = {"www.www.foo.com","www.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www.www.foo.com"),mapRobotUrls(test6)); + } + private String strJoin(Iterable<String> i, char del) { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for(String s : i) { + if(first) { + first = false; + } else { + sb.append(del); + } + sb.append(s); + } + return sb.toString(); + } + private List<String> sortA(String[] a) { + Arrays.sort(a); + return Lists.newArrayList(a); + } + private List<String> sortL(List<String> a) { + String[] Empty = new String[0]; + String[] tmp; + tmp = a.toArray(Empty); + Arrays.sort(tmp); + return Lists.newArrayList(tmp); + } + private void compareListTo(List<String> list, String strings[]) { + + boolean match = list.size() == strings.length; + List<String> ls = sortL(list); + List<String> ss = sortA(strings); + if(match) { + for(int i = 0; i < strings.length; i++) { + if(!ls.get(i).equals(ss.get(i))) { + match = false; + break; + } + } + } + if(!match) { + String a1 = strJoin(ls,','); + String a2 = strJoin(ss,','); + String msg = String.format("ArrayCMP (%s) != (%s)",a1,a2); + assertTrue(msg,false); + } + } + + public void testInteraction() { + RobotsDirectiveAggregation agg = new RobotsDirectiveAggregation(); + String test1[] = {"http://foo.com/robots.txt","http://www.foo.com/robots.txt"}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test1); + compareListTo(agg.getMissingRobotUrls("www.foo.com"),test1); + agg.addDirectives("http://foo.com/robots.txt", new FixedRobotsDirectives(true)); + String test2[] = {"http://www.foo.com/robots.txt"}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test2); + assertFalse(agg.isBlocked("/foo")); + + agg.addDirectives("http://www.foo.com/robots.txt", new FixedRobotsDirectives(false)); + 
String test3[] = {}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test3); + assertTrue(agg.isBlocked("/foo")); + + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-11-16 22:10:44
|
Revision: 3558 http://archive-access.svn.sourceforge.net/archive-access/?rev=3558&view=rev Author: bradtofel Date: 2011-11-16 22:10:38 +0000 (Wed, 16 Nov 2011) Log Message: ----------- FEATURE: Now optionally allows configuration of a set of Spring files to be monitored by a background Thread, which will re-load and swap a new configuration into place if one of the files in the monitored set changes. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java 2011-11-16 22:09:06 UTC (rev 3557) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java 2011-11-16 22:10:38 UTC (rev 3558) @@ -23,6 +23,8 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.logging.LogManager; import java.util.logging.Logger; @@ -36,6 +38,8 @@ import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; +import org.archive.wayback.util.MonitoredFileSet; + /** * Top-Level integration point between a series of RequestHandler mappings and a * generic ServletContext. 
This filter is assumed to be responsible for matching @@ -46,19 +50,25 @@ * @author brad */ public class RequestFilter implements Filter { - private static final Logger LOGGER = Logger.getLogger(RequestFilter.class - .getName()); - private RequestMapper mapper = null; + private static final Logger LOGGER = + Logger.getLogger(RequestFilter.class.getName()); + private final static String CONFIG_PATH = "config-path"; private final static String LOGGING_CONFIG_PATH = "logging-config-path"; + private final static String MONITOR_MS_CONFIG = "monitor-ms"; + private final static String MONITOR_FILES_CONFIG = "monitor-files"; + private UpdateThread thread = null; + private RequestMapper mapper = null; + private ServletContext context; + private String springConfigPath; + public void init(FilterConfig config) throws ServletException { - ServletContext servletContext = config.getServletContext(); + context = config.getServletContext(); - String logConfigPath = servletContext - .getInitParameter(LOGGING_CONFIG_PATH); + String logConfigPath = context.getInitParameter(LOGGING_CONFIG_PATH); if (logConfigPath != null) { - String resolvedLogPath = servletContext.getRealPath(logConfigPath); + String resolvedLogPath = context.getRealPath(logConfigPath); File logConfigFile = new File(resolvedLogPath); if (logConfigFile.exists()) { FileInputStream finp = null; @@ -83,21 +93,46 @@ } } - String configPath = servletContext.getInitParameter(CONFIG_PATH); + String configPath = context.getInitParameter(CONFIG_PATH); if (configPath == null) { throw new ServletException("Missing " + CONFIG_PATH + " parameter"); } - String resolvedPath = servletContext.getRealPath(configPath); + springConfigPath = context.getRealPath(configPath); - LOGGER.info("Initializing Spring config at: " + resolvedPath); - mapper = SpringReader.readSpringConfig(resolvedPath, servletContext); - LOGGER.info("Initialized Spring config at: " + resolvedPath); - } + String monitorFiles = 
context.getInitParameter(MONITOR_FILES_CONFIG); + if(monitorFiles == null) { + // just load once: + mapper = loadRequestMapper(); + } else { - public void destroy() { - LOGGER.info("Shutdown starting."); - mapper.shutdown(); - LOGGER.info("Shutdown complete."); + // we're in fancy mode: start the background thread to watch + // our Spring config - it will swap out our mapper when things + // change + + String monitorMSString = context.getInitParameter(MONITOR_MS_CONFIG); + long monitorMS = 10000; + if(monitorMSString != null) { + try { + monitorMS = Long.parseLong(monitorMSString); + } catch(NumberFormatException e) { + throw new ServletException("Non int for " + MONITOR_MS_CONFIG); + } + } + String[] monitored = monitorFiles.split(","); + + ArrayList<String> monitoredL = new ArrayList<String>(); + for(String monitoredPath : monitored) { + monitoredL.add(monitoredPath); + } + thread = new UpdateThread(this, monitorMS, monitoredL); + + // TODO: should we force initial load of a mapper? + // it means incoming requests will block until we're ready.. 
+ // if we don't the thread will immediately being loading + // the Spring config, and will swap it in when it's ready + thread.reloadMapper(); + thread.start(); + } } public void doFilter(ServletRequest request, ServletResponse response, @@ -107,8 +142,12 @@ try { if (request instanceof HttpServletRequest) { if (response instanceof HttpServletResponse) { - handled = mapper.handleRequest((HttpServletRequest) request, - (HttpServletResponse) response); + if(mapper != null) { + handled = mapper.handleRequest( + (HttpServletRequest) request, + (HttpServletResponse) response); + + } } } } finally { @@ -118,4 +157,112 @@ chain.doFilter(request, response); } } + + public void destroy() { + LOGGER.info("Shutdown starting."); + if(thread != null) { + thread.interrupt(); + } + if(mapper != null) { + mapper.shutdown(); + } + LOGGER.info("Shutdown complete."); + } + + + private RequestMapper loadRequestMapper() { + LOGGER.info("Initializing Spring config at: " + springConfigPath); + RequestMapper newMapper = SpringReader.readSpringConfig(springConfigPath, context); + LOGGER.info("Initialized Spring config at: " + springConfigPath); + return newMapper; + } + + /** + * @return the mapper + */ + public RequestMapper getMapper() { + return mapper; + } + + /** + * @param mapper the mapper to set + */ + public void setMapper(RequestMapper mapper) { + this.mapper = mapper; + } + + /** + * Thread that repeatedly checks a set of Spring config files. If any + * change, then a new RequestMapper is created from them, which is then + * swapped in on the containing RequestFilter. The old one if present is + * shut down. 
+ * + * @author Brad Tofel + */ + private class UpdateThread extends Thread { + /** + * object which merges CDX files with the BDBResourceIndex + */ + private RequestFilter filter = null; + private long runInterval; + + private MonitoredFileSet fileSet; + private MonitoredFileSet.FileState activeState; + + /** + * @param filter the RequestFilter we will update + * @param runInterval number of MS bewtween checks + * @param monitored List of files to check Mod Time to trigger reload + */ + public UpdateThread(RequestFilter filter, + long runInterval, List<String> monitored) { + + super("RequestFilter.UpdateThread"); + super.setDaemon(true); + this.filter = filter; + this.runInterval = runInterval; + + fileSet = new MonitoredFileSet(monitored); + activeState = null; + } + + public void reloadMapper() { + + MonitoredFileSet.FileState startState = fileSet.getFileState(); + + RequestMapper mapper = filter.loadRequestMapper(); + + if(fileSet.isChanged(startState)) { + // erk.. files changed during the operation.. update nothing.. + LOGGER.warning("Files changed during Spring reload... discarding.."); + mapper.shutdown(); + + } else { + LOGGER.warning("Loaded RequestMapper."); + RequestMapper oldMapper = filter.getMapper(); + filter.setMapper(mapper); + if(oldMapper != null) { + // shut it down (cross fingers first) + LOGGER.warning("Shutting Down old RequestMapper."); + oldMapper.shutdown(); + } + activeState = startState; + } + } + + public void run() { + LOGGER.info("RequestFilter.UpdateThread is alive."); + while (true) { + try { + if((activeState == null) || fileSet.isChanged(activeState)) { + reloadMapper(); + } + sleep(runInterval); + } catch (InterruptedException e) { + LOGGER.info("Shutting Down."); + return; + } + } + } + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-11-16 22:09:12
|
Revision: 3557 http://archive-access.svn.sourceforge.net/archive-access/?rev=3557&view=rev Author: bradtofel Date: 2011-11-16 22:09:06 +0000 (Wed, 16 Nov 2011) Log Message: ----------- INITIAL REV: class which monitors a set of files, and indicates when a file in the group has changed Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/MonitoredFileSet.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/MonitoredFileSetTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/MonitoredFileSet.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/MonitoredFileSet.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/MonitoredFileSet.java 2011-11-16 22:09:06 UTC (rev 3557) @@ -0,0 +1,50 @@ +package org.archive.wayback.util; + +import java.io.File; +import java.util.Date; +import java.util.HashMap; +import java.util.List; + +public class MonitoredFileSet { + List<String> files; + + public MonitoredFileSet(List<String> files) { + this.files = files; + } + public boolean isChanged(FileState fileState) { + FileState currentFileState = getFileState(); + return currentFileState.isChanged(fileState); + } + public FileState getFileState() { + FileState fileState = new FileState(); + + for(String path : files) { + File file = new File(path); + if(file.isFile()) { + fileState.put(path, new Date(file.lastModified())); + } else { + fileState.put(path, null); + } + } + return fileState; + } + + public class FileState extends HashMap<String,Date> { + public boolean isChanged(FileState other) { + for(String path : keySet()) { + if(other.containsKey(path)) { + Date otherDate = other.get(path); + Date thisDate = get(path); + if((otherDate == null) && (thisDate == 
null)) { + // treat both missing as the same.. + continue; + } + if(!otherDate.equals(thisDate)) { + return true; + } + } + } + return false; + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/MonitoredFileSetTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/MonitoredFileSetTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/MonitoredFileSetTest.java 2011-11-16 22:09:06 UTC (rev 3557) @@ -0,0 +1,43 @@ +package org.archive.wayback.util; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; + +import junit.framework.TestCase; + +public class MonitoredFileSetTest extends TestCase { + + public void testIsChanged() throws IOException, InterruptedException { + File f1 = new File("/tmp/file-set-1.tmp"); + File f2 = new File("/tmp/file-set-2.tmp"); + writeFile(f1,"one"); + writeFile(f2,"two"); + ArrayList<String> l = new ArrayList<String>(); + l.add(f1.getAbsolutePath()); + l.add(f2.getAbsolutePath()); + + MonitoredFileSet fs = new MonitoredFileSet(l); + MonitoredFileSet.FileState s1 = fs.getFileState(); + MonitoredFileSet.FileState s2 = fs.getFileState(); + assertFalse(fs.isChanged(s1)); + assertFalse(fs.isChanged(s2)); + Thread.sleep(1001); + writeFile(f2,"two2"); + MonitoredFileSet.FileState s3 = fs.getFileState(); + assertTrue(fs.isChanged(s2)); + assertTrue(s3.isChanged(s2)); + Thread.sleep(1001); + assertTrue(fs.isChanged(s2)); + assertFalse(fs.isChanged(s3)); + } + private void writeFile(File f, String stuff) throws IOException { + if(f.exists()) { + f.delete(); + } + FileOutputStream fos = new FileOutputStream(f,false); + fos.write(stuff.getBytes()); + fos.close(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open 
Source development site. |
From: <bra...@us...> - 2011-11-16 22:07:04
|
Revision: 3556 http://archive-access.svn.sourceforge.net/archive-access/?rev=3556&view=rev Author: bradtofel Date: 2011-11-16 22:06:58 +0000 (Wed, 16 Nov 2011) Log Message: ----------- INTERFACE: making private method public Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/graph/GraphEncoder.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/graph/GraphEncoder.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/graph/GraphEncoder.java 2011-11-16 22:02:54 UTC (rev 3555) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/graph/GraphEncoder.java 2011-11-16 22:06:58 UTC (rev 3556) @@ -132,7 +132,7 @@ return sb.toString(); } - private static String encodeHex(int values[]) { + public static String encodeHex(int values[]) { StringBuilder sb = new StringBuilder(values.length); for(int value : values) { if((value > 15) || (value < 0)){ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3555 http://archive-access.svn.sourceforge.net/archive-access/?rev=3555&view=rev Author: bradtofel Date: 2011-11-16 22:02:54 +0000 (Wed, 16 Nov 2011) Log Message: ----------- JAVADOC Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2011-11-16 22:02:07 UTC (rev 3554) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2011-11-16 22:02:54 UTC (rev 3555) @@ -25,6 +25,11 @@ import org.archive.wayback.util.CloseableIterator; /** + * Iterator<String> decorator, which assumes the decorated is in SORTED order. + * This iterator will discard all elements in the iterator LESS than prefix + * constructor argument, return all elements STARTING with prefix, and stop + * iterating when a record is GREATER than prefix. + * * @author brad * */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3554 http://archive-access.svn.sourceforge.net/archive-access/?rev=3554&view=rev Author: bradtofel Date: 2011-11-16 22:02:07 +0000 (Wed, 16 Nov 2011) Log Message: ----------- OPTIMIZ: adding hard-stop end-date filter, to help speed up completion when scanning a single SearchResultSource holding multiple logical collections Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2011-10-31 20:49:36 UTC (rev 3553) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2011-11-16 22:02:07 UTC (rev 3554) @@ -31,6 +31,7 @@ import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.resourceindex.filters.DateRangeFilter; +import org.archive.wayback.resourceindex.filters.EndDateFilter; import org.archive.wayback.resourceindex.filters.HostMatchFilter; import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; @@ -70,6 +71,15 @@ throw new BadQueryException("Bad request URL(" + request.getRequestUrl() +")"); } + // Date-Filters: + startDate = request.getStartTimestamp(); + if(startDate == null) { + startDate = Timestamp.earliestTimestamp().getDateStr(); + } + endDate = request.getEndTimestamp(); + if(endDate == null) { + endDate = Timestamp.latestTimestamp().getDateStr(); + } if(request.isReplayRequest()) { exactDate = request.getReplayTimestamp(); 
if(exactDate == null) { @@ -90,20 +100,17 @@ } else if(request.isCaptureQueryRequest()) { chain.addFilter(new UrlMatchFilter(keyUrl)); + // OPTIMIZ: EndDateFilter is a hard stop: ABORT + // DateRangeFilter is an INCLUDE/EXCLUDE + // one class which EXCLUDEs before startDate, and ABORTs + // after endDate would save a compare.. + chain.addFilter(new EndDateFilter(endDate)); + chain.addFilter(new DateRangeFilter(startDate, endDate)); } else if(request.isUrlQueryRequest()) { chain.addFilter(new UrlPrefixMatchFilter(keyUrl)); + chain.addFilter(new DateRangeFilter(startDate, endDate)); } - // Date-Filters: - startDate = request.getStartTimestamp(); - if(startDate == null) { - startDate = Timestamp.earliestTimestamp().getDateStr(); - } - endDate = request.getEndTimestamp(); - if(endDate == null) { - endDate = Timestamp.latestTimestamp().getDateStr(); - } - chain.addFilter(new DateRangeFilter(startDate, endDate)); // Other Filters: if(request.isExactHost()) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2011-10-31 20:54:50
|
Wayback-1 - Build # 85 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/85/ to view the results. |
From: <bra...@us...> - 2011-10-31 20:49:43
|
Revision: 3553 http://archive-access.svn.sourceforge.net/archive-access/?rev=3553&view=rev Author: bradtofel Date: 2011-10-31 20:49:36 +0000 (Mon, 31 Oct 2011) Log Message: ----------- VERSION: adding magic -SNAPSHOT Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/pom.xml trunk/archive-access/projects/wayback/pom.xml trunk/archive-access/projects/wayback/wayback-core/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml trunk/archive-access/projects/wayback/wayback-webapp/pom.xml Modified: trunk/archive-access/projects/wayback/dist/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/pom.xml 2011-10-26 16:15:53 UTC (rev 3552) +++ trunk/archive-access/projects/wayback/dist/pom.xml 2011-10-31 20:49:36 UTC (rev 3553) @@ -7,7 +7,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.7.0</version> + <version>1.7.0-SNAPSHOT</version> </parent> <artifactId>dist</artifactId> Modified: trunk/archive-access/projects/wayback/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2011-10-26 16:15:53 UTC (rev 3552) +++ trunk/archive-access/projects/wayback/pom.xml 2011-10-31 20:49:36 UTC (rev 3553) @@ -7,7 +7,7 @@ <groupId>org.archive.wayback</groupId> <artifactId>wayback</artifactId> <packaging>pom</packaging> - <version>1.7.0</version> + <version>1.7.0-SNAPSHOT</version> <name>Wayback</name> <modules> Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2011-10-26 16:15:53 UTC (rev 3552) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2011-10-31 20:49:36 UTC (rev 3553) @@ -8,7 +8,7 @@ <parent> <artifactId>wayback</artifactId> 
<groupId>org.archive.wayback</groupId> - <version>1.7.0</version> + <version>1.7.0-SNAPSHOT</version> </parent> <artifactId>wayback-core</artifactId> Modified: trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2011-10-26 16:15:53 UTC (rev 3552) +++ trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2011-10-31 20:49:36 UTC (rev 3553) @@ -8,7 +8,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.7.0</version> + <version>1.7.0-SNAPSHOT</version> </parent> <artifactId>wayback-hadoop</artifactId> Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2011-10-26 16:15:53 UTC (rev 3552) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2011-10-31 20:49:36 UTC (rev 3553) @@ -8,7 +8,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.7.0</version> + <version>1.7.0-SNAPSHOT</version> </parent> <artifactId>wayback-hadoop-java</artifactId> Modified: trunk/archive-access/projects/wayback/wayback-webapp/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2011-10-26 16:15:53 UTC (rev 3552) +++ trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2011-10-31 20:49:36 UTC (rev 3553) @@ -7,7 +7,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.7.0</version> + <version>1.7.0-SNAPSHOT</version> </parent> <artifactId>wayback-webapp</artifactId> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2011-10-26 16:20:40
|
Wayback-1 - Build # 84 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/84/ to view the results. |
Revision: 3552 http://archive-access.svn.sourceforge.net/archive-access/?rev=3552&view=rev Author: bradtofel Date: 2011-10-26 16:15:53 +0000 (Wed, 26 Oct 2011) Log Message: ----------- BUGFIX: moved QueryFilterGroup above AccessPointFilterGroup - we really need that URL prefix to happen before anything that could potentially skip a lot of records.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-10-25 17:54:42 UTC (rev 3551) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-10-26 16:15:53 UTC (rev 3552) @@ -118,8 +118,8 @@ canonicalizer = new AggressiveUrlCanonicalizer(); fgFactories = new ArrayList<FilterGroupFactory>(); fgFactories.add(new CoreCaptureFilterGroupFactory()); + fgFactories.add(new QueryCaptureFilterGroupFactory()); fgFactories.add(new AccessPointCaptureFilterGroupFactory()); - fgFactories.add(new QueryCaptureFilterGroupFactory()); fgFactories.add(new ExclusionCaptureFilterGroupFactory()); fgFactories.add(new ClosestTrackingCaptureFilterGroupFactory()); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2011-10-25 18:00:49
|
Wayback-1 - Build # 83 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/83/ to view the results. |
Revision: 3551 http://archive-access.svn.sourceforge.net/archive-access/?rev=3551&view=rev Author: bradtofel Date: 2011-10-25 17:54:42 +0000 (Tue, 25 Oct 2011) Log Message: ----------- OPTIMIZ: now uses more efficient SortedCompositeIterator Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXIndex.java 2011-10-25 01:13:05 UTC (rev 3550) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXIndex.java 2011-10-25 17:54:42 UTC (rev 3551) @@ -28,8 +28,8 @@ import org.archive.wayback.resourceindex.SearchResultSource; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.CompositeSortedIterator; import org.archive.wayback.util.flatfile.FlatFile; +import org.archive.wayback.util.iterator.SortedCompositeIterator; /** * @@ -96,10 +96,10 @@ Iterator<CaptureSearchResult> forwardItr = adaptIterator(getRecordIterator(prefix)); Iterator<CaptureSearchResult> reverseItr = adaptIterator(getReverseRecordIterator(prefix)); Comparator<CaptureSearchResult> comparator = new CaptureSRComparator(wantDate); - CompositeSortedIterator<CaptureSearchResult> itr = - new CompositeSortedIterator<CaptureSearchResult>(comparator); - itr.addComponent(forwardItr); - itr.addComponent(reverseItr); + SortedCompositeIterator<CaptureSearchResult> itr = + new SortedCompositeIterator<CaptureSearchResult>(comparator); + itr.addIterator(forwardItr); + itr.addIterator(reverseItr); return itr; } This was sent by the SourceForge.net collaborative development platform, the world's 
largest Open Source development site. |
From: <nl...@ar...> - 2011-10-25 01:20:43
|
Wayback-1 - Build # 82 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/82/ to view the results. |
From: <bra...@us...> - 2011-10-25 01:13:11
|
Revision: 3550 http://archive-access.svn.sourceforge.net/archive-access/?rev=3550&view=rev Author: bradtofel Date: 2011-10-25 01:13:05 +0000 (Tue, 25 Oct 2011) Log Message: ----------- FEATURE: Now delegates response generation to the BetterRequestException Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-10-25 01:04:13 UTC (rev 3549) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-10-25 01:13:05 UTC (rev 3550) @@ -254,10 +254,7 @@ } } catch(BetterRequestException e) { - - httpResponse.setStatus(e.getStatus()); - httpResponse.setHeader("Location", e.getBetterURI()); -// httpResponse.sendRedirect(e.getBetterURI()); + e.generateResponse(httpResponse); handled = true; } catch(WaybackException e) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2011-10-25 01:10:30
|
Wayback-1 - Build # 81 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/81/ to view the results. |
From: <nl...@ar...> - 2011-10-25 01:05:30
|
Wayback-1 - Build # 80 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/80/ to view the results. |
Revision: 3549 http://archive-access.svn.sourceforge.net/archive-access/?rev=3549&view=rev Author: bradtofel Date: 2011-10-25 01:04:13 +0000 (Tue, 25 Oct 2011) Log Message: ----------- BUGFIX: now uses OpenJDK GZIPInputStream, to work around the problem with Oracle's GZIP header parsing.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2011-10-25 01:03:45 UTC (rev 3548) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2011-10-25 01:04:13 UTC (rev 3549) @@ -30,6 +30,7 @@ import java.util.logging.Logger; import java.util.zip.GZIPInputStream; +import org.archive.util.zip.OpenJDK7GZIPInputStream; import org.archive.wayback.exception.RuntimeIOException; import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.CloseableIterator; @@ -134,8 +135,10 @@ for(int i = 0; i < numBlocks; i++) { long offset = i * ZiplinedBlock.BLOCK_SIZE; raf.seek(offset); +// BufferedReader br = new BufferedReader(new InputStreamReader( +// new GZIPInputStream(new FileInputStream(raf.getFD())),ByteOp.UTF8)); BufferedReader br = new BufferedReader(new InputStreamReader( - new GZIPInputStream(new FileInputStream(raf.getFD())),ByteOp.UTF8)); + new OpenJDK7GZIPInputStream(new FileInputStream(raf.getFD())),ByteOp.UTF8)); String line = br.readLine(); if(line == null) { System.err.println("Bad block at " + offset + " in " + args[0]); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source
development site. |
Revision: 3548 http://archive-access.svn.sourceforge.net/archive-access/?rev=3548&view=rev Author: bradtofel Date: 2011-10-25 01:03:45 +0000 (Tue, 25 Oct 2011) Log Message: ----------- BUGFIX: now uses OpenJDK GZIPInputStream, to work around the problem with Oracle's GZIP header parsing.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2011-10-25 01:02:55 UTC (rev 3547) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2011-10-25 01:03:45 UTC (rev 3548) @@ -28,6 +28,7 @@ import java.util.logging.Logger; import java.util.zip.GZIPInputStream; +import org.archive.util.zip.OpenJDK7GZIPInputStream; import org.archive.wayback.util.ByteOp; /** @@ -98,8 +99,11 @@ throw new IOException("Unable to load block!"); } return new BufferedReader(new InputStreamReader( - new GZIPInputStream(new ByteArrayInputStream(bytes)), + new OpenJDK7GZIPInputStream(new ByteArrayInputStream(bytes)), ByteOp.UTF8)); +// return new BufferedReader(new InputStreamReader( +// new GZIPInputStream(new ByteArrayInputStream(bytes)), +// ByteOp.UTF8)); } private BufferedReader readBlockInefficiently() throws IOException { StringBuilder sb = new StringBuilder(16); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3547 http://archive-access.svn.sourceforge.net/archive-access/?rev=3547&view=rev Author: bradtofel Date: 2011-10-25 01:02:55 +0000 (Tue, 25 Oct 2011) Log Message: ----------- BUGFIX(unreported): now we try to make sure the FSDataInputStream gets closed.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java 2011-10-25 00:59:30 UTC (rev 3546) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/HDFSBlockLoader.java 2011-10-25 01:02:55 UTC (rev 3547) @@ -53,7 +53,11 @@ Path path = new Path(url); FSDataInputStream s = fs.open(path); byte buffer[] = new byte[length]; - s.readFully(offset, buffer); + try { + s.readFully(offset, buffer); + } finally { + s.close(); + } return buffer; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |