You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bi...@us...> - 2008-06-26 22:34:14
|
Revision: 2330 http://archive-access.svn.sourceforge.net/archive-access/?rev=2330&view=rev Author: binzino Date: 2008-06-26 15:34:24 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Add libraries from Wayback-1.2.1 binary distribution. These are needed by the WaybackURLFilter plugin. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/lib/libidn-0.6.5.jar trunk/archive-access/projects/nutchwax/archive/lib/mg4j-1.0.1.jar trunk/archive-access/projects/nutchwax/archive/lib/wayback-core-1.2.1.jar Added: trunk/archive-access/projects/nutchwax/archive/lib/libidn-0.6.5.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nutchwax/archive/lib/libidn-0.6.5.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nutchwax/archive/lib/mg4j-1.0.1.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nutchwax/archive/lib/mg4j-1.0.1.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nutchwax/archive/lib/wayback-core-1.2.1.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nutchwax/archive/lib/wayback-core-1.2.1.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-26 22:33:27
|
Revision: 2329 http://archive-access.svn.sourceforge.net/archive-access/?rev=2329&view=rev Author: binzino Date: 2008-06-26 15:33:28 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Add key for archive digest, distinct from whatever digest Nutch might calculate itself. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-06-26 22:32:40 UTC (rev 2328) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-06-26 22:33:28 UTC (rev 2329) @@ -26,4 +26,5 @@ public static final String COLLECTION_KEY = "collection"; public static final String CONTENT_TYPE_KEY = "type"; public static final String DATE_KEY = "date"; + public static final String DIGEST_KEY = "archive-digest"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-26 22:32:35
|
Revision: 2328 http://archive-access.svn.sourceforge.net/archive-access/?rev=2328&view=rev Author: binzino Date: 2008-06-26 15:32:40 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Ensure digest calculation is enabled in ARC reading. Convert dates read from WARC files from WARC format to 14-digit format. Explicitly set digest if reading from WARC file. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcReader.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcReader.java 2008-06-26 22:30:24 UTC (rev 2327) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcReader.java 2008-06-26 22:32:40 UTC (rev 2328) @@ -209,13 +209,27 @@ Map arcMetadataFields = new HashMap( ); arcMetadataFields.put( ARCConstants.URL_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_URI ) ); arcMetadataFields.put( ARCConstants.IP_HEADER_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_IP ) ); - arcMetadataFields.put( ARCConstants.DATE_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_DATE ) ); arcMetadataFields.put( ARCConstants.MIMETYPE_FIELD_KEY, header.getHeaderValue( null ) ); // We don't know the MIME type of the *payload* in a WARC (yet) arcMetadataFields.put( ARCConstants.LENGTH_FIELD_KEY, header.getHeaderValue( WARCConstants.CONTENT_LENGTH ) ); arcMetadataFields.put( ARCConstants.VERSION_FIELD_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these? arcMetadataFields.put( ARCConstants.ABSOLUTE_OFFSET_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these? 
+ + // Dates must be converted from WARC format to 14-digit format, + // that is, from YYYY-MM-DDTHH:MM:SSZ to YYYYMMDDHHMMSS + String warcDate = (String) header.getHeaderValue( WARCConstants.HEADER_KEY_DATE ); + StringBuilder date = new StringBuilder( ) + .append( warcDate, 0, 4 ) + .append( warcDate, 5, 7 ) + .append( warcDate, 8, 10 ) + .append( warcDate, 11, 13 ) + .append( warcDate, 14, 16 ) + .append( warcDate, 17, 19 ); + + arcMetadataFields.put( ARCConstants.DATE_FIELD_KEY, date.toString( ) ); ARCRecordMetaData metadata = new ARCRecordMetaData( header.getReaderIdentifier( ), arcMetadataFields ); + + metadata.setDigest( (String) header.getHeaderValue( WARCConstants.HEADER_KEY_PAYLOAD_DIGEST ) ); // Then, create an ARCRecord using the WARCRecord and the // ARCRecordMetaData object we just created. @@ -250,6 +264,7 @@ } + /** * Simple test/debug driver to read an archive file and print out * the header for each record. @@ -258,18 +273,22 @@ { if ( args.length != 1 ) { - System.out.println( "ReaderTest <(w)arc file>" ); + System.out.println( "ArcReader <(w)arc file>" ); System.exit( 1 ); } String arcName = args[0]; ArchiveReader r = ArchiveReaderFactory.get( arcName ); + r.setDigest( true ); ArcReader reader = new ArcReader( r ); for ( ARCRecord rec : reader ) { + // Must call close() for digest calculation to be finished. + rec.close( ); + if ( rec != null ) System.out.println( rec.getHeader( ) ); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-26 22:30:16
|
Revision: 2327 http://archive-access.svn.sourceforge.net/archive-access/?rev=2327&view=rev Author: binzino Date: 2008-06-26 15:30:24 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Initial revision of modified version of Lucene's ParallelReader. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2008-06-26 22:30:24 UTC (rev 2327) @@ -0,0 +1,614 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * ARCHIVE: This must be in the lucene index package because it needs + * to call protected methods on other IndexReader objects. + */ +package org.apache.lucene.index; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.FieldSelectorResult; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.TermVectorMapper; + +import java.io.IOException; +import java.util.*; + + +/** An IndexReader which reads multiple, parallel indexes. Each index added + * must have the same number of documents, but typically each contains + * different fields. Each document contains the union of the fields of all + * documents with the same document number. When searching, matches for a + * query term are from the first index added that has the field. + * + * <p>This is useful, e.g., with collections that have large fields which + * change rarely and small fields that change more frequently. The smaller + * fields may be re-indexed in a new index and both indexes may be searched + * together. + * + * <p><strong>Warning:</strong> It is up to you to make sure all indexes + * are created and modified the same way. For example, if you add + * documents to one index, you need to add the same documents in the + * same order to the other indexes. <em>Failure to do so will result in + * undefined behavior</em>. 
+ */ +public class ArchiveParallelReader extends IndexReader { + private List readers = new ArrayList(); + private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close + boolean incRefReaders = false; + private SortedMap fieldToReader = new TreeMap(); + + private int maxDoc; + private int numDocs; + private boolean hasDeletions; + + /** Construct a ArchiveParallelReader. + * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p> + */ + public ArchiveParallelReader() throws IOException { this(true); } + + /** Construct a ArchiveParallelReader. + * @param closeSubReaders indicates whether the subreaders should be closed + * when this ArchiveParallelReader is closed + */ + public ArchiveParallelReader(boolean closeSubReaders) throws IOException { + super(); + this.incRefReaders = !closeSubReaders; + } + + /** Add an IndexReader. + * @throws IOException if there is a low-level IO error + */ + public void add(IndexReader reader) throws IOException + { + ensureOpen(); + if (readers.size() == 0) { + this.maxDoc = reader.maxDoc(); + this.numDocs = reader.numDocs(); + this.hasDeletions = reader.hasDeletions(); + } + + if (reader.maxDoc() != maxDoc) // check compatibility + throw new IllegalArgumentException + ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc()); + if (reader.numDocs() != numDocs) + throw new IllegalArgumentException + ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); + + Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); + Iterator i = fields.iterator(); + while (i.hasNext()) { // update fieldToReader map + String field = (String)i.next(); + if (fieldToReader.get(field) == null) + fieldToReader.put(field, reader); + } + + readers.add(reader); + + if (incRefReaders) { + reader.incRef(); + } + decrefOnClose.add(Boolean.valueOf(incRefReaders)); + } + + /** + * Tries to reopen the subreaders. 
+ * <br> + * If one or more subreaders could be re-opened (i. e. subReader.reopen() + * returned a new instance != subReader), then a new ArchiveParallelReader instance + * is returned, otherwise this instance is returned. + * <p> + * A re-opened instance might share one or more subreaders with the old + * instance. Index modification operations result in undefined behavior + * when performed before the old instance is closed. + * (see {@link IndexReader#reopen()}). + * <p> + * If subreaders are shared, then the reference count of those + * readers is increased to ensure that the subreaders remain open + * until the last referring reader is closed. + * + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public IndexReader reopen() throws CorruptIndexException, IOException { + ensureOpen(); + + boolean reopened = false; + List newReaders = new ArrayList(); + List newDecrefOnClose = new ArrayList(); + + boolean success = false; + + try { + + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = (IndexReader) readers.get(i); + IndexReader newReader = oldReader.reopen(); + newReaders.add(newReader); + // if at least one of the subreaders was updated we remember that + // and return a new MultiReader + if (newReader != oldReader) { + reopened = true; + } + } + + if (reopened) { + ArchiveParallelReader pr = new ArchiveParallelReader(); + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = (IndexReader) readers.get(i); + IndexReader newReader = (IndexReader) newReaders.get(i); + if (newReader == oldReader) { + newDecrefOnClose.add(Boolean.TRUE); + newReader.incRef(); + } else { + // this is a new subreader instance, so on close() we don't + // decRef but close it + newDecrefOnClose.add(Boolean.FALSE); + } + pr.add(newReader); + } + pr.decrefOnClose = newDecrefOnClose; + pr.incRefReaders = incRefReaders; + success = true; + return pr; + } else { + success = true; + // No 
subreader was refreshed + return this; + } + } finally { + if (!success && reopened) { + for (int i = 0; i < newReaders.size(); i++) { + IndexReader r = (IndexReader) newReaders.get(i); + if (r != null) { + try { + if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) { + r.decRef(); + } else { + r.close(); + } + } catch (IOException ignore) { + // keep going - we want to clean up as much as possible + } + } + } + } + } + } + + + public int numDocs() { + // Don't call ensureOpen() here (it could affect performance) + return numDocs; + } + + public int maxDoc() { + // Don't call ensureOpen() here (it could affect performance) + return maxDoc; + } + + public boolean hasDeletions() { + // Don't call ensureOpen() here (it could affect performance) + return hasDeletions; + } + + // check first reader + public boolean isDeleted(int n) { + // Don't call ensureOpen() here (it could affect performance) + if (readers.size() > 0) + return ((IndexReader)readers.get(0)).isDeleted(n); + return false; + } + + // delete in all readers + protected void doDelete(int n) throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + ((IndexReader)readers.get(i)).deleteDocument(n); + } + hasDeletions = true; + } + + /** + * @see org.apache.lucene.index.ParallelReader.doUndeleteAll + */ + protected void doUndeleteAll() throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + ((IndexReader)readers.get(i)).undeleteAll(); + } + hasDeletions = false; + } + + /** + * <p><strong>ARCHIVE</strong> modification</p> + * <p>Return a <code>Document</code> with fields merged from parallel + * indices. The values for a given field will <strong>only</strong> + * come from the first index that has the field. 
This matches the + * searching behavior where a field is only searched in the first + * index that has the field.</p> + * <p>This differs from the bundled Lucene <code>ParallelReader</code>, + * which adds all vales from every index that has the field.</p> + * <p>The <code>fieldSelector<code> parameter is ignored.</p> + * <h3>Implementation Notes</h3> + * <p>Since getting the document from the reader is the expensive + * operation, we only get it once from each reader. Once we've + * gotten the document from the reader, we iterate through the + * fields and only copy those fields that are mapped to the reader.</p> + * <p>The first implementation iterated through the field names, + * getting the document from the corresponding reader for each + * field name (10 fields => 10 document gets) which was a big + * performance hit.</p> + * <p>In this implementation, there are only as many document gets as + * there are readers.</p> + * @param n ordinal position of document to return + * @param fieldSelector ignored + * @return the document with field values assembled from parallel indicdes + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public Document document(int n, FieldSelector fieldSelector) + throws CorruptIndexException, IOException + { + ensureOpen(); + Document result = new Document(); + + for ( IndexReader reader : (List<IndexReader>) readers ) + { + Document d = reader.document( n ); + + for ( Fieldable f : ((List<Fieldable>) d.getFields()) ) + { + if ( fieldToReader.get( f.name( ) ) == reader ) + { + result.add( f ); + } + } + } + + return result; + } + + // get all vectors + public TermFreqVector[] getTermFreqVectors(int n) throws IOException { + ensureOpen(); + ArrayList results = new ArrayList(); + Iterator i = fieldToReader.entrySet().iterator(); + while (i.hasNext()) { + Map.Entry e = (Map.Entry)i.next(); + String field = (String)e.getKey(); + IndexReader reader = 
(IndexReader)e.getValue(); + TermFreqVector vector = reader.getTermFreqVector(n, field); + if (vector != null) + results.add(vector); + } + return (TermFreqVector[]) + results.toArray(new TermFreqVector[results.size()]); + } + + public TermFreqVector getTermFreqVector(int n, String field) + throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader==null ? null : reader.getTermFreqVector(n, field); + } + + + public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader != null) { + reader.getTermFreqVector(docNumber, field, mapper); + } + } + + public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { + ensureOpen(); + ensureOpen(); + + Iterator i = fieldToReader.entrySet().iterator(); + while (i.hasNext()) { + Map.Entry e = (Map.Entry)i.next(); + String field = (String)e.getKey(); + IndexReader reader = (IndexReader)e.getValue(); + reader.getTermFreqVector(docNumber, field, mapper); + } + + } + + public boolean hasNorms(String field) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader==null ? false : reader.hasNorms(field); + } + + public byte[] norms(String field) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader==null ? 
null : reader.norms(field); + } + + public void norms(String field, byte[] result, int offset) + throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + reader.norms(field, result, offset); + } + + protected void doSetNorm(int n, String field, byte value) + throws CorruptIndexException, IOException { + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + reader.doSetNorm(n, field, value); + } + + public TermEnum terms() throws IOException { + ensureOpen(); + return new ParallelTermEnum(); + } + + public TermEnum terms(Term term) throws IOException { + ensureOpen(); + return new ParallelTermEnum(term); + } + + public int docFreq(Term term) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + return reader==null ? 0 : reader.docFreq(term); + } + + public TermDocs termDocs(Term term) throws IOException { + ensureOpen(); + return new ParallelTermDocs(term); + } + + public TermDocs termDocs() throws IOException { + ensureOpen(); + return new ParallelTermDocs(); + } + + public TermPositions termPositions(Term term) throws IOException { + ensureOpen(); + return new ParallelTermPositions(term); + } + + public TermPositions termPositions() throws IOException { + ensureOpen(); + return new ParallelTermPositions(); + } + + /** + * Checks recursively if all subreaders are up to date. 
+ */ + public boolean isCurrent() throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + if (!((IndexReader)readers.get(i)).isCurrent()) { + return false; + } + } + + // all subreaders are up to date + return true; + } + + /** + * Checks recursively if all subindexes are optimized + */ + public boolean isOptimized() { + for (int i = 0; i < readers.size(); i++) { + if (!((IndexReader)readers.get(i)).isOptimized()) { + return false; + } + } + + // all subindexes are optimized + return true; + } + + + /** Not implemented. + * @throws UnsupportedOperationException + */ + public long getVersion() { + throw new UnsupportedOperationException("ArchiveParallelReader does not support this method."); + } + + // for testing + IndexReader[] getSubReaders() { + return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); + } + + protected void doCommit() throws IOException { + for (int i = 0; i < readers.size(); i++) + ((IndexReader)readers.get(i)).commit(); + } + + protected synchronized void doClose() throws IOException { + for (int i = 0; i < readers.size(); i++) { + if (((Boolean) decrefOnClose.get(i)).booleanValue()) { + ((IndexReader)readers.get(i)).decRef(); + } else { + ((IndexReader)readers.get(i)).close(); + } + } + } + + public Collection getFieldNames (IndexReader.FieldOption fieldNames) { + ensureOpen(); + Set fieldSet = new HashSet(); + for (int i = 0; i < readers.size(); i++) { + IndexReader reader = ((IndexReader)readers.get(i)); + Collection names = reader.getFieldNames(fieldNames); + fieldSet.addAll(names); + } + return fieldSet; + } + + private class ParallelTermEnum extends TermEnum { + private String field; + private Iterator fieldIterator; + private TermEnum termEnum; + + public ParallelTermEnum() throws IOException { + field = (String)fieldToReader.firstKey(); + if (field != null) + termEnum = ((IndexReader)fieldToReader.get(field)).terms(); + } + + public ParallelTermEnum(Term term) throws IOException { + 
field = term.field(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + termEnum = reader.terms(term); + } + + public boolean next() throws IOException { + if (termEnum==null) + return false; + + // another term in this field? + if (termEnum.next() && termEnum.term().field()==field) + return true; // yes, keep going + + termEnum.close(); // close old termEnum + + // find the next field with terms, if any + if (fieldIterator==null) { + fieldIterator = fieldToReader.tailMap(field).keySet().iterator(); + fieldIterator.next(); // Skip field to get next one + } + while (fieldIterator.hasNext()) { + field = (String) fieldIterator.next(); + termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, "")); + Term term = termEnum.term(); + if (term!=null && term.field()==field) + return true; + else + termEnum.close(); + } + + return false; // no more fields + } + + public Term term() { + if (termEnum==null) + return null; + + return termEnum.term(); + } + + public int docFreq() { + if (termEnum==null) + return 0; + + return termEnum.docFreq(); + } + + public void close() throws IOException { + if (termEnum!=null) + termEnum.close(); + } + + } + + // wrap a TermDocs in order to support seek(Term) + private class ParallelTermDocs implements TermDocs { + protected TermDocs termDocs; + + public ParallelTermDocs() {} + public ParallelTermDocs(Term term) throws IOException { seek(term); } + + public int doc() { return termDocs.doc(); } + public int freq() { return termDocs.freq(); } + + public void seek(Term term) throws IOException { + IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + termDocs = reader!=null ? 
reader.termDocs(term) : null; + } + + public void seek(TermEnum termEnum) throws IOException { + seek(termEnum.term()); + } + + public boolean next() throws IOException { + if (termDocs==null) + return false; + + return termDocs.next(); + } + + public int read(final int[] docs, final int[] freqs) throws IOException { + if (termDocs==null) + return 0; + + return termDocs.read(docs, freqs); + } + + public boolean skipTo(int target) throws IOException { + if (termDocs==null) + return false; + + return termDocs.skipTo(target); + } + + public void close() throws IOException { + if (termDocs!=null) + termDocs.close(); + } + + } + + private class ParallelTermPositions + extends ParallelTermDocs implements TermPositions { + + public ParallelTermPositions() {} + public ParallelTermPositions(Term term) throws IOException { seek(term); } + + public void seek(Term term) throws IOException { + IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + termDocs = reader!=null ? reader.termPositions(term) : null; + } + + public int nextPosition() throws IOException { + // It is an error to call this if there is no next position, e.g. if termDocs==null + return ((TermPositions)termDocs).nextPosition(); + } + + public int getPayloadLength() { + return ((TermPositions)termDocs).getPayloadLength(); + } + + public byte[] getPayload(byte[] data, int offset) throws IOException { + return ((TermPositions)termDocs).getPayload(data, offset); + } + + + // TODO: Remove warning after API has been finalized + public boolean isPayloadAvailable() { + return ((TermPositions) termDocs).isPayloadAvailable(); + } + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-26 22:29:12
|
Revision: 2326 http://archive-access.svn.sourceforge.net/archive-access/?rev=2326&view=rev Author: binzino Date: 2008-06-26 15:29:21 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Initial revision of WaybackURLFilter and associated changes to build files. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/build.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/plugin.xml trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml 2008-06-26 22:26:10 UTC (rev 2325) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/build-plugin.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -55,6 +55,10 
@@ <fileset dir="${nutch.root}/lib"> <include name="*.jar" /> </fileset> + <!-- This is the contrib/archive/lib directory --> + <fileset dir="../../../lib"> + <include name="*.jar" /> + </fileset> <path refid="plugin.deps"/> </path> Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml 2008-06-26 22:26:10 UTC (rev 2325) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/build.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -26,17 +26,18 @@ <!-- Build & deploy all the plugin jars. --> <!-- ====================================================== --> <target name="deploy"> - <ant dir="index-nutchwax" target="deploy"/> - <ant dir="query-nutchwax" target="deploy"/> + <ant dir="index-nutchwax" target="deploy"/> + <ant dir="query-nutchwax" target="deploy"/> + <ant dir="urlfilter-nutchwax" target="deploy"/> </target> <!-- ====================================================== --> <!-- Clean all of the plugins. 
--> <!-- ====================================================== --> <target name="clean"> - <ant dir="index-nutchwax" target="clean"/> - <ant dir="query-nutchwax" target="clean"/> + <ant dir="index-nutchwax" target="clean"/> + <ant dir="query-nutchwax" target="clean"/> + <ant dir="urlfilter-nutchwax" target="clean"/> </target> </project> - Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml 2008-06-26 22:26:10 UTC (rev 2325) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -21,7 +21,7 @@ --> <plugin id="index-nutchwax" - name="NutchWax Indexing Filter" + name="NutchWAX Indexing Filter" version="1.0.0" provider-name="archive.org"> @@ -36,9 +36,9 @@ </requires> <extension id="org.apache.nutch.indexer.basic" - name="NutchWax Indexing Filter" + name="Configurable Indexing Filter" point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="NutchWaxIndexingFilter" + <implementation id="ConfigurableIndexingFilter" class="org.archive.nutchwax.index.ConfigurableIndexingFilter" /> </extension> Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2008-06-26 22:26:10 UTC (rev 2325) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/plugin.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -21,7 +21,7 @@ --> <plugin id="query-nutchwax" - name="NutchWax Query Filter" + name="NutchWAX Query Filter" version="1.0.0" provider-name="archive.org"> Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/build.xml 
=================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/build.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/build.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlfilter-nutchwax" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/plugin.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/plugin.xml 2008-06-26 22:29:21 UTC (rev 2326) @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlfilter-nutchwax" + name="NutchWAX URL Filter" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="urlfilter-nutchwax.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.urlfilter.wayback" + name="Wayback URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="WaybackURLFilter" + class="org.archive.nutchwax.urlfilter.WaybackURLFilter"/> + </extension> + +</plugin> Added: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2008-06-26 22:29:21 UTC (rev 2326) @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). 
+ * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.urlfilter; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.httpclient.URIException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.nutch.net.URLFilter; + +import org.archive.wayback.UrlCanonicalizer; + +/** + * Nutch URLFilter that filters a URL based on URL+digest+date + * metadata values, where the URL can also be canonicalized using the + * same logic as the Wayback. By making Wayback canonicalization + * available, we can use exclusion rules generated from CDX files. 
+ */ +// TODO: Add logging +public class WaybackURLFilter implements URLFilter +{ + public static final Log LOG = LogFactory.getLog( WaybackURLFilter.class ); + + private Configuration conf; + private UrlCanonicalizer canonicalizer; + private Set<String> exclusions; + + public WaybackURLFilter( ) + { + } + + /** + * + */ + public String filter( String urlString ) + { + // Assume input is in expected form of space-separated values + // url + // digest + // 14-digit timestamp + String s[] = urlString.split( "\\s+" ); + + if ( s.length != 3 ) + { + // Don't filter. + LOG.info( "Allowing: " + urlString ); + + return urlString; + } + + boolean exclude = false; + + String url = s[0]; + String digest = s[1]; + String date = s[2]; + + try + { + // First, transform the URL into the same form that the + // Wayback uses for CDX files. + url = this.canonicalizer.urlStringToKey( url ); + + // Then, build a key to be compared against the exclusion + // list. + String key = url + " " + digest + " " + date; + + exclude = this.exclusions.contains( key ); + } + catch ( URIException e ) + { + // If we can't handle the URL, we let it through. + exclude = false; + } + + if ( exclude ) + { + LOG.info( "Excluding: " + urlString ); + + return null; + } + + LOG.info( "Allowing : " + urlString ); + + return urlString; + } + + public Configuration getConf( ) + { + return conf; + } + + public void setConf( Configuration conf ) + { + this.conf = conf; + + this.canonicalizer = getCanonicalizer( conf ); + this.exclusions = getExclusions ( conf ); + } + + /** + * Utility function to instantiate a UrlCanonicalizer based on an + * implementation specified in the configuration. + */ + public static UrlCanonicalizer getCanonicalizer( Configuration conf ) + { + // Which Wayback canonicalizer to use: Aggressive, Identity, etc. 
+ String canonicalizerClassName = conf.get( "nutchwax.urlfilter.wayback.canonicalizer" ); + + if ( canonicalizerClassName == null || canonicalizerClassName.trim().length() == 0 ) + { + throw new RuntimeException( "Missing value for property: nutchwax.urlfilter.wayback.canonicalizer" ); + } + + try + { + UrlCanonicalizer canonicalizer = (UrlCanonicalizer) Class.forName( canonicalizerClassName ).newInstance( ); + + return canonicalizer; + } + catch ( Exception e ) + { + // If we can't instantiate it, there's not much else we can do + // other than just throw the Exception. + throw new RuntimeException( e ); + } + } + + /** + * Utility function to read a list of exclusion records from a file + * specified in the configuration. + */ + public static Set<String> getExclusions( Configuration conf ) + { + String exclusionsPath = conf.get( "nutchwax.urlfilter.wayback.exclusions" ); + + if ( exclusionsPath == null || exclusionsPath.trim().length() == 0 ) + { + LOG.warn( "No exclusions file set for property: \"nutchwax.urlfilter.wayback.exclusions\"" ); + + return Collections.EMPTY_SET; + } + + LOG.warn( "Using exclusions: " + exclusionsPath ); + + Set<String> exclusions = new HashSet<String>( ); + + BufferedReader reader = null; + try + { + Path p = new Path( exclusionsPath.trim() ); + + FileSystem fs = FileSystem.get( conf ); + + if ( fs.exists( p ) ) + { + InputStream is = p.getFileSystem( conf ).open( p ); + + reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + + String line; + while ( (line = reader.readLine()) != null ) + { + exclusions.add( line ); + } + } + else + { + LOG.warn( "Exclusions doesn't exist: " + exclusionsPath ); + } + } + catch ( IOException e ) + { + // Umm, what to do? + throw new RuntimeException( e ); + } + finally + { + try + { + if ( reader != null ) + { + reader.close( ); + } + } + catch ( IOException e ) + { + // Ignore it. 
+ } + } + + return exclusions; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-26 22:26:04
|
Revision: 2325 http://archive-access.svn.sourceforge.net/archive-access/?rev=2325&view=rev Author: binzino Date: 2008-06-26 15:26:10 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Initial revision of scripts for processing CDX files for duplicate and revisit records. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx trunk/archive-access/projects/nutchwax/archive/bin/dups-from trunk/archive-access/projects/nutchwax/archive/bin/revisits Added: trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-06-26 22:26:10 UTC (rev 2325) @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +if [ "$#" -eq 0 ]; +then + echo "Usage: dedup-cdx <cdx>..." + echo "To read from standard input, use \"-\" as a filename." + echo + echo "Finds duplicate records in a set of CDX files and outputs them " + echo "in a format suitable for use with NutchWAX tools." + echo + echo "Duplicate records are found by sorting all the CDX records, then" + echo "comparing subsequent records by URL+digest." + echo + echo "Output is in abbreviated form of \"URL digest date\", ex:" + echo + echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505" + echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443" + echo + echo "The output of this script can be used as an exclusions file for" + echo "importing (W)ARC files with NutchWAX, and also for adding dates" + echo "to a parallel index." 
+ echo + exit 1; +fi + +cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | sort | awk '{ if ( url == $1 && digest == $2 ) print $1 " " $2 " " $3 ; url = $1 ; digest = $2 }' Property changes on: trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx ___________________________________________________________________ Name: svn:executable + * Added: trunk/archive-access/projects/nutchwax/archive/bin/dups-from =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/dups-from (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/bin/dups-from 2008-06-26 22:26:10 UTC (rev 2325) @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +if [ "$#" -lt 2 ]; +then + echo "Usage: dups-from <dups> <cdx>..." + echo "To read <cdx> from standard input, use \"-\" as a filename." + echo + echo "Extract the lines from <dups> that come from the <cdx>... files" + echo + exit 1; +fi + +dups=$1 +shift + +cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | cat - ${dups} | sort | uniq -d Property changes on: trunk/archive-access/projects/nutchwax/archive/bin/dups-from ___________________________________________________________________ Name: svn:executable + * Added: trunk/archive-access/projects/nutchwax/archive/bin/revisits =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/revisits (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/bin/revisits 2008-06-26 22:26:10 UTC (rev 2325) @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +if [ "$#" -eq 0 ]; +then + echo "Usage: revisits <cdx>..." + echo + echo "Extract revisit records from a CDX file." + echo "Normally only CDX's generated from WARCs will have revisit records." 
+ exit 1; +fi + +cat $@ | awk '{ if ( $9 == "-" ) print $1 " sha1:" $6 " " $2 }' | sort Property changes on: trunk/archive-access/projects/nutchwax/archive/bin/revisits ___________________________________________________________________ Name: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-25 01:34:18
|
Revision: 2324 http://archive-access.svn.sourceforge.net/archive-access/?rev=2324&view=rev Author: bradtofel Date: 2008-06-24 18:34:28 -0700 (Tue, 24 Jun 2008) Log Message: ----------- DELETED Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/http/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-25 01:33:11
|
Revision: 2323 http://archive-access.svn.sourceforge.net/archive-access/?rev=2323&view=rev Author: bradtofel Date: 2008-06-24 18:33:19 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: package shuffling reference updates. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -1,218 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.net.MalformedURLException; 
-import java.util.HashMap; -import java.util.Iterator; -import java.util.logging.Logger; - -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.updater.IndexClient; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.DirMaker; - -/** - * Thread that repeatedly notices new files in the LocalResourceStore, indexes - * those files, and hands them off to a ResourceIndex via an IndexClient - * - * @author brad - * @version $Date$, $Revision$ - */ -public class AutoIndexThread extends Thread { - private static final Logger LOGGER = - Logger.getLogger(AutoIndexThread.class.getName()); - - private final static int DEFAULT_RUN_INTERVAL_MS = 10000; - private LocalResourceStore store = null; - private File workDir = null; - private File queuedDir = null; - private int runInterval = DEFAULT_RUN_INTERVAL_MS; - private IndexClient indexClient = null; - - /** - * @param store - * @param runInterval - */ - public AutoIndexThread() { - super("AutoARCIndexThread"); - super.setDaemon(true); - } - - public void run() { - LOGGER.info("AutoIndexThread is alive."); - int sleepInterval = runInterval; - if(store == null) { - throw new RuntimeException("No LocalResourceStore set"); - } - while (true) { - try { - int numIndexed = indexNewArcs(); - if (numIndexed == 0) { - sleep(sleepInterval); - sleepInterval += runInterval; - } else { - sleepInterval = runInterval; - } - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - - /** - * Scan for new ARC files, and index any new files discovered. - * - * There are 3 main steps, which could be broken into separate threads: - * 1) detect new ARCs - * 2) create CDX files for each new ARC - * 3) upload CDX files to target (or rename to local "incoming" directory) - * - * for now these are sequential. 
- * - * @return number of ARC files indexed - */ - public int indexNewArcs() { - int numIndexed = 0; - try { - queueNewArcsForIndex(); - } catch (IOException e) { - e.printStackTrace(); - } - try { - numIndexed = indexArcs(10); - } catch (MalformedURLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return numIndexed; - } - /** - * Find any new ARC files and queue them for indexing. - * @throws IOException - */ - public void queueNewArcsForIndex() throws IOException { - - // build a HashMap of what has been queued already: - HashMap<String,String> queued = new HashMap<String, String>(); - String entries[] = queuedDir.list(); - if(entries != null) { - for (int i = 0; i < entries.length; i++) { - queued.put(entries[i], "i"); - } - } - // now scan thru arcDir, and make a flag file for anything that was not - // already there: - Iterator<String> files = store.fileNamesIterator(); - if(files != null) { - while(files.hasNext()) { - String fileName = files.next(); - if(!queued.containsKey(fileName)) { - File newQueuedFile = new File(queuedDir,fileName); - File newToBeIndexedFile = new File(workDir,fileName); - newToBeIndexedFile.createNewFile(); - newQueuedFile.createNewFile(); - } - } - } - } - - private String fileNameToBase(final String fileName) { - return fileName; - } - - /** - * Index up to 'max' ARC/WARC files queued for indexing, queueing the - * resulting CDX files for merging with the BDBIndex. 
- * - * @param indexer - * @param max maximum number to index in this method call, 0 for unlimited - * @return int number of ARC/WARC files indexed - * @throws MalformedURLException - * @throws IOException - */ - public int indexArcs(int max) - throws MalformedURLException, IOException { - - int numIndexed = 0; - String toBeIndexed[] = workDir.list(); - - if (toBeIndexed != null) { - for (int i = 0; i < toBeIndexed.length; i++) { - String fileName = toBeIndexed[i]; - File file = store.getLocalFile(fileName); - if(file != null) { - File workFlagFile = new File(workDir,fileName); - String cdxBase = fileNameToBase(fileName); - - try { - - LOGGER.info("Indexing " + file.getAbsolutePath()); - CloseableIterator<SearchResult> itr = store.indexFile(file); - - if(indexClient.addSearchResults(cdxBase, itr)) { - if (!workFlagFile.delete()) { - throw new IOException("Unable to delete " - + workFlagFile.getAbsolutePath()); - } - } - itr.close(); - numIndexed++; - } catch (IOException e) { - LOGGER.severe("FAILED index: " + file.getAbsolutePath() - + " cause: " + e.getLocalizedMessage()); - } - if(max > 0 && (numIndexed >= max)) { - break; - } - } - } - } - return numIndexed; - } - - - - public LocalResourceStore getStore() { - return store; - } - - public void setStore(LocalResourceStore store) { - this.store = store; - } - - public String getWorkDir() { - return workDir == null ? null : workDir.getAbsolutePath(); - } - - public void setWorkDir(String workDir) throws IOException { - this.workDir = DirMaker.ensureDir(workDir); - } - - public String getQueuedDir() { - return queuedDir == null ? 
null : queuedDir.getAbsolutePath(); - } - - public void setQueuedDir(String queuedDir) throws IOException { - this.queuedDir = DirMaker.ensureDir(queuedDir); - } - - public int getRunInterval() { - return runInterval; - } - - public void setRunInterval(int runInterval) { - this.runInterval = runInterval; - } - - public IndexClient getIndexClient() { - return indexClient; - } - - public void setIndexClient(IndexClient indexClient) { - this.indexClient = indexClient; - } -} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -0,0 +1,218 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.updater.IndexClient; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.DirMaker; + +/** + * Thread that repeatedly notices new files in the LocalResourceStore, indexes + * those files, and hands them off to a ResourceIndex via an IndexClient + * + * @author brad + * @version $Date$, $Revision$ + */ +public class AutoIndexThread extends Thread { + private static final Logger LOGGER = + Logger.getLogger(AutoIndexThread.class.getName()); + + private final static int DEFAULT_RUN_INTERVAL_MS = 10000; + private LocalResourceStore store = null; + private File workDir = null; + private File queuedDir = null; + private int runInterval = DEFAULT_RUN_INTERVAL_MS; + 
private IndexClient indexClient = null; + + /** + * @param store + * @param runInterval + */ + public AutoIndexThread() { + super("AutoARCIndexThread"); + super.setDaemon(true); + } + + public void run() { + LOGGER.info("AutoIndexThread is alive."); + int sleepInterval = runInterval; + if(store == null) { + throw new RuntimeException("No LocalResourceStore set"); + } + while (true) { + try { + int numIndexed = indexNewArcs(); + if (numIndexed == 0) { + sleep(sleepInterval); + sleepInterval += runInterval; + } else { + sleepInterval = runInterval; + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + /** + * Scan for new ARC files, and index any new files discovered. + * + * There are 3 main steps, which could be broken into separate threads: + * 1) detect new ARCs + * 2) create CDX files for each new ARC + * 3) upload CDX files to target (or rename to local "incoming" directory) + * + * for now these are sequential. + * + * @return number of ARC files indexed + */ + public int indexNewArcs() { + int numIndexed = 0; + try { + queueNewArcsForIndex(); + } catch (IOException e) { + e.printStackTrace(); + } + try { + numIndexed = indexArcs(10); + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return numIndexed; + } + /** + * Find any new ARC files and queue them for indexing. 
+ * @throws IOException + */ + public void queueNewArcsForIndex() throws IOException { + + // build a HashMap of what has been queued already: + HashMap<String,String> queued = new HashMap<String, String>(); + String entries[] = queuedDir.list(); + if(entries != null) { + for (int i = 0; i < entries.length; i++) { + queued.put(entries[i], "i"); + } + } + // now scan thru arcDir, and make a flag file for anything that was not + // already there: + Iterator<String> files = store.fileNamesIterator(); + if(files != null) { + while(files.hasNext()) { + String fileName = files.next(); + if(!queued.containsKey(fileName)) { + File newQueuedFile = new File(queuedDir,fileName); + File newToBeIndexedFile = new File(workDir,fileName); + newToBeIndexedFile.createNewFile(); + newQueuedFile.createNewFile(); + } + } + } + } + + private String fileNameToBase(final String fileName) { + return fileName; + } + + /** + * Index up to 'max' ARC/WARC files queued for indexing, queueing the + * resulting CDX files for merging with the BDBIndex. 
+ * + * @param indexer + * @param max maximum number to index in this method call, 0 for unlimited + * @return int number of ARC/WARC files indexed + * @throws MalformedURLException + * @throws IOException + */ + public int indexArcs(int max) + throws MalformedURLException, IOException { + + int numIndexed = 0; + String toBeIndexed[] = workDir.list(); + + if (toBeIndexed != null) { + for (int i = 0; i < toBeIndexed.length; i++) { + String fileName = toBeIndexed[i]; + File file = store.getLocalFile(fileName); + if(file != null) { + File workFlagFile = new File(workDir,fileName); + String cdxBase = fileNameToBase(fileName); + + try { + + LOGGER.info("Indexing " + file.getAbsolutePath()); + CloseableIterator<SearchResult> itr = store.indexFile(file); + + if(indexClient.addSearchResults(cdxBase, itr)) { + if (!workFlagFile.delete()) { + throw new IOException("Unable to delete " + + workFlagFile.getAbsolutePath()); + } + } + itr.close(); + numIndexed++; + } catch (IOException e) { + LOGGER.severe("FAILED index: " + file.getAbsolutePath() + + " cause: " + e.getLocalizedMessage()); + } + if(max > 0 && (numIndexed >= max)) { + break; + } + } + } + } + return numIndexed; + } + + + + public LocalResourceStore getStore() { + return store; + } + + public void setStore(LocalResourceStore store) { + this.store = store; + } + + public String getWorkDir() { + return workDir == null ? null : workDir.getAbsolutePath(); + } + + public void setWorkDir(String workDir) throws IOException { + this.workDir = DirMaker.ensureDir(workDir); + } + + public String getQueuedDir() { + return queuedDir == null ? 
null : queuedDir.getAbsolutePath(); + } + + public void setQueuedDir(String queuedDir) throws IOException { + this.queuedDir = DirMaker.ensureDir(queuedDir); + } + + public int getRunInterval() { + return runInterval; + } + + public void setRunInterval(int runInterval) { + this.runInterval = runInterval; + } + + public IndexClient getIndexClient() { + return indexClient; + } + + public void setIndexClient(IndexClient indexClient) { + this.indexClient = indexClient; + } +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -32,6 +32,8 @@ import org.archive.wayback.core.Resource; import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.resourcefile.ArcWarcFilenameFilter; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; /** @@ -63,11 +65,11 @@ throw new IOException("No ARC/WARC offset in search result..."); } final long offset = Long.parseLong(offsetString); - if(!fileName.endsWith(LocalResourceStore.ARC_EXTENSION) - && !fileName.endsWith(LocalResourceStore.ARC_GZ_EXTENSION) - && !fileName.endsWith(LocalResourceStore.WARC_EXTENSION) - && !fileName.endsWith(LocalResourceStore.WARC_GZ_EXTENSION)) { - fileName = fileName + LocalResourceStore.ARC_GZ_EXTENSION; + if(!fileName.endsWith(ArcWarcFilenameFilter.ARC_SUFFIX) + && !fileName.endsWith(ArcWarcFilenameFilter.ARC_GZ_SUFFIX) + && !fileName.endsWith(ArcWarcFilenameFilter.WARC_SUFFIX) + && 
!fileName.endsWith(ArcWarcFilenameFilter.WARC_GZ_SUFFIX)) { + fileName = fileName + ArcWarcFilenameFilter.ARC_GZ_SUFFIX; } String fileUrl = urlPrefix + fileName; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -34,6 +34,7 @@ import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; /** * Simple ResourceStore implementation, which uses a ResourceFileLocationDB to Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -1,147 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.FilenameFilter; -import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -import org.archive.wayback.ResourceStore; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.Resource; -import 
org.archive.wayback.core.SearchResult; -import org.archive.wayback.exception.ConfigurationException; -import org.archive.wayback.exception.ResourceNotAvailableException; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.DirMaker; - -/** - * Class which implements a local ARC, WARC, ARC.gz, WARC.gz, ResourceStore - * including an optional automatic indexing thread - * - * @author brad - * @version $Date$, $Revision$ - */ -public class LocalResourceStore implements ResourceStore { - - private File dataDir = null; - private AutoIndexThread indexThread = null; - - private ArcIndexer arcIndexer = new ArcIndexer(); - private WarcIndexer warcIndexer = new WarcIndexer(); - public final static String ARC_EXTENSION = ".arc"; - public final static String ARC_GZ_EXTENSION = ".arc.gz"; - public final static String WARC_EXTENSION = ".warc"; - public final static String WARC_GZ_EXTENSION = ".warc.gz"; - public final static String OPEN_EXTENSION = ".open"; - private final static String[] SUFFIXES = { - "", ARC_EXTENSION, ARC_GZ_EXTENSION, WARC_EXTENSION, WARC_GZ_EXTENSION - }; - private FilenameFilter filter = new ArcWarcFilenameFilter(); - - public void init() throws ConfigurationException { - if(indexThread != null) { - indexThread.setStore(this); - indexThread.start(); - } - } - protected String resultToFileName(SearchResult result) { - return result.get(WaybackConstants.RESULT_ARC_FILE); - } - - protected long resultToOffset(SearchResult result) { - return Long.parseLong(result.get(WaybackConstants.RESULT_OFFSET)); - } - - public File getLocalFile(String fileName) { - // try adding suffixes: empty string is first in the list - File file = null; - for(String suffix : SUFFIXES) { - file = new File(dataDir,fileName + suffix); - if(file.exists() && file.canRead()) { - return file; - } - } - // this might work if the full path is in the index... - file = new File(fileName); - if(file.exists() && file.canRead()) { - return file; - } - // doh. 
- return null; - } - - public Resource retrieveResource(SearchResult result) throws IOException, - ResourceNotAvailableException { - String fileName = resultToFileName(result); - long offset = resultToOffset(result); - File file = getLocalFile(fileName); - if (file == null) { - - // TODO: this needs to be prettied up for end user consumption.. - throw new ResourceNotAvailableException("Cannot find ARC file (" - + fileName + ")"); - } else { - - Resource r = ResourceFactory.getResource(file, offset); - return r; - } - } - - public CloseableIterator<SearchResult> indexFile(File dataFile) throws IOException { - CloseableIterator<SearchResult> itr = null; - - String name = dataFile.getName(); - if(name.endsWith(ARC_EXTENSION)) { - itr = arcIndexer.iterator(dataFile); - } else if(name.endsWith(ARC_GZ_EXTENSION)) { - itr = arcIndexer.iterator(dataFile); - } else if(name.endsWith(WARC_EXTENSION)) { - itr = warcIndexer.iterator(dataFile); - } else if(name.endsWith(WARC_GZ_EXTENSION)) { - itr = warcIndexer.iterator(dataFile); - } - return itr; - } - - public Iterator<String> fileNamesIterator() throws IOException { - if(dataDir != null) { - String[] files = dataDir.list(filter); - List<String> l = Arrays.asList(files); - return l.iterator(); - } - return null; - } - - public String getDataDir() { - return DirMaker.getAbsolutePath(dataDir); - } - - public void setDataDir(String dataDir) throws IOException { - this.dataDir = DirMaker.ensureDir(dataDir); - } - - private class ArcWarcFilenameFilter implements FilenameFilter { - public boolean accept(File dir, String name) { - File tmp = new File(dir,name); - if(tmp.isFile() && tmp.canRead()) { - return name.endsWith(ARC_EXTENSION) || - name.endsWith(ARC_GZ_EXTENSION) || - name.endsWith(WARC_GZ_EXTENSION) || - name.endsWith(WARC_EXTENSION); - } - return false; - } - } - - public AutoIndexThread getIndexThread() { - return indexThread; - } - public void setIndexThread(AutoIndexThread indexThread) { - this.indexThread = 
indexThread; - } - public void shutdown() throws IOException { - // no-op. could shut down threads - } -} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -0,0 +1,150 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ConfigurationException; +import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.indexer.ArcIndexer; +import org.archive.wayback.resourcestore.indexer.WarcIndexer; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.DirMaker; + +/** + * Class which implements a local ARC, WARC, ARC.gz, WARC.gz, ResourceStore + * including an optional automatic indexing thread + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LocalResourceStore implements ResourceStore { + + private File dataDir = null; + private AutoIndexThread indexThread = null; + + private ArcIndexer arcIndexer = new ArcIndexer(); + private WarcIndexer warcIndexer = new WarcIndexer(); + public final static String ARC_EXTENSION = ".arc"; + public final static String ARC_GZ_EXTENSION = ".arc.gz"; + public final 
static String WARC_EXTENSION = ".warc"; + public final static String WARC_GZ_EXTENSION = ".warc.gz"; + public final static String OPEN_EXTENSION = ".open"; + private final static String[] SUFFIXES = { + "", ARC_EXTENSION, ARC_GZ_EXTENSION, WARC_EXTENSION, WARC_GZ_EXTENSION + }; + private FilenameFilter filter = new ArcWarcFilenameFilter(); + + public void init() throws ConfigurationException { + if(indexThread != null) { + indexThread.setStore(this); + indexThread.start(); + } + } + protected String resultToFileName(SearchResult result) { + return result.get(WaybackConstants.RESULT_ARC_FILE); + } + + protected long resultToOffset(SearchResult result) { + return Long.parseLong(result.get(WaybackConstants.RESULT_OFFSET)); + } + + public File getLocalFile(String fileName) { + // try adding suffixes: empty string is first in the list + File file = null; + for(String suffix : SUFFIXES) { + file = new File(dataDir,fileName + suffix); + if(file.exists() && file.canRead()) { + return file; + } + } + // this might work if the full path is in the index... + file = new File(fileName); + if(file.exists() && file.canRead()) { + return file; + } + // doh. + return null; + } + + public Resource retrieveResource(SearchResult result) throws IOException, + ResourceNotAvailableException { + String fileName = resultToFileName(result); + long offset = resultToOffset(result); + File file = getLocalFile(fileName); + if (file == null) { + + // TODO: this needs to be prettied up for end user consumption.. 
+ throw new ResourceNotAvailableException("Cannot find ARC file (" + + fileName + ")"); + } else { + + Resource r = ResourceFactory.getResource(file, offset); + return r; + } + } + + public CloseableIterator<SearchResult> indexFile(File dataFile) throws IOException { + CloseableIterator<SearchResult> itr = null; + + String name = dataFile.getName(); + if(name.endsWith(ARC_EXTENSION)) { + itr = arcIndexer.iterator(dataFile); + } else if(name.endsWith(ARC_GZ_EXTENSION)) { + itr = arcIndexer.iterator(dataFile); + } else if(name.endsWith(WARC_EXTENSION)) { + itr = warcIndexer.iterator(dataFile); + } else if(name.endsWith(WARC_GZ_EXTENSION)) { + itr = warcIndexer.iterator(dataFile); + } + return itr; + } + + public Iterator<String> fileNamesIterator() throws IOException { + if(dataDir != null) { + String[] files = dataDir.list(filter); + List<String> l = Arrays.asList(files); + return l.iterator(); + } + return null; + } + + public String getDataDir() { + return DirMaker.getAbsolutePath(dataDir); + } + + public void setDataDir(String dataDir) throws IOException { + this.dataDir = DirMaker.ensureDir(dataDir); + } + + private class ArcWarcFilenameFilter implements FilenameFilter { + public boolean accept(File dir, String name) { + File tmp = new File(dir,name); + if(tmp.isFile() && tmp.canRead()) { + return name.endsWith(ARC_EXTENSION) || + name.endsWith(ARC_GZ_EXTENSION) || + name.endsWith(WARC_GZ_EXTENSION) || + name.endsWith(WARC_EXTENSION); + } + return false; + } + } + + public AutoIndexThread getIndexThread() { + return indexThread; + } + public void setIndexThread(AutoIndexThread indexThread) { + this.indexThread = indexThread; + } + public void shutdown() throws IOException { + // no-op. 
could shut down threads + } +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -30,8 +30,6 @@ import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.SearchResult; import org.archive.wayback.resourceindex.updater.IndexClient; -import org.archive.wayback.resourcestore.ArcIndexer; -import org.archive.wayback.resourcestore.WarcIndexer; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import org.archive.wayback.util.CloseableIterator; //import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -34,10 +34,11 @@ * @version $Date$, $Revision$ */ public class ArcWarcFilenameFilter implements FilenameFilter { - private final static String ARC_SUFFIX = ".arc"; - private final static String ARC_GZ_SUFFIX = ".arc.gz"; - private final static String WARC_SUFFIX = ".warc"; - private final static String WARC_GZ_SUFFIX = ".warc.gz"; + public final static String ARC_SUFFIX = ".arc"; + public 
final static String ARC_GZ_SUFFIX = ".arc.gz"; + public final static String WARC_SUFFIX = ".warc"; + public final static String WARC_GZ_SUFFIX = ".warc.gz"; + public final static String OPEN_SUFFIX = ".open"; public boolean accept(File dir, String name) { return name.endsWith(ARC_SUFFIX) || This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-25 01:30:51
|
Revision: 2322 http://archive-access.svn.sourceforge.net/archive-access/?rev=2322&view=rev Author: bradtofel Date: 2008-06-24 18:30:59 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moved ARC/WARC record to Resource code into resourcefile package. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java 2008-06-25 01:30:18 UTC (rev 2321) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -1,170 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import 
org.archive.io.arc.ARCRecord; -import org.archive.wayback.core.Resource; - -public class ArcResource extends Resource { - /** - * Logger for this class - */ - private static final Logger LOGGER = Logger.getLogger(ArcResource.class - .getName()); - - /** - * String prefix for ARC file related metadata namespace of keys within - * metaData Properties bag. - */ - private static String ARC_META_PREFIX = "arcmeta."; - /** - * String prefix for HTTP Header related metadata namespace of keys within - * metaData Properties bag. - */ - private static String HTTP_HEADER_PREFIX = "httpheader."; - /** - * object for ARCRecord - */ - ARCRecord arcRecord = null; - /** - * object for ARCReader -- need to hold on to this in order to call close() - * to release filehandle after completing access to this record. optional - */ - ARCReader arcReader = null; - /** - * flag to indicate if the ARCRecord skipHTTPHeader() has been called - */ - boolean parsedHeader = false; - /** - * Expandable property bag for holding metadata associated with this - * resource - */ - Hashtable<String,String> metaData = new Hashtable<String,String>(); - - /** - * Constructor - * - * @param rec - * @param reader - */ - public ArcResource(final ARCRecord rec,final ARCReader reader) { - super(); - arcRecord = rec; - arcReader = reader; - setInputStream(rec); - } - - /** parse the headers on the underlying ARC record, and extract all - * @throws IOException - */ - public void parseHeaders () throws IOException { - if(!parsedHeader) { - arcRecord.skipHttpHeader(); - // copy all HTTP headers to metaData, prefixing with - // HTTP_HEADER_PREFIX - Header[] headers = arcRecord.getHttpHeaders(); - if (headers != null) { - for (int i = 0; i < headers.length; i++) { - String value = headers[i].getValue(); - String name = headers[i].getName(); - metaData.put(HTTP_HEADER_PREFIX + name,value); - } - } - - // copy all ARC record header fields to metaData, prefixing with - // ARC_META_PREFIX - @SuppressWarnings("unchecked") 
- Map<String,Object> headerMetaMap = arcRecord.getMetaData().getHeaderFields(); - Set<String> keys = headerMetaMap.keySet(); - Iterator<String> itr = keys.iterator(); - while(itr.hasNext()) { - String metaKey = itr.next(); - Object value = headerMetaMap.get(metaKey); - String metaValue = ""; - if(value != null) { - metaValue = value.toString(); - } - metaData.put(ARC_META_PREFIX + metaKey,metaValue); - } - - parsedHeader = true; - } - } - - /** - * @param prefix - * @return a Properties of all elements in metaData starting with 'prefix'. - * keys in the returned Properties have 'prefix' removed. - */ - public Map<String,String> filterMeta(String prefix) { - HashMap<String,String> matching = new HashMap<String,String>(); - for (Enumeration<String> e = metaData.keys(); e.hasMoreElements();) { - String key = e.nextElement(); - if (key.startsWith(prefix)) { - String finalKey = key.substring(prefix.length()); - String value = metaData.get(key); - matching.put(finalKey, value); - } - } - return matching; - } - - /** - * @return a Properties containing all HTTP header fields for this record - */ - public Map<String,String> getHttpHeaders() { - return filterMeta(HTTP_HEADER_PREFIX); - } - - /** - * @return a Properties containing all ARC Meta fields for this record - */ - public Map<String,String> getARCMetadata() { - return filterMeta(ARC_META_PREFIX); - } - - /** - * (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#getStatusCode() - * @return int HTTP status code returned with this document. - */ - public int getStatusCode() { - return arcRecord.getStatusCode(); - } - - /** - * @return the ARCRecord underlying this Resource. 
- */ - public ArchiveRecord getArcRecord() { - return arcRecord; - } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#close() - */ - public void close() throws IOException { - //LOGGER.info("About to close..("+arcReader+")"); - arcRecord.close(); - if(arcReader != null) { - arcReader.close(); - LOGGER.info("closed..("+arcReader+")"); - } - } - - /** - * @return byte length claimed in ARC record metadata line. - */ - public long getRecordLength() { - return arcRecord.getMetaData().getLength(); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2008-06-25 01:30:18 UTC (rev 2321) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -1,105 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.net.URL; - -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCReaderFactory; -import org.archive.io.warc.WARCRecord; -import org.archive.wayback.core.Resource; -import org.archive.wayback.exception.ResourceNotAvailableException; - -/** - * Static factory class for constructing ARC/WARC Resources from - * File/URL + offset. 
- * - * @author brad - * @version $Date$, $Revision$ - */ -public class ResourceFactory { - - public static Resource getResource(File file, long offset) - throws IOException, ResourceNotAvailableException { - - Resource r = null; - String name = file.getName(); - if (name.endsWith(LocalResourceStore.OPEN_EXTENSION)) { - name = name.substring(0, name.length() - - LocalResourceStore.OPEN_EXTENSION.length()); - } - if (isArc(name)) { - - ARCReader reader = ARCReaderFactory.get(file,offset); - r = ARCArchiveRecordToResource(reader.get(),reader); - - } else if (isWarc(name)) { - - WARCReader reader = WARCReaderFactory.get(file,offset); - r = WARCArchiveRecordToResource(reader.get(),reader); - - } else { - throw new ResourceNotAvailableException("Unknown extension"); - } - - return r; - } - - public static Resource getResource(URL url, long offset) - throws IOException, ResourceNotAvailableException { - Resource r = null; - String name = url.getFile(); - if (isArc(name)) { - - ARCReader reader = ARCReaderFactory.get(url, offset); - r = ARCArchiveRecordToResource(reader.get(),reader); - - } else if (isWarc(name)) { - - WARCReader reader = WARCReaderFactory.get(url, offset); - r = WARCArchiveRecordToResource(reader.get(),reader); - - } else { - throw new ResourceNotAvailableException("Unknown extension"); - } - return r; - } - - private static boolean isArc(final String name) { - - return (name.endsWith(LocalResourceStore.ARC_EXTENSION) - || name.endsWith(LocalResourceStore.ARC_GZ_EXTENSION)); - } - - private static boolean isWarc(final String name) { - - return (name.endsWith(LocalResourceStore.WARC_EXTENSION) - || name.endsWith(LocalResourceStore.WARC_GZ_EXTENSION)); - } - - private static Resource ARCArchiveRecordToResource(ArchiveRecord rec, - ARCReader reader) throws ResourceNotAvailableException, IOException { - - if (!(rec instanceof ARCRecord)) { - throw new ResourceNotAvailableException("Bad ARCRecord format"); - } - ArcResource ar = new ArcResource((ARCRecord) 
rec, reader); - ar.parseHeaders(); - return ar; - } - - private static Resource WARCArchiveRecordToResource(ArchiveRecord rec, - WARCReader reader) throws ResourceNotAvailableException, IOException { - - if (!(rec instanceof WARCRecord)) { - throw new ResourceNotAvailableException("Bad WARCRecord format"); - } - WarcResource wr = new WarcResource((WARCRecord) rec, reader); - wr.parseHeaders(); - return wr; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java 2008-06-25 01:30:18 UTC (rev 2321) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -1,98 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.util.Hashtable; -import java.util.Map; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.util.EncodingUtil; -import org.archive.io.RecoverableIOException; -import org.archive.io.arc.ARCConstants; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCRecord; -import org.archive.wayback.core.Resource; - -public class WarcResource extends Resource { - private WARCRecord rec = null; - private WARCReader reader = null; - private Map<String, String> headers = null; - private long length = 0; - private int status = 0; - private boolean parsedHeaders = false; - public WarcResource(WARCRecord rec, WARCReader reader) { - this.rec = rec; - this.reader = reader; - } - - /** - * @param bytes Array of bytes to examine for an EOL. - * @return Count of end-of-line characters or zero if none. 
- */ - private int getEolCharsCount(byte [] bytes) { - int count = 0; - if (bytes != null && bytes.length >=1 && - bytes[bytes.length - 1] == '\n') { - count++; - if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { - count++; - } - } - return count; - } - - public void parseHeaders() throws IOException { - if(parsedHeaders) { - return; - } - - byte [] statusBytes = HttpParser.readRawLine(rec); - int eolCharCount = getEolCharsCount(statusBytes); - if (eolCharCount <= 0) { - throw new RecoverableIOException("Failed to read http status where one " + - " was expected: " + new String(statusBytes)); - } - String statusLineStr = EncodingUtil.getString(statusBytes, 0, - statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); - if ((statusLineStr == null) || - !StatusLine.startsWithHTTP(statusLineStr)) { - throw new RecoverableIOException("Failed parse of http status line."); - } - StatusLine statusLine = new StatusLine(statusLineStr); - - this.status = statusLine.getStatusCode(); - - Header[] tmpHeaders = HttpParser.parseHeaders(rec, - ARCConstants.DEFAULT_ENCODING); - headers = new Hashtable<String,String>(); - for(Header header: tmpHeaders) { - headers.put(header.getName(), header.getValue()); - } - this.setInputStream(rec); - parsedHeaders = true; - } - - - @Override - public Map<String, String> getHttpHeaders() { - return headers; - } - - @Override - public long getRecordLength() { - // TODO Auto-generated method stub - return length; - } - - @Override - public int getStatusCode() { - return status; - } - - @Override - public void close() throws IOException { - rec.close(); - reader.close(); - } -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java (from rev 2082, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java) =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -0,0 +1,170 @@ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.IOException; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.core.Resource; + +public class ArcResource extends Resource { + /** + * Logger for this class + */ + private static final Logger LOGGER = Logger.getLogger(ArcResource.class + .getName()); + + /** + * String prefix for ARC file related metadata namespace of keys within + * metaData Properties bag. + */ + private static String ARC_META_PREFIX = "arcmeta."; + /** + * String prefix for HTTP Header related metadata namespace of keys within + * metaData Properties bag. + */ + private static String HTTP_HEADER_PREFIX = "httpheader."; + /** + * object for ARCRecord + */ + ARCRecord arcRecord = null; + /** + * object for ARCReader -- need to hold on to this in order to call close() + * to release filehandle after completing access to this record. 
optional + */ + ARCReader arcReader = null; + /** + * flag to indicate if the ARCRecord skipHTTPHeader() has been called + */ + boolean parsedHeader = false; + /** + * Expandable property bag for holding metadata associated with this + * resource + */ + Hashtable<String,String> metaData = new Hashtable<String,String>(); + + /** + * Constructor + * + * @param rec + * @param reader + */ + public ArcResource(final ARCRecord rec,final ARCReader reader) { + super(); + arcRecord = rec; + arcReader = reader; + setInputStream(rec); + } + + /** parse the headers on the underlying ARC record, and extract all + * @throws IOException + */ + public void parseHeaders () throws IOException { + if(!parsedHeader) { + arcRecord.skipHttpHeader(); + // copy all HTTP headers to metaData, prefixing with + // HTTP_HEADER_PREFIX + Header[] headers = arcRecord.getHttpHeaders(); + if (headers != null) { + for (int i = 0; i < headers.length; i++) { + String value = headers[i].getValue(); + String name = headers[i].getName(); + metaData.put(HTTP_HEADER_PREFIX + name,value); + } + } + + // copy all ARC record header fields to metaData, prefixing with + // ARC_META_PREFIX + @SuppressWarnings("unchecked") + Map<String,Object> headerMetaMap = arcRecord.getMetaData().getHeaderFields(); + Set<String> keys = headerMetaMap.keySet(); + Iterator<String> itr = keys.iterator(); + while(itr.hasNext()) { + String metaKey = itr.next(); + Object value = headerMetaMap.get(metaKey); + String metaValue = ""; + if(value != null) { + metaValue = value.toString(); + } + metaData.put(ARC_META_PREFIX + metaKey,metaValue); + } + + parsedHeader = true; + } + } + + /** + * @param prefix + * @return a Properties of all elements in metaData starting with 'prefix'. + * keys in the returned Properties have 'prefix' removed. 
+ */ + public Map<String,String> filterMeta(String prefix) { + HashMap<String,String> matching = new HashMap<String,String>(); + for (Enumeration<String> e = metaData.keys(); e.hasMoreElements();) { + String key = e.nextElement(); + if (key.startsWith(prefix)) { + String finalKey = key.substring(prefix.length()); + String value = metaData.get(key); + matching.put(finalKey, value); + } + } + return matching; + } + + /** + * @return a Properties containing all HTTP header fields for this record + */ + public Map<String,String> getHttpHeaders() { + return filterMeta(HTTP_HEADER_PREFIX); + } + + /** + * @return a Properties containing all ARC Meta fields for this record + */ + public Map<String,String> getARCMetadata() { + return filterMeta(ARC_META_PREFIX); + } + + /** + * (non-Javadoc) + * @see org.archive.io.arc.ARCRecord#getStatusCode() + * @return int HTTP status code returned with this document. + */ + public int getStatusCode() { + return arcRecord.getStatusCode(); + } + + /** + * @return the ARCRecord underlying this Resource. + */ + public ArchiveRecord getArcRecord() { + return arcRecord; + } + + /* (non-Javadoc) + * @see org.archive.io.arc.ARCRecord#close() + */ + public void close() throws IOException { + //LOGGER.info("About to close..("+arcReader+")"); + arcRecord.close(); + if(arcReader != null) { + arcReader.close(); + LOGGER.info("closed..("+arcReader+")"); + } + } + + /** + * @return byte length claimed in ARC record metadata line. 
+ */ + public long getRecordLength() { + return arcRecord.getMetaData().getLength(); + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java (from rev 2122, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -0,0 +1,105 @@ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.IOException; +import java.net.URL; + +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.arc.ARCRecord; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.ResourceNotAvailableException; + +/** + * Static factory class for constructing ARC/WARC Resources from + * File/URL + offset. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFactory { + + public static Resource getResource(File file, long offset) + throws IOException, ResourceNotAvailableException { + + Resource r = null; + String name = file.getName(); + if (name.endsWith(ArcWarcFilenameFilter.OPEN_SUFFIX)) { + name = name.substring(0, name.length() + - ArcWarcFilenameFilter.OPEN_SUFFIX.length()); + } + if (isArc(name)) { + + ARCReader reader = ARCReaderFactory.get(file,offset); + r = ARCArchiveRecordToResource(reader.get(),reader); + + } else if (isWarc(name)) { + + WARCReader reader = WARCReaderFactory.get(file,offset); + r = WARCArchiveRecordToResource(reader.get(),reader); + + } else { + throw new ResourceNotAvailableException("Unknown extension"); + } + + return r; + } + + public static Resource getResource(URL url, long offset) + throws IOException, ResourceNotAvailableException { + Resource r = null; + String name = url.getFile(); + if (isArc(name)) { + + ARCReader reader = ARCReaderFactory.get(url, offset); + r = ARCArchiveRecordToResource(reader.get(),reader); + + } else if (isWarc(name)) { + + WARCReader reader = WARCReaderFactory.get(url, offset); + r = WARCArchiveRecordToResource(reader.get(),reader); + + } else { + throw new ResourceNotAvailableException("Unknown extension"); + } + return r; + } + + private static boolean isArc(final String name) { + + return (name.endsWith(ArcWarcFilenameFilter.ARC_SUFFIX) + || name.endsWith(ArcWarcFilenameFilter.ARC_GZ_SUFFIX)); + } + + private static boolean isWarc(final String name) { + + return (name.endsWith(ArcWarcFilenameFilter.WARC_SUFFIX) + || name.endsWith(ArcWarcFilenameFilter.WARC_GZ_SUFFIX)); + } + + private static Resource ARCArchiveRecordToResource(ArchiveRecord rec, + ARCReader reader) throws ResourceNotAvailableException, IOException { + + if (!(rec instanceof ARCRecord)) { + throw new ResourceNotAvailableException("Bad ARCRecord format"); + } + ArcResource ar = new ArcResource((ARCRecord) 
rec, reader); + ar.parseHeaders(); + return ar; + } + + private static Resource WARCArchiveRecordToResource(ArchiveRecord rec, + WARCReader reader) throws ResourceNotAvailableException, IOException { + + if (!(rec instanceof WARCRecord)) { + throw new ResourceNotAvailableException("Bad WARCRecord format"); + } + WarcResource wr = new WarcResource((WARCRecord) rec, reader); + wr.parseHeaders(); + return wr; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java (from rev 2082, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -0,0 +1,98 @@ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.IOException; +import java.util.Hashtable; +import java.util.Map; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.RecoverableIOException; +import org.archive.io.arc.ARCConstants; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.core.Resource; + +public class WarcResource extends Resource { + private WARCRecord rec = null; + private WARCReader reader = null; + private Map<String, String> headers = null; + private long length = 0; + private int status = 0; + private boolean parsedHeaders = false; + public WarcResource(WARCRecord rec, WARCReader reader) { + this.rec = rec; + this.reader = reader; 
+ } + + /** + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + public void parseHeaders() throws IOException { + if(parsedHeaders) { + return; + } + + byte [] statusBytes = HttpParser.readRawLine(rec); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException("Failed to read http status where one " + + " was expected: " + new String(statusBytes)); + } + String statusLineStr = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if ((statusLineStr == null) || + !StatusLine.startsWithHTTP(statusLineStr)) { + throw new RecoverableIOException("Failed parse of http status line."); + } + StatusLine statusLine = new StatusLine(statusLineStr); + + this.status = statusLine.getStatusCode(); + + Header[] tmpHeaders = HttpParser.parseHeaders(rec, + ARCConstants.DEFAULT_ENCODING); + headers = new Hashtable<String,String>(); + for(Header header: tmpHeaders) { + headers.put(header.getName(), header.getValue()); + } + this.setInputStream(rec); + parsedHeaders = true; + } + + + @Override + public Map<String, String> getHttpHeaders() { + return headers; + } + + @Override + public long getRecordLength() { + // TODO Auto-generated method stub + return length; + } + + @Override + public int getStatusCode() { + return status; + } + + @Override + public void close() throws IOException { + rec.close(); + reader.close(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-25 01:30:12
|
Revision: 2321 http://archive-access.svn.sourceforge.net/archive-access/?rev=2321&view=rev Author: bradtofel Date: 2008-06-24 18:30:18 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moved indexing related code into indexer package Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2008-06-25 00:32:57 UTC (rev 2320) 
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,173 +0,0 @@ -/* ArcRecordToSearchResultAdapter - * - * $Id$ - * - * Created on 3:27:03 PM Jul 26, 2007. - * - * Copyright (C) 2007 Internet Archive. - * - * This file is part of wayback-core. - * - * wayback-core is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback-core is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback-core; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.URIException; -import org.archive.io.arc.ARCRecord; -import org.archive.io.arc.ARCRecordMetaData; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class ARCRecordToSearchResultAdapter -implements Adapter<ARCRecord,SearchResult>{ - - private static final Logger LOGGER = Logger.getLogger( - ARCRecordToSearchResultAdapter.class.getName()); - - private 
UrlCanonicalizer canonicalizer = null; - - public ARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } -// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) -// throws IOException, ParseException { - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public SearchResult adapt(ARCRecord rec) { - try { - return adaptInner(rec); - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } - - private SearchResult adaptInner(ARCRecord rec) throws IOException { - rec.close(); - ARCRecordMetaData meta = rec.getMetaData(); - - SearchResult result = new SearchResult(); - String arcName = meta.getArc(); - int index = arcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < arcName.length()) { - arcName = arcName.substring(index + 1); - } - result.put(WaybackConstants.RESULT_ARC_FILE, arcName); - result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta - .getOffset())); - - // initialize with default HTTP code... - result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); - - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); - result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); - result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); - - String uriStr = meta.getUrl(); - if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { - // skip filedesc record altogether... - return null; - } - if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { - // skip URL + HTTP header processing for dns records... 
- - String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX - .length()); - result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); - result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); - result.put(WaybackConstants.RESULT_URL, uriStr); - result.put(WaybackConstants.RESULT_URL_KEY, uriStr); - - } else { - - UURI uri = UURIFactory.getInstance(uriStr); - result.put(WaybackConstants.RESULT_URL, uriStr); - - String uriHost = uri.getHost(); - if (uriHost == null) { - LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); - } else { - result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); - - String statusCode = (meta.getStatusCode() == null) ? "-" : meta - .getStatusCode(); - result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); - - String redirectUrl = "-"; - Header[] headers = rec.getHttpHeaders(); - if (headers != null) { - - for (int i = 0; i < headers.length; i++) { - if (headers[i].getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = headers[i].getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? 
- // right now, we're ignoring "Content-Location" - try { - UURI uriRedirect = UURIFactory.getInstance(uri, - locationStr); - redirectUrl = uriRedirect.getEscapedURI(); - - } catch (URIException e) { - LOGGER.info("Bad Location: " + locationStr - + " for " + uriStr + " in " - + meta.getArc() + " Skipped"); - } - break; - } - } - } - result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); - - String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); - result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); - } - - } - return result; - } - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,175 +0,0 @@ -/* ArcIndexer - * - * $Id$ - * - * Created on 2:33:29 PM Oct 11, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.PrintWriter; -import java.io.IOException; -import java.util.Iterator; - -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; - -/** - * Transforms an ARC file into Iterator<SearchResult>. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class ArcIndexer { - - /** - * CDX Header line for these fields. not very configurable.. 
- */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - private UrlCanonicalizer canonicalizer = null; - - public ArcIndexer() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /** - * @param arc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(File arc) - throws IOException { - return iterator(ARCReaderFactory.get(arc)); - } - - /** - * @param pathOrUrl - * @return Iterator of SearchResults for input pathOrUrl - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(String pathOrUrl) - throws IOException { - return iterator(ARCReaderFactory.get(pathOrUrl)); - } - - /** - * @param arcReader - * @return Iterator of SearchResults for input ARCReader - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(ARCReader arcReader) - throws IOException { - arcReader.setParseHttpHeaders(true); - - Adapter<ArchiveRecord,ARCRecord> adapter1 = - new ArchiveRecordToARCRecordAdapter(); - - ARCRecordToSearchResultAdapter adapter2 = - new ARCRecordToSearchResultAdapter(); - adapter2.setCanonicalizer(canonicalizer); - - ArchiveReaderCloseableIterator itr1 = - new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator()); - - CloseableIterator<ARCRecord> itr2 = - new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1); - - return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2); - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("arc-indexer [-identity] ARCFILE"); - System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); - 
System.err.println("With -identity, perform no url canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - ArcIndexer indexer = new ArcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if(args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if(args.length == (idx + 1)) { - pw = new PrintWriter(args[idx]); - } else { - USAGE(); - } - Iterator<SearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while(lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - - private class ArchiveRecordToARCRecordAdapter - implements Adapter<ArchiveRecord,ARCRecord> { - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public ARCRecord adapt(ArchiveRecord o) { - ARCRecord rec = null; - if(o instanceof ARCRecord) { - rec = (ARCRecord) o; - } - return rec; - } - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,29 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.util.Iterator; - -import org.archive.io.ArchiveReader; -import org.archive.io.ArchiveRecord; -import 
org.archive.wayback.util.CloseableIterator; - -public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> { - private ArchiveReader reader = null; - private Iterator<ArchiveRecord> itr = null; - public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) { - this.reader = reader; - this.itr = itr; - } - public boolean hasNext() { - return itr.hasNext(); - } - public ArchiveRecord next() { - return itr.next(); - } - public void remove() { - itr.remove(); - } - public void close() throws IOException { - reader.close(); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,318 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.URIException; -import org.apache.commons.httpclient.util.EncodingUtil; -import org.archive.io.ArchiveRecordHeader; -import org.archive.io.RecoverableIOException; -import org.archive.io.arc.ARCConstants; -import org.archive.io.warc.WARCConstants; -import org.archive.io.warc.WARCRecord; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.SearchResult; -import 
org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; - -/** - * Adapts certain WARCRecords into SearchResults. DNS and response records are - * mostly straightforward, but SearchResult objects generated from revisit - * records contain lots of "placeholder" fields, which are expected to be - * understood by later processes traversing a stream of SearchResult objects. - * - * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class WARCRecordToSearchResultAdapter -implements Adapter<WARCRecord,SearchResult>{ - - private final static String DEFAULT_VALUE = "-"; - private final static String SEARCH_FIELDS[] = { - WaybackConstants.RESULT_URL, - WaybackConstants.RESULT_URL_KEY, - WaybackConstants.RESULT_ORIG_HOST, - WaybackConstants.RESULT_CAPTURE_DATE, - WaybackConstants.RESULT_MD5_DIGEST, - WaybackConstants.RESULT_MIME_TYPE, - WaybackConstants.RESULT_HTTP_CODE, - WaybackConstants.RESULT_REDIRECT_URL, - WaybackConstants.RESULT_ARC_FILE, - WaybackConstants.RESULT_OFFSET, - }; - - private static final Logger LOGGER = Logger.getLogger( - WARCRecordToSearchResultAdapter.class.getName()); - - private UrlCanonicalizer canonicalizer = null; - - public WARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public SearchResult adapt(WARCRecord rec) { - try { - return adaptInner(rec); - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } - - /* - * Transform input date to 14-digit timestamp: - * 2007-08-29T18:00:26Z => 20070829180026 - */ - private static String transformDate(final String input) { - - StringBuilder output = new StringBuilder(14); - - output.append(input.substring(0,4)); - output.append(input.substring(5,7)); - output.append(input.substring(8,10)); - 
output.append(input.substring(11,13)); - output.append(input.substring(14,16)); - output.append(input.substring(17,19)); - - return output.toString(); - } - - private static String transformHTTPMime(final String input) { - int semiIdx = input.indexOf(";"); - if(semiIdx > 0) { - return input.substring(0,semiIdx).trim(); - } - return input.trim(); - } - - private String transformWarcFilename(String readerIdentifier) { - String warcName = readerIdentifier; - int index = warcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < warcName.length()) { - warcName = warcName.substring(index + 1); - } - return warcName; - } - - private String transformDigest(final Object o) { - if(o == null) { - return DEFAULT_VALUE; - } - String orig = o.toString(); - if(orig.startsWith("sha1:")) { - return orig.substring(5); - } - return orig; - } - - private SearchResult getBlankSearchResult() { - SearchResult result = new SearchResult(); - for(String field : SEARCH_FIELDS) { - result.put(field, DEFAULT_VALUE); - } - return result; - } - - private UURI addUrlDataToSearchResult(SearchResult result, String urlStr) - throws IOException { - - result.put(WaybackConstants.RESULT_URL, urlStr); - result.put(WaybackConstants.RESULT_URL_KEY, urlStr); - - - UURI uri = UURIFactory.getInstance(urlStr); - String uriHost = uri.getHost(); - if (uriHost == null) { - - LOGGER.info("No host in " + urlStr); - - } else { - - result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); - } - - String urlKey = canonicalizer.urlStringToKey(urlStr); - result.put(WaybackConstants.RESULT_URL_KEY, urlKey); - - return uri; - } - - private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_ARC_FILE, - transformWarcFilename(header.getReaderIdentifier())); - result.put(WaybackConstants.RESULT_OFFSET, - 
String.valueOf(header.getOffset())); - - String uriStr = header.getUrl(); - - String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX - .length()); - result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype()); - - result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); - result.put(WaybackConstants.RESULT_URL, uriStr); - result.put(WaybackConstants.RESULT_URL_KEY, uriStr); - - rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); - - return result; - } - - private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_MD5_DIGEST, - transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - addUrlDataToSearchResult(result,header.getUrl()); - - return result; - } - - /** - * borrowed(copied) from org.archive.io.arc.ARCRecord... - * - * @param bytes Array of bytes to examine for an EOL. - * @return Count of end-of-line characters or zero if none. 
- */ - private int getEolCharsCount(byte [] bytes) { - int count = 0; - if (bytes != null && bytes.length >=1 && - bytes[bytes.length - 1] == '\n') { - count++; - if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { - count++; - } - } - return count; - } - - private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_ARC_FILE, - transformWarcFilename(header.getReaderIdentifier())); - result.put(WaybackConstants.RESULT_OFFSET, - String.valueOf(header.getOffset())); - - String origUrl = header.getUrl(); - UURI uri = addUrlDataToSearchResult(result,origUrl); - - // need to parse the documents HTTP message and headers here: WARCReader - // does not implement this... yet.. - - byte [] statusBytes = HttpParser.readRawLine(rec); - int eolCharCount = getEolCharsCount(statusBytes); - if (eolCharCount <= 0) { - throw new RecoverableIOException("Failed to read http status where one " + - " was expected: " + new String(statusBytes)); - } - String statusLine = EncodingUtil.getString(statusBytes, 0, - statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); - if ((statusLine == null) || - !StatusLine.startsWithHTTP(statusLine)) { - throw new RecoverableIOException("Failed parse of http status line."); - } - StatusLine status = new StatusLine(statusLine); - result.put(WaybackConstants.RESULT_HTTP_CODE, - String.valueOf(status.getStatusCode())); - - Header[] headers = HttpParser.parseHeaders(rec, - ARCConstants.DEFAULT_ENCODING); - - rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, - transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - if (headers != null) { - - for (Header httpHeader : headers) { - if (httpHeader.getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = 
httpHeader.getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - try { - UURI uriRedirect = UURIFactory.getInstance(uri, - locationStr); - result.put(WaybackConstants.RESULT_REDIRECT_URL, - uriRedirect.getEscapedURI()); - } catch (URIException e) { - LOGGER.info("Bad Location: " + locationStr - + " for " + origUrl + " in " - + header.getReaderIdentifier() + " Skipped"); - } - } else if(httpHeader.getName().toLowerCase().equals("content-type")) { - result.put(WaybackConstants.RESULT_MIME_TYPE, - transformHTTPMime(httpHeader.getValue())); - } - } - } - return result; - } - - private SearchResult adaptInner(WARCRecord rec) throws IOException { - - SearchResult result = null; - ArchiveRecordHeader header = rec.getHeader(); - String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); - if(type.equals(WARCConstants.RESPONSE)) { - String mime = header.getMimetype(); - if(mime.equals("text/dns")) { - result = adaptDNS(header,rec); - } else { - result = adaptResponse(header,rec); - } - } else if(type.equals(WARCConstants.REVISIT)) { - result = adaptRevisit(header,rec); - } - - return result; - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-06-25 
00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,140 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.Iterator; - -import org.archive.io.ArchiveRecord; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCReaderFactory; -import org.archive.io.warc.WARCRecord; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; - -public class WarcIndexer { - - /** - * CDX Header line for these fields. not very configurable.. 
- */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - - private UrlCanonicalizer canonicalizer = null; - public WarcIndexer() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /** - * @param warc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(File warc) - throws IOException { - return iterator(WARCReaderFactory.get(warc)); - } - /** - * @param pathOrUrl - * @return Iterator of SearchResults for input pathOrUrl - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(String pathOrUrl) - throws IOException { - return iterator(WARCReaderFactory.get(pathOrUrl)); - } - /** - * @param arc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(WARCReader reader) - throws IOException { - - Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); - - WARCRecordToSearchResultAdapter adapter2 = - new WARCRecordToSearchResultAdapter(); - adapter2.setCanonicalizer(canonicalizer); - - ArchiveReaderCloseableIterator itr1 = - new ArchiveReaderCloseableIterator(reader,reader.iterator()); - - CloseableIterator<WARCRecord> itr2 = - new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1); - - return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("warc-indexer [-identity] WARCFILE"); - System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); - System.err.println("With -identity, perform no url 
canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - WarcIndexer indexer = new WarcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if (args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if (args.length == (idx+1)) { - pw = new PrintWriter(args[1]); - } else { - USAGE(); - } - Iterator<SearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while (lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - } - } - - private class ArchiveRecordToWARCRecordAdapter implements - Adapter<ArchiveRecord, WARCRecord> { - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public WARCRecord adapt(ArchiveRecord o) { - WARCRecord rec = null; - if (o instanceof WARCRecord) { - rec = (WARCRecord) o; - } - return rec; - } - } -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java (from rev 2138, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,173 @@ +/* ArcRecordToSearchResultAdapter + * + * $Id$ + * + * Created on 3:27:03 PM Jul 26, 2007. 
+ * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback-core. + * + * wayback-core is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback-core is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-core; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.URIException; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ARCRecordToSearchResultAdapter +implements Adapter<ARCRecord,SearchResult>{ + + private static final Logger LOGGER = Logger.getLogger( + ARCRecordToSearchResultAdapter.class.getName()); + + private UrlCanonicalizer canonicalizer = null; + + public ARCRecordToSearchResultAdapter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } +// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) +// throws IOException, ParseException { + /* (non-Javadoc) + * 
@see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public SearchResult adapt(ARCRecord rec) { + try { + return adaptInner(rec); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + private SearchResult adaptInner(ARCRecord rec) throws IOException { + rec.close(); + ARCRecordMetaData meta = rec.getMetaData(); + + SearchResult result = new SearchResult(); + String arcName = meta.getArc(); + int index = arcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < arcName.length()) { + arcName = arcName.substring(index + 1); + } + result.put(WaybackConstants.RESULT_ARC_FILE, arcName); + result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta + .getOffset())); + + // initialize with default HTTP code... + result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); + + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); + result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); + + String uriStr = meta.getUrl(); + if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { + // skip filedesc record altogether... + return null; + } + if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { + // skip URL + HTTP header processing for dns records... + + String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX + .length()); + result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); + result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); + result.put(WaybackConstants.RESULT_URL, uriStr); + result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + + } else { + + UURI uri = UURIFactory.getInstance(uriStr); + result.put(WaybackConstants.RESULT_URL, uriStr); + + String uriHost = uri.getHost(); + if (uriHost == null) { + LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); + } else { + result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); + + String statusCode = (meta.getStatusCode() == null) ? 
"-" : meta + .getStatusCode(); + result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); + + String redirectUrl = "-"; + Header[] headers = rec.getHttpHeaders(); + if (headers != null) { + + for (int i = 0; i < headers.length; i++) { + if (headers[i].getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = headers[i].getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + try { + UURI uriRedirect = UURIFactory.getInstance(uri, + locationStr); + redirectUrl = uriRedirect.getEscapedURI(); + + } catch (URIException e) { + LOGGER.info("Bad Location: " + locationStr + + " for " + uriStr + " in " + + meta.getArc() + " Skipped"); + } + break; + } + } + } + result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); + + String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); + result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); + } + + } + return result; + } + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java (from rev 2280, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,175 @@ +/* ArcIndexer + * + * $Id$ + * + * Created on 2:33:29 PM Oct 11, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.PrintWriter; +import java.io.IOException; +import java.util.Iterator; + +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; + +/** + * Transforms an ARC file into Iterator<SearchResult>. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ArcIndexer { + + /** + * CDX Header line for these fields. 
not very configurable.. + */ + public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; + private UrlCanonicalizer canonicalizer = null; + + public ArcIndexer() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + /** + * @param arc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(File arc) + throws IOException { + return iterator(ARCReaderFactory.get(arc)); + } + + /** + * @param pathOrUrl + * @return Iterator of SearchResults for input pathOrUrl + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(String pathOrUrl) + throws IOException { + return iterator(ARCReaderFactory.get(pathOrUrl)); + } + + /** + * @param arcReader + * @return Iterator of SearchResults for input ARCReader + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(ARCReader arcReader) + throws IOException { + arcReader.setParseHttpHeaders(true); + + Adapter<ArchiveRecord,ARCRecord> adapter1 = + new ArchiveRecordToARCRecordAdapter(); + + ARCRecordToSearchResultAdapter adapter2 = + new ARCRecordToSearchResultAdapter(); + adapter2.setCanonicalizer(canonicalizer); + + ArchiveReaderCloseableIterator itr1 = + new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator()); + + CloseableIterator<ARCRecord> itr2 = + new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1); + + return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2); + } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("arc-indexer [-identity] ARCFILE"); + System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); 
+ System.err.println("With -identity, perform no url canonicalization."); + System.exit(1); + } + + /** + * @param args + */ + public static void main(String[] args) { + ArcIndexer indexer = new ArcIndexer(); + int idx = 0; + if(args[0] != null && args[0].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + idx++; + } + File arc = new File(args[idx]); + idx++; + PrintWriter pw = null; + try { + if(args.length == idx) { + // dump to STDOUT: + pw = new PrintWriter(System.out); + } else if(args.length == (idx + 1)) { + pw = new PrintWriter(args[idx]); + } else { + USAGE(); + } + Iterator<SearchResult> res = indexer.iterator(arc); + Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); + while(lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (Exception e) { + e.printStackTrace(); + System.exit(1); + } + } + + private class ArchiveRecordToARCRecordAdapter + implements Adapter<ArchiveRecord,ARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public ARCRecord adapt(ArchiveRecord o) { + ARCRecord rec = null; + if(o instanceof ARCRecord) { + rec = (ARCRecord) o; + } + return rec; + } + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java (from rev 2209, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,29 @@ +package 
org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; +import java.util.Iterator; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; +import org.archive.wayback.util.CloseableIterator; + +public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> { + private ArchiveReader reader = null; + private Iterator<ArchiveRecord> itr = null; + public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) { + this.reader = reader; + this.itr = itr; + } + public boolean hasNext() { + return itr.hasNext(); + } + public ArchiveRecord next() { + return itr.next(); + } + public void remove() { + itr.remove(); + } + public void close() throws IOException { + reader.close(); + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java (from rev 2138, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,318 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.URIException; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; 
+import org.archive.io.arc.ARCConstants; +import org.archive.io.warc.WARCConstants; +import org.archive.io.warc.WARCRecord; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +/** + * Adapts certain WARCRecords into SearchResults. DNS and response records are + * mostly straightforward, but SearchResult objects generated from revisit + * records contain lots of "placeholder" fields, which are expected to be + * understood by later processes traversing a stream of SearchResult objects. + * + * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class WARCRecordToSearchResultAdapter +implements Adapter<WARCRecord,SearchResult>{ + + private final static String DEFAULT_VALUE = "-"; + private final static String SEARCH_FIELDS[] = { + WaybackConstants.RESULT_URL, + WaybackConstants.RESULT_URL_KEY, + WaybackConstants.RESULT_ORIG_HOST, + WaybackConstants.RESULT_CAPTURE_DATE, + WaybackConstants.RESULT_MD5_DIGEST, + WaybackConstants.RESULT_MIME_TYPE, + WaybackConstants.RESULT_HTTP_CODE, + WaybackConstants.RESULT_REDIRECT_URL, + WaybackConstants.RESULT_ARC_FILE, + WaybackConstants.RESULT_OFFSET, + }; + + private static final Logger LOGGER = Logger.getLogger( + WARCRecordToSearchResultAdapter.class.getName()); + + private UrlCanonicalizer canonicalizer = null; + + public WARCRecordToSearchResultAdapter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public SearchResult adapt(WARCRecord rec) { + try { + return adaptInner(rec); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + /* + * Transform 
input date to 14-digit timestamp: + * 2007-08-29T18:00:26Z => 20070829180026 + */ + private static String transformDate(final String input) { + + StringBuilder output = new StringBuilder(14); + + output.append(input.substring(0,4)); + output.append(input.substring(5,7)); + output.append(input.substring(8,10)); + output.append(input.substring(11,13)); + output.append(input.substring(14,16)); + output.append(input.substring(17,19)); + + return output.toString(); + } + + private static String transformHTTPMime(final String input) { + int semiIdx = input.indexOf(";"); + if(semiIdx > 0) { + return input.substring(0,semiIdx).trim(); + } + return input.trim(); + } + + private String transformWarcFilename(String readerIdentifier) { + String warcName = readerIdentifier; + int index = warcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < warcName.length()) { + warcName = warcName.substring(index + 1); + } + return warcName; + } + + private String transformDigest(final Object o) { + if(o == null) { + return DEFAULT_VALUE; + } + String orig = o.toString(); + if(orig.startsWith("sha1:")) { + return orig.substring(5); + } + return orig; + } + + private SearchResult getBlankSearchResult() { + SearchResult result = new SearchResult(); + for(String field : SEARCH_FIELDS) { + result.put(field, DEFAULT_VALUE); + } + return result; + } + + private UURI addUrlDataToSearchResult(SearchResult result, String urlStr) + throws IOException { + + result.put(WaybackConstants.RESULT_URL, urlStr); + result.put(WaybackConstants.RESULT_URL_KEY, urlStr); + + + UURI uri = UURIFactory.getInstance(urlStr); + String uriHost = uri.getHost(); + if (uriHost == null) { + + LOGGER.info("No host in " + urlStr); + + } else { + + result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); + } + + String urlKey = canonicalizer.urlStringToKey(urlStr); + result.put(WaybackConstants.RESULT_URL_KEY, urlKey); + + return uri; + } + + private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord 
rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_ARC_FILE, + transformWarcFilename(header.getReaderIdentifier())); + result.put(WaybackConstants.RESULT_OFFSET, + String.valueOf(header.getOffset())); + + String uriStr = header.getUrl(); + + String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX + .length()); + result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype()); + + result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); + result.put(WaybackConstants.RESULT_URL, uriStr); + result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + + rec.close(); + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + + return result; + } + + private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_MD5_DIGEST, + transformDigest(header.getHeaderValue( + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + + addUrlDataToSearchResult(result,header.getUrl()); + + return result; + } + + /** + * borrowed(copied) from org.archive.io.arc.ARCRecord... + * + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. 
+ */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_ARC_FILE, + transformWarcFilename(header.getReaderIdentifier())); + result.put(WaybackConstants.RESULT_OFFSET, + String.valueOf(header.getOffset())); + + String origUrl = header.getUrl(); + UURI uri = addUrlDataToSearchResult(result,origUrl); + + // need to parse the documents HTTP message and headers here: WARCReader + // does not implement this... yet.. + + byte [] statusBytes = HttpParser.readRawLine(rec); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException("Failed to read http status where one " + + " was expected: " + new String(statusBytes)); + } + String statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if ((statusLine == null) || + !StatusLine.startsWithHTTP(statusLine)) { + throw new RecoverableIOException("Failed parse of http status line."); + } + StatusLine status = new StatusLine(statusLine); + result.put(WaybackConstants.RESULT_HTTP_CODE, + String.valueOf(status.getStatusCode())); + + Header[] headers = HttpParser.parseHeaders(rec, + ARCConstants.DEFAULT_ENCODING); + + rec.close(); + result.put(WaybackConstants.RESULT_MD5_DIGEST, + transformDigest(header.getHeaderValue( + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + + if (headers != null) { + + for (Header httpHeader : headers) { + if (httpHeader.getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = 
httpHeader.getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + try { + UURI uriRedirect = UURIFactory.getInstance(uri, + locationStr); + result.put(WaybackConstants.RESULT_REDIRECT_URL, + uriRedirect.getEscapedURI()); + } catch (URIException e) { + LOGGER.info("Bad Location: " + locationStr + + " for " + origUrl + " in " + + header.getReaderIdentifier() + " Skipped"); + } + } else if(httpHeader.getName().toLowerCase().equals("content-type")) { + result.put(WaybackConstants.RESULT_MIME_TYPE, + transformHTTPMime(httpHeader.getValue())); + } + } + } + return result; + } + + private SearchResult adaptInner(WARCRecord rec) throws IOException { + + SearchResult result = null; + ArchiveRecordHeader header = rec.getHeader(); + String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); + if(type.equals(WARCConstants.RESPONSE)) { + String mime = header.getMimetype(); + if(mime.equals("text/dns")) { + result = adaptDNS(header,rec); + } else { + result = adaptResponse(header,rec); + } + } else if(type.equals(WARCConstants.REVISIT)) { + result = adaptRevisit(header,rec); + } + + return result; + } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java (from rev 2280, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java) 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,140 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Iterator; + +import org.archive.io.ArchiveRecord; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; + +public class WarcIndexer { + + /** + * CDX Header line for these fields. not very configurable.. 
+ */ + public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; + + private UrlCanonicalizer canonicalizer = null; + public WarcIndexer() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + /** + * @param warc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(File warc) + throws IOException { + return iterator(WARCReaderFactory.get(warc)); + } + /** + * @param pathOrUrl + * @return Iterator of SearchResults for input pathOrUrl + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(String pathOrUrl) + throws IOException { + return iterator(WARCReaderFactory.get(pathOrUrl)); + } + /** + * @param arc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(WARCReader reader) + throws IOException { + + Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); + + WARCRecordToSearchResultAdapter adapter2 = + new WARCRecordToSearchResultAdapter(); + adapter2.setCanonicalizer(canonicalizer); + + ArchiveReaderCloseableIterator itr1 = + new ArchiveReaderCloseableIterator(reader,reader.iterator()); + + CloseableIterator<WARCRecord> itr2 = + new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1); + + return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); + } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("warc-indexer [-identity] WARCFILE"); + System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); + System.err.println("With -identity, perform no url 
canonicalization."); + System.exit(1); + } + + /** + * @param args + */ + public static void main(String[] args) { + WarcIndexer indexer = new WarcIndexer(); + int idx = 0; + if(args[0] != null && args[0].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + idx++; + } + File arc = new File(args[idx]); + idx++; + PrintWriter pw = null; + try { + if (args.length == idx) { + // dump to STDOUT: + pw = new PrintWriter(System.out); + } else if (args.length == (idx+1)) { + pw = new PrintWriter(args[1]); + } else { + USAGE(); + } + Iterator<SearchResult> res = indexer.iterator(arc); + Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); + while (lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private class ArchiveRecordToWARCRecordAdapter implements + Adapter<ArchiveRecord, WARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public WARCRecord adapt(ArchiveRecord o) { + WARCRecord rec = null; + if (o instanceof WARCRecord) { + rec = (WARCRecord) o; + } + return rec; + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2320 http://archive-access.svn.sourceforge.net/archive-access/?rev=2320&view=rev Author: bradtofel Date: 2008-06-24 17:32:57 -0700 (Tue, 24 Jun 2008) Log Message: ----------- INITIAL REV: simple but powerful ResourceStore implementation that leverages an underlying ResourceFileLocationDB to know where ARC/WARC files are located, either locally, or over a network, or both. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-06-25 00:32:57 UTC (rev 2320) @@ -0,0 +1,117 @@ +/* LocalResourceFileResourceStore + * + * $Id$ + * + * Created on 6:17:54 PM May 29, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.net.URL; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; + +/** + * Simple ResourceStore implementation, which uses a ResourceFileLocationDB to + * locate ARC/WARC files, that can be remote(via http://) or local paths. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LocalResourceFileResourceStore implements ResourceStore { + + private ResourceFileLocationDB db = null; + + /* (non-Javadoc) + * @see org.archive.wayback.ResourceStore#retrieveResource(org.archive.wayback.core.SearchResult) + */ + public Resource retrieveResource(SearchResult result) throws IOException, + ResourceNotAvailableException { + // extract ARC filename + String fileName = result.get(WaybackConstants.RESULT_ARC_FILE); + if(fileName == null || fileName.length() < 1) { + throw new IOException("No ARC/WARC name in search result..."); + } + + // extract offset + convert to long + final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); + if(offsetString == null || offsetString.length() < 1) { + throw new IOException("No ARC/WARC offset in search result..."); + } + String urls[] = db.nameToUrls(fileName); + if(urls == null || urls.length == 0) { + throw new ResourceNotAvailableException("Unable to locate(" + + fileName + ")"); + } + + final long offset = Long.parseLong(offsetString); + + Resource r = null; + // TODO: attempt multiple threads? 
+ for(String url : urls) { + + try { + + if(url.startsWith("http://")) { + r = ResourceFactory.getResource(new URL(url), offset); + } else { + // assume local path: + r = ResourceFactory.getResource(new File(url), offset); + } + // TODO: attempt to grab the first few KB? The underlying + // InputStreams support mark(), so we could reset() after. + // wait for now, currently this will parse HTTP headers, + // which means we've already read some + + } catch (IOException e) { + e.printStackTrace(); + } + if(r != null) { + break; + } + } + if(r == null) { + throw new ResourceNotAvailableException("Unable to retrieve"); + } + return r; + } + + /* (non-Javadoc) + * @see org.archive.wayback.ResourceStore#shutdown() + */ + public void shutdown() throws IOException { + // NOOP + } + + public ResourceFileLocationDB getDb() { + return db; + } + + public void setDb(ResourceFileLocationDB db) { + this.db = db; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2319 http://archive-access.svn.sourceforge.net/archive-access/?rev=2319&view=rev Author: bradtofel Date: 2008-06-24 17:21:00 -0700 (Tue, 24 Jun 2008) Log Message: ----------- INTERFACE CHANGE: now uses addSearchResults() method on super class. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java 2008-06-25 00:20:01 UTC (rev 2318) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java 2008-06-25 00:21:00 UTC (rev 2319) @@ -24,13 +24,11 @@ */ package org.archive.wayback.liveweb; +import java.io.IOException; import java.util.ArrayList; import org.archive.wayback.core.SearchResult; import org.archive.wayback.resourceindex.LocalResourceIndex; -import org.archive.wayback.resourceindex.bdb.BDBIndex; -import org.archive.wayback.resourceindex.bdb.SearchResultToBDBRecordAdapter; -import org.archive.wayback.util.AdaptedIterator; /** * Alternate LocalResourceIndex that supports an alternate BDB configuration, @@ -44,13 +42,15 @@ /** * Add a single SearchResult to the index. 
* @param result + * @throws IOException + * @throws UnsupportedOperationException */ @SuppressWarnings("unchecked") - public void addSearchResult(SearchResult result) { + public void addSearchResult(SearchResult result) + throws UnsupportedOperationException, IOException { + ArrayList<SearchResult> l = new ArrayList<SearchResult>(); l.add(result); - BDBIndex bdbSource = (BDBIndex) source; - bdbSource.insertRecords(new AdaptedIterator(l.iterator(), - new SearchResultToBDBRecordAdapter())); + addSearchResults(l.iterator()); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2318 http://archive-access.svn.sourceforge.net/archive-access/?rev=2318&view=rev Author: bradtofel Date: 2008-06-24 17:20:01 -0700 (Tue, 24 Jun 2008) Log Message: ----------- INITIAL REV: interface extension to SearchResultSource that allows updates. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/UpdatableSearchResultSource.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/UpdatableSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/UpdatableSearchResultSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/UpdatableSearchResultSource.java 2008-06-25 00:20:01 UTC (rev 2318) @@ -0,0 +1,42 @@ +/* UpdatableSearchResultSource + * + * $Id$ + * + * Created on 3:34:37 PM Jun 24, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex; + +import java.io.IOException; +import java.util.Iterator; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.SearchResult; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public interface UpdatableSearchResultSource extends SearchResultSource { + public void addSearchResults(Iterator<SearchResult> itr, + UrlCanonicalizer canonicalizer) throws IOException; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2317 http://archive-access.svn.sourceforge.net/archive-access/?rev=2317&view=rev Author: bradtofel Date: 2008-06-24 17:19:00 -0700 (Tue, 24 Jun 2008) Log Message: ----------- FEATURE: now this class is aware of underlying UpdatableSearchResultSource implementations, and allows addSearchResults() operations to pass through to these implementations if they support it. Also added a method for users to interrogate if this class supports updates. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-06-25 00:16:59 UTC (rev 2316) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-06-25 00:19:00 UTC (rev 2317) @@ -445,6 +445,22 @@ return results; } + public void addSearchResults(Iterator<SearchResult> itr) throws IOException, + UnsupportedOperationException { + if(source instanceof UpdatableSearchResultSource) { + UpdatableSearchResultSource updatable = + (UpdatableSearchResultSource) source; + updatable.addSearchResults(itr,canonicalizer); + } else { + throw new UnsupportedOperationException("Underlying " + + "SearchResultSource is not Updatable."); + } + } + + public boolean isUpdatable() { + return (source instanceof UpdatableSearchResultSource); + } + /** * @param maxRecords the maxRecords to set */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2316 http://archive-access.svn.sourceforge.net/archive-access/?rev=2316&view=rev Author: bradtofel Date: 2008-06-24 17:16:59 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: all update functionality was moved out into LocalResourceIndexUpdater class, so there are no longer dependencies from this class to the updater. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java 2008-06-25 00:15:28 UTC (rev 2315) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java 2008-06-25 00:16:59 UTC (rev 2316) @@ -39,7 +39,6 @@ import org.archive.wayback.resourceindex.UpdatableSearchResultSource; import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; -import org.archive.wayback.resourceindex.updater.BDBIndexUpdater; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; @@ -59,17 +58,12 @@ private String bdbPath = null; private String bdbName = null; - private BDBIndexUpdater updater = null; /** * @throws DatabaseException * @throws ConfigurationException */ public void init() throws DatabaseException, ConfigurationException { initializeDB(bdbPath,bdbName); - if(updater != null) { - updater.setIndex(this); - updater.startup(); - } } private CloseableIterator<SearchResult> adaptIterator( @@ -224,14 +218,6 @@ Iterator<SearchResult> itrSR = new AdaptedIterator<String,SearchResult>(itrS,adapterStoSR); -// 
Adapter<SearchResult,BDBRecord> adapterSRtoBDB = -// new SearchResultToBDBRecordAdapter(); -// -// Iterator<BDBRecord> itrBDB = -// new AdaptedIterator<SearchResult,BDBRecord>(itrSR, -// adapterSRtoBDB); -// -// index.insertRecords(itrBDB); try { index.addSearchResults(itrSR, canonicalizer); } catch (IOException e) { @@ -242,24 +228,28 @@ USAGE(); } } + /** * @return the bdbPath */ public String getBdbPath() { return bdbPath; } + /** * @param bdbPath the bdbPath to set */ public void setBdbPath(String bdbPath) { this.bdbPath = bdbPath; } + /** * @return the bdbName */ public String getBdbName() { return bdbName; } + /** * @param bdbName the bdbName to set */ @@ -267,20 +257,6 @@ this.bdbName = bdbName; } - /** - * @return the updater - */ - public BDBIndexUpdater getUpdater() { - return updater; - } - - /** - * @param updater the updater to set - */ - public void setUpdater(BDBIndexUpdater updater) { - this.updater = updater; - } - public void shutdown() throws IOException { try { shutdownDB(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2315 http://archive-access.svn.sourceforge.net/archive-access/?rev=2315&view=rev Author: bradtofel Date: 2008-06-24 17:15:28 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: Now this class is responsible for UrlCanonicalization, and assumes input records are Identity CDX records. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java 2008-06-25 00:14:19 UTC (rev 2314) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java 2008-06-25 00:15:28 UTC (rev 2315) @@ -24,6 +24,9 @@ */ package org.archive.wayback.resourceindex.bdb; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.URIException; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; import org.archive.wayback.bdb.BDBRecord; @@ -41,7 +44,9 @@ */ public class SearchResultToBDBRecordAdapter implements Adapter<SearchResult,BDBRecord> { - + private static final Logger LOGGER = + Logger.getLogger(SearchResultToBDBRecordAdapter.class.getName()); + DatabaseEntry key = new DatabaseEntry(); DatabaseEntry value = new DatabaseEntry(); @@ -65,8 +70,16 @@ StringBuilder keySB = new StringBuilder(40); StringBuilder valSB = new StringBuilder(100); - - keySB.append(result.get(WaybackConstants.RESULT_URL_KEY)); + String origUrl = result.getAbsoluteUrl(); + String urlKey; + try { + urlKey = canonicalizer.urlStringToKey(origUrl); + } catch (URIException e) { +// e.printStackTrace(); + 
LOGGER.warning("FAILED canonicalize(" + origUrl +")"); + urlKey = origUrl; + } + keySB.append(urlKey); keySB.append(DELIMITER); keySB.append(result.get(WaybackConstants.RESULT_CAPTURE_DATE)); keySB.append(DELIMITER); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2314 http://archive-access.svn.sourceforge.net/archive-access/?rev=2314&view=rev Author: bradtofel Date: 2008-06-24 17:14:19 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: Now some LocalResourceIndex implementations are UpdatableLocalResourceIndex. This class now operates on those. Refactored lots of directory creation code into DirMaker. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java 2008-06-25 00:12:36 UTC (rev 2313) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java 2008-06-25 00:14:19 UTC (rev 2314) @@ -26,17 +26,14 @@ import java.io.File; import java.io.IOException; -import java.util.Iterator; import java.util.logging.Logger; -import org.archive.wayback.bdb.BDBRecord; import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ConfigurationException; -import org.archive.wayback.resourceindex.bdb.BDBIndex; -import org.archive.wayback.resourceindex.bdb.SearchResultToBDBRecordAdapter; +import org.archive.wayback.resourceindex.LocalResourceIndex; import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; -//import org.archive.wayback.resourcestore.ArcIndexer; import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.DirMaker; import org.archive.wayback.util.flatfile.FlatFile; /** @@ -60,7 +57,7 @@ private final static int DEFAULT_RUN_INTERVAL_MS = 10000; - private BDBIndex index = null; + private LocalResourceIndex 
index = null; private File incoming = null; @@ -77,78 +74,25 @@ private Thread updateThread = null; /** - * Default constructor - */ - public LocalResourceIndexUpdater() { - - } - /** - * @param index - * @param incoming - */ - public LocalResourceIndexUpdater(BDBIndex index, File incoming) { - this.index = index; - this.incoming = incoming; - } - - /** * start the background index merging thread * @throws ConfigurationException */ public void init() throws ConfigurationException { if(index == null) { - throw new ConfigurationException("No index target on bdb updater"); + throw new ConfigurationException("No index target"); } + if(!index.isUpdatable()) { + throw new ConfigurationException("ResourceIndex is not updatable"); + } if(incoming == null) { - throw new ConfigurationException("No incoming on bdb updater"); + throw new ConfigurationException("No incoming"); } - startUpdateThread(); - } - - /** Ensure the argument directory exists - * @param dir - * @throws IOException - */ - private void ensureDir(File dir) throws IOException { - if (!dir.isDirectory() && !dir.mkdirs()) { - throw new IOException("FAILED to create " + dir.getAbsolutePath()); + if(runInterval > 0) { + updateThread = new UpdateThread(this,runInterval); + updateThread.start(); } } - - /** - * start a background thread that merges new CDX files in incoming into - * the BDBIndex. - * - * @throws ConfigurationException - */ - public void startup() throws ConfigurationException { - try { - ensureDir(incoming); - if(merged != null) ensureDir(merged); - if(failed != null) ensureDir(failed); - } catch (IOException e) { - e.printStackTrace(); - throw new ConfigurationException(e.getMessage()); - } - - if (updateThread == null) { - startUpdateThread(); - } - } - - /** - * start the BDBIndexUpdaterThread thread, which will scan for new cdx files - * in the incoming directory, and add them to the BDBIndex. 
- */ - private synchronized void startUpdateThread() { - if (updateThread != null) { - return; - } - updateThread = new BDBIndexUpdaterThread(this,runInterval); - updateThread.start(); - } - private boolean mergeFile(File cdxFile) { boolean added = false; try { @@ -157,10 +101,7 @@ new AdaptedIterator<String,SearchResult>( ffile.getSequentialIterator(), new CDXLineToSearchResultAdapter()); - Iterator<BDBRecord> it = new AdaptedIterator<SearchResult,BDBRecord> - (searchResultItr,new SearchResultToBDBRecordAdapter()); - - index.insertRecords(it); + index.addSearchResults(searchResultItr); added = true; } catch (IOException e) { e.printStackTrace(); @@ -182,25 +123,6 @@ return target; } - private File ensureDir(String path) throws ConfigurationException { - if(path.length() < 1) { - throw new ConfigurationException("Empty directory path"); - } - File dir = new File(path); - if(dir.exists()) { - if(!dir.isDirectory()) { - throw new ConfigurationException("path " + path + "exists" + - "but is not a directory"); - } - } else { - if(!dir.mkdirs()) { - throw new ConfigurationException("unable to create directory" + - " at " + path); - } - } - return dir; - } - private void handleMerged(File f) { if (merged == null) { if (!f.delete()) { @@ -260,87 +182,61 @@ /** * @return the index */ - public BDBIndex getIndex() { + public LocalResourceIndex getIndex() { return index; } /** * @param index the index to set */ - public void setIndex(BDBIndex index) { + public void setIndex(LocalResourceIndex index) { this.index = index; } /** - * @return the incoming + * @return the incoming directory path, or null if not set */ public String getIncoming() { - if(incoming == null) { - return null; - } - return incoming.getAbsolutePath(); + return DirMaker.getAbsolutePath(incoming); } /** * @param incoming the incoming to set - * @throws ConfigurationException + * @throws IOException */ - public void setIncoming(String incoming) throws ConfigurationException { - this.incoming = 
ensureDir(incoming); + public void setIncoming(String incoming) throws IOException { + this.incoming = DirMaker.ensureDir(incoming); } - /** - * @return the merged + * @return the merged directory path, or null if not set */ public String getMerged() { - if(merged == null) { - return null; - } - return merged.getAbsolutePath(); + return DirMaker.getAbsolutePath(merged); } /** - * @param merged The merged to set. - * @throws ConfigurationException - */ - public void setMerged(String merged) throws ConfigurationException { - this.merged = ensureDir(merged); - } - /** * @param merged * @throws IOException */ - public void setMerged(File merged) throws IOException { - ensureDir(merged); - this.merged = merged; + public void setMerged(String merged) throws IOException { + this.merged = DirMaker.ensureDir(merged); } /** - * @return the failed + * @return the failed directory path, or null if not set */ public String getFailed() { - if(failed == null) { - return null; - } - return failed.getAbsolutePath(); + return DirMaker.getAbsolutePath(failed); } /** * @param failed The failed to set. - * @throws ConfigurationException + * @throws IOException */ - public void setFailed(String failed) throws ConfigurationException { - this.failed = ensureDir(failed); + public void setFailed(String failed) throws IOException { + this.failed = DirMaker.ensureDir(failed); } - /** - * @param failed - * @throws IOException - */ - public void setFailed(File failed) throws IOException { - ensureDir(failed); - this.failed = failed; - } /** * @return the runInterval @@ -355,13 +251,14 @@ public void setRunInterval(int runInterval) { this.runInterval = runInterval; } + /** * Thread that repeatedly calls mergeAll on the BDBIndexUpdater. 
* * @author Brad Tofel * @version $Date$, $Revision$ */ - private class BDBIndexUpdaterThread extends Thread { + private class UpdateThread extends Thread { /** * object which merges CDX files with the BDBResourceIndex */ @@ -373,12 +270,14 @@ * @param updater * @param runInterval */ - public BDBIndexUpdaterThread(LocalResourceIndexUpdater updater, int runInterval) { - super("BDBIndexUpdaterThread"); + public UpdateThread(LocalResourceIndexUpdater updater, + int runInterval) { + + super("LocalResourceIndexUpdater.UpdateThread"); super.setDaemon(true); this.updater = updater; this.runInterval = runInterval; - LOGGER.info("BDBIndexUpdaterThread is alive."); + LOGGER.info("LocalResourceIndexUpdater.UpdateThread is alive."); } public void run() { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2313 http://archive-access.svn.sourceforge.net/archive-access/?rev=2313&view=rev Author: bradtofel Date: 2008-06-24 17:12:36 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moved Index update code to separate package Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-06-25 00:12:00 UTC (rev 2312) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-06-25 00:12:36 UTC (rev 2313) @@ -8,7 +8,7 @@ import java.util.logging.Logger; import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.indexer.IndexClient; +import org.archive.wayback.resourceindex.updater.IndexClient; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.DirMaker; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2312 http://archive-access.svn.sourceforge.net/archive-access/?rev=2312&view=rev Author: bradtofel Date: 2008-06-24 17:12:00 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moved Index update code to separate package Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2008-06-24 23:43:59 UTC (rev 2311) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2008-06-25 00:12:00 UTC (rev 2312) @@ -29,7 +29,7 @@ import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.indexer.IndexClient; +import org.archive.wayback.resourceindex.updater.IndexClient; import org.archive.wayback.resourcestore.ArcIndexer; import org.archive.wayback.resourcestore.WarcIndexer; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-24 23:43:51
|
Revision: 2311 http://archive-access.svn.sourceforge.net/archive-access/?rev=2311&view=rev Author: bradtofel Date: 2008-06-24 16:43:59 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: Renaming to support other future UpdatableResourceIndex implementations. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java 2008-06-24 23:41:39 UTC (rev 2310) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java 2008-06-24 23:43:59 UTC (rev 2311) @@ -1,401 +0,0 @@ -/* BDBIndexUpdater - * - * $Id$ - * - * Created on 2:59:40 PM Oct 12, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.updater; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.logging.Logger; - -import org.archive.wayback.bdb.BDBRecord; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.exception.ConfigurationException; -import org.archive.wayback.resourceindex.bdb.BDBIndex; -import org.archive.wayback.resourceindex.bdb.SearchResultToBDBRecordAdapter; -import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; -//import org.archive.wayback.resourcestore.ArcIndexer; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.flatfile.FlatFile; - -/** - * Class which starts a background thread that repeatedly scans an incoming - * directory and merges files found therein(which are assumed to be in CDX - * format) with a BDBIndex. Optional configurations include: - * - * target directory where merged files are moved to (otherwise deleted) - * target directory where failed failed are moved(otherwise left in place) - * milliseconds between scans of the incoming directory(default 10000) - * - * @author brad - * @version $Date$, $Revision$ - */ -public class BDBIndexUpdater { - /** - * Logger for this class - */ - private static final Logger LOGGER = - Logger.getLogger(BDBIndexUpdater.class.getName()); - - private final static int DEFAULT_RUN_INTERVAL_MS = 10000; - - private BDBIndex index = null; - - private File incoming = null; - - private File merged = null; - - private File failed = null; - - private int runInterval = DEFAULT_RUN_INTERVAL_MS; - - /** - * Thread object of update thread -- also is flag indicating if the thread - * has already been started. Access to it is synchronized. 
- */ - private Thread updateThread = null; - - /** - * Default constructor - */ - public BDBIndexUpdater() { - - } - /** - * @param index - * @param incoming - */ - public BDBIndexUpdater(BDBIndex index, File incoming) { - this.index = index; - this.incoming = incoming; - } - - /** - * start the background index merging thread - * @throws ConfigurationException - */ - public void init() throws ConfigurationException { - if(index == null) { - throw new ConfigurationException("No index target on bdb updater"); - } - if(incoming == null) { - throw new ConfigurationException("No incoming on bdb updater"); - } - startUpdateThread(); - } - - /** Ensure the argument directory exists - * @param dir - * @throws IOException - */ - private void ensureDir(File dir) throws IOException { - if (!dir.isDirectory() && !dir.mkdirs()) { - throw new IOException("FAILED to create " + dir.getAbsolutePath()); - } - } - - /** - * start a background thread that merges new CDX files in incoming into - * the BDBIndex. - * - * @throws ConfigurationException - */ - public void startup() throws ConfigurationException { - try { - ensureDir(incoming); - if(merged != null) ensureDir(merged); - if(failed != null) ensureDir(failed); - } catch (IOException e) { - e.printStackTrace(); - throw new ConfigurationException(e.getMessage()); - } - - if (updateThread == null) { - startUpdateThread(); - } - } - - /** - * start the BDBIndexUpdaterThread thread, which will scan for new cdx files - * in the incoming directory, and add them to the BDBIndex. 
- */ - private synchronized void startUpdateThread() { - if (updateThread != null) { - return; - } - updateThread = new BDBIndexUpdaterThread(this,runInterval); - updateThread.start(); - } - - - private boolean mergeFile(File cdxFile) { - boolean added = false; - try { - FlatFile ffile = new FlatFile(cdxFile.getAbsolutePath()); - AdaptedIterator<String,SearchResult> searchResultItr = - new AdaptedIterator<String,SearchResult>( - ffile.getSequentialIterator(), - new CDXLineToSearchResultAdapter()); - Iterator<BDBRecord> it = new AdaptedIterator<SearchResult,BDBRecord> - (searchResultItr,new SearchResultToBDBRecordAdapter()); - - index.insertRecords(it); - added = true; - } catch (IOException e) { - e.printStackTrace(); - } - return added; - } - - private File getTargetFile(File f, File targetDir) { - File target = new File(targetDir, f.getName()); - int x = 0; - while(target.exists()) { - if(x++ > 255) { - throw new RuntimeException("too many " - + "duplicates of file " + f.getAbsolutePath() + - " in " + targetDir.getAbsolutePath()); - } - target = new File(targetDir,f.getName() + "." + x); - } - return target; - } - - private File ensureDir(String path) throws ConfigurationException { - if(path.length() < 1) { - throw new ConfigurationException("Empty directory path"); - } - File dir = new File(path); - if(dir.exists()) { - if(!dir.isDirectory()) { - throw new ConfigurationException("path " + path + "exists" + - "but is not a directory"); - } - } else { - if(!dir.mkdirs()) { - throw new ConfigurationException("unable to create directory" + - " at " + path); - } - } - return dir; - } - - private void handleMerged(File f) { - if (merged == null) { - if (!f.delete()) { - // big problems... 
lets exit - throw new RuntimeException("Unable to delete " - + f.getAbsolutePath()); - } - LOGGER.info("Removed merged file " + f.getAbsolutePath()); - } else { - // move to merged: - File target = getTargetFile(f,merged); - if (!f.renameTo(target)) { - throw new RuntimeException("FAILED rename" + "(" - + f.getAbsolutePath() + ") to " + "(" - + target.getAbsolutePath() + ")"); - } - LOGGER.info("Renamed merged file " + f.getAbsolutePath() + " to " + - target.getAbsolutePath()); - } - } - - private void handleFailed(File f) { - if (failed == null) { - // nothing much to do.. just complain and leave it. - LOGGER.info("FAILED INDEX: " + f.getAbsolutePath()); - } else { - // move to failed: - File target = getTargetFile(f,failed); - if (!f.renameTo(target)) { - throw new RuntimeException("FAILED rename" + "(" - + f.getAbsolutePath() + ") to " + "(" - + target.getAbsolutePath() + ")"); - } - LOGGER.info("Renamed failed merge file " + f.getAbsolutePath() + - " to " + target.getAbsolutePath()); - } - } - - protected int mergeAll() { - int numMerged = 0; - File incomingFiles[] = incoming.listFiles(); - int i = 0; - for (i = 0; i < incomingFiles.length; i++) { - File f = incomingFiles[i]; - if (f.isFile()) { - if (mergeFile(f)) { - handleMerged(f); - numMerged++; - } else { - handleFailed(f); - } - } - } - return numMerged; - } - - /** - * @return the index - */ - public BDBIndex getIndex() { - return index; - } - - /** - * @param index the index to set - */ - public void setIndex(BDBIndex index) { - this.index = index; - } - - /** - * @return the incoming - */ - public String getIncoming() { - if(incoming == null) { - return null; - } - return incoming.getAbsolutePath(); - } - - /** - * @param incoming the incoming to set - * @throws ConfigurationException - */ - public void setIncoming(String incoming) throws ConfigurationException { - this.incoming = ensureDir(incoming); - } - - - /** - * @return the merged - */ - public String getMerged() { - if(merged == null) { - 
return null; - } - return merged.getAbsolutePath(); - } - - /** - * @param merged The merged to set. - * @throws ConfigurationException - */ - public void setMerged(String merged) throws ConfigurationException { - this.merged = ensureDir(merged); - } - /** - * @param merged - * @throws IOException - */ - public void setMerged(File merged) throws IOException { - ensureDir(merged); - this.merged = merged; - } - - /** - * @return the failed - */ - public String getFailed() { - if(failed == null) { - return null; - } - return failed.getAbsolutePath(); - } - - /** - * @param failed The failed to set. - * @throws ConfigurationException - */ - public void setFailed(String failed) throws ConfigurationException { - this.failed = ensureDir(failed); - } - /** - * @param failed - * @throws IOException - */ - public void setFailed(File failed) throws IOException { - ensureDir(failed); - this.failed = failed; - } - - /** - * @return the runInterval - */ - public int getRunInterval() { - return runInterval; - } - - /** - * @param runInterval The runInterval to set. - */ - public void setRunInterval(int runInterval) { - this.runInterval = runInterval; - } - /** - * Thread that repeatedly calls mergeAll on the BDBIndexUpdater. 
- * - * @author Brad Tofel - * @version $Date$, $Revision$ - */ - private class BDBIndexUpdaterThread extends Thread { - /** - * object which merges CDX files with the BDBResourceIndex - */ - private BDBIndexUpdater updater = null; - - private int runInterval; - - /** - * @param updater - * @param runInterval - */ - public BDBIndexUpdaterThread(BDBIndexUpdater updater, int runInterval) { - super("BDBIndexUpdaterThread"); - super.setDaemon(true); - this.updater = updater; - this.runInterval = runInterval; - LOGGER.info("BDBIndexUpdaterThread is alive."); - } - - public void run() { - int sleepInterval = runInterval; - while (true) { - try { - int numMerged = updater.mergeAll(); - if (numMerged == 0) { - sleep(sleepInterval); - sleepInterval += runInterval; - } else { - sleepInterval = runInterval; - } - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - } -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java (from rev 2309, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/LocalResourceIndexUpdater.java 2008-06-24 23:43:59 UTC (rev 2311) @@ -0,0 +1,401 @@ +/* BDBIndexUpdater + * + * $Id$ + * + * Created on 2:59:40 PM Oct 12, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.updater; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.bdb.BDBRecord; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ConfigurationException; +import org.archive.wayback.resourceindex.bdb.BDBIndex; +import org.archive.wayback.resourceindex.bdb.SearchResultToBDBRecordAdapter; +import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; +//import org.archive.wayback.resourcestore.ArcIndexer; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +/** + * Class which starts a background thread that repeatedly scans an incoming + * directory and merges files found therein(which are assumed to be in CDX + * format) with a BDBIndex. 
Optional configurations include: + * + * target directory where merged files are moved to (otherwise deleted) + * target directory where failed failed are moved(otherwise left in place) + * milliseconds between scans of the incoming directory(default 10000) + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LocalResourceIndexUpdater { + /** + * Logger for this class + */ + private static final Logger LOGGER = + Logger.getLogger(LocalResourceIndexUpdater.class.getName()); + + private final static int DEFAULT_RUN_INTERVAL_MS = 10000; + + private BDBIndex index = null; + + private File incoming = null; + + private File merged = null; + + private File failed = null; + + private int runInterval = DEFAULT_RUN_INTERVAL_MS; + + /** + * Thread object of update thread -- also is flag indicating if the thread + * has already been started. Access to it is synchronized. + */ + private Thread updateThread = null; + + /** + * Default constructor + */ + public LocalResourceIndexUpdater() { + + } + /** + * @param index + * @param incoming + */ + public LocalResourceIndexUpdater(BDBIndex index, File incoming) { + this.index = index; + this.incoming = incoming; + } + + /** + * start the background index merging thread + * @throws ConfigurationException + */ + public void init() throws ConfigurationException { + if(index == null) { + throw new ConfigurationException("No index target on bdb updater"); + } + if(incoming == null) { + throw new ConfigurationException("No incoming on bdb updater"); + } + startUpdateThread(); + } + + /** Ensure the argument directory exists + * @param dir + * @throws IOException + */ + private void ensureDir(File dir) throws IOException { + if (!dir.isDirectory() && !dir.mkdirs()) { + throw new IOException("FAILED to create " + dir.getAbsolutePath()); + } + } + + /** + * start a background thread that merges new CDX files in incoming into + * the BDBIndex. 
+ * + * @throws ConfigurationException + */ + public void startup() throws ConfigurationException { + try { + ensureDir(incoming); + if(merged != null) ensureDir(merged); + if(failed != null) ensureDir(failed); + } catch (IOException e) { + e.printStackTrace(); + throw new ConfigurationException(e.getMessage()); + } + + if (updateThread == null) { + startUpdateThread(); + } + } + + /** + * start the BDBIndexUpdaterThread thread, which will scan for new cdx files + * in the incoming directory, and add them to the BDBIndex. + */ + private synchronized void startUpdateThread() { + if (updateThread != null) { + return; + } + updateThread = new BDBIndexUpdaterThread(this,runInterval); + updateThread.start(); + } + + + private boolean mergeFile(File cdxFile) { + boolean added = false; + try { + FlatFile ffile = new FlatFile(cdxFile.getAbsolutePath()); + AdaptedIterator<String,SearchResult> searchResultItr = + new AdaptedIterator<String,SearchResult>( + ffile.getSequentialIterator(), + new CDXLineToSearchResultAdapter()); + Iterator<BDBRecord> it = new AdaptedIterator<SearchResult,BDBRecord> + (searchResultItr,new SearchResultToBDBRecordAdapter()); + + index.insertRecords(it); + added = true; + } catch (IOException e) { + e.printStackTrace(); + } + return added; + } + + private File getTargetFile(File f, File targetDir) { + File target = new File(targetDir, f.getName()); + int x = 0; + while(target.exists()) { + if(x++ > 255) { + throw new RuntimeException("too many " + + "duplicates of file " + f.getAbsolutePath() + + " in " + targetDir.getAbsolutePath()); + } + target = new File(targetDir,f.getName() + "." 
+ x); + } + return target; + } + + private File ensureDir(String path) throws ConfigurationException { + if(path.length() < 1) { + throw new ConfigurationException("Empty directory path"); + } + File dir = new File(path); + if(dir.exists()) { + if(!dir.isDirectory()) { + throw new ConfigurationException("path " + path + "exists" + + "but is not a directory"); + } + } else { + if(!dir.mkdirs()) { + throw new ConfigurationException("unable to create directory" + + " at " + path); + } + } + return dir; + } + + private void handleMerged(File f) { + if (merged == null) { + if (!f.delete()) { + // big problems... lets exit + throw new RuntimeException("Unable to delete " + + f.getAbsolutePath()); + } + LOGGER.info("Removed merged file " + f.getAbsolutePath()); + } else { + // move to merged: + File target = getTargetFile(f,merged); + if (!f.renameTo(target)) { + throw new RuntimeException("FAILED rename" + "(" + + f.getAbsolutePath() + ") to " + "(" + + target.getAbsolutePath() + ")"); + } + LOGGER.info("Renamed merged file " + f.getAbsolutePath() + " to " + + target.getAbsolutePath()); + } + } + + private void handleFailed(File f) { + if (failed == null) { + // nothing much to do.. just complain and leave it. 
+ LOGGER.info("FAILED INDEX: " + f.getAbsolutePath()); + } else { + // move to failed: + File target = getTargetFile(f,failed); + if (!f.renameTo(target)) { + throw new RuntimeException("FAILED rename" + "(" + + f.getAbsolutePath() + ") to " + "(" + + target.getAbsolutePath() + ")"); + } + LOGGER.info("Renamed failed merge file " + f.getAbsolutePath() + + " to " + target.getAbsolutePath()); + } + } + + protected int mergeAll() { + int numMerged = 0; + File incomingFiles[] = incoming.listFiles(); + int i = 0; + for (i = 0; i < incomingFiles.length; i++) { + File f = incomingFiles[i]; + if (f.isFile()) { + if (mergeFile(f)) { + handleMerged(f); + numMerged++; + } else { + handleFailed(f); + } + } + } + return numMerged; + } + + /** + * @return the index + */ + public BDBIndex getIndex() { + return index; + } + + /** + * @param index the index to set + */ + public void setIndex(BDBIndex index) { + this.index = index; + } + + /** + * @return the incoming + */ + public String getIncoming() { + if(incoming == null) { + return null; + } + return incoming.getAbsolutePath(); + } + + /** + * @param incoming the incoming to set + * @throws ConfigurationException + */ + public void setIncoming(String incoming) throws ConfigurationException { + this.incoming = ensureDir(incoming); + } + + + /** + * @return the merged + */ + public String getMerged() { + if(merged == null) { + return null; + } + return merged.getAbsolutePath(); + } + + /** + * @param merged The merged to set. + * @throws ConfigurationException + */ + public void setMerged(String merged) throws ConfigurationException { + this.merged = ensureDir(merged); + } + /** + * @param merged + * @throws IOException + */ + public void setMerged(File merged) throws IOException { + ensureDir(merged); + this.merged = merged; + } + + /** + * @return the failed + */ + public String getFailed() { + if(failed == null) { + return null; + } + return failed.getAbsolutePath(); + } + + /** + * @param failed The failed to set. 
+ * @throws ConfigurationException + */ + public void setFailed(String failed) throws ConfigurationException { + this.failed = ensureDir(failed); + } + /** + * @param failed + * @throws IOException + */ + public void setFailed(File failed) throws IOException { + ensureDir(failed); + this.failed = failed; + } + + /** + * @return the runInterval + */ + public int getRunInterval() { + return runInterval; + } + + /** + * @param runInterval The runInterval to set. + */ + public void setRunInterval(int runInterval) { + this.runInterval = runInterval; + } + /** + * Thread that repeatedly calls mergeAll on the BDBIndexUpdater. + * + * @author Brad Tofel + * @version $Date$, $Revision$ + */ + private class BDBIndexUpdaterThread extends Thread { + /** + * object which merges CDX files with the BDBResourceIndex + */ + private LocalResourceIndexUpdater updater = null; + + private int runInterval; + + /** + * @param updater + * @param runInterval + */ + public BDBIndexUpdaterThread(LocalResourceIndexUpdater updater, int runInterval) { + super("BDBIndexUpdaterThread"); + super.setDaemon(true); + this.updater = updater; + this.runInterval = runInterval; + LOGGER.info("BDBIndexUpdaterThread is alive."); + } + + public void run() { + int sleepInterval = runInterval; + while (true) { + try { + int numMerged = updater.mergeAll(); + if (numMerged == 0) { + sleep(sleepInterval); + sleepInterval += runInterval; + } else { + sleepInterval = runInterval; + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-24 23:41:31
|
Revision: 2310 http://archive-access.svn.sourceforge.net/archive-access/?rev=2310&view=rev Author: bradtofel Date: 2008-06-24 16:41:39 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moving index update code to separate package. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/IndexClient.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/RemoteSubmitFilter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/RemoteSubmitFilter.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java 2008-06-24 23:40:15 UTC (rev 2309) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java 2008-06-24 23:41:39 UTC (rev 2310) @@ -1,204 +0,0 @@ -/* IndexClient - * - * $Id$ - * - * Created on 4:22:52 PM Oct 12, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.indexer; - -import java.io.File; -import java.io.BufferedOutputStream; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.Iterator; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpStatus; -import org.apache.commons.httpclient.methods.InputStreamRequestEntity; -import org.apache.commons.httpclient.methods.PutMethod; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class IndexClient { - private static final Logger LOGGER = Logger.getLogger(IndexClient - .class.getName()); - - private String target = null; - private File tmpDir = null; - - private HttpClient client = new HttpClient(); - - /** - * @param cdx - * @return true if CDX was added to local or remote index - * @throws HttpException - * @throws IOException - */ - public boolean addCDX(File cdx) throws HttpException, IOException { - boolean added = false; - if(target == null) { - throw new IOException("No target set"); - } - String base = cdx.getName(); - if(target.startsWith("http://")) { - String finalUrl = target; - if(target.endsWith("/")) { - finalUrl = target + base; - } else { - finalUrl = target + "/" + base; - } - PutMethod method = new PutMethod(finalUrl); - method.setRequestEntity(new InputStreamRequestEntity( - new FileInputStream(cdx))); - - int statusCode = client.executeMethod(method); 
- if (statusCode == HttpStatus.SC_OK) { - LOGGER.info("Uploaded cdx " + cdx.getAbsolutePath() + " to " + - finalUrl); - if(!cdx.delete()) { - throw new IOException("FAILED delete " + - cdx.getAbsolutePath()); - } - - added = true; - } else { - throw new IOException("Method failed: " + method.getStatusLine() - + " for URL " + finalUrl + " on file " - + cdx.getAbsolutePath()); - } - - } else { - // assume a local directory: - File toBeMergedDir = new File(target); - if(!toBeMergedDir.exists()) { - toBeMergedDir.mkdirs(); - } - if(!toBeMergedDir.exists()) { - throw new IOException("Target " + target + " does not exist"); - } - if(!toBeMergedDir.isDirectory()) { - throw new IOException("Target " + target + " is not a dir"); - } - if(!toBeMergedDir.canWrite()) { - throw new IOException("Target " + target + " is not writable"); - } - File toBeMergedFile = new File(toBeMergedDir,base); - if(toBeMergedFile.exists()) { - LOGGER.severe("WARNING: "+toBeMergedFile.getAbsolutePath() + - "already exists!"); - } else { - if(cdx.renameTo(toBeMergedFile)) { - LOGGER.info("Queued " + toBeMergedFile.getAbsolutePath() + - " for merging."); - added = true; - } else { - LOGGER.severe("FAILED rename("+cdx.getAbsolutePath()+ - ") to ("+toBeMergedFile.getAbsolutePath()+")"); - } - } - } - return added; - } - - /** - * @param base - * @param itr - * @return true if data was added to local or remote index - * @throws HttpException - * @throws IOException - */ - public boolean addSearchResults(String base, Iterator<SearchResult> itr) - throws HttpException, IOException { - - if(tmpDir == null) { - throw new IOException("No tmpDir argument"); - } - File tmpFile = new File(tmpDir,base); - if(tmpFile.exists()) { - // TODO: is this safe? 
- if(!tmpFile.delete()) { - throw new IOException("Unable to remove tmp " + - tmpFile.getAbsolutePath()); - } - } - FileOutputStream os = new FileOutputStream(tmpFile); - BufferedOutputStream bos = new BufferedOutputStream(os); - PrintWriter pw = new PrintWriter(bos); - - Adapter<SearchResult,String> adapterSRtoS = - new SearchResultToCDXLineAdapter(); - Iterator<String> itrS = - new AdaptedIterator<SearchResult,String>(itr,adapterSRtoS); - - while(itrS.hasNext()) { - pw.println(itrS.next()); - } - pw.close(); - boolean added = addCDX(tmpFile); - return added; - } - - /** - * @return the target - */ - public String getTarget() { - return target; - } - - /** - * @param target the target to set - */ - public void setTarget(String target) { - this.target = target; - } - - /** - * @return the tmpDir - */ - public String getTmpDir() { - if(tmpDir == null) { - return null; - } - return tmpDir.getAbsolutePath(); - } - - /** - * @param tmpDir the tmpDir to set - */ - public void setTmpDir(String tmpDir) { - this.tmpDir = new File(tmpDir); - if(!this.tmpDir.isDirectory()) { - this.tmpDir.mkdirs(); - } - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/RemoteSubmitFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/RemoteSubmitFilter.java 2008-06-24 23:40:15 UTC (rev 2309) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/RemoteSubmitFilter.java 2008-06-24 23:41:39 UTC (rev 2310) @@ -1,187 +0,0 @@ -/* RemoteSubmitFilter - * - * $Id$ - * - * Created on 3:57:00 PM Oct 12, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. 
- * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.indexer; - -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.PrintWriter; -import java.util.Enumeration; -import java.util.Properties; - -import javax.servlet.Filter; -import javax.servlet.FilterChain; -import javax.servlet.FilterConfig; -import javax.servlet.ServletContext; -import javax.servlet.ServletException; -import javax.servlet.ServletRequest; -import javax.servlet.ServletResponse; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpServletResponse; - -/** - * Filter that accepts PUT HTTP requests to insert CDX files into the incoming - * directory for a local BDBIndex. 
- * - * @author brad - * @version $Date$, $Revision$ - */ -public class RemoteSubmitFilter implements Filter { - - private final static String INCOMING_PATH = "config-tmp.incoming"; - private final static String HTTP_PUT_METHOD = "PUT"; - private File incoming = null; - private File tmpIncoming = null; - - // TODO: get rid of this - @SuppressWarnings("unchecked") - public void init(FilterConfig c) throws ServletException { - - Properties p = new Properties(); - ServletContext sc = c.getServletContext(); - for (Enumeration e = sc.getInitParameterNames(); e.hasMoreElements();) { - String key = (String) e.nextElement(); - p.put(key, sc.getInitParameter(key)); - } - for (Enumeration e = c.getInitParameterNames(); e.hasMoreElements();) { - String key = (String) e.nextElement(); - p.put(key, c.getInitParameter(key)); - } - - String cfgName = INCOMING_PATH; - String incomingPath = p.getProperty(cfgName); - if((incomingPath == null) || incomingPath.length() == 0) { - throw new ServletException("Invalid or missing " + cfgName + - " configuration"); - } - incoming = new File(incomingPath); - tmpIncoming = new File(incoming,"tmp"); - try { - ensureDir(incoming); - ensureDir(tmpIncoming); - } catch (IOException e) { - throw new ServletException(e); - } - } - private void ensureDir(File dir) throws IOException { - if(dir.exists()) { - if(!dir.isDirectory()) { - throw new IOException("Path " + dir.getAbsolutePath() + - "exists but is not a directory."); - } - } else { - if(!dir.mkdirs()) { - throw new IOException("FAILED mkdir " + dir.getAbsolutePath()); - } - } - } - - /* - * (non-Javadoc) - * - * @see javax.servlet.Filter#doFilter(javax.servlet.ServletRequest, - * javax.servlet.ServletResponse, javax.servlet.FilterChain) - */ - public void doFilter(ServletRequest request, ServletResponse response, - FilterChain chain) throws IOException, ServletException { - if (!handle(request, response)) { - chain.doFilter(request, response); - } - } - /** - * @param request - * @param 
response - * @return boolean, true unless something went wrong.. - * @throws IOException - * @throws ServletException - */ - protected boolean handle(final ServletRequest request, - final ServletResponse response) throws IOException, - ServletException { - if (!(request instanceof HttpServletRequest)) { - return false; - } - if (!(response instanceof HttpServletResponse)) { - return false; - } - HttpServletRequest httpRequest = (HttpServletRequest) request; - if(httpRequest.getMethod().equals(HTTP_PUT_METHOD)) { - - return handlePut(httpRequest,response); - - } - return false; - } - - protected boolean handlePut(final HttpServletRequest request, - final ServletResponse response) throws IOException, - ServletException { - - String reqURI = request.getRequestURI(); - int lastSlashIdx = reqURI.lastIndexOf("/"); - if (lastSlashIdx == -1) { - return false; - } - String targetFileName = reqURI.substring(lastSlashIdx + 1); - String tmpFileName = targetFileName + ".tmp"; - File tmpFile = new File(tmpIncoming,tmpFileName); - File targetFile = new File(incoming, targetFileName); - - int i; - InputStream input; - input = request.getInputStream(); - BufferedInputStream in = new BufferedInputStream(input); - BufferedReader reader = new BufferedReader(new InputStreamReader(in)); - FileWriter out = new FileWriter(tmpFile); - - while ((i = reader.read()) != -1) { - out.write(i); - } - - out.close(); - in.close(); - if (!tmpFile.renameTo(targetFile)) { - throw new IOException("Unable to rename " - + tmpFile.getAbsolutePath() + " to " - + targetFile.getAbsolutePath()); - } - - PrintWriter outHTML = response.getWriter(); - outHTML.println("done"); - return true; - } - - /* (non-Javadoc) - * @see javax.servlet.Filter#destroy() - */ - public void destroy() { - // TODO Auto-generated method stub - - } -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/IndexClient.java (from rev 2302, 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/IndexClient.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/IndexClient.java 2008-06-24 23:41:39 UTC (rev 2310) @@ -0,0 +1,204 @@ +/* IndexClient + * + * $Id$ + * + * Created on 4:22:52 PM Oct 12, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.updater; + +import java.io.File; +import java.io.BufferedOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.commons.httpclient.methods.InputStreamRequestEntity; +import org.apache.commons.httpclient.methods.PutMethod; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class IndexClient { + private static final Logger LOGGER = Logger.getLogger(IndexClient + .class.getName()); + + private String target = null; + private File tmpDir = null; + + private HttpClient client = new HttpClient(); + + /** + * @param cdx + * @return true if CDX was added to local or remote index + * @throws HttpException + * @throws IOException + */ + public boolean addCDX(File cdx) throws HttpException, IOException { + boolean added = false; + if(target == null) { + throw new IOException("No target set"); + } + String base = cdx.getName(); + if(target.startsWith("http://")) { + String finalUrl = target; + if(target.endsWith("/")) { + finalUrl = target + base; + } else { + finalUrl = target + "/" + base; + } + PutMethod method = new PutMethod(finalUrl); + method.setRequestEntity(new InputStreamRequestEntity( + new FileInputStream(cdx))); + + int statusCode = client.executeMethod(method); 
+ if (statusCode == HttpStatus.SC_OK) { + LOGGER.info("Uploaded cdx " + cdx.getAbsolutePath() + " to " + + finalUrl); + if(!cdx.delete()) { + throw new IOException("FAILED delete " + + cdx.getAbsolutePath()); + } + + added = true; + } else { + throw new IOException("Method failed: " + method.getStatusLine() + + " for URL " + finalUrl + " on file " + + cdx.getAbsolutePath()); + } + + } else { + // assume a local directory: + File toBeMergedDir = new File(target); + if(!toBeMergedDir.exists()) { + toBeMergedDir.mkdirs(); + } + if(!toBeMergedDir.exists()) { + throw new IOException("Target " + target + " does not exist"); + } + if(!toBeMergedDir.isDirectory()) { + throw new IOException("Target " + target + " is not a dir"); + } + if(!toBeMergedDir.canWrite()) { + throw new IOException("Target " + target + " is not writable"); + } + File toBeMergedFile = new File(toBeMergedDir,base); + if(toBeMergedFile.exists()) { + LOGGER.severe("WARNING: "+toBeMergedFile.getAbsolutePath() + + "already exists!"); + } else { + if(cdx.renameTo(toBeMergedFile)) { + LOGGER.info("Queued " + toBeMergedFile.getAbsolutePath() + + " for merging."); + added = true; + } else { + LOGGER.severe("FAILED rename("+cdx.getAbsolutePath()+ + ") to ("+toBeMergedFile.getAbsolutePath()+")"); + } + } + } + return added; + } + + /** + * @param base + * @param itr + * @return true if data was added to local or remote index + * @throws HttpException + * @throws IOException + */ + public boolean addSearchResults(String base, Iterator<SearchResult> itr) + throws HttpException, IOException { + + if(tmpDir == null) { + throw new IOException("No tmpDir argument"); + } + File tmpFile = new File(tmpDir,base); + if(tmpFile.exists()) { + // TODO: is this safe? 
+ if(!tmpFile.delete()) { + throw new IOException("Unable to remove tmp " + + tmpFile.getAbsolutePath()); + } + } + FileOutputStream os = new FileOutputStream(tmpFile); + BufferedOutputStream bos = new BufferedOutputStream(os); + PrintWriter pw = new PrintWriter(bos); + + Adapter<SearchResult,String> adapterSRtoS = + new SearchResultToCDXLineAdapter(); + Iterator<String> itrS = + new AdaptedIterator<SearchResult,String>(itr,adapterSRtoS); + + while(itrS.hasNext()) { + pw.println(itrS.next()); + } + pw.close(); + boolean added = addCDX(tmpFile); + return added; + } + + /** + * @return the target + */ + public String getTarget() { + return target; + } + + /** + * @param target the target to set + */ + public void setTarget(String target) { + this.target = target; + } + + /** + * @return the tmpDir + */ + public String getTmpDir() { + if(tmpDir == null) { + return null; + } + return tmpDir.getAbsolutePath(); + } + + /** + * @param tmpDir the tmpDir to set + */ + public void setTmpDir(String tmpDir) { + this.tmpDir = new File(tmpDir); + if(!this.tmpDir.isDirectory()) { + this.tmpDir.mkdirs(); + } + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/RemoteSubmitFilter.java (from rev 2302, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/RemoteSubmitFilter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/RemoteSubmitFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/RemoteSubmitFilter.java 2008-06-24 23:41:39 UTC (rev 2310) @@ -0,0 +1,187 @@ +/* RemoteSubmitFilter + * + * $Id$ + * + * Created on 3:57:00 PM Oct 12, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. 
+ * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.updater; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.PrintWriter; +import java.util.Enumeration; +import java.util.Properties; + +import javax.servlet.Filter; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ServletContext; +import javax.servlet.ServletException; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +/** + * Filter that accepts PUT HTTP requests to insert CDX files into the incoming + * directory for a local BDBIndex. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class RemoteSubmitFilter implements Filter { + + private final static String INCOMING_PATH = "config-tmp.incoming"; + private final static String HTTP_PUT_METHOD = "PUT"; + private File incoming = null; + private File tmpIncoming = null; + + // TODO: get rid of this + @SuppressWarnings("unchecked") + public void init(FilterConfig c) throws ServletException { + + Properties p = new Properties(); + ServletContext sc = c.getServletContext(); + for (Enumeration e = sc.getInitParameterNames(); e.hasMoreElements();) { + String key = (String) e.nextElement(); + p.put(key, sc.getInitParameter(key)); + } + for (Enumeration e = c.getInitParameterNames(); e.hasMoreElements();) { + String key = (String) e.nextElement(); + p.put(key, c.getInitParameter(key)); + } + + String cfgName = INCOMING_PATH; + String incomingPath = p.getProperty(cfgName); + if((incomingPath == null) || incomingPath.length() == 0) { + throw new ServletException("Invalid or missing " + cfgName + + " configuration"); + } + incoming = new File(incomingPath); + tmpIncoming = new File(incoming,"tmp"); + try { + ensureDir(incoming); + ensureDir(tmpIncoming); + } catch (IOException e) { + throw new ServletException(e); + } + } + private void ensureDir(File dir) throws IOException { + if(dir.exists()) { + if(!dir.isDirectory()) { + throw new IOException("Path " + dir.getAbsolutePath() + + "exists but is not a directory."); + } + } else { + if(!dir.mkdirs()) { + throw new IOException("FAILED mkdir " + dir.getAbsolutePath()); + } + } + } + + /* + * (non-Javadoc) + * + * @see javax.servlet.Filter#doFilter(javax.servlet.ServletRequest, + * javax.servlet.ServletResponse, javax.servlet.FilterChain) + */ + public void doFilter(ServletRequest request, ServletResponse response, + FilterChain chain) throws IOException, ServletException { + if (!handle(request, response)) { + chain.doFilter(request, response); + } + } + /** + * @param request + * @param 
response + * @return boolean, true unless something went wrong.. + * @throws IOException + * @throws ServletException + */ + protected boolean handle(final ServletRequest request, + final ServletResponse response) throws IOException, + ServletException { + if (!(request instanceof HttpServletRequest)) { + return false; + } + if (!(response instanceof HttpServletResponse)) { + return false; + } + HttpServletRequest httpRequest = (HttpServletRequest) request; + if(httpRequest.getMethod().equals(HTTP_PUT_METHOD)) { + + return handlePut(httpRequest,response); + + } + return false; + } + + protected boolean handlePut(final HttpServletRequest request, + final ServletResponse response) throws IOException, + ServletException { + + String reqURI = request.getRequestURI(); + int lastSlashIdx = reqURI.lastIndexOf("/"); + if (lastSlashIdx == -1) { + return false; + } + String targetFileName = reqURI.substring(lastSlashIdx + 1); + String tmpFileName = targetFileName + ".tmp"; + File tmpFile = new File(tmpIncoming,tmpFileName); + File targetFile = new File(incoming, targetFileName); + + int i; + InputStream input; + input = request.getInputStream(); + BufferedInputStream in = new BufferedInputStream(input); + BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + FileWriter out = new FileWriter(tmpFile); + + while ((i = reader.read()) != -1) { + out.write(i); + } + + out.close(); + in.close(); + if (!tmpFile.renameTo(targetFile)) { + throw new IOException("Unable to rename " + + tmpFile.getAbsolutePath() + " to " + + targetFile.getAbsolutePath()); + } + + PrintWriter outHTML = response.getWriter(); + outHTML.println("done"); + return true; + } + + /* (non-Javadoc) + * @see javax.servlet.Filter#destroy() + */ + public void destroy() { + // TODO Auto-generated method stub + + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-24 23:40:07
|
Revision: 2309 http://archive-access.svn.sourceforge.net/archive-access/?rev=2309&view=rev Author: bradtofel Date: 2008-06-24 16:40:15 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moving index update code to separate package. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java 2008-06-24 23:36:30 UTC (rev 2308) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java 2008-06-24 23:40:15 UTC (rev 2309) @@ -30,18 +30,21 @@ import java.io.PrintWriter; import java.util.Iterator; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.bdb.BDBRecord; import org.archive.wayback.bdb.BDBRecordSet; import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; -import org.archive.wayback.resourceindex.SearchResultSource; +import org.archive.wayback.resourceindex.UpdatableSearchResultSource; import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; import 
org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.resourceindex.updater.BDBIndexUpdater; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.flatfile.RecordIterator; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; import com.sleepycat.je.DatabaseException; @@ -51,7 +54,9 @@ * @author brad * @version $Date$, $Revision$ */ -public class BDBIndex extends BDBRecordSet implements SearchResultSource { +public class BDBIndex extends BDBRecordSet implements + UpdatableSearchResultSource { + private String bdbPath = null; private String bdbName = null; private BDBIndexUpdater updater = null; @@ -107,7 +112,22 @@ public void cleanup(CloseableIterator<SearchResult> c) throws IOException { c.close(); } - + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.UpdatableSearchResultSource#addSearchResults(java.util.Iterator) + */ + public void addSearchResults(Iterator<SearchResult> itr, + UrlCanonicalizer canonicalizer) throws IOException { + Adapter<SearchResult,BDBRecord> adapterSRtoBDB = + new SearchResultToBDBRecordAdapter(canonicalizer); + + Iterator<BDBRecord> itrBDB = + new AdaptedIterator<SearchResult,BDBRecord>(itr, + adapterSRtoBDB); + + insertRecords(itrBDB); + + } private static void USAGE() { System.err.println("Usage: DBPATH DBNAME -w"); System.err.println("\tRead lines from STDIN, inserting into BDBJE at\n" + @@ -133,7 +153,7 @@ String name = args[1]; String op = args[2]; BDBIndex index = new BDBIndex(); - + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); try { index.initializeDB(path,name); } catch (DatabaseException e) { @@ -204,14 +224,20 @@ Iterator<SearchResult> itrSR = new AdaptedIterator<String,SearchResult>(itrS,adapterStoSR); - Adapter<SearchResult,BDBRecord> adapterSRtoBDB = - new SearchResultToBDBRecordAdapter(); - - Iterator<BDBRecord> itrBDB = 
- new AdaptedIterator<SearchResult,BDBRecord>(itrSR, - adapterSRtoBDB); - - index.insertRecords(itrBDB); +// Adapter<SearchResult,BDBRecord> adapterSRtoBDB = +// new SearchResultToBDBRecordAdapter(); +// +// Iterator<BDBRecord> itrBDB = +// new AdaptedIterator<SearchResult,BDBRecord>(itrSR, +// adapterSRtoBDB); +// +// index.insertRecords(itrBDB); + try { + index.addSearchResults(itrSR, canonicalizer); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } } else { USAGE(); } Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java 2008-06-24 23:36:30 UTC (rev 2308) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java 2008-06-24 23:40:15 UTC (rev 2309) @@ -1,399 +0,0 @@ -/* BDBIndexUpdater - * - * $Id$ - * - * Created on 2:59:40 PM Oct 12, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.bdb; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.logging.Logger; - -import org.archive.wayback.bdb.BDBRecord; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.exception.ConfigurationException; -import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; -//import org.archive.wayback.resourcestore.ArcIndexer; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.flatfile.FlatFile; - -/** - * Class which starts a background thread that repeatedly scans an incoming - * directory and merges files found therein(which are assumed to be in CDX - * format) with a BDBIndex. Optional configurations include: - * - * target directory where merged files are moved to (otherwise deleted) - * target directory where failed failed are moved(otherwise left in place) - * milliseconds between scans of the incoming directory(default 10000) - * - * @author brad - * @version $Date$, $Revision$ - */ -public class BDBIndexUpdater { - /** - * Logger for this class - */ - private static final Logger LOGGER = - Logger.getLogger(BDBIndexUpdater.class.getName()); - - private final static int DEFAULT_RUN_INTERVAL_MS = 10000; - - private BDBIndex index = null; - - private File incoming = null; - - private File merged = null; - - private File failed = null; - - private int runInterval = DEFAULT_RUN_INTERVAL_MS; - - /** - * Thread object of update thread -- also is flag indicating if the thread - * has already been started. Access to it is synchronized. 
- */ - private Thread updateThread = null; - - /** - * Default constructor - */ - public BDBIndexUpdater() { - - } - /** - * @param index - * @param incoming - */ - public BDBIndexUpdater(BDBIndex index, File incoming) { - this.index = index; - this.incoming = incoming; - } - - /** - * start the background index merging thread - * @throws ConfigurationException - */ - public void init() throws ConfigurationException { - if(index == null) { - throw new ConfigurationException("No index target on bdb updater"); - } - if(incoming == null) { - throw new ConfigurationException("No incoming on bdb updater"); - } - startUpdateThread(); - } - - /** Ensure the argument directory exists - * @param dir - * @throws IOException - */ - private void ensureDir(File dir) throws IOException { - if (!dir.isDirectory() && !dir.mkdirs()) { - throw new IOException("FAILED to create " + dir.getAbsolutePath()); - } - } - - /** - * start a background thread that merges new CDX files in incoming into - * the BDBIndex. - * - * @throws ConfigurationException - */ - public void startup() throws ConfigurationException { - try { - ensureDir(incoming); - if(merged != null) ensureDir(merged); - if(failed != null) ensureDir(failed); - } catch (IOException e) { - e.printStackTrace(); - throw new ConfigurationException(e.getMessage()); - } - - if (updateThread == null) { - startUpdateThread(); - } - } - - /** - * start the BDBIndexUpdaterThread thread, which will scan for new cdx files - * in the incoming directory, and add them to the BDBIndex. 
- */ - private synchronized void startUpdateThread() { - if (updateThread != null) { - return; - } - updateThread = new BDBIndexUpdaterThread(this,runInterval); - updateThread.start(); - } - - - private boolean mergeFile(File cdxFile) { - boolean added = false; - try { - FlatFile ffile = new FlatFile(cdxFile.getAbsolutePath()); - AdaptedIterator<String,SearchResult> searchResultItr = - new AdaptedIterator<String,SearchResult>( - ffile.getSequentialIterator(), - new CDXLineToSearchResultAdapter()); - Iterator<BDBRecord> it = new AdaptedIterator<SearchResult,BDBRecord> - (searchResultItr,new SearchResultToBDBRecordAdapter()); - - index.insertRecords(it); - added = true; - } catch (IOException e) { - e.printStackTrace(); - } - return added; - } - - private File getTargetFile(File f, File targetDir) { - File target = new File(targetDir, f.getName()); - int x = 0; - while(target.exists()) { - if(x++ > 255) { - throw new RuntimeException("too many " - + "duplicates of file " + f.getAbsolutePath() + - " in " + targetDir.getAbsolutePath()); - } - target = new File(targetDir,f.getName() + "." + x); - } - return target; - } - - private File ensureDir(String path) throws ConfigurationException { - if(path.length() < 1) { - throw new ConfigurationException("Empty directory path"); - } - File dir = new File(path); - if(dir.exists()) { - if(!dir.isDirectory()) { - throw new ConfigurationException("path " + path + "exists" + - "but is not a directory"); - } - } else { - if(!dir.mkdirs()) { - throw new ConfigurationException("unable to create directory" + - " at " + path); - } - } - return dir; - } - - private void handleMerged(File f) { - if (merged == null) { - if (!f.delete()) { - // big problems... 
lets exit - throw new RuntimeException("Unable to delete " - + f.getAbsolutePath()); - } - LOGGER.info("Removed merged file " + f.getAbsolutePath()); - } else { - // move to merged: - File target = getTargetFile(f,merged); - if (!f.renameTo(target)) { - throw new RuntimeException("FAILED rename" + "(" - + f.getAbsolutePath() + ") to " + "(" - + target.getAbsolutePath() + ")"); - } - LOGGER.info("Renamed merged file " + f.getAbsolutePath() + " to " + - target.getAbsolutePath()); - } - } - - private void handleFailed(File f) { - if (failed == null) { - // nothing much to do.. just complain and leave it. - LOGGER.info("FAILED INDEX: " + f.getAbsolutePath()); - } else { - // move to failed: - File target = getTargetFile(f,failed); - if (!f.renameTo(target)) { - throw new RuntimeException("FAILED rename" + "(" - + f.getAbsolutePath() + ") to " + "(" - + target.getAbsolutePath() + ")"); - } - LOGGER.info("Renamed failed merge file " + f.getAbsolutePath() + - " to " + target.getAbsolutePath()); - } - } - - protected int mergeAll() { - int numMerged = 0; - File incomingFiles[] = incoming.listFiles(); - int i = 0; - for (i = 0; i < incomingFiles.length; i++) { - File f = incomingFiles[i]; - if (f.isFile()) { - if (mergeFile(f)) { - handleMerged(f); - numMerged++; - } else { - handleFailed(f); - } - } - } - return numMerged; - } - - /** - * @return the index - */ - public BDBIndex getIndex() { - return index; - } - - /** - * @param index the index to set - */ - public void setIndex(BDBIndex index) { - this.index = index; - } - - /** - * @return the incoming - */ - public String getIncoming() { - if(incoming == null) { - return null; - } - return incoming.getAbsolutePath(); - } - - /** - * @param incoming the incoming to set - * @throws ConfigurationException - */ - public void setIncoming(String incoming) throws ConfigurationException { - this.incoming = ensureDir(incoming); - } - - - /** - * @return the merged - */ - public String getMerged() { - if(merged == null) { - 
return null; - } - return merged.getAbsolutePath(); - } - - /** - * @param merged The merged to set. - * @throws ConfigurationException - */ - public void setMerged(String merged) throws ConfigurationException { - this.merged = ensureDir(merged); - } - /** - * @param merged - * @throws IOException - */ - public void setMerged(File merged) throws IOException { - ensureDir(merged); - this.merged = merged; - } - - /** - * @return the failed - */ - public String getFailed() { - if(failed == null) { - return null; - } - return failed.getAbsolutePath(); - } - - /** - * @param failed The failed to set. - * @throws ConfigurationException - */ - public void setFailed(String failed) throws ConfigurationException { - this.failed = ensureDir(failed); - } - /** - * @param failed - * @throws IOException - */ - public void setFailed(File failed) throws IOException { - ensureDir(failed); - this.failed = failed; - } - - /** - * @return the runInterval - */ - public int getRunInterval() { - return runInterval; - } - - /** - * @param runInterval The runInterval to set. - */ - public void setRunInterval(int runInterval) { - this.runInterval = runInterval; - } - /** - * Thread that repeatedly calls mergeAll on the BDBIndexUpdater. 
- * - * @author Brad Tofel - * @version $Date$, $Revision$ - */ - private class BDBIndexUpdaterThread extends Thread { - /** - * object which merges CDX files with the BDBResourceIndex - */ - private BDBIndexUpdater updater = null; - - private int runInterval; - - /** - * @param updater - * @param runInterval - */ - public BDBIndexUpdaterThread(BDBIndexUpdater updater, int runInterval) { - super("BDBIndexUpdaterThread"); - super.setDaemon(true); - this.updater = updater; - this.runInterval = runInterval; - LOGGER.info("BDBIndexUpdaterThread is alive."); - } - - public void run() { - int sleepInterval = runInterval; - while (true) { - try { - int numMerged = updater.mergeAll(); - if (numMerged == 0) { - sleep(sleepInterval); - sleepInterval += runInterval; - } else { - sleepInterval = runInterval; - } - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - } -} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java 2008-06-24 23:36:30 UTC (rev 2308) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java 2008-06-24 23:40:15 UTC (rev 2309) @@ -24,6 +24,7 @@ */ package org.archive.wayback.resourceindex.bdb; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; import org.archive.wayback.bdb.BDBRecord; import org.archive.wayback.bdb.BDBRecordSet; @@ -47,8 +48,14 @@ BDBRecord record = new BDBRecord(key, value); + private UrlCanonicalizer canonicalizer = null; + private final static String DELIMITER = " "; + public SearchResultToBDBRecordAdapter(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } + /* * 
(non-Javadoc) * Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java (from rev 2302, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/BDBIndexUpdater.java 2008-06-24 23:40:15 UTC (rev 2309) @@ -0,0 +1,401 @@ +/* BDBIndexUpdater + * + * $Id$ + * + * Created on 2:59:40 PM Oct 12, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.updater; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.bdb.BDBRecord; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ConfigurationException; +import org.archive.wayback.resourceindex.bdb.BDBIndex; +import org.archive.wayback.resourceindex.bdb.SearchResultToBDBRecordAdapter; +import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; +//import org.archive.wayback.resourcestore.ArcIndexer; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +/** + * Class which starts a background thread that repeatedly scans an incoming + * directory and merges files found therein(which are assumed to be in CDX + * format) with a BDBIndex. Optional configurations include: + * + * target directory where merged files are moved to (otherwise deleted) + * target directory where failed failed are moved(otherwise left in place) + * milliseconds between scans of the incoming directory(default 10000) + * + * @author brad + * @version $Date$, $Revision$ + */ +public class BDBIndexUpdater { + /** + * Logger for this class + */ + private static final Logger LOGGER = + Logger.getLogger(BDBIndexUpdater.class.getName()); + + private final static int DEFAULT_RUN_INTERVAL_MS = 10000; + + private BDBIndex index = null; + + private File incoming = null; + + private File merged = null; + + private File failed = null; + + private int runInterval = DEFAULT_RUN_INTERVAL_MS; + + /** + * Thread object of update thread -- also is flag indicating if the thread + * has already been started. Access to it is synchronized. 
+ */ + private Thread updateThread = null; + + /** + * Default constructor + */ + public BDBIndexUpdater() { + + } + /** + * @param index + * @param incoming + */ + public BDBIndexUpdater(BDBIndex index, File incoming) { + this.index = index; + this.incoming = incoming; + } + + /** + * start the background index merging thread + * @throws ConfigurationException + */ + public void init() throws ConfigurationException { + if(index == null) { + throw new ConfigurationException("No index target on bdb updater"); + } + if(incoming == null) { + throw new ConfigurationException("No incoming on bdb updater"); + } + startUpdateThread(); + } + + /** Ensure the argument directory exists + * @param dir + * @throws IOException + */ + private void ensureDir(File dir) throws IOException { + if (!dir.isDirectory() && !dir.mkdirs()) { + throw new IOException("FAILED to create " + dir.getAbsolutePath()); + } + } + + /** + * start a background thread that merges new CDX files in incoming into + * the BDBIndex. + * + * @throws ConfigurationException + */ + public void startup() throws ConfigurationException { + try { + ensureDir(incoming); + if(merged != null) ensureDir(merged); + if(failed != null) ensureDir(failed); + } catch (IOException e) { + e.printStackTrace(); + throw new ConfigurationException(e.getMessage()); + } + + if (updateThread == null) { + startUpdateThread(); + } + } + + /** + * start the BDBIndexUpdaterThread thread, which will scan for new cdx files + * in the incoming directory, and add them to the BDBIndex. 
+ */ + private synchronized void startUpdateThread() { + if (updateThread != null) { + return; + } + updateThread = new BDBIndexUpdaterThread(this,runInterval); + updateThread.start(); + } + + + private boolean mergeFile(File cdxFile) { + boolean added = false; + try { + FlatFile ffile = new FlatFile(cdxFile.getAbsolutePath()); + AdaptedIterator<String,SearchResult> searchResultItr = + new AdaptedIterator<String,SearchResult>( + ffile.getSequentialIterator(), + new CDXLineToSearchResultAdapter()); + Iterator<BDBRecord> it = new AdaptedIterator<SearchResult,BDBRecord> + (searchResultItr,new SearchResultToBDBRecordAdapter()); + + index.insertRecords(it); + added = true; + } catch (IOException e) { + e.printStackTrace(); + } + return added; + } + + private File getTargetFile(File f, File targetDir) { + File target = new File(targetDir, f.getName()); + int x = 0; + while(target.exists()) { + if(x++ > 255) { + throw new RuntimeException("too many " + + "duplicates of file " + f.getAbsolutePath() + + " in " + targetDir.getAbsolutePath()); + } + target = new File(targetDir,f.getName() + "." + x); + } + return target; + } + + private File ensureDir(String path) throws ConfigurationException { + if(path.length() < 1) { + throw new ConfigurationException("Empty directory path"); + } + File dir = new File(path); + if(dir.exists()) { + if(!dir.isDirectory()) { + throw new ConfigurationException("path " + path + "exists" + + "but is not a directory"); + } + } else { + if(!dir.mkdirs()) { + throw new ConfigurationException("unable to create directory" + + " at " + path); + } + } + return dir; + } + + private void handleMerged(File f) { + if (merged == null) { + if (!f.delete()) { + // big problems... 
lets exit + throw new RuntimeException("Unable to delete " + + f.getAbsolutePath()); + } + LOGGER.info("Removed merged file " + f.getAbsolutePath()); + } else { + // move to merged: + File target = getTargetFile(f,merged); + if (!f.renameTo(target)) { + throw new RuntimeException("FAILED rename" + "(" + + f.getAbsolutePath() + ") to " + "(" + + target.getAbsolutePath() + ")"); + } + LOGGER.info("Renamed merged file " + f.getAbsolutePath() + " to " + + target.getAbsolutePath()); + } + } + + private void handleFailed(File f) { + if (failed == null) { + // nothing much to do.. just complain and leave it. + LOGGER.info("FAILED INDEX: " + f.getAbsolutePath()); + } else { + // move to failed: + File target = getTargetFile(f,failed); + if (!f.renameTo(target)) { + throw new RuntimeException("FAILED rename" + "(" + + f.getAbsolutePath() + ") to " + "(" + + target.getAbsolutePath() + ")"); + } + LOGGER.info("Renamed failed merge file " + f.getAbsolutePath() + + " to " + target.getAbsolutePath()); + } + } + + protected int mergeAll() { + int numMerged = 0; + File incomingFiles[] = incoming.listFiles(); + int i = 0; + for (i = 0; i < incomingFiles.length; i++) { + File f = incomingFiles[i]; + if (f.isFile()) { + if (mergeFile(f)) { + handleMerged(f); + numMerged++; + } else { + handleFailed(f); + } + } + } + return numMerged; + } + + /** + * @return the index + */ + public BDBIndex getIndex() { + return index; + } + + /** + * @param index the index to set + */ + public void setIndex(BDBIndex index) { + this.index = index; + } + + /** + * @return the incoming + */ + public String getIncoming() { + if(incoming == null) { + return null; + } + return incoming.getAbsolutePath(); + } + + /** + * @param incoming the incoming to set + * @throws ConfigurationException + */ + public void setIncoming(String incoming) throws ConfigurationException { + this.incoming = ensureDir(incoming); + } + + + /** + * @return the merged + */ + public String getMerged() { + if(merged == null) { + 
return null; + } + return merged.getAbsolutePath(); + } + + /** + * @param merged The merged to set. + * @throws ConfigurationException + */ + public void setMerged(String merged) throws ConfigurationException { + this.merged = ensureDir(merged); + } + /** + * @param merged + * @throws IOException + */ + public void setMerged(File merged) throws IOException { + ensureDir(merged); + this.merged = merged; + } + + /** + * @return the failed + */ + public String getFailed() { + if(failed == null) { + return null; + } + return failed.getAbsolutePath(); + } + + /** + * @param failed The failed to set. + * @throws ConfigurationException + */ + public void setFailed(String failed) throws ConfigurationException { + this.failed = ensureDir(failed); + } + /** + * @param failed + * @throws IOException + */ + public void setFailed(File failed) throws IOException { + ensureDir(failed); + this.failed = failed; + } + + /** + * @return the runInterval + */ + public int getRunInterval() { + return runInterval; + } + + /** + * @param runInterval The runInterval to set. + */ + public void setRunInterval(int runInterval) { + this.runInterval = runInterval; + } + /** + * Thread that repeatedly calls mergeAll on the BDBIndexUpdater. 
+ * + * @author Brad Tofel + * @version $Date$, $Revision$ + */ + private class BDBIndexUpdaterThread extends Thread { + /** + * object which merges CDX files with the BDBResourceIndex + */ + private BDBIndexUpdater updater = null; + + private int runInterval; + + /** + * @param updater + * @param runInterval + */ + public BDBIndexUpdaterThread(BDBIndexUpdater updater, int runInterval) { + super("BDBIndexUpdaterThread"); + super.setDaemon(true); + this.updater = updater; + this.runInterval = runInterval; + LOGGER.info("BDBIndexUpdaterThread is alive."); + } + + public void run() { + int sleepInterval = runInterval; + while (true) { + try { + int numMerged = updater.mergeAll(); + if (numMerged == 0) { + sleep(sleepInterval); + sleepInterval += runInterval; + } else { + sleepInterval = runInterval; + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-24 23:36:29
|
Revision: 2308 http://archive-access.svn.sourceforge.net/archive-access/?rev=2308&view=rev Author: bradtofel Date: 2008-06-24 16:36:30 -0700 (Tue, 24 Jun 2008) Log Message: ----------- new package for all update related index code. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/updater/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-24 22:58:45
|
Revision: 2307 http://archive-access.svn.sourceforge.net/archive-access/?rev=2307&view=rev Author: bradtofel Date: 2008-06-24 15:58:51 -0700 (Tue, 24 Jun 2008) Log Message: ----------- INITIAL REV: classes which: * monitor a ResourceFileLocationDB's log * create events when new files are noticed * add those new files to a queue of files needing indexing * monitor the index queue, performing indexing when needed * push new index data to a local or remote ResourceIndex Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/DirectoryIndexQueue.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueue.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/DirectoryIndexQueue.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/DirectoryIndexQueue.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/DirectoryIndexQueue.java 2008-06-24 22:58:51 UTC (rev 2307) @@ -0,0 +1,95 @@ +/* DirectoryIndexQueue + * + * $Id$ + * + * Created on 2:29:10 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; + +import org.archive.wayback.util.DirMaker; + +/** + * Simple queue implementation, which uses a directory containing empty files + * to indicate the presence of items in a queue (set in this case...) 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class DirectoryIndexQueue implements IndexQueue { + private File path = null; + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.indexer.IndexQueue#dequeue() + */ + public String dequeue() throws IOException { + String[] names = path.list(); + for(String name : names) { + File tmp = new File(path,name); + if(tmp.isFile()) { + if(tmp.delete()) { + return name; + } else { + throw new IOException("Unable to dequeue/delete (" + + tmp.getAbsolutePath()); + } + } + } + return null; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.indexer.IndexQueue#enqueue(java.lang.String) + */ + public void enqueue(String resourceFileName) throws IOException { + File tmp = new File(path,resourceFileName); + if(!tmp.isFile()) { + tmp.createNewFile(); + } + } + + /** + * @return the path + */ + public String getPath() { + if(path != null) { + return path.getAbsolutePath(); + } + return null; + } + + /** + * @param path the path to set + * @throws IOException + */ + public void setPath(String path) throws IOException { + this.path = DirMaker.ensureDir(path); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.indexer.IndexQueue#recordStatus(java.lang.String, int) + */ + public void recordStatus(String resourceFileName, int status) { + + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueue.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueue.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueue.java 2008-06-24 22:58:51 UTC (rev 2307) @@ -0,0 +1,42 @@ +/* IndexQueue + * + * $Id$ + * + * Created on 2:05:12 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive. 
// ---------------------------------------------------------------------------
// Head of this archive line (rest of the IndexQueue.java file header from the
// commit diff), kept verbatim:
//
// + *
// + * This file is part of wayback.
// + *
// + * wayback is free software; you can redistribute it and/or modify
// + * it under the terms of the GNU Lesser Public License as published by
// + * the Free Software Foundation; either version 2.1 of the License, or
// + * any later version.
// + *
// + * wayback is distributed in the hope that it will be useful,
// + * but WITHOUT ANY WARRANTY; without even the implied warranty of
// + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// + * GNU Lesser Public License for more details.
// + *
// + * You should have received a copy of the GNU Lesser Public License
// + * along with wayback; if not, write to the Free Software
// + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// + */
// +package org.archive.wayback.resourcestore.indexer;
// +
// +import java.io.IOException;
// ---------------------------------------------------------------------------

/**
 * Queue of resource file names (ARC/WARC files) waiting to be indexed.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public interface IndexQueue {
    // Status codes accepted by recordStatus(). Judging by the names they mean
    // "indexed", "failed" and "try again later" -- NOTE(review): confirm
    // against an implementation that actually uses them.
    int STATUS_DONE = 0;
    int STATUS_FAIL = 1;
    int STATUS_RETRY = 2;

    /**
     * Adds a resource file name to the queue.
     *
     * @param resourceFileName name of the file that needs indexing
     * @throws IOException if the item cannot be added
     */
    void enqueue(String resourceFileName) throws IOException;

    /**
     * Removes and returns one queued name.
     *
     * @return a queued resource file name; implementations are expected to
     *         return null when the queue is empty -- TODO confirm for
     *         implementations other than the directory-backed one
     * @throws IOException if the item cannot be removed
     */
    String dequeue() throws IOException;

    /**
     * Reports the outcome of processing a previously dequeued item.
     *
     * @param resourceFileName name of the item the status refers to
     * @param status one of the STATUS_* codes
     */
    void recordStatus(String resourceFileName, int status);
}
// ---------------------------------------------------------------------------
// Remainder of this archive line (start of the next file in the same commit
// e-mail), kept verbatim:
//
// Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java
// ===================================================================
// --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java (rev 0)
// +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java 2008-06-24 22:58:51 UTC (rev 2307)
// @@ -0,0 +1,221 @@
// +/* IndexQueueUpdater
// + *
// + * $Id$
// + *
// + * Created on 2:02:54 PM Jun 23, 2008.
// + *
// + * Copyright (C) 2008 Internet Archive.
// + *
// + * This file is part of wayback.
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.logging.Logger; + +import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.DirMaker; + +/** + * This class polls a ResourceFileLocationDB repeatedly, to notice new files + * arriving in the DB. Whenever new files are noticed, they are added to the + * Index Queue. + * + * It uses a local file to store the last known "mark" of the location DB. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class IndexQueueUpdater { + + private static final Logger LOGGER = + Logger.getLogger(IndexQueueUpdater.class.getName()); + + private ResourceFileLocationDB db = null; + private IndexQueue queue = null; + private UpdateThread thread = null; + private MarkMemoryFile lastMark = null; + private long interval = 120000; + + public void init() { + if(interval > 0) { + thread = new UpdateThread(this,interval); + thread.start(); + } + } + + public void shutdown() { + if(thread != null) { + thread.interrupt(); + try { + thread.join(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + public int updateQueue() throws IOException { + int added = 0; + long lastMarkPoint = lastMark.getLastMark(); + long currentMarkPoint = db.getCurrentMark(); + if(currentMarkPoint > lastMarkPoint) { + // TODO: touchy touchy... need transactions here to not have + // state sync problems if something goes badly in this block.. + // for example, it would be possible to constantly enqueue the + // same files forever.. 
+ CloseableIterator<String> newNames = + db.getNamesBetweenMarks(lastMarkPoint, currentMarkPoint); + while(newNames.hasNext()) { + queue.enqueue(newNames.next()); + added++; + } + newNames.close(); + lastMark.setLastMark(currentMarkPoint); + } + return added; + } + + private class MarkMemoryFile { + private File file = null; + public MarkMemoryFile(File file) { + this.file = file; + } + + public long getLastMark() throws IOException { + long mark = 0; + if(file.isFile() && file.length() > 0) { + BufferedReader ir = new BufferedReader(new FileReader(file)); + String line = ir.readLine(); + if(line != null) { + mark = Long.parseLong(line); + } + } + return mark; + } + + public void setLastMark(long mark) throws IOException { + PrintWriter pw = new PrintWriter(file); + pw.println(mark); + pw.close(); + } + public String getAbsolutePath() { + return file.getAbsolutePath(); + } + } + + private class UpdateThread extends Thread { + private long runInterval = 120000; + private IndexQueueUpdater updater = null; + + public UpdateThread(IndexQueueUpdater updater, + long runInterval) { + + this.updater = updater; + this.runInterval = runInterval; + } + + public void run() { + LOGGER.info("alive"); + long sleepInterval = runInterval; + while (true) { + try { + int updated = updater.updateQueue(); + + if(updated > 0) { + LOGGER.info("Updated " + updated + " files.."); + sleepInterval = runInterval; + } else { + LOGGER.info("Updated ZERO files.."); + sleepInterval += runInterval; + } + sleep(sleepInterval); + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + /** + * @return the db + */ + public ResourceFileLocationDB getDb() { + return db; + } + + /** + * @param db the db to set + */ + public void setDb(ResourceFileLocationDB db) { + this.db = db; + } + + /** + * @return the queue + */ + public IndexQueue getQueue() { + return queue; + } + + /** + * @param queue the queue to set + */ + public void 
setQueue(IndexQueue queue) { + this.queue = queue; + } + + /** + * @return the stateFile + */ + public String getLastMark() { + if(lastMark != null) { + return lastMark.getAbsolutePath(); + } + return null; + } + + /** + * @param stateFile the stateFile to set + * @throws IOException + */ + public void setLastMark(String path) throws IOException { + File tmp = new File(path); + DirMaker.ensureDir(tmp.getParentFile().getAbsolutePath()); + lastMark = new MarkMemoryFile(tmp); + } + + /** + * @return the interval + */ + public long getInterval() { + return interval; + } + + /** + * @param interval the interval to set + */ + public void setInterval(long interval) { + this.interval = interval; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2008-06-24 22:58:51 UTC (rev 2307) @@ -0,0 +1,234 @@ +/* IndexWorker + * + * $Id$ + * + * Created on 2:58:51 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; +import java.util.logging.Logger; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.indexer.IndexClient; +import org.archive.wayback.resourcestore.ArcIndexer; +import org.archive.wayback.resourcestore.WarcIndexer; +import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; +import org.archive.wayback.util.CloseableIterator; +//import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; + +/** + * Simple worker, which gets tasks from an IndexQueue, in the case, the name + * of ARC/WARC files to be indexed, retrieves the ARC/WARC location from a + * ResourceFileLocationDB, creates the index, which is serialized into a file, + * and then hands that file off to a ResourceIndex for merging, using an + * IndexClient. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class IndexWorker { + private static final Logger LOGGER = + Logger.getLogger(IndexWorker.class.getName()); + + public final static String ARC_EXTENSION = ".arc"; + public final static String ARC_GZ_EXTENSION = ".arc.gz"; + public final static String WARC_EXTENSION = ".warc"; + public final static String WARC_GZ_EXTENSION = ".warc.gz"; + + private ArcIndexer arcIndexer = new ArcIndexer(); + private WarcIndexer warcIndexer = new WarcIndexer(); + + private UrlCanonicalizer canonicalizer = new IdentityUrlCanonicalizer(); +// private UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + + private long interval = 120000; + private IndexQueue queue = null; + private ResourceFileLocationDB db = null; + private IndexClient target = null; + private WorkerThread thread = null; + + public void init() { + arcIndexer.setCanonicalizer(canonicalizer); + warcIndexer.setCanonicalizer(canonicalizer); + if(interval > 0) { + thread = new WorkerThread(this,interval); + thread.start(); + } + } + + public void shutdown() { + if(thread != null) { + thread.interrupt(); + try { + thread.join(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + public boolean doWork() throws IOException { + boolean worked = false; + String name = queue.dequeue(); + if(name != null) { + worked = true; + String[] pathsOrUrls = null; + try { + pathsOrUrls = db.nameToUrls(name); + } catch(IOException e) { + LOGGER.severe("FAILED TO LOOKUP(" + name + ")" + + e.getLocalizedMessage()); + return false; + } + try { + if(pathsOrUrls != null) { + for(String pathOrUrl : pathsOrUrls) { + CloseableIterator<SearchResult> itr = indexFile(pathOrUrl); + target.addSearchResults(name, itr); + itr.close(); + break; + } + } + } catch(IOException e) { + LOGGER.severe("FAILED to index or upload (" + name + ")"); + } + } + return worked; + } + + public CloseableIterator<SearchResult> indexFile(String pathOrUrl) + throws 
IOException { + + CloseableIterator<SearchResult> itr = null; + + if(pathOrUrl.endsWith(ARC_EXTENSION)) { + itr = arcIndexer.iterator(pathOrUrl); + } else if(pathOrUrl.endsWith(ARC_GZ_EXTENSION)) { + itr = arcIndexer.iterator(pathOrUrl); + } else if(pathOrUrl.endsWith(WARC_EXTENSION)) { + itr = warcIndexer.iterator(pathOrUrl); + } else if(pathOrUrl.endsWith(WARC_GZ_EXTENSION)) { + itr = warcIndexer.iterator(pathOrUrl); + } + return itr; + } + + + private class WorkerThread extends Thread { + private long runInterval = 120000; + private IndexWorker worker = null; + + public WorkerThread(IndexWorker worker, long runInterval) { + this.worker = worker; + this.runInterval = runInterval; + } + + public void run() { + LOGGER.info("alive."); + long sleepInterval = runInterval; + while (true) { + try { + boolean worked = worker.doWork(); + + if(worked) { + LOGGER.info("Did work, no sleep.."); + sleepInterval = 0; + } else { + LOGGER.info("No Work to do - sleeping.."); + sleepInterval += runInterval; + } + if(sleepInterval > 0) { + sleep(sleepInterval); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + + /** + * @return the interval + */ + public long getInterval() { + return interval; + } + /** + * @param interval the interval to set + */ + public void setInterval(long interval) { + this.interval = interval; + } + /** + * @return the queue + */ + public IndexQueue getQueue() { + return queue; + } + /** + * @param queue the queue to set + */ + public void setQueue(IndexQueue queue) { + this.queue = queue; + } + /** + * @return the db + */ + public ResourceFileLocationDB getDb() { + return db; + } + /** + * @param db the db to set + */ + public void setDb(ResourceFileLocationDB db) { + this.db = db; + } + /** + * @return the target + */ + public IndexClient getTarget() { + return target; + } + /** + * @param target the target to set + */ + public void setTarget(IndexClient target) { + 
this.target = target; + } + /** + * @return the canonicalizer + */ + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + /** + * @param canonicalizer the canonicalizer to set + */ + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2306 http://archive-access.svn.sourceforge.net/archive-access/?rev=2306&view=rev Author: bradtofel Date: 2008-06-24 15:56:31 -0700 (Tue, 24 Jun 2008) Log Message: ----------- INITIAL REV: class which monitors a local directory for files containing updates of ResourceFileSources, and merges them with a ResourceFileLocationDB Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBUpdater.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBUpdater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/locationdb/ResourceFileLocationDBUpdater.java 2008-06-24 22:56:31 UTC (rev 2306) @@ -0,0 +1,228 @@ +/* ResourceFileLocationDBUpdater + * + * $Id$ + * + * Created on 2:26:44 PM Jun 16, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.locationdb; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.resourcestore.resourcefile.ResourceFileList; +import org.archive.wayback.resourcestore.resourcefile.ResourceFileLocation; +import org.archive.wayback.util.DirMaker; + +/** + * Class which performs updates on a ResourceFileLocationDB, based on files + * appearing in a incoming directory. When files are noticed in the "incoming" + * directory, they are assumed to be in the format serialized by + * org.archive.wayback.resourcestore.resourcefile.ResourceFileList + * + * These files are synchronized with the ResourceFileLocationDB, and deleted. + * + * Each file has a logical name, which is assumed to uniquely identify a + * ResourceFileSource. As an optimization, the last state of each + * ResouceFileSource is kept in a file under the "state" directory. + * + * This allows this class to compute a difference of the last state with the + * new files in incoming, and only deltas: new files, removed files, + * and possibly moved files, need to applied to the ResourceFileLocationDB. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFileLocationDBUpdater { + private static final Logger LOGGER = + Logger.getLogger(ResourceFileLocationDBUpdater.class.getName()); + + private ResourceFileLocationDB db = null; + private File stateDir = null; + private File incomingDir = null; + private UpdateThread thread = null; + private long interval = 120000; + + public final static String TMP_SUFFIX = ".TMP"; + + public void init() { + if(interval > 0) { + thread = new UpdateThread(this,interval); + thread.start(); + } + } + + public void shutdown() { + if(thread != null) { + thread.interrupt(); + try { + thread.join(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + public int synchronizeIncoming() throws IOException { + File[] updates = incomingDir.listFiles(); + int updated = 0; + for(File update : updates) { + if(update.getName().endsWith(TMP_SUFFIX)) { + continue; + } + updated++; + synchronize(update); + } + return updated; + } + + public boolean synchronize(File update) throws IOException { + String name = update.getName(); + File current = new File(stateDir,name); + if(!current.isFile()) { + current.createNewFile(); + } + ResourceFileList updateFL = ResourceFileList.load(update); + ResourceFileList currentFL = ResourceFileList.load(current); + + boolean updated = false; + + ResourceFileList removedFiles = currentFL.subtract(updateFL); + ResourceFileList addedFiles = updateFL.subtract(currentFL); + + Iterator<ResourceFileLocation> addedItr = addedFiles.iterator(); + Iterator<ResourceFileLocation> removedItr = removedFiles.iterator(); + while(addedItr.hasNext()) { + updated = true; + ResourceFileLocation location = addedItr.next(); + LOGGER.info("Added " + location.getName() + " " + location.getUrl()); + db.addNameUrl(location.getName(), location.getUrl()); + } + while(removedItr.hasNext()) { + updated = true; + ResourceFileLocation location = removedItr.next(); + LOGGER.info("Removed " + 
location.getName() + " " + location.getUrl()); + db.removeNameUrl(location.getName(), location.getUrl()); + } + if(updated) { + // lastly replace the state file with the new version: + if(!current.delete()) { + throw new IOException("Unable to delete " + + current.getAbsolutePath()); + } + if(!update.renameTo(current)) { + throw new IOException("Unable to rename " + + update.getAbsolutePath() + " to " + + current.getAbsolutePath()); + } + } else { + if(!update.delete()) { + throw new IOException("Unable to delete " + + update.getAbsolutePath()); + } + } + return updated; + } + + private class UpdateThread extends Thread { + private long runInterval = 120000; + private ResourceFileLocationDBUpdater updater = null; + + public UpdateThread(ResourceFileLocationDBUpdater updater, long runInterval) { + this.updater = updater; + this.runInterval = runInterval; + } + public void run() { + LOGGER.info("ResourceFileLocationDBUpdater.UpdateThread is alive."); + long sleepInterval = runInterval; + while (true) { + try { + int updated = updater.synchronizeIncoming(); + + if(updated > 0) { + LOGGER.info("Updated " + updated + " files.."); + sleepInterval = runInterval; + } else { + LOGGER.info("Updated ZERO files.."); + sleepInterval += runInterval; + } + sleep(sleepInterval); + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * @return the db + */ + public ResourceFileLocationDB getDb() { + return db; + } + /** + * @param db the db to set + */ + public void setDb(ResourceFileLocationDB db) { + this.db = db; + } + /** + * @return the stateDir + */ + public String getStateDir() { + return DirMaker.getAbsolutePath(stateDir); + } + /** + * @param stateDir the stateDir to set + * @throws IOException + */ + public void setStateDir(String stateDir) throws IOException { + this.stateDir = DirMaker.ensureDir(stateDir); + } + /** + * @return the incomingDir + */ + public String getIncomingDir() { + 
return DirMaker.getAbsolutePath(incomingDir); + } + /** + * @param incomingDir the incomingDir to set + * @throws IOException + */ + public void setIncomingDir(String incomingDir) throws IOException { + this.incomingDir = DirMaker.ensureDir(incomingDir); + } + /** + * @return the interval + */ + public long getInterval() { + return interval; + } + /** + * @param interval the interval to set + */ + public void setInterval(long interval) { + this.interval = interval; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |