From: <bra...@us...> - 2007-08-07 01:15:24
|
Revision: 1898 http://archive-access.svn.sourceforge.net/archive-access/?rev=1898&view=rev Author: bradtofel Date: 2007-08-06 18:15:26 -0700 (Mon, 06 Aug 2007) Log Message: ----------- REFACTOR: nearly complete rework of ArcIndexer, BDBIndexUpdater to stream everything using AdaptedIterators. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXLineAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2007-08-02 03:02:45 UTC (rev 1897) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -26,7 +26,6 @@ import java.io.IOException; import java.net.URL; -import java.text.ParseException; import java.util.Date; import java.util.logging.Logger; @@ -43,7 +42,8 @@ import org.archive.wayback.exception.LiveDocumentNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.WaybackException; -import org.archive.wayback.resourceindex.indexer.ArcIndexer; +import org.archive.wayback.resourcestore.ARCRecordToSearchResultAdapter; +import org.archive.wayback.util.Adapter; import org.archive.wayback.util.UrlCanonicalizer; /** @@ -61,6 +61,8 @@ private URLCacher cacher = null; private LiveWebLocalResourceIndex index = null; static UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + private static Adapter<ARCRecord,SearchResult> adapter = + new ARCRecordToSearchResultAdapter(); /** * closes all resources (currently unused...) @@ -202,23 +204,15 @@ resource = arcCacheDir.getResource(name, offset); // add the result to the index: ARCRecord record = (ARCRecord) resource.getArcRecord(); - try { - SearchResult result = ArcIndexer.arcRecordToSearchResult(record); - index.addSearchResult(result); - LOGGER.info("Added URL(" + url.toString() + ") in " + - "ARC(" + name + ") at (" + offset + ") to LiveIndex"); + SearchResult result = adapter.adapt(record); + index.addSearchResult(result); + LOGGER.info("Added URL(" + url.toString() + ") in " + + "ARC(" + name + ") at (" + offset + ") to LiveIndex"); - // we just read thru the doc in order to index it. Reset: - resource = arcCacheDir.getResource(name, offset); - - } catch (ParseException e) { - // TODO: This case could be a big problem -- we might be unable - // to store the fact that we have a local copy. That means we - // could be slamming somebody else's site. - e.printStackTrace(); - throw new IOException(e.getLocalizedMessage()); - } + // we just read thru the doc in order to index it. Reset: + resource = arcCacheDir.getResource(name, offset); + } return resource; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java 2007-08-02 03:02:45 UTC (rev 1897) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -29,7 +29,7 @@ import org.archive.wayback.core.SearchResult; import org.archive.wayback.resourceindex.LocalResourceIndex; import org.archive.wayback.resourceindex.bdb.BDBIndex; -import org.archive.wayback.resourceindex.indexer.SearchResultToBDBRecordAdapter; +import org.archive.wayback.resourceindex.bdb.SearchResultToBDBRecordAdapter; import org.archive.wayback.util.AdaptedIterator; /** Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java 2007-08-02 03:02:45 UTC (rev 1897) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndex.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -25,23 +25,23 @@ package org.archive.wayback.resourceindex.bdb; import java.io.BufferedReader; -import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.Iterator; -import org.archive.wayback.WaybackConstants; import org.archive.wayback.bdb.BDBRecord; import org.archive.wayback.bdb.BDBRecordSet; -import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.resourceindex.SearchResultSource; -import org.archive.wayback.resourceindex.indexer.ArcIndexer; +import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.RecordIterator; import com.sleepycat.je.DatabaseException; @@ -133,8 +133,6 @@ String name = args[1]; String op = args[2]; BDBIndex index = new BDBIndex(); - int BATCH_SIZE = 1000; - ArcIndexer indexer = new ArcIndexer(); try { index.initializeDB(path,name); @@ -145,118 +143,75 @@ if(op.compareTo("-r") == 0) { PrintWriter pw = new PrintWriter(System.out); - CaptureSearchResults results = new CaptureSearchResults(); + + CloseableIterator<SearchResult> itrSR = null; + Adapter<SearchResult,String> adapter = + new SearchResultToCDXLineAdapter(); + CloseableIterator<String> itrS; + if(args.length == 4) { String prefix = args[3]; - CloseableIterator<SearchResult> itr = null; try { - itr = index.getPrefixIterator(prefix); + itrSR = index.getPrefixIterator(prefix); } catch (ResourceIndexNotAvailableException e) { e.printStackTrace(); System.exit(1); } - while(itr.hasNext()) { - SearchResult result = (SearchResult) itr.next(); - String urlS = result.get(WaybackConstants.RESULT_URL_KEY); - if(!urlS.startsWith(prefix)) { + itrS = new AdaptedIterator<SearchResult,String>(itrSR,adapter); + while(itrS.hasNext()) { + String line = itrS.next(); + if(!line.startsWith(prefix)) { break; } - results.addSearchResult(result); - if(results.getResultCount() > BATCH_SIZE) { - try { - indexer.serializeResults(results,pw,false); - } catch (IOException e) { - e.printStackTrace(); - System.exit(2); - } - results = new CaptureSearchResults(); - } + pw.println(line); } - if(results.getResultCount() > 0) { - try { - indexer.serializeResults(results,pw,false); - } catch (IOException e) { - e.printStackTrace(); - System.exit(2); - } - } + } else { - CloseableIterator<SearchResult> itr = null; try { - itr = index.getPrefixIterator(" "); + itrSR = index.getPrefixIterator(" "); } catch (ResourceIndexNotAvailableException e) { e.printStackTrace(); System.exit(1); } - while(itr.hasNext()) { - SearchResult result = (SearchResult) itr.next(); - results.addSearchResult(result); - if(results.getResultCount() > BATCH_SIZE) { - try { - indexer.serializeResults(results,pw,false); - } catch (IOException e) { - e.printStackTrace(); - System.exit(2); - } - results = new CaptureSearchResults(); - } + itrS = new AdaptedIterator<SearchResult,String>(itrSR,adapter); + + while(itrS.hasNext()) { + pw.println(itrS.next()); } - if(results.getResultCount() > 0) { - try { - indexer.serializeResults(results,pw,false); - } catch (IOException e) { - e.printStackTrace(); - System.exit(2); - } - } - pw.flush(); - pw.close(); - } - - } else if(op.compareTo("-w") == 0) { - File tmpCDX = null; - int total = 0; - int numInTmp = 0; + try { - tmpCDX = File.createTempFile("reader",".cdx"); - PrintWriter pw = new PrintWriter(tmpCDX); - // need to break the results from STDIN into chunks -- each chunk - // is written to a file, then added to the index. - BufferedReader br = new BufferedReader( - new InputStreamReader(System.in)); - - while(true) { - String line = br.readLine(); - if(line == null) { - break; - } - pw.println(line); - numInTmp++; - total++; - if(numInTmp > BATCH_SIZE) { - pw.flush(); - pw.close(); - index.insertRecords( - indexer.getCDXFileBDBRecordIterator(tmpCDX)); - System.err.println("Wrote " + numInTmp + " to index.."); - pw = new PrintWriter(tmpCDX); - numInTmp = 0; - } - } - if(numInTmp > 0) { - pw.flush(); - pw.close(); - index.insertRecords( - indexer.getCDXFileBDBRecordIterator(tmpCDX)); - System.err.println("Wrote last " + numInTmp + " to index."); - } - tmpCDX.delete(); - System.out.println("Total of " + total + " docs inserted."); + itrS.close(); + itrSR.close(); } catch (IOException e) { + // TODO Auto-generated catch block e.printStackTrace(); - System.exit(1); + System.exit(2); } + pw.flush(); + pw.close(); + + } else if(op.compareTo("-w") == 0) { + + BufferedReader br = new BufferedReader( + new InputStreamReader(System.in)); + + RecordIterator itrS = new RecordIterator(br); + + Adapter<String,SearchResult> adapterStoSR = + new CDXLineToSearchResultAdapter(); + + Iterator<SearchResult> itrSR = + new AdaptedIterator<String,SearchResult>(itrS,adapterStoSR); + + Adapter<SearchResult,BDBRecord> adapterSRtoBDB = + new SearchResultToBDBRecordAdapter(); + + Iterator<BDBRecord> itrBDB = + new AdaptedIterator<SearchResult,BDBRecord>(itrSR, + adapterSRtoBDB); + + index.insertRecords(itrBDB); } else { USAGE(); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java 2007-08-02 03:02:45 UTC (rev 1897) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/BDBIndexUpdater.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -30,8 +30,12 @@ import java.util.logging.Logger; import org.archive.wayback.bdb.BDBRecord; +import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ConfigurationException; -import org.archive.wayback.resourceindex.indexer.ArcIndexer; +import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; +//import org.archive.wayback.resourcestore.ArcIndexer; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.flatfile.FlatFile; /** * Class which starts a background thread that repeatedly scans an incoming @@ -64,7 +68,7 @@ private int runInterval = DEFAULT_RUN_INTERVAL_MS; - private ArcIndexer indexer = new ArcIndexer(); +// private ArcIndexer indexer = new ArcIndexer(); /** * Thread object of update thread -- also is flag indicating if the thread @@ -147,7 +151,14 @@ private boolean mergeFile(File cdxFile) { boolean added = false; try { - Iterator<BDBRecord> it = indexer.getCDXFileBDBRecordIterator(cdxFile); + FlatFile ffile = new FlatFile(cdxFile.getAbsolutePath()); + AdaptedIterator<String,SearchResult> searchResultItr = + new AdaptedIterator<String,SearchResult>( + ffile.getSequentialIterator(), + new CDXLineToSearchResultAdapter()); + Iterator<BDBRecord> it = new AdaptedIterator<SearchResult,BDBRecord> + (searchResultItr,new SearchResultToBDBRecordAdapter()); + index.insertRecords(it); added = true; } catch (IOException e) { Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java (from rev 1889, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/SearchResultToBDBRecordAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/bdb/SearchResultToBDBRecordAdapter.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -0,0 +1,86 @@ +/* SearchResultToBDBRecordAdapter + * + * $Id$ + * + * Created on 5:58:22 PM Mar 13, 2007. + * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-svn; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.bdb; + +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.bdb.BDBRecord; +import org.archive.wayback.bdb.BDBRecordSet; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; + +import com.sleepycat.je.DatabaseEntry; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class SearchResultToBDBRecordAdapter implements + Adapter<SearchResult,BDBRecord> { + + DatabaseEntry key = new DatabaseEntry(); + + DatabaseEntry value = new DatabaseEntry(); + + BDBRecord record = new BDBRecord(key, value); + + private final static String DELIMITER = " "; + + /* + * (non-Javadoc) + * + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public BDBRecord adapt(SearchResult result) { + StringBuilder keySB = new StringBuilder(40); + StringBuilder valSB = new StringBuilder(100); + + + keySB.append(result.get(WaybackConstants.RESULT_URL_KEY)); + keySB.append(DELIMITER); + keySB.append(result.get(WaybackConstants.RESULT_CAPTURE_DATE)); + keySB.append(DELIMITER); + keySB.append(result.get(WaybackConstants.RESULT_OFFSET)); + keySB.append(DELIMITER); + keySB.append(result.get(WaybackConstants.RESULT_ARC_FILE)); + + + valSB.append(result.get(WaybackConstants.RESULT_ORIG_HOST)); + valSB.append(DELIMITER); + valSB.append(result.get(WaybackConstants.RESULT_MIME_TYPE)); + valSB.append(DELIMITER); + valSB.append(result.get(WaybackConstants.RESULT_HTTP_CODE)); + valSB.append(DELIMITER); + valSB.append(result.get(WaybackConstants.RESULT_MD5_DIGEST)); + valSB.append(DELIMITER); + valSB.append(result.get(WaybackConstants.RESULT_REDIRECT_URL)); + + key.setData(BDBRecordSet.stringToBytes(keySB.toString())); + value.setData(BDBRecordSet.stringToBytes(valSB.toString())); + + return record; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXLineAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXLineAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXLineAdapter.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -0,0 +1,71 @@ +/* SearchResultToCDXLineAdapter + * + * $Id$ + * + * Created on 3:22:15 PM Jul 26, 2007. + * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback-core. + * + * wayback-core is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback-core is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-core; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx; + +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class SearchResultToCDXLineAdapter implements +Adapter<SearchResult,String>{ + + private static int DEFAULT_CAPACITY = 120; + private final static String DELIMITER = " "; + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public String adapt(SearchResult result) { + + StringBuilder sb = new StringBuilder(DEFAULT_CAPACITY); + + sb.append(result.get(WaybackConstants.RESULT_URL_KEY)); + sb.append(DELIMITER); + sb.append(result.get(WaybackConstants.RESULT_CAPTURE_DATE)); + sb.append(DELIMITER); + sb.append(result.get(WaybackConstants.RESULT_ORIG_HOST)); + sb.append(DELIMITER); + sb.append(result.get(WaybackConstants.RESULT_MIME_TYPE)); + sb.append(DELIMITER); + sb.append(result.get(WaybackConstants.RESULT_HTTP_CODE)); + sb.append(DELIMITER); + sb.append(result.get(WaybackConstants.RESULT_MD5_DIGEST)); + sb.append(DELIMITER); + sb.append(result.get(WaybackConstants.RESULT_REDIRECT_URL)); + sb.append(DELIMITER); + sb.append(result.get(WaybackConstants.RESULT_OFFSET)); + sb.append(DELIMITER); + sb.append(result.get(WaybackConstants.RESULT_ARC_FILE)); + + return sb.toString(); + } + +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java 2007-08-02 03:02:45 UTC (rev 1897) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/IndexClient.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -24,13 +24,17 @@ */ package org.archive.wayback.resourceindex.indexer; +//import java.io.BufferedOutputStream; +import java.io.File; +//import java.io.FileFilter; import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileFilter; import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; -import java.io.OutputStream; import java.io.PrintWriter; +//import java.io.OutputStream; +//import java.io.PrintWriter; +import java.util.Iterator; import java.util.logging.Logger; import org.apache.commons.httpclient.HttpClient; @@ -38,8 +42,13 @@ import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.InputStreamRequestEntity; import org.apache.commons.httpclient.methods.PutMethod; -import org.archive.wayback.core.SearchResults; -import org.archive.wayback.resourcestore.http.FileLocationDBClient; +//import org.archive.wayback.core.SearchResults; +//import org.archive.wayback.resourcestore.ArcIndexer; +//import org.archive.wayback.resourcestore.http.FileLocationDBClient; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; /** * @@ -51,175 +60,305 @@ private static final Logger LOGGER = Logger.getLogger(IndexClient .class.getName()); - private final static String ARC_SUFFIX = ".arc.gz"; - private final static String CDX_SUFFIX = ".cdx"; +// private final static String ARC_SUFFIX = ".arc"; +// private final static String ARC_GZ_SUFFIX = ".arc.gz"; +// private final static String CDX_SUFFIX = ".cdx"; - private String submitUrl = null; - private HttpClient client = null; - private ArcIndexer indexer = null; - /** - * Create an IndexPipelineClient for adding ARC index information to a - * remote index pipeline. Attempts to create workDir if it does not already - * exist. - * - * @param submitUrl - */ - public IndexClient(final String submitUrl) { - super(); - this.submitUrl = submitUrl; - this.client = new HttpClient(); - this.indexer = new ArcIndexer(); - } + private String target = null; + private File tmpDir = null; +// private String submitUrl = null; + private HttpClient client = new HttpClient(); +// private ArcIndexer indexer = null; + /** - * Inject File argument into the index pipeline specified for this client - * using HTTP PUT - * * @param cdx + * @return true if CDX was added to local or remote index * @throws HttpException * @throws IOException */ - public void uploadCDX(File cdx) throws HttpException, IOException { - String basename = cdx.getName(); - String finalUrl = submitUrl + "/" + basename; - PutMethod method = new PutMethod(finalUrl); - method.setRequestEntity(new InputStreamRequestEntity( - new FileInputStream(cdx))); + public boolean addCDX(File cdx) throws HttpException, IOException { + boolean added = false; + if(target == null) { + throw new IOException("No target set"); + } + String base = cdx.getName(); + if(target.startsWith("http://")) { + String finalUrl = target; + if(target.endsWith("/")) { + finalUrl = target + base; + } else { + finalUrl = target + "/" + base; + } + PutMethod method = new PutMethod(finalUrl); + method.setRequestEntity(new InputStreamRequestEntity( + new FileInputStream(cdx))); - int statusCode = client.executeMethod(method); - if (statusCode != HttpStatus.SC_OK) { - throw new IOException("Method failed: " + method.getStatusLine() - + " for URL " + finalUrl + " on file " - + cdx.getAbsolutePath()); - } - LOGGER.info("Uploaded cdx " + cdx.getAbsolutePath()); + int statusCode = client.executeMethod(method); + if (statusCode == HttpStatus.SC_OK) { + LOGGER.info("Uploaded cdx " + cdx.getAbsolutePath() + " to " + + finalUrl); + if(!cdx.delete()) { + throw new IOException("FAILED delete " + + cdx.getAbsolutePath()); + } + + added = true; + } else { + throw new IOException("Method failed: " + method.getStatusLine() + + " for URL " + finalUrl + " on file " + + cdx.getAbsolutePath()); + } + + } else { + // assume a local directory: + File toBeMergedDir = new File(target); + if(!toBeMergedDir.exists()) { + throw new IOException("Target " + target + " does not exist"); + } + if(!toBeMergedDir.isDirectory()) { + throw new IOException("Target " + target + " is not a dir"); + } + if(!toBeMergedDir.canWrite()) { + throw new IOException("Target " + target + " is not writable"); + } + File toBeMergedFile = new File(toBeMergedDir,base); + if(toBeMergedFile.exists()) { + LOGGER.severe("WARNING: "+toBeMergedFile.getAbsolutePath() + + "already exists!"); + } else { + if(cdx.renameTo(toBeMergedFile)) { + LOGGER.info("Queued " + toBeMergedFile.getAbsolutePath() + + " for merging."); + added = true; + } else { + LOGGER.severe("FAILED rename("+cdx.getAbsolutePath()+ + ") to ("+toBeMergedFile.getAbsolutePath()+")"); + } + } + } + return added; } /** - * Create a CDX file for the arc argument, and add it to the remote - * index pipeline for this client. - * - * @param arc - * @param workDir - * @throws IOException + * @param base + * @param itr + * @return true if data was added to local or remote index + * @throws HttpException + * @throws IOException */ - public void addArcToIndex(File arc,File workDir) throws IOException { - String arcBase = arc.getName(); - if(arcBase.endsWith(ARC_SUFFIX)) { - arcBase = arcBase.substring(0,arcBase.length() - - ARC_SUFFIX.length()); + public boolean addSearchResults(String base, Iterator<SearchResult> itr) + throws HttpException, IOException { + + if(tmpDir == null) { + throw new IOException("No tmpDir argument"); } - String cdxBase = arcBase + CDX_SUFFIX; - File tmpCDX = new File(workDir,cdxBase); - LOGGER.info("Indexing arc " + arc.getAbsolutePath()); - SearchResults results = indexer.indexArc(arc); - indexer.serializeResults(results, tmpCDX); - uploadCDX(tmpCDX); - if(!tmpCDX.delete()) { - throw new IOException("Unable to unlink " + - tmpCDX.getAbsolutePath()); + File tmpFile = new File(tmpDir,base); + if(tmpFile.exists()) { + // TODO: is this safe? + if(!tmpFile.delete()) { + throw new IOException("Unable to remove tmp " + + tmpFile.getAbsolutePath()); + } } + FileOutputStream os = new FileOutputStream(tmpFile); + BufferedOutputStream bos = new BufferedOutputStream(os); + PrintWriter pw = new PrintWriter(bos); + + Adapter<SearchResult,String> adapterSRtoS = + new SearchResultToCDXLineAdapter(); + Iterator<String> itrS = + new AdaptedIterator<SearchResult,String>(itr,adapterSRtoS); + + while(itrS.hasNext()) { + pw.println(itrS.next()); + } + pw.close(); + boolean added = addCDX(tmpFile); + return added; } +// +// /** +// * Inject File argument into the index pipeline specified for this client +// * using HTTP PUT +// * +// * @param cdx +// * @throws HttpException +// * @throws IOException +// */ +// public void uploadCDX(File cdx) throws HttpException, IOException { +// String basename = cdx.getName(); +// String finalUrl = submitUrl + "/" + basename; +// PutMethod method = new PutMethod(finalUrl); +// method.setRequestEntity(new InputStreamRequestEntity( +// new FileInputStream(cdx))); +// +// int statusCode = client.executeMethod(method); +// if (statusCode != HttpStatus.SC_OK) { +// throw new IOException("Method failed: " + method.getStatusLine() +// + " for URL " + finalUrl + " on file " +// + cdx.getAbsolutePath()); +// } +// LOGGER.info("Uploaded cdx " + cdx.getAbsolutePath()); +// } +// +// /** +// * Create a CDX file for the arc argument, and add it to the remote +// * index pipeline for this client. +// * +// * @param arc +// * @param workDir +// * @throws IOException +// */ +// public void addArcToIndex(File arc,File workDir) throws IOException { +// String arcBase = arc.getName(); +// if(arcBase.endsWith(ARC_SUFFIX)) { +// arcBase = arcBase.substring(0,arcBase.length() - +// ARC_SUFFIX.length()); +// } +// String cdxBase = arcBase + CDX_SUFFIX; +// File tmpCDX = new File(workDir,cdxBase); +// LOGGER.info("Indexing arc " + arc.getAbsolutePath()); +// SearchResults results = indexer.indexArc(arc); +// indexer.serializeResults(results, tmpCDX); +// uploadCDX(tmpCDX); +// if(!tmpCDX.delete()) { +// throw new IOException("Unable to unlink " + +// tmpCDX.getAbsolutePath()); +// } +// } +// +// /** +// * @param arc +// * @param os +// * @throws IOException +// */ +// public void dumpArcIndex(File arc, OutputStream os) throws IOException { +// BufferedOutputStream bos = new BufferedOutputStream(os); +// PrintWriter pw = new PrintWriter(bos); +// SearchResults results = indexer.indexArc(arc); +// indexer.serializeResults(results,pw); +// } +// +// /** +// * Index each ARC in directory, upload CDX to the remote pipeline, and +// * poke the remote locationDB to let it know where this ARC can be found. +// * +// * @param directory +// * @param httpPrefix +// * @param locationClient +// * @param workDir +// * @throws IOException +// */ +// public void indexDirectory(File directory, String httpPrefix, +// FileLocationDBClient locationClient, File workDir) +// throws IOException { +// if(!workDir.isDirectory()) { +// if(workDir.exists()) { +// throw new IOException("workDir path " + +// workDir.getAbsolutePath() + " exists but is not a " + +// "directory"); +// } +// if(!workDir.mkdirs()) { +// throw new IOException("Failed to mkdir(" + +// workDir.getAbsolutePath() + ")"); +// } +// } +// +// if(!httpPrefix.endsWith("/")) { +// httpPrefix += "/"; +// } +// +// FileFilter filter = new FileFilter() { +// public boolean accept(File daFile) { +// return daFile.getName().endsWith(ARC_SUFFIX); +// } +// }; +// +// File[] arcs = directory.listFiles(filter); +// if(arcs == null) { +// throw new IOException("Directory " + directory.getAbsolutePath() + +// " is not a directory or had an IO error"); +// } +// for(int i = 0; i < arcs.length; i++) { +// File arc = arcs[i]; +// String arcName = arc.getName(); +// String arcUrl = httpPrefix + arcName; +// addArcToIndex(arc,workDir); +// LOGGER.info("Adding location " + arcUrl + " for arc " + arcName); +// locationClient.addArcUrl(arcName,arcUrl); +// } +// } +// +// /** +// * @param args +// */ +// public static void main(String[] args) { +// if(args.length == 1) { +// File arc = new File(args[0]); +// ArcIndexer indexer = new ArcIndexer(); +// +// BufferedOutputStream bos = new BufferedOutputStream(System.out); +// PrintWriter pw = new PrintWriter(bos); +// SearchResults results; +// try { +// results = indexer.indexArc(arc); +// indexer.serializeResults(results,pw); +// } catch (IOException e) { +// e.printStackTrace(); +// System.exit(1); +// } +// return; +// } else if(args.length != 5) { +// System.err.println("Usage: workDir pipelineUrl locationUrl arcDir arcUrlPrefix"); +// System.err.println("Usage: arcPath"); +// return; +// } +// File workDir = new File(args[0]); +// String pipelineUrl = args[1]; +// String locationUrl = args[2]; +// File arcDir = new File(args[3]); +// String arcDirPrefix = args[4]; +// IndexClient pipeClient; +// FileLocationDBClient locClient = new FileLocationDBClient(locationUrl); +// try { +// pipeClient = new IndexClient(pipelineUrl); +// pipeClient.indexDirectory(arcDir,arcDirPrefix,locClient,workDir); +// } catch (IOException e) { +// e.printStackTrace(); +// System.exit(1); +// } +// } + /** - * @param arc - * @param os - * @throws IOException + * @return the target */ - public void dumpArcIndex(File arc, OutputStream os) throws IOException { - BufferedOutputStream bos = new BufferedOutputStream(os); - PrintWriter pw = new PrintWriter(bos); - SearchResults results = indexer.indexArc(arc); - indexer.serializeResults(results,pw); + public String getTarget() { + return target; } - + /** - * Index each ARC in directory, upload CDX to the remote pipeline, and - * poke the remote locationDB to let it know where this ARC can be found. - * - * @param directory - * @param httpPrefix - * @param locationClient - * @param workDir - * @throws IOException + * @param target the target to set */ - public void indexDirectory(File directory, String httpPrefix, - FileLocationDBClient locationClient, File workDir) - throws IOException { - if(!workDir.isDirectory()) { - if(workDir.exists()) { - throw new IOException("workDir path " + - workDir.getAbsolutePath() + " exists but is not a " + - "directory"); - } - if(!workDir.mkdirs()) { - throw new IOException("Failed to mkdir(" + - workDir.getAbsolutePath() + ")"); - } - } - - if(!httpPrefix.endsWith("/")) { - httpPrefix += "/"; - } - - FileFilter filter = new FileFilter() { - public boolean accept(File daFile) { - return daFile.getName().endsWith(ARC_SUFFIX); - } - }; + public void setTarget(String target) { + this.target = target; + } - File[] arcs = directory.listFiles(filter); - if(arcs == null) { - throw new IOException("Directory " + directory.getAbsolutePath() + - " is not a directory or had an IO error"); + /** + * @return the tmpDir + */ + public String getTmpDir() { + if(tmpDir == null) { + return null; } - for(int i = 0; i < arcs.length; i++) { - File arc = arcs[i]; - String arcName = arc.getName(); - String arcUrl = httpPrefix + arcName; - addArcToIndex(arc,workDir); - LOGGER.info("Adding location " + arcUrl + " for arc " + arcName); - locationClient.addArcUrl(arcName,arcUrl); - } + return tmpDir.getAbsolutePath(); } - + /** - * @param args + * @param tmpDir the tmpDir to set */ - public static void main(String[] args) { - if(args.length == 1) { - File arc = new File(args[0]); - ArcIndexer indexer = new ArcIndexer(); - - BufferedOutputStream bos = new BufferedOutputStream(System.out); - PrintWriter pw = new PrintWriter(bos); - SearchResults results; - try { - results = indexer.indexArc(arc); - indexer.serializeResults(results,pw); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } - return; - } else if(args.length != 5) { - System.err.println("Usage: workDir pipelineUrl locationUrl arcDir arcUrlPrefix"); - System.err.println("Usage: arcPath"); - return; - } - File workDir = new File(args[0]); - String pipelineUrl = args[1]; - String locationUrl = args[2]; - File arcDir = new File(args[3]); - String arcDirPrefix = args[4]; - IndexClient pipeClient; - FileLocationDBClient locClient = new FileLocationDBClient(locationUrl); - try { - pipeClient = new IndexClient(pipelineUrl); - pipeClient.indexDirectory(arcDir,arcDirPrefix,locClient,workDir); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } + public void setTmpDir(String tmpDir) { + this.tmpDir = new File(tmpDir); } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -0,0 +1,164 @@ +/* ArcRecordToSearchResultAdapter + * + * $Id$ + * + * Created on 3:27:03 PM Jul 26, 2007. + * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback-core. + * + * wayback-core is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback-core is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-core; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.URIException; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.UrlCanonicalizer; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ARCRecordToSearchResultAdapter +implements Adapter<ARCRecord,SearchResult>{ + + private static final Logger LOGGER = Logger.getLogger( + ARCRecordToSearchResultAdapter.class.getName()); + + // TODO: make this configurable based on the ResourceIndex + private static UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + +// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) +// throws IOException, ParseException { + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public SearchResult adapt(ARCRecord rec) { + try { + return adaptInner(rec); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + private SearchResult adaptInner(ARCRecord rec) throws IOException { + rec.close(); + ARCRecordMetaData meta = rec.getMetaData(); + + SearchResult result = new SearchResult(); + String arcName = meta.getArc(); + int index = arcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < arcName.length()) { + arcName = arcName.substring(index + 1); + } + result.put(WaybackConstants.RESULT_ARC_FILE, arcName); + result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta + .getOffset())); + + // initialize with default HTTP code... + result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); + + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); + result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); + + String uriStr = meta.getUrl(); + if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { + // skip filedesc record altogether... + return null; + } + if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { + // skip URL + HTTP header processing for dns records... + + String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX + .length()); + result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); + result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); + result.put(WaybackConstants.RESULT_URL, uriStr); + result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + + } else { + + UURI uri = UURIFactory.getInstance(uriStr); + result.put(WaybackConstants.RESULT_URL, uriStr); + + String uriHost = uri.getHost(); + if (uriHost == null) { + LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); + } else { + result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); + + String statusCode = (meta.getStatusCode() == null) ? "-" : meta + .getStatusCode(); + result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); + + String redirectUrl = "-"; + Header[] headers = rec.getHttpHeaders(); + if (headers != null) { + + for (int i = 0; i < headers.length; i++) { + if (headers[i].getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = headers[i].getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + try { + UURI uriRedirect = UURIFactory.getInstance(uri, + locationStr); + redirectUrl = uriRedirect.getEscapedURI(); + + } catch (URIException e) { + LOGGER.info("Bad Location: " + locationStr + + " for " + uriStr + " in " + + meta.getArc() + " Skipped"); + } + break; + } + } + } + result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); + + String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); + result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); + } + + } + return result; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java (from rev 1889, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/indexer/ArcIndexer.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -0,0 +1,432 @@ +/* ArcIndexer + * + * $Id$ + * + * Created on 2:33:29 PM Oct 11, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore; + +//import java.io.BufferedOutputStream; +import java.io.File; +//import java.io.FileOutputStream; +import java.io.IOException; +//import java.io.PrintWriter; +//import java.text.ParseException; +import java.util.Iterator; +//import java.util.logging.Logger; + +//import org.apache.commons.httpclient.Header; +//import org.apache.commons.httpclient.URIException; +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.arc.ARCRecord; +//import org.archive.io.arc.ARCRecordMetaData; +//import org.archive.net.UURI; +//import org.archive.net.UURIFactory; +//import org.archive.wayback.WaybackConstants; +//import org.archive.wayback.bdb.BDBRecord; +//import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.SearchResult; +//import org.archive.wayback.core.SearchResults; +//import org.archive.wayback.resourceindex.bdb.SearchResultToBDBRecordAdapter; +//import org.archive.wayback.resourceindex.cdx.CDXLineToSearchResultAdapter; +//import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; +//import org.archive.wayback.util.UrlCanonicalizer; +//import org.archive.wayback.util.flatfile.FlatFile; + +/** + * Transforms an ARC file into SearchResults, or a serialized SearchResults + * file(CDX). + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ArcIndexer { + + /** + * CDX Header line for these fields. not very configurable.. + */ + public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; + +// /** +// * Logger for this class +// */ +// private static final Logger LOGGER = Logger.getLogger(ArcIndexer.class +// .getName()); + +// /** +// * Constant indicating entire CDX line +// */ +// protected final static int TYPE_CDX_LINE = 0; +// +// /** +// * Constant indicating entire url + timestamp only +// */ +// protected final static int TYPE_CDX_KEY = 1; +// +// /** +// * Constant indicating trailing data fields from CDX line following url + +// * timestamp +// */ +// protected final static int TYPE_CDX_VALUE = 2; + +// static UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + +// private final static int DEFAULT_CAPACITY = 120; + + /** + * @param arc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(File arc) + throws IOException { + ARCReader arcReader = ARCReaderFactory.get(arc); + arcReader.setParseHttpHeaders(true); + + Adapter<ArchiveRecord,ARCRecord> adapter1 = + new ArchiveRecordToARCRecordAdapter(); + + Adapter<ARCRecord,SearchResult> adapter2 = + new ARCRecordToSearchResultAdapter(); + + Iterator<ArchiveRecord> itr1 = arcReader.iterator(); + + CloseableIterator<ARCRecord> itr2 = + new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1); + + return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2); + } + + + private class ArchiveRecordToARCRecordAdapter + implements Adapter<ArchiveRecord,ARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public ARCRecord adapt(ArchiveRecord o) { + ARCRecord rec = null; + if(o instanceof ARCRecord) { + rec = (ARCRecord) o; + } + return rec; + } + + } + +// /** +// * Create a ResourceResults representing the records in ARC file at arcPath. +// * +// * @param arc +// * @return ResourceResults in arcPath. +// * @throws IOException +// */ +// public SearchResults indexArc(File arc) throws IOException { +// CaptureSearchResults results = new CaptureSearchResults(); +// ARCReader arcReader = ARCReaderFactory.get(arc); +// try { +// arcReader.setParseHttpHeaders(true); +// // doh. this does not generate quite the columns we need: +// // arcReader.createCDXIndexFile(arcPath); +// Iterator<ArchiveRecord> itr = arcReader.iterator(); +// while (itr.hasNext()) { +// ARCRecord rec = (ARCRecord) itr.next(); +// SearchResult result; +// try { +// result = arcRecordToSearchResult(rec); +// } catch (NullPointerException e) { +// e.printStackTrace(); +// continue; +// } catch (ParseException e) { +// e.printStackTrace(); +// continue; +// } +// if (result != null) { +// results.addSearchResult(result); +// } +// } +// } finally { +// arcReader.close(); +// } +// return results; +// } + +// /** +// * transform an ARCRecord into a SearchResult +// * +// * @param rec +// * @param arc +// * @return SearchResult for this document +// * @throws IOException +// * @throws ParseException +// */ +// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) +// throws IOException, ParseException { +// rec.close(); +// ARCRecordMetaData meta = rec.getMetaData(); +// +// SearchResult result = new SearchResult(); +// String arcName = meta.getArc(); +// int index = arcName.lastIndexOf(File.separator); +// if (index > 0 && (index + 1) < arcName.length()) { +// arcName = arcName.substring(index + 1); +// } +// result.put(WaybackConstants.RESULT_ARC_FILE, arcName); +// result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta +// .getOffset())); +// +// // initialize with default HTTP code... +// result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); +// +// result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); +// result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); +// result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); +// +// String uriStr = meta.getUrl(); +// if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { +// // skip filedesc record altogether... +// return null; +// } +// if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { +// // skip URL + HTTP header processing for dns records... +// +// String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX +// .length()); +// result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); +// result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); +// result.put(WaybackConstants.RESULT_URL, uriStr); +// result.put(WaybackConstants.RESULT_URL_KEY, uriStr); +// +// } else { +// +// UURI uri = UURIFactory.getInstance(uriStr); +// result.put(WaybackConstants.RESULT_URL, uriStr); +// +// String uriHost = uri.getHost(); +// if (uriHost == null) { +// LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); +// } else { +// result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); +// +// String statusCode = (meta.getStatusCode() == null) ? "-" : meta +// .getStatusCode(); +// result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); +// +// String redirectUrl = "-"; +// Header[] headers = rec.getHttpHeaders(); +// if (headers != null) { +// +// for (int i = 0; i < headers.length; i++) { +// if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { +// String locationStr = headers[i].getValue(); +// // TODO: "Location" is supposed to be absolute: +// // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) +// // (section 14.30) but Content-Location can be +// // relative. +// // is it correct to resolve a relative Location, as +// // we are? +// // it's also possible to have both in the HTTP +// // headers... +// // should we prefer one over the other? +// // right now, we're ignoring "Content-Location" +// try { +// UURI uriRedirect = UURIFactory.getInstance(uri, +// locationStr); +// redirectUrl = uriRedirect.getEscapedURI(); +// +// } catch (URIException e) { +// LOGGER.info("Bad Location: " + locationStr +// + " for " + uriStr + " in " +// + meta.getArc() + " Skipped"); +// } +// break; +// } +// } +// } +// result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); +// +// String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); +// result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); +// } +// +// } +// return result; +// } +// +// /** +// * Write out ResourceResults into CDX file at cdxPath +// * +// * @param results +// * @param target +// * @throws IOException +// */ +// public void serializeResults(final SearchResults results, File target) +// throws IOException { +// +// FileOutputStream os = new FileOutputStream(target); +// BufferedOutputStream bos = new BufferedOutputStream(os); +// PrintWriter pw = new PrintWriter(bos); +// try { +// serializeResults(results, pw); +// } finally { +// pw.close(); +// } +// } +// +// /** +// * @param results +// * @param pw +// * @param addHeader +// * @throws IOException +// */ +// public void serializeResults(final SearchResults results, PrintWriter pw, +// final boolean addHeader) +// throws IOException { +// if(addHeader) { +// pw.println(CDX_HEADER_MAGIC); +// } +// Iterator<SearchResult> itrR = results.iterator(); +// Iterator<String> itrS = new AdaptedIterator<SearchResult,String>(itrR, +// new SearchResultToCDXLineAdapter()); +// while (itrS.hasNext()) { +// pw.println(itrS.next()); +// } +// pw.flush(); +// } +// +// +// /** +// * @param results +// * @param pw +// * @throws IOException +// */ +// public void serializeResults(final SearchResults results, PrintWriter pw) +// throws IOException { +// serializeResults(results,pw,true); +// } + +// /** +// * @param rec +// * @return String in "CDX format" for rec argument +// * @throws IOException +// * @throws ParseException +// */ +// public static String arcRecordToCDXLine(ARCRecord rec) +// throws IOException, ParseException { +// return searchResultToString(arcRecordToSearchResult(rec),TYPE_CDX_LINE); +// } + +// /** +// * Transform a SearchResult into a String representation. +// * +// * @param result +// * @param type +// * @return String value of either line, key or value for the SearchResult +// */ +// protected static String searchResultToString(final SearchResult result, +// int type) { +// +// StringBuilder sb = new StringBuilder(DEFAULT_CAPACITY); +// +// if (type == TYPE_CDX_LINE) { +// +// sb.append(result.get(WaybackConstants.RESULT_URL_KEY)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_CAPTURE_DATE)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_ORIG_HOST)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_MIME_TYPE)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_HTTP_CODE)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_MD5_DIGEST)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_REDIRECT_URL)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_OFFSET)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_ARC_FILE)); +// +// } else if (type == TYPE_CDX_KEY) { +// +// sb.append(result.get(WaybackConstants.RESULT_URL_KEY)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_CAPTURE_DATE)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_OFFSET)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_ARC_FILE)); +// +// } else if (type == TYPE_CDX_VALUE) { +// +// sb.append(result.get(WaybackConstants.RESULT_ORIG_HOST)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_MIME_TYPE)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_HTTP_CODE)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_MD5_DIGEST)); +// sb.append(" "); +// sb.append(result.get(WaybackConstants.RESULT_REDIRECT_URL)); +// +// } else { +// throw new IllegalArgumentException("Unknown transformation type"); +// } +// return sb.toString(); +// } + +// /** +// * @param cdxFile +// * @return Iterator that will return BDBRecords, one for each line in +// * cdxFile argument +// * @throws IOException +// */ +// public Iterator<BDBRecord> getCDXFileBDBRecordIterator(File cdxFile) throws IOException { +// FlatFile ffile = new FlatFile(cdxFile.getAbsolutePath()); +// AdaptedIterator<String,SearchResult> searchResultItr = +// new AdaptedIterator<String,SearchResult>( +// ffile.getSequentialIterator(), +// new CDXLineToSearchResultAdapter()); +// return new AdaptedIterator<SearchResult,BDBRecord>(searchResultItr, +// new SearchResultToBDBRecordAdapter()); +// } + +// /** +// * @param args +// */ +// public static void main(String[] args) { +// ArcIndexer indexer = new ArcIndexer(); +// File arc = new File(args[0]); +// File cdx = new File(args[1]); +// try { +// SearchResults results = indexer.indexArc(arc); +// indexer.serializeResults(results, cdx); +// } catch (Exception e) { +// e.printStackTrace(); +// } +// } +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java 2007-08-02 03:02:45 UTC (rev 1897) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java 2007-08-07 01:15:26 UTC (rev 1898) @@ -28,9 +28,10 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.HashMap; +import java.util.Iterator; import java.util.logging.Logger; -import org.apache.commons.httpclient.HttpException; +//import org.apache.commons.httpclient.HttpException; import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCReaderFactory; @@ -39,11 +40,11 @@ import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.Resource; import org.archive.wayback.core.SearchResult; -import org.archive.wayback.core.SearchResults; +//import org.archive.wayback.core.SearchResults; import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.exception.ResourceNotAvailableException; -import org.archive.wayback.resourceindex.indexer.ArcIndexer; import org.archive.wayback.resourceindex.indexer.IndexClient; +//import org.archive.wayback.util.CloseableIterator; /** * Implements ResourceStore using a local directory of ARC files. @@ -58,10 +59,10 @@ private final static int DEFAULT_RUN_INTERVAL_MS = 10000; private File arcDir = null; - private File tmpDir = null; +// private File tmpDir = null; private File workDir = null; private File queuedDir = null; - private String indexTarget = null; +// private String indexTarget = null; private int runInterval = DEFAULT_RUN_INTERVAL_MS; private IndexClient indexClient = null; private ArcIndexer indexer = new ArcIndexer(); @@ -79,7 +80,7 @@ if(arcDir == null) { throw new ConfigurationException("No arcDir set"); } - if(indexTarget != null) { + if(indexClient != null) { startAutoIndexThread(); } } @@ -158,40 +159,40 @@ } } - private boolean uploadCDX(File cdxFile) { - boolean uploaded = false; - if(indexClient == null) { - // assume we just need to move it to a local directory: - File toBeMergedDir = new File(indexTarget); - File toBeMergedFile = new File(toBeMergedDir,cdxFile.getName()); - if(toBeMergedFile.exists()) { - LOGGER.severe("WARNING: "+toBeMergedFile.getAbsolutePath() + - "already exists!"); - } else { - if(cdxFile.renameTo(toBeMergedFile)) { - LOGGER.info("Queued " + toBeMergedFile.getAbsolutePath() + - " for merging."); - uploaded = true; - } else { - LOGGER.severe("FAILED rename("+cdxFile.getAbsolutePath()+ - ") to ("+toBeMergedFile.getAbsolutePath()+")"); - } - } - } else { - // use indexClient to upload: - try { - indexClient.uploadCDX(cdxFile); - LOGGER.info("Uploaded " + cdxFile.getAbsolutePath()); - uploaded = true; - } catch (HttpException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - } - return uploaded; - } - +// private boolean uploadCDX(File cdxFile) { +// boolean uploaded = false; +// if(indexClient == null) { +// // assume we just need to move it to a local directory: +// File toBeMergedDir = new File(indexTarget); +// File toBeMergedFile = new File(toBeMergedDir,cdxFile.getName()); +// if(toBeMergedFile.exists()) { +// LOGGER.severe("WARNING: "+toBeMergedFile.getAbsolutePath() + +/... [truncated message content] |