From: <bra...@us...> - 2007-07-25 00:33:21
|
Revision: 1870 http://archive-access.svn.sourceforge.net/archive-access/?rev=1870&view=rev Author: bradtofel Date: 2007-07-24 17:33:21 -0700 (Tue, 24 Jul 2007) Log Message: ----------- REFACTOR: removed all references to PropertyConfigurable interface Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-07-25 00:32:35 UTC (rev 1869) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-07-25 00:33:21 UTC (rev 1870) @@ -26,7 +26,6 @@ import java.io.IOException; import java.net.URL; -import java.util.Properties; import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCReader; @@ -34,10 +33,8 @@ import org.archive.io.arc.ARCRecord; import org.archive.wayback.ResourceStore; import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.PropertyConfiguration; import org.archive.wayback.core.Resource; import org.archive.wayback.core.SearchResult; -import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.exception.ResourceNotAvailableException; @@ -51,17 +48,9 @@ * @version $Date$, $Revision$ */ public class HttpARCResourceStore implements ResourceStore { - private static final String ARC_URL_PREFIX = "resourcestore.arcurlprefix"; private String urlPrefix = null; - public void init(Properties p) throws ConfigurationException { - PropertyConfiguration pc = new PropertyConfiguration(p); 
- urlPrefix = pc.getString(ARC_URL_PREFIX); - if(!urlPrefix.endsWith("/")) { - urlPrefix = urlPrefix + "/"; - } - } public Resource retrieveResource(SearchResult result) throws IOException, ResourceNotAvailableException { @@ -99,4 +88,18 @@ } return r; } + + /** + * @return the urlPrefix + */ + public String getUrlPrefix() { + return urlPrefix; + } + + /** + * @param urlPrefix the urlPrefix to set + */ + public void setUrlPrefix(String urlPrefix) { + this.urlPrefix = urlPrefix; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java 2007-07-25 00:32:35 UTC (rev 1869) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalARCResourceStore.java 2007-07-25 00:33:21 UTC (rev 1870) @@ -28,7 +28,6 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.HashMap; -import java.util.Properties; import java.util.logging.Logger; import org.apache.commons.httpclient.HttpException; @@ -38,7 +37,6 @@ import org.archive.io.arc.ARCRecord; import org.archive.wayback.ResourceStore; import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.PropertyConfiguration; import org.archive.wayback.core.Resource; import org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; @@ -58,13 +56,6 @@ Logger.getLogger(LocalARCResourceStore.class.getName()); private final static int DEFAULT_RUN_INTERVAL_MS = 10000; - private static final String RESOURCE_PATH = "resourcestore.arcpath"; - private static final String AUTO_INDEX = "resourcestore.autoindex"; - private static final String TMP_PATH = "resourcestore.tmppath"; - private static final String WORK_PATH = "resourcestore.workpath"; - 
private static final String QUEUED_PATH = "resourcestore.queuedpath"; - private static final String INDEX_TARGET = "resourcestore.indextarget"; - private static final String INDEX_INTERVAL = "resourcestore.indexinterval"; private File arcDir = null; private File tmpDir = null; @@ -81,24 +72,6 @@ */ private static Thread indexThread = null; - public void init(Properties p) throws ConfigurationException { - PropertyConfiguration pc = new PropertyConfiguration(p); - arcDir = pc.getDir(RESOURCE_PATH, true); - String autoIndex = p.getProperty(AUTO_INDEX); - if((autoIndex != null) && (autoIndex.compareTo("1") == 0)) { - tmpDir = pc.getDir(TMP_PATH,true); - workDir = pc.getDir(WORK_PATH,true); - queuedDir = pc.getDir(QUEUED_PATH,true); - indexTarget = pc.getString(INDEX_TARGET); - - if(indexTarget.startsWith("http://")) { - indexClient = new IndexClient(indexTarget); - } - runInterval = pc.getInt(INDEX_INTERVAL,DEFAULT_RUN_INTERVAL_MS); - startAutoIndexThread(); - } - } - /** * @throws ConfigurationException */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-11-28 03:12:59
|
Revision: 2090 http://archive-access.svn.sourceforge.net/archive-access/?rev=2090&view=rev Author: bradtofel Date: 2007-11-27 19:13:01 -0800 (Tue, 27 Nov 2007) Log Message: ----------- INITIAL REV: new LocalResourceStore implementation that allows compressed or uncompressed ARCs and WARCs Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2007-11-28 03:13:01 UTC (rev 2090) @@ -0,0 +1,216 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.indexer.IndexClient; +import org.archive.wayback.util.DirMaker; + +/** + * Thread that repeatedly notices new files in the LocalResourceStore, indexes + * those files, and hands them off to a ResourceIndex via an IndexClient + * + * @author brad + * @version $Date$, $Revision$ + */ +public class AutoIndexThread extends Thread { + private static final Logger LOGGER = + Logger.getLogger(AutoIndexThread.class.getName()); + + private final static int DEFAULT_RUN_INTERVAL_MS = 10000; + private LocalResourceStore store = null; + private File workDir = null; + private File queuedDir = null; + private int runInterval = 
DEFAULT_RUN_INTERVAL_MS; + private IndexClient indexClient = null; + + /** + * @param store + * @param runInterval + */ + public AutoIndexThread() { + super("AutoARCIndexThread"); + super.setDaemon(true); + } + + public void run() { + LOGGER.info("AutoIndexThread is alive."); + int sleepInterval = runInterval; + if(store == null) { + throw new RuntimeException("No LocalResourceStore set"); + } + while (true) { + try { + int numIndexed = indexNewArcs(); + if (numIndexed == 0) { + sleep(sleepInterval); + sleepInterval += runInterval; + } else { + sleepInterval = runInterval; + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + /** + * Scan for new ARC files, and index any new files discovered. + * + * There are 3 main steps, which could be broken into separate threads: + * 1) detect new ARCs + * 2) create CDX files for each new ARC + * 3) upload CDX files to target (or rename to local "incoming" directory) + * + * for now these are sequential. + * + * @return number of ARC files indexed + */ + public int indexNewArcs() { + int numIndexed = 0; + try { + queueNewArcsForIndex(); + } catch (IOException e) { + e.printStackTrace(); + } + try { + numIndexed = indexArcs(10); + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return numIndexed; + } + /** + * Find any new ARC files and queue them for indexing. 
+ * @throws IOException + */ + public void queueNewArcsForIndex() throws IOException { + + // build a HashMap of what has been queued already: + HashMap<String,String> queued = new HashMap<String, String>(); + String entries[] = queuedDir.list(); + if(entries != null) { + for (int i = 0; i < entries.length; i++) { + queued.put(entries[i], "i"); + } + } + // now scan thru arcDir, and make a flag file for anything that was not + // already there: + Iterator<String> files = store.fileNamesIterator(); + if(files != null) { + while(files.hasNext()) { + String fileName = files.next(); + if(!queued.containsKey(fileName)) { + File newQueuedFile = new File(queuedDir,fileName); + File newToBeIndexedFile = new File(workDir,fileName); + newToBeIndexedFile.createNewFile(); + newQueuedFile.createNewFile(); + } + } + } + } + + private String fileNameToBase(final String fileName) { + return fileName; + } + + /** + * Index up to 'max' ARC/WARC files queued for indexing, queueing the + * resulting CDX files for merging with the BDBIndex. 
+ * + * @param indexer + * @param max maximum number to index in this method call, 0 for unlimited + * @return int number of ARC/WARC files indexed + * @throws MalformedURLException + * @throws IOException + */ + public int indexArcs(int max) + throws MalformedURLException, IOException { + + int numIndexed = 0; + String toBeIndexed[] = workDir.list(); + + if (toBeIndexed != null) { + for (int i = 0; i < toBeIndexed.length; i++) { + String fileName = toBeIndexed[i]; + File file = store.getLocalFile(fileName); + if(file != null) { + File workFlagFile = new File(workDir,fileName); + String cdxBase = fileNameToBase(fileName); + + try { + + LOGGER.info("Indexing " + file.getAbsolutePath()); + Iterator<SearchResult> itr = store.indexFile(file); + + if(indexClient.addSearchResults(cdxBase, itr)) { + if (!workFlagFile.delete()) { + throw new IOException("Unable to delete " + + workFlagFile.getAbsolutePath()); + } + } + numIndexed++; + } catch (IOException e) { + LOGGER.severe("FAILED index: " + file.getAbsolutePath() + + " cause: " + e.getLocalizedMessage()); + } + if(max > 0 && (numIndexed >= max)) { + break; + } + } + } + } + return numIndexed; + } + + + + public LocalResourceStore getStore() { + return store; + } + + public void setStore(LocalResourceStore store) { + this.store = store; + } + + public String getWorkDir() { + return workDir == null ? null : workDir.getAbsolutePath(); + } + + public void setWorkDir(String workDir) throws IOException { + this.workDir = DirMaker.ensureDir(workDir); + } + + public String getQueuedDir() { + return queuedDir == null ? 
null : queuedDir.getAbsolutePath(); + } + + public void setQueuedDir(String queuedDir) throws IOException { + this.queuedDir = DirMaker.ensureDir(queuedDir); + } + + public int getRunInterval() { + return runInterval; + } + + public void setRunInterval(int runInterval) { + this.runInterval = runInterval; + } + + public IndexClient getIndexClient() { + return indexClient; + } + + public void setIndexClient(IndexClient indexClient) { + this.indexClient = indexClient; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2007-11-28 03:13:01 UTC (rev 2090) @@ -0,0 +1,142 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ConfigurationException; +import org.archive.wayback.exception.ResourceNotAvailableException; + +/** + * Class which implements a local ARC, WARC, ARC.gz, WARC.gz, ResourceStore + * including an optional automatic indexing thread + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LocalResourceStore implements ResourceStore { + + private File dataDir = null; + private AutoIndexThread indexThread = null; + + private ArcIndexer arcIndexer = new ArcIndexer(); + private WarcIndexer warcIndexer = new WarcIndexer(); + public final static String ARC_EXTENSION = 
".arc"; + public final static String ARC_GZ_EXTENSION = ".arc.gz"; + public final static String WARC_EXTENSION = ".warc"; + public final static String WARC_GZ_EXTENSION = ".warc.gz"; + public final static String OPEN_EXTENSION = ".open"; + private final static String[] SUFFIXES = { + "", ARC_EXTENSION, ARC_GZ_EXTENSION, WARC_EXTENSION, WARC_GZ_EXTENSION + }; + private FilenameFilter filter = new ArcWarcFilenameFilter(); + + public void init() throws ConfigurationException { + if(indexThread != null) { + indexThread.setStore(this); + indexThread.start(); + } + } + protected String resultToFileName(SearchResult result) { + return result.get(WaybackConstants.RESULT_ARC_FILE); + } + + protected long resultToOffset(SearchResult result) { + return Long.parseLong(result.get(WaybackConstants.RESULT_OFFSET)); + } + + public File getLocalFile(String fileName) { + // try adding suffixes: empty string is first in the list + File file = null; + for(String suffix : SUFFIXES) { + file = new File(dataDir,fileName + suffix); + if(file.exists() && file.canRead()) { + return file; + } + } + // this might work if the full path is in the index... + file = new File(fileName); + if(file.exists() && file.canRead()) { + return file; + } + // doh. + return null; + } + + public Resource retrieveResource(SearchResult result) throws IOException, + ResourceNotAvailableException { + String fileName = resultToFileName(result); + long offset = resultToOffset(result); + File file = getLocalFile(fileName); + if (file == null) { + + // TODO: this needs to be prettied up for end user consumption.. 
+ throw new ResourceNotAvailableException("Cannot find ARC file (" + + fileName + ")"); + } else { + + Resource r = ResourceFactory.getResource(file, offset); + return r; + } + } + + public Iterator<SearchResult> indexFile(File dataFile) throws IOException { + Iterator<SearchResult> itr = null; + + String name = dataFile.getName(); + if(name.endsWith(ARC_EXTENSION)) { + itr = arcIndexer.iterator(dataFile); + } else if(name.endsWith(ARC_GZ_EXTENSION)) { + itr = arcIndexer.iterator(dataFile); + } else if(name.endsWith(WARC_EXTENSION)) { + itr = warcIndexer.iterator(dataFile); + } else if(name.endsWith(WARC_GZ_EXTENSION)) { + itr = warcIndexer.iterator(dataFile); + } + return itr; + } + + public Iterator<String> fileNamesIterator() throws IOException { + if(dataDir != null) { + String[] files = dataDir.list(filter); + List<String> l = Arrays.asList(files); + return l.iterator(); + } + return null; + } + + public File getDataDir() { + return dataDir; + } + + public void setDataDir(File dataDir) { + this.dataDir = dataDir; + } + + private class ArcWarcFilenameFilter implements FilenameFilter { + public boolean accept(File dir, String name) { + File tmp = new File(dir,name); + if(tmp.isFile() && tmp.canRead()) { + return name.endsWith(ARC_EXTENSION) || + name.endsWith(ARC_GZ_EXTENSION) || + name.endsWith(WARC_GZ_EXTENSION) || + name.endsWith(WARC_EXTENSION); + } + return false; + } + } + + public AutoIndexThread getIndexThread() { + return indexThread; + } + public void setIndexThread(AutoIndexThread indexThread) { + this.indexThread = indexThread; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-11 22:31:22
|
Revision: 2107 http://archive-access.svn.sourceforge.net/archive-access/?rev=2107&view=rev Author: bradtofel Date: 2007-12-11 14:31:26 -0800 (Tue, 11 Dec 2007) Log Message: ----------- RENAME: HttpARCResourceStore => Http11ResourceStore Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java (from rev 2105, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2007-12-11 22:31:26 UTC (rev 2107) @@ -0,0 +1,95 @@ +/* HttpARCResourceStore + * + * $Id$ + * + * Created on 5:29:56 PM Oct 12, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore; + +import java.io.IOException; +import java.net.URL; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ResourceNotAvailableException; + + +/** + * Implements ResourceStore where ARCs are accessed via HTTP 1.1 range requests. + * All ARC files are assumed to be "rooted" at a particular HTTP URL, within + * a single directory, implying an ARC file reverse-proxy to connect through + * to actual HTTP ARC locations. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class Http11ResourceStore implements ResourceStore { + + private String urlPrefix = null; + + + public Resource retrieveResource(SearchResult result) throws IOException, + ResourceNotAvailableException { + + // extract ARC filename + add .arc.gz if it is not present + String arcName = result.get(WaybackConstants.RESULT_ARC_FILE); + if(arcName == null || arcName.length() < 1) { + throw new IOException("No ARC/WARC name in search result..."); + } + + // extract ARC offset + convert to long + final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); + if(offsetString == null || offsetString.length() < 1) { + throw new IOException("No ARC/WARC offset in search result..."); + } + final long offset = Long.parseLong(offsetString); + + String arcUrl = urlPrefix + arcName; + Resource r = null; + try { + + r = ResourceFactory.getResource(new URL(arcUrl), offset); + + } catch (IOException e) { + + e.printStackTrace(); + throw new ResourceNotAvailableException("Unable to retrieve", + e.getLocalizedMessage()); + } + return r; + } + + /** + * @return the urlPrefix + */ + 
public String getUrlPrefix() { + return urlPrefix; + } + + /** + * @param urlPrefix the urlPrefix to set + */ + public void setUrlPrefix(String urlPrefix) { + this.urlPrefix = urlPrefix; + } +} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-12-11 22:28:41 UTC (rev 2106) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/HttpARCResourceStore.java 2007-12-11 22:31:26 UTC (rev 2107) @@ -1,95 +0,0 @@ -/* HttpARCResourceStore - * - * $Id$ - * - * Created on 5:29:56 PM Oct 12, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.net.URL; - -import org.archive.wayback.ResourceStore; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.Resource; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.exception.ResourceNotAvailableException; - - -/** - * Implements ResourceStore where ARCs are accessed via HTTP 1.1 range requests. - * All ARC files are assumed to be "rooted" at a particular HTTP URL, within - * a single directory, implying an ARC file reverse-proxy to connect through - * to actual HTTP ARC locations. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class HttpARCResourceStore implements ResourceStore { - - private String urlPrefix = null; - - - public Resource retrieveResource(SearchResult result) throws IOException, - ResourceNotAvailableException { - - // extract ARC filename + add .arc.gz if it is not present - String arcName = result.get(WaybackConstants.RESULT_ARC_FILE); - if(arcName == null || arcName.length() < 1) { - throw new IOException("No ARC/WARC name in search result..."); - } - - // extract ARC offset + convert to long - final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); - if(offsetString == null || offsetString.length() < 1) { - throw new IOException("No ARC/WARC offset in search result..."); - } - final long offset = Long.parseLong(offsetString); - - String arcUrl = urlPrefix + arcName; - Resource r = null; - try { - - r = ResourceFactory.getResource(new URL(arcUrl), offset); - - } catch (IOException e) { - - e.printStackTrace(); - throw new ResourceNotAvailableException("Unable to retrieve", - e.getLocalizedMessage()); - } - return r; - } - - /** - * @return the urlPrefix - */ - 
public String getUrlPrefix() { - return urlPrefix; - } - - /** - * @param urlPrefix the urlPrefix to set - */ - public void setUrlPrefix(String urlPrefix) { - this.urlPrefix = urlPrefix; - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-03-01 01:59:20
|
Revision: 2209 http://archive-access.svn.sourceforge.net/archive-access/?rev=2209&view=rev Author: bradtofel Date: 2008-02-29 17:59:24 -0800 (Fri, 29 Feb 2008) Log Message: ----------- BUGFIX (ACC-12): LocalResourceStore now returns a CloseableIterator to AutoIndexThread, which now calls close after indexing. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-02-27 11:28:06 UTC (rev 2208) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-03-01 01:59:24 UTC (rev 2209) @@ -77,7 +77,8 @@ new ARCRecordToSearchResultAdapter(); adapter2.setCanonicalizer(canonicalizer); - Iterator<ArchiveRecord> itr1 = arcReader.iterator(); + ArchiveReaderCloseableIterator itr1 = + new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator()); CloseableIterator<ARCRecord> itr2 = new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1); Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java 2008-03-01 01:59:24 UTC (rev 2209) @@ -0,0 +1,29 @@ +package org.archive.wayback.resourcestore; + +import java.io.IOException; +import java.util.Iterator; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; +import org.archive.wayback.util.CloseableIterator; + +public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> { + private ArchiveReader reader = null; + private Iterator<ArchiveRecord> itr = null; + public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) { + this.reader = reader; + this.itr = itr; + } + public boolean hasNext() { + return itr.hasNext(); + } + public ArchiveRecord next() { + return itr.next(); + } + public void remove() { + itr.remove(); + } + public void close() throws IOException { + reader.close(); + } +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-02-27 11:28:06 UTC (rev 2208) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-03-01 01:59:24 UTC (rev 2209) @@ -9,6 +9,7 @@ import org.archive.wayback.core.SearchResult; import org.archive.wayback.resourceindex.indexer.IndexClient; +import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.DirMaker; /** @@ -150,7 +151,7 @@ try { LOGGER.info("Indexing " + file.getAbsolutePath()); - 
Iterator<SearchResult> itr = store.indexFile(file); + CloseableIterator<SearchResult> itr = store.indexFile(file); if(indexClient.addSearchResults(cdxBase, itr)) { if (!workFlagFile.delete()) { @@ -158,6 +159,7 @@ + workFlagFile.getAbsolutePath()); } } + itr.close(); numIndexed++; } catch (IOException e) { LOGGER.severe("FAILED index: " + file.getAbsolutePath() Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-02-27 11:28:06 UTC (rev 2208) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-03-01 01:59:24 UTC (rev 2209) @@ -13,6 +13,7 @@ import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.DirMaker; /** @@ -88,8 +89,8 @@ } } - public Iterator<SearchResult> indexFile(File dataFile) throws IOException { - Iterator<SearchResult> itr = null; + public CloseableIterator<SearchResult> indexFile(File dataFile) throws IOException { + CloseableIterator<SearchResult> itr = null; String name = dataFile.getName(); if(name.endsWith(ARC_EXTENSION)) { @@ -140,4 +141,7 @@ public void setIndexThread(AutoIndexThread indexThread) { this.indexThread = indexThread; } + public void shutdown() throws IOException { + // no-op. 
could shut down threads + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-02-27 11:28:06 UTC (rev 2208) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-03-01 01:59:24 UTC (rev 2209) @@ -46,10 +46,11 @@ WARCReader reader = WARCReaderFactory.get(warc); - Iterator<ArchiveRecord> itr1 = reader.iterator(); + ArchiveReaderCloseableIterator itr1 = + new ArchiveReaderCloseableIterator(reader,reader.iterator()); - CloseableIterator<WARCRecord> itr2 = new AdaptedIterator<ArchiveRecord, WARCRecord>( - itr1, adapter1); + CloseableIterator<WARCRecord> itr2 = + new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1); return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-05 20:35:00
|
Revision: 2280 http://archive-access.svn.sourceforge.net/archive-access/?rev=2280&view=rev Author: bradtofel Date: 2008-06-05 13:34:57 -0700 (Thu, 05 Jun 2008) Log Message: ----------- FEATURE: added method to return iterator from a pathOrUrl (String) Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-06-04 00:08:01 UTC (rev 2279) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-06-05 20:34:57 UTC (rev 2280) @@ -59,7 +59,7 @@ public ArcIndexer() { canonicalizer = new AggressiveUrlCanonicalizer(); } - + /** * @param arc * @return Iterator of SearchResults for input arc File @@ -67,7 +67,26 @@ */ public CloseableIterator<SearchResult> iterator(File arc) throws IOException { - ARCReader arcReader = ARCReaderFactory.get(arc); + return iterator(ARCReaderFactory.get(arc)); + } + + /** + * @param pathOrUrl + * @return Iterator of SearchResults for input pathOrUrl + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(String pathOrUrl) + throws IOException { + return iterator(ARCReaderFactory.get(pathOrUrl)); + } + + /** + * @param arcReader + * @return Iterator of SearchResults for input ARCReader + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(ARCReader arcReader) + throws IOException { arcReader.setParseHttpHeaders(true); Adapter<ArchiveRecord,ARCRecord> adapter1 = Modified: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-06-04 00:08:01 UTC (rev 2279) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-06-05 20:34:57 UTC (rev 2280) @@ -31,21 +31,37 @@ } /** - * @param arc + * @param warc * @return Iterator of SearchResults for input arc File * @throws IOException */ public CloseableIterator<SearchResult> iterator(File warc) throws IOException { + return iterator(WARCReaderFactory.get(warc)); + } + /** + * @param pathOrUrl + * @return Iterator of SearchResults for input pathOrUrl + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(String pathOrUrl) + throws IOException { + return iterator(WARCReaderFactory.get(pathOrUrl)); + } + /** + * @param arc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(WARCReader reader) + throws IOException { Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); WARCRecordToSearchResultAdapter adapter2 = new WARCRecordToSearchResultAdapter(); adapter2.setCanonicalizer(canonicalizer); - - WARCReader reader = WARCReaderFactory.get(warc); - + ArchiveReaderCloseableIterator itr1 = new ArchiveReaderCloseableIterator(reader,reader.iterator()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-24 22:55:27
|
Revision: 2305 http://archive-access.svn.sourceforge.net/archive-access/?rev=2305&view=rev Author: bradtofel Date: 2008-06-24 15:55:35 -0700 (Tue, 24 Jun 2008) Log Message: ----------- INITIAL REV: ResourceFile abstraction, including ResourceFileSource interface, which will allow recursive local directories, polling of local and remote HTTP exported directories Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileLocation.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,50 @@ +/* ArcWarcFilenameFilter + * + * $Id$ + * + * Created on 4:15:56 PM May 29, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.FilenameFilter; + +/** + * FilenameFilter which returns only compressed/uncompressed ARC/WARC files. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class ArcWarcFilenameFilter implements FilenameFilter { + private final static String ARC_SUFFIX = ".arc"; + private final static String ARC_GZ_SUFFIX = ".arc.gz"; + private final static String WARC_SUFFIX = ".warc"; + private final static String WARC_GZ_SUFFIX = ".warc.gz"; + + public boolean accept(File dir, String name) { + return name.endsWith(ARC_SUFFIX) || + name.endsWith(ARC_GZ_SUFFIX) || + name.endsWith(WARC_SUFFIX) || + name.endsWith(WARC_GZ_SUFFIX); + } + +} + Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,144 @@ +/* DirectoryResourceFileSource + * + * $Id$ + * + * Created on 4:00:49 PM May 29, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Local directory tree holding ARC and WARC files. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class DirectoryResourceFileSource implements ResourceFileSource { + + private static char SEPRTR = '_'; + private String name = null; + private String path = null; + private File root = null; + private FilenameFilter filter = new ArcWarcFilenameFilter(); + private boolean recurse = true; + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getFileList() + */ + public ResourceFileList getResourceFileList() throws IOException { + if(root == null) { + throw new IOException("No prefix set"); + } + ResourceFileList list = new ResourceFileList(); + populateFileList(list,root,recurse); + return list; + } + + /** + * add all files matching this.filter beneath root to list, recursing if + * recurse is set. 
+ * + * @param list + * @param root + * @param recurse + * @throws IOException + */ + private void populateFileList(ResourceFileList list, File root, boolean recurse) + throws IOException { + + File[] files = root.listFiles(); + for(File file : files) { + if(file.isFile() && filter.accept(root, file.getName())) { + ResourceFileLocation location = new ResourceFileLocation( + file.getName(),file.getAbsolutePath()); + list.add(location); + } else if(recurse && file.isDirectory()){ + populateFileList(list, file, recurse); + } + } + } + + public String getBasename(String path) { + int sepIdx = path.lastIndexOf(File.separatorChar); + if(sepIdx != -1) { + return path.substring(sepIdx + 1); + } + return path; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getName() + */ + public String getName() { + if(name != null) { + return name; + } + if(root != null) { + return root.getAbsolutePath().replace(File.separatorChar, SEPRTR); + } + return null; + } + + public void setName(String name) { + this.name = name; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getPrefix() + */ + public String getPrefix() { + return path; + } + public void setPrefix(String path) { + this.path = path; + root = new File(path); + } + + public boolean isRecurse() { + return recurse; + } + + public void setRecurse(boolean recurse) { + this.recurse = recurse; + } + + public FilenameFilter getFilter() { + return filter; + } + + public void setFilter(FilenameFilter filter) { + this.filter = filter; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getSources() + */ + public List<ResourceFileSource> getSources() { + List<ResourceFileSource> sources = new ArrayList<ResourceFileSource>(); + sources.add(this); + return sources; + } +} Added: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,116 @@ +/* JspUrlResourceFileSource + * + * $Id$ + * + * Created on 5:05:53 PM Jun 5, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class JspUrlResourceFileSource implements ResourceFileSource { + + private final static char WEB_SEPARATOR_CHAR = '/'; + private final static String LINE_SEPARATOR_STRING = "\n"; + private String name = null; + private String prefix = null; + private String jsp = null; + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getBasename(java.lang.String) + */ + public String getBasename(String path) { + int sepIdx = path.lastIndexOf(WEB_SEPARATOR_CHAR); + if(sepIdx != -1) { + return path.substring(sepIdx + 1); + } + return path; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getFileList() + */ + public ResourceFileList getResourceFileList() throws IOException { + + String url = "http://localhost:8080" + jsp + "?url=" + prefix; + URL u = new URL(url); + InputStream is = u.openStream(); + InputStreamReader isr = new InputStreamReader(is); + StringBuilder sb = new StringBuilder(2000); + int READ_SIZE = 2048; + char cbuf[] = new char[READ_SIZE]; + int amt = 0; + while((amt = isr.read(cbuf, 0, READ_SIZE)) != -1) { + sb.append(new String(cbuf,0,amt)); + } + ResourceFileList list = new ResourceFileList(); + String lines[] = sb.toString().split(LINE_SEPARATOR_STRING); + for(String line : lines) { + ResourceFileLocation location = + ResourceFileLocation.deserializeLine(line); + if(location != null) { + list.add(location); + } else { + throw new IOException("Bad line format(" + line +")"); + } + } + return list; + } + + /* 
(non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getName() + */ + public String getName() { + return name; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getPrefix() + */ + public String getPrefix() { + return prefix; + } + + public void setName(String name) { + this.name = name; + } + + public void setPrefix(String prefix) { + this.prefix = prefix; + } + + public String getJsp() { + return jsp; + } + + public void setJsp(String jsp) { + this.jsp = jsp; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,119 @@ +/* ResourceFileList + * + * $Id$ + * + * Created on 12:15:53 PM Jun 16, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFileList { + private static final Logger LOGGER = + Logger.getLogger(ResourceFileList.class.getName()); + + private HashMap<String,ResourceFileLocation> files = + new HashMap<String,ResourceFileLocation>(); + public void add(ResourceFileLocation location) { + files.put(location.serializeLine(), location); + } + public void addAll(Iterator<ResourceFileLocation> itr) { + while(itr.hasNext()) { + add(itr.next()); + } + } + + public Iterator<ResourceFileLocation> iterator() { + return files.values().iterator(); + } + + public void store(File target) throws IOException { + FlatFile ff = new FlatFile(target.getAbsolutePath()); + Iterator<String> adapted = + new AdaptedIterator<ResourceFileLocation,String>(iterator(), + new ResourceFileLocationAdapter()); + ff.store(adapted); + } + + public static ResourceFileList load(File source) throws IOException { + ResourceFileList list = new ResourceFileList(); + + FlatFile ff = new FlatFile(source.getAbsolutePath()); + CloseableIterator<String> itr = ff.getSequentialIterator(); + while(itr.hasNext()) { + String line = itr.next(); + ResourceFileLocation location = + ResourceFileLocation.deserializeLine(line); + if(location != null) { + list.add(location); + } else { + LOGGER.warning("Bad parse of line(" + line + ") in (" + + 
source.getAbsolutePath() + ")"); + } + } + itr.close(); + return list; + } + + public ResourceFileList subtract(ResourceFileList that) { + HashMap<String,ResourceFileLocation> tmp = + new HashMap<String,ResourceFileLocation>(); + Iterator<ResourceFileLocation> thisItr = iterator(); + while(thisItr.hasNext()) { + ResourceFileLocation location = thisItr.next(); + tmp.put(location.serializeLine(), location); + } + + Iterator<ResourceFileLocation> thatItr = that.iterator(); + while(thatItr.hasNext()) { + ResourceFileLocation location = thatItr.next(); + tmp.remove(location.serializeLine()); + } + ResourceFileList sub = new ResourceFileList(); + sub.addAll(tmp.values().iterator()); + return sub; + } + + private class ResourceFileLocationAdapter implements Adapter<ResourceFileLocation,String> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public String adapt(ResourceFileLocation o) { + return o.serializeLine(); + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileLocation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileLocation.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileLocation.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,80 @@ +/* ResourceFileLocation + * + * $Id$ + * + * Created on 12:16:04 PM Jun 16, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +/** + * Class encapsulating the name and String location(url/path) of a ResourceFile. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFileLocation { + private final static char DELIMETER = '\t'; + private String name = null; + private String url = null; + public ResourceFileLocation(String name, String url) { + this.name = name; + this.url = url; + } + public String serializeLine() { + StringBuilder sb = new StringBuilder(100); + sb.append(name); + sb.append(DELIMETER); + sb.append(url); + return sb.toString(); + } + public static ResourceFileLocation deserializeLine(String line) { + int idx = line.indexOf(DELIMETER); + if(idx > -1) { + return new ResourceFileLocation(line.substring(0,idx), + line.substring(idx+1)); + } + return null; + } + /** + * @return the name + */ + public String getName() { + return name; + } + /** + * @param name the name to set + */ + public void setName(String name) { + this.name = name; + } + /** + * @return the url + */ + public String getUrl() { + return url; + } + /** + * @param url the url to set + */ + public void setUrl(String url) { + this.url = url; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSource.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSource.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,41 @@ +/* ResourceFileSource + * + * $Id$ + * + * Created on 3:49:17 PM May 29, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.IOException; + +/** + * Interface representing the abstract remote or local folder holding ARC/WARC + * files. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public interface ResourceFileSource { + public String getName(); + public String getPrefix(); + public String getBasename(String path); + public ResourceFileList getResourceFileList() throws IOException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,162 @@ +/* ResourceFileSourceUpdater + * + * $Id$ + * + * Created on 12:30:38 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.logging.Logger; + +import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBUpdater; +import org.archive.wayback.util.DirMaker; + +/** + * Class which repeatedly builds a ResourceFileList for a set of + * ResourceFileSource objects, serializing them into files, and dropping them + * into the incoming directory of a ResourceFileLocationDBUpdater. + * + * In the current implementation, this uses only a single thread to scan the + * ResourceFileSource objects, but with larger installations (1000's of + * ResourceFileSources), multiple threads may later be required. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFileSourceUpdater { + private static final Logger LOGGER = + Logger.getLogger(ResourceFileSourceUpdater.class.getName()); + private List<ResourceFileSource> sources = null; + + private File target = null; + + + private UpdateThread thread = null; + private long interval = 120000; + + public void init() { + if(interval > 0) { + thread = new UpdateThread(this,interval); + thread.start(); + } + } + + public void shutdown() { + if(thread != null) { + thread.interrupt(); + try { + thread.join(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + private void synchronizeSource(ResourceFileSource source) { + String name = source.getName(); + try { + LOGGER.fine("Synchronizing " + name); + ResourceFileList list = source.getResourceFileList(); + String tmp = name + ResourceFileLocationDBUpdater.TMP_SUFFIX; + File tmpListTarget = new File(target,tmp); + File listTarget = new File(target,name); + list.store(tmpListTarget); + 
tmpListTarget.renameTo(listTarget); + LOGGER.fine("Synchronized " + name); + } catch (IOException e) { + e.printStackTrace(); + LOGGER.warning("FAILED Synchronize " + name + e.getMessage()); + } + } + + public void synchronizeSources() { + for(ResourceFileSource source : sources) { + synchronizeSource(source); + } + } + + private class UpdateThread extends Thread { + private long runInterval = 120000; + private ResourceFileSourceUpdater updater = null; + + public UpdateThread(ResourceFileSourceUpdater updater, + long runInterval) { + + this.updater = updater; + this.runInterval = runInterval; + } + + public void run() { + LOGGER.info("alive"); + while (true) { + try { + long startSync = System.currentTimeMillis(); + updater.synchronizeSources(); + long endSync = System.currentTimeMillis(); + long syncDuration = endSync - startSync; + long sleepInterval = runInterval - syncDuration; + if(sleepInterval > 0) { + sleep(sleepInterval); + } else { + LOGGER.warning("Last Synchronize took " + syncDuration + + " where interval is " + interval + + ". 
Not sleeping."); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + } + + public List<ResourceFileSource> getSources() { + return sources; + } + + public void setSources(List<ResourceFileSource> sources) { + this.sources = sources; + } + + public String getTarget() { + return DirMaker.getAbsolutePath(target); + } + + public void setTarget(String target) throws IOException { + this.target = DirMaker.ensureDir(target); + } + + /** + * @return the interval + */ + public long getInterval() { + return interval; + } + + /** + * @param interval the interval to set + */ + public void setInterval(long interval) { + this.interval = interval; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,105 @@ +/* UrlLinkExtractor + * + * $Id$ + * + * Created on 4:26:53 PM Jun 5, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class UrlLinkExtractor { + private final static String QUOTED_ATTR_VALUE = "(?:\"[^\">]*\")"; + + private final static String ESC_QUOTED_ATTR_VALUE = "(?:\\\\\"[^>\\\\]*\\\\\")"; + + private final static String APOSED_ATTR_VALUE = "(?:'[^'>]*')"; + + private final static String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)"; + + + private final static String ANY_ATTR_VALUE = QUOTED_ATTR_VALUE + "|" + + APOSED_ATTR_VALUE + "|" + ESC_QUOTED_ATTR_VALUE + "|" + + RAW_ATTR_VALUE; + + private final static String tagName = "a"; + private final static String attrName = "href"; + + private final static String tagPatString = "<\\s*" + tagName + + "\\s+[^>]*\\b" + attrName + + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; + + private final static Pattern pc = Pattern.compile(tagPatString, + Pattern.CASE_INSENSITIVE); + + public static List<String> extractLinks(final String url) throws IOException { + URL u = new URL(url); + InputStream is = u.openStream(); + InputStreamReader isr = new InputStreamReader(is); + StringBuilder sb = new StringBuilder(2000); + int READ_SIZE = 2048; + char cbuf[] = new char[READ_SIZE]; + int amt = 0; + while((amt = isr.read(cbuf, 0, READ_SIZE)) != -1) { + sb.append(new String(cbuf,0,amt)); + } + return extractAnchors(sb); + } + + private static List<String> extractAnchors(final StringBuilder sb) { + + Matcher m = pc.matcher(sb); + + ArrayList<String> anchors = new 
ArrayList<String>(); + int idx = 0; + while(m.find(idx)) { + anchors.add(trimAttr(m.group(1))); + idx = m.end(1); + } + return anchors; + } + + private static String trimAttr(final String attr) { + int attrLength = attr.length(); + if (attr.charAt(0) == '"') { + return attr.substring(1, attrLength - 1); + } else if (attr.charAt(0) == '\'') { + return attr.substring(1, attrLength - 1); + } else if (attr.charAt(0) == '\\') { + return attr.substring(2, attrLength - 2); + } + return attr; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-24 22:58:45
|
Revision: 2307 http://archive-access.svn.sourceforge.net/archive-access/?rev=2307&view=rev Author: bradtofel Date: 2008-06-24 15:58:51 -0700 (Tue, 24 Jun 2008) Log Message: ----------- INITIAL REV: classes which: * monitor a ResourceFileLocationDB's log * create events when new files are noticed * add those new files to a queue of files needing indexing * monitor the index queue, performing indexing when needed * push new index data to a local or remote ResourceIndex Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/DirectoryIndexQueue.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueue.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/DirectoryIndexQueue.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/DirectoryIndexQueue.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/DirectoryIndexQueue.java 2008-06-24 22:58:51 UTC (rev 2307) @@ -0,0 +1,95 @@ +/* DirectoryIndexQueue + * + * $Id$ + * + * Created on 2:29:10 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; + +import org.archive.wayback.util.DirMaker; + +/** + * Simple queue implementation, which uses a directory containing empty files + * to indicate the presence of items in a queue (set in this case...) 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class DirectoryIndexQueue implements IndexQueue { + private File path = null; + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.indexer.IndexQueue#dequeue() + * + * Walks the names reported by File.list() for the queue directory (no + * ordering is applied here), deletes the first one that is a regular + * file and returns its name. Returns null when no regular file is found. + * Throws IOException when that file cannot be deleted. + * NOTE(review): File.list() returns null when the path is not a readable + * directory, which would make the for loop throw NullPointerException; + * the same happens if setPath was never called. The exception message + * opens a parenthesis that is never closed. + */ + public String dequeue() throws IOException { + String[] names = path.list(); + for(String name : names) { + File tmp = new File(path,name); + if(tmp.isFile()) { + if(tmp.delete()) { + return name; + } else { + throw new IOException("Unable to dequeue/delete (" + + tmp.getAbsolutePath()); + } + } + } + return null; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.indexer.IndexQueue#enqueue(java.lang.String) + * + * Creates an empty file named resourceFileName inside the queue + * directory unless a regular file of that name already exists, so + * enqueueing the same name twice leaves a single entry. The boolean + * returned by createNewFile() is ignored. + */ + public void enqueue(String resourceFileName) throws IOException { + File tmp = new File(path,resourceFileName); + if(!tmp.isFile()) { + tmp.createNewFile(); + } + } + + /** + * @return the absolute path of the queue directory, or null when no + * path has been set + */ + public String getPath() { + if(path != null) { + return path.getAbsolutePath(); + } + return null; + } + + /** + * @param path the queue directory; the File returned by + * DirMaker.ensureDir(path) is what gets stored (presumably ensureDir + * creates the directory when it is missing - TODO confirm) + * @throws IOException propagated from DirMaker.ensureDir + */ + public void setPath(String path) throws IOException { + this.path = DirMaker.ensureDir(path); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.indexer.IndexQueue#recordStatus(java.lang.String, int) + * + * No-op in this revision: the body is empty, so the status is discarded. + */ + public void recordStatus(String resourceFileName, int status) { + + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueue.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueue.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueue.java 2008-06-24 22:58:51 UTC (rev 2307) @@ -0,0 +1,42 @@ +/* IndexQueue + * + * $Id$ + * + * Created on 2:05:12 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive.
+ * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; + +/** + * Contract for a queue of resource file names waiting to be indexed. + * Names are added with enqueue and taken out with dequeue; the + * implementation and the caller in this commit both treat a null return + * from dequeue as meaning the queue is empty. recordStatus takes a name + * plus an int status. + * NOTE(review): the STATUS_* constants are presumably the values meant + * for that status argument, but nothing in this file says so and nothing + * in this commit calls recordStatus - confirm. + * + * @author brad + * @version $Date$, $Revision$ + */ +public interface IndexQueue { + public final static int STATUS_DONE = 0; + public final static int STATUS_FAIL = 1; + public final static int STATUS_RETRY = 2; + public void enqueue(String resourceFileName) throws IOException; + public String dequeue() throws IOException; + public void recordStatus(String resourceFileName, int status); +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexQueueUpdater.java 2008-06-24 22:58:51 UTC (rev 2307) @@ -0,0 +1,221 @@ +/* IndexQueueUpdater + * + * $Id$ + * + * Created on 2:02:54 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback.
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.logging.Logger; + +import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.DirMaker; + +/** + * This class polls a ResourceFileLocationDB repeatedly, to notice new files + * arriving in the DB. Whenever new files are noticed, they are added to the + * Index Queue. + * + * It uses a local file to store the last known "mark" of the location DB. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class IndexQueueUpdater { + + private static final Logger LOGGER = + Logger.getLogger(IndexQueueUpdater.class.getName()); + + private ResourceFileLocationDB db = null; + private IndexQueue queue = null; + private UpdateThread thread = null; + private MarkMemoryFile lastMark = null; + private long interval = 120000; + + /** + * Starts the background UpdateThread, which calls updateQueue() in a + * loop. Nothing is started when interval is zero or negative. + */ + public void init() { + if(interval > 0) { + thread = new UpdateThread(this,interval); + thread.start(); + } + } + + /** + * Interrupts the update thread, if one was started, and waits up to + * 1000 ms for it to finish. + * NOTE(review): UpdateThread.run() catches InterruptedException inside + * its while(true) loop and carries on, so this interrupt does not look + * like it can actually end the thread - confirm. + */ + public void shutdown() { + if(thread != null) { + thread.interrupt(); + try { + thread.join(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + /** + * Copies every name the location DB reports between the stored mark + * and the current DB mark into the queue, then stores the current mark. + * Does nothing when the current mark is not greater than the stored one. + * + * @return the number of names enqueued by this call + * @throws IOException propagated from the calls made here + * NOTE(review): db, queue and lastMark are dereferenced without null + * checks, and newNames is not closed when enqueue() throws. + */ + public int updateQueue() throws IOException { + int added = 0; + long lastMarkPoint = lastMark.getLastMark(); + long currentMarkPoint = db.getCurrentMark(); + if(currentMarkPoint > lastMarkPoint) { + // TODO: touchy touchy... need transactions here to not have + // state sync problems if something goes badly in this block.. + // for example, it would be possible to constantly enqueue the + // same files forever..
+ CloseableIterator<String> newNames = + db.getNamesBetweenMarks(lastMarkPoint, currentMarkPoint); + while(newNames.hasNext()) { + queue.enqueue(newNames.next()); + added++; + } + newNames.close(); + lastMark.setLastMark(currentMarkPoint); + } + return added; + } + + /** + * Keeps one long value, the mark, as the first line of a text file. + */ + private class MarkMemoryFile { + private File file = null; + public MarkMemoryFile(File file) { + this.file = file; + } + + /** + * @return the long parsed from the first line of the file, or 0 when + * the file is not an existing regular file, is empty, or yields no + * first line + * @throws IOException if reading fails + * NOTE(review): the BufferedReader is never closed, and a first line + * that is not a number makes Long.parseLong throw the unchecked + * NumberFormatException. + */ + public long getLastMark() throws IOException { + long mark = 0; + if(file.isFile() && file.length() > 0) { + BufferedReader ir = new BufferedReader(new FileReader(file)); + String line = ir.readLine(); + if(line != null) { + mark = Long.parseLong(line); + } + } + return mark; + } + + /** + * Overwrites the file with the mark followed by a line terminator. + */ + public void setLastMark(long mark) throws IOException { + PrintWriter pw = new PrintWriter(file); + pw.println(mark); + pw.close(); + } + public String getAbsolutePath() { + return file.getAbsolutePath(); + } + } + + /** + * Calls updater.updateQueue() in an endless loop. After a pass that + * enqueued something the thread sleeps runInterval ms; every pass that + * enqueued nothing adds another runInterval to the sleep, with no upper + * bound, so the first empty pass already sleeps twice runInterval. + * NOTE(review): when updateQueue() throws IOException the sleep is + * skipped and the loop retries at once; InterruptedException is caught + * and the loop continues, so the thread has no exit path. + */ + private class UpdateThread extends Thread { + private long runInterval = 120000; + private IndexQueueUpdater updater = null; + + public UpdateThread(IndexQueueUpdater updater, + long runInterval) { + + this.updater = updater; + this.runInterval = runInterval; + } + + public void run() { + LOGGER.info("alive"); + long sleepInterval = runInterval; + while (true) { + try { + int updated = updater.updateQueue(); + + if(updated > 0) { + LOGGER.info("Updated " + updated + " files.."); + sleepInterval = runInterval; + } else { + LOGGER.info("Updated ZERO files.."); + sleepInterval += runInterval; + } + sleep(sleepInterval); + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + /** + * @return the db + */ + public ResourceFileLocationDB getDb() { + return db; + } + + /** + * @param db the db to set + */ + public void setDb(ResourceFileLocationDB db) { + this.db = db; + } + + /** + * @return the queue + */ + public IndexQueue getQueue() { + return queue; + } + + /** + * @param queue the queue to set + */ + public void
setQueue(IndexQueue queue) { + this.queue = queue; + } + + /** + * @return the absolute path of the file holding the last mark, or null + * when no file has been set + */ + public String getLastMark() { + if(lastMark != null) { + return lastMark.getAbsolutePath(); + } + return null; + } + + /** + * @param path path of the file used to store the last mark; the + * absolute path of its parent is passed to DirMaker.ensureDir first + * (presumably to create the directory when missing - TODO confirm) + * @throws IOException propagated from DirMaker.ensureDir + * NOTE(review): File.getParentFile() returns null for a path with no + * parent component, which would throw NullPointerException here. + */ + public void setLastMark(String path) throws IOException { + File tmp = new File(path); + DirMaker.ensureDir(tmp.getParentFile().getAbsolutePath()); + lastMark = new MarkMemoryFile(tmp); + } + + /** + * @return the interval + */ + public long getInterval() { + return interval; + } + + /** + * @param interval the interval to set + */ + public void setInterval(long interval) { + this.interval = interval; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2008-06-24 22:58:51 UTC (rev 2307) @@ -0,0 +1,234 @@ +/* IndexWorker + * + * $Id$ + * + * Created on 2:58:51 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details.
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; +import java.util.logging.Logger; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.indexer.IndexClient; +import org.archive.wayback.resourcestore.ArcIndexer; +import org.archive.wayback.resourcestore.WarcIndexer; +import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; +import org.archive.wayback.util.CloseableIterator; +//import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; + +/** + * Simple worker, which gets tasks from an IndexQueue, in this case, the names + * of ARC/WARC files to be indexed, retrieves the ARC/WARC location from a + * ResourceFileLocationDB, creates the index, which is serialized into a file, + * and then hands that file off to a ResourceIndex for merging, using an + * IndexClient.
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class IndexWorker { + private static final Logger LOGGER = + Logger.getLogger(IndexWorker.class.getName()); + + public final static String ARC_EXTENSION = ".arc"; + public final static String ARC_GZ_EXTENSION = ".arc.gz"; + public final static String WARC_EXTENSION = ".warc"; + public final static String WARC_GZ_EXTENSION = ".warc.gz"; + + private ArcIndexer arcIndexer = new ArcIndexer(); + private WarcIndexer warcIndexer = new WarcIndexer(); + + private UrlCanonicalizer canonicalizer = new IdentityUrlCanonicalizer(); +// private UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + + private long interval = 120000; + private IndexQueue queue = null; + private ResourceFileLocationDB db = null; + private IndexClient target = null; + private WorkerThread thread = null; + + /** + * Hands the current canonicalizer to both indexers, then starts the + * background WorkerThread when interval is greater than zero. + */ + public void init() { + arcIndexer.setCanonicalizer(canonicalizer); + warcIndexer.setCanonicalizer(canonicalizer); + if(interval > 0) { + thread = new WorkerThread(this,interval); + thread.start(); + } + } + + /** + * Interrupts the worker thread, if one was started, and waits up to + * 1000 ms for it to finish. + * NOTE(review): WorkerThread.run() catches InterruptedException inside + * its while(true) loop and carries on, so this interrupt does not look + * like it can actually end the thread - confirm. + */ + public void shutdown() { + if(thread != null) { + thread.interrupt(); + try { + thread.join(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + /** + * Takes one name from the queue, looks up its locations with + * db.nameToUrls, indexes the first location only (the for loop breaks + * after one iteration) and passes the results to the target client. + * + * @return true when a name was dequeued and the lookup did not throw, + * even if indexing or uploading then failed; false when the queue was + * empty or the lookup threw IOException + * @throws IOException propagated from queue.dequeue() + * NOTE(review): the name has already left the queue when lookup or + * indexing fails; nothing here re-enqueues it or calls recordStatus. + * indexFile returns null for an unrecognised suffix, in which case + * null is passed to addSearchResults and itr.close() would throw + * NullPointerException. itr is not closed when addSearchResults throws. + */ + public boolean doWork() throws IOException { + boolean worked = false; + String name = queue.dequeue(); + if(name != null) { + worked = true; + String[] pathsOrUrls = null; + try { + pathsOrUrls = db.nameToUrls(name); + } catch(IOException e) { + LOGGER.severe("FAILED TO LOOKUP(" + name + ")" + + e.getLocalizedMessage()); + return false; + } + try { + if(pathsOrUrls != null) { + for(String pathOrUrl : pathsOrUrls) { + CloseableIterator<SearchResult> itr = indexFile(pathOrUrl); + target.addSearchResults(name, itr); + itr.close(); + break; + } + } + } catch(IOException e) { + LOGGER.severe("FAILED to index or upload (" + name + ")"); + } + } + return worked; + } + + /** + * Chooses an indexer from the suffix of pathOrUrl: the ARC indexer for + * the .arc and .arc.gz suffixes, the WARC indexer for .warc and .warc.gz. + * The comparison is case sensitive (String.endsWith). + * + * @param pathOrUrl location of the file to index + * @return the iterator produced by the chosen indexer, or null when the + * suffix is none of the four + * @throws IOException propagated from the indexer + */ + public CloseableIterator<SearchResult> indexFile(String pathOrUrl) + throws
IOException { + + CloseableIterator<SearchResult> itr = null; + + if(pathOrUrl.endsWith(ARC_EXTENSION)) { + itr = arcIndexer.iterator(pathOrUrl); + } else if(pathOrUrl.endsWith(ARC_GZ_EXTENSION)) { + itr = arcIndexer.iterator(pathOrUrl); + } else if(pathOrUrl.endsWith(WARC_EXTENSION)) { + itr = warcIndexer.iterator(pathOrUrl); + } else if(pathOrUrl.endsWith(WARC_GZ_EXTENSION)) { + itr = warcIndexer.iterator(pathOrUrl); + } + return itr; + } + + + /** + * Calls worker.doWork() in an endless loop. A pass that did work is + * followed immediately by the next pass; every pass that did nothing + * adds runInterval to the sleep time and then sleeps, with no upper + * bound. The sleep time starts at runInterval, so an idle first pass + * sleeps twice runInterval, while the first idle pass after work sleeps + * runInterval. + * NOTE(review): when doWork() throws IOException the sleep is skipped + * and the loop retries at once; InterruptedException is caught and the + * loop continues, so the thread has no exit path. + */ + private class WorkerThread extends Thread { + private long runInterval = 120000; + private IndexWorker worker = null; + + public WorkerThread(IndexWorker worker, long runInterval) { + this.worker = worker; + this.runInterval = runInterval; + } + + public void run() { + LOGGER.info("alive."); + long sleepInterval = runInterval; + while (true) { + try { + boolean worked = worker.doWork(); + + if(worked) { + LOGGER.info("Did work, no sleep.."); + sleepInterval = 0; + } else { + LOGGER.info("No Work to do - sleeping.."); + sleepInterval += runInterval; + } + if(sleepInterval > 0) { + sleep(sleepInterval); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + + /** + * @return the interval + */ + public long getInterval() { + return interval; + } + /** + * @param interval the interval to set + */ + public void setInterval(long interval) { + this.interval = interval; + } + /** + * @return the queue + */ + public IndexQueue getQueue() { + return queue; + } + /** + * @param queue the queue to set + */ + public void setQueue(IndexQueue queue) { + this.queue = queue; + } + /** + * @return the db + */ + public ResourceFileLocationDB getDb() { + return db; + } + /** + * @param db the db to set + */ + public void setDb(ResourceFileLocationDB db) { + this.db = db; + } + /** + * @return the target + */ + public IndexClient getTarget() { + return target; + } + /** + * @param target the target to set + */ + public void setTarget(IndexClient target) { +
this.target = target; + } + /** + * @return the canonicalizer + */ + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + /** + * @param canonicalizer the canonicalizer to set; this only stores the + * value, and it is init() that hands it to the ARC and WARC indexers, + * so a call made after init() does not reach them + */ + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-25 01:30:12
|
Revision: 2321 http://archive-access.svn.sourceforge.net/archive-access/?rev=2321&view=rev Author: bradtofel Date: 2008-06-24 18:30:18 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moved indexing related code into indexer package Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2008-06-25 00:32:57 UTC (rev 2320) 
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,173 +0,0 @@ -/* ArcRecordToSearchResultAdapter - * - * $Id$ - * - * Created on 3:27:03 PM Jul 26, 2007. - * - * Copyright (C) 2007 Internet Archive. - * - * This file is part of wayback-core. - * - * wayback-core is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback-core is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback-core; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.URIException; -import org.archive.io.arc.ARCRecord; -import org.archive.io.arc.ARCRecordMetaData; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class ARCRecordToSearchResultAdapter -implements Adapter<ARCRecord,SearchResult>{ - - private static final Logger LOGGER = Logger.getLogger( - ARCRecordToSearchResultAdapter.class.getName()); - - private 
UrlCanonicalizer canonicalizer = null; - - public ARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } -// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) -// throws IOException, ParseException { - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public SearchResult adapt(ARCRecord rec) { - try { - return adaptInner(rec); - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } - - private SearchResult adaptInner(ARCRecord rec) throws IOException { - rec.close(); - ARCRecordMetaData meta = rec.getMetaData(); - - SearchResult result = new SearchResult(); - String arcName = meta.getArc(); - int index = arcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < arcName.length()) { - arcName = arcName.substring(index + 1); - } - result.put(WaybackConstants.RESULT_ARC_FILE, arcName); - result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta - .getOffset())); - - // initialize with default HTTP code... - result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); - - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); - result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); - result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); - - String uriStr = meta.getUrl(); - if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { - // skip filedesc record altogether... - return null; - } - if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { - // skip URL + HTTP header processing for dns records... 
- - String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX - .length()); - result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); - result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); - result.put(WaybackConstants.RESULT_URL, uriStr); - result.put(WaybackConstants.RESULT_URL_KEY, uriStr); - - } else { - - UURI uri = UURIFactory.getInstance(uriStr); - result.put(WaybackConstants.RESULT_URL, uriStr); - - String uriHost = uri.getHost(); - if (uriHost == null) { - LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); - } else { - result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); - - String statusCode = (meta.getStatusCode() == null) ? "-" : meta - .getStatusCode(); - result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); - - String redirectUrl = "-"; - Header[] headers = rec.getHttpHeaders(); - if (headers != null) { - - for (int i = 0; i < headers.length; i++) { - if (headers[i].getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = headers[i].getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? 
- // right now, we're ignoring "Content-Location" - try { - UURI uriRedirect = UURIFactory.getInstance(uri, - locationStr); - redirectUrl = uriRedirect.getEscapedURI(); - - } catch (URIException e) { - LOGGER.info("Bad Location: " + locationStr - + " for " + uriStr + " in " - + meta.getArc() + " Skipped"); - } - break; - } - } - } - result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); - - String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); - result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); - } - - } - return result; - } - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,175 +0,0 @@ -/* ArcIndexer - * - * $Id$ - * - * Created on 2:33:29 PM Oct 11, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.PrintWriter; -import java.io.IOException; -import java.util.Iterator; - -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; - -/** - * Transforms an ARC file into Iterator<SearchResult>. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class ArcIndexer { - - /** - * CDX Header line for these fields. not very configurable.. 
- */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - private UrlCanonicalizer canonicalizer = null; - - public ArcIndexer() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /** - * @param arc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(File arc) - throws IOException { - return iterator(ARCReaderFactory.get(arc)); - } - - /** - * @param pathOrUrl - * @return Iterator of SearchResults for input pathOrUrl - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(String pathOrUrl) - throws IOException { - return iterator(ARCReaderFactory.get(pathOrUrl)); - } - - /** - * @param arcReader - * @return Iterator of SearchResults for input ARCReader - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(ARCReader arcReader) - throws IOException { - arcReader.setParseHttpHeaders(true); - - Adapter<ArchiveRecord,ARCRecord> adapter1 = - new ArchiveRecordToARCRecordAdapter(); - - ARCRecordToSearchResultAdapter adapter2 = - new ARCRecordToSearchResultAdapter(); - adapter2.setCanonicalizer(canonicalizer); - - ArchiveReaderCloseableIterator itr1 = - new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator()); - - CloseableIterator<ARCRecord> itr2 = - new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1); - - return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2); - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("arc-indexer [-identity] ARCFILE"); - System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); - 
System.err.println("With -identity, perform no url canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - ArcIndexer indexer = new ArcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if(args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if(args.length == (idx + 1)) { - pw = new PrintWriter(args[idx]); - } else { - USAGE(); - } - Iterator<SearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while(lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - - private class ArchiveRecordToARCRecordAdapter - implements Adapter<ArchiveRecord,ARCRecord> { - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public ARCRecord adapt(ArchiveRecord o) { - ARCRecord rec = null; - if(o instanceof ARCRecord) { - rec = (ARCRecord) o; - } - return rec; - } - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,29 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.util.Iterator; - -import org.archive.io.ArchiveReader; -import org.archive.io.ArchiveRecord; -import 
org.archive.wayback.util.CloseableIterator; - -public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> { - private ArchiveReader reader = null; - private Iterator<ArchiveRecord> itr = null; - public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) { - this.reader = reader; - this.itr = itr; - } - public boolean hasNext() { - return itr.hasNext(); - } - public ArchiveRecord next() { - return itr.next(); - } - public void remove() { - itr.remove(); - } - public void close() throws IOException { - reader.close(); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-06-25 00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,318 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.URIException; -import org.apache.commons.httpclient.util.EncodingUtil; -import org.archive.io.ArchiveRecordHeader; -import org.archive.io.RecoverableIOException; -import org.archive.io.arc.ARCConstants; -import org.archive.io.warc.WARCConstants; -import org.archive.io.warc.WARCRecord; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.SearchResult; -import 
org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; - -/** - * Adapts certain WARCRecords into SearchResults. DNS and response records are - * mostly straightforward, but SearchResult objects generated from revisit - * records contain lots of "placeholder" fields, which are expected to be - * understood by later processes traversing a stream of SearchResult objects. - * - * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class WARCRecordToSearchResultAdapter -implements Adapter<WARCRecord,SearchResult>{ - - private final static String DEFAULT_VALUE = "-"; - private final static String SEARCH_FIELDS[] = { - WaybackConstants.RESULT_URL, - WaybackConstants.RESULT_URL_KEY, - WaybackConstants.RESULT_ORIG_HOST, - WaybackConstants.RESULT_CAPTURE_DATE, - WaybackConstants.RESULT_MD5_DIGEST, - WaybackConstants.RESULT_MIME_TYPE, - WaybackConstants.RESULT_HTTP_CODE, - WaybackConstants.RESULT_REDIRECT_URL, - WaybackConstants.RESULT_ARC_FILE, - WaybackConstants.RESULT_OFFSET, - }; - - private static final Logger LOGGER = Logger.getLogger( - WARCRecordToSearchResultAdapter.class.getName()); - - private UrlCanonicalizer canonicalizer = null; - - public WARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public SearchResult adapt(WARCRecord rec) { - try { - return adaptInner(rec); - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } - - /* - * Transform input date to 14-digit timestamp: - * 2007-08-29T18:00:26Z => 20070829180026 - */ - private static String transformDate(final String input) { - - StringBuilder output = new StringBuilder(14); - - output.append(input.substring(0,4)); - output.append(input.substring(5,7)); - output.append(input.substring(8,10)); - 
output.append(input.substring(11,13)); - output.append(input.substring(14,16)); - output.append(input.substring(17,19)); - - return output.toString(); - } - - private static String transformHTTPMime(final String input) { - int semiIdx = input.indexOf(";"); - if(semiIdx > 0) { - return input.substring(0,semiIdx).trim(); - } - return input.trim(); - } - - private String transformWarcFilename(String readerIdentifier) { - String warcName = readerIdentifier; - int index = warcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < warcName.length()) { - warcName = warcName.substring(index + 1); - } - return warcName; - } - - private String transformDigest(final Object o) { - if(o == null) { - return DEFAULT_VALUE; - } - String orig = o.toString(); - if(orig.startsWith("sha1:")) { - return orig.substring(5); - } - return orig; - } - - private SearchResult getBlankSearchResult() { - SearchResult result = new SearchResult(); - for(String field : SEARCH_FIELDS) { - result.put(field, DEFAULT_VALUE); - } - return result; - } - - private UURI addUrlDataToSearchResult(SearchResult result, String urlStr) - throws IOException { - - result.put(WaybackConstants.RESULT_URL, urlStr); - result.put(WaybackConstants.RESULT_URL_KEY, urlStr); - - - UURI uri = UURIFactory.getInstance(urlStr); - String uriHost = uri.getHost(); - if (uriHost == null) { - - LOGGER.info("No host in " + urlStr); - - } else { - - result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); - } - - String urlKey = canonicalizer.urlStringToKey(urlStr); - result.put(WaybackConstants.RESULT_URL_KEY, urlKey); - - return uri; - } - - private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_ARC_FILE, - transformWarcFilename(header.getReaderIdentifier())); - result.put(WaybackConstants.RESULT_OFFSET, - 
String.valueOf(header.getOffset())); - - String uriStr = header.getUrl(); - - String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX - .length()); - result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype()); - - result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); - result.put(WaybackConstants.RESULT_URL, uriStr); - result.put(WaybackConstants.RESULT_URL_KEY, uriStr); - - rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); - - return result; - } - - private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_MD5_DIGEST, - transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - addUrlDataToSearchResult(result,header.getUrl()); - - return result; - } - - /** - * borrowed(copied) from org.archive.io.arc.ARCRecord... - * - * @param bytes Array of bytes to examine for an EOL. - * @return Count of end-of-line characters or zero if none. 
- */ - private int getEolCharsCount(byte [] bytes) { - int count = 0; - if (bytes != null && bytes.length >=1 && - bytes[bytes.length - 1] == '\n') { - count++; - if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { - count++; - } - } - return count; - } - - private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - SearchResult result = getBlankSearchResult(); - - result.put(WaybackConstants.RESULT_CAPTURE_DATE, - transformDate(header.getDate())); - result.put(WaybackConstants.RESULT_ARC_FILE, - transformWarcFilename(header.getReaderIdentifier())); - result.put(WaybackConstants.RESULT_OFFSET, - String.valueOf(header.getOffset())); - - String origUrl = header.getUrl(); - UURI uri = addUrlDataToSearchResult(result,origUrl); - - // need to parse the documents HTTP message and headers here: WARCReader - // does not implement this... yet.. - - byte [] statusBytes = HttpParser.readRawLine(rec); - int eolCharCount = getEolCharsCount(statusBytes); - if (eolCharCount <= 0) { - throw new RecoverableIOException("Failed to read http status where one " + - " was expected: " + new String(statusBytes)); - } - String statusLine = EncodingUtil.getString(statusBytes, 0, - statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); - if ((statusLine == null) || - !StatusLine.startsWithHTTP(statusLine)) { - throw new RecoverableIOException("Failed parse of http status line."); - } - StatusLine status = new StatusLine(statusLine); - result.put(WaybackConstants.RESULT_HTTP_CODE, - String.valueOf(status.getStatusCode())); - - Header[] headers = HttpParser.parseHeaders(rec, - ARCConstants.DEFAULT_ENCODING); - - rec.close(); - result.put(WaybackConstants.RESULT_MD5_DIGEST, - transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - if (headers != null) { - - for (Header httpHeader : headers) { - if (httpHeader.getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = 
httpHeader.getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - try { - UURI uriRedirect = UURIFactory.getInstance(uri, - locationStr); - result.put(WaybackConstants.RESULT_REDIRECT_URL, - uriRedirect.getEscapedURI()); - } catch (URIException e) { - LOGGER.info("Bad Location: " + locationStr - + " for " + origUrl + " in " - + header.getReaderIdentifier() + " Skipped"); - } - } else if(httpHeader.getName().toLowerCase().equals("content-type")) { - result.put(WaybackConstants.RESULT_MIME_TYPE, - transformHTTPMime(httpHeader.getValue())); - } - } - } - return result; - } - - private SearchResult adaptInner(WARCRecord rec) throws IOException { - - SearchResult result = null; - ArchiveRecordHeader header = rec.getHeader(); - String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); - if(type.equals(WARCConstants.RESPONSE)) { - String mime = header.getMimetype(); - if(mime.equals("text/dns")) { - result = adaptDNS(header,rec); - } else { - result = adaptResponse(header,rec); - } - } else if(type.equals(WARCConstants.REVISIT)) { - result = adaptRevisit(header,rec); - } - - return result; - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-06-25 
00:32:57 UTC (rev 2320) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -1,140 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.Iterator; - -import org.archive.io.ArchiveRecord; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCReaderFactory; -import org.archive.io.warc.WARCRecord; -import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; - -public class WarcIndexer { - - /** - * CDX Header line for these fields. not very configurable.. 
- */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - - private UrlCanonicalizer canonicalizer = null; - public WarcIndexer() { - canonicalizer = new AggressiveUrlCanonicalizer(); - } - - /** - * @param warc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(File warc) - throws IOException { - return iterator(WARCReaderFactory.get(warc)); - } - /** - * @param pathOrUrl - * @return Iterator of SearchResults for input pathOrUrl - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(String pathOrUrl) - throws IOException { - return iterator(WARCReaderFactory.get(pathOrUrl)); - } - /** - * @param arc - * @return Iterator of SearchResults for input arc File - * @throws IOException - */ - public CloseableIterator<SearchResult> iterator(WARCReader reader) - throws IOException { - - Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); - - WARCRecordToSearchResultAdapter adapter2 = - new WARCRecordToSearchResultAdapter(); - adapter2.setCanonicalizer(canonicalizer); - - ArchiveReaderCloseableIterator itr1 = - new ArchiveReaderCloseableIterator(reader,reader.iterator()); - - CloseableIterator<WARCRecord> itr2 = - new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1); - - return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); - } - - public UrlCanonicalizer getCanonicalizer() { - return canonicalizer; - } - - public void setCanonicalizer(UrlCanonicalizer canonicalizer) { - this.canonicalizer = canonicalizer; - } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("warc-indexer [-identity] WARCFILE"); - System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); - System.err.println("With -identity, perform no url 
canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - WarcIndexer indexer = new WarcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if (args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if (args.length == (idx+1)) { - pw = new PrintWriter(args[1]); - } else { - USAGE(); - } - Iterator<SearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while (lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - } - } - - private class ArchiveRecordToWARCRecordAdapter implements - Adapter<ArchiveRecord, WARCRecord> { - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public WARCRecord adapt(ArchiveRecord o) { - WARCRecord rec = null; - if (o instanceof WARCRecord) { - rec = (WARCRecord) o; - } - return rec; - } - } -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java (from rev 2138, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,173 @@ +/* ArcRecordToSearchResultAdapter + * + * $Id$ + * + * Created on 3:27:03 PM Jul 26, 2007. 
+ * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback-core. + * + * wayback-core is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback-core is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-core; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.URIException; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ARCRecordToSearchResultAdapter +implements Adapter<ARCRecord,SearchResult>{ + + private static final Logger LOGGER = Logger.getLogger( + ARCRecordToSearchResultAdapter.class.getName()); + + private UrlCanonicalizer canonicalizer = null; + + public ARCRecordToSearchResultAdapter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } +// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) +// throws IOException, ParseException { + /* (non-Javadoc) + * 
@see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public SearchResult adapt(ARCRecord rec) { + try { + return adaptInner(rec); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + private SearchResult adaptInner(ARCRecord rec) throws IOException { + rec.close(); + ARCRecordMetaData meta = rec.getMetaData(); + + SearchResult result = new SearchResult(); + String arcName = meta.getArc(); + int index = arcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < arcName.length()) { + arcName = arcName.substring(index + 1); + } + result.put(WaybackConstants.RESULT_ARC_FILE, arcName); + result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta + .getOffset())); + + // initialize with default HTTP code... + result.put(WaybackConstants.RESULT_HTTP_CODE, "-"); + + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype()); + result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate()); + + String uriStr = meta.getUrl(); + if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { + // skip filedesc record altogether... + return null; + } + if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { + // skip URL + HTTP header processing for dns records... + + String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX + .length()); + result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); + result.put(WaybackConstants.RESULT_REDIRECT_URL, "-"); + result.put(WaybackConstants.RESULT_URL, uriStr); + result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + + } else { + + UURI uri = UURIFactory.getInstance(uriStr); + result.put(WaybackConstants.RESULT_URL, uriStr); + + String uriHost = uri.getHost(); + if (uriHost == null) { + LOGGER.info("No host in " + uriStr + " in " + meta.getArc()); + } else { + result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); + + String statusCode = (meta.getStatusCode() == null) ? 
"-" : meta + .getStatusCode(); + result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode); + + String redirectUrl = "-"; + Header[] headers = rec.getHttpHeaders(); + if (headers != null) { + + for (int i = 0; i < headers.length; i++) { + if (headers[i].getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = headers[i].getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + try { + UURI uriRedirect = UURIFactory.getInstance(uri, + locationStr); + redirectUrl = uriRedirect.getEscapedURI(); + + } catch (URIException e) { + LOGGER.info("Bad Location: " + locationStr + + " for " + uriStr + " in " + + meta.getArc() + " Skipped"); + } + break; + } + } + } + result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); + + String indexUrl = canonicalizer.urlStringToKey(meta.getUrl()); + result.put(WaybackConstants.RESULT_URL_KEY, indexUrl); + } + + } + return result; + } + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java (from rev 2280, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,175 @@ +/* ArcIndexer + * + * $Id$ + * + * Created on 2:33:29 PM Oct 11, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.PrintWriter; +import java.io.IOException; +import java.util.Iterator; + +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; + +/** + * Transforms an ARC file into Iterator<SearchResult>. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ArcIndexer { + + /** + * CDX Header line for these fields. 
not very configurable.. + */ + public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; + private UrlCanonicalizer canonicalizer = null; + + public ArcIndexer() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + /** + * @param arc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(File arc) + throws IOException { + return iterator(ARCReaderFactory.get(arc)); + } + + /** + * @param pathOrUrl + * @return Iterator of SearchResults for input pathOrUrl + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(String pathOrUrl) + throws IOException { + return iterator(ARCReaderFactory.get(pathOrUrl)); + } + + /** + * @param arcReader + * @return Iterator of SearchResults for input ARCReader + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(ARCReader arcReader) + throws IOException { + arcReader.setParseHttpHeaders(true); + + Adapter<ArchiveRecord,ARCRecord> adapter1 = + new ArchiveRecordToARCRecordAdapter(); + + ARCRecordToSearchResultAdapter adapter2 = + new ARCRecordToSearchResultAdapter(); + adapter2.setCanonicalizer(canonicalizer); + + ArchiveReaderCloseableIterator itr1 = + new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator()); + + CloseableIterator<ARCRecord> itr2 = + new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1); + + return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2); + } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("arc-indexer [-identity] ARCFILE"); + System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); 
+ System.err.println("With -identity, perform no url canonicalization."); + System.exit(1); + } + + /** + * @param args + */ + public static void main(String[] args) { + ArcIndexer indexer = new ArcIndexer(); + int idx = 0; + if(args[0] != null && args[0].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + idx++; + } + File arc = new File(args[idx]); + idx++; + PrintWriter pw = null; + try { + if(args.length == idx) { + // dump to STDOUT: + pw = new PrintWriter(System.out); + } else if(args.length == (idx + 1)) { + pw = new PrintWriter(args[idx]); + } else { + USAGE(); + } + Iterator<SearchResult> res = indexer.iterator(arc); + Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); + while(lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (Exception e) { + e.printStackTrace(); + System.exit(1); + } + } + + private class ArchiveRecordToARCRecordAdapter + implements Adapter<ArchiveRecord,ARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public ARCRecord adapt(ArchiveRecord o) { + ARCRecord rec = null; + if(o instanceof ARCRecord) { + rec = (ARCRecord) o; + } + return rec; + } + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java (from rev 2209, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,29 @@ +package 
org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; +import java.util.Iterator; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveRecord; +import org.archive.wayback.util.CloseableIterator; + +public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> { + private ArchiveReader reader = null; + private Iterator<ArchiveRecord> itr = null; + public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) { + this.reader = reader; + this.itr = itr; + } + public boolean hasNext() { + return itr.hasNext(); + } + public ArchiveRecord next() { + return itr.next(); + } + public void remove() { + itr.remove(); + } + public void close() throws IOException { + reader.close(); + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java (from rev 2138, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,318 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.URIException; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.RecoverableIOException; 
+import org.archive.io.arc.ARCConstants; +import org.archive.io.warc.WARCConstants; +import org.archive.io.warc.WARCRecord; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +/** + * Adapts certain WARCRecords into SearchResults. DNS and response records are + * mostly straightforward, but SearchResult objects generated from revisit + * records contain lots of "placeholder" fields, which are expected to be + * understood by later processes traversing a stream of SearchResult objects. + * + * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class WARCRecordToSearchResultAdapter +implements Adapter<WARCRecord,SearchResult>{ + + private final static String DEFAULT_VALUE = "-"; + private final static String SEARCH_FIELDS[] = { + WaybackConstants.RESULT_URL, + WaybackConstants.RESULT_URL_KEY, + WaybackConstants.RESULT_ORIG_HOST, + WaybackConstants.RESULT_CAPTURE_DATE, + WaybackConstants.RESULT_MD5_DIGEST, + WaybackConstants.RESULT_MIME_TYPE, + WaybackConstants.RESULT_HTTP_CODE, + WaybackConstants.RESULT_REDIRECT_URL, + WaybackConstants.RESULT_ARC_FILE, + WaybackConstants.RESULT_OFFSET, + }; + + private static final Logger LOGGER = Logger.getLogger( + WARCRecordToSearchResultAdapter.class.getName()); + + private UrlCanonicalizer canonicalizer = null; + + public WARCRecordToSearchResultAdapter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public SearchResult adapt(WARCRecord rec) { + try { + return adaptInner(rec); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + /* + * Transform 
input date to 14-digit timestamp: + * 2007-08-29T18:00:26Z => 20070829180026 + */ + private static String transformDate(final String input) { + + StringBuilder output = new StringBuilder(14); + + output.append(input.substring(0,4)); + output.append(input.substring(5,7)); + output.append(input.substring(8,10)); + output.append(input.substring(11,13)); + output.append(input.substring(14,16)); + output.append(input.substring(17,19)); + + return output.toString(); + } + + private static String transformHTTPMime(final String input) { + int semiIdx = input.indexOf(";"); + if(semiIdx > 0) { + return input.substring(0,semiIdx).trim(); + } + return input.trim(); + } + + private String transformWarcFilename(String readerIdentifier) { + String warcName = readerIdentifier; + int index = warcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < warcName.length()) { + warcName = warcName.substring(index + 1); + } + return warcName; + } + + private String transformDigest(final Object o) { + if(o == null) { + return DEFAULT_VALUE; + } + String orig = o.toString(); + if(orig.startsWith("sha1:")) { + return orig.substring(5); + } + return orig; + } + + private SearchResult getBlankSearchResult() { + SearchResult result = new SearchResult(); + for(String field : SEARCH_FIELDS) { + result.put(field, DEFAULT_VALUE); + } + return result; + } + + private UURI addUrlDataToSearchResult(SearchResult result, String urlStr) + throws IOException { + + result.put(WaybackConstants.RESULT_URL, urlStr); + result.put(WaybackConstants.RESULT_URL_KEY, urlStr); + + + UURI uri = UURIFactory.getInstance(urlStr); + String uriHost = uri.getHost(); + if (uriHost == null) { + + LOGGER.info("No host in " + urlStr); + + } else { + + result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost); + } + + String urlKey = canonicalizer.urlStringToKey(urlStr); + result.put(WaybackConstants.RESULT_URL_KEY, urlKey); + + return uri; + } + + private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord 
rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_ARC_FILE, + transformWarcFilename(header.getReaderIdentifier())); + result.put(WaybackConstants.RESULT_OFFSET, + String.valueOf(header.getOffset())); + + String uriStr = header.getUrl(); + + String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX + .length()); + result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype()); + + result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); + result.put(WaybackConstants.RESULT_URL, uriStr); + result.put(WaybackConstants.RESULT_URL_KEY, uriStr); + + rec.close(); + result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr()); + + return result; + } + + private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_MD5_DIGEST, + transformDigest(header.getHeaderValue( + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + + addUrlDataToSearchResult(result,header.getUrl()); + + return result; + } + + /** + * borrowed(copied) from org.archive.io.arc.ARCRecord... + * + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. 
+ */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) + throws IOException { + + SearchResult result = getBlankSearchResult(); + + result.put(WaybackConstants.RESULT_CAPTURE_DATE, + transformDate(header.getDate())); + result.put(WaybackConstants.RESULT_ARC_FILE, + transformWarcFilename(header.getReaderIdentifier())); + result.put(WaybackConstants.RESULT_OFFSET, + String.valueOf(header.getOffset())); + + String origUrl = header.getUrl(); + UURI uri = addUrlDataToSearchResult(result,origUrl); + + // need to parse the documents HTTP message and headers here: WARCReader + // does not implement this... yet.. + + byte [] statusBytes = HttpParser.readRawLine(rec); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException("Failed to read http status where one " + + " was expected: " + new String(statusBytes)); + } + String statusLine = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if ((statusLine == null) || + !StatusLine.startsWithHTTP(statusLine)) { + throw new RecoverableIOException("Failed parse of http status line."); + } + StatusLine status = new StatusLine(statusLine); + result.put(WaybackConstants.RESULT_HTTP_CODE, + String.valueOf(status.getStatusCode())); + + Header[] headers = HttpParser.parseHeaders(rec, + ARCConstants.DEFAULT_ENCODING); + + rec.close(); + result.put(WaybackConstants.RESULT_MD5_DIGEST, + transformDigest(header.getHeaderValue( + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + + if (headers != null) { + + for (Header httpHeader : headers) { + if (httpHeader.getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = 
httpHeader.getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + try { + UURI uriRedirect = UURIFactory.getInstance(uri, + locationStr); + result.put(WaybackConstants.RESULT_REDIRECT_URL, + uriRedirect.getEscapedURI()); + } catch (URIException e) { + LOGGER.info("Bad Location: " + locationStr + + " for " + origUrl + " in " + + header.getReaderIdentifier() + " Skipped"); + } + } else if(httpHeader.getName().toLowerCase().equals("content-type")) { + result.put(WaybackConstants.RESULT_MIME_TYPE, + transformHTTPMime(httpHeader.getValue())); + } + } + } + return result; + } + + private SearchResult adaptInner(WARCRecord rec) throws IOException { + + SearchResult result = null; + ArchiveRecordHeader header = rec.getHeader(); + String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); + if(type.equals(WARCConstants.RESPONSE)) { + String mime = header.getMimetype(); + if(mime.equals("text/dns")) { + result = adaptDNS(header,rec); + } else { + result = adaptResponse(header,rec); + } + } else if(type.equals(WARCConstants.REVISIT)) { + result = adaptRevisit(header,rec); + } + + return result; + } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java (from rev 2280, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java) 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-06-25 01:30:18 UTC (rev 2321) @@ -0,0 +1,140 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Iterator; + +import org.archive.io.ArchiveRecord; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; + +public class WarcIndexer { + + /** + * CDX Header line for these fields. not very configurable.. 
+ */ + public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; + + private UrlCanonicalizer canonicalizer = null; + public WarcIndexer() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + /** + * @param warc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(File warc) + throws IOException { + return iterator(WARCReaderFactory.get(warc)); + } + /** + * @param pathOrUrl + * @return Iterator of SearchResults for input pathOrUrl + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(String pathOrUrl) + throws IOException { + return iterator(WARCReaderFactory.get(pathOrUrl)); + } + /** + * @param arc + * @return Iterator of SearchResults for input arc File + * @throws IOException + */ + public CloseableIterator<SearchResult> iterator(WARCReader reader) + throws IOException { + + Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); + + WARCRecordToSearchResultAdapter adapter2 = + new WARCRecordToSearchResultAdapter(); + adapter2.setCanonicalizer(canonicalizer); + + ArchiveReaderCloseableIterator itr1 = + new ArchiveReaderCloseableIterator(reader,reader.iterator()); + + CloseableIterator<WARCRecord> itr2 = + new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1); + + return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); + } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("warc-indexer [-identity] WARCFILE"); + System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); + System.err.println("With -identity, perform no url 
canonicalization."); + System.exit(1); + } + + /** + * @param args + */ + public static void main(String[] args) { + WarcIndexer indexer = new WarcIndexer(); + int idx = 0; + if(args[0] != null && args[0].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + idx++; + } + File arc = new File(args[idx]); + idx++; + PrintWriter pw = null; + try { + if (args.length == idx) { + // dump to STDOUT: + pw = new PrintWriter(System.out); + } else if (args.length == (idx+1)) { + pw = new PrintWriter(args[1]); + } else { + USAGE(); + } + Iterator<SearchResult> res = indexer.iterator(arc); + Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); + while (lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private class ArchiveRecordToWARCRecordAdapter implements + Adapter<ArchiveRecord, WARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public WARCRecord adapt(ArchiveRecord o) { + WARCRecord rec = null; + if (o instanceof WARCRecord) { + rec = (WARCRecord) o; + } + return rec; + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-25 01:30:51
|
Revision: 2322 http://archive-access.svn.sourceforge.net/archive-access/?rev=2322&view=rev Author: bradtofel Date: 2008-06-24 18:30:59 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: moved ARC/WARC record to Resource code into resourcefile package. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java 2008-06-25 01:30:18 UTC (rev 2321) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -1,170 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.logging.Logger; - -import org.apache.commons.httpclient.Header; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import 
org.archive.io.arc.ARCRecord; -import org.archive.wayback.core.Resource; - -public class ArcResource extends Resource { - /** - * Logger for this class - */ - private static final Logger LOGGER = Logger.getLogger(ArcResource.class - .getName()); - - /** - * String prefix for ARC file related metadata namespace of keys within - * metaData Properties bag. - */ - private static String ARC_META_PREFIX = "arcmeta."; - /** - * String prefix for HTTP Header related metadata namespace of keys within - * metaData Properties bag. - */ - private static String HTTP_HEADER_PREFIX = "httpheader."; - /** - * object for ARCRecord - */ - ARCRecord arcRecord = null; - /** - * object for ARCReader -- need to hold on to this in order to call close() - * to release filehandle after completing access to this record. optional - */ - ARCReader arcReader = null; - /** - * flag to indicate if the ARCRecord skipHTTPHeader() has been called - */ - boolean parsedHeader = false; - /** - * Expandable property bag for holding metadata associated with this - * resource - */ - Hashtable<String,String> metaData = new Hashtable<String,String>(); - - /** - * Constructor - * - * @param rec - * @param reader - */ - public ArcResource(final ARCRecord rec,final ARCReader reader) { - super(); - arcRecord = rec; - arcReader = reader; - setInputStream(rec); - } - - /** parse the headers on the underlying ARC record, and extract all - * @throws IOException - */ - public void parseHeaders () throws IOException { - if(!parsedHeader) { - arcRecord.skipHttpHeader(); - // copy all HTTP headers to metaData, prefixing with - // HTTP_HEADER_PREFIX - Header[] headers = arcRecord.getHttpHeaders(); - if (headers != null) { - for (int i = 0; i < headers.length; i++) { - String value = headers[i].getValue(); - String name = headers[i].getName(); - metaData.put(HTTP_HEADER_PREFIX + name,value); - } - } - - // copy all ARC record header fields to metaData, prefixing with - // ARC_META_PREFIX - @SuppressWarnings("unchecked") 
- Map<String,Object> headerMetaMap = arcRecord.getMetaData().getHeaderFields(); - Set<String> keys = headerMetaMap.keySet(); - Iterator<String> itr = keys.iterator(); - while(itr.hasNext()) { - String metaKey = itr.next(); - Object value = headerMetaMap.get(metaKey); - String metaValue = ""; - if(value != null) { - metaValue = value.toString(); - } - metaData.put(ARC_META_PREFIX + metaKey,metaValue); - } - - parsedHeader = true; - } - } - - /** - * @param prefix - * @return a Properties of all elements in metaData starting with 'prefix'. - * keys in the returned Properties have 'prefix' removed. - */ - public Map<String,String> filterMeta(String prefix) { - HashMap<String,String> matching = new HashMap<String,String>(); - for (Enumeration<String> e = metaData.keys(); e.hasMoreElements();) { - String key = e.nextElement(); - if (key.startsWith(prefix)) { - String finalKey = key.substring(prefix.length()); - String value = metaData.get(key); - matching.put(finalKey, value); - } - } - return matching; - } - - /** - * @return a Properties containing all HTTP header fields for this record - */ - public Map<String,String> getHttpHeaders() { - return filterMeta(HTTP_HEADER_PREFIX); - } - - /** - * @return a Properties containing all ARC Meta fields for this record - */ - public Map<String,String> getARCMetadata() { - return filterMeta(ARC_META_PREFIX); - } - - /** - * (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#getStatusCode() - * @return int HTTP status code returned with this document. - */ - public int getStatusCode() { - return arcRecord.getStatusCode(); - } - - /** - * @return the ARCRecord underlying this Resource. 
- */ - public ArchiveRecord getArcRecord() { - return arcRecord; - } - - /* (non-Javadoc) - * @see org.archive.io.arc.ARCRecord#close() - */ - public void close() throws IOException { - //LOGGER.info("About to close..("+arcReader+")"); - arcRecord.close(); - if(arcReader != null) { - arcReader.close(); - LOGGER.info("closed..("+arcReader+")"); - } - } - - /** - * @return byte length claimed in ARC record metadata line. - */ - public long getRecordLength() { - return arcRecord.getMetaData().getLength(); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2008-06-25 01:30:18 UTC (rev 2321) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -1,105 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.net.URL; - -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCReaderFactory; -import org.archive.io.warc.WARCRecord; -import org.archive.wayback.core.Resource; -import org.archive.wayback.exception.ResourceNotAvailableException; - -/** - * Static factory class for constructing ARC/WARC Resources from - * File/URL + offset. 
- * - * @author brad - * @version $Date$, $Revision$ - */ -public class ResourceFactory { - - public static Resource getResource(File file, long offset) - throws IOException, ResourceNotAvailableException { - - Resource r = null; - String name = file.getName(); - if (name.endsWith(LocalResourceStore.OPEN_EXTENSION)) { - name = name.substring(0, name.length() - - LocalResourceStore.OPEN_EXTENSION.length()); - } - if (isArc(name)) { - - ARCReader reader = ARCReaderFactory.get(file,offset); - r = ARCArchiveRecordToResource(reader.get(),reader); - - } else if (isWarc(name)) { - - WARCReader reader = WARCReaderFactory.get(file,offset); - r = WARCArchiveRecordToResource(reader.get(),reader); - - } else { - throw new ResourceNotAvailableException("Unknown extension"); - } - - return r; - } - - public static Resource getResource(URL url, long offset) - throws IOException, ResourceNotAvailableException { - Resource r = null; - String name = url.getFile(); - if (isArc(name)) { - - ARCReader reader = ARCReaderFactory.get(url, offset); - r = ARCArchiveRecordToResource(reader.get(),reader); - - } else if (isWarc(name)) { - - WARCReader reader = WARCReaderFactory.get(url, offset); - r = WARCArchiveRecordToResource(reader.get(),reader); - - } else { - throw new ResourceNotAvailableException("Unknown extension"); - } - return r; - } - - private static boolean isArc(final String name) { - - return (name.endsWith(LocalResourceStore.ARC_EXTENSION) - || name.endsWith(LocalResourceStore.ARC_GZ_EXTENSION)); - } - - private static boolean isWarc(final String name) { - - return (name.endsWith(LocalResourceStore.WARC_EXTENSION) - || name.endsWith(LocalResourceStore.WARC_GZ_EXTENSION)); - } - - private static Resource ARCArchiveRecordToResource(ArchiveRecord rec, - ARCReader reader) throws ResourceNotAvailableException, IOException { - - if (!(rec instanceof ARCRecord)) { - throw new ResourceNotAvailableException("Bad ARCRecord format"); - } - ArcResource ar = new ArcResource((ARCRecord) 
rec, reader); - ar.parseHeaders(); - return ar; - } - - private static Resource WARCArchiveRecordToResource(ArchiveRecord rec, - WARCReader reader) throws ResourceNotAvailableException, IOException { - - if (!(rec instanceof WARCRecord)) { - throw new ResourceNotAvailableException("Bad WARCRecord format"); - } - WarcResource wr = new WarcResource((WARCRecord) rec, reader); - wr.parseHeaders(); - return wr; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java 2008-06-25 01:30:18 UTC (rev 2321) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -1,98 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.util.Hashtable; -import java.util.Map; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpParser; -import org.apache.commons.httpclient.StatusLine; -import org.apache.commons.httpclient.util.EncodingUtil; -import org.archive.io.RecoverableIOException; -import org.archive.io.arc.ARCConstants; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCRecord; -import org.archive.wayback.core.Resource; - -public class WarcResource extends Resource { - private WARCRecord rec = null; - private WARCReader reader = null; - private Map<String, String> headers = null; - private long length = 0; - private int status = 0; - private boolean parsedHeaders = false; - public WarcResource(WARCRecord rec, WARCReader reader) { - this.rec = rec; - this.reader = reader; - } - - /** - * @param bytes Array of bytes to examine for an EOL. - * @return Count of end-of-line characters or zero if none. 
- */ - private int getEolCharsCount(byte [] bytes) { - int count = 0; - if (bytes != null && bytes.length >=1 && - bytes[bytes.length - 1] == '\n') { - count++; - if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { - count++; - } - } - return count; - } - - public void parseHeaders() throws IOException { - if(parsedHeaders) { - return; - } - - byte [] statusBytes = HttpParser.readRawLine(rec); - int eolCharCount = getEolCharsCount(statusBytes); - if (eolCharCount <= 0) { - throw new RecoverableIOException("Failed to read http status where one " + - " was expected: " + new String(statusBytes)); - } - String statusLineStr = EncodingUtil.getString(statusBytes, 0, - statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); - if ((statusLineStr == null) || - !StatusLine.startsWithHTTP(statusLineStr)) { - throw new RecoverableIOException("Failed parse of http status line."); - } - StatusLine statusLine = new StatusLine(statusLineStr); - - this.status = statusLine.getStatusCode(); - - Header[] tmpHeaders = HttpParser.parseHeaders(rec, - ARCConstants.DEFAULT_ENCODING); - headers = new Hashtable<String,String>(); - for(Header header: tmpHeaders) { - headers.put(header.getName(), header.getValue()); - } - this.setInputStream(rec); - parsedHeaders = true; - } - - - @Override - public Map<String, String> getHttpHeaders() { - return headers; - } - - @Override - public long getRecordLength() { - // TODO Auto-generated method stub - return length; - } - - @Override - public int getStatusCode() { - return status; - } - - @Override - public void close() throws IOException { - rec.close(); - reader.close(); - } -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java (from rev 2082, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcResource.java) =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcResource.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -0,0 +1,170 @@ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.IOException; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.core.Resource; + +public class ArcResource extends Resource { + /** + * Logger for this class + */ + private static final Logger LOGGER = Logger.getLogger(ArcResource.class + .getName()); + + /** + * String prefix for ARC file related metadata namespace of keys within + * metaData Properties bag. + */ + private static String ARC_META_PREFIX = "arcmeta."; + /** + * String prefix for HTTP Header related metadata namespace of keys within + * metaData Properties bag. + */ + private static String HTTP_HEADER_PREFIX = "httpheader."; + /** + * object for ARCRecord + */ + ARCRecord arcRecord = null; + /** + * object for ARCReader -- need to hold on to this in order to call close() + * to release filehandle after completing access to this record. 
optional + */ + ARCReader arcReader = null; + /** + * flag to indicate if the ARCRecord skipHTTPHeader() has been called + */ + boolean parsedHeader = false; + /** + * Expandable property bag for holding metadata associated with this + * resource + */ + Hashtable<String,String> metaData = new Hashtable<String,String>(); + + /** + * Constructor + * + * @param rec + * @param reader + */ + public ArcResource(final ARCRecord rec,final ARCReader reader) { + super(); + arcRecord = rec; + arcReader = reader; + setInputStream(rec); + } + + /** parse the headers on the underlying ARC record, and extract all + * @throws IOException + */ + public void parseHeaders () throws IOException { + if(!parsedHeader) { + arcRecord.skipHttpHeader(); + // copy all HTTP headers to metaData, prefixing with + // HTTP_HEADER_PREFIX + Header[] headers = arcRecord.getHttpHeaders(); + if (headers != null) { + for (int i = 0; i < headers.length; i++) { + String value = headers[i].getValue(); + String name = headers[i].getName(); + metaData.put(HTTP_HEADER_PREFIX + name,value); + } + } + + // copy all ARC record header fields to metaData, prefixing with + // ARC_META_PREFIX + @SuppressWarnings("unchecked") + Map<String,Object> headerMetaMap = arcRecord.getMetaData().getHeaderFields(); + Set<String> keys = headerMetaMap.keySet(); + Iterator<String> itr = keys.iterator(); + while(itr.hasNext()) { + String metaKey = itr.next(); + Object value = headerMetaMap.get(metaKey); + String metaValue = ""; + if(value != null) { + metaValue = value.toString(); + } + metaData.put(ARC_META_PREFIX + metaKey,metaValue); + } + + parsedHeader = true; + } + } + + /** + * @param prefix + * @return a Properties of all elements in metaData starting with 'prefix'. + * keys in the returned Properties have 'prefix' removed. 
+ */ + public Map<String,String> filterMeta(String prefix) { + HashMap<String,String> matching = new HashMap<String,String>(); + for (Enumeration<String> e = metaData.keys(); e.hasMoreElements();) { + String key = e.nextElement(); + if (key.startsWith(prefix)) { + String finalKey = key.substring(prefix.length()); + String value = metaData.get(key); + matching.put(finalKey, value); + } + } + return matching; + } + + /** + * @return a Properties containing all HTTP header fields for this record + */ + public Map<String,String> getHttpHeaders() { + return filterMeta(HTTP_HEADER_PREFIX); + } + + /** + * @return a Properties containing all ARC Meta fields for this record + */ + public Map<String,String> getARCMetadata() { + return filterMeta(ARC_META_PREFIX); + } + + /** + * (non-Javadoc) + * @see org.archive.io.arc.ARCRecord#getStatusCode() + * @return int HTTP status code returned with this document. + */ + public int getStatusCode() { + return arcRecord.getStatusCode(); + } + + /** + * @return the ARCRecord underlying this Resource. + */ + public ArchiveRecord getArcRecord() { + return arcRecord; + } + + /* (non-Javadoc) + * @see org.archive.io.arc.ARCRecord#close() + */ + public void close() throws IOException { + //LOGGER.info("About to close..("+arcReader+")"); + arcRecord.close(); + if(arcReader != null) { + arcReader.close(); + LOGGER.info("closed..("+arcReader+")"); + } + } + + /** + * @return byte length claimed in ARC record metadata line. 
+ */ + public long getRecordLength() { + return arcRecord.getMetaData().getLength(); + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java (from rev 2122, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ResourceFactory.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -0,0 +1,105 @@ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.IOException; +import java.net.URL; + +import org.archive.io.ArchiveRecord; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCReaderFactory; +import org.archive.io.arc.ARCRecord; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCReaderFactory; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.ResourceNotAvailableException; + +/** + * Static factory class for constructing ARC/WARC Resources from + * File/URL + offset. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFactory { + + public static Resource getResource(File file, long offset) + throws IOException, ResourceNotAvailableException { + + Resource r = null; + String name = file.getName(); + if (name.endsWith(ArcWarcFilenameFilter.OPEN_SUFFIX)) { + name = name.substring(0, name.length() + - ArcWarcFilenameFilter.OPEN_SUFFIX.length()); + } + if (isArc(name)) { + + ARCReader reader = ARCReaderFactory.get(file,offset); + r = ARCArchiveRecordToResource(reader.get(),reader); + + } else if (isWarc(name)) { + + WARCReader reader = WARCReaderFactory.get(file,offset); + r = WARCArchiveRecordToResource(reader.get(),reader); + + } else { + throw new ResourceNotAvailableException("Unknown extension"); + } + + return r; + } + + public static Resource getResource(URL url, long offset) + throws IOException, ResourceNotAvailableException { + Resource r = null; + String name = url.getFile(); + if (isArc(name)) { + + ARCReader reader = ARCReaderFactory.get(url, offset); + r = ARCArchiveRecordToResource(reader.get(),reader); + + } else if (isWarc(name)) { + + WARCReader reader = WARCReaderFactory.get(url, offset); + r = WARCArchiveRecordToResource(reader.get(),reader); + + } else { + throw new ResourceNotAvailableException("Unknown extension"); + } + return r; + } + + private static boolean isArc(final String name) { + + return (name.endsWith(ArcWarcFilenameFilter.ARC_SUFFIX) + || name.endsWith(ArcWarcFilenameFilter.ARC_GZ_SUFFIX)); + } + + private static boolean isWarc(final String name) { + + return (name.endsWith(ArcWarcFilenameFilter.WARC_SUFFIX) + || name.endsWith(ArcWarcFilenameFilter.WARC_GZ_SUFFIX)); + } + + private static Resource ARCArchiveRecordToResource(ArchiveRecord rec, + ARCReader reader) throws ResourceNotAvailableException, IOException { + + if (!(rec instanceof ARCRecord)) { + throw new ResourceNotAvailableException("Bad ARCRecord format"); + } + ArcResource ar = new ArcResource((ARCRecord) 
rec, reader); + ar.parseHeaders(); + return ar; + } + + private static Resource WARCArchiveRecordToResource(ArchiveRecord rec, + WARCReader reader) throws ResourceNotAvailableException, IOException { + + if (!(rec instanceof WARCRecord)) { + throw new ResourceNotAvailableException("Bad WARCRecord format"); + } + WarcResource wr = new WarcResource((WARCRecord) rec, reader); + wr.parseHeaders(); + return wr; + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java (from rev 2082, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcResource.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/WarcResource.java 2008-06-25 01:30:59 UTC (rev 2322) @@ -0,0 +1,98 @@ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.IOException; +import java.util.Hashtable; +import java.util.Map; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HttpParser; +import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.util.EncodingUtil; +import org.archive.io.RecoverableIOException; +import org.archive.io.arc.ARCConstants; +import org.archive.io.warc.WARCReader; +import org.archive.io.warc.WARCRecord; +import org.archive.wayback.core.Resource; + +public class WarcResource extends Resource { + private WARCRecord rec = null; + private WARCReader reader = null; + private Map<String, String> headers = null; + private long length = 0; + private int status = 0; + private boolean parsedHeaders = false; + public WarcResource(WARCRecord rec, WARCReader reader) { + this.rec = rec; + this.reader = reader; 
+ } + + /** + * @param bytes Array of bytes to examine for an EOL. + * @return Count of end-of-line characters or zero if none. + */ + private int getEolCharsCount(byte [] bytes) { + int count = 0; + if (bytes != null && bytes.length >=1 && + bytes[bytes.length - 1] == '\n') { + count++; + if (bytes.length >=2 && bytes[bytes.length -2] == '\r') { + count++; + } + } + return count; + } + + public void parseHeaders() throws IOException { + if(parsedHeaders) { + return; + } + + byte [] statusBytes = HttpParser.readRawLine(rec); + int eolCharCount = getEolCharsCount(statusBytes); + if (eolCharCount <= 0) { + throw new RecoverableIOException("Failed to read http status where one " + + " was expected: " + new String(statusBytes)); + } + String statusLineStr = EncodingUtil.getString(statusBytes, 0, + statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); + if ((statusLineStr == null) || + !StatusLine.startsWithHTTP(statusLineStr)) { + throw new RecoverableIOException("Failed parse of http status line."); + } + StatusLine statusLine = new StatusLine(statusLineStr); + + this.status = statusLine.getStatusCode(); + + Header[] tmpHeaders = HttpParser.parseHeaders(rec, + ARCConstants.DEFAULT_ENCODING); + headers = new Hashtable<String,String>(); + for(Header header: tmpHeaders) { + headers.put(header.getName(), header.getValue()); + } + this.setInputStream(rec); + parsedHeaders = true; + } + + + @Override + public Map<String, String> getHttpHeaders() { + return headers; + } + + @Override + public long getRecordLength() { + // TODO Auto-generated method stub + return length; + } + + @Override + public int getStatusCode() { + return status; + } + + @Override + public void close() throws IOException { + rec.close(); + reader.close(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-25 01:33:11
|
Revision: 2323 http://archive-access.svn.sourceforge.net/archive-access/?rev=2323&view=rev Author: bradtofel Date: 2008-06-24 18:33:19 -0700 (Tue, 24 Jun 2008) Log Message: ----------- REFACTOR: package shuffling reference updates. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -1,218 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.net.MalformedURLException; 
-import java.util.HashMap; -import java.util.Iterator; -import java.util.logging.Logger; - -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.resourceindex.updater.IndexClient; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.DirMaker; - -/** - * Thread that repeatedly notices new files in the LocalResourceStore, indexes - * those files, and hands them off to a ResourceIndex via an IndexClient - * - * @author brad - * @version $Date$, $Revision$ - */ -public class AutoIndexThread extends Thread { - private static final Logger LOGGER = - Logger.getLogger(AutoIndexThread.class.getName()); - - private final static int DEFAULT_RUN_INTERVAL_MS = 10000; - private LocalResourceStore store = null; - private File workDir = null; - private File queuedDir = null; - private int runInterval = DEFAULT_RUN_INTERVAL_MS; - private IndexClient indexClient = null; - - /** - * @param store - * @param runInterval - */ - public AutoIndexThread() { - super("AutoARCIndexThread"); - super.setDaemon(true); - } - - public void run() { - LOGGER.info("AutoIndexThread is alive."); - int sleepInterval = runInterval; - if(store == null) { - throw new RuntimeException("No LocalResourceStore set"); - } - while (true) { - try { - int numIndexed = indexNewArcs(); - if (numIndexed == 0) { - sleep(sleepInterval); - sleepInterval += runInterval; - } else { - sleepInterval = runInterval; - } - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - - /** - * Scan for new ARC files, and index any new files discovered. - * - * There are 3 main steps, which could be broken into separate threads: - * 1) detect new ARCs - * 2) create CDX files for each new ARC - * 3) upload CDX files to target (or rename to local "incoming" directory) - * - * for now these are sequential. 
- * - * @return number of ARC files indexed - */ - public int indexNewArcs() { - int numIndexed = 0; - try { - queueNewArcsForIndex(); - } catch (IOException e) { - e.printStackTrace(); - } - try { - numIndexed = indexArcs(10); - } catch (MalformedURLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return numIndexed; - } - /** - * Find any new ARC files and queue them for indexing. - * @throws IOException - */ - public void queueNewArcsForIndex() throws IOException { - - // build a HashMap of what has been queued already: - HashMap<String,String> queued = new HashMap<String, String>(); - String entries[] = queuedDir.list(); - if(entries != null) { - for (int i = 0; i < entries.length; i++) { - queued.put(entries[i], "i"); - } - } - // now scan thru arcDir, and make a flag file for anything that was not - // already there: - Iterator<String> files = store.fileNamesIterator(); - if(files != null) { - while(files.hasNext()) { - String fileName = files.next(); - if(!queued.containsKey(fileName)) { - File newQueuedFile = new File(queuedDir,fileName); - File newToBeIndexedFile = new File(workDir,fileName); - newToBeIndexedFile.createNewFile(); - newQueuedFile.createNewFile(); - } - } - } - } - - private String fileNameToBase(final String fileName) { - return fileName; - } - - /** - * Index up to 'max' ARC/WARC files queued for indexing, queueing the - * resulting CDX files for merging with the BDBIndex. 
- * - * @param indexer - * @param max maximum number to index in this method call, 0 for unlimited - * @return int number of ARC/WARC files indexed - * @throws MalformedURLException - * @throws IOException - */ - public int indexArcs(int max) - throws MalformedURLException, IOException { - - int numIndexed = 0; - String toBeIndexed[] = workDir.list(); - - if (toBeIndexed != null) { - for (int i = 0; i < toBeIndexed.length; i++) { - String fileName = toBeIndexed[i]; - File file = store.getLocalFile(fileName); - if(file != null) { - File workFlagFile = new File(workDir,fileName); - String cdxBase = fileNameToBase(fileName); - - try { - - LOGGER.info("Indexing " + file.getAbsolutePath()); - CloseableIterator<SearchResult> itr = store.indexFile(file); - - if(indexClient.addSearchResults(cdxBase, itr)) { - if (!workFlagFile.delete()) { - throw new IOException("Unable to delete " - + workFlagFile.getAbsolutePath()); - } - } - itr.close(); - numIndexed++; - } catch (IOException e) { - LOGGER.severe("FAILED index: " + file.getAbsolutePath() - + " cause: " + e.getLocalizedMessage()); - } - if(max > 0 && (numIndexed >= max)) { - break; - } - } - } - } - return numIndexed; - } - - - - public LocalResourceStore getStore() { - return store; - } - - public void setStore(LocalResourceStore store) { - this.store = store; - } - - public String getWorkDir() { - return workDir == null ? null : workDir.getAbsolutePath(); - } - - public void setWorkDir(String workDir) throws IOException { - this.workDir = DirMaker.ensureDir(workDir); - } - - public String getQueuedDir() { - return queuedDir == null ? 
null : queuedDir.getAbsolutePath(); - } - - public void setQueuedDir(String queuedDir) throws IOException { - this.queuedDir = DirMaker.ensureDir(queuedDir); - } - - public int getRunInterval() { - return runInterval; - } - - public void setRunInterval(int runInterval) { - this.runInterval = runInterval; - } - - public IndexClient getIndexClient() { - return indexClient; - } - - public void setIndexClient(IndexClient indexClient) { - this.indexClient = indexClient; - } -} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -0,0 +1,218 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.resourceindex.updater.IndexClient; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.DirMaker; + +/** + * Thread that repeatedly notices new files in the LocalResourceStore, indexes + * those files, and hands them off to a ResourceIndex via an IndexClient + * + * @author brad + * @version $Date$, $Revision$ + */ +public class AutoIndexThread extends Thread { + private static final Logger LOGGER = + Logger.getLogger(AutoIndexThread.class.getName()); + + private final static int DEFAULT_RUN_INTERVAL_MS = 10000; + private LocalResourceStore store = null; + private File workDir = null; + private File queuedDir = null; + private int runInterval = DEFAULT_RUN_INTERVAL_MS; + 
private IndexClient indexClient = null; + + /** + * @param store + * @param runInterval + */ + public AutoIndexThread() { + super("AutoARCIndexThread"); + super.setDaemon(true); + } + + public void run() { + LOGGER.info("AutoIndexThread is alive."); + int sleepInterval = runInterval; + if(store == null) { + throw new RuntimeException("No LocalResourceStore set"); + } + while (true) { + try { + int numIndexed = indexNewArcs(); + if (numIndexed == 0) { + sleep(sleepInterval); + sleepInterval += runInterval; + } else { + sleepInterval = runInterval; + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + /** + * Scan for new ARC files, and index any new files discovered. + * + * There are 3 main steps, which could be broken into separate threads: + * 1) detect new ARCs + * 2) create CDX files for each new ARC + * 3) upload CDX files to target (or rename to local "incoming" directory) + * + * for now these are sequential. + * + * @return number of ARC files indexed + */ + public int indexNewArcs() { + int numIndexed = 0; + try { + queueNewArcsForIndex(); + } catch (IOException e) { + e.printStackTrace(); + } + try { + numIndexed = indexArcs(10); + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return numIndexed; + } + /** + * Find any new ARC files and queue them for indexing. 
+ * @throws IOException + */ + public void queueNewArcsForIndex() throws IOException { + + // build a HashMap of what has been queued already: + HashMap<String,String> queued = new HashMap<String, String>(); + String entries[] = queuedDir.list(); + if(entries != null) { + for (int i = 0; i < entries.length; i++) { + queued.put(entries[i], "i"); + } + } + // now scan thru arcDir, and make a flag file for anything that was not + // already there: + Iterator<String> files = store.fileNamesIterator(); + if(files != null) { + while(files.hasNext()) { + String fileName = files.next(); + if(!queued.containsKey(fileName)) { + File newQueuedFile = new File(queuedDir,fileName); + File newToBeIndexedFile = new File(workDir,fileName); + newToBeIndexedFile.createNewFile(); + newQueuedFile.createNewFile(); + } + } + } + } + + private String fileNameToBase(final String fileName) { + return fileName; + } + + /** + * Index up to 'max' ARC/WARC files queued for indexing, queueing the + * resulting CDX files for merging with the BDBIndex. 
+ * + * @param indexer + * @param max maximum number to index in this method call, 0 for unlimited + * @return int number of ARC/WARC files indexed + * @throws MalformedURLException + * @throws IOException + */ + public int indexArcs(int max) + throws MalformedURLException, IOException { + + int numIndexed = 0; + String toBeIndexed[] = workDir.list(); + + if (toBeIndexed != null) { + for (int i = 0; i < toBeIndexed.length; i++) { + String fileName = toBeIndexed[i]; + File file = store.getLocalFile(fileName); + if(file != null) { + File workFlagFile = new File(workDir,fileName); + String cdxBase = fileNameToBase(fileName); + + try { + + LOGGER.info("Indexing " + file.getAbsolutePath()); + CloseableIterator<SearchResult> itr = store.indexFile(file); + + if(indexClient.addSearchResults(cdxBase, itr)) { + if (!workFlagFile.delete()) { + throw new IOException("Unable to delete " + + workFlagFile.getAbsolutePath()); + } + } + itr.close(); + numIndexed++; + } catch (IOException e) { + LOGGER.severe("FAILED index: " + file.getAbsolutePath() + + " cause: " + e.getLocalizedMessage()); + } + if(max > 0 && (numIndexed >= max)) { + break; + } + } + } + } + return numIndexed; + } + + + + public LocalResourceStore getStore() { + return store; + } + + public void setStore(LocalResourceStore store) { + this.store = store; + } + + public String getWorkDir() { + return workDir == null ? null : workDir.getAbsolutePath(); + } + + public void setWorkDir(String workDir) throws IOException { + this.workDir = DirMaker.ensureDir(workDir); + } + + public String getQueuedDir() { + return queuedDir == null ? 
null : queuedDir.getAbsolutePath(); + } + + public void setQueuedDir(String queuedDir) throws IOException { + this.queuedDir = DirMaker.ensureDir(queuedDir); + } + + public int getRunInterval() { + return runInterval; + } + + public void setRunInterval(int runInterval) { + this.runInterval = runInterval; + } + + public IndexClient getIndexClient() { + return indexClient; + } + + public void setIndexClient(IndexClient indexClient) { + this.indexClient = indexClient; + } +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -32,6 +32,8 @@ import org.archive.wayback.core.Resource; import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.resourcefile.ArcWarcFilenameFilter; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; /** @@ -63,11 +65,11 @@ throw new IOException("No ARC/WARC offset in search result..."); } final long offset = Long.parseLong(offsetString); - if(!fileName.endsWith(LocalResourceStore.ARC_EXTENSION) - && !fileName.endsWith(LocalResourceStore.ARC_GZ_EXTENSION) - && !fileName.endsWith(LocalResourceStore.WARC_EXTENSION) - && !fileName.endsWith(LocalResourceStore.WARC_GZ_EXTENSION)) { - fileName = fileName + LocalResourceStore.ARC_GZ_EXTENSION; + if(!fileName.endsWith(ArcWarcFilenameFilter.ARC_SUFFIX) + && !fileName.endsWith(ArcWarcFilenameFilter.ARC_GZ_SUFFIX) + && !fileName.endsWith(ArcWarcFilenameFilter.WARC_SUFFIX) + && 
!fileName.endsWith(ArcWarcFilenameFilter.WARC_GZ_SUFFIX)) { + fileName = fileName + ArcWarcFilenameFilter.ARC_GZ_SUFFIX; } String fileUrl = urlPrefix + fileName; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -34,6 +34,7 @@ import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; /** * Simple ResourceStore implementation, which uses a ResourceFileLocationDB to Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -1,147 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.FilenameFilter; -import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -import org.archive.wayback.ResourceStore; -import org.archive.wayback.WaybackConstants; -import org.archive.wayback.core.Resource; -import 
org.archive.wayback.core.SearchResult; -import org.archive.wayback.exception.ConfigurationException; -import org.archive.wayback.exception.ResourceNotAvailableException; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.DirMaker; - -/** - * Class which implements a local ARC, WARC, ARC.gz, WARC.gz, ResourceStore - * including an optional automatic indexing thread - * - * @author brad - * @version $Date$, $Revision$ - */ -public class LocalResourceStore implements ResourceStore { - - private File dataDir = null; - private AutoIndexThread indexThread = null; - - private ArcIndexer arcIndexer = new ArcIndexer(); - private WarcIndexer warcIndexer = new WarcIndexer(); - public final static String ARC_EXTENSION = ".arc"; - public final static String ARC_GZ_EXTENSION = ".arc.gz"; - public final static String WARC_EXTENSION = ".warc"; - public final static String WARC_GZ_EXTENSION = ".warc.gz"; - public final static String OPEN_EXTENSION = ".open"; - private final static String[] SUFFIXES = { - "", ARC_EXTENSION, ARC_GZ_EXTENSION, WARC_EXTENSION, WARC_GZ_EXTENSION - }; - private FilenameFilter filter = new ArcWarcFilenameFilter(); - - public void init() throws ConfigurationException { - if(indexThread != null) { - indexThread.setStore(this); - indexThread.start(); - } - } - protected String resultToFileName(SearchResult result) { - return result.get(WaybackConstants.RESULT_ARC_FILE); - } - - protected long resultToOffset(SearchResult result) { - return Long.parseLong(result.get(WaybackConstants.RESULT_OFFSET)); - } - - public File getLocalFile(String fileName) { - // try adding suffixes: empty string is first in the list - File file = null; - for(String suffix : SUFFIXES) { - file = new File(dataDir,fileName + suffix); - if(file.exists() && file.canRead()) { - return file; - } - } - // this might work if the full path is in the index... - file = new File(fileName); - if(file.exists() && file.canRead()) { - return file; - } - // doh. 
- return null; - } - - public Resource retrieveResource(SearchResult result) throws IOException, - ResourceNotAvailableException { - String fileName = resultToFileName(result); - long offset = resultToOffset(result); - File file = getLocalFile(fileName); - if (file == null) { - - // TODO: this needs to be prettied up for end user consumption.. - throw new ResourceNotAvailableException("Cannot find ARC file (" - + fileName + ")"); - } else { - - Resource r = ResourceFactory.getResource(file, offset); - return r; - } - } - - public CloseableIterator<SearchResult> indexFile(File dataFile) throws IOException { - CloseableIterator<SearchResult> itr = null; - - String name = dataFile.getName(); - if(name.endsWith(ARC_EXTENSION)) { - itr = arcIndexer.iterator(dataFile); - } else if(name.endsWith(ARC_GZ_EXTENSION)) { - itr = arcIndexer.iterator(dataFile); - } else if(name.endsWith(WARC_EXTENSION)) { - itr = warcIndexer.iterator(dataFile); - } else if(name.endsWith(WARC_GZ_EXTENSION)) { - itr = warcIndexer.iterator(dataFile); - } - return itr; - } - - public Iterator<String> fileNamesIterator() throws IOException { - if(dataDir != null) { - String[] files = dataDir.list(filter); - List<String> l = Arrays.asList(files); - return l.iterator(); - } - return null; - } - - public String getDataDir() { - return DirMaker.getAbsolutePath(dataDir); - } - - public void setDataDir(String dataDir) throws IOException { - this.dataDir = DirMaker.ensureDir(dataDir); - } - - private class ArcWarcFilenameFilter implements FilenameFilter { - public boolean accept(File dir, String name) { - File tmp = new File(dir,name); - if(tmp.isFile() && tmp.canRead()) { - return name.endsWith(ARC_EXTENSION) || - name.endsWith(ARC_GZ_EXTENSION) || - name.endsWith(WARC_GZ_EXTENSION) || - name.endsWith(WARC_EXTENSION); - } - return false; - } - } - - public AutoIndexThread getIndexThread() { - return indexThread; - } - public void setIndexThread(AutoIndexThread indexThread) { - this.indexThread = 
indexThread; - } - public void shutdown() throws IOException { - // no-op. could shut down threads - } -} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -0,0 +1,150 @@ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.exception.ConfigurationException; +import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.indexer.ArcIndexer; +import org.archive.wayback.resourcestore.indexer.WarcIndexer; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.DirMaker; + +/** + * Class which implements a local ARC, WARC, ARC.gz, WARC.gz, ResourceStore + * including an optional automatic indexing thread + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LocalResourceStore implements ResourceStore { + + private File dataDir = null; + private AutoIndexThread indexThread = null; + + private ArcIndexer arcIndexer = new ArcIndexer(); + private WarcIndexer warcIndexer = new WarcIndexer(); + public final static String ARC_EXTENSION = ".arc"; + public final static String ARC_GZ_EXTENSION = ".arc.gz"; + public final 
static String WARC_EXTENSION = ".warc"; + public final static String WARC_GZ_EXTENSION = ".warc.gz"; + public final static String OPEN_EXTENSION = ".open"; + private final static String[] SUFFIXES = { + "", ARC_EXTENSION, ARC_GZ_EXTENSION, WARC_EXTENSION, WARC_GZ_EXTENSION + }; + private FilenameFilter filter = new ArcWarcFilenameFilter(); + + public void init() throws ConfigurationException { + if(indexThread != null) { + indexThread.setStore(this); + indexThread.start(); + } + } + protected String resultToFileName(SearchResult result) { + return result.get(WaybackConstants.RESULT_ARC_FILE); + } + + protected long resultToOffset(SearchResult result) { + return Long.parseLong(result.get(WaybackConstants.RESULT_OFFSET)); + } + + public File getLocalFile(String fileName) { + // try adding suffixes: empty string is first in the list + File file = null; + for(String suffix : SUFFIXES) { + file = new File(dataDir,fileName + suffix); + if(file.exists() && file.canRead()) { + return file; + } + } + // this might work if the full path is in the index... + file = new File(fileName); + if(file.exists() && file.canRead()) { + return file; + } + // doh. + return null; + } + + public Resource retrieveResource(SearchResult result) throws IOException, + ResourceNotAvailableException { + String fileName = resultToFileName(result); + long offset = resultToOffset(result); + File file = getLocalFile(fileName); + if (file == null) { + + // TODO: this needs to be prettied up for end user consumption.. 
+ throw new ResourceNotAvailableException("Cannot find ARC file (" + + fileName + ")"); + } else { + + Resource r = ResourceFactory.getResource(file, offset); + return r; + } + } + + public CloseableIterator<SearchResult> indexFile(File dataFile) throws IOException { + CloseableIterator<SearchResult> itr = null; + + String name = dataFile.getName(); + if(name.endsWith(ARC_EXTENSION)) { + itr = arcIndexer.iterator(dataFile); + } else if(name.endsWith(ARC_GZ_EXTENSION)) { + itr = arcIndexer.iterator(dataFile); + } else if(name.endsWith(WARC_EXTENSION)) { + itr = warcIndexer.iterator(dataFile); + } else if(name.endsWith(WARC_GZ_EXTENSION)) { + itr = warcIndexer.iterator(dataFile); + } + return itr; + } + + public Iterator<String> fileNamesIterator() throws IOException { + if(dataDir != null) { + String[] files = dataDir.list(filter); + List<String> l = Arrays.asList(files); + return l.iterator(); + } + return null; + } + + public String getDataDir() { + return DirMaker.getAbsolutePath(dataDir); + } + + public void setDataDir(String dataDir) throws IOException { + this.dataDir = DirMaker.ensureDir(dataDir); + } + + private class ArcWarcFilenameFilter implements FilenameFilter { + public boolean accept(File dir, String name) { + File tmp = new File(dir,name); + if(tmp.isFile() && tmp.canRead()) { + return name.endsWith(ARC_EXTENSION) || + name.endsWith(ARC_GZ_EXTENSION) || + name.endsWith(WARC_GZ_EXTENSION) || + name.endsWith(WARC_EXTENSION); + } + return false; + } + } + + public AutoIndexThread getIndexThread() { + return indexThread; + } + public void setIndexThread(AutoIndexThread indexThread) { + this.indexThread = indexThread; + } + public void shutdown() throws IOException { + // no-op. 
could shut down threads + } +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -30,8 +30,6 @@ import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.SearchResult; import org.archive.wayback.resourceindex.updater.IndexClient; -import org.archive.wayback.resourcestore.ArcIndexer; -import org.archive.wayback.resourcestore.WarcIndexer; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import org.archive.wayback.util.CloseableIterator; //import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java 2008-06-25 01:30:59 UTC (rev 2322) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java 2008-06-25 01:33:19 UTC (rev 2323) @@ -34,10 +34,11 @@ * @version $Date$, $Revision$ */ public class ArcWarcFilenameFilter implements FilenameFilter { - private final static String ARC_SUFFIX = ".arc"; - private final static String ARC_GZ_SUFFIX = ".arc.gz"; - private final static String WARC_SUFFIX = ".warc"; - private final static String WARC_GZ_SUFFIX = ".warc.gz"; + public final static String ARC_SUFFIX = ".arc"; + public 
final static String ARC_GZ_SUFFIX = ".arc.gz"; + public final static String WARC_SUFFIX = ".warc"; + public final static String WARC_GZ_SUFFIX = ".warc.gz"; + public final static String OPEN_SUFFIX = ".open"; public boolean accept(File dir, String name) { return name.endsWith(ARC_SUFFIX) || This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-01 23:47:42
|
Revision: 2376 http://archive-access.svn.sourceforge.net/archive-access/?rev=2376&view=rev Author: bradtofel Date: 2008-07-01 16:47:50 -0700 (Tue, 01 Jul 2008) Log Message: ----------- REFACTOR: SearchResult => (Url|Capture)SearchResult Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-07-01 23:46:33 UTC (rev 2375) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-07-01 23:47:50 UTC (rev 2376) @@ -7,7 +7,7 @@ import java.util.Iterator; import java.util.logging.Logger; -import org.archive.wayback.core.SearchResult; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.updater.IndexClient; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.DirMaker; @@ -151,7 +151,7 @@ try { LOGGER.info("Indexing " + file.getAbsolutePath()); - CloseableIterator<SearchResult> itr = store.indexFile(file); + CloseableIterator<CaptureSearchResult> itr = store.indexFile(file); if(indexClient.addSearchResults(cdxBase, itr)) { if (!workFlagFile.delete()) { Modified: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-07-01 23:46:33 UTC (rev 2375) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-07-01 23:47:50 UTC (rev 2376) @@ -28,9 +28,8 @@ import java.net.URL; import org.archive.wayback.ResourceStore; -import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.Resource; -import org.archive.wayback.core.SearchResult; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.resourcefile.ArcWarcFilenameFilter; import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; @@ -50,21 +49,16 @@ private String urlPrefix = null; - public Resource retrieveResource(SearchResult result) throws IOException, + public Resource retrieveResource(CaptureSearchResult result) throws IOException, ResourceNotAvailableException { // extract ARC filename - String fileName = result.get(WaybackConstants.RESULT_ARC_FILE); + String fileName = result.getFile(); if(fileName == null || fileName.length() < 1) { throw new IOException("No ARC/WARC name in search result..."); } - // extract offset + convert to long - final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); - if(offsetString == null || offsetString.length() < 1) { - throw new IOException("No ARC/WARC offset in search result..."); - } - final long offset = Long.parseLong(offsetString); + final long offset = result.getOffset(); if(!fileName.endsWith(ArcWarcFilenameFilter.ARC_SUFFIX) && !fileName.endsWith(ArcWarcFilenameFilter.ARC_GZ_SUFFIX) && !fileName.endsWith(ArcWarcFilenameFilter.WARC_SUFFIX) 
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-07-01 23:46:33 UTC (rev 2375) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-07-01 23:47:50 UTC (rev 2376) @@ -29,9 +29,8 @@ import java.net.URL; import org.archive.wayback.ResourceStore; -import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.Resource; -import org.archive.wayback.core.SearchResult; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; @@ -50,26 +49,21 @@ /* (non-Javadoc) * @see org.archive.wayback.ResourceStore#retrieveResource(org.archive.wayback.core.SearchResult) */ - public Resource retrieveResource(SearchResult result) throws IOException, + public Resource retrieveResource(CaptureSearchResult result) throws IOException, ResourceNotAvailableException { // extract ARC filename - String fileName = result.get(WaybackConstants.RESULT_ARC_FILE); + String fileName = result.getFile(); if(fileName == null || fileName.length() < 1) { throw new IOException("No ARC/WARC name in search result..."); } - // extract offset + convert to long - final String offsetString = result.get(WaybackConstants.RESULT_OFFSET); - if(offsetString == null || offsetString.length() < 1) { - throw new IOException("No ARC/WARC offset in search result..."); - } String urls[] = db.nameToUrls(fileName); if(urls == null || urls.length == 0) { throw new ResourceNotAvailableException("Unable to locate(" + 
fileName + ")"); } - final long offset = Long.parseLong(offsetString); + final long offset = result.getOffset(); Resource r = null; // TODO: attempt multiple threads? Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-07-01 23:46:33 UTC (rev 2375) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-07-01 23:47:50 UTC (rev 2376) @@ -8,9 +8,8 @@ import java.util.List; import org.archive.wayback.ResourceStore; -import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.Resource; -import org.archive.wayback.core.SearchResult; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.indexer.ArcIndexer; @@ -49,13 +48,6 @@ indexThread.start(); } } - protected String resultToFileName(SearchResult result) { - return result.get(WaybackConstants.RESULT_ARC_FILE); - } - - protected long resultToOffset(SearchResult result) { - return Long.parseLong(result.get(WaybackConstants.RESULT_OFFSET)); - } public File getLocalFile(String fileName) { // try adding suffixes: empty string is first in the list @@ -75,10 +67,10 @@ return null; } - public Resource retrieveResource(SearchResult result) throws IOException, + public Resource retrieveResource(CaptureSearchResult result) throws IOException, ResourceNotAvailableException { - String fileName = resultToFileName(result); - long offset = resultToOffset(result); + String fileName = result.getFile(); + long offset = result.getOffset(); File file = getLocalFile(fileName); if (file == null) { @@ -92,8 
+84,8 @@ } } - public CloseableIterator<SearchResult> indexFile(File dataFile) throws IOException { - CloseableIterator<SearchResult> itr = null; + public CloseableIterator<CaptureSearchResult> indexFile(File dataFile) throws IOException { + CloseableIterator<CaptureSearchResult> itr = null; String name = dataFile.getName(); if(name.endsWith(ARC_EXTENSION)) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |