From: Brad <bra...@us...> - 2005-11-16 03:11:40
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30992/src/java/org/archive/wayback/cdx Added Files: CDXRecord.java LocalBDBResourceIndex.java BDBResourceIndex.java Log Message: Massive overhaul decomposing into three main categories of changes: 1) All internal datatypes are now extensible (currently Properties, but should be Maps) including: a) WaybackRequest (was WBRequest) b) SearchResults (was ResourceResults) c) SearchResult (was ResourceResult) d) Resource so that there is no longer an assumption of Archival URL queries, or "CDX-style" index results. This will put more responsibility on the UI components to interrogate SearchResults to decide how to render, but should enable extension to data returned from Indexes, as well as allow far more flexibility in queries, predominantly geared towards free-text searching. This is still somewhat clunky, as there are no convenience accessor methods, so all users refer to constants when interacting with them. 2) Major cleanup of servlet and filter interaction with servlet container. ReplayUI and QueryUI are now just plain old servlets, and filters can be optionally added to allow non-CGI argument requests to be coerced into standard WaybackRequest objects. 3) Alternate "Proxy" Replay mode is now functional, and some work has been done towards an alternate Nutch ResourceIndex. Currently the web.xml contains example configurations for both Proxy and Archival Url replay modes, but the Proxy-related configurations are commented out. Proxy mode *requires* changing the servlet context to ROOT. ArchivalUrl replay mode works as ROOT context and as any (I think) other context. There are some cosmetic double-slash issues to work out. --- NEW FILE: BDBResourceIndex.java --- /* BDBResourceIndex * * Created on 2005/10/18 14:00:00 * * Copyright (C) 2005 Internet Archive. 
* * This file is part of the Wayback Machine (crawler.archive.org). * * Wayback Machine is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Wayback Machine is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Wayback Machine; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.cdx; import java.io.File; import java.text.ParseException; import java.util.Iterator; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseConfig; import com.sleepycat.je.DatabaseEntry; import com.sleepycat.je.DatabaseException; import com.sleepycat.je.Environment; import com.sleepycat.je.EnvironmentConfig; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus; /** * ResourceResults-specific wrapper on top of the BDBJE database. 
* * @author Brad Tofel * @version $Date: 2005/11/16 03:11:30 $, $Revision: 1.1 $ */ public class BDBResourceIndex { private String path; private String dbName; Environment env = null; Database db = null; /** * Constructor * * @param thePath * directory where BDBJE files are stored * @param theDbName * name of BDB database * @throws DatabaseException */ public BDBResourceIndex(final String thePath, final String theDbName) throws DatabaseException { super(); initializeDB(thePath, theDbName); } protected void initializeDB(final String thePath, final String theDbName) throws DatabaseException { path = thePath; dbName = theDbName; EnvironmentConfig environmentConfig = new EnvironmentConfig(); environmentConfig.setAllowCreate(true); environmentConfig.setTransactional(false); File file = new File(path); env = new Environment(file, environmentConfig); DatabaseConfig databaseConfig = new DatabaseConfig(); databaseConfig.setAllowCreate(true); databaseConfig.setTransactional(false); // perform other database configurations db = env.openDatabase(null, dbName, databaseConfig); } /** * shut down the BDB. * * @throws DatabaseException */ public void shutdownDB() throws DatabaseException { if (db != null) { db.close(); } if (env != null) { env.close(); } } // TODO add aditional "replay" search method which allows passing in of // an exact date, and use a "scrolling window" of the best results, to // allow for returning the N closest results to a particular date, within // a specific window of dates... 
protected SearchResults doUrlSearch(final String url, final String firstDate, final String lastDate, final String exactHost, final int startRecord, final int maxRecords) { SearchResults results = new SearchResults(); DatabaseEntry key = new DatabaseEntry(); DatabaseEntry value = new DatabaseEntry(); int numRecords = 0; int numSkipped = 0; String searchStart = url + " " + firstDate; key.setData(searchStart.getBytes()); key.setPartial(false); try { Cursor cursor = db.openCursor(null, null); OperationStatus status = cursor.getSearchKeyRange(key, value, LockMode.DEFAULT); while (status == OperationStatus.SUCCESS) { // String keyString = new String(key.getData()); String valueString = new String(value.getData()); CDXRecord parser = new CDXRecord(); parser.parseLine(valueString, 0); if (!parser.url.equals(url)) { break; } if (parser.captureDate.compareTo(lastDate) > 0) { break; } if (parser.captureDate.compareTo(firstDate) >= 0) { if (numSkipped >= startRecord) { results.addSearchResult(parser.toSearchResult()); numRecords++; if (numRecords >= maxRecords) { results.putFilter(WaybackConstants.RESULTS_HAS_MORE, "true"); break; } } else { numSkipped++; } } status = cursor.getNext(key, value, LockMode.DEFAULT); } cursor.close(); } catch (DatabaseException dbe) { // TODO: let this bubble up as Index error dbe.printStackTrace(); } catch (ParseException e) { // TODO: let this bubble up as Index error e.printStackTrace(); } return results; } protected SearchResults doUrlPrefixSearch(final String urlPrefix, final String firstDate, final String lastDate, final String exactHost, final int startRecord, final int maxRecords) { SearchResults results = new SearchResults(); DatabaseEntry key = new DatabaseEntry(); DatabaseEntry value = new DatabaseEntry(); int numRecords = 0; int numSkipped = 0; String searchStart = urlPrefix; key.setData(searchStart.getBytes()); key.setPartial(false); try { Cursor cursor = db.openCursor(null, null); OperationStatus status = 
cursor.getSearchKeyRange(key, value, LockMode.DEFAULT); while (status == OperationStatus.SUCCESS) { String valueString = new String(value.getData()); CDXRecord parser = new CDXRecord(); parser.parseLine(valueString, 0); if (!parser.url.startsWith(urlPrefix)) { break; } if ((parser.captureDate.compareTo(lastDate) <= 0) && (parser.captureDate.compareTo(firstDate) >= 0)) { if (numSkipped >= startRecord) { results.addSearchResult(parser.toSearchResult()); numRecords++; if (numRecords >= maxRecords) { // TODO should this be here?... results.putFilter(WaybackConstants.RESULTS_HAS_MORE, "true"); break; } } else { numSkipped++; } } status = cursor.getNext(key, value, LockMode.DEFAULT); } cursor.close(); } catch (DatabaseException dbe) { // TODO: let this bubble up as Index error dbe.printStackTrace(); } catch (ParseException e) { // TODO: let this bubble up as Index error e.printStackTrace(); } return results; } /** * Add all ResourceResult in results to BDB index * @param results * @throws Exception */ public void addResults(SearchResults results) throws Exception { Iterator itr = results.iterator(); DatabaseEntry key = new DatabaseEntry(); DatabaseEntry value = new DatabaseEntry(); OperationStatus status = null; CDXRecord parser = new CDXRecord(); try { Cursor cursor = db.openCursor(null, null); while (itr.hasNext()) { SearchResult result = (SearchResult) itr.next(); parser.fromSearchResult(result); String keyString = parser.toKey(); String valueString = parser.toValue(); key.setData(keyString.getBytes()); value.setData(valueString.getBytes()); status = cursor.put(key, value); if (status != OperationStatus.SUCCESS) { throw new Exception("oops, put had non-success status"); } } cursor.close(); } catch (DatabaseException e) { e.printStackTrace(); } } } --- NEW FILE: CDXRecord.java --- /* CDXRecord * * $Id: CDXRecord.java,v 1.1 2005/11/16 03:11:30 bradtofel Exp $ * * Created on 4:40:45 PM Nov 10, 2005. * * Copyright (C) 2005 Internet Archive. 
* * This file is part of wayback. * * wayback is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * wayback is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with wayback; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.cdx; import java.text.ParseException; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.SearchResult; /** * * * @author brad * @version $Date: 2005/11/16 03:11:30 $, $Revision: 1.1 $ */ public class CDXRecord { public String url; public String captureDate; public String origHost = null; public String mimeType = null; public String httpResponseCode = null; public String md5Fragment = null; public String redirectUrl = null; public long compressedOffset = -1; public String arcFileName = null; public CDXRecord() { super(); } /** * Attempt to deserialize state from a single text line, fields delimited by * spaces. There are standard ways to do this, and this is not one of * them... for no good reason. 
* * @param line * @param lineNumber * @throws ParseException */ public void parseLine(final String line, final int lineNumber) throws ParseException { String[] tokens = line.split(" "); if (tokens.length != 9) { throw new ParseException(line, lineNumber); } url = tokens[0]; captureDate = tokens[1]; origHost = tokens[2]; mimeType = tokens[3]; httpResponseCode = tokens[4]; md5Fragment = tokens[5]; redirectUrl = tokens[6]; compressedOffset = Long.parseLong(tokens[7]); arcFileName = tokens[8]; } public SearchResult toSearchResult() { SearchResult result = new SearchResult(); result.put(WaybackConstants.RESULT_URL, url); result.put(WaybackConstants.RESULT_CAPTURE_DATE, captureDate); result.put(WaybackConstants.RESULT_ORIG_HOST, origHost); result.put(WaybackConstants.RESULT_MIME_TYPE, mimeType); result.put(WaybackConstants.RESULT_HTTP_CODE, httpResponseCode); result.put(WaybackConstants.RESULT_MD5_DIGEST, md5Fragment); result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl); // HACKHACK: result.put(WaybackConstants.RESULT_OFFSET, "" + compressedOffset); result.put(WaybackConstants.RESULT_ARC_FILE, arcFileName); return result; } public void fromSearchResult(final SearchResult result) { url = result.get(WaybackConstants.RESULT_URL); captureDate = result.get(WaybackConstants.RESULT_CAPTURE_DATE); origHost = result.get(WaybackConstants.RESULT_ORIG_HOST); mimeType = result.get(WaybackConstants.RESULT_MIME_TYPE); httpResponseCode = result.get(WaybackConstants.RESULT_HTTP_CODE); md5Fragment = result.get(WaybackConstants.RESULT_MD5_DIGEST); redirectUrl = result.get(WaybackConstants.RESULT_REDIRECT_URL); compressedOffset = Long.parseLong(result.get( WaybackConstants.RESULT_OFFSET)); arcFileName = result.get(WaybackConstants.RESULT_ARC_FILE); } public String toValue() { return url + " " + captureDate + " " + origHost + " " + mimeType + " " + httpResponseCode + " " + md5Fragment + " " + redirectUrl + " " + compressedOffset + " " + arcFileName; } public String toKey() { return 
url + " " + captureDate; } } --- NEW FILE: LocalBDBResourceIndex.java --- /* LocalBDBResourceIndex * * Created on 2005/10/18 14:00:00 * * Copyright (C) 2005 Internet Archive. * * This file is part of the Wayback Machine (crawler.archive.org). * * Wayback Machine is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Wayback Machine is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Wayback Machine; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.cdx; import java.text.ParseException; import java.util.Properties; import org.apache.commons.httpclient.URIException; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.wayback.ResourceIndex; import org.archive.wayback.WaybackConstants; import org.archive.wayback.cdx.indexer.IndexPipeline; import org.archive.wayback.core.Timestamp; import org.archive.wayback.core.SearchResults; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; import com.sleepycat.je.DatabaseException; /** * Implements ResourceIndex interface using a BDBResourceIndex * * @author Brad Tofel * @version $Date: 2005/11/16 03:11:30 $, $Revision: 1.1 $ */ public class LocalBDBResourceIndex implements ResourceIndex { private final static String INDEX_PATH = 
"resourceindex.indexpath"; private final static String DB_NAME = "resourceindex.dbname"; private final static int MAX_RECORDS = 1000; private BDBResourceIndex db = null; private IndexPipeline pipeline = null; /** * Constructor */ public LocalBDBResourceIndex() { super(); } public void init(Properties p) throws ConfigurationException { System.out.println("initializing LocalDBDResourceIndex..."); String dbPath = (String) p.get(INDEX_PATH); if (dbPath == null || (dbPath.length() <= 0)) { throw new IllegalArgumentException("Failed to find " + INDEX_PATH); } String dbName = (String) p.get(DB_NAME); if (dbName == null || (dbName.length() <= 0)) { throw new IllegalArgumentException("Failed to find " + DB_NAME); } try { db = new BDBResourceIndex(dbPath, dbName); } catch (DatabaseException e) { e.printStackTrace(); throw new ConfigurationException(e.getMessage()); } pipeline = new IndexPipeline(); pipeline.init(p); } public SearchResults query(WaybackRequest wbRequest) throws ResourceIndexNotAvailableException, ResourceNotInArchiveException, BadQueryException { UURI searchURI; String searchHost; String searchPath; int resultsPerPage = wbRequest.getResultsPerPage(); int pageNum = wbRequest.getPageNum(); int startResult; String searchUrl = wbRequest.get(WaybackConstants.REQUEST_URL); String searchType = wbRequest.get(WaybackConstants.REQUEST_TYPE); String startDate = wbRequest.get(WaybackConstants.REQUEST_START_DATE); String endDate = wbRequest.get(WaybackConstants.REQUEST_END_DATE); if (resultsPerPage < 1) { throw new BadQueryException("resultsPerPage cannot be < 1"); } if (resultsPerPage > MAX_RECORDS) { throw new BadQueryException("resultsPerPage cannot be > " + MAX_RECORDS); } if(pageNum < 1) { throw new BadQueryException("pageNum must be > 0"); } startResult = (pageNum - 1) * resultsPerPage; if ((searchUrl == null) || (searchUrl.length() == 0)) { throw new BadQueryException(WaybackConstants.REQUEST_URL + " must be specified"); } if ((searchType == null) || 
(searchType.length() == 0)) { throw new BadQueryException(WaybackConstants.REQUEST_TYPE + " must be specified"); } if ((startDate == null) || (startDate.length() == 0)) { try { startDate = Timestamp.earliestTimestamp().getDateStr(); } catch (ParseException e) { e.printStackTrace(); throw new BadQueryException("unexpected data error " + e.getMessage()); } } if ((endDate == null) || (endDate.length() == 0)) { try { endDate = Timestamp.currentTimestamp().getDateStr(); } catch (ParseException e) { e.printStackTrace(); throw new BadQueryException("unexpected data error " + e.getMessage()); } } try { if (searchUrl.startsWith("http://")) { if (-1 == searchUrl.indexOf('/', 8)) { searchUrl = searchUrl + "/"; } } else { if (!searchUrl.contains("/")) { searchUrl = searchUrl + "/"; } searchUrl = "http://" + searchUrl; } searchURI = UURIFactory.getInstance(searchUrl); searchHost = searchURI.getHostBasename(); searchPath = searchURI.getEscapedPathQuery(); } catch (URIException e) { e.printStackTrace(); throw new BadQueryException("Problem with URI " + e.getMessage()); } String keyUrl = searchHost + searchPath; SearchResults results; if (searchType.equals(WaybackConstants.REQUEST_REPLAY_QUERY)) { results = db.doUrlSearch(keyUrl, startDate, endDate, null, startResult, resultsPerPage); } else if (searchType.equals(WaybackConstants.REQUEST_URL_QUERY)) { results = db.doUrlSearch(keyUrl, startDate, endDate, null, startResult, resultsPerPage); } else if (searchType.equals( WaybackConstants.REQUEST_URL_PREFIX_QUERY)) { results = db.doUrlPrefixSearch(keyUrl, startDate, endDate, null, startResult, resultsPerPage); } else { throw new BadQueryException("Unknown query type, must be " + WaybackConstants.REQUEST_REPLAY_QUERY + ", " + WaybackConstants.REQUEST_URL_QUERY + ", or " + WaybackConstants.REQUEST_URL_PREFIX_QUERY); } if(results.isEmpty()) { throw new ResourceNotInArchiveException("the URL " + keyUrl + " is not in the archive."); } results.putFilter(WaybackConstants.REQUEST_URL,keyUrl); 
results.putFilter(WaybackConstants.REQUEST_START_DATE,startDate); results.putFilter(WaybackConstants.REQUEST_END_DATE,endDate); results.putFilter(WaybackConstants.RESULTS_FIRST_RECORD,""+startResult); return results; } } |