From: <bra...@us...> - 2008-08-13 01:37:30
|
Revision: 2543 http://archive-access.svn.sourceforge.net/archive-access/?rev=2543&view=rev Author: bradtofel Date: 2008-08-13 01:37:38 +0000 (Wed, 13 Aug 2008) Log Message: ----------- TWEAK: tested & finalized for 1.4 release. NutchWax 0.12.1 does not currently work with Wayback as WERA did for replay: several problems, primarily that it does not index images, css, and likely several other formats. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,129 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + + This file contains the default WaybackCollection implementation shipped + with Wayback. 
It consists of a LocalResourceIndex using a BDBIndex, and + uses several Worker Threads to implement automatic indexing. + + Several beans defined in this file reference the "resourcefilelocationdb" + bean defined in wayback.xml. + + To customize where the automatic indexing system looks for ARC/WARC files, + see "resourcefilesourceupdater.sources" within this file. + + To customize the location where the automatic indexing state data is stored + you can modify "wayback.basedir" in wayback.xml, or replace the properties + in this file for further flexibility. + + For more information about the auto-indexing system, please see: + + http://archive-access.sourceforge.net/projects/wayback/resource_store.html + +--> + +<!-- + A LocalResourceIndex bean using a BDBIndex SearchResultSource. +--> + <bean id="localbdbresourceindex" class="org.archive.wayback.resourceindex.LocalResourceIndex"> + <property name="source"> + <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex"> + <property name="bdbName" value="DB1" /> + <property name="bdbPath" value="${wayback.basedir}/index/" /> + </bean> + </property> + <property name="maxRecords" value="10000" /> + </bean> + +<!-- + An IndexQueue implementation required for automatic indexing. 
+--> + <bean id="indexqueue" class="org.archive.wayback.resourcestore.indexer.DirectoryIndexQueue"> + <property name="path" value="${wayback.basedir}/index-data/queue" /> + </bean> + + <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean id="localresourcestore" class="org.archive.wayback.resourcestore.LocationDBResourceStore"> + <property name="db" ref="resourcefilelocationdb" /> + </bean> + </property> + + <property name="resourceIndex" ref="localbdbresourceindex"/> + + <property name="shutdownables"> + <list> + <!-- This thread notices new files appearing in your resourcefilesources --> + <bean id="resourcefilesourceupdater" class="org.archive.wayback.resourcestore.resourcefile.ResourceFileSourceUpdater"> + <property name="target" value="${wayback.basedir}/file-db/incoming" /> + <property name="interval" value="100000" /> + <property name="sources"> + <list> + <!-- + This example looks for ARC/WARC files recursively under 2 + directories: /tmp/wayback/files1 and /tmp/wayback/files2 + You can specify as few or as many ResourceFileSource instances + as needed, but each must have a unique 'name' property. 
+ --> + <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> + <property name="name" value="files1" /> + <property name="prefix" value="/tmp/wayback/files1/" /> + </bean> + <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> + <property name="name" value="files2" /> + <property name="prefix" value="/tmp/wayback/files2/" /> + </bean> + </list> + </property> + </bean> + + <!-- This thread updates the location db with updates from resourcefilesourceupdater --> + <bean id="resourcefilelocationdbupdater" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBUpdater"> + <property name="interval" value="100000" /> + <property name="db" ref="resourcefilelocationdb" /> + <property name="incomingDir" value="${wayback.basedir}/file-db/incoming" /> + <property name="stateDir" value="${wayback.basedir}/file-db/state" /> + </bean> + + <!-- This thread notices new files arriving in the filelocationdb, and queues them for indexing --> + <bean id="indexqueueupdater" class="org.archive.wayback.resourcestore.indexer.IndexQueueUpdater"> + <property name="db" ref="resourcefilelocationdb" /> + <property name="queue" ref="indexqueue" /> + <property name="interval" value="1000" /> + <property name="lastMark" value="${wayback.basedir}/index-data/queue.mark" /> + </bean> + + <!-- This thread checks the to-be-indexed queue for files needing indexing, indexes them, and hands off the results for merging with the ResourceIndex --> + <bean id="indexworker" class="org.archive.wayback.resourcestore.indexer.IndexWorker"> + <property name="db" ref="resourcefilelocationdb" /> + <property name="queue" ref="indexqueue" /> + <property name="interval" value="1000" /> + <property name="target"> + <bean class="org.archive.wayback.resourceindex.updater.IndexClient"> + <property name="tmpDir" value="${wayback.basedir}/index-data/tmp/" /> + <property name="target" value="${wayback.basedir}/index-data/incoming/" /> + 
</bean> + </property> + </bean> + + <!-- This thread merges updates from the indexworker into the ResourceIndex --> + <bean class="org.archive.wayback.resourceindex.updater.LocalResourceIndexUpdater"> + + <property name="index" ref="localbdbresourceindex" /> + <property name="incoming" value="${wayback.basedir}/index-data/incoming/" /> + <property name="failed" value="${wayback.basedir}/index-data/failed/" /> + <property name="merged" value="${wayback.basedir}/index-data/merged/" /> + <property name="runInterval" value="10000" /> + </bean> + </list> + </property> + </bean> + + +</beans> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,69 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + + This file contains a WaybackCollection implementation using a manually + created and administered CDX index file(s). It also uses the experimental + FlatFileResourceFileLocationDB, which enables mapping between ARC/WARC file + names and their absolute paths/URLs using a sorted text file + "path-index.txt". + + The format of the path-index.txt is + <NAME><TAB><PATH-OR-URL> + + Be sure to set the environment variable LC_ALL to "C" before sorting CDX + files and path-index.txt files. + + All paths in this file reference the Spring property placeholder + "wayback.basedir" defined in wayback.xml. 
That value may be changed to + alter top-level prefixes for these paths, or the values in this file can + be changed directly for further flexibility. + +--> + + <bean id="localcdxcollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.LocationDBResourceStore"> + <property name="db"> + <bean class="org.archive.wayback.resourcestore.locationdb.FlatFileResourceFileLocationDB"> + <property name="path" value="${wayback.basedir}/path-index.txt" /> + </bean> + </property> + </bean> + </property> + + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> + <property name="source"> + + <bean class="org.archive.wayback.resourceindex.cdx.CDXIndex"> + <property name="path" value="${wayback.basedir}/cdx-index/index.cdx" /> + </bean> + +<!-- + A CompositeSearchResultSource example, that allows searching through + multiple sorted CDX files. +--> +<!-- + <bean class="org.archive.wayback.resourceindex.CompositeSearchResultSource"> + <property name="CDXSources"> + <list> + <value>${wayback.basedir}/cdx-index/index-1.cdx</value> + <value>${wayback.basedir}/cdx-index/index-2.cdx</value> + </list> + </property> + </bean> +--> + </property> + <property name="maxRecords" value="10000" /> + </bean> + </property> + </bean> + +</beans> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,87 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+ xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + + This file contains an alternate "8080:wayback" AccessPoint demonstrating + several optional AccessPoint configurations. +--> + + <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> + <property name="collection" ref="localbdbcollection" /> + <property name="replay" ref="archivalurlreplay" /> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" /> + </bean> + </property> + + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/wayback/"/> + </bean> + </property> + + <property name="parser"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser"> + <property name="maxRecords" value="1000" /> + <property name="earliestTimestamp" value="1996" /> + </bean> + </property> + + <!-- + The following configuration enables free String key-value pairs to be + associated with this AccessPoint. These values can be used within customized .jsp UI files. + Here is some example .jsp code demonstrating access of the "Institution" + value: + UIResults results = UIResults.getGeneric(request); + String institution = results.getContextConfig("Institution"); + ... + + --> + <property name="configs"> + <props> + <prop key="Institution">Sample Institution</prop> + <prop key="Collection">Sample Collection</prop> + </props> + </property> + + <!-- + The following is a rather complex configuration example demonstrating + context-specific AccessControl configuration. Specifically, it causes any + request NOT originating INSIDE the 192.168.1.16/24 IP space to use the + specified Access Control Oracle to determine which documents are + accessible. 
Requests originating INSIDE the IP space have no access + control restrictions. + --> + + <property name="authentication"> + <bean class="org.archive.wayback.authenticationcontrol.AccessControlSettingOperation"> + <property name="operator"> + <bean class="org.archive.wayback.util.operator.NotBooleanOperator"> + <property name="operand"> + <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> + <property name="allowedRanges"> + <list> + <value>192.168.1.16/24</value> + </list> + </property> + </bean> + </property> + </bean> + </property> + <property name="factory"> + <bean class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> + <property name="oracleUrl" value="http://localhost:8180/oracle/" /> + <property name="accessGroup" value="ia_archiver" /> + </bean> + </property> + </bean> + </property> + </bean> + +</beans> Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> +<!-- + + The following WaybackCollection bean is an example using a NutchWAX + full-text index with Wayback, using a SimpleResourceStore to access + replayed documents. You will need to change searchUrlBase to your local + NutchWAX installation. + + Please note that Wayback is supported for use with NutchWax version 0.14.1 + or higher. 
+ + Please see RemoteCollection.xml for information on the meaning of the + SimpleResourceStore definition and options. + + You also need to ensure that the maxRecords on your RequestParser is not + greater than the maxRecords configured on the NutchResourceIndex. + +--> + + <bean id="remotenutchcollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.SimpleResourceStore"> + <property name="prefix" value="http://wayback.archive-it.org/fileproxy/" /> + </bean> + </property> + + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.NutchResourceIndex"> + <property name="searchUrlBase" value="http://ia400138.us.archive.org:8080/nutch-1.0-dev/opensearch" /> + <property name="maxRecords" value="100" /> + </bean> + </property> + </bean> + +</beans> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + + This file contains a WaybackCollection implementation using a + RemoteResourceIndex and a SimpleResourceStore. + + The RemoteResourceIndex implementation assumes that a Wayback is running on + the machine indicated by "searchUrlBase", which provides an XML-HTTP search + interface to the ResourceIndex on that machine. 
+ + The SimpleResourceStore implementation assumes that all ARC/WARC files are + accessible under the path/URL named in "prefix". + + When a path is specified as the "prefix", it is assumed that there exists a + single local directory containing all ARC/WARC files. + + When a URL is specified as the "prefix", it is assumed that all ARC/WARC + files are HTTP 1.1 exported under the directory denoted. The + FileProxyServlet defined in wayback.xml may be useful in installations + where ARC/WARC files are distributed across many machines, and it is + desirable to route all ARC/WARC resource requests through a single machine. + +--> + + <bean id="remotecollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.SimpleResourceStore"> + <property name="prefix" value="http://wayback.archive-it.org/fileproxy/" /> + </bean> + </property> + + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex"> + <property name="searchUrlBase" value="http://wayback.archive-it.org/1055/xmlquery" /> + </bean> + </property> + </bean> + +</beans> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-13 01:07:20 UTC (rev 2542) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -53,16 +53,16 @@ </bean> --> - <import resource="BDBCollection.xml"/> <!-- The XML files indicated in the following import tags contain alternate example implementations of WaybackCollections. 
--> + <import resource="NutchCollection.xml"/> +<!-- + <import resource="BDBCollection.xml"/> <import resource="CDXCollection.xml"/> <import resource="RemoteCollection.xml"/> - <import resource="NutchCollection.xml"/> -<!-- --> @@ -79,7 +79,7 @@ --> <import resource="ArchivalUrlReplay.xml"/> <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> - <property name="collection" ref="remotecollection" /> + <property name="collection" ref="localbdbcollection" /> <property name="replay" ref="archivalurlreplay" /> <property name="query"> <bean class="org.archive.wayback.query.Renderer"> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |