You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bi...@us...> - 2010-10-27 06:56:49
|
Revision: 3305 http://archive-access.svn.sourceforge.net/archive-access/?rev=3305&view=rev Author: binzino Date: 2010-10-27 06:56:42 +0000 (Wed, 27 Oct 2010) Log Message: ----------- Initial revision. Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeFilter.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeNormalizer.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/URLFilter.java Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeFilter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeFilter.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeFilter.java 2010-10-27 06:56:42 UTC (rev 3305) @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.index; + +import java.io.*; +import java.util.*; + +public class TypeFilter +{ + public static final String[] DEFAULT_ALLOWED = + { + "text/html", + "text/plain", + "application/pdf", + // MS Office document types + "application/msword", + "application/vnd.ms-powerpoint", + // OpenOffice document types + "application/vnd.oasis.opendocument.text", + "application/vnd.oasis.opendocument.presentation", + "application/vnd.oasis.opendocument.spreadsheet", + }; + + private Set<String> allowed; + private TypeNormalizer normalizer; + + public TypeFilter( ) + { + + } + + public TypeFilter( Set<String> allowed, TypeNormalizer normalizer ) + { + this.allowed = allowed; + this.normalizer = normalizer; + } + + public void setTypeNormalizer( TypeNormalizer normalizer ) + { + this.normalizer = normalizer; + } + + public void setAllowed( Set<String> allowed ) + { + this.allowed = allowed; + } + + public Set<String> getAllowed( ) + { + return this.allowed; + } + + public boolean isAllowed( String type ) + { + // If no explicit list of allowed types, allow them all. + if ( this.allowed == null || this.allowed.size( ) == 0 ) + { + return true; + } + + // De-alias it. 
+ type = this.normalizer.normalize( type ); + + return allowed.contains( type ); + } + + public static Set<String> parse( String s ) + { + Set<String> types = new HashSet<String>( ); + + for ( String type : s.split( "\\s+" ) ) + { + if ( type.length() < 1 ) continue ; + + types.add( type ); + } + + return types; + } + + public static Set<String> getDefaultAllowed( ) + { + Set<String> defaults = new HashSet<String>( ); + + for ( String allowed : DEFAULT_ALLOWED ) + { + defaults.add( allowed ); + } + + return defaults; + } + +} Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeNormalizer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeNormalizer.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/TypeNormalizer.java 2010-10-27 06:56:42 UTC (rev 3305) @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.index; + +import java.io.*; +import java.util.*; + +public class TypeNormalizer +{ + // Maps alias->canonical + public static final String[][] DEFAULT_ALIASES = + { + // PDF aliases + { "application/x-pdf", "application/pdf" }, + // HTML aliases. + { "application/xhtml+xml", "text/html" }, + // MS Word aliases. + { "application/vnd.ms-word", "application/msword" }, + { "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword" }, + // PowerPoint aliases. + {"application/mspowerpoint", "application/vnd.ms-powerpoint" }, + {"application/ms-powerpoint", "application/vnd.ms-powerpoint" }, + {"application/mspowerpnt", "application/vnd.ms-powerpoint" }, + {"application/vnd-mspowerpoint", "application/vnd.ms-powerpoint" }, + {"application/powerpoint", "application/vnd.ms-powerpoint" }, + {"application/x-powerpoint", "application/vnd.ms-powerpoint" }, + {"application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint" }, + }; + + private Map<String,String> aliases; + + public static Map<String,String> getDefaultAliases( ) + { + Map<String,String> defaults = new HashMap<String,String>( ); + + for ( String[] alias : DEFAULT_ALIASES ) + { + defaults.put( alias[0], alias[1] ); + } + + return defaults; + } + + public static Map<String,String> parseAliases( String s ) + { + Map<String,String> aliases = new HashMap<String,String>( ); + + for ( String line : s.split( "\\s+" ) ) + { + if ( line.length() < 1 ) continue ; + + String[] tokens = line.split( "[:,]" ); + + if ( tokens.length < 2 ) continue ; + + String type = tokens[0]; + + if ( type.length() < 1 ) continue ; + + for ( int i = 1; i < tokens.length ; i++ ) + { + aliases.put( tokens[i], type ); + } + } + + return aliases; + } + + public void setAliases( Map<String,String> aliases ) + { + this.aliases = aliases; + } + + public Map<String,String> getAliases( ) + { + return this.aliases; + } + + public String 
normalize( String type ) + { + // Chop off anything after a ';' character. This is + // for stuff like: "text/html; charset=utf-8" + int p = type.indexOf( ';' ); + if ( p >= 0 ) type = type.substring( 0, p ).trim(); + + if ( this.aliases != null && this.aliases.containsKey( type ) ) + { + type = this.aliases.get( type ); + } + + return type; + } + +} Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/URLFilter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/URLFilter.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/URLFilter.java 2010-10-27 06:56:42 UTC (rev 3305) @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax.index; + +import java.io.*; +import java.net.*; +import java.util.*; + +public class URLFilter +{ + public static final String[] DEFAULT_PROHIBITED = + { + "/robots.txt", + "/favicon.ico", + }; + + private Set<String> prohibited; + + public URLFilter( Set<String> prohibited ) + { + this.prohibited = prohibited; + } + + public void setProhibited( Set<String> prohibited ) + { + this.prohibited = prohibited; + } + + public Set<String> getProhibited( ) + { + return this.prohibited; + } + + public boolean isAllowed( URI uri ) + { + String path = uri.getRawPath( ); + + return ! this.prohibited.contains( path ); + } + + public static Set<String> parse( String s ) + { + Set<String> paths = new HashSet<String>( ); + + for ( String type : s.split( "\\s+" ) ) + { + if ( type.length() < 1 ) continue ; + + paths.add( type ); + } + + return paths; + } + + public static Set<String> getDefaultProhibited( ) + { + Set<String> defaults = new HashSet<String>( ); + + for ( String prohibited : DEFAULT_PROHIBITED ) + { + defaults.add( prohibited ); + } + + return defaults; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-10-27 06:48:52
|
Revision: 3304 http://archive-access.svn.sourceforge.net/archive-access/?rev=3304&view=rev Author: binzino Date: 2010-10-27 06:48:46 +0000 (Wed, 27 Oct 2010) Log Message: ----------- Initial checkin of BoilerPipe and dependent libraries. Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/boilerpipe-1.0.3.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/nekohtml-1.9.13.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/xerces-2.9.1.jar Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/boilerpipe-1.0.3.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/boilerpipe-1.0.3.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/nekohtml-1.9.13.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/nekohtml-1.9.13.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/xerces-2.9.1.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/lib/xerces-2.9.1.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-22 23:48:44
|
Revision: 3303 http://archive-access.svn.sourceforge.net/archive-access/?rev=3303&view=rev Author: bradtofel Date: 2010-10-22 23:48:38 +0000 (Fri, 22 Oct 2010) Log Message: ----------- PRE-1.6.0 POM version info Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/pom.xml trunk/archive-access/projects/wayback/pom.xml trunk/archive-access/projects/wayback/wayback-core/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml trunk/archive-access/projects/wayback/wayback-webapp/pom.xml Modified: trunk/archive-access/projects/wayback/dist/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/pom.xml 2010-10-22 23:48:07 UTC (rev 3302) +++ trunk/archive-access/projects/wayback/dist/pom.xml 2010-10-22 23:48:38 UTC (rev 3303) @@ -3,7 +3,7 @@ <parent> <groupId>org.archive</groupId> <artifactId>wayback</artifactId> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> </parent> <modelVersion>4.0.0</modelVersion> @@ -53,7 +53,7 @@ <dependency> <groupId>org.archive.wayback</groupId> <artifactId>wayback-core</artifactId> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> </dependency> </dependencies> <build> Modified: trunk/archive-access/projects/wayback/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2010-10-22 23:48:07 UTC (rev 3302) +++ trunk/archive-access/projects/wayback/pom.xml 2010-10-22 23:48:38 UTC (rev 3303) @@ -17,10 +17,10 @@ <groupId>org.archive</groupId> <artifactId>wayback</artifactId> <properties> - <globalVersion>1.5.3-SNAPSHOT</globalVersion> + <globalVersion>1.6.0</globalVersion> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> </properties> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> <packaging>pom</packaging> <name>Wayback</name> @@ -246,15 +246,4 @@ 
</plugins> </reporting> -<!--Needed because we have test code under src/java. - <dependencies> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>3.8.1</version> - <scope>test</scope> - </dependency> - </dependencies> - --> - </project> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-10-22 23:48:07 UTC (rev 3302) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-10-22 23:48:38 UTC (rev 3303) @@ -17,7 +17,7 @@ <parent> <groupId>org.archive</groupId> <artifactId>wayback</artifactId> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> </parent> <groupId>org.archive.wayback</groupId> <artifactId>wayback-core</artifactId> @@ -99,6 +99,12 @@ <artifactId>htmlparser</artifactId> <version>1.6</version> </dependency> + <dependency> + <groupId>com.flagstone</groupId> + <artifactId>transform</artifactId> + <version>3.0.1-SNAPSHOT</version> + </dependency> + <!-- Doh... I'm not sure what package is configuring org.apache.commons-logging to use log4j, but it's breaking some command line tools. 
Modified: trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2010-10-22 23:48:07 UTC (rev 3302) +++ trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2010-10-22 23:48:38 UTC (rev 3303) @@ -2,13 +2,13 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive</groupId> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> </parent> <modelVersion>4.0.0</modelVersion> <groupId>org.archive.wayback</groupId> <artifactId>wayback-hadoop</artifactId> <name>Wayback Hadoop Jar Packaging</name> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> <url>http://maven.apache.org</url> <packaging>pom</packaging> <dependencies> @@ -21,7 +21,7 @@ <dependency> <groupId>org.archive.wayback</groupId> <artifactId>wayback-hadoop-java</artifactId> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> <scope>compile</scope> </dependency> </dependencies> @@ -37,16 +37,7 @@ <finalName>wayback-hadoop</finalName> <archive> <manifestFile>src/main/archive/MANIFEST.MF</manifestFile> -<!-- - <manifest> - <mainClass>org.archive.wayback.hadoop.SortDriver</mainClass> - </manifest> - <manifestEntries> - <Class-Path>hadoop-0.19.1-core.jar lib/commons-cli-2.0-SNAPSHOT.jar lib/commons-codec-1.3.jar lib/commons-httpclient-3.0.1.jar lib/commons-logging-1.0.4.jar lib/commons-logging-api-1.0.4.jar lib/commons-net-1.4.1.jar lib/hsqldb-1.8.0.10.jar lib/jets3t-0.6.1.jar lib/jetty-5.1.4.jar lib/junit-3.8.1.jar lib/kfs-0.2.0.jar lib/log4j-1.2.15.jar lib/oro-2.0.8.jar lib/servlet-api.jar lib/slf4j-api-1.4.3.jar lib/slf4j-log4j12-1.4.3.jar lib/xmlenc-0.52.jar lib/jetty-ext/commons-el.jar lib/jetty-ext/jasper-compiler.jar lib/jetty-ext/jasper-runtime.jar lib/jetty-ext/jsp-api.jar</Class-Path> - </manifestEntries> ---> </archive> - </configuration> <executions> <execution> Modified: 
trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2010-10-22 23:48:07 UTC (rev 3302) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2010-10-22 23:48:38 UTC (rev 3303) @@ -3,13 +3,13 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive</groupId> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> </parent> <modelVersion>4.0.0</modelVersion> <groupId>org.archive.wayback</groupId> <artifactId>wayback-hadoop-java</artifactId> <name>Wayback Hadoop Java Code</name> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> <url>http://maven.apache.org</url> <packaging>jar</packaging> @@ -37,7 +37,7 @@ <groupId>org.archive.wayback</groupId> <artifactId>wayback-core</artifactId> <scope>compile</scope> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> </dependency> </dependencies> <build> Modified: trunk/archive-access/projects/wayback/wayback-webapp/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2010-10-22 23:48:07 UTC (rev 3302) +++ trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2010-10-22 23:48:38 UTC (rev 3303) @@ -3,7 +3,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive</groupId> - <version>1.5.3-SNAPSHOT</version> + <version>1.6.0</version> </parent> <modelVersion>4.0.0</modelVersion> <groupId>org.archive.wayback</groupId> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-22 23:48:13
|
Revision: 3302 http://archive-access.svn.sourceforge.net/archive-access/?rev=3302&view=rev Author: bradtofel Date: 2010-10-22 23:48:07 +0000 (Fri, 22 Oct 2010) Log Message: ----------- Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml 2010-10-22 23:47:57 UTC (rev 3301) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml 2010-10-22 23:48:07 UTC (rev 3302) @@ -38,5 +38,10 @@ <property name="proxyHostPort" value="localhost:3128" /> --> </bean> + <bean id="excluder-factory-robot" class="org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory"> + <property name="maxCacheMS" value="86400000" /> + <property name="userAgent" value="ia_archiver" /> + <property name="webCache" ref="proxylivewebcache" /> + </bean> </beans> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-22 23:48:03
|
Revision: 3301 http://archive-access.svn.sourceforge.net/archive-access/?rev=3301&view=rev Author: bradtofel Date: 2010-10-22 23:47:57 +0000 (Fri, 22 Oct 2010) Log Message: ----------- Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2010-10-22 23:47:16 UTC (rev 3300) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2010-10-22 23:47:57 UTC (rev 3301) @@ -323,7 +323,7 @@ <!-- <import resource="DomainPrefixReplay.xml"/> <bean name="8081" parent="8080:wayback"> - <property name="urlRoot" value="http://localhost.archive.org:8081/" /> + <property name="queryRoot" value="http://localhost.archive.org:8081/" /> <property name="replay" ref="domainprefixreplay" /> <property name="uriConverter"> <bean class="org.archive.wayback.domainprefix.DomainPrefixResultURIConverter"> @@ -356,7 +356,7 @@ <property name="bounceToQueryPrefix" value="false" /> <property name="refererAuth" value="" /> - <property name="urlRoot" value="http://localhost.archive.org:8090/" /> + <property name="queryRoot" value="http://localhost.archive.org:8090/" /> <property name="replay" ref="proxyreplay" /> <property name="uriConverter"> <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-22 23:47:22
|
Revision: 3300 http://archive-access.svn.sourceforge.net/archive-access/?rev=3300&view=rev Author: bradtofel Date: 2010-10-22 23:47:16 +0000 (Fri, 22 Oct 2010) Log Message: ----------- Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2010-10-22 22:36:42 UTC (rev 3299) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2010-10-22 23:47:16 UTC (rev 3300) @@ -152,12 +152,10 @@ </bean> <!-- SWF Rewriting, highly experimental --> - <!-- <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> <property name="mimeContains"> <list> <value>application/x-shockwave-flash</value> - <value>application/xhtml</value> </list> </property> <property name="renderer"> @@ -166,7 +164,6 @@ </bean> </property> </bean> - --> <!-- CSS REPLAY --> <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-22 22:36:48
|
Revision: 3299 http://archive-access.svn.sourceforge.net/archive-access/?rev=3299&view=rev Author: bradtofel Date: 2010-10-22 22:36:42 +0000 (Fri, 22 Oct 2010) Log Message: ----------- very old info.. Removed Paths: ------------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/developer_manual.xml Deleted: trunk/archive-access/projects/wayback/dist/src/site/xdoc/developer_manual.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/developer_manual.xml 2010-10-22 22:35:14 UTC (rev 3298) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/developer_manual.xml 2010-10-22 22:36:42 UTC (rev 3299) @@ -1,29 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<document> - <properties> - <author email="brad.AT.archive.DOT.org">Brad Tofel</author> - <title>Developer Manual</title> - </properties> - <meta name="keyword" content="wayback machine, heritrix, java"/> - <body> - <section name="Introduction"> - <p> - The Wayback Machine is a pure Java application that allows - web browsers to access and search content stored in a set of ARC - files. - </p> - </section> - <section name="Setting up the developement Environment"> - <p> - Please see <a href="developer_environment.html">this page</a>. - </p> - </section> - <section name="Query .jsp customizations"> - <p> - Please see the reference implementation .jsp files for examples, and - the API docs (and source) for - org.archive.wayback.core.UIResults.java. - </p> - </section> - </body> -</document> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-22 22:35:21
|
Revision: 3298 http://archive-access.svn.sourceforge.net/archive-access/?rev=3298&view=rev Author: bradtofel Date: 2010-10-22 22:35:14 +0000 (Fri, 22 Oct 2010) Log Message: ----------- PRE 1.6.0 doc update Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/site/site.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/navigation.xml Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/access_point_naming.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/hadoop.xml Modified: trunk/archive-access/projects/wayback/dist/src/site/site.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/site.xml 2010-10-22 22:34:24 UTC (rev 3297) +++ trunk/archive-access/projects/wayback/dist/src/site/site.xml 2010-10-22 22:35:14 UTC (rev 3298) @@ -28,9 +28,8 @@ <menu name="Overview"> <item name="Requirements" href="requirements.html"/> <item name="Downloads" href="downloads.html"/> - <item name="User Manual" href="user_manual.html"/> <item name="Administrator Manual" href="administrator_manual.html"/> - <item name="Developer Manual" href="developer_manual.html"/> + <item name="Hadoop CDX Generation" href="hadoop.html"/> <item name="Release Notes" href="release_notes.html"/> <item name="FAQ" href="/faq.html"/> <item name="API" href="./apidocs"/> Added: trunk/archive-access/projects/wayback/dist/src/site/xdoc/access_point_naming.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/access_point_naming.xml (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/access_point_naming.xml 2010-10-22 22:35:14 UTC (rev 3298) @@ -0,0 +1,287 @@ +<?xml version="1.0" encoding="utf-8"?> +<document> + <properties> + 
<title>Access Point Naming</title> + <author email="brad at archive dot org">Brad Tofel</author> + <revision>$$Id$$</revision> + </properties> + + <body> + + + + <section name="Overview"> + <p> + Tomcat (or other servlet containers) are configured to listen on one or + more ports, so each request received on one of those ports is targeted + to a particular webapp based on the name of the .war file deployed under + the <b>webapps/</b> directory. The targeted webapp is determined based on + the first directory in incoming requests. + </p> + <p> + If there are two webapps deployed under the <b>webapps/</b> directory, + called <b>webappA.war</b> and <b>webappB.war</b>, then an incoming + request <b>/webappA/file1</b> will be received by the webapp inside + <b>webappA.war</b> as the request <b>/file1</b>. An incoming request + for <b>webappB/images/foo.gif</b> will be received by the webapp inside + <b>webappB.war</b> as <b>/images/foo.gif</b>. + </p> + <p> + Tomcat (and other servlet containers) allow a special .war file to be + deployed under the <b>webapps/</b> directory called <b>ROOT.war</b> + which will receive requests not matching another webapp. If the above + example also included a webapp deployed under the <b>webapps/</b> + directory named <b>ROOT.war</b>, then requests starting with <b>webappA/</b> + will be received by <b>webappA.war</b>, requests starting with <b>webappB/</b> + will be received by <b>webappB.war</b>, and all other requests will be + received by the <b>ROOT.war</b> webapp. + </p> + <p> + If possible, deploying your webapp as <b>ROOT.war</b> will result in + somewhat cleaner public URLs, but this is not a requirement. The + examples below all include alternate URL configuration prefixes depending + on whether you deploy the Wayback .war file as either <b>ROOT.war</b> or + <b>wayback.war</b>. 
+ </p> + <subsection name="AccessPoint Names"> + <p> + Each AccessPoint Spring XML bean definition must include a <b>name</b> + property: + <br></br> + <code> + +<bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> + ... +</bean> + + </code> + <br></br> + The <b>name</b> property indicates how requests <b>that are received + by the Wayback webapp</b> are routed to the appropriate AccessPoint. + Wayback allows targeting AccessPoints based on: + <ul> + <li>hostname</li> + <li>port</li> + <li>first path <b>after</b> the optional webapp deployment name + (which is empty if you deploy your Wayback webapp as + <b>ROOT.war</b>)</li> + </ul> + using the AccessPoint bean <b>name</b> field composed of <b>hostname</b>:<b>port</b>:<b>first_path</b>. + </p> + <p> + If you have configured DNS to resolve multiple hostnames to the same + computer, you can use the <b>hostname:</b> to control AccessPoint + resolving based on virtual hosts. + </p> + <p> + Port is the only required configuration component within the + AccessPoint <b>name</b> configuration. If you have multiple Tomcat + <b>Connector</b>s you can alter this AccessPoint name configuration to + target specific AccessPoints, otherwise, all your AccessPoint names + will have the same port, likely one of 8080, or 80. + </p> + <p> + A more commonly useful AccessPoint name resolving component is the + <b>first-path</b>, which allows you to easily expose multiple + collections within a single Wayback webapp deployment, without varying + hostnames, or ports (which often require network or system + administrator assistance). + </p> + </subsection> + <subsection name="Example AccessPoint names and URLs"> + <p> + The following table shows how urls will map to particular AccessPoints + assuming you have deployed the Wayback webapp as <b>ROOT.war</b>, on + a host with the name "access.example.org", using port 8080. 
+ <table> + <tr> + <th>Access Point bean name</th> + <th>Archival URL prefix</th> + <th>Archival URL query example for <b>http://archive.org</b></th> + </tr> + <tr> + <td>8080:collectionA</td> + <td>http://access.example.org:8080/collectionA/</td> + <td>http://access.example.org:8080/collectionA/*/http://archive.org/</td> + </tr> + <tr> + <td>8080:collectionB</td> + <td>http://access.example.org:8080/collectionB/</td> + <td>http://access.example.org:8080/collectionB/*/http://archive.org/</td> + </tr> + </table> + </p> + <p> + If you deployed your Wayback webapp with the name <b>wayback.war</b> + the following table shows how urls will map to particular + AccessPoints, on a host with the name "access.example.org", using port + 8080. + <table> + <tr> + <th>Access Point bean name</th> + <th>Archival URL prefix</th> + <th>Archival URL query example for <b>http://archive.org</b></th> + </tr> + <tr> + <td>8080:collectionA</td> + <td>http://access.example.org:8080/wayback/collectionA/</td> + <td>http://access.example.org:8080/wayback/collectionA/*/http://archive.org/</td> + </tr> + <tr> + <td>8080:collectionB</td> + <td>http://access.example.org:8080/wayback/collectionB/</td> + <td>http://access.example.org:8080/wayback/collectionB/*/http://archive.org/</td> + </tr> + </table> + </p> + <p> + If you have configured multiple <b>Connector</b>s for your Tomcat + server, listening on both port <b>80</b>, and port <b>8080</b>, and + you deploy <b>ROOT.war</b> you can target different AccessPoints by + port, as shown below. These examples assume your server's hostname is + still "access.example.org". 
+ <table> + <tr> + <th>Access Point bean name</th> + <th>Archival URL prefix</th> + <th>Archival URL query example for <b>http://archive.org</b></th> + </tr> + <tr> + <td>80:collectionA</td> + <td>http://access.example.org/collectionA/</td> + <td>http://access.example.org/collectionA/*/http://archive.org/</td> + </tr> + <tr> + <td>8080:collectionB</td> + <td>http://access.example.org:8080/collectionB/</td> + <td>http://access.example.org:8080/collectionB/*/http://archive.org/</td> + </tr> + <tr> + <td>80:collectionC</td> + <td>http://access.example.org/collectionC/</td> + <td>http://access.example.org/collectionC/*/http://archive.org/</td> + </tr> + </table> + </p> + <p> + If you have a very limited number of AccessPoints to expose, you can + do away with the <b>first-path</b> component, to achieve potentially + very uncluttered Archival URLs. Assuming multiple <b>Connector</b>s + for your Tomcat server, listening on both port <b>80</b>, and port + <b>8080</b>, and you deploy <b>ROOT.war</b> you can target different + AccessPoints by port alone, as shown below. These examples still + assume your servers hostname is "access.example.org". + <table> + <tr> + <th>Access Point bean name</th> + <th>Archival URL prefix</th> + <th>Archival URL query example for <b>http://archive.org</b></th> + </tr> + <tr> + <td>80</td> + <td>http://access.example.org/</td> + <td>http://access.example.org/*/http://archive.org/</td> + </tr> + <tr> + <td>8080</td> + <td>http://access.example.org:8080/</td> + <td>http://access.example.org:8080/*/http://archive.org/</td> + </tr> + </table> + </p> + <p> + Getting somewhat fancy, you can use virtual hosts, doing away with + non-standard ports, and use hostnames alone to specify AccessPoints. + This means getting your Tomcat to listen on port <b>80</b>, and + deploying the webapp as <b>ROOT.war</b>. 
You'd have to configure your + DNS so both "collection1.example.org" and "collection2.example.org" + point to the host running Wayback: + <table> + <tr> + <th>Access Point bean name</th> + <th>Archival URL prefix</th> + <th>Archival URL query example for <b>http://archive.org</b></th> + </tr> + <tr> + <td>collection1.example.org:80</td> + <td>http://collection1.example.org/</td> + <td>http://collection1.example.org/*/http://archive.org/</td> + </tr> + <tr> + <td>collection2.example.org:80</td> + <td>http://collection2.example.org/</td> + <td>http://collection2.example.org/*/http://archive.org/</td> + </tr> + </table> + </p> + </subsection> + <subsection name="Getting really fancy"> + + <p> + Assuming you've deployed your webapp as <b>ROOT.war</b> and have Tomcat + listening on both port 80 and 8080, with the hostnames + "collection1.example.org" and "collection2.example.org" both + pointing to the host running wayback: + <table> + <tr> + <th>Access Point bean name</th> + <th>Archival URL prefix</th> + <th>Archival URL query example for <b>http://archive.org</b></th> + </tr> + <tr> + <td>collection1.example.org:80</td> + <td>http://collection1.example.org/</td> + <td>http://collection1.example.org/*/http://archive.org/</td> + </tr> + <tr> + <td>collection1.example.org:8080:subset1</td> + <td>http://collection1.example.org:8080/subset1/</td> + <td>http://collection1.example.org:8080/subset1/*/http://archive.org/</td> + </tr> + <tr> + <td>collection1.example.org:8080:subset2</td> + <td>http://collection1.example.org:8080/subset2/</td> + <td>http://collection1.example.org:8080/subset2/*/http://archive.org/</td> + </tr> + <tr> + <td>collection2.example.org:8080</td> + <td>http://collection2.example.org:8080/</td> + <td>http://collection2.example.org:8080/*/http://archive.org/</td> + </tr> + <tr> + <td>collection2.example.org:80:internal</td> + <td>http://collection2.example.org/internal/</td> + <td>http://collection2.example.org/internal/*/http://archive.org/</td> +
</tr> + <tr> + <td>collection2.example.org:80:public</td> + <td>http://collection2.example.org/public/</td> + <td>http://collection2.example.org/public/*/http://archive.org/</td> + </tr> + </table> + </p> + </subsection> +<!-- + <subsection name="ArchivalURL Server-Relative URL rewriting"> + <p> + As hard as we've tried to make Server-side rewrite "do the right + thing" in ArchivalURL Replay mode, sometimes things don't work out + right. For example, if a page, <b>http://example.net/news/a.html</b> + contains some Javascript, that generates the following HTML with a + <b>document.write()</b> call: + <br></br> + <code> + +<img src="/foo.gif"></img> + </code> + <br></br> + And you were running an AccessPoint at <b>http://archive.org/web/</b>, + the then page would be expecting that URL to resolve as + <b>http://example.net/foo.gif</b>, but in fact, the page being + displayed as + </p> + <subsection> +--> + </section> + </body> +</document> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2010-10-22 22:34:24 UTC (rev 3297) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2010-10-22 22:35:14 UTC (rev 3298) @@ -12,7 +12,6 @@ <section name="Requirements"> - <subsection name="Third Party Packages"> <p> Please see the @@ -53,7 +52,7 @@ <p> Once you have downloaded the .tar.gz file from sourceforge, you will need to unpack the file to access the - webapp file, <b>wayback-webapp-1.4.0.war</b>. + webapp file, <b>wayback-webapp-1.6.0.war</b>. </p> <p> Installation and configuration of this software involves the @@ -66,7 +65,7 @@ Waiting for Tomcat to unpack the .war file. </li> <li> - Customizing base wayback.xml file. + Customizing base wayback.xml and possibly other XML configuration files. 
</li> <li> Restarting tomcat. @@ -84,18 +83,19 @@ documents. Query access allows users to locate particular documents within the collection by URL and date. Replay access allows users to view archived pages within their web browsers. Some Replay modes - require altering the original pages so embedded content is also loaded - from the wayback service, and not from the live web. + require altering the original pages and resources, so embedded and + referenced content is also loaded from the Wayback service, and not + from the live web. </p> <p> A WaybackCollection defines a set of archived documents and an index - which allows documents to be located within the collection. A + which allows documents to be quickly located within the collection. A WaybackCollection may be exposed to end users through one or more AccessPoints, which define: <ul> <li>the WaybackCollection itself</li> <li>the URL where users can access the collection</li> - <li>how users can query the collection (the Query UI)</li> + <li>how query results are presented to users (the Query UI)</li> <li>how documents are returned to users so they appear correctly in their web browsers (the Replay UI)</li> <li>the look and feel of the wayback user interface</li> @@ -104,12 +104,12 @@ </ul> </p> <p> - Wayback is configured using Spring IOC, to specify and configure - concrete implementations of several basic modules. For information - about using Spring, please see - <a href="http://www.springframework.org/docs/reference/beans.html"> - this page - </a>. + Wayback is configured using + <a href="http://static.springsource.org/spring/docs/2.5.x/reference/beans.html#beans-basics">Spring IOC</a>, + to specify and configure concrete implementations of several basic + modules. Please see the + <a href="http://static.springsource.org/spring/docs/2.5.x/reference/beans.html#beans-basics">Spring website</a> for more information on + configuring beans using Spring XML. 
</p> <subsection name="AccessPoint configuration options"> <p> @@ -121,8 +121,8 @@ AccessPoint. </li> <li><a href="Query_UI"><b>query</b></a> responsible for generating - user visible content in response to user Queries, HTML, XML, - etc.</li> + user visible content(HTML, XML, etc) in response to user + Queries.</li> <li><a href="Replay_Modes"><b>replay</b></a> responsible for determining the appropriate ReplayRenderer implementation based on the users request and the particular document to be @@ -135,7 +135,9 @@ </ul> </p> <p> - An AccessPoint's configuration may optionally specify the following: + An AccessPoint's configuration may optionally specify the following, + but must specify at least one of replayPrefix, queryPrefix, or + staticPrefix: <ul> <li><a href="Exception_Rendering"><b>exception</b></a> - an implementation responsible for generating error pages to users @@ -158,13 +160,38 @@ </a> - an implementation specifying who is allowed to connect to this AccessPoint </li> - <li><b>urlRoot</b> - a String URL prefix under which all UI - elements should be referenced. + <li> + <b>replayPrefix</b> - a String URL prefix indicating the host, + port, and path to the correct Replay AccessPoint. If unspecified, + defaults to queryPrefix, then staticPrefix. </li> + <li> + <b>queryPrefix</b> - a String URL prefix indicating the host, + port, and path to the correct Query AccessPoint. If unspecified, + defaults to staticPrefix, then replayPrefix. + </li> + <li> + <b>staticPrefix</b> - a String URL prefix indicating the host, + port, and path to static content used within the UI. If + unspecified, defaults to queryPrefix, then replayPrefix. + </li> + <li> + <b>livewebPrefix</b> - a String URL prefix indicating the host, + port, and path to the correct Replay AccessPoint. + </li> <li><b>locale</b> - A specific Locale to use for all requests within this AccessPoint, overriding the users preferred Locale as specified by their web browser. 
</li> + <li> + <b>exactHostMatch</b> - true or false, if true, only returns + results exactly matching a given request hostname (case insensitive). + Default is false. + </li> + <li> + <b>exactSchemeMatch</b> - true or false, if true, only returns + results exactly matching a given request scheme. Default is true. + </li> </ul> </p> <p> @@ -222,7 +249,9 @@ <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml">BDBCollection.xml</a></li> <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml">CDXCollection.xml</a></li> <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml">RemoteCollection.xml</a></li> +<!-- <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml">NutchCollection.xml</a></li> +--> </ul> </p> </subsection> @@ -257,13 +286,14 @@ the Access Point. See below for example CONTEXT mappings. </li> <li> - <b>CONTEXT</b> is the context where the Wayback webapp has been - deployed, plus the name of the Access Point. See below for - example CONTEXT mappings. + <b>CONTEXT</b> is an optional context where the Wayback webapp + has been deployed, plus an optional name of the Access Point + within the webapp. See below for example CONTEXT mappings. </li> <li> <b>TIMESTAMP</b> is 0 to 14 digits of a date, possibly - followed by an asterisk ('*'). The format of a + followed by an asterisk ('*'), or one or more tags providing + further specifics for the request.
The format of a TIMESTAMP is: <div> <code> @@ -304,6 +334,25 @@ Dec 31, 2004 23:01:00 (pm UTC) - 20041231230100 </div> <br></br> + <p> + Following the date portion of a timestamp, the following flags + can be appended: + <ul> + <li> + <b>id_</b> Identity - perform no alterations of the original + resource, return it as it was archived. + </li> + <li> + <b>js_</b> Javascript - return document marked up as javascript. + </li> + <li> + <b>cs_</b> CSS - return document marked up as CSS. + </li> + <li> + <b>im_</b> Image - return document as an image. + </li> + </ul> + </p> </li> <li> <b>URL</b> represents the actual URL that should be @@ -312,17 +361,9 @@ </ul> <br></br> <div> - Here is an example Archival URL, on an assumed host - <b>wayback.somehost.org</b>, with a wayback webapp deployed as - <b>ROOT</b>, via the Access Point named <b>80:archive</b> for the - page <b>http://www.yahoo.com/</b> on Dec 31, 1999 at 12:00:00 UTC. - <br></br> - <div> - <code> - http://wayback.somehost.org/archive/19991231120000/http://www.yahoo.com/ - </code> - </div> - <br></br> + For some simple and more elaborate examples of how AccessPoint bean + names interact with Archival URLs, please refer to + <a href="access_point_naming.html">Access Point Naming</a>. </div> <br></br> <div> @@ -350,107 +391,15 @@ </div> <br></br> <div> - There is a trade-off between these two approaches. The entirely - server-side rewriting requires more server resources, and is less - tested than the JavaScript method. The JavaScript is also imperfect: - sometimes requests "leak" to the live web temporarily, before the - Javascript has executed. With both methods, not all URLs are - rewritten correctly, especially URLs that are created by JavaScript - that was in the original page, and specialized file types containing - links like Flash and PDF documents. 
+ Currently, we are recommending the entirely server-side rewriting + method, and are deprecating the original server-side plus Javascript + method, but this functionality is still available in Wayback. + Neither method is perfect, not all URLs are rewritten correctly, + particularly URLs that are created by JavaScript in the original + pages, and specialized file types containing links like Flash + and PDF documents. </div> <br></br> - <div> - The <b>name</b> of the Access Point bean in the Spring configuration - file determines the CONTEXT and PORT used in Archival URLs within - that Access Point. The Servlet context name where the Wayback - application is deployed also factors into the CONTEXT used within - Archival URLs for each Access Point. - </div> - <br></br> - <div> - The following examples show the Archival URL prefix for the - following two Access Points depending on the Wayback webapp being - deployed in two different contexts, "ROOT" and "wayback". - </div> - <br></br> - <div> - If the following Access Point definitions are present in the - wayback.xml: - <pre> - -<bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> - <property name="collection" ref="localcollection" /> - ... -</bean> - -<bean name="8080:wayback2" class="org.archive.wayback.webapp.AccessPoint"> - <property name="collection" ref="localcollection" /> - ... 
-</bean> - - </pre> - then the following table shows the Archival URL prefixes to access - each collection on the host "wayback.somehost.org" assuming a - Tomcat Connector listening on port 8080: - </div> - <table> - <tr> - <th> - webapp deployed at - </th> - <th> - Access Point bean name - </th> - <th> - Archival URL prefix - </th> - </tr> - <tr> - <td> - ROOT - </td> - <td> - 8080:wayback - </td> - <td> - http://wayback.somehost.org:8080/wayback/ - </td> - </tr> - <tr> - <td> - ROOT - </td> - <td> - 8080:wayback2 - </td> - <td> - http://wayback.somehost.org:8080/wayback2/ - </td> - </tr> - <tr> - <td> - wb-webapp - </td> - <td> - 8080:wayback - </td> - <td> - http://wayback.somehost.org:8080/wb-webapp/wayback/ - </td> - </tr> - <tr> - <td> - wb-webapp - </td> - <td> - 8080:wayback2 - </td> - <td> - http://wayback.somehost.org:8080/wb-webapp/wayback2/ - </td> - </tr> - </table> </p> <p> The properties <b>parser</b> and <b>uriConverter</b> @@ -468,7 +417,7 @@ <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://wayback.somehost.org:8080/wb-webapp/wayback/" /> + <property name="replayURIPrefix" value="http://wayback.example.org:8080/collection/" /> </bean> </property> @@ -519,7 +468,7 @@ </td> <td> Points to the Archival URL prefix of the Access Point as - illustrated in the preceding table. + illustrated in <a href="access_point_naming.html">Access Point Naming</a> document. </td> </tr> </table> @@ -533,11 +482,12 @@ <subsection name="Proxy Replay Mode"> <p> Wayback can be configured to act as an HTTP proxy server. To utilize - this mode, the wayback webapp must be deployed as the ROOT context, - and client browser must be configured to proxy all HTTP requests - through the Wayback Machine application. Instead of retrieving - documents from the live web, the Wayback Machine will retrieve - documents from the configured WaybackCollection. 
+ this mode, the wayback webapp <b>must</b> be deployed as the ROOT + context, no other AccessPoints can use the port dedicated to the + Proxy AccessPoint, and client browsers must be configured to proxy + all HTTP requests through the Wayback Machine application. Instead of + retrieving documents from the live web, the Wayback Machine will + retrieve documents from the configured WaybackCollection. </p> <p> Proxy Replay mode does not suffer from the shortcomings of @@ -575,7 +525,7 @@ <pre> <bean name="8090" parent="8080:wayback"> - <property name="urlRoot" value="http://wayback.somehost.org/" /> + <property name="queryPrefix" value="http://wayback.somehost.org/" /> <property name="replay"> ref="proxyreplay" /> <property name="uriconverter"> <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> @@ -769,6 +719,15 @@ place the banner, attempting to only place the banner in the largest frame within a frameset. </li> + <li> + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Toolbar.jsp">/WEB-INF/replay/Toolbar.jsp</a> + Inserts a fancier banner in the top of replayed documents which + includes a graphic representaion of the number of captures over + time and allows users to navigate directly between other captures + of the current page they are viewing. This version uses Javascript + to place the banner, attempting to only place the banner in the + largest frame within a frameset. + </li> </ul> </p> </subsection> @@ -1092,7 +1051,7 @@ </p> </subsection> - <subsection name="arc-indexer|warc-indexer"> + <subsection name="cdx-indexer"> <p> These tools create a CDX format index for the ARC/WARC file at PATH, either on STDOUT, or at the path specified by CDX_PATH. The @@ -1100,8 +1059,7 @@ files to generate CDX format ResourceIndex. 
</p> <pre> - bin/arc-indexer [-identity] PATH [CDX_PATH] - bin/warc-indexer [-identity] PATH [CDX_PATH] + bin/cdx-indexer [-identity] PATH [CDX_PATH] </pre> <p> Note that when manually constructing CDX files using these tools, you @@ -1190,9 +1148,9 @@ input URL. </p> <p> - This tool is required when using the <b>arc-indexer</b> or - <b>warc-indexer</b> tools with the <b>-identity</b> option. Typical - usage involves generating an <i>identity</i> CDX index, then + This tool is required when using the <b>cdx-indexer</b> tool with the + <b>-identity</b> option. Typical usage involves generating an + <i>identity</i> CDX index, then passing the lines in that index through this tool to canonicalize the record URL key for queries. If the <i>identity</i> CDX files are kept, then canonicalization schemes can be swapped without Added: trunk/archive-access/projects/wayback/dist/src/site/xdoc/hadoop.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/hadoop.xml (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/hadoop.xml 2010-10-22 22:35:14 UTC (rev 3298) @@ -0,0 +1,209 @@ +<?xml version="1.0" encoding="utf-8"?> + +<document> + <properties> + <title>Wayback Hadoop CDX generation</title> + <author email="brad at archive dot org">Brad Tofel</author> + <revision>$$Id$$</revision> + </properties> + + <body> + <section name="Overview"> + <p> + Wayback is distributed with an .jar file that + simplifies creation of large-scale CDX files using hadoop. This code is + experimental, and will primarily be useful only if your CDX files are + very large - more than a few hundred GB (or more, depending on your + hardware). If building or updating your CDX files is the + largest problem with your installation, this may help. 
At IA, we've + used this framework to build and deploy CDX files of more than 700GB, + containing billions of records, using a 24 node cluster in about 8 + hours from start to finish. Just writing a 700GB file to disk at + 50MB/sec takes around 4 hours, so the final deployment step takes + around half the time. + </p> + </section> + <section name="Requirements"> + <p> + <ul> + <li>Existing hadoop cluster running Hadoop 0.20.2.</li> + <li>Per-resource CDX files existing in a viable Hadoop-FS (HDFS, S3, + etc).</li> + <li>Perl, to create a split file based on a sample CDX.</li> + </ul> + </p> + </section> + <section name="Implementing"> + <p> + Using hadoop to generate your CDX files requires the following + high-level process: + <ul> + <li> + Integrating per-WARC CDX creation into your ingestion process. + </li> + <li> + Building a split file, to inform hadoop on how to efficiently + partition your data while sorting. + </li> + <li> + Building a manifest listing the specific per-WARC CDX files to sort. + </li> + <li> + Running the hadoop job, which produces a series of alphabetically + contiguous, partitioned CDX in your HDFS. + </li> + <li> + Deploying the partitioned CDX files to your node running Wayback. + </li> + </ul> + </p> + <subsection name="Process integration"> + <p> + It is assumed you will integrate the Wayback indexing code, + <b>cdx-indexer</b> into your standard file ingestion workflow. That + is, whatever system is used to move data from your crawlers into your + permanent repository should be modified to also build a CDX file for + each W/ARC file, as it is ingested, and to store that CDX file in + your HDFS. As an optimization, you can compress the per-WARC CDX files + before storing them in HDFS. If your per-W/ARC CDX files are named + with a trailing, <b>.gz</b> suffix, the Wayback hadoop code will + infer that these input files are compressed. 
+ </p> + </subsection> + <subsection name="Building the split file"> + <p> + CDX files are large sorted text files. Hadoop can be used to perform + large distributed sort operations, but to achieve an efficient total + ordering across your resulting data, you need to give hadoop some + explicit instructions, in the form of the split file, indicating + how to distribute the data in your hadoop job. + </p> + <p> + The split file is a text file, with each line indicating a partition + point URL within the total possible URL space. The number of lines + determines the number of chunks that will be built within hadoop, and + it should be based on the number of concurrent Reduce tasks you can + run concurrently on your cluster. + </p> + <p> + If R is the number of reduce tasks you can run <i>at the same time</i> + on your hadoop cluster, you should use (R-5) as the second argument + to <b>cdx-sample</b>, which is distributed in the wayback .tar.gz + distribution. 5 leaves a few spare reduce workers in case of node + failure, and for speculative execution in case some of your nodes + are running slowly. + </p> + <p> + The more accurately the partition points evenly divide your particular + collections URLs, the more optimally your hadoop distributed + processing will execute. It is assumed that if you are using this + hadoop to generate your CDX, you will already have built a sizable + CDX file for your collection. The <b>cdx-sample</b> tool will sample + an existing sorted CDX file for your collection, and produce a list + of URL partitions that can be used as the split file for your hadoop + processing. You should use the most recent sizable CDX built using + other methods with the <b>cdx-sample</b> tool. If you don't have a + previously built sorted CDX file for your collection, create + a sample sorted CDX file from 20 or 30 random per-WARC CDX files, as + described elsewhere, and use that with the <b>cdx-sample</b> tool. 
</p> + <p> + You might use something similar to the following command to build + your split file, assuming a previously built, sorted CDX file for + your collection called <b>existing.cdx</b>, and a total reducer + capacity of <b>20</b>: + <div> + <pre> +cdx-sample existing.cdx 15 > split.txt +hadoop fs -put split.txt /user/brad/input-split.txt + </pre> + </div> + </p> + </subsection> + <subsection name="Building the manifest"> + <p> + The second input file you will need is your list of per-WARC + (or per-ARC) CDX files to process. + </p> + <p> + This file can be built using the <b>hadoop fs -ls</b> command, and + should contain one line for each CDX file you want to sort into your + final CDX file. + </p> + <p> + This is an example line suitable for a manifest file: + <div> + <pre> +hdfs:///cdx/COLL-A/COLLECTION-A-20080726045700-00019-ia400028.us.archive.org.warc.os.cdx.gz + </pre> + </div> + </p> + <p> + You might use something similar to the following command to build + your manifest: + <div> + <pre> +hadoop fs -ls /cdx/collectionA | perl -ane 'print "hdfs://$F[-1]\n";' | grep cdx.gz > manifest.txt +hadoop fs -put manifest.txt /user/brad/input-manifest.txt + </pre> + </div> + </p> + </subsection> + <subsection name="Running the job"> + <p> + This is actually the simplest part! You just need to run: + <div> + <pre> +hadoop jar PATH_TO_WAYBACK_HADOOP_JAR cdxsort -m MAPS [--compress-output] SPLIT INPUT OUTPUT_DIR + </pre> + </div> + The --compress-output option will cause the resulting CDX files in HDFS to be compressed. + </p> + <p> + Here is an example usage: + <div> + <pre> +hadoop jar /home/brad/wayback-hadoop-jar-with-dependencies.jar cdxsort -m 470 --compress-output /user/brad/input-split.txt /user/brad/input-manifest.txt /user/brad/cdx-output + </pre> + </div> + indicating 470 map tasks, and that the resulting files should be + compressed. The number of map tasks to use should be roughly 1/3rd the + number of lines in your INPUT file.
</p> + </subsection> + <subsection name="Deploying the production Wayback CDX:"> + <p> + The previous hadoop command will create alphabetically contiguous, + sorted CDX files in your HDFS output directory (OUTPUT_DIR). To merge + them into a single CDX file which can be efficiently searched using + Wayback, you need to dump them into a single, concatenated file. + For now, you have to use some shell code: + <div> + <pre> +for i in `hadoop fs -ls OUTPUT_DIR | perl -ane 'print "$F[-1]\n";' | sort`; do + hadoop fs -cat $i +done > LOCAL_FILE + </pre> + </div> + where OUTPUT_DIR is the same as the one specified in your hadoop job, + and where LOCAL_FILE is where you want your target file to exist, on + the local computer. + </p> + <p> + If you specified the --compress-output option with your + "hadoop jar ..." command, you will need to add 'zcat' as follows: + <div> + <pre> +for i in `hadoop fs -ls OUTPUT_DIR | perl -ane 'print "$F[-1]\n";' | sort`; do + hadoop fs -cat $i | zcat +done > LOCAL_FILE + </pre> + </div> + </p> + <p> + At this point, LOCAL_FILE is ready for use as a Wayback CDX. + </p> + </subsection> + </section> + </body> +</document> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml 2010-10-22 22:34:24 UTC (rev 3297) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml 2010-10-22 22:35:14 UTC (rev 3298) @@ -74,6 +74,16 @@ </p> </section> <section name="News"> + <subsection name="New Release - 1.6.0, 10/21/2010"> + <p> + The long awaited 1.6.0 release is now available, with improved + server-side rewriting of HTML, CSS, Javascript, and SWF content. + This version includes other new features and bug fixes, which are + detailed on the <a href="release_notes.html">release notes</a> page.
+ Upgrading to this version will require changes to Wayback Spring XML + configuration. + </p> + </subsection> <subsection name="Maintenance Release - 1.4.2, 7/17/2009"> <p> Release 1.4.2 fixes several problems discovered in the 1.4.1 Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/navigation.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/navigation.xml 2010-10-22 22:34:24 UTC (rev 3297) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/navigation.xml 2010-10-22 22:35:14 UTC (rev 3298) @@ -13,7 +13,7 @@ <item name="License" href="/license.html"/> <item name="Requirements" href="requirements.html"/> <item name="Downloads" href="downloads.html"/> - <item name="User Manual" href="user_manual.html"/> + <item name="Administrator Manual" href="administrator_manual.html"/> <item name="Release Notes" href="release_notes.html"/> <item name="Test" href="test.html"/> <item name="FAQ" href="/faq.html"/> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-22 22:34:31
|
Revision: 3297 http://archive-access.svn.sourceforge.net/archive-access/?rev=3297&view=rev Author: bradtofel Date: 2010-10-22 22:34:24 +0000 (Fri, 22 Oct 2010) Log Message: ----------- REMOVED, unused Removed Paths: ------------- trunk/archive-access/projects/wayback/dist/src/scripts/location-db Deleted: trunk/archive-access/projects/wayback/dist/src/scripts/location-db =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/location-db 2010-10-21 23:02:07 UTC (rev 3296) +++ trunk/archive-access/projects/wayback/dist/src/scripts/location-db 2010-10-22 22:34:24 UTC (rev 3297) @@ -1,82 +0,0 @@ -#!/usr/bin/env sh -## -## This script allows querying and updating of a remote LocationDB from the -## command line, including syncronizing the LocationDB with an entire directory -## of ARCs files -## -## Optional environment variables -## -## JAVA_HOME Point at a JDK install to use. -## -## WAYBACK_HOME Pointer to your wayback install. If not present, we -## make an educated guess based of position relative to this -## script. -## -## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. -## - -# Resolve links - $0 may be a softlink -PRG="$0" -while [ -h "$PRG" ]; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '.*/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`/"$link" - fi -done -PRGDIR=`dirname "$PRG"` - -# Set WAYBACK_HOME. -if [ -z "$WAYBACK_HOME" ] -then - WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` -fi - -# Find JAVA_HOME. -if [ -z "$JAVA_HOME" ] -then - JAVA=`which java` - if [ -z "$JAVA" ] - then - echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." - exit 1 - fi - JAVA_BINDIR=`dirname $JAVA` - JAVA_HOME=$JAVA_BINDIR/.. -fi - -if [ -z "$JAVACMD" ] -then - # It may be defined in env - including flags!! - JAVACMD=$JAVA_HOME/bin/java -fi - -# Ignore previous classpath. 
Build one that contains heritrix jar and content -# of the lib directory into the variable CP. -for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` -do - CP=${CP}:${jar} -done - -# cygwin path translation -if expr `uname` : 'CYGWIN*' > /dev/null; then - CP=`cygpath -p -w "$CP"` - WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` -fi - -# Make sure of java opts. -if [ -z "$JAVA_OPTS" ] -then - JAVA_OPTS=" -Xmx256m" -fi - -# Main ArcIndexer class. -if [ -z "$CLASS_MAIN" ] -then - CLASS_MAIN='org.archive.wayback.resourcestore.resourcefile.BDBResourceFileLocationDB' -fi - -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" - This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-21 23:02:13
|
Revision: 3296 http://archive-access.svn.sourceforge.net/archive-access/?rev=3296&view=rev Author: bradtofel Date: 2010-10-21 23:02:07 +0000 (Thu, 21 Oct 2010) Log Message: ----------- RELEASE DOCS Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/requirements.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_index.xml Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml 2010-10-21 23:01:21 UTC (rev 3295) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml 2010-10-21 23:02:07 UTC (rev 3296) @@ -14,6 +14,162 @@ to release 1.2.0. </p> </section> + <section name="Release 1.6.0"> + <subsection name="Major Features"> + <ul> + <li> + <a href="http://www.mementoweb.org/guide/quick-intro/">Memento</a> integration. + </li> + <li> + Improved live-web fetching, enabling simpler external caching of + robots.txt documents, or other arbitrary content used to improve + function of a replay session. + </li> + <li> + Customizable logging, via a logging.properties configuration file. + </li> + <li> + Vastly improved Server-side HTML rewriting capabilities, including + customizable rewriting of specific tags and attributes, rewriting + of (some easily recognizable) URLs within JavaScript and CSS. + </li> + <li> + Snazzy embedded toolbar with "sparkline" indicating the distribution + of captures for a given HTML page, control elements enabling + navigation between various versions of the current page, and a + search box to navigate to other URLs directly from a replay session. + </li> + <li> + Improved hadoop CDX generation capabilities for large scale indexes. 
+ </li> + <li> + SWF (Flash) rewriting, to contextualize URLs embedded within flash + content. + </li> + <li> + ArchivalUrl mode now accepts identity ("id_") flag to indicate + transparent replaying of original content. + </li> + <li> + NotInArchive can now optionally trigger an attempt to fill in + content from the live web, on the fly. + </li> + <li> + Updated license to Apache 2. + </li> + </ul> + </subsection> + <subsection name="Major Bug Fixes"> + <ul> + <li> + More robust handling of chunk encoded resources. + </li> + <li> + Fixed problem with improperly resolving path-relative URLs found + in HTML, CSS, Javascript, SWF content. + </li> + <li> + Fixed problem with improperly escaping URLs within HTML when + rewriting them. + </li> + <li> + Fixed problem where a misconfigured or missing administrative + exclusion file was allowing results to be returned, instead of + returning and appropriate error. + </li> + <li> + No longer extracts resources from the ResourceStore before + redirecting to the closest version, which was a major inefficiency. + </li> + </ul> + </subsection> + <subsection name="Minor Features"> + <ul> + <li> + Now provide closeMatches list of search results which were not + applicable given the users request, but that may be useful for + followup requests. + </li> + <li> + Archival Url mode now allows rotating through several character + encoding detection schemes. + </li> + <li> + Proxy Replay mode now accepts ArchivalURL format requests, allowing + dates to be explicitly requested via proxy mode. + </li> + <li> + AccessPoints can be now configured to optional require strict host + matching for queries and replay requests. + </li> + <li> + Now filters URLs which contain user-info (USER:PAS...@ex...) + from the ResourceIndex + </li> + <li> + ArchivalURL mode requests without a datespec are now interpreted as + a request for the most recent capture of the URL. 
+ </li> + <li> + Improvements in mapping incoming requests to AccessPoints, to allow + virtual hosts to target specific AccessPoints. + </li> + <li> + ResourceNotAvailable exceptions now include other close search + results, allowing the UI to offer other versions which may be + available. + </li> + <li> + ArchivalURL mode now forwards request flags (cs_, js_, im_, etc) + when redirecting to a closer date. + </li> + <li> + ResourceStore implementation now allows retrying when confronted + with possibly-transient HTTP 502 errors. + </li> + </ul> + </subsection> + <subsection name="Minor Bug Fixes"> + <ul> + <li> + cdx-indexer (replacement for arc-indexer and warc-indexer) tool now + returns accurate error code on failure. + </li> + <li> + No longer sets JVM-wide default timezone to GMT - now it is set + appropriately on Calendars when needed. + </li> + <li> + Hostname comparison is now case-insensitive. + </li> + <li> + Server-relative archival url redirects now include query arguments + when redirecting. + </li> + <li> + Server-relative archival url redirects now include a Vary HTTP + header, to fix problems when a cache is used between clients and + the Wayback service. + </li> + <li> + Fixed problem with robots.txt caching within a single request, + which caused serious inefficiency. + </li> + <li> + Fixed problem with resources redirecting to alternate HTTP/HTTPS + version of themselves. + </li> + <li> + Fixed problem with accurately converting 14-digit Timestamps into + Date objects for later comparison. + </li> + <li> + Automatically remaps the oft-misused charset "iso-8859-1" to the + superset "cp1252". 
+ </li> + </ul> + </subsection> + </section> <section name="Release 1.4.2"> <subsection name="Features"> <ul> Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/requirements.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/requirements.xml 2010-10-21 23:01:21 UTC (rev 3295) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/requirements.xml 2010-10-21 23:02:07 UTC (rev 3296) @@ -10,16 +10,16 @@ <section name="Runtime Requirements"> <subsection name="JAVA"> <p> - Tested working with SUN v1.5.0_01. + Tested working with SUN v1.6. It is highly recommended you + use the latest version available for your operating system. </p> </subsection> <subsection name="Tomcat"> <p> - Tested working with Apache Tomcat 5.5, which can be - <a href="http://tomcat.apache.org/download-55.cgi"> - downloaded here - </a> - . + Tested working with Apache Tomcat + <a href="http://tomcat.apache.org/download-55.cgi">5.5</a>, + and + <a href="http://tomcat.apache.org/download-60.cgi">6.0</a>. </p> </subsection> Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_index.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_index.xml 2010-10-21 23:01:21 UTC (rev 3295) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_index.xml 2010-10-21 23:02:07 UTC (rev 3296) @@ -102,6 +102,10 @@ <li><b>canonicalizer</b> - an implementation of UrlCanonicalizer. 
See the section labeled URL Canonicalization below for more information.</li> + <li><b>filter</b> - an implementation of + ObjectFilter<CaptureSearchResult> which will remove + records at query time from the index.</li> + </ul> </p> <p> @@ -153,6 +157,7 @@ </ul> </p> </subsection> + <!-- <subsection name="NutchResourceIndex configuration options"> <p> This implementation, similar to the RemoteResourceIndex, accesses @@ -189,6 +194,7 @@ </ul> </p> </subsection> + --> </section> <section name="URL Canonicalization"> <subsection name="Introduction and Concepts"> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-21 23:01:27
|
Revision: 3295 http://archive-access.svn.sourceforge.net/archive-access/?rev=3295&view=rev Author: bradtofel Date: 2010-10-21 23:01:21 +0000 (Thu, 21 Oct 2010) Log Message: ----------- tweaks... Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/site/fml/faq.fml Modified: trunk/archive-access/projects/wayback/dist/src/site/fml/faq.fml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/fml/faq.fml 2010-10-21 22:41:28 UTC (rev 3294) +++ trunk/archive-access/projects/wayback/dist/src/site/fml/faq.fml 2010-10-21 23:01:21 UTC (rev 3295) @@ -18,9 +18,11 @@ <p> Primarily it is a few easily replaceable interfaces, and some core classes that utilize those interfaces to provide the Wayback - service. Presently only trivial implementations of those interfaces - have been developed, but we hope that these interfaces will allow a - high degree of flexibility and experimentation. + service. Presently, a variety of implementations of these interfaces + are available allowing a high degree of customization in the number + and distribution of native resources, the location, size and + ability to easily update the index, and presentation of search + results and archived resources to end users. </p> </answer> </faq> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-21 22:41:35
|
Revision: 3294 http://archive-access.svn.sourceforge.net/archive-access/?rev=3294&view=rev Author: bradtofel Date: 2010-10-21 22:41:28 +0000 (Thu, 21 Oct 2010) Log Message: ----------- replaced with cdx-indexer Removed Paths: ------------- trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer Deleted: trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer 2010-10-21 22:40:29 UTC (rev 3293) +++ trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer 2010-10-21 22:41:28 UTC (rev 3294) @@ -1,82 +0,0 @@ -#!/usr/bin/env sh -## -## This script creates a CDX file for all ARC files in a directory -## PUTs those CDX files into a remote pipeline, and informs a remote -## LocationDB of the locations of all the ARC files. -## -## Optional environment variables -## -## JAVA_HOME Point at a JDK install to use. -## -## WAYBACK_HOME Pointer to your wayback install. If not present, we -## make an educated guess based of position relative to this -## script. -## -## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. -## - -# Resolve links - $0 may be a softlink -PRG="$0" -while [ -h "$PRG" ]; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '.*/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`/"$link" - fi -done -PRGDIR=`dirname "$PRG"` - -# Set WAYBACK_HOME. -if [ -z "$WAYBACK_HOME" ] -then - WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` -fi - -# Find JAVA_HOME. -if [ -z "$JAVA_HOME" ] -then - JAVA=`which java` - if [ -z "$JAVA" ] - then - echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." - exit 1 - fi - JAVA_BINDIR=`dirname $JAVA` - JAVA_HOME=$JAVA_BINDIR/.. -fi - -if [ -z "$JAVACMD" ] -then - # It may be defined in env - including flags!! 
- JAVACMD=$JAVA_HOME/bin/java -fi - -# Ignore previous classpath. Build one that contains heritrix jar and content -# of the lib directory into the variable CP. -for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` -do - CP=${CP}:${jar} -done - -# cygwin path translation -if expr `uname` : 'CYGWIN*' > /dev/null; then - CP=`cygpath -p -w "$CP"` - WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` -fi - -# Make sure of java opts. -if [ -z "$JAVA_OPTS" ] -then - JAVA_OPTS=" -Xmx256m" -fi - -# Main ArcIndexer class. -if [ -z "$CLASS_MAIN" ] -then - CLASS_MAIN='org.archive.wayback.resourcestore.indexer.IndexWorker' -fi - -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" - Deleted: trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer 2010-10-21 22:40:29 UTC (rev 3293) +++ trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer 2010-10-21 22:41:28 UTC (rev 3294) @@ -1,82 +0,0 @@ -#!/usr/bin/env sh -## -## This script creates a CDX file for all ARC files in a directory -## PUTs those CDX files into a remote pipeline, and informs a remote -## LocationDB of the locations of all the ARC files. -## -## Optional environment variables -## -## JAVA_HOME Point at a JDK install to use. -## -## WAYBACK_HOME Pointer to your wayback install. If not present, we -## make an educated guess based of position relative to this -## script. -## -## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. -## - -# Resolve links - $0 may be a softlink -PRG="$0" -while [ -h "$PRG" ]; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '.*/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`/"$link" - fi -done -PRGDIR=`dirname "$PRG"` - -# Set WAYBACK_HOME. -if [ -z "$WAYBACK_HOME" ] -then - WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` -fi - -# Find JAVA_HOME. 
-if [ -z "$JAVA_HOME" ] -then - JAVA=`which java` - if [ -z "$JAVA" ] - then - echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." - exit 1 - fi - JAVA_BINDIR=`dirname $JAVA` - JAVA_HOME=$JAVA_BINDIR/.. -fi - -if [ -z "$JAVACMD" ] -then - # It may be defined in env - including flags!! - JAVACMD=$JAVA_HOME/bin/java -fi - -# Ignore previous classpath. Build one that contains heritrix jar and content -# of the lib directory into the variable CP. -for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` -do - CP=${CP}:${jar} -done - -# cygwin path translation -if expr `uname` : 'CYGWIN*' > /dev/null; then - CP=`cygpath -p -w "$CP"` - WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` -fi - -# Make sure of java opts. -if [ -z "$JAVA_OPTS" ] -then - JAVA_OPTS=" -Xmx256m" -fi - -# Main ArcIndexer class. -if [ -z "$CLASS_MAIN" ] -then - CLASS_MAIN='org.archive.wayback.resourcestore.indexer.IndexWorker' -fi - -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" - This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-21 22:40:35
|
Revision: 3293 http://archive-access.svn.sourceforge.net/archive-access/?rev=3293&view=rev Author: bradtofel Date: 2010-10-21 22:40:29 +0000 (Thu, 21 Oct 2010) Log Message: ----------- INITIAL REV: new tool cdx-indexer, a replacement for arc-indexer and warc-indexer, as well as two new undocumented tools for dealing with specially compressed CDX indexes. We haven't figured out how to release the additional zlib C code to create these CDX's so the tools and code are dead weight for outside institutions at the moment. Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/scripts/cdx-sample trunk/archive-access/projects/wayback/dist/src/scripts/zipline-manifest trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search Added: trunk/archive-access/projects/wayback/dist/src/scripts/cdx-sample =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/cdx-sample (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/cdx-sample 2010-10-21 22:40:29 UTC (rev 3293) @@ -0,0 +1,44 @@ +#!/usr/bin/env perl +use strict; +use warnings; + +my $v = 0; +sub USAGE { + my($msg,$code) = @_; + $msg = "" unless (defined($msg) && length($msg)); + print STDERR <<EOUSAGE; +$msg USAGE: $0 PATH NUM +Create a split file for use with Wayback hadoop indexing code on STDOUT. +Finds approximate offsets at host boundaries for file at PATH, producing +a split file with NUM parts, which indicates the number of reduce tasks. 
+EOUSAGE + exit($code); +} +my $path = shift || &USAGE("Need path to CDX argument 1\n\n",2); +if(($path eq "-h") or ($path eq "-help") or ($path eq "--help")) { + &USAGE(0); +} +my $num = shift || &USAGE("Need NUM chunk count argument 2\n\n",2); + +my $fh; +open($fh,$path) or die "FAILED open($path) ($!)"; +my $size = (-s $fh); +my $per = $size / $num; +# print first for blank: +print "\n"; +foreach my $i (1..$num-1) { + my $offset = $per * $i; + seek($fh,$offset,0) or die "failed seek($path,$offset,0) ($!)"; + # consume first line to align on next complete line: + my $line = <$fh>; + while(1) { + my $line = <$fh>; + die "bad line($line) in ($path)" unless length($line); + if($line =~ /^([^:\/]+)[:\/]/) { + print "$1\n"; + last; + } + print STDERR "Skipping wierd line($line)\n"; + } +} +close($fh) or die "FAILED close($path) ($!)"; Added: trunk/archive-access/projects/wayback/dist/src/scripts/zipline-manifest =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/zipline-manifest (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/zipline-manifest 2010-10-21 22:40:29 UTC (rev 3293) @@ -0,0 +1,82 @@ +#!/usr/bin/env sh +## +## This script allows querying and updating of a remote LocationDB from the +## command line, including syncronizing the LocationDB with an entire directory +## of ARCs files +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## WAYBACK_HOME Pointer to your wayback install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. 
+## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set WAYBACK_HOME. +if [ -z "$WAYBACK_HOME" ] +then + WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + JAVACMD=$JAVA_HOME/bin/java +fi + +# Ignore previous classpath. Build one that contains heritrix jar and content +# of the lib directory into the variable CP. +for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +# Main class. +if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.wayback.resourceindex.ziplines.ZiplinesChunkIterator' +fi + +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" + Added: trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search 2010-10-21 22:40:29 UTC (rev 3293) @@ -0,0 +1,82 @@ +#!/usr/bin/env sh +## +## This script creates a CDX file for all ARC files in a directory +## PUTs those CDX files into a remote pipeline, and informs a remote +## LocationDB of the locations of all the ARC files. 
+## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## WAYBACK_HOME Pointer to your wayback install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. +## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set WAYBACK_HOME. +if [ -z "$WAYBACK_HOME" ] +then + WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + JAVACMD=$JAVA_HOME/bin/java +fi + +# Ignore previous classpath. Build one that contains heritrix jar and content +# of the lib directory into the variable CP. +for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +# Main class. 
+if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.wayback.resourceindex.ziplines.ZiplinesSearchResultSource' +fi + +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" + Property changes on: trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search ___________________________________________________________________ Added: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-21 17:45:17
|
Revision: 3292 http://archive-access.svn.sourceforge.net/archive-access/?rev=3292&view=rev Author: bradtofel Date: 2010-10-21 17:45:11 +0000 (Thu, 21 Oct 2010) Log Message: ----------- CONFIG: added example configuration for SWF rewriting, which is not enabled by default.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2010-10-21 17:42:32 UTC (rev 3291) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2010-10-21 17:45:11 UTC (rev 3292) @@ -151,6 +151,22 @@ <property name="renderer" ref="archivalsaxreplayrenderer"/> </bean> + <!-- SWF Rewriting, highly experimental --> + <!-- + <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> + <property name="mimeContains"> + <list> + <value>application/x-shockwave-flash</value> + <value>application/xhtml</value> + </list> + </property> + <property name="renderer"> + <bean class="org.archive.wayback.replay.swf.SWFReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + </bean> + </property> + </bean> + --> <!-- CSS REPLAY --> <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-21 17:42:38
|
Revision: 3291 http://archive-access.svn.sourceforge.net/archive-access/?rev=3291&view=rev Author: bradtofel Date: 2010-10-21 17:42:32 +0000 (Thu, 21 Oct 2010) Log Message: ----------- WHITESPACE/JAVADOC Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java 2010-10-21 17:34:40 UTC (rev 3290) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java 2010-10-21 17:42:32 UTC (rev 3291) @@ -45,10 +45,23 @@ /** Allow arbitrary trailing values in decoded tags - attempt anything. */ public static final int DECODE_RULE_LAX = 2; + /** + * @param decodeRule the robustness level to use for re-aligning the + * decoder. MovieDecoder just chokes if a given tag has trailing data - does + * not need as many bytes as it declares it has. This decorator will attempt + * to realign by either: + * 1) LAX - chewing up *any* bytes to realign + * 2) NULLS - chewing up only trailing NULL bytes trailing the tag + * 3) STRICT - throw exception if an realignment is required. + */ public void setDecodeRule(int decodeRule) { this.decodeRule = decodeRule; } + /** + * @param delegate the underlying/wrapped MovieDecoder which does the heavy + * lifting to parse out tags. 
+ */ public void setDelegate(SWFFactory<MovieTag> delegate) { this.delegate = delegate; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java 2010-10-21 17:34:40 UTC (rev 3290) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java 2010-10-21 17:42:32 UTC (rev 3291) @@ -21,7 +21,6 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.OutputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; @@ -59,11 +58,17 @@ import com.flagstone.transform.coder.DecoderRegistry; /** + * ReplayRenderer which passes embedded URLs inside flash (SWF) format content + * through a ResultURIConverter, allowing them to be rewritten. 
+ * * @author brad * */ public class SWFReplayRenderer implements ReplayRenderer { private HttpHeaderProcessor httpHeaderProcessor; + /** + * @param httpHeaderProcessor to use for rewriting original HTTP headers + */ public SWFReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { this.httpHeaderProcessor = httpHeaderProcessor; } @@ -128,7 +133,7 @@ baos.writeTo(httpResponse.getOutputStream()); } - public Movie getRobustMovie(int decodeRule) { + private Movie getRobustMovie(int decodeRule) { Movie movie = new Movie(); DecoderRegistry registry = DecoderRegistry.getDefault(); @@ -141,7 +146,7 @@ return movie; } - public MovieTag rewriteTag(SWFUrlRewriter rw, MovieTag tag) { + private MovieTag rewriteTag(SWFUrlRewriter rw, MovieTag tag) { if(tag instanceof DoAction) { DoAction doAction = (DoAction) tag; doAction.setActions(rewriteActions(rw, doAction.getActions())); @@ -153,7 +158,8 @@ return tag; } - public List<EventHandler> rewriteEventHandlers(SWFUrlRewriter rw, List<EventHandler> handlers) { + private List<EventHandler> rewriteEventHandlers(SWFUrlRewriter rw, + List<EventHandler> handlers) { ArrayList<EventHandler> newActions = new ArrayList<EventHandler>(); for(EventHandler handler : handlers) { handler.setActions(rewriteActions(rw, handler.getActions())); @@ -162,7 +168,7 @@ return newActions; } - public List<Action> rewriteActions(SWFUrlRewriter rw, List<Action> actions) { + private List<Action> rewriteActions(SWFUrlRewriter rw, List<Action> actions) { ArrayList<Action> newActions = new ArrayList<Action>(); for(Action action : actions) { if(action instanceof Table) { @@ -175,12 +181,14 @@ Push push = (Push) action; - newActions.add(new Push(rewriteObjectValues(rw, push.getValues()))); + newActions.add(new Push(rewriteObjectValues(rw, + push.getValues()))); } else if(action instanceof GetUrl) { GetUrl getUrl = (GetUrl) action; - newActions.add(new GetUrl(rewriteString(rw, getUrl.getUrl()),getUrl.getTarget())); + newActions.add(new GetUrl(rewriteString(rw, 
getUrl.getUrl()), + getUrl.getTarget())); } else { newActions.add(action); @@ -189,7 +197,9 @@ return newActions; } - public List<Object> rewriteObjectValues(SWFUrlRewriter rw, List<Object> values) { + private List<Object> rewriteObjectValues(SWFUrlRewriter rw, + List<Object> values) { + ArrayList<Object> nvals = new ArrayList<Object>(); for(int i = 0; i < values.size(); i++) { Object orig = values.get(i); @@ -201,20 +211,25 @@ } return nvals; } - public List<String> rewriteStringValues(SWFUrlRewriter rw, List<String> values) { + + private List<String> rewriteStringValues(SWFUrlRewriter rw, + List<String> values) { + ArrayList<String> nvals = new ArrayList<String>(); for(int i = 0; i < values.size(); i++) { nvals.add(rewriteString(rw, values.get(i))); } return nvals; } - public String rewriteString(SWFUrlRewriter rw, String original) { + + private String rewriteString(SWFUrlRewriter rw, String original) { if(original.startsWith("http://")) { // System.err.format("Rewrite(%s)\n",original); return rw.rewrite(original); } return original; } + private class SWFUrlRewriter { UURI baseUrl = null; ResultURIConverter converter; @@ -224,7 +239,8 @@ this.datespec = datespec; this.converter = converter; try { - this.baseUrl = UURIFactory.getInstance(baseUrl.toExternalForm()); + this.baseUrl = + UURIFactory.getInstance(baseUrl.toExternalForm()); } catch (URIException e) { e.printStackTrace(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-21 17:34:47
|
Revision: 3290 http://archive-access.svn.sourceforge.net/archive-access/?rev=3290&view=rev Author: bradtofel Date: 2010-10-21 17:34:40 +0000 (Thu, 21 Oct 2010) Log Message: ----------- INITIAL REV: very early but somewhat functional SWF URL rewriting. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java 2010-10-21 17:34:40 UTC (rev 3290) @@ -0,0 +1,88 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.replay.swf; + +import java.io.IOException; +import java.util.List; + +import com.flagstone.transform.MovieTag; +import com.flagstone.transform.coder.CoderException; +import com.flagstone.transform.coder.Context; +import com.flagstone.transform.coder.SWFDecoder; +import com.flagstone.transform.coder.SWFFactory; + +/** + * @author brad + * + */ +public class RobustMovieDecoder implements SWFFactory<MovieTag> { + private SWFFactory<MovieTag> delegate = null; + private int decodeRule = DECODE_RULE_STRICT; + + /** Decoding robustness/sloppiness factor. */ + public static final int DECODE_RULE = 19; + /** Allow no unparsed data - very strict decoding. */ + public static final int DECODE_RULE_STRICT = 0; + /** Allow trailing NULL values in decoded tags. */ + public static final int DECODE_RULE_NULLS = 1; + /** Allow arbitrary trailing values in decoded tags - attempt anything. */ + public static final int DECODE_RULE_LAX = 2; + + public void setDecodeRule(int decodeRule) { + this.decodeRule = decodeRule; + } + + public void setDelegate(SWFFactory<MovieTag> delegate) { + this.delegate = delegate; + } + + public void getObject(List<MovieTag> list, SWFDecoder coder, Context context) + throws IOException { + try { + delegate.getObject(list, coder, context); + } catch(CoderException e) { + int delta = coder.getDelta(); + switch (decodeRule) { + case DECODE_RULE_LAX: + // just eat the next 'delta' bytes and hope for the best.. 
+ while(delta-- > 0) { + coder.readByte(); + } + break; + case DECODE_RULE_NULLS: + // make sure next 'delta' bytes are null: + while(delta-- > 0) { + if(coder.readByte() != 0) { + throw new CoderException(coder.getLocation(), + coder.getExpected(), coder.getDelta()); + } + } + break; + case DECODE_RULE_STRICT: + default: + throw new CoderException(coder.getLocation(), + coder.getExpected(), coder.getDelta()); + } + + } + + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/RobustMovieDecoder.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java 2010-10-21 17:34:40 UTC (rev 3290) @@ -0,0 +1,247 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.replay.swf; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.zip.DataFormatException; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.commons.httpclient.URIException; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.wayback.ReplayRenderer; +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadContentException; +import org.archive.wayback.exception.WaybackException; +import org.archive.wayback.replay.HttpHeaderOperation; +import org.archive.wayback.replay.HttpHeaderProcessor; + +import com.flagstone.transform.DoAction; +import com.flagstone.transform.EventHandler; +import com.flagstone.transform.Movie; +import com.flagstone.transform.MovieTag; +import com.flagstone.transform.action.Action; +import com.flagstone.transform.action.GetUrl; +import com.flagstone.transform.action.Push; +import com.flagstone.transform.action.Table; +import com.flagstone.transform.button.DefineButton2; +import com.flagstone.transform.coder.DecoderRegistry; + +/** + * @author brad + * + */ +public class SWFReplayRenderer implements ReplayRenderer { + private HttpHeaderProcessor httpHeaderProcessor; + public SWFReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { + this.httpHeaderProcessor = httpHeaderProcessor; + } + + public void renderResource(HttpServletRequest httpRequest, + HttpServletResponse httpResponse, WaybackRequest wbRequest, + CaptureSearchResult result, 
Resource resource, + ResultURIConverter uriConverter, CaptureSearchResults results) + throws ServletException, IOException, WaybackException { + + // copy HTTP response code: + HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse); + + // load and process original headers: + Map<String,String> headers = HttpHeaderOperation.processHeaders( + resource, result, uriConverter, httpHeaderProcessor); + + // The URL of the resource, for resolving embedded relative URLs: + URL url = null; + try { + url = new URL(result.getOriginalUrl()); + } catch (MalformedURLException e1) { + e1.printStackTrace(); + throw new IOException(e1.getMessage()); + } + // the date to associate with the embedded, rewritten URLs: + String datespec = result.getCaptureTimestamp(); + SWFUrlRewriter rw = new SWFUrlRewriter(uriConverter, url, datespec); + + + // OK, try to read the input movie: + Movie movie = getRobustMovie(RobustMovieDecoder.DECODE_RULE_NULLS); + + try { + movie.decodeFromStream(resource); + } catch (DataFormatException e1) { + throw new BadContentException(e1.getLocalizedMessage()); + } + Movie outMovie = new Movie(movie); + + List<MovieTag> inTags = movie.getObjects(); + ArrayList<MovieTag> outTags = new ArrayList<MovieTag>(); + for(MovieTag tag : inTags) { + outTags.add(rewriteTag(rw,tag)); + } + outMovie.setObjects(outTags); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try { + outMovie.encodeToStream(baos); + } catch (DataFormatException e) { + throw new BadContentException(e.getLocalizedMessage()); + } + + // put the new corrected length: + headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, + String.valueOf(baos.size())); + + // send the new headers: + HttpHeaderOperation.sendHeaders(headers, httpResponse); + + // and copy the stored up byte-stream: + baos.writeTo(httpResponse.getOutputStream()); + + } + public Movie getRobustMovie(int decodeRule) { + Movie movie = new Movie(); + + DecoderRegistry registry = DecoderRegistry.getDefault(); + 
RobustMovieDecoder decoder = new RobustMovieDecoder(); + decoder.setDelegate(registry.getMovieDecoder()); + decoder.setDecodeRule(decodeRule); + registry.setMovieDecoder(decoder); + movie.setRegistry(registry); + + return movie; + } + + public MovieTag rewriteTag(SWFUrlRewriter rw, MovieTag tag) { + if(tag instanceof DoAction) { + DoAction doAction = (DoAction) tag; + doAction.setActions(rewriteActions(rw, doAction.getActions())); + } else if(tag instanceof DefineButton2) { + + DefineButton2 defButton2 = (DefineButton2) tag; + defButton2.setEvents(rewriteEventHandlers(rw, defButton2.getEvents())); + } + return tag; + } + + public List<EventHandler> rewriteEventHandlers(SWFUrlRewriter rw, List<EventHandler> handlers) { + ArrayList<EventHandler> newActions = new ArrayList<EventHandler>(); + for(EventHandler handler : handlers) { + handler.setActions(rewriteActions(rw, handler.getActions())); + newActions.add(handler); + } + return newActions; + } + + public List<Action> rewriteActions(SWFUrlRewriter rw, List<Action> actions) { + ArrayList<Action> newActions = new ArrayList<Action>(); + for(Action action : actions) { + if(action instanceof Table) { + + Table table = (Table) action; + table.setValues(rewriteStringValues(rw, table.getValues())); + newActions.add(table); + + } else if(action instanceof Push) { + + Push push = (Push) action; + + newActions.add(new Push(rewriteObjectValues(rw, push.getValues()))); + + } else if(action instanceof GetUrl) { + + GetUrl getUrl = (GetUrl) action; + newActions.add(new GetUrl(rewriteString(rw, getUrl.getUrl()),getUrl.getTarget())); + + } else { + newActions.add(action); + } + } + return newActions; + } + + public List<Object> rewriteObjectValues(SWFUrlRewriter rw, List<Object> values) { + ArrayList<Object> nvals = new ArrayList<Object>(); + for(int i = 0; i < values.size(); i++) { + Object orig = values.get(i); + if(orig instanceof String) { + nvals.add(rewriteString(rw, (String)orig)); + } else { + nvals.add(orig); + } + } + 
return nvals; + } + public List<String> rewriteStringValues(SWFUrlRewriter rw, List<String> values) { + ArrayList<String> nvals = new ArrayList<String>(); + for(int i = 0; i < values.size(); i++) { + nvals.add(rewriteString(rw, values.get(i))); + } + return nvals; + } + public String rewriteString(SWFUrlRewriter rw, String original) { + if(original.startsWith("http://")) { +// System.err.format("Rewrite(%s)\n",original); + return rw.rewrite(original); + } + return original; + } + private class SWFUrlRewriter { + UURI baseUrl = null; + ResultURIConverter converter; + String datespec; + public SWFUrlRewriter(ResultURIConverter converter, URL baseUrl, + String datespec) { + this.datespec = datespec; + this.converter = converter; + try { + this.baseUrl = UURIFactory.getInstance(baseUrl.toExternalForm()); + } catch (URIException e) { + e.printStackTrace(); + } + + + } + public String rewrite(String url) { + try { + String resolved = url; + if(baseUrl != null) { + resolved = UURIFactory.getInstance(baseUrl, url).toString(); + } + return converter.makeReplayURI(datespec, resolved); + } catch (URIException e) { + e.printStackTrace(); + } + return url; + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/swf/SWFReplayRenderer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-21 17:33:36
|
Revision: 3289 http://archive-access.svn.sourceforge.net/archive-access/?rev=3289&view=rev Author: bradtofel Date: 2010-10-21 17:33:30 +0000 (Thu, 21 Oct 2010) Log Message: ----------- BUGFIX(unreported) corrected rewriting of URLs embedded in CSS, which has no concept of <base href="">. Now for memento, we make URLs absolute, but do not prepend any prefix or date - relying on the client plugin to intercept and redirect the in-browser requests. This required a non-ArchivalUrlResultURIConverter, so many other codes which pulled the replay prefix from that object now pull it from the AccessPoint. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoResultURIConverter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java 2010-10-21 00:37:52 UTC (rev 3288) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java 2010-10-21 17:33:30 UTC (rev 3289) @@ -80,10 +80,8 @@ SimpleDateFormat formatterk = new SimpleDateFormat("yyyyMMddHHmmss"); formatterk.setTimeZone(tzo); Properties apProps = wbRequest.getAccessPoint().getConfigs(); - ArchivalUrlResultURIConverter aUriConverter = - (ArchivalUrlResultURIConverter) uriConverter; Date closestDate = result.getCaptureDate(); - String uriPrefix = 
aUriConverter.getReplayURIPrefix(); + String uriPrefix = wbRequest.getAccessPoint().getReplayPrefix(); String agguri = apProps.getProperty("aggregationPrefix") + "timebundle/" + u; String timemap = " , <" Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoResultURIConverter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoResultURIConverter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoResultURIConverter.java 2010-10-21 17:33:30 UTC (rev 3289) @@ -0,0 +1,33 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.memento; + +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.util.url.UrlOperations; + +/** + * @author brad + * + */ +public class MementoResultURIConverter implements ResultURIConverter { + public String makeReplayURI(String datespec, String url) { + return UrlOperations.stripDefaultPortFromUrl(url); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoResultURIConverter.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp 2010-10-21 00:37:52 UTC (rev 3288) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp 2010-10-21 17:33:30 UTC (rev 3289) @@ -52,9 +52,7 @@ + "timemap/link/" + u + ">;rel=\"timemap\"; type=\"text/csv\""; String origlink = ", <" + u + ">;rel=\"original\""; - ArchivalUrlResultURIConverter uriconverter = (ArchivalUrlResultURIConverter) results - .getURIConverter(); - String uriPrefix = uriconverter.getReplayURIPrefix(); + String uriPrefix = wbRequest.getAccessPoint().getReplayPrefix(); String replayUrl = results.resultToReplayUrl(res); StringBuffer sb = new StringBuffer(); Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-10-21 00:37:52 UTC (rev 3288) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-10-21 17:33:30 UTC (rev 3289) @@ -38,9 +38,7 @@ CaptureSearchResults 
cResults = results.getCaptureResults(); CaptureSearchResult res = cResults.getClosest(); - ArchivalUrlResultURIConverter uriconverter = (ArchivalUrlResultURIConverter) results - .getURIConverter(); - String uriPrefix = uriconverter.getReplayURIPrefix(); + String uriPrefix = wbRequest.getAccessPoint().getReplayPrefix(); String u = wbRequest.getRequestUrl(); String agguri = uriPrefix + "timebundle/" + u; String format = wbRequest.get("format"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3288 http://archive-access.svn.sourceforge.net/archive-access/?rev=3288&view=rev Author: bradtofel Date: 2010-10-21 00:37:52 +0000 (Thu, 21 Oct 2010) Log Message: ----------- FEATURE: added 'retries' property which determines the number of attempts to grab a document when receiving HTTP 502 errors. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2010-10-18 22:24:46 UTC (rev 3287) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2010-10-21 00:37:52 UTC (rev 3288) @@ -43,7 +43,10 @@ private final static Logger LOGGER = Logger.getLogger( SimpleResourceStore.class.getName()); + private static String HTTP_ERROR = "HTTP"; + private static String HTTP_502 = "502"; private String prefix = null; + private int retries = 2; public Resource retrieveResource(CaptureSearchResult result) throws ResourceNotAvailableException { @@ -65,9 +68,26 @@ String fileUrl = prefix + fileName; Resource r = null; try { + int attempts = retries; + while(attempts-- > 0) { + try { + r = ResourceFactory.getResource(fileUrl, offset); + break; + } catch (IOException e) { + String message = e.getMessage(); + if(attempts > 0 + && message.contains(HTTP_ERROR) + && message.contains(HTTP_502)) { + + LOGGER.warning(String.format( + "Failed attempt for (%s) retrying with" + + " (%d) attempts left",fileUrl,attempts)); + } else { + throw e; + } + } + } - r = ResourceFactory.getResource(fileUrl, offset); - } catch (IOException e) { LOGGER.warning("Unable to retrieve:" + fileUrl + ":" + offset); 
e.printStackTrace(); @@ -94,4 +114,20 @@ public void shutdown() throws IOException { // no-op } + + /** + * @return the number of attempts to fetch resources with an HTTP 502 + * failure. + */ + public int getRetries() { + return retries; + } + + /** + * @param retries the number of attempts to fetch resources with an HTTP 502 + * failure. + */ + public void setRetries(int retries) { + this.retries = retries; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-18 23:05:21
|
Revision: 3286 http://archive-access.svn.sourceforge.net/archive-access/?rev=3286&view=rev Author: bradtofel Date: 2010-10-18 22:23:58 +0000 (Mon, 18 Oct 2010) Log Message: ----------- Feature: Added ReplayRendererDecorator, to simplify creation of things like MementoReplayRendererDecorator, which adds HTTP headers for all replayed resources in a Memento accessPoint. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/MementoReplay.xml Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/ReplayRendererDecorator.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java 2010-10-18 22:23:58 UTC (rev 3286) @@ -0,0 +1,225 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.memento; + +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Iterator; +import java.util.Properties; +import java.util.TimeZone; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.ReplayRenderer; +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadContentException; +import org.archive.wayback.exception.WaybackException; +import org.archive.wayback.replay.HttpHeaderProcessor; +import org.archive.wayback.replay.TextReplayRenderer; +import org.archive.wayback.replay.ReplayRendererDecorator; + +/** + * @author brad + * + */ +public class MementoReplayRendererDecorator extends ReplayRendererDecorator { + + public MementoReplayRendererDecorator() { + super(); + } + /** + * @param decorated + * @param httpHeaderProcessor + */ + public MementoReplayRendererDecorator(ReplayRenderer decorated) { + super(decorated); + } + + @Override + public void renderResource(HttpServletRequest httpRequest, + HttpServletResponse httpResponse, WaybackRequest wbRequest, + CaptureSearchResult result, Resource resource, + ResultURIConverter uriConverter, CaptureSearchResults 
results) + throws ServletException, IOException, WaybackException { + + // add Memento headers: +// UIResults results = UIResults.extractCaptureQuery(request); +// WaybackRequest wbRequest = results.getWbRequest(); +// CaptureSearchResults cResults = results.getCaptureResults(); +// CaptureSearchResult res = cResults.getClosest(); + String u = wbRequest.getRequestUrl(); + SimpleDateFormat httpformatterl = new SimpleDateFormat( + "E, dd MMM yyyy HH:mm:ss z"); + TimeZone tzo = TimeZone.getTimeZone("GMT"); + httpformatterl.setTimeZone(tzo); + SimpleDateFormat formatterk = new SimpleDateFormat("yyyyMMddHHmmss"); + formatterk.setTimeZone(tzo); + Properties apProps = wbRequest.getAccessPoint().getConfigs(); + ArchivalUrlResultURIConverter aUriConverter = + (ArchivalUrlResultURIConverter) uriConverter; + Date closestDate = result.getCaptureDate(); + String uriPrefix = aUriConverter.getReplayURIPrefix(); + String agguri = apProps.getProperty("aggregationPrefix") + + "timebundle/" + u; + String timemap = " , <" + + apProps.getProperty("aggregationPrefix") + + "timemap/link/" + u + + ">;rel=\"timemap\"; type=\"text/csv\""; + + String timegate = ",<" + uriPrefix + "timegate/" + u + + ">;rel=\"timegate\""; + + Date f = results.getFirstResultDate(); + Date l = results.getLastResultDate(); + + StringBuffer sb = new StringBuffer(); + + httpResponse.setHeader("Memento-Datetime", + httpformatterl.format(result.getCaptureDate())); + + String memento = ",<" + uriPrefix + formatterk.format(closestDate) + + "/" + u + ">;rel=\"memento\";datetime=\"" + + httpformatterl.format(closestDate) + "\""; + String mfl = null; + if ((closestDate.equals(f)) && closestDate.equals(l)) { + mfl = ", <" + + uriPrefix + + formatterk.format(f) + + "/" + + u + + ">;rel=\"first-memento memento last-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + } else if (closestDate.equals(f)) { + mfl = ", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento memento\"; datetime=\"" + + 
httpformatterl.format(f) + "\""; + mfl = mfl + ", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento\"; datetime=\"" + + httpformatterl.format(l) + "\""; + + } else if (closestDate.equals(l)) { + mfl = ", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento memento\"; datetime=\"" + + httpformatterl.format(l) + "\""; + mfl = mfl + ", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + } else { + mfl = memento; + + mfl = mfl + ", <" + uriPrefix + formatterk.format(l) + "/" + u + + ">;rel=\"last-memento\"; datetime=\"" + + httpformatterl.format(l) + "\""; + mfl = mfl + ", <" + uriPrefix + formatterk.format(f) + "/" + u + + ">;rel=\"first-memento\"; datetime=\"" + + httpformatterl.format(f) + "\""; + } + + sb = new StringBuffer(mfl); + + // calculate closest values for link header + + CaptureSearchResult closestleft = null; + CaptureSearchResult closestright = null; + long rclosestDistance = 0; + long lclosestDistance = 0; + CaptureSearchResult cur = null; + + long wantTime = closestDate.getTime(); + + Iterator<CaptureSearchResult> itr = results.iterator(); + while (itr.hasNext()) { + cur = itr.next(); + cur.getCaptureDate(); + long curDistance = cur.getCaptureDate().getTime() - wantTime; + // == 0 skip + if (curDistance > 0) { + if ((closestright == null) + || (Math.abs(curDistance) < Math + .abs(rclosestDistance))) { + closestright = cur; + rclosestDistance = Math.abs(curDistance); + } + } + + if (curDistance < 0) { + if ((closestleft == null) + || (Math.abs(curDistance) < Math + .abs(lclosestDistance))) { + closestleft = cur; + lclosestDistance = Math.abs(curDistance); + } + } + + } + + if (closestleft != null) { + if (!(closestleft.getCaptureDate().equals(f))) { + + sb.append(", <" + + uriPrefix + + formatterk.format(closestleft.getCaptureDate()) + + "/" + + u + + ">;rel=\"prev-memento\"; datetime=\"" + + httpformatterl.format(closestleft + 
.getCaptureDate()) + "\""); + } else { + int m_index = sb.lastIndexOf("\"first-memento\""); + sb.insert(m_index + 1, "prev-memento "); + + } + } + if (closestright != null) { + if (!(closestright.getCaptureDate().equals(l))) { + sb.append(", <" + + uriPrefix + + formatterk.format(closestright.getCaptureDate()) + + "/" + + u + + ">;rel=\"next-memento\"; datetime=\"" + + httpformatterl.format(closestright + .getCaptureDate()) + "\""); + } else { + int m_index = sb.lastIndexOf("\"last-memento\""); + sb.insert(m_index + 1, "next-memento "); + + } + + } + + String origlink = ", <" + u + ">;rel=\"original\""; + + httpResponse.setHeader("Link", "<" + agguri + ">;rel=\"timebundle\"" + + origlink + sb.toString() + timemap + timegate); + + decorated.renderResource(httpRequest, httpResponse, wbRequest, result, + resource, uriConverter, results); + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/ReplayRendererDecorator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/ReplayRendererDecorator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/ReplayRendererDecorator.java 2010-10-18 22:23:58 UTC (rev 3286) @@ -0,0 +1,72 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.replay; + +import java.io.IOException; +import java.util.List; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.ReplayRenderer; +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadContentException; +import org.archive.wayback.exception.WaybackException; +import org.archive.wayback.replay.charset.CharsetDetector; + +/** + * @author brad + * + */ +public abstract class ReplayRendererDecorator implements ReplayRenderer { + + protected ReplayRenderer decorated = null; + /** + * @return the decorated + */ + public ReplayRenderer getDecorated() { + return decorated; + } + /** + * @param decorated the decorated to set + */ + public void setDecorated(ReplayRenderer decorated) { + this.decorated = decorated; + } + public ReplayRendererDecorator() { + } + /** + * @param httpHeaderProcessor + */ + public ReplayRendererDecorator(ReplayRenderer decorated) { + this.decorated = decorated; + } + + public abstract void renderResource(HttpServletRequest httpRequest, + HttpServletResponse httpResponse, WaybackRequest 
wbRequest, + CaptureSearchResult result, Resource resource, + ResultURIConverter uriConverter, CaptureSearchResults results) + throws ServletException, IOException, WaybackException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/ReplayRendererDecorator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/MementoReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/MementoReplay.xml 2010-10-18 22:17:43 UTC (rev 3285) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/MementoReplay.xml 2010-10-18 22:23:58 UTC (rev 3286) @@ -17,7 +17,9 @@ <list> <value>/WEB-INF/replay/ArchiveComment.jsp</value> <value>/WEB-INF/replay/Disclaimer.jsp</value> + <!-- <value>/WEB-INF/replay/MementoValidity.jsp</value> + --> </list> </property> </bean> @@ -29,13 +31,6 @@ <property name="selectors"> <list> - <!-- REDIRECT IF NOT EXACT DATE --> - <bean class="org.archive.wayback.replay.selector.DateMismatchSelector"> - <property name="renderer"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlDateRedirectReplayRenderer" /> - </property> - </bean> - <!-- HTML REPLAY --> <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> <property name="mimeContains"> @@ -44,7 +39,11 @@ <value>application/xhtml</value> </list> </property> - <property name="renderer" ref="mementoclientsidehtmlreplayrenderer"/> + <property name="renderer"> + <bean class="org.archive.wayback.memento.MementoReplayRendererDecorator"> + <property name="decorated" ref="mementoclientsidehtmlreplayrenderer"/> + </bean> + </property> </bean> <!-- CSS REPLAY --> @@ -54,32 +53,20 @@ <value>text/css</value> </list> </property> - <property name="renderer" 
ref="archivalcssreplayrenderer"/> - </bean> - - <!-- ASX-MIME REPLAY --> - <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> - <property name="mimeContains"> - <list> - <value>video/x-ms-asf</value> - </list> + <property name="renderer"> + <bean class="org.archive.wayback.memento.MementoReplayRendererDecorator"> + <property name="decorated" ref="archivalcssreplayrenderer"/> + </bean> </property> - <property name="renderer" ref="archivalasxreplayrenderer"/> </bean> - <!-- ASX-PATH REPLAY --> - <bean class="org.archive.wayback.replay.selector.PathMatchSelector"> - <property name="pathContains"> - <list> - <value>.asx</value> - </list> - </property> - <property name="renderer" ref="archivalasxreplayrenderer"/> - </bean> - <!-- DEFAULT-TRANSPARENT REPLAY --> <bean class="org.archive.wayback.replay.selector.AlwaysMatchSelector"> - <property name="renderer" ref="archivaltransparentreplayrenderer"/> + <property name="renderer"> + <bean class="org.archive.wayback.memento.MementoReplayRendererDecorator"> + <property name="decorated" ref="archivaltransparentreplayrenderer"/> + </bean> + </property> </bean> </list> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3287 http://archive-access.svn.sourceforge.net/archive-access/?rev=3287&view=rev Author: bradtofel Date: 2010-10-18 22:24:46 +0000 (Mon, 18 Oct 2010) Log Message: ----------- BUGFIX(unreported) now correctly fails to parse incoming timegate requests missing the trailing '/' Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeGateRequestParser.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeGateRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeGateRequestParser.java 2010-10-18 22:23:58 UTC (rev 3286) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeGateRequestParser.java 2010-10-18 22:24:46 UTC (rev 3287) @@ -51,7 +51,7 @@ List<SimpleDateFormat> dtsupportedformats = new ArrayList<SimpleDateFormat>(); - String MEMENTO_BASE = "timegate"; + String MEMENTO_BASE = "timegate/"; /** * @param wrapped @@ -78,7 +78,7 @@ if (base.startsWith(MEMENTO_BASE)) { // strip leading "timegate/": - String urlStr = base.substring(requestPath.indexOf("/") + 1); + String urlStr = base.substring(MEMENTO_BASE.length()); // get the "Accept-Datetime" header: String httpdate = getHttpDate(httpRequest); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-18 22:17:50
|
Revision: 3285 http://archive-access.svn.sourceforge.net/archive-access/?rev=3285&view=rev Author: bradtofel Date: 2010-10-18 22:17:43 +0000 (Mon, 18 Oct 2010) Log Message: ----------- BUGFIX (unreported) HTTP header was missing text/csv type in Link Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-10-11 20:44:05 UTC (rev 3284) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-10-18 22:17:43 UTC (rev 3285) @@ -109,7 +109,7 @@ linkbf.append(",<" + results.getContextConfig("Prefix") + "timegate/" + u + ">;rel=\"timegate\"\n"); linkbf.append(",<" + uriPrefix + "timemap/" + format + "/" + u - + ">;rel=\"timemap\"\n"); + + ">;rel=\"timemap\";type=\"text/csv\"\n"); String firstmemento = null; int count = 0; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-11 20:44:12
|
Revision: 3284 http://archive-access.svn.sourceforge.net/archive-access/?rev=3284&view=rev Author: bradtofel Date: 2010-10-11 20:44:05 +0000 (Mon, 11 Oct 2010) Log Message: ----------- now uses logging.properties Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/log4j.properties Deleted: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/log4j.properties =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/log4j.properties 2010-10-11 20:43:02 UTC (rev 3283) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/log4j.properties 2010-10-11 20:44:05 UTC (rev 3284) @@ -1,11 +0,0 @@ -log4j.rootLogger=WARN, R -log4j.appender.R=org.apache.log4j.ConsoleAppender -log4j.appender.R.target=System.err -#log4j.appender.R=org.apache.log4j.RollingFileAppender -#log4j.appender.R.File=/tmp/wayback.log -#log4j.appender.R.MaxFileSize=100MB -#log4j.appender.R.MaxBackupIndex=10 -log4j.appender.R.layout=org.apache.log4j.PatternLayout -log4j.appender.R.layout.ConversionPattern=%d{ISO8601} %p %t %c - %m%n -log4j.logger.org.archive.wayback=WARN -log4j.logger.org.archive.wayback.webapp.AccessPoint=INFO This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-11 20:43:09
|
Revision: 3283 http://archive-access.svn.sourceforge.net/archive-access/?rev=3283&view=rev Author: bradtofel Date: 2010-10-11 20:43:02 +0000 (Mon, 11 Oct 2010) Log Message: ----------- INITIAL REV: simple example of inserting content that will break javascript in selected pages, if the javascript is interfering with replay Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveBrokenCSSComment.jsp Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveBrokenCSSComment.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveBrokenCSSComment.jsp (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveBrokenCSSComment.jsp 2010-10-11 20:43:02 UTC (rev 3283) @@ -0,0 +1,23 @@ +<%@ page language="java" pageEncoding="utf-8" contentType="text/css;charset=utf-8" +%><%@ page import="java.util.Date" +%><%@ page import="org.archive.wayback.core.UIResults" +%><%@ page import="org.archive.wayback.util.StringFormatter" +%><% +UIResults results = UIResults.extractReplay(request); +StringFormatter fmt = results.getWbRequest().getFormatter(); +Date exactDate = results.getResult().getCaptureDate(); +Date now = new Date(); +String prettyDateFormat = "{0,date,H:mm:ss MMM d, yyyy}"; +String prettyArchiveString = fmt.format(prettyDateFormat,exactDate); +String prettyRequestString = fmt.format(prettyDateFormat,now); +%> +/* + FILE ARCHIVED ON <%= prettyArchiveString %> AND RETRIEVED FROM THE + INTERNET ARCHIVE ON <%= prettyRequestString %>. + JAVASCRIPT APPENDED BY WAYBACK MACHINE, COPYRIGHT INTERNET ARCHIVE. + + ALL OTHER CONTENT MAY ALSO BE PROTECTED BY COPYRIGHT (17 U.S.C. + SECTION 108(a)(3)). +*/ +intentionally broken stuff here to assist Wayback replay... 
+ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-11 20:39:22
|
Revision: 3282 http://archive-access.svn.sourceforge.net/archive-access/?rev=3282&view=rev Author: bradtofel Date: 2010-10-11 20:39:15 +0000 (Mon, 11 Oct 2010) Log Message: ----------- TWEAK: added Memento support configuration example, reorganized some fo the default configs.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2010-10-11 20:38:31 UTC (rev 3281) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2010-10-11 20:39:15 UTC (rev 3282) @@ -15,11 +15,14 @@ <property name="properties"> <value> wayback.basedir=/tmp/wayback + wayback.urlprefix=http://localhost.archive.org:8080/wayback/ </value> </property> </bean> <bean id="waybackCanonicalizer" class="org.archive.wayback.util.url.AggressiveUrlCanonicalizer" /> + + <!-- The ResourceFileLocationDB implementation to use for mapping ARC/WARC names to absolute paths/URLs via a BDBJE database. @@ -29,7 +32,6 @@ <property name="bdbName" value="DB1" /> <property name="logPath" value="${wayback.basedir}/file-db/db.log" /> </bean> - <!-- The following bean provides an alternate flat-file based LocationDB implementation. @@ -55,17 +57,18 @@ Required when using the SimpleResourceStore to access distributed ARC/WARC files over HTTP through a single reverse proxy. --> - +<!-- <bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet"> <property name="locationDB" ref="resourcefilelocationdb" /> </bean> +--> - <!-- The XML files indicated in the following import tags contain alternate example implementations of WaybackCollections. To specify where your ARC/WARC files are located, see the file BDBCollection.xml. 
--> + <import resource="BDBCollection.xml"/> <!-- <import resource="CDXCollection.xml"/> @@ -73,18 +76,57 @@ <import resource="NutchCollection.xml"/> --> + + + <!-- - LiveWeb.xml contains beans that enable fetching content from the live - web, and caching those results in ARC files. This import is needed if you - use the "excluder-factory-robot" exclusionFactory property of the + LiveWeb.xml contains the 'proxylivewebcache' bean that enable fetching + content from the live web, recording that content in ARC files. + To use the "excluder-factory-robot" bean as an exclusionFactory property of AccessPoints, which will cause live robots.txt files to be consulted - retroactively before showing archived content. + retroactively before showing archived content, you'll need to import + LiveWeb.xml as well. --> <!-- <import resource="LiveWeb.xml"/> + <bean id="excluder-factory-robot" class="org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory"> + <property name="maxCacheMS" value="86400000" /> + <property name="userAgent" value="ia_archiver" /> + <property name="webCache" ref="proxylivewebcache" /> + </bean> --> <!-- + The 'excluder-factory-static' bean defines an exclusionFactory object which + consults a local text file containing either URLs or SURTs of content to + block from the ResourceIndex. These URLs or SURTs are treated as prefixes: + "http://www.archive.org/ima" will block anything starting with that string + from being returned from the index. +--> +<!-- + <bean id="excluder-factory-static" class="org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory"> + <property name="file" value="/var/tmp/os-cdx/exclusion-2008-09-22-cleaned.txt" /> + <property name="checkInterval" value="600000" /> + </bean> +--> + +<!-- + The 'excluder-factory-composite' bean creates a single exclusionFactory + which restricts from both a static list of URLs, and also by live web + robots.txt documents. 
+--> +<!-- + <bean id="excluder-factory-composite" class="org.archive.wayback.accesscontrol.CompositeExclusionFilterFactory"> + <property name="factories"> + <list> + <ref bean="excluder-factory-static" /> + <ref bean="excluder-factory-robot" /> + </list> + </property> + </bean> +--> + +<!-- This is the only AccessPoint defined by default within this wayback.xml Spring configuration file, providing an ArchivalURL Replay UI to the "localbdbcollection", defined in "BDBCollection.xml" by providing @@ -96,13 +138,46 @@ with your fully qualified hostname of the computer running Tomcat. --> <import resource="ArchivalUrlReplay.xml"/> + + <!-- + Last ditch attempt to resolve server-relative URLs (/page1.htm) that were + not successfully rewritten, resolving them against the referring URL to + get them back on track. + --> + <bean name="+" class="org.archive.wayback.webapp.ServerRelativeArchivalRedirect"> + <property name="matchPort" value="8080" /> + <property name="useCollection" value="true" /> + </bean> + <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> + <property name="serveStatic" value="true" /> + <property name="bounceToReplayPrefix" value="false" /> + <property name="bounceToQueryPrefix" value="false" /> + <!-- + These properties enable customized handling of query, replay, and static + requests by different URL prefixes + --> + + <property name="replayPrefix" value="${wayback.urlprefix}" /> + <property name="queryPrefix" value="${wayback.urlprefix}" /> + <property name="staticPrefix" value="${wayback.urlprefix}" /> + + <!-- + The following property will cause only results matching the exact host + the user requested to be displayed. URLs matching other versions of the + same host will be stored in the closeMatches list of the SearchResults, + and can be displayed by query .jsp files. 
+ --> + <!-- + <property name="exactHostMatch" value="true" /> + --> + <property name="collection" ref="localbdbcollection" /> <!-- - An example of a text file CDX collection, with a text file path index. <property name="collection" ref="localcdxcollection" /> --> + <property name="replay" ref="archivalurlreplay" /> <property name="query"> <bean class="org.archive.wayback.query.Renderer"> @@ -114,49 +189,99 @@ </bean> </property> -<!-- See the LiveWeb.xml import above. - <property name="exclusionFactory" ref="excluder-factory-robot" /> ---> <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost.archive.org:8080/wayback/"/> + <property name="replayURIPrefix" value="${wayback.urlprefix}"/> </bean> </property> <property name="parser"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser"> - <property name="maxRecords" value="1000" /> + <property name="maxRecords" value="10000" /> <!-- <property name="earliestTimestamp" value="1999" /> <property name="latestTimestamp" value="2004" /> --> </bean> </property> - <!-- - The following property will cause only results matching the exact host - the user requested to be displayed. URLs matching other versions of the - same host will be stored in the closeMatches list of the SearchResults, - and can be displayed by query .jsp files. - --> - <!-- - <property name="exactHostMatch" value="true" /> - --> + +<!-- See the LiveWeb.xml import above. + <property name="exclusionFactory" ref="excluder-factory-static" /> +--> + </bean> - <!-- - + =========================================================== All beans defined below here represent examples of alternate AccessPoint definitions and implementations. + =========================================================== +--> + +<!-- + The following import and two bean definitions enable Memento access to + content in your collections. 
--> +<!-- + <import resource="MementoReplay.xml"/> + <bean name="8080:memento" parent="8080:wayback"> + <property name="configs"> + <props> + <prop key="aggregationPrefix">http://localhost.archive.org:8080/list/</prop> + </props> + </property> + <property name="replay" ref="mementoreplay" /> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/Memento.jsp" /> + </bean> + </property> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/memento/"/> + </bean> + </property> + <property name="parser"> + <bean class="org.archive.wayback.memento.MementoRequestParser"> + <property name="maxRecords" value="10000" /> + <property name="earliestTimestamp" value="1996" /> + </bean> + </property> + <property name="exception"> + <bean class="org.archive.wayback.exception.BaseExceptionRenderer"> + <property name="errorJsp" value="/WEB-INF/exception/TimegateError.jsp" /> + </bean> + </property> + </bean> + <bean name="8080:list" parent="8080:memento"> + <property name="staticPrefix" value="http://localhost.archive.org:8080/list/" /> + <property name="configs"> + <props> + <prop key="Prefix">http://localhost.archive.org:8080/memento/</prop> + </props> + </property> + <property name="replay" ref="archivalurlreplay" /> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/ORE.jsp" /> + </bean> + </property> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://memento.localhost.archive.org:8080/list/"/> + </bean> + </property> + </bean> +--> + <!-- The following AccessPoint inherits all configuration from the 8080:wayback AccessPoint, but provides a OpenSearch format query results. 
@@ -165,6 +290,7 @@ 8080:wayback AccessPoint: presumably users following links from here will prefer the HTML interface. --> + <!-- <bean name="8080:opensearch" parent="8080:wayback"> <property name="urlRoot" value="http://localhost.archive.org:8080/wayback/" /> <property name="query"> @@ -180,7 +306,7 @@ </bean> </property> </bean> - + --> <!-- The following AccessPoint inherits all configuration from the 8080:wayback @@ -222,8 +348,14 @@ Note: using this AccessPoint requires adding a "Connector" on port 8090 in your Tomcat's server.xml file. --> + <!-- <import resource="ProxyReplay.xml"/> - <bean name="8090" parent="8080:wayback"> + <bean name="8090" parent="replay.wayback.localhost.archive.org:8080"> + <property name="serveStatic" value="false" /> + <property name="bounceToReplayPrefix" value="false" /> + <property name="bounceToQueryPrefix" value="false" /> + <property name="refererAuth" value="" /> + <property name="urlRoot" value="http://localhost.archive.org:8090/" /> <property name="replay" ref="proxyreplay" /> <property name="uriConverter"> @@ -232,94 +364,17 @@ </bean> </property> <property name="parser"> - <bean class="org.archive.wayback.proxy.ProxyArchivalRequestParser"> + <bean class="org.archive.wayback.proxy.ProxyRequestParser"> <property name="localhostNames"> <list> <value>localhost.archive.org</value> </list> </property> <property name="maxRecords" value="1000" /> + <property name="addDefaults" value="false" /> </bean> </property> </bean> - - <bean name="8091" parent="8080:wayback"> - <property name="urlRoot" value="http://localhost.archive.org/" /> - <property name="replay" ref="proxyreplay" /> - <property name="uriConverter"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost.archive.org/"/> - </bean> -<!-- - <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> - <property name="redirectURI" 
value="http://localhost.archive.org:8090/jsp/QueryUI/Redirect.jsp" /> - </bean> --> - </property> - <property name="parser"> - <bean class="org.archive.wayback.proxy.ProxyArchivalRequestParser"> - <property name="localhostNames"> - <list> - <value>localhost.archive.org</value> - </list> - </property> - <property name="maxRecords" value="1000" /> - </bean> - </property> - </bean> -<!-- - The following AccessPoint inherits all configuration from the 8080:wayback - AccessPoint, but uses an Access Control Oracle to determine if archived - content should be accessible. - - The Access Control Oracle was developed by Alex Osborne of the NLA. - - Some documentation for this project is available at: - - http://webteam.archive.org/confluence/display/wayback/Exclusions+API ---> - -<!-- - <bean name="8080:exclusion" parent="8080:wayback"> - <property name="exclusionFactory"> - <bean class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> - <property name="oracleUrl" value="http://localhost:8180/oracle/" /> - <property name="accessGroup" value="ia_archiver" /> - </bean> - </property> - <property name="uriConverter"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost:8080/exclusion/" /> - </bean> - </property> - </bean> ---> - - -<!-- - The following AccessPoint inherits all configuration from the 8080:wayback - AccessPoint, but only allows access from the specified IP network. 
---> -<!-- - <bean name="8080:netsecure" parent="8080:wayback"> - - <property name="authentication"> - <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> - <property name="allowedRanges"> - <list> - <value>192.168.1.16/24</value> - </list> - </property> - </bean> - </property> - - <property name="uriConverter"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost.archive.org:8080/netsecure/" /> - </bean> - </property> - </bean> ---> - </beans> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-10-11 20:38:38
|
Revision: 3281 http://archive-access.svn.sourceforge.net/archive-access/?rev=3281&view=rev Author: bradtofel Date: 2010-10-11 20:38:31 +0000 (Mon, 11 Oct 2010) Log Message: ----------- TWEAK: moved a couple AccessPoint configuration examples from wayback.xml here Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml 2010-10-11 20:38:04 UTC (rev 3280) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml 2010-10-11 20:38:31 UTC (rev 3281) @@ -8,7 +8,8 @@ <!-- This file contains an alternate "8080:wayback" AccessPoint demonstrating - several optional AccessPoint configurations. + several optional AccessPoint configurations, and a few other, hopefully + useful, example AccessPoint configurations. 
--> <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> @@ -84,4 +85,83 @@ </property> </bean> + + + <!-- + <bean name="8091" parent="8080:wayback"> + <property name="urlRoot" value="http://localhost.archive.org/" /> + <property name="replay" ref="proxyreplay" /> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org/"/> + </bean> + </property> + <property name="parser"> + <bean class="org.archive.wayback.proxy.ProxyArchivalRequestParser"> + <property name="localhostNames"> + <list> + <value>localhost.archive.org</value> + </list> + </property> + <property name="maxRecords" value="1000" /> + </bean> + </property> + </bean> +--> + +<!-- + The following AccessPoint inherits all configuration from the 8080:wayback + AccessPoint, but uses an Access Control Oracle to determine if archived + content should be accessible. + + The Access Control Oracle was developed by Alex Osborne of the NLA. + + Some documentation for this project is available at: + + http://webteam.archive.org/confluence/display/wayback/Exclusions+API +--> + +<!-- + <bean name="8080:exclusion" parent="8080:wayback"> + <property name="exclusionFactory"> + <bean class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> + <property name="oracleUrl" value="http://localhost:8180/oracle/" /> + <property name="accessGroup" value="ia_archiver" /> + </bean> + </property> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost:8080/exclusion/" /> + </bean> + </property> + </bean> +--> + + +<!-- + The following AccessPoint inherits all configuration from the 8080:wayback + AccessPoint, but only allows access from the specified IP network. 
+--> +<!-- + <bean name="8080:netsecure" parent="8080:wayback"> + + <property name="authentication"> + <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> + <property name="allowedRanges"> + <list> + <value>192.168.1.16/24</value> + </list> + </property> + </bean> + </property> + + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/netsecure/" /> + </bean> + </property> + </bean> +--> + + </beans> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |