You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2008-02-07 00:39:46
|
Revision: 2180 http://archive-access.svn.sourceforge.net/archive-access/?rev=2180&view=rev Author: bradtofel Date: 2008-02-06 16:39:51 -0800 (Wed, 06 Feb 2008) Log Message: ----------- RELEASE: 1.2.0 Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/pom.xml trunk/archive-access/projects/wayback/pom.xml trunk/archive-access/projects/wayback/wayback-core/pom.xml trunk/archive-access/projects/wayback/wayback-mapreduce/pom.xml trunk/archive-access/projects/wayback/wayback-mapreduce-prereq/pom.xml trunk/archive-access/projects/wayback/wayback-webapp/pom.xml Modified: trunk/archive-access/projects/wayback/dist/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/pom.xml 2008-02-07 00:29:00 UTC (rev 2179) +++ trunk/archive-access/projects/wayback/dist/pom.xml 2008-02-07 00:39:51 UTC (rev 2180) @@ -3,7 +3,7 @@ <parent> <groupId>org.archive</groupId> <artifactId>wayback</artifactId> - <version>1.1.0-SNAPSHOT</version> + <version>1.2.0</version> </parent> <modelVersion>4.0.0</modelVersion> @@ -54,13 +54,13 @@ <dependency> <groupId>org.archive.wayback</groupId> <artifactId>wayback-webapp</artifactId> - <version>1.1.0-SNAPSHOT</version> + <version>1.2.0</version> <type>war</type> </dependency> <dependency> <groupId>org.archive.wayback</groupId> <artifactId>wayback-mapreduce</artifactId> - <version>1.1.0-SNAPSHOT</version> + <version>1.2.0</version> </dependency> </dependencies> Modified: trunk/archive-access/projects/wayback/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2008-02-07 00:29:00 UTC (rev 2179) +++ trunk/archive-access/projects/wayback/pom.xml 2008-02-07 00:39:51 UTC (rev 2180) @@ -16,7 +16,7 @@ <modelVersion>4.0.0</modelVersion> <groupId>org.archive</groupId> <artifactId>wayback</artifactId> - <version>1.1.0-SNAPSHOT</version> + <version>1.2.0</version> <packaging>pom</packaging> 
<name>Wayback</name> Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2008-02-07 00:29:00 UTC (rev 2179) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2008-02-07 00:39:51 UTC (rev 2180) @@ -17,7 +17,7 @@ <parent> <groupId>org.archive</groupId> <artifactId>wayback</artifactId> - <version>1.1.0-SNAPSHOT</version> + <version>1.2.0</version> </parent> <groupId>org.archive.wayback</groupId> <artifactId>wayback-core</artifactId> Modified: trunk/archive-access/projects/wayback/wayback-mapreduce/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-mapreduce/pom.xml 2008-02-07 00:29:00 UTC (rev 2179) +++ trunk/archive-access/projects/wayback/wayback-mapreduce/pom.xml 2008-02-07 00:39:51 UTC (rev 2180) @@ -12,7 +12,7 @@ <parent> <groupId>org.archive</groupId> <artifactId>wayback</artifactId> - <version>1.1.0-SNAPSHOT</version> + <version>1.2.0</version> </parent> <groupId>org.archive.wayback</groupId> <artifactId>wayback-mapreduce</artifactId> Modified: trunk/archive-access/projects/wayback/wayback-mapreduce-prereq/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-mapreduce-prereq/pom.xml 2008-02-07 00:29:00 UTC (rev 2179) +++ trunk/archive-access/projects/wayback/wayback-mapreduce-prereq/pom.xml 2008-02-07 00:39:51 UTC (rev 2180) @@ -10,7 +10,7 @@ <parent> <groupId>org.archive</groupId> <artifactId>wayback</artifactId> - <version>1.1.0-SNAPSHOT</version> + <version>1.2.0</version> </parent> <groupId>org.archive.wayback</groupId> <artifactId>wayback-mapreduce-prereq</artifactId> Modified: trunk/archive-access/projects/wayback/wayback-webapp/pom.xml =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2008-02-07 00:29:00 UTC (rev 2179) +++ trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2008-02-07 00:39:51 UTC (rev 2180) @@ -3,7 +3,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive</groupId> - <version>1.1.0-SNAPSHOT</version> + <version>1.2.0</version> </parent> <modelVersion>4.0.0</modelVersion> <groupId>org.archive.wayback</groupId> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-07 00:28:55
|
Revision: 2179 http://archive-access.svn.sourceforge.net/archive-access/?rev=2179&view=rev Author: bradtofel Date: 2008-02-06 16:29:00 -0800 (Wed, 06 Feb 2008) Log Message: ----------- DOC: updated with major feature notes for 1.2 Modified Paths: -------------- trunk/archive-access/projects/wayback/plan.txt Modified: trunk/archive-access/projects/wayback/plan.txt =================================================================== --- trunk/archive-access/projects/wayback/plan.txt 2008-02-07 00:09:52 UTC (rev 2178) +++ trunk/archive-access/projects/wayback/plan.txt 2008-02-07 00:29:00 UTC (rev 2179) @@ -42,6 +42,7 @@ * numerous bug-fixes * pluggable URL canonicalization module * initial support for de-duplicated WARC records + == BRAINSTORMING == This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-07 00:09:50
|
Revision: 2178 http://archive-access.svn.sourceforge.net/archive-access/?rev=2178&view=rev Author: bradtofel Date: 2008-02-06 16:09:52 -0800 (Wed, 06 Feb 2008) Log Message: ----------- DOC: Added basic info about duplicate reduction features. Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2008-02-07 00:09:12 UTC (rev 2177) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2008-02-07 00:09:52 UTC (rev 2178) @@ -289,6 +289,7 @@ <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> <property name="source" ... /> <property name="maxRecords" value="10000" /> + <property name="dedupeRecords" value="false" /> </bean> </property> @@ -301,9 +302,16 @@ specifies the maximum number of records to process, and thus that can be returned, during a single query. </p> - <br></br> <p> <b> + dedupeRecords + </b> + set to true if you are using WARC files created by Heritrix 1.12 or + higher and configured the duplicate reduction features. See the + section Duplicate Reduction below for more information. + </p> + <p> + <b> source </b> defines the format to be used for storing and searching records in @@ -1644,6 +1652,29 @@ </p> </subsection> </section> - + <section name="Duplicate Reduction"> + <p> + Heritrix 1.12 and above have the capability to write WARC files, which + omit storing documents that have not changed since a previous visit. For + specifics on activating these features, please refer to the Heritrix + documentation. 
When Heritrix is using these features, and notices that + a document has not changed since the last time it was visited, it + creates an abbreviated WARC record, indicating that the document was + retrieved but not stored. In this abbreviated WARC record is an + indicator of the SHA1 digest of the document. + </p> + <p> + The wayback uses these identical SHA1 digests to map the location + (ARC/WARC + offset) of the original record that was stored to subsequent + records that were not. When a request for a subsequent capture that was + not stored is received by wayback, it will return the content of the + previous stored record. + </p> + <p> + The matching of these digests occurs at query time, and is configured + by setting the "dedupeRecords" option of the LocalResourceIndex to + "true". + </p> + </section> </body> </document> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2177 http://archive-access.svn.sourceforge.net/archive-access/?rev=2177&view=rev Author: bradtofel Date: 2008-02-06 16:09:12 -0800 (Wed, 06 Feb 2008) Log Message: ----------- BUGFIX (unreported) now we track all non-abbreviated records for matching against subsequent abbreviated records. Previously only the most recent non-abbrev record was saved, which caused problems with massaging: Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-02-06 02:01:26 UTC (rev 2176) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-02-07 00:09:12 UTC (rev 2177) @@ -7,17 +7,18 @@ import org.archive.wayback.util.Adapter; /** - * Adapter class that observes a stream of SearchResults tracking the last seen: + * Adapter class that observes a stream of SearchResults tracking for each + * complete record, a mapping of that records digest to: * Arc/Warc Filename * Arc/Warc offset * HTTP Response * MIME-Type * Redirect URL * - * for complete SearchResults. If subsequent SearchResults are missing these - * fields ("-") and the Digest field is the same, then the subsequent - * SearchResults are updated with the values from the kept copy, and an - * additional annotation field is added. 
+ * If subsequent SearchResults are missing these fields ("-") and the Digest + * field has been seen, then the subsequent SearchResults are updated with the + * values from the kept copy matching that digest, and an additional annotation + * field is added. * * * @author brad @@ -26,42 +27,46 @@ public class DeduplicationSearchResultAnnotationAdapter implements Adapter<SearchResult,SearchResult> { private final static String EMPTY_VALUE = "-"; + + // these fields are all copied to deduped records as-is: private final static String FIELDS[] = { WaybackConstants.RESULT_ARC_FILE, WaybackConstants.RESULT_OFFSET, WaybackConstants.RESULT_HTTP_CODE, WaybackConstants.RESULT_MIME_TYPE, - WaybackConstants.RESULT_REDIRECT_URL + WaybackConstants.RESULT_REDIRECT_URL, }; - private String lastDigest = null; - private String lastTimeStamp = null; - private HashMap<String,String> lastValues = new HashMap<String,String>(); + private HashMap<String,SearchResult> memory = null; + + public DeduplicationSearchResultAnnotationAdapter() { + memory = new HashMap<String,SearchResult>(); + } + private SearchResult annotate(SearchResult o) { String thisDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST); - if(!thisDigest.equals(lastDigest)) { + SearchResult last = memory.get(thisDigest); + if(last == null) { return null; } for(String field : FIELDS) { - o.put(field, lastValues.get(field)); + o.put(field, last.get(field)); } o.put(WaybackConstants.RESULT_DUPLICATE_ANNOTATION, WaybackConstants.RESULT_DUPLICATE_DIGEST); - o.put(WaybackConstants.RESULT_DUPLICATE_STORED_DATE, lastTimeStamp); + o.put(WaybackConstants.RESULT_DUPLICATE_STORED_DATE, + last.get(WaybackConstants.RESULT_CAPTURE_DATE)); return o; } + private SearchResult remember(SearchResult o) { - lastDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST); - lastTimeStamp = o.get(WaybackConstants.RESULT_CAPTURE_DATE); - for(String field : FIELDS) { - lastValues.put(field, o.get(field)); - } + 
memory.put(o.get(WaybackConstants.RESULT_MD5_DIGEST),o); return o; } + public SearchResult adapt(SearchResult o) { if(o.get(FIELDS[0]).equals(EMPTY_VALUE)) { return annotate(o); } return remember(o); } - -} +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-06 02:01:21
|
Revision: 2176 http://archive-access.svn.sourceforge.net/archive-access/?rev=2176&view=rev Author: bradtofel Date: 2008-02-05 18:01:26 -0800 (Tue, 05 Feb 2008) Log Message: ----------- now under .../articles/release_notes.xml Removed Paths: ------------- trunk/archive-access/projects/wayback/dist/src/site/articles/releasenotes.html Deleted: trunk/archive-access/projects/wayback/dist/src/site/articles/releasenotes.html =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/articles/releasenotes.html 2008-02-06 02:00:42 UTC (rev 2175) +++ trunk/archive-access/projects/wayback/dist/src/site/articles/releasenotes.html 2008-02-06 02:01:26 UTC (rev 2176) @@ -1,8 +0,0 @@ -<html> - <head> - <title>Wayback Release Notes</title> - </head> -<body> - For brad. -</body> -</html> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-06 02:00:38
|
Revision: 2175 http://archive-access.svn.sourceforge.net/archive-access/?rev=2175&view=rev Author: bradtofel Date: 2008-02-05 18:00:42 -0800 (Tue, 05 Feb 2008) Log Message: ----------- DOCS: updated with new 1.2.0 features, configuration, added release_notes.html Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/site/site.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/navigation.xml Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml Modified: trunk/archive-access/projects/wayback/dist/src/site/site.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/site.xml 2008-02-06 01:13:42 UTC (rev 2174) +++ trunk/archive-access/projects/wayback/dist/src/site/site.xml 2008-02-06 02:00:42 UTC (rev 2175) @@ -31,6 +31,7 @@ <item name="User Manual" href="user_manual.html"/> <item name="Administrator Manual" href="administrator_manual.html"/> <item name="Developer Manual" href="developer_manual.html"/> + <item name="Release Notes" href="release_notes.html"/> <item name="FAQ" href="/faq.html"/> <item name="API" href="./apidocs"/> <item name="Browse/Submit a Bug" Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2008-02-06 01:13:42 UTC (rev 2174) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2008-02-06 02:00:42 UTC (rev 2175) @@ -127,34 +127,40 @@ <section name="org.archive.wayback.ResourceStore implementations"> - <subsection name="LocalARCResourceStore"> + <subsection name="LocalResourceStore"> <p> This implementation works well for small - 
collections, where all the ARC files can be placed in a single + collections, where all the ARC/WARC files can be placed in a single directory on the same computer running the wayback application. Using NFS or another network filesystem technology and symbolic - links can allow this implementation to deal with ARC files in + links can allow this implementation to deal with files in multiple directories, or across multiple storage nodes. This implementation also includes the capability to run a background - thread to automatically notice new ARC files appearing, index - those ARC files, and hand off the index data for merging with + thread to automatically notice new ARC/WARC files appearing, index + those files, and hand off the index data for merging with a BDBResourceIndex. </p> <p> - The XML configuration template for a LocalARCResourceStore follows: + The XML configuration template for a LocalResourceStore follows: <pre> -<property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.LocalARCResourceStore" - init-method="init"> - <property name="arcDir" value="/tmp/wayback/arcs/" /> - <property name="queuedDir" value="/tmp/wayback/arc-indexer/queued" /> - <property name="workDir" value="/tmp/wayback/arc-indexer/work" /> - <property name="runInterval" value="10000" /> - <property name="indexClient"> - <bean class="org.archive.wayback.resourceindex.indexer.IndexClient"> - <property name="tmpDir" value="/tmp/wayback/arc-indexer/tmp" /> - <property name="target" value="/tmp/wayback/index-data/incoming" /> +<property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.LocalResourceStore" + init-method="init"> + + <property name="dataDir" value="/tmp/wayback/arcs/" /> + + <property name="indexThread"> + <bean class="org.archive.wayback.resourcestore.AutoIndexThread"> + <property name="queuedDir" value="/tmp/wayback/arc-indexer/queued" /> + <property name="workDir" value="/tmp/wayback/arc-indexer/work" /> + <property 
name="runInterval" value="10000" /> + <property name="indexClient"> + <bean class="org.archive.wayback.resourceindex.indexer.IndexClient"> + <property name="tmpDir" value="/tmp/wayback/arc-indexer/tmp" /> + <property name="target" value="/tmp/wayback/index-data/incoming" /> + </bean> + </property> </bean> </property> </bean> @@ -167,7 +173,7 @@ <ul> <li> <b> - arcDir + dataDir </b> is the local directory where ARC files will be located. @@ -175,7 +181,8 @@ </ul> </p> <p> - Optional configuration (only needed for automatic indexing) + Optional configuration (only needed if the indexThread property-bean + is specified, for automatic indexing) <ul> <li> <b> @@ -226,19 +233,19 @@ </subsection> - <subsection name="HttpARCResourceStore"> + <subsection name="Http11ResourceStore"> <p> - This implementation allows the wayback - application to access documents in remote ARC files via HTTP 1.1, - and scales to millions of ARC files. + This implementation allows the wayback application to access + documents in remote ARC/WARC files via HTTP 1.1, and scales to + millions of ARC/WARC files. </p> <p> - The XML configuration template for an HttpARCResourceStore follows: + The XML configuration template for an Http11ResourceStore follows: <pre> -<property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.HttpARCResourceStore"> - <property name="urlPrefix" value="http://localhost:8080/arcproxy/" /> +<property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.Http11ResourceStore"> + <property name="urlPrefix" value="http://localhost:8080/arcproxy/" /> </bean> </property> @@ -251,8 +258,8 @@ <b> urlPrefix </b> - this is the http:// prefix where ARC files are exported with an - ArcProxy installation. See elsewhere in this document for + this is the http:// prefix where ARC/WARC files are exported with + an ArcProxy installation. See elsewhere in this document for information about setting up an ArcProxy. 
</li> </ul> @@ -346,10 +353,11 @@ This implementation is good for larger scale installations, bounded mostly by the size of the index you can (first create, and later) store on a single machine. Using the command line tool - <b>arc-indexer</b>, and the standard UNIX <b>sort</b> tool - (see note below on LC_ALL), you create a sorted flat text file - that is searched on each request. Building these sorted files, - and updating the index are manual operations presently. + <b>arc-indexer</b> or <b>warc-indexer</b>, and the standard UNIX + <b>sort</b> tool (see note below on LC_ALL), you create a sorted + flat text file that is searched on each request. Building these + sorted files, and updating the index are manual operations + presently. <pre> <bean id="cdxsearchresultsource" class="org.archive.wayback.resourceindex.cdx.CDXIndex"> @@ -460,8 +468,8 @@ also provide different levels of access to a collection. For example, users within a particular subnet may be able to access all documents within a collection via one AccessPoint, but users outside that subnet - may only be restricted to viewing documents currently allowed by a - web sites current robots.txt file. + may be restricted to viewing documents allowed by a web sites current + robots.txt file. </p> <p> The XML configuration template for an AccessPoint follows: @@ -735,23 +743,27 @@ HTML documents returned in Archival URL Replay mode are modified from the original version to provide a replay experience more consistent to viewing the original - content. This is accomplished by the insertion of - Javascript, which executes in the client browser after - the page has loaded. This Javascript modifies most URLs - within the HTML page, both Anchors (links) as well as - embedded content (images, applets, etc) so that they - become appropriate Archival URL requests back to the Wayback - application. + content. This is accomplished by one of two methods. 
The first + includes modification of a subset of the HTML tags on the server, + combined with the insertion of JavaScript into the HTML page. This + JavaScript executes in the client browser after the page has loaded, + and modifies the remaining URLs within the HTML page, both + Anchors (links) as well as embedded content (images, applets, etc) + so that they become appropriate Archival URL requests back to the + Wayback application. The second method involves rewriting all HTML + tags within the page on the server, to make embedded URLs point back + into the Wayback application. </div> <br></br> <div> - This Javascript is imperfect: sometimes requests - "leak" to the live web temporarily, before the - Javascript has executed. Also, not all URLs are - rewritten correctly, especially URLs that are created - by Javascript that was in the original page, and - specialized file types containing links like Flash and - PDF documents. + There is a trade-off between these two approaches. The entirely + server-side rewriting requires more server resources, and is less + tested than the JavaScript method. The JavaScript is also imperfect: + sometimes requests "leak" to the live web temporarily, before the + Javascript has executed. With both methods, not all URLs are + rewritten correctly, especially URLs that are created by JavaScript + that was in the original page, and specialized file types containing + links like Flash and PDF documents. 
</div> <br></br> <div> @@ -854,13 +866,11 @@ <property name="replay"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlReplayDispatcher"> - <property name="jsInserts"> - <list> - <value>http://wayback.somehost.org:8080/wb-webapp/wm.js</value> - </list> - </property> + <property name="serverSideRendering" value="false" /> <property name="jspInserts"> <list> + <value>/replay/ArchiveComment.jsp</value> + <value>/replay/ClientSideJSInsert.jsp</value> <value>/replay/Timeline.jsp</value> </list> </property> @@ -897,16 +907,20 @@ </tr> <tr> <td> - jsInserts + serverSideRendering </td> <td> required </td> <td> - This list must include a reference to the wm.js javascript file, - but references to additional javascript files here will result in - a reference to those javascript URLs within all replayed HTML - pages. + When set to true, all URL rewriting occurs on the server, + eliminating the need for client side Javascript rewriting. If this + option is set to false, then the <i>ClientSideJSInsert.jsp</i> + <b>jspInsert</b> should be used. If this option is true, and + you're attempting to set up an entirely JavaScript free + installation which includes an embedded Timeline in replayed + HTML documents, you can use the <i>JSLessTimeline.jsp</i> + <b>jspInsert</b>. </td> </tr> <tr> @@ -917,12 +931,27 @@ optional </td> <td> - If any values are referenced here, then those .jsp files will be + If any values are included here, then those .jsp files will be invoked for every replayed document, and the resulting output will be included in replayed HTML pages. The example included - here will result in a Timeline banner in-page presence being - included with each replayed HTML page, allowing navigation - between different versions of the current URL. + here will result in: + <ul> + <li> + An HTML comment embedded inside replayed web pages indicating + the dates the document was captured and the date it was served + by wayback. 
+ </li> + <li> + A reference to a javascript file, client-rewrite.js, which + will attempt to modify URLs within the users browser to make + them direct back into wayback. + </li> + <li> + A timeline banner embedded in the top of HTML pages that + allows navigation between other versions of the currently + viewed document. + </li> + </ul> </td> </tr> <tr> @@ -962,6 +991,12 @@ </td> </tr> </table> + <p> + Note that the old <b>jsInserts</b> configuration has been deprecated, + in favor of including references to JavaScript files using jspInserts. + Also note that the use of the ClientSideJSInsert.jsp is required when + serverSideRendering is set to false. + </p> </subsection> <subsection name="Proxy"> @@ -973,8 +1008,6 @@ documents from the live web, the Wayback Machine will retrieve documents from the local repository of ARC files. </p> - <br></br> - <br></br> <p> Proxy Replay mode does not suffer from the shortcomings of the inserted Javascript that the Archival URL mode uses, @@ -984,8 +1017,6 @@ client browser to the Wayback Machine - no date information is sent with the request. </p> - <br></br> - <br></br> <p> In Proxy Replay mode, the Wayback Machine will return the most recent version captured of any requested page. This @@ -996,15 +1027,10 @@ here </a>. </p> - <br></br> - <br></br> <p> Thanks Oskar! </p> - - <br></br> - <br></br> - <div> + <p> The following is an example Proxy Replay Access Point definition. It assumes to be running on a host <b>wayback.somehost.org</b>, that a Tomcat Connector has been added for port <b>8090</b>, @@ -1036,16 +1062,14 @@ </bean> </pre> - </div> - <br></br> - <br></br> - <div> + </p> + <p> <b>redirectURI</b> is required, and must be set to the name of the host where the Wayback application is running. If this is not the primary name of the machine running the Wayback application, then you may need to also specify the hostname used for the Wayback application in the <b>localhostNames</b> configuration list. 
- </div> + </p> </subsection> </section> @@ -1181,7 +1205,7 @@ <pre> -UIResults results = UIResults.getFromRequest(request); +UIQueryResults results = (UIQueryResults) UIResults.getFromRequest(request); String instString = results.getContextConfig("inst"); String logoString = results.getContextConfig("logo"); @@ -1199,15 +1223,8 @@ <p> All the command line tools can be found which can be found underneath the directory where you unpacked your distribution - at:<b>bin/*</b> (example: <i>bin/location-client</i>). You will - need to change permissions on the tools to allow them to be - executed: + at:<b>bin/*</b> (example: <i>bin/location-client</i>). </p> - <p> - <code> - chmod a+x bin/* - </code> - </p> <subsection name="bdb-client"> <p> @@ -1219,10 +1236,10 @@ <code> bin/bdb-client -r BDB_DIR BDB_NAME [PREFIX] </code> - <p> + <div> Output records from a BDB database on STDOUT. - </p> - <p> + </div> + <div> where: <ul> <li> @@ -1241,17 +1258,17 @@ order. </li> </ul> - </p> + </div> </li> <li> <code> bin/bdb-client -w BDB_DIR BDB_NAME </code> - <p> + <div> Read CDX format lines from STDIN, and insert into a BDB, creating the BDB if needed. - </p> - <p> + </div> + <div> where: <ul> <li> @@ -1262,7 +1279,7 @@ <i>BDB_NAME</i> Open BDB with this name. </li> </ul> - </p> + </div> </li> </ol> </p> @@ -1284,27 +1301,35 @@ output. </li> <li> - <i>FILE [FILE2 ...]</i> Sequentially search through - each file specified, outputting the lines prefixed - with KEY for each file. Note that the complete - output of bin-search will be sorted when used with - a single file, but when multiple files are searched, - the results may not be sorted completely. + <i>FILE [FILE2 ...]</i> Search through all files specified, + outputting the lines prefixed with KEY from each file in a single, + sorted stream. This assumes that all FILE arguments are sorted. 
</li> </ul> </p> </subsection> - <subsection name="arc-indexer"> + <subsection name="arc-indexer|warc-indexer"> <p> - This tool creates a CDX format index for the ARC file at ARC_PATH, - either on STDOUT, or at the path specified by CDX_PATH. The resulting - file can be sorted and merged with other CDX format index files to - generate CDX format ResourceIndex. - <code> - bin/arc-indexer ARC_PATH [CDX_PATH] - </code> + These tools create a CDX format index for the ARC/WARC file at + PATH, either on STDOUT, or at the path specified by CDX_PATH. The + resulting file can be sorted and merged with other CDX format index + files to generate CDX format ResourceIndex. </p> + <pre> + bin/arc-indexer [-identity] PATH [CDX_PATH] + bin/warc-indexer [-identity] PATH [CDX_PATH] + </pre> + <p> + Note that when manually constructing CDX files using these tools, you + <b>must</b> set the environment variable <b>LC_ALL=C</b> when using + the standard UNIX <b>sort</b> command line tool. + </p> + <p> + The <b>-identity</b> option causes the tools to skip canonicalization + of URLs. See the documentation for the <b>url-client</b> tool, and + the URL Canonicalization section below for more information. + </p> </subsection> <subsection name="location-client"> @@ -1367,7 +1392,7 @@ <subsection name="url-client"> <p> URLs stored in BDB and CDX format ResourceIndexes are - <i>canonicalized</i> to a more genertic form. Before + <i>canonicalized</i> to a more generic form. Before performing a lookup operation on the ResourceIndex, the same canonicalization function is applied to requested URLs. This tool will read space(" ") delimited lines from STDIN, and @@ -1380,20 +1405,25 @@ This tool is mostly useful for debugging the canonicalization function, but can also be used, if the canonicalization function is altered, to update an existing - CDX index, without recreating CDX files from original ARCs. + CDX index, without recreating CDX files from original ARCs. 
See the + section URL Canonicalization for more information. </p> <p> <code> - bin/url-client [-cdx] [-f FIELD] + bin/url-client [-cdx] [-d DELIMITER] [-f FIELD] [-f FIELD2] ... </code> <ul> <li> - <i>-cdx</i> Pass thru lines prefixed with " CDX " - unchanged. + <i>-cdx</i> Pass thru lines prefixed with " CDX " unchanged. </li> <li> + <i>-d DELIMITER</i> Use DELIMITER to separate fields instead + of default Space(" "). + </li> + <li> <i>-f FIELD</i> alter column FIELD of each line, - instead of the default column 1. + instead of the default column 1. If specified multiple times, then + each column will be canonicalized in transformed lines. </li> </ul> </p> @@ -1455,6 +1485,165 @@ </pre> </section> + <section name="URL Canonicalization"> + <subsection name="Introduction and Concepts"> + <p> + Sometimes URLs found in the field can have multiple forms, for + example: + <pre> + http://www.example.com/img/foo.gif + http://www.example.com/docs/../img/foo.gif + </pre> + are both valid representations of the exact same URL. Another, less + certain example would be: + <pre> + http://www.example.com/Interview.html + http://www.example.com/interview.html + </pre> + which differ only in the capitalization of the letter "i". On some + operating systems, these two URLs legitimately specify two distinct + documents. On Windows platforms, they refer to the same document. If + the document on a web server is actually named "Interview.html", but + a web designer creates a web page that refers to this document using + the lowercase "interview.html", then the link will work, and they and + the web site visitors may never notice the difference. The same + situation on a different operating system would probably not work + (although some web server plugins and modules will also correct this + problem transparently) and the web designer would probably notice and + correct the problem. 
In practice, we have found that it is very rare + for the two URLs above with different capitalization to refer to + different documents, and they can be treated as equivalent in most + situations. + </p> + <p> + Another example, which occurs far more often in the real world, + involves web servers injecting a session ID inside paths to documents + hosted on that web server. These session IDs allow the web server to + track individual user's states. Here are some example URLs + demonstrating path session ID injection: + <pre> + http://www.example.com/(S(4hqa0555fwsecu455xqckv45))/page1.aspx + http://www.example.com/(S(4hqa0555fwsecu455xqckv45))/page2.aspx + http://www.example.com/(S(a63098d96360a63098d96360))/page3.aspx + </pre> + In these examples, the first two URLs are using one session ID, and + the third uses a different session ID. If <b>page3.aspx</b> refers to + <b>page1.aspx</b> using an anchor like this: + <pre> + <a href="page1.aspx">page1</a> + </pre> + and a user visiting <b>page3.aspx</b> clicks the link to page1, then + the wayback will receive a request for the URL: + <pre> + http://www.example.com/(S(a63098d96360a63098d96360))/page1.aspx + </pre> + If page1.aspx was captured using the different session ID, then the + wayback will be unable to locate this document in the index, even + though it was captured. + </p> + <p> + This session ID problem can be mitigated by <i>canonicalizing</i> the + URLs as they are placed in the index, so the index would contain the + following URLs, instead of the original form, which the crawler + captured: + <pre> + http://www.example.com/page1.aspx + http://www.example.com/page2.aspx + http://www.example.com/page3.aspx + </pre> + If the same canonicalization scheme is used to transform incoming + requests, before attempting to lookup URLs in the index, then the + software is able to locate and return the documents correctly. 
+ </p> + </subsection> + <subsection name="Current Status within Wayback"> + <p> + Currently the Wayback includes only a single reference implementation + of a canonicalization scheme, which is currently called + <b>AggressiveUrlCanonicalizer</b>. This implementation provides the + following canonicalization: + <ul> + <li> + <b>www# removal</b> + http://www.example.com => example.com, + http://www13.example.com => example.com + </li> + <li> + <b>user info removal</b> + http://us...@ex... => example.com, + http://user:pas...@ex... => example.com, + </li> + <li> + <b>session ID removal</b> + http://www.example.com/(S(a63098d96360a63098d96360))/page1.aspx + => + example.com/page1.aspx + <br></br> + <i>(and other common session ID path injection schemes)</i> + </li> + <li> + <b>path and CGI argument lowercasing</b> + http://www.example.com/Interviews.cgi?Interview=Left + => + example.com/interviews.cgi?interview=left + </li> + <li> + <b>extra query argument delimiter removal</b> + http://www.example.com/Interviews.cgi?Interview=Left& + => + example.com/interviews.cgi?interview=left + </li> + <li> + <b>unneeded query specifier removal</b> + http://www.example.com/Interviews.cgi? + => + example.com/interviews.cgi + </li> + </ul> + These heuristics generally lead to correcting many common URL lookup + problems, but in some cases, these operations do the wrong thing, + typically by making content which is actually different appear to be + the same thing. + </p> + <p> + At the IA, we have recently switched to building CDX files using the + <b>-identity</b> option on the <b>arc-indexer</b> and + <b>warc-indexer</b> tools, and have added an additional step in our + CDX creation processes which uses the <b>url-client</b> tool before + sorting and merging CDX files. By keeping the original "identity" CDX + files, we have been able to test various URL canonicalization + strategies without the overhead of re-processing all the source + materials. 
+ </p> + </subsection> + <subsection name="Future Directions within Wayback"> + <p> + In upcoming wayback releases, we intend to provide more + canonicalization implementations, including a configurable + implementation that will allow broad customization capabilities. + </p> + <p> + We also intend to alter the format of wayback indexes significantly. + Using this new format will be optional, but once indexes are created + in the new format, other indexes with different + canonicalization strategies can be built from them without requiring + a complete reindex of the original ARC/WARC content. + </p> + <p> + The new format will also allow a degree of dynamic canonicalization + at run-time, meaning different strategies can be tested using the + same indexes, and site-specific canonicalization strategies may be + possible. + </p> + <p> + We believe that allowing (advanced) users to easily change between + canonicalization strategies within the same wayback session will + promote better community understanding of the impacts of different + strategies, and will enable the community to build a set of best + practices for URL canonicalization. + </p> + </subsection> + </section> </body> </document> Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml 2008-02-06 01:13:42 UTC (rev 2174) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml 2008-02-06 02:00:42 UTC (rev 2175) @@ -8,7 +8,86 @@ </properties> <body> + <section name="Introduction"> + <p><b>wayback</b> is an open source java implementation of the + <a href="http://www.archive.org/web/web.php">The Internet Archive + Wayback Machine</a>. + </p> + <p> + The current production version of the Wayback Machine is implemented in + perl, and lacks in maintainability and extensibility. Also, the code is + not open source. 
Primary motivation for the new version is to address + these three issues, enabling public distribution of the application, and + easy experimentation with new features and access technologies. + </p> + <p> + The current Java version of the Wayback Machine supports three access, + or Replay modes of operation: "Archival Url" mode "Proxy" mode, and + "Domain Prefix" mode. + </p> + <p> + Archival URL mode provides a user experience very close to the current + production Wayback Machine. All query and replay access requests can be + expressed as URLs. In Archival Url replay mode, archived content is + modified as it is returned to users, attempting to make links and + embedded content refer back to the Wayback Machine by rewriting them as + Archival URLs. + </p> + <p> + Proxy URL mode allows replaying of archived documents within a client + browser by configuring the browser to proxy all HTTP requests through + the Wayback Machine. This has the strong advantage that no Javascript + or server side page markup is required to coerce the client browser to + request additional URLs and embedded content from the Wayback Machine + -- content just works as-is. When used with the Firefox plugin + extension, available + <a href="http://archive-access.sourceforge.net/projects/waxtoolbar/"> + here + </a>, client browsers can navigate between versions of the current + document, and the Wayback Machine server will attempt to display images + from the same time period as pages being viewed. The Proxy URL mode + requires special configuration of the client web browser to access the + Wayback Service. This browser configuration is not complex, but it + means that content cannot be accessed as a global URL. + </p> + <p>See the <a href="administrator_manual.html">Administrator Manual</a> + to learn more about access modes. 
+ </p> + <p> + The current Java version can operate in several deployment modes, + ranging from a stand alone application on a single host holding all + archived documents and indexes, up to a highly distributed system where + indexes and archived content is spread across hundreds of machines. + </p> + <p> + In the local, standalone mode, this software includes the capability to + scan for new archived content in a specified location, and to + automatically index and serve the new content as it appears. Directing + the Wayback to look for ARC files in the directory where an instance of + the Heritrix web crawler is writing ARC output should provide the + capability to browse content archived by Heritrix as it is crawled. + </p> + </section> <section name="News"> + <subsection name="New Release - 1.2.0, 1/30/2008"> + <p> + Release 1.2.0 has several new features, as well as several + bug-fixes. Wayback now supports compressed and uncompressed + ARC and WARC formats. Previously there was only support for + compressed ARC files. This version also includes a new Archival URL + replay mechanism, where all URL rewriting occurs on the server, + obviating the need for client-side Javascript, and preventing + some request leakage. This version also includes the capability to + replace the default URL canonicalization scheme(currently there is + still only one implementation available, but the groundwork for + using different schemes is now in place.) This version also + includes support for de-duplicated WARC records. + </p> + <p> + Please see the <a href="release_notes.html">Release Notes</a> for + specific features and bug fixes. + </p> + </subsection> <subsection name="New Release - 1.0.0, 10/12/2007"> <p> Release 1.0.0 has several significant changes, most notably a @@ -124,83 +203,10 @@ </p> </subsection> <subsection name="First Release - 0.2.0, 12/09/2005"> - <p>First public release of the open source wayback. 
- See below in the <a href="#Introduction">Introduction</a> - section for a listing of initial features. + <p> + First public release of the open source wayback. </p> </subsection> </section> - <section name="Introduction"> - <p><b>wayback</b> is an open source java implementation of the - <a href="http://www.archive.org/web/web.php">The Internet Archive - Wayback Machine</a>. - </p> - <p> - The current production version of the Wayback Machine is implemented in - perl, and lacks in maintainability and extensibility. Also, the code is - not open source. Primary motivation for the new version is to address - these three issues, enabling public distribution of the application, and - easy experimentation with new features and access technologies. - </p> - <p> - The current Java version of the Wayback Machine supports two access, or - replay modes of operation: "Archival Url" mode and "Proxy" mode. - </p> - <p> - Archival URL mode provides a user experience very close to the current - production Wayback Machine. All query and replay access requests can be - expressed as URLs. In Archival Url replay mode, HTML documents are - delivered with additional Javascript embedded in the page. This - Javascript alters the document within the browser, attempting to make - links and embedded content refer back to the Wayback Machine by - rewriting them as Archival URLs. - </p> - <p> - Proxy URL mode allows replaying of archived documents within a client - browser by configuring the browser to proxy all HTTP requests through - the Wayback Machine. This has the strong advantage that no Javascript - page markup is required to coerce the client browser to request - additional URLs and embedded content from the Wayback Machine -- content - just works as-is. 
When used with the Firefox plugin extension, available - <a href="http://archive-access.sourceforge.net/projects/waxtoolbar/"> - here - </a>, client browsers can navigate between versions of the current - document, and the Wayback Machine server will attempt to display images - from the same time period as pages being viewed. The Proxy URL mode - requires special configuration of the client web browser to access the - Wayback Service. This browser configuration is not complex, but it - means that content cannot be accessed as a global URL. - </p> - <p> - Timeline Mode allows for navigation between different dates collected - of the current page, similar to the WERA application, using framesets. - </p> - <p>See the <a href="user_manual.html">User Manual</a> to learn more - about access modes. - </p> - <p> - The current Java version is intended to operate as a standalone webapp, - maintaining an index on the machine hosting the webapp. This index - contains records of the resources within a set of ARC files, which are - also assumed to be stored on the same machine hosting the webapp. - </p> - <p> - This software includes the capability to scan for ARC files in a - specified location, and to automatically index and serve content in - newly discovered ARC files as they appear. Directing the Wayback - Machine to look for ARC files in the directory where an instance of the - Heritrix web crawler is writing ARC output should provide the - capability to browse content archived by Heritrix as it is crawled. - </p> - <p> - The 0.4.0 version includes the capability to retrieve documents from ARC - files stored on remote hosts using HTTP 1.1. Please see the User Manual - for more information about using this and other new features. - </p> - <p> - Future versions of this software may integrate more tightly with the - Heritrix web crawler application. 
- </p> - </section> </body> </document> Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/navigation.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/navigation.xml 2008-02-06 01:13:42 UTC (rev 2174) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/navigation.xml 2008-02-06 02:00:42 UTC (rev 2175) @@ -5,7 +5,7 @@ <properties> <title>Wayback</title> <author email="brad at archive dot org">Brad Tofel</author> - <revision>$Id$</revision> + <revision>$Id:navigation.xml 2009 2007-09-28 00:09:04Z bradtofel $</revision> </properties> <body> @@ -14,6 +14,7 @@ <item name="Requirements" href="requirements.html"/> <item name="Downloads" href="downloads.html"/> <item name="User Manual" href="user_manual.html"/> + <item name="Release Notes" href="release_notes.html"/> <item name="Test" href="test.html"/> <item name="FAQ" href="/faq.html"/> <item name="API" href="./apidocs"/> Added: trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml 2008-02-06 02:00:42 UTC (rev 2175) @@ -0,0 +1,123 @@ +<?xml version="1.0" encoding="ISO-8859-1"?> + +<document> + <properties> + <title>Release Notes</title> + <author email="brad at archive dot org">Brad Tofel</author> + <revision>$Id: index.xml 2040 2007-10-12 23:21:40Z bradtofel $</revision> + </properties> + + <body> + <section name="Releases"> + <p> + Full listing of changes and bug fixes are not currently available prior + to release 1.2.0. + </p> + </section> + <section name="Release 1.2.0"> + <subsection name="Features"> + <ul> + <li> + now supports compressed and uncompressed ARC and WARC files. 
+ </li> + <li> + initial revision of "deduplicated" WARC record handling, which + returns the last version that was actually stored when + subsequent captures are not saved because they have not changed. + </li> + <li> + now filters (literal) duplicate records from the ResourceIndex, + in case the same capture (url + date) appears twice, or in two + CDX files. + </li> + <li> + UrlCanonicalizer is now pluggable, current functionality is now + implemented in AggressiveUrlCanonicalizer. Added + IdentityUrlCanonicalizer, which performs no canonicalization. + </li> + <li> + <b>bin-search</b> command line tool now outputs a single stream of + sorted results from multiple files, instead of returning matches + from each file sequentially. + </li> + <li> + extracted several replay features into separate jspInserts that + can now be mixed and matched. + </li> + <li> + now handles most text/css URL rewriting, both inside HTML pages, + and in externally linked .css files. + </li> + <li> + externalized comment embedded inside replayed HTML pages into + jspInsert: ArchiveComment.jsp. + </li> + <li> + non-javascript Archival URL replay mode, where all URL rewriting + occurs on the server. This includes a non-javascript + Timeline jspInsert. + </li> + <li> + added two-month timeline partition. + </li> + <li> + root page of webapp now lists access points, when users make + a request that does not specify one. Also, now access point + "slash-pages" are available "without the slash". + </li> + </ul> + </subsection> + <subsection name="Bug Fixes"> + <ul> + <li> + Now rewrite Location and Content-Base HTTP headers in non-HTML + Archival URL replayed documents. + </li> + <li> + now rewrites all <b>background</b> attributes found in returned + pages (archival URL mode only) instead of just on BODY tags. + </li> + <li> + now rewrites <b>src</b> attributes on INPUT tags. 
+ </li> + <li> + command line tools now allow whitespace arguments, important for + tools accepting delimiter arguments. + </li> + <li> + replay URLs in query results now include non-standard ports, if + needed. + </li> + <li> + Timezone is now explicitly set to GMT/UTC, fixing a Calendar + result partitioning problem. + </li> + <li> + uncaught character-encoding exceptions now handled, plus + slightly improved detection of correct character encoding by + removing internal whitespace in declared encoding names. + </li> + <li> + archival URL parsing of query end-date now assumes latest + possible date given a partial end-date, instead of earliest + possible date. + </li> + <li> + re-implemented lost "closest" indicator for XML results. + </li> + <li> + now supports multiple auto index threads, one per ResourceStore, + and also multiple auto index merge threads, one per BDB + ResourceIndex. + </li> + <li> + fixed hard-coded maximum year issue. + </li> + <li> + reimplemented NotInArchive logging, which was lost in 1.0.0. + </li> + </ul> + </subsection> + </section> + </body> +</document> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-06 01:13:38
|
Revision: 2174 http://archive-access.svn.sourceforge.net/archive-access/?rev=2174&view=rev Author: bradtofel Date: 2008-02-05 17:13:42 -0800 (Tue, 05 Feb 2008) Log Message: ----------- BUGFIX: now calculates current(which translates to max) year on startup, which still will require a restart on New Years, but at least won't require more code changes.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Timestamp.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/core/TimestampTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Timestamp.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Timestamp.java 2008-02-04 22:58:24 UTC (rev 2173) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Timestamp.java 2008-02-06 01:13:42 UTC (rev 2174) @@ -43,7 +43,8 @@ private final static String LOWER_TIMESTAMP_LIMIT = "10000000000000"; private final static String UPPER_TIMESTAMP_LIMIT = "29991939295959"; private final static String YEAR_LOWER_LIMIT = "1996"; - private final static String YEAR_UPPER_LIMIT = "2008"; + private final static String YEAR_UPPER_LIMIT = + String.valueOf(Calendar.getInstance().get(Calendar.YEAR)); private final static String MONTH_LOWER_LIMIT = "01"; private final static String MONTH_UPPER_LIMIT = "12"; private final static String DAY_LOWER_LIMIT = "01"; Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/core/TimestampTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/core/TimestampTest.java 2008-02-04 22:58:24 UTC (rev 2173) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/core/TimestampTest.java 2008-02-06 01:13:42 UTC (rev 2174) @@ -24,6 +24,8 @@ */ package org.archive.wayback.core; +import java.util.Calendar; + import junit.framework.TestCase; import org.archive.wayback.core.Timestamp; @@ -40,11 +42,13 @@ */ public void testPadDateStr() { + String curYear = String.valueOf(Calendar.getInstance().get(Calendar.YEAR)); + assertEquals("padStart '1'","19960101000000",Timestamp.padStartDateStr("1")); assertEquals("padEnd '1'","19991231235959",Timestamp.padEndDateStr("1")); assertEquals("padStart '2'","20000101000000",Timestamp.padStartDateStr("2")); - assertEquals("padEnd","20081231235959",Timestamp.padEndDateStr("2")); - assertEquals("padEnd","20081231235959",Timestamp.padEndDateStr("3")); + assertEquals("padEnd",curYear + "1231235959",Timestamp.padEndDateStr("2")); + assertEquals("padEnd",curYear + "1231235959",Timestamp.padEndDateStr("3")); assertEquals("padEnd","20061231235959",Timestamp.padEndDateStr("2006")); assertEquals("padEnd","20061231235959",Timestamp.padEndDateStr("200613")); assertEquals("padEnd","20071231235959",Timestamp.padEndDateStr("2007")); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2173 http://archive-access.svn.sourceforge.net/archive-access/?rev=2173&view=rev Author: bradtofel Date: 2008-02-04 14:58:24 -0800 (Mon, 04 Feb 2008) Log Message: ----------- BUGFIX: (ACC-10) now origHost contains the port number, if non-standard. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2008-02-02 00:58:43 UTC (rev 2172) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXLineToSearchResultAdapter.java 2008-02-04 22:58:24 UTC (rev 2173) @@ -72,6 +72,9 @@ try { UURI uri = UURIFactory.getInstance( WaybackConstants.HTTP_URL_PREFIX + url); + if(uri.getPort() != -1) { + origHost += ":" + uri.getPort(); + } origUrl = origHost + uri.getEscapedPathQuery(); } catch (URIException e) { // TODO Stifle? throw an error? This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2172 http://archive-access.svn.sourceforge.net/archive-access/?rev=2172&view=rev Author: bradtofel Date: 2008-02-01 16:58:43 -0800 (Fri, 01 Feb 2008) Log Message: ----------- BUGFIX: searching for specific data within an URL was causing some deduped records to be lost.. need to just start at the beginning for now.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-02-02 00:57:46 UTC (rev 2171) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-02-02 00:58:43 UTC (rev 2172) @@ -340,7 +340,11 @@ filters.addFilter(exclusion); } filters.addFilter(finalCounter); - startKey = keyUrl + " " + startDate; + // OPTIMIZ: beginning the search at the startDate causes problems + // with deduplicated results. We need to be smarter about rolling + // backwards a ways if we start on a deduped record. +// startKey = keyUrl + " " + startDate; + startKey = keyUrl + " "; // add the start and end windowing filters: filters.addFilter(new WindowStartFilter(startResult)); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-02 00:57:44
|
Revision: 2171 http://archive-access.svn.sourceforge.net/archive-access/?rev=2171&view=rev Author: bradtofel Date: 2008-02-01 16:57:46 -0800 (Fri, 01 Feb 2008) Log Message: ----------- BUGFIX: was not catching IllegalCharsetName exception.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-02-01 23:53:57 UTC (rev 2170) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-02-02 00:57:46 UTC (rev 2171) @@ -29,6 +29,7 @@ import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.text.ParseException; import java.util.Map; @@ -88,11 +89,21 @@ this.uriConverter = uriConverter; } + private boolean isCharsetSupported(String charsetName) { + // can you believe that this throws a runtime? Just asking if it's + // supported!!?! They coulda just said "no"... + try { + return Charset.isSupported(charsetName); + } catch(IllegalCharsetNameException e) { + return false; + } + } + private String contentTypeToCharset(final String contentType) { int offset = contentType.indexOf(CHARSET_TOKEN); if (offset != -1) { String cs = contentType.substring(offset + CHARSET_TOKEN.length()); - if(Charset.isSupported(cs)) { + if(isCharsetSupported(cs)) { return cs; } // test for extra spaces... there's at least one page out there that @@ -101,7 +112,7 @@ // <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1"> // bad web page! 
- if(Charset.isSupported(cs.replace(" ", ""))) { + if(isCharsetSupported(cs.replace(" ", ""))) { return cs.replace(" ", ""); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-01 23:53:53
|
Revision: 2170 http://archive-access.svn.sourceforge.net/archive-access/?rev=2170&view=rev Author: bradtofel Date: 2008-02-01 15:53:57 -0800 (Fri, 01 Feb 2008) Log Message: ----------- OPTIMIZ: two major optimizations, now holds URL to run regexs against in a StringBuilder, to reduce String Object construction overhead, and we now do a String compare against a "chooser" string before bothering to test the RegEx against the URLs. BUGFIX: fixed a couple of session ID stripper RegExes that were broken. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2008-02-01 19:34:06 UTC (rev 2169) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2008-02-01 23:53:57 UTC (rev 2170) @@ -51,29 +51,40 @@ * Strip leading 'www.' */ private static final Pattern STRIP_WWW_REGEX = - Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$"); + Pattern.compile("(?i)^(?:https?://)(www[0-9]*\\.)(?:[^/]*/.+)$"); + private static final String STRIP_WWW_CHOOSER = "/www"; +// /** +// * Strip leading 'www44.', 'www3.', etc. +// */ +// private static final Pattern STRIP_WWWN_REGEX = +// Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$"); /** - * Strip leading 'www44.', 'www3.', etc. - */ - private static final Pattern STRIP_WWWN_REGEX = - Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$"); - /** * Strip userinfo. 
*/ private static final Pattern STRIP_USERINFO_REGEX = - Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$", + Pattern.compile("^(?:(?:(?:https?)|(?:ftps?))://)([^/]+@)(?:.*)$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_USERINFO_CHOOSER = "@"; /** - * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A. * Example: PHPSESSID=9682993c8daa2c5497996114facdc805. */ - private static final Pattern STRIP_SESSION_ID_REGEX = - Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" + - "[0-9a-zA-Z]{32})(?:&(.*))?$", + private static final Pattern STRIP_PHPSESSION_ID_REGEX = + Pattern.compile("^(?:.+)(phpsessid=" + + "[0-9a-zA-Z]{32}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_PHPSESSION_ID_CHOOSER = "phpsessid="; + /** + * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A. + */ + private static final Pattern STRIP_JSESSION_ID_REGEX = + Pattern.compile("^.*(jsessionid=[0-9a-zA-Z]{32}&?).*$", + Pattern.CASE_INSENSITIVE); + private static final String STRIP_JSESSION_ID_CHOOSER = "jsessionid="; + + /** * Example: sid=9682993c8daa2c5497996114facdc805. * 'sid=' can be tricky but all sid= followed by 32 byte string * so far seen have been session ids. Sid is a 32 byte string @@ -81,16 +92,18 @@ * so have to have it run after the phpsessid elimination. */ private static final Pattern STRIP_SID_REGEX = - Pattern.compile("^(.+)" + - "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE); + Pattern.compile("^(?:.+)" + + "(sid=[0-9a-zA-Z]{32}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_SID_CHOOSER = "sid="; /** * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM. 
*/ private static final Pattern STRIP_ASPSESSION_REGEX = - Pattern.compile("^(.+)" + - "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", + Pattern.compile("^(?:.+)" + + "(ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_ASPSESSION_CHOOSER = "aspsessionid"; /** * Examples: @@ -108,10 +121,10 @@ * */ private static final Pattern STRIP_ASPSESSION2_REGEX = - Pattern.compile("^([^\\?]+/)" + - "(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$", + Pattern.compile(".*/(\\([0-9a-z]{24}\\)/)(?:[^\\?]+\\.aspx.*)$", Pattern.CASE_INSENSITIVE); - + private static final String STRIP_ASPSESSION2_CHOOSER = ".aspx"; + /** * Examples: * @@ -123,12 +136,10 @@ * http://msdn2.microsoft.com/en-us/library/aa479315.aspx * */ - private static final Pattern STRIP_ASPSESSION3_REGEX = - Pattern.compile("^([^\\?]+/" + - "\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" + - "((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$", + Pattern.compile(".*/(\\((?:[a-z]\\([0-9a-z]{24}\\))+\\)/)[^\\?]+\\.aspx.*$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_ASPSESSION3_CHOOSER = ".aspx"; /** * Strip ColdFusion session IDs. 
Remove sessionids that look like the @@ -137,36 +148,52 @@ * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A */ private static final Pattern STRIP_CFSESSION_REGEX = - Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" + - "(?:&(.*))?$",Pattern.CASE_INSENSITIVE); + Pattern.compile(".+(cfid=[^&]+&cftoken=[^&]+(?:&jsessionid=[^&]+)?&?).*$", + Pattern.CASE_INSENSITIVE); + private static final String STRIP_CFSESSION_CHOOSER = "cftoken="; + + private static final String choosers[] = { + STRIP_USERINFO_CHOOSER, + STRIP_WWW_CHOOSER, + STRIP_PHPSESSION_ID_CHOOSER, + STRIP_JSESSION_ID_CHOOSER, + STRIP_ASPSESSION_CHOOSER, + STRIP_ASPSESSION2_CHOOSER, + STRIP_ASPSESSION3_CHOOSER, + STRIP_SID_CHOOSER, + STRIP_CFSESSION_CHOOSER + }; + private static final Pattern strippers[] = { + STRIP_USERINFO_REGEX, + STRIP_WWW_REGEX, + STRIP_PHPSESSION_ID_REGEX, + STRIP_JSESSION_ID_REGEX, + STRIP_ASPSESSION_REGEX, + STRIP_ASPSESSION2_REGEX, + STRIP_ASPSESSION3_REGEX, + STRIP_SID_REGEX, + STRIP_CFSESSION_REGEX + }; + /** - * Run a regex that strips elements of a string. + * Run a regex against a StringBuilder, removing group 1 if it matches. * * Assumes the regex has a form that wants to strip elements of the passed - * string. Assumes that if a match, appending group 1 - * and group 2 yields desired result. + * string. Assumes that if a match, group 1 should be removed * @param url Url to search in. - * @param matcher Matcher whose form yields a group 1 and group 2 if a - * match (non-null. - * @return Original <code>url</code> else concatenization of group 1 - * and group 2. + * @param matcher Matcher whose form yields a group to remove + * @return true if the StringBuilder was modified */ - protected String doStripRegexMatch(String url, Matcher matcher) { - return (matcher != null && matcher.matches())? 
- checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)): - url; + protected boolean doStripRegexMatch(StringBuilder url, Matcher matcher) { + if(matcher != null && matcher.matches()) { + url.delete(matcher.start(1), matcher.end(1)); + return true; + } + return false; } /** - * @param string String to check. - * @return <code>string</code> if non-null, else empty string (""). - */ - private String checkForNull(String string) { - return (string != null)? string: ""; - } - - /** * return the canonical string key for the URL argument. * * @param urlString @@ -175,6 +202,9 @@ */ public String urlStringToKey(final String urlString) throws URIException { + if(urlString.startsWith("dns:")) { + return urlString; + } String searchUrl = canonicalize(urlString); // TODO: force https into http for the moment... @@ -195,20 +225,20 @@ searchUrl = "http://" + searchUrl; } - // unescape anythying that can be: + // TODO: These next few lines look crazy -- need to be reworked.. This + // was the only easy way I could find to get the correct unescaping + // out of UURIs, possible a bug. Definitely needs some TLC in any case, + // as building UURIs is *not* a cheap operation. + + // unescape anything that can be: UURI tmpURI = UURIFactory.getInstance(searchUrl); tmpURI.setPath(tmpURI.getPath()); - - // convert to UURI to perform require URI fixup: + // convert to UURI to perform required URI fixup: UURI searchURI = UURIFactory.getInstance(tmpURI.getURI()); - - - // replace ' ' with '+' (this is only to match Alexa's canonicalization) String newPath = searchURI.getEscapedPath().replace("%20","+"); -// String newPath = searchURI.getPath().replace(' ','+'); // replace multiple consecutive '/'s in the path. while(newPath.contains("//")) { @@ -241,12 +271,10 @@ if(searchURI.getEscapedQuery() != null) { sb.append("?").append(searchURI.getEscapedQuery()); } - return sb.toString(); } - /** * Idempotent operation that will determine the 'fuzziest' * form of the url argument. 
This operation is done prior to adding records @@ -259,19 +287,23 @@ * @return canonicalized version of url argument. */ public String canonicalize(String url) { - url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url)); - url = url.toLowerCase(); + if (url == null || url.length() <= 0) { return url; } + + // hang on, we're about to get aggressive: + url = url.toLowerCase(); + StringBuilder sb = new StringBuilder(url); + boolean changed = false; + for(int i=0; i<choosers.length; i++) { + if(sb.indexOf(choosers[i]) != -1) { + changed |= doStripRegexMatch(sb,strippers[i].matcher(sb)); + } + } + if(changed) { + url = sb.toString(); + } int index = url.lastIndexOf('?'); if (index > 0) { @@ -285,8 +317,8 @@ url = url.substring(0, url.length() - 2); } else { // The '&' is redundant. Strip it. 
- url = url.substring(0, index + 1) + - url.substring(index + 2); + url = url.substring(0, index + 1) + + url.substring(index + 2); } } else if (url.charAt(url.length() - 1) == '&') { // If we have a lone '&' on end of query str, Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2008-02-01 19:34:06 UTC (rev 2169) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2008-02-01 23:53:57 UTC (rev 2170) @@ -144,7 +144,7 @@ String sid3 = "sid=9682993c8daa2c5497996114facdc805"; String sid4 = "ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM"; String sid5 = "CFID=12412453&CFTOKEN=15501799"; - //String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A"; + String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A"; String fore = "http://foo.com/bar?bo=lo&"; String aft = "&gum=yum"; @@ -158,7 +158,7 @@ checkCanonicalization(fore + sid3 + aft,want); checkCanonicalization(fore + sid4 + aft,want); checkCanonicalization(fore + sid5 + aft,want); - //checkCanonicalization(fore + sid6 + aft,want); + checkCanonicalization(fore + sid6 + aft,want); // Check ASP_SESSIONID2: checkCanonicalization( @@ -173,7 +173,7 @@ // Check ASP_SESSIONID3: checkCanonicalization( "http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules", - "legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules"); + "legislature.mi.gov/mileg.aspx?page=sessionschedules"); // strip port 80 checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo"); This was sent by the 
SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-01 19:34:18
|
Revision: 2169 http://archive-access.svn.sourceforge.net/archive-access/?rev=2169&view=rev Author: bradtofel Date: 2008-02-01 11:34:06 -0800 (Fri, 01 Feb 2008) Log Message: ----------- TWEAK: upped archive-commons version to 2.0.1 Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/pom.xml Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2008-02-01 01:59:59 UTC (rev 2168) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2008-02-01 19:34:06 UTC (rev 2169) @@ -57,7 +57,7 @@ <dependency> <groupId>org.archive.heritrix</groupId> <artifactId>commons</artifactId> - <version>2.0.0-SNAPSHOT</version> + <version>2.0.1-SNAPSHOT</version> </dependency> <dependency> <groupId>org.mozilla</groupId> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-01 01:59:56
|
Revision: 2168 http://archive-access.svn.sourceforge.net/archive-access/?rev=2168&view=rev Author: bradtofel Date: 2008-01-31 17:59:59 -0800 (Thu, 31 Jan 2008) Log Message: ----------- BUGFIX: need to pass arguments to Java via sh's '"$@"' not '$@'. Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer trunk/archive-access/projects/wayback/dist/src/scripts/bdb-client trunk/archive-access/projects/wayback/dist/src/scripts/bin-search trunk/archive-access/projects/wayback/dist/src/scripts/create-test-arc trunk/archive-access/projects/wayback/dist/src/scripts/location-client trunk/archive-access/projects/wayback/dist/src/scripts/url-client trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer Modified: trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer 2008-02-01 01:44:53 UTC (rev 2167) +++ trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer 2008-02-01 01:59:59 UTC (rev 2168) @@ -78,5 +78,5 @@ CLASS_MAIN='org.archive.wayback.resourcestore.ArcIndexer' fi -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN $@ +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" Modified: trunk/archive-access/projects/wayback/dist/src/scripts/bdb-client =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/bdb-client 2008-02-01 01:44:53 UTC (rev 2167) +++ trunk/archive-access/projects/wayback/dist/src/scripts/bdb-client 2008-02-01 01:59:59 UTC (rev 2168) @@ -78,5 +78,5 @@ CLASS_MAIN='org.archive.wayback.resourceindex.bdb.BDBIndex' fi -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN $@ +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" Modified: trunk/archive-access/projects/wayback/dist/src/scripts/bin-search =================================================================== --- 
trunk/archive-access/projects/wayback/dist/src/scripts/bin-search 2008-02-01 01:44:53 UTC (rev 2167) +++ trunk/archive-access/projects/wayback/dist/src/scripts/bin-search 2008-02-01 01:59:59 UTC (rev 2168) @@ -78,5 +78,5 @@ CLASS_MAIN='org.archive.wayback.util.flatfile.FlatFile' fi -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN $@ +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" Modified: trunk/archive-access/projects/wayback/dist/src/scripts/create-test-arc =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/create-test-arc 2008-02-01 01:44:53 UTC (rev 2167) +++ trunk/archive-access/projects/wayback/dist/src/scripts/create-test-arc 2008-02-01 01:59:59 UTC (rev 2168) @@ -78,5 +78,5 @@ CLASS_MAIN='org.archive.wayback.util.ARCCreator' fi -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN $@ +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" Modified: trunk/archive-access/projects/wayback/dist/src/scripts/location-client =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/location-client 2008-02-01 01:44:53 UTC (rev 2167) +++ trunk/archive-access/projects/wayback/dist/src/scripts/location-client 2008-02-01 01:59:59 UTC (rev 2168) @@ -78,5 +78,5 @@ CLASS_MAIN='org.archive.wayback.resourcestore.http.FileLocationDBClient' fi -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN $@ +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" Modified: trunk/archive-access/projects/wayback/dist/src/scripts/url-client =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/url-client 2008-02-01 01:44:53 UTC (rev 2167) +++ trunk/archive-access/projects/wayback/dist/src/scripts/url-client 2008-02-01 01:59:59 UTC (rev 2168) @@ -78,5 +78,5 @@ CLASS_MAIN='org.archive.wayback.util.url.AggressiveUrlCanonicalizer' fi -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} 
$CLASS_MAIN $@ +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" Modified: trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer 2008-02-01 01:44:53 UTC (rev 2167) +++ trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer 2008-02-01 01:59:59 UTC (rev 2168) @@ -78,5 +78,5 @@ CLASS_MAIN='org.archive.wayback.resourcestore.WarcIndexer' fi -CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN $@ +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-01 01:44:50
|
Revision: 2167 http://archive-access.svn.sourceforge.net/archive-access/?rev=2167&view=rev Author: bradtofel Date: 2008-01-31 17:44:53 -0800 (Thu, 31 Jan 2008) Log Message: ----------- FEATURE: now merges results from multiple search files, so output is sorted. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2008-02-01 00:10:36 UTC (rev 2166) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2008-02-01 01:44:53 UTC (rev 2167) @@ -30,9 +30,11 @@ import java.io.IOException; import java.io.PrintWriter; import java.io.RandomAccessFile; +import java.util.Comparator; import java.util.Iterator; import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.CompositeSortedIterator; /** * Subclass of File, which allows binary searching, returning Iterators @@ -199,25 +201,36 @@ USAGE(); } String prefix = args[0]; - for(int i=1; i < args.length; i++) { - FlatFile ff = new FlatFile(args[i]); - RecordIterator ri; - try { - ri = (RecordIterator) ff.getRecordIterator(prefix); - while(ri.hasNext()) { - String line = (String) ri.next(); - if(!line.startsWith(prefix)) { - break; + CloseableIterator<String> itr; + try { + if(args.length == 2) { + FlatFile ff = new FlatFile(args[1]); + itr = (RecordIterator) ff.getRecordIterator(prefix); + } else { + Comparator<String> comp = new Comparator<String>() { + public int compare(String o1, String o2) { + return o1.compareTo(o2); } - if(args.length > 2) { - System.out.println(args[i] + " " + line); - } else { - System.out.println(line); - } + }; + 
CompositeSortedIterator<String> csi = + new CompositeSortedIterator<String>(comp); + RecordIterator fitr; + for(int i=1; i < args.length; i++) { + FlatFile ff = new FlatFile(args[i]); + fitr = (RecordIterator) ff.getRecordIterator(prefix); + csi.addComponent(fitr); } - } catch (IOException e) { - e.printStackTrace(); + itr = csi; } + while(itr.hasNext()) { + String line = (String) itr.next(); + if(!line.startsWith(prefix)) { + break; + } + System.out.println(line); + } + } catch (IOException e) { + e.printStackTrace(); } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-01 00:10:32
|
Revision: 2166 http://archive-access.svn.sourceforge.net/archive-access/?rev=2166&view=rev Author: bradtofel Date: 2008-01-31 16:10:36 -0800 (Thu, 31 Jan 2008) Log Message: ----------- Done after the fact -- failed to create branch at 0.8.0 release... Added Paths: ----------- branches/wayback-0_8_0/wayback/ Copied: branches/wayback-0_8_0/wayback (from rev 1427, trunk/archive-access/projects/wayback) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-01 00:07:52
|
Revision: 2165 http://archive-access.svn.sourceforge.net/archive-access/?rev=2165&view=rev Author: bradtofel Date: 2008-01-31 16:07:52 -0800 (Thu, 31 Jan 2008) Log Message: ----------- Added Paths: ----------- wayback/ Copied: wayback (from rev 1427, trunk/archive-access/projects/wayback) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-01 00:06:58
|
Revision: 2164 http://archive-access.svn.sourceforge.net/archive-access/?rev=2164&view=rev Author: bradtofel Date: 2008-01-31 16:07:01 -0800 (Thu, 31 Jan 2008) Log Message: ----------- failed to add this branch when we made the 0.8.0 version. Added Paths: ----------- branches/wayback-0_8_0/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-31 01:26:36
|
Revision: 2163 http://archive-access.svn.sourceforge.net/archive-access/?rev=2163&view=rev Author: bradtofel Date: 2008-01-30 17:26:40 -0800 (Wed, 30 Jan 2008) Log Message: ----------- FEATURE: now rewrites all background="..." tag attributes on the server side... Should only be legal in TABLE, TD, TD, and then only in the microsoft world, but probably simpler to just replace them all.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-31 01:08:52 UTC (rev 2162) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-31 01:26:40 UTC (rev 2163) @@ -237,7 +237,8 @@ {"FRAME","SRC"}, {"META","URL"}, {"LINK","HREF"}, - {"SCRIPT","SRC"} + {"SCRIPT","SRC"}, + {TagMagix.ANY_TAGNAME,"background"} }; // TODO: The classic WM added a js_ to the datespec, so NotInArchives // can return an valid javascript doc, and not cause Javascript errors. @@ -280,7 +281,7 @@ {"APPLET","ARCHIVE"}, {"EMBED","SRC"}, {"IFRAME","SRC"}, - {"BODY","BACKGROUND"}, + {TagMagix.ANY_TAGNAME,"background"} }; for(String tagAttr[] : markups) { TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-31 01:08:51
|
Revision: 2162 http://archive-access.svn.sourceforge.net/archive-access/?rev=2162&view=rev Author: bradtofel Date: 2008-01-30 17:08:52 -0800 (Wed, 30 Jan 2008) Log Message: ----------- TWEAK: made ANY_TAGNAME public. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-01-31 01:07:58 UTC (rev 2161) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-01-31 01:08:52 UTC (rev 2162) @@ -59,7 +59,7 @@ private static String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)"; - private static String ANY_TAGNAME = "[a-z]+"; + public static String ANY_TAGNAME = "[a-z]+"; private static String STYLE_ATTR_NAME = "style"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-31 01:07:56
|
Revision: 2161 http://archive-access.svn.sourceforge.net/archive-access/?rev=2161&view=rev Author: bradtofel Date: 2008-01-30 17:07:58 -0800 (Wed, 30 Jan 2008) Log Message: ----------- BUGFIX: check that urls have not already been rewritten(on the server, for example) before rewriting. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js 2008-01-31 00:30:37 UTC (rev 2160) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js 2008-01-31 01:07:58 UTC (rev 2161) @@ -17,10 +17,12 @@ var wmSpecial = aCollection[i].getAttribute("wmSpecial"); if(wmSpecial && wmSpecial.length > 0) { } else { - if(aCollection[i][sProp].indexOf("http") == 0) { - aCollection[i][sProp] = sWayBackCGI + aCollection[i][sProp]; - } else { - aCollection[i][sProp] = sWayBackCGI + xResolveUrl(aCollection[i][sProp]); + if(aCollection[i][sProp].indexOf(sWayBackCGI) == -1) { + if(aCollection[i][sProp].indexOf("http") == 0) { + aCollection[i][sProp] = sWayBackCGI + aCollection[i][sProp]; + } else { + aCollection[i][sProp] = sWayBackCGI + xResolveUrl(aCollection[i][sProp]); + } } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-31 00:30:33
|
Revision: 2160 http://archive-access.svn.sourceforge.net/archive-access/?rev=2160&view=rev Author: bradtofel Date: 2008-01-30 16:30:37 -0800 (Wed, 30 Jan 2008) Log Message: ----------- FEATURE: firstly, we test that a charset is supported before returning it as a viable charset to encode/decode. Secondly, we now attempt to replace internal spaces within a charset declaration... there's at least one lame webpage out there that has "charset=i so-8859-1"... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-30 03:26:36 UTC (rev 2159) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-31 00:30:37 UTC (rev 2160) @@ -28,6 +28,7 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; import java.text.ParseException; import java.util.Map; @@ -86,11 +87,23 @@ this.result = result; this.uriConverter = uriConverter; } - + private String contentTypeToCharset(final String contentType) { int offset = contentType.indexOf(CHARSET_TOKEN); if (offset != -1) { - return contentType.substring(offset + CHARSET_TOKEN.length()); + String cs = contentType.substring(offset + CHARSET_TOKEN.length()); + if(Charset.isSupported(cs)) { + return cs; + } + // test for extra spaces... there's at least one page out there that + // indicates it's charset with: + +// <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1"> + + // bad web page! 
+ if(Charset.isSupported(cs.replace(" ", ""))) { + return cs.replace(" ", ""); + } } return null; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-30 04:13:54
|
Revision: 2159 http://archive-access.svn.sourceforge.net/archive-access/?rev=2159&view=rev Author: bradtofel Date: 2008-01-29 19:26:36 -0800 (Tue, 29 Jan 2008) Log Message: ----------- Text: added some notes about what's included in 1.2. Modified Paths: -------------- trunk/archive-access/projects/wayback/plan.txt Modified: trunk/archive-access/projects/wayback/plan.txt =================================================================== --- trunk/archive-access/projects/wayback/plan.txt 2008-01-30 03:07:20 UTC (rev 2158) +++ trunk/archive-access/projects/wayback/plan.txt 2008-01-30 03:26:36 UTC (rev 2159) @@ -30,7 +30,19 @@ - automatically maintained distributed index - basis for production Wayback Machine replacement - internationalization support - + +1.2 (Jan 2008) + - changed LocalArcResourceStore => LocalResourceStore + * supports compressed/uncompressed ARC + WARC format + * requires property arcDir => dataDir + * major refactoring of Replay system, allowing mixing and matching of now- + separable replay elements + * server-side URL rewriting in archival URL mode for JavaScript-less client + replay browsing + * numerous bug-fixes + * pluggable URL canonicalization module + * initial support for de-duplicated WARC records + == BRAINSTORMING == Error-handling: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-30 03:07:14
|
Revision: 2158 http://archive-access.svn.sourceforge.net/archive-access/?rev=2158&view=rev Author: bradtofel Date: 2008-01-29 19:07:20 -0800 (Tue, 29 Jan 2008) Log Message: ----------- Updated to reflect new ResourceStore implementation class names, and to use new Replay jspInserts, including some example jsp inserts. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-01-30 03:05:36 UTC (rev 2157) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-01-30 03:07:20 UTC (rev 2158) @@ -71,16 +71,22 @@ <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.LocalARCResourceStore" + <bean class="org.archive.wayback.resourcestore.LocalResourceStore" init-method="init"> - <property name="arcDir" value="/tmp/wayback/arcs/" /> - <property name="queuedDir" value="/tmp/wayback/arc-indexer/queued" /> - <property name="workDir" value="/tmp/wayback/arc-indexer/work" /> - <property name="runInterval" value="10000" /> - <property name="indexClient"> - <bean class="org.archive.wayback.resourceindex.indexer.IndexClient"> - <property name="tmpDir" value="/tmp/wayback/arc-indexer/tmp" /> - <property name="target" value="/tmp/wayback/index-data/incoming" /> + + <property name="dataDir" value="/tmp/wayback/arcs/" /> + + <property name="indexThread"> + <bean class="org.archive.wayback.resourcestore.AutoIndexThread"> + <property name="queuedDir" value="/tmp/wayback/arc-indexer/queued" /> + <property name="workDir" value="/tmp/wayback/arc-indexer/work" /> + <property name="runInterval" value="10000" /> + 
<property name="indexClient"> + <bean class="org.archive.wayback.resourceindex.indexer.IndexClient"> + <property name="tmpDir" value="/tmp/wayback/arc-indexer/tmp" /> + <property name="target" value="/tmp/wayback/index-data/incoming" /> + </bean> + </property> </bean> </property> </bean> @@ -116,9 +122,9 @@ <bean id="localcdxcollection" class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.LocalARCResourceStore" + <bean class="org.archive.wayback.resourcestore.LocalResourceStore" init-method="init"> - <property name="arcDir" value="/tmp/wayback/arcs/" /> + <property name="dataDir" value="/tmp/wayback/arcs/" /> </bean> </property> @@ -147,7 +153,7 @@ <bean id="remotecollection" class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.HttpARCResourceStore"> + <bean class="org.archive.wayback.resourcestore.Http11ResourceStore"> <property name="urlPrefix" value="http://localhost:8080/arcproxy/" /> </bean> </property> @@ -175,26 +181,47 @@ <property name="collection" ref="localbdbcollection" /> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost:8080/wayback/" /> + </bean> + </property> + <property name="query"> <bean class="org.archive.wayback.query.Renderer"> <property name="captureJsp" value="/jsp/HTMLResults.jsp" /> </bean> </property> - <property name="replay"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlReplayDispatcher"> - <property name="jsInserts"> - <list> - <value>http://localhost:8080/wm.js</value> - </list> - </property> - <property name="jspInserts"> - <list> - <value>/replay/Timeline.jsp</value> - </list> - </property> - </bean> - </property> + <property name="replay"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlReplayDispatcher"> + <property 
name="serverSideRendering" value="false" /> + <property name="jspInserts"> + <list> + <value>/replay/ArchiveComment.jsp</value> + <value>/replay/ClientSideJSInsert.jsp</value> +<!-- + The following 2 .jsp include values will produce in-page elements within + replayed HTML pages. Both require client-side Javascript. +--> +<!-- + <value>/replay/Disclaimer.jsp</value> + <value>/replay/Timeline.jsp</value> +--> +<!-- + The following .jsp include value will produce a timeline within *all* replayed + pages, including all subframes within a frameset, but requires no client side + Javascript. It is intended for use in deployments which use: + + serverSideRendering=true +--> +<!-- + <value>/replay/JSLessTimeline.jsp</value> +--> + </list> + </property> + </bean> + </property> <property name="parser"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" @@ -203,12 +230,6 @@ <property name="earliestTimestamp" value="1996" /> </bean> </property> - - <property name="uriConverter"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost:8080/wayback/" /> - </bean> - </property> </bean> <!-- This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-30 03:05:54
|
Revision: 2157 http://archive-access.svn.sourceforge.net/archive-access/?rev=2157&view=rev Author: bradtofel Date: 2008-01-29 19:05:36 -0800 (Tue, 29 Jan 2008) Log Message: ----------- FEATURE: now rewrites INPUT.src urls. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js 2008-01-30 03:03:55 UTC (rev 2156) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js 2008-01-30 03:05:36 UTC (rev 2157) @@ -37,6 +37,7 @@ xLateUrl(document.getElementsByTagName("APPLET"),"archive"); xLateUrl(document.getElementsByTagName("EMBED"),"src"); xLateUrl(document.getElementsByTagName("IFRAME"),"src"); +xLateUrl(document.getElementsByTagName("INPUT"),"src"); xLateUrl(document.getElementsByTagName("BODY"),"background"); var forms = document.getElementsByTagName("FORM"); if (forms) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-30 03:03:55
|
Revision: 2156 http://archive-access.svn.sourceforge.net/archive-access/?rev=2156&view=rev Author: bradtofel Date: 2008-01-29 19:03:55 -0800 (Tue, 29 Jan 2008) Log Message: ----------- added tests for text/css URL rewriting. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java 2008-01-30 03:01:18 UTC (rev 2155) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/TagMagixTest.java 2008-01-30 03:03:55 UTC (rev 2156) @@ -281,7 +281,127 @@ } + + public void testCSSMarkup() { + // basic, with quot apos + raw: + checkCSSMarkup("@import url(http://foo.com/f.css);", + "@import url(http://web.archive.org/wayback/2004/http://foo.com/f.css);", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + checkCSSMarkup("@import url('http://foo.com/f.css');", + "@import url('http://web.archive.org/wayback/2004/http://foo.com/f.css');", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + checkCSSMarkup("@import url(\"http://foo.com/f.css\");", + "@import url(\"http://web.archive.org/wayback/2004/http://foo.com/f.css\");", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + + + // same as basic, but with extra whitespace after "url" + checkCSSMarkup("@import url (http://foo.com/f.css);", + "@import url (http://web.archive.org/wayback/2004/http://foo.com/f.css);", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + checkCSSMarkup("@import url\t('http://foo.com/f.css');", + "@import url\t('http://web.archive.org/wayback/2004/http://foo.com/f.css');", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + 
checkCSSMarkup("@import url\n(\"http://foo.com/f.css\");", + "@import url\n(\"http://web.archive.org/wayback/2004/http://foo.com/f.css\");", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + + // whitespace within url spec: + checkCSSMarkup("@import url( http://foo.com/f.css);", + "@import url( http://web.archive.org/wayback/2004/http://foo.com/f.css);", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + checkCSSMarkup("@import url('http://foo.com/f.css' );", + "@import url('http://web.archive.org/wayback/2004/http://foo.com/f.css' );", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + checkCSSMarkup("@import url( \"http://foo.com/f.css\" );", + "@import url( \"http://web.archive.org/wayback/2004/http://foo.com/f.css\" );", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + checkCSSMarkup("@import url(\t\"http://foo.com/f.css\"\t);", + "@import url(\t\"http://web.archive.org/wayback/2004/http://foo.com/f.css\"\t);", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + checkCSSMarkup("@import url(\n\"http://foo.com/f.css\"\n);", + "@import url(\n\"http://web.archive.org/wayback/2004/http://foo.com/f.css\"\n);", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + checkCSSMarkup("@import url(\r\n\"http://foo.com/f.css\"\n\r);", + "@import url(\r\n\"http://web.archive.org/wayback/2004/http://foo.com/f.css\"\n\r);", + "http://web.archive.org/wayback/","2004","http://foo.com/"); + + + + } + + public void testStyleUrlMarkup() { + // simple, server relative + checkStyleUrlMarkup("<table style=\"background: url(/css/b.gif)\"></table>", + "<table style=\"background: url(http://w.a.org/wb/2004/http://f.au/css/b.gif)\"></table>", + "http://w.a.org/wb/","2004","http://f.au/"); + // server-relative, which now means something + checkStyleUrlMarkup("<table style=\"background: url(/css/b.gif)\"></table>", + "<table style=\"background: url(http://w.a.org/wb/2004/http://f.au/css/b.gif)\"></table>", + 
"http://w.a.org/wb/","2004","http://f.au/b/"); + + // path relative: + checkStyleUrlMarkup("<table style=\"background: url(css/b.gif)\"></table>", + "<table style=\"background: url(http://w.a.org/wb/2004/http://f.au/css/b.gif)\"></table>", + "http://w.a.org/wb/","2004","http://f.au/"); + // path relative, meaningful: + checkStyleUrlMarkup("<table style=\"background: url(css/b.gif)\"></table>", + "<table style=\"background: url(http://w.a.org/wb/2004/http://f.au/b/css/b.gif)\"></table>", + "http://w.a.org/wb/","2004","http://f.au/b/"); + + // absolute: + checkStyleUrlMarkup("<table style=\"background: url(http://e.au/css/b.gif)\"></table>", + "<table style=\"background: url(http://w.a.org/wb/2004/http://e.au/css/b.gif)\"></table>", + "http://w.a.org/wb/","2004","http://f.au/b/"); + + // apos attribute + checkStyleUrlMarkup("<table style='background: url(/css/b.gif)'></table>", + "<table style='background: url(http://w.a.org/wb/2004/http://f.au/css/b.gif)'></table>", + "http://w.a.org/wb/","2004","http://f.au/"); + + // quote attribute, apos url: + checkStyleUrlMarkup("<table style=\"background: url('/css/b.gif')\"></table>", + "<table style=\"background: url('http://w.a.org/wb/2004/http://f.au/css/b.gif')\"></table>", + "http://w.a.org/wb/","2004","http://f.au/"); + + // apos attribute, quote url: + checkStyleUrlMarkup("<table style='background: url(\"/css/b.gif\")'></table>", + "<table style='background: url(\"http://w.a.org/wb/2004/http://f.au/css/b.gif\")'></table>", + "http://w.a.org/wb/","2004","http://f.au/"); + + // apos attribute, quote url, plus semi-colon: + checkStyleUrlMarkup("<table style='background: url(\"/css/b.gif\");'></table>", + "<table style='background: url(\"http://w.a.org/wb/2004/http://f.au/css/b.gif\");'></table>", + "http://w.a.org/wb/","2004","http://f.au/"); + + // Two url()s in same attribute value: + checkStyleUrlMarkup("<table style=\"bg: url(/css/b.gif); fg: url(/css/f.gif);\"></table>", + "<table style=\"bg: 
url(http://w.a.org/wb/2004/http://f.au/css/b.gif); fg: url(http://w.a.org/wb/2004/http://f.au/css/f.gif);\"></table>", + "http://w.a.org/wb/","2004","http://f.au/"); + + // Two url()s in same quoted attribute value, with embedded apos: + checkStyleUrlMarkup("<table style=\"bg: url('/css/b.gif'); fg: url('/css/f.gif');\"></table>", + "<table style=\"bg: url('http://w.a.org/wb/2004/http://f.au/css/b.gif'); fg: url('http://w.a.org/wb/2004/http://f.au/css/f.gif');\"></table>", + "http://w.a.org/wb/","2004","http://f.au/"); + + // Two url()s in same apos'ed attribute value, with embedded quote: + checkStyleUrlMarkup("<table style='bg: url(\"/css/b.gif\"); fg: url(\"/css/f.gif\");'></table>", + "<table style='bg: url(\"http://w.a.org/wb/2004/http://f.au/css/b.gif\"); fg: url(\"http://w.a.org/wb/2004/http://f.au/css/f.gif\");'></table>", + "http://w.a.org/wb/","2004","http://f.au/"); +// +// NOT WORKING YET... Let's see if we get a complaint... Not even sure this +// is legit HTML: +// +// // Two url()s in same quoted attribute value, with embedded escaped quote: +// checkStyleUrlMarkup("<table style=\"bg: url(\\\"/css/b.gif\\\"); fg: url(\\\"/css/f.gif\\\");\"></table>", +// "<table style=\"bg: url(\\\"http://w.a.org/wb/2004/http://f.au/css/b.gif\\\"); fg: url(\\\"http://w.a.org/wb/2004/http://f.au/css/f.gif\\\");\"></table>", +// "http://w.a.org/wb/","2004","http://f.au/"); + + + } + + private void checkAttrValue(String page, String tag, String attr, String wantValue) { StringBuilder sb = new StringBuilder(page); @@ -299,11 +419,26 @@ } } + private void checkCSSMarkup(String orig, String want,String prefix, String ts, String url) { + StringBuilder buf = new StringBuilder(orig); + ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter(); + uriC.setReplayURIPrefix(prefix); + TagMagix.markupCSSImports(buf, uriC, ts, url); + String marked = buf.toString(); + assertEquals(want,marked); + } + + private void checkStyleUrlMarkup(String orig, String want, String 
prefix, String ts, String url) { + StringBuilder buf = new StringBuilder(orig); + ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter(); + uriC.setReplayURIPrefix(prefix); + TagMagix.markupStyleUrls(buf,uriC,ts,url); + String marked = buf.toString(); + assertEquals(want,marked); + } + private void checkMarkup(String orig, String want, String tag, String attr, String prefix, String ts, String url) { StringBuilder buf = new StringBuilder(orig); -// if(url.startsWith("http://")) { -// url = url.substring(7); -// } ArchivalUrlResultURIConverter uriC = new ArchivalUrlResultURIConverter(); uriC.setReplayURIPrefix(prefix); TagMagix.markupTagREURIC(buf,uriC,ts,url,tag,attr); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |