From: <bra...@us...> - 2007-12-12 02:19:07
|
Revision: 2110 http://archive-access.svn.sourceforge.net/archive-access/?rev=2110&view=rev Author: bradtofel Date: 2007-12-11 18:19:12 -0800 (Tue, 11 Dec 2007) Log Message: ----------- BUGFIX: (unreported) no longer rewrite mailto: and javascript: URLs in full server-side rewrite mode Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2007-12-12 02:15:09 UTC (rev 2109) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2007-12-12 02:19:12 UTC (rev 2110) @@ -214,39 +214,64 @@ String captureDate = result.getCaptureDate(); String existingBaseHref = TagMagix.getBaseHref(sb); - if (existingBaseHref != null) { + if (existingBaseHref == null) { + insertAtStartOfHead("<base href=\"" + pageUrl + "\" />"); + } else { pageUrl = existingBaseHref; } - TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, - "FRAME", "SRC"); -// TagMagix.markupTagREURIC(page, uriConverter, captureDate, pageUrl, -// "IFRAME", "SRC"); - TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, - "META", "URL"); - TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, - "LINK", "HREF"); + String markups[][] = { + {"FRAME","SRC"}, + {"META","URL"}, + {"LINK","HREF"}, + {"SCRIPT","SRC"} + }; // TODO: The classic WM added a js_ to the datespec, so NotInArchives // can return an valid javascript doc, and not cause Javascript errors. 
- TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, - "SCRIPT", "SRC"); - - if (existingBaseHref == null) { - String baseTag = "<base href=\"" + pageUrl + "\" />"; - int insertPoint = sb.indexOf("<head>"); - if (-1 == insertPoint) { - insertPoint = sb.indexOf("<HEAD>"); - } - if (-1 == insertPoint) { - insertPoint = 0; - } else { - insertPoint += 6; // just after the tag - } - sb.insert(insertPoint, baseTag); + for(String tagAttr[] : markups) { + TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, + tagAttr[0], tagAttr[1]); } } + /** + * Update all URLs inside the page, so they resolve correctly to absolute + * URLs within the Wayback service. + */ + public void resolveAllPageUrls() { + // TODO: get url from Resource instead of SearchResult? + String pageUrl = result.getAbsoluteUrl(); + String captureDate = result.getCaptureDate(); + + String existingBaseHref = TagMagix.getBaseHref(sb); + if (existingBaseHref != null) { + pageUrl = existingBaseHref; + } + ResultURIConverter ruc = new SpecialResultURIConverter(uriConverter); + + // TODO: forms...? 
+ String markups[][] = { + {"FRAME","SRC"}, + {"META","URL"}, + {"LINK","HREF"}, + {"SCRIPT","SRC"}, + {"IMG","SRC"}, + {"A","HREF"}, + {"AREA","HREF"}, + {"OBJECT","CODEBASE"}, + {"OBJECT","CDATA"}, + {"APPLET","CODEBASE"}, + {"APPLET","ARCHIVE"}, + {"EMBED","SRC"}, + {"IFRAME","SRC"}, + {"BODY","BACKGROUND"}, + }; + for(String tagAttr[] : markups) { + TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, + tagAttr[0], tagAttr[1]); + } + } /** * @param charSet * @throws IOException @@ -310,9 +335,20 @@ } os.write(b); } - + /** * @param toInsert + */ + public void insertAtStartOfHead(String toInsert) { + int insertPoint = TagMagix.getEndOfFirstTag(sb,"head"); + if (-1 == insertPoint) { + insertPoint = 0; + } + sb.insert(insertPoint,toInsert); + } + + /** + * @param toInsert */ public void insertAtEndOfBody(String toInsert) { int insertPoint = sb.lastIndexOf("</body>"); @@ -325,6 +361,16 @@ sb.insert(insertPoint,toInsert); } /** + * @param toInsert + */ + public void insertAtStartOfBody(String toInsert) { + int insertPoint = TagMagix.getEndOfFirstTag(sb,"body"); + if (-1 == insertPoint) { + insertPoint = 0; + } + sb.insert(insertPoint,toInsert); + } + /** * @param jspPath * @param httpRequest * @param httpResponse @@ -373,4 +419,22 @@ public void setCharSet(String charSet) { this.charSet = charSet; } + + private class SpecialResultURIConverter implements ResultURIConverter { + private static final String EMAIL_PROTOCOL_PREFIX = "mailto:"; + private static final String JAVASCRIPT_PROTOCOL_PREFIX = "javascript:"; + private ResultURIConverter base = null; + public SpecialResultURIConverter(ResultURIConverter base) { + this.base = base; + } + public String makeReplayURI(String datespec, String url) { + if(url.startsWith(EMAIL_PROTOCOL_PREFIX)) { + return url; + } + if(url.startsWith(JAVASCRIPT_PROTOCOL_PREFIX)) { + return url; + } + return base.makeReplayURI(datespec, url); + } + } } This was sent by the SourceForge.net collaborative development platform, the world's 
largest Open Source development site. |
From: <bra...@us...> - 2008-01-30 02:59:13
|
Revision: 2153 http://archive-access.svn.sourceforge.net/archive-access/?rev=2153&view=rev Author: bradtofel Date: 2008-01-29 18:59:01 -0800 (Tue, 29 Jan 2008) Log Message: ----------- FEATURE: now uses TagMagix css related rewrite methods. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-30 02:58:00 UTC (rev 2152) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-30 02:59:01 UTC (rev 2153) @@ -232,6 +232,8 @@ TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl, tagAttr[0], tagAttr[1]); } + TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl); + TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl); } /** @@ -271,7 +273,17 @@ TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, tagAttr[0], tagAttr[1]); } + TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl); + TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl); } + + public void resolveCSSUrls() { + // TODO: get url from Resource instead of SearchResult? + String pageUrl = result.getAbsoluteUrl(); + String captureDate = result.getCaptureDate(); + TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl); + } + /** * @param charSet * @throws IOException This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-31 00:30:33
|
Revision: 2160 http://archive-access.svn.sourceforge.net/archive-access/?rev=2160&view=rev Author: bradtofel Date: 2008-01-30 16:30:37 -0800 (Wed, 30 Jan 2008) Log Message: ----------- FEATURE: firstly, we test that a charset is supported before returning it as a viable charset to encode/decode. Secondly, we now attempt to replace internal spaces within a charset declaration... there's at least one lame webpage out there that has "charset=i so-8859-1"... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-30 03:26:36 UTC (rev 2159) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-31 00:30:37 UTC (rev 2160) @@ -28,6 +28,7 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; import java.text.ParseException; import java.util.Map; @@ -86,11 +87,23 @@ this.result = result; this.uriConverter = uriConverter; } - + private String contentTypeToCharset(final String contentType) { int offset = contentType.indexOf(CHARSET_TOKEN); if (offset != -1) { - return contentType.substring(offset + CHARSET_TOKEN.length()); + String cs = contentType.substring(offset + CHARSET_TOKEN.length()); + if(Charset.isSupported(cs)) { + return cs; + } + // test for extra spaces... there's at least one page out there that + // indicates it's charset with: + +// <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1"> + + // bad web page! 
+ if(Charset.isSupported(cs.replace(" ", ""))) { + return cs.replace(" ", ""); + } } return null; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-31 01:26:36
|
Revision: 2163 http://archive-access.svn.sourceforge.net/archive-access/?rev=2163&view=rev Author: bradtofel Date: 2008-01-30 17:26:40 -0800 (Wed, 30 Jan 2008) Log Message: ----------- FEATURE: now rewrites all background="..." tag attributes on the server side... Should only be legal in TABLE, TD, TD, and then only in the microsoft world, but probably simpler to just replace them all.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-31 01:08:52 UTC (rev 2162) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-01-31 01:26:40 UTC (rev 2163) @@ -237,7 +237,8 @@ {"FRAME","SRC"}, {"META","URL"}, {"LINK","HREF"}, - {"SCRIPT","SRC"} + {"SCRIPT","SRC"}, + {TagMagix.ANY_TAGNAME,"background"} }; // TODO: The classic WM added a js_ to the datespec, so NotInArchives // can return an valid javascript doc, and not cause Javascript errors. @@ -280,7 +281,7 @@ {"APPLET","ARCHIVE"}, {"EMBED","SRC"}, {"IFRAME","SRC"}, - {"BODY","BACKGROUND"}, + {TagMagix.ANY_TAGNAME,"background"} }; for(String tagAttr[] : markups) { TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-02-02 00:57:44
|
Revision: 2171 http://archive-access.svn.sourceforge.net/archive-access/?rev=2171&view=rev Author: bradtofel Date: 2008-02-01 16:57:46 -0800 (Fri, 01 Feb 2008) Log Message: ----------- BUGFIX: was not catching IllegalCharsetNameException. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-02-01 23:53:57 UTC (rev 2170) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-02-02 00:57:46 UTC (rev 2171) @@ -29,6 +29,7 @@ import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.text.ParseException; import java.util.Map; @@ -88,11 +89,21 @@ this.uriConverter = uriConverter; } + private boolean isCharsetSupported(String charsetName) { + // can you believe that this throws a runtime? Just asking if it's + // supported!!?! They coulda just said "no"... + try { + return Charset.isSupported(charsetName); + } catch(IllegalCharsetNameException e) { + return false; + } + } + private String contentTypeToCharset(final String contentType) { int offset = contentType.indexOf(CHARSET_TOKEN); if (offset != -1) { String cs = contentType.substring(offset + CHARSET_TOKEN.length()); - if(Charset.isSupported(cs)) { + if(isCharsetSupported(cs)) { return cs; } // test for extra spaces... there's at least one page out there that @@ -101,7 +112,7 @@ // <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1"> // bad web page! 
- if(Charset.isSupported(cs.replace(" ", ""))) { + if(isCharsetSupported(cs.replace(" ", ""))) { return cs.replace(" ", ""); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-04 00:02:04
|
Revision: 2275 http://archive-access.svn.sourceforge.net/archive-access/?rev=2275&view=rev Author: bradtofel Date: 2008-06-03 17:02:04 -0700 (Tue, 03 Jun 2008) Log Message: ----------- FEATURE: added ASX markup method, which rewrites ASX XML documents, converting mms:// to http:// as it rewrites urls.. This might even be the "right thing" to do for mms://... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-06-02 22:01:49 UTC (rev 2274) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-06-04 00:02:04 UTC (rev 2275) @@ -309,6 +309,18 @@ TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl); } + public void resolveASXRefUrls() { + + // TODO: get url from Resource instead of SearchResult? 
+ String pageUrl = result.getAbsoluteUrl(); + String captureDate = result.getCaptureDate(); + ResultURIConverter ruc = new MMSToHTTPResultURIConverter(uriConverter); + + TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, + "REF", "HREF"); + } + + /** * @param charSet * @throws IOException @@ -475,4 +487,20 @@ return base.makeReplayURI(datespec, url); } } + + private class MMSToHTTPResultURIConverter implements ResultURIConverter { + private static final String MMS_PROTOCOL_PREFIX = "mms://"; + private static final String HTTP_PROTOCOL_PREFIX = "http://"; + private ResultURIConverter base = null; + public MMSToHTTPResultURIConverter(ResultURIConverter base) { + this.base = base; + } + public String makeReplayURI(String datespec, String url) { + if(url.startsWith(MMS_PROTOCOL_PREFIX)) { + url = HTTP_PROTOCOL_PREFIX + + url.substring(MMS_PROTOCOL_PREFIX.length()); + } + return base.makeReplayURI(datespec, url); + } + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-06-24 21:09:20
|
Revision: 2301 http://archive-access.svn.sourceforge.net/archive-access/?rev=2301&view=rev Author: bradtofel Date: 2008-06-24 14:09:13 -0700 (Tue, 24 Jun 2008) Log Message: ----------- FEATURE: added stripHTML() method - which is completely untested... perhaps a placeholder. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-06-24 20:52:20 UTC (rev 2300) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-06-24 21:09:13 UTC (rev 2301) @@ -320,7 +320,11 @@ "REF", "HREF"); } - + public void stripHTML() { + String stripped = sb.toString().replaceAll("\\<.*?>",""); + sb.setLength(0); + sb.append(stripped); + } /** * @param charSet * @throws IOException This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-01 23:55:36
|
Revision: 2380 http://archive-access.svn.sourceforge.net/archive-access/?rev=2380&view=rev Author: bradtofel Date: 2008-07-01 16:55:46 -0700 (Tue, 01 Jul 2008) Log Message: ----------- REFACTOR: SearchResult => (Url|Capture)SearchResult Also now uses UIReplayResult object to forward context to .jsps Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-07-01 23:54:14 UTC (rev 2379) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java 2008-07-01 23:55:46 UTC (rev 2380) @@ -40,10 +40,10 @@ import org.archive.wayback.ResultURIConverter; import org.archive.wayback.core.Resource; -import org.archive.wayback.core.SearchResult; -import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.WaybackRequest; -import org.archive.wayback.query.UIQueryResults; +import org.archive.wayback.replay.UIReplayResult; import org.mozilla.universalchardet.UniversalDetector; /** @@ -68,7 +68,7 @@ private final static int C_BUFFER_SIZE = 4096; private Resource resource = null; - private SearchResult result = null; + private CaptureSearchResult result = null; private ResultURIConverter uriConverter = null; /** * the internal StringBuilder @@ -82,7 +82,7 @@ * @param result * @param uriConverter */ - public HTMLPage(Resource resource, SearchResult result, + public HTMLPage(Resource resource, CaptureSearchResult result, ResultURIConverter uriConverter) { this.resource = resource; this.result = result; @@ -234,8 +234,8 @@ public void 
resolvePageUrls() { // TODO: get url from Resource instead of SearchResult? - String pageUrl = result.getAbsoluteUrl(); - String captureDate = result.getCaptureDate(); + String pageUrl = result.getOriginalUrl(); + String captureDate = result.getCaptureTimestamp(); String existingBaseHref = TagMagix.getBaseHref(sb); if (existingBaseHref == null) { @@ -268,8 +268,8 @@ public void resolveAllPageUrls() { // TODO: get url from Resource instead of SearchResult? - String pageUrl = result.getAbsoluteUrl(); - String captureDate = result.getCaptureDate(); + String pageUrl = result.getOriginalUrl(); + String captureDate = result.getCaptureTimestamp(); String existingBaseHref = TagMagix.getBaseHref(sb); if (existingBaseHref != null) { @@ -304,16 +304,16 @@ public void resolveCSSUrls() { // TODO: get url from Resource instead of SearchResult? - String pageUrl = result.getAbsoluteUrl(); - String captureDate = result.getCaptureDate(); + String pageUrl = result.getOriginalUrl(); + String captureDate = result.getCaptureTimestamp(); TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl); } public void resolveASXRefUrls() { // TODO: get url from Resource instead of SearchResult? 
- String pageUrl = result.getAbsoluteUrl(); - String captureDate = result.getCaptureDate(); + String pageUrl = result.getOriginalUrl(); + String captureDate = result.getCaptureTimestamp(); ResultURIConverter ruc = new MMSToHTTPResultURIConverter(uriConverter); TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl, @@ -436,12 +436,12 @@ */ public String includeJspString(String jspPath, HttpServletRequest httpRequest, HttpServletResponse httpResponse, - WaybackRequest wbRequest, SearchResults results, SearchResult result) + WaybackRequest wbRequest, CaptureSearchResults results, + CaptureSearchResult result, Resource resource) throws ServletException, IOException { - UIQueryResults uiResults = new UIQueryResults(httpRequest, wbRequest, - results, uriConverter); - uiResults.setResult(result); + UIReplayResult uiResults = new UIReplayResult(httpRequest, wbRequest, + result, results, resource, uriConverter); StringHttpServletResponseWrapper wrappedResponse = new StringHttpServletResponseWrapper(httpResponse); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |