From: <bra...@us...> - 2007-12-12 02:15:05
|
Revision: 2109 http://archive-access.svn.sourceforge.net/archive-access/?rev=2109&view=rev Author: bradtofel Date: 2007-12-11 18:15:09 -0800 (Tue, 11 Dec 2007) Log Message: ----------- FEATURE: added static getEndOfFirstTag() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2007-12-12 02:12:48 UTC (rev 2108) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2007-12-12 02:15:09 UTC (rev 2109) @@ -283,4 +283,14 @@ public static String getBaseHref(StringBuilder page) { return getTagAttr(page, "BASE", "HREF"); } + + public static int getEndOfFirstTag(StringBuilder page, String tag) { + Pattern tagPattern = getWholeTagPattern(tag); + Matcher tagMatcher = tagPattern.matcher(page); + int offset = -1; + if(tagMatcher.find()) { + offset = tagMatcher.end(); + } + return offset; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-15 02:57:24
|
Revision: 2135 http://archive-access.svn.sourceforge.net/archive-access/?rev=2135&view=rev Author: bradtofel Date: 2008-01-14 18:57:30 -0800 (Mon, 14 Jan 2008) Log Message: ----------- REFACTOR: moved resolveUrl() from UrlCanonicalizer to UrlOperations Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-01-15 02:30:01 UTC (rev 2134) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-01-15 02:57:30 UTC (rev 2135) @@ -29,7 +29,7 @@ import java.util.regex.Pattern; import org.archive.wayback.ResultURIConverter; -import org.archive.wayback.util.UrlCanonicalizer; +import org.archive.wayback.util.url.UrlOperations; /** * Library for updating arbitrary attributes in arbitrary tags to rewrite HTML @@ -171,7 +171,7 @@ quote = "\\\""; url = url.substring(2, url.length() - 2); } - String finalUrl = UrlCanonicalizer.resolveUrl(baseUrl,url); + String finalUrl = UrlOperations.resolveUrl(baseUrl,url); String replayUrl = quote + uriConverter.makeReplayURI(captureDate, finalUrl) + quote; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-30 02:58:49
|
Revision: 2152 http://archive-access.svn.sourceforge.net/archive-access/?rev=2152&view=rev Author: bradtofel Date: 2008-01-29 18:58:00 -0800 (Tue, 29 Jan 2008) Log Message: ----------- FEATURE: added functionality for rewriting CSS @import url() and style="... url();" parts of .css and .html pages. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-01-30 01:56:39 UTC (rev 2151) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-01-30 02:58:00 UTC (rev 2152) @@ -59,10 +59,27 @@ private static String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)"; + private static String ANY_TAGNAME = "[a-z]+"; + + private static String STYLE_ATTR_NAME = "style"; + private static String ANY_ATTR_VALUE = QUOTED_ATTR_VALUE + "|" + APOSED_ATTR_VALUE + "|" + ESC_QUOTED_ATTR_VALUE + "|" + RAW_ATTR_VALUE; + +// private static String cssUrlPatString = +// "url\\s*\\(\\s*(['\"]?.+?['\"]?)\\s*\\)"; + private static String cssUrlPatString = + "url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)"; + + private static String cssImportPatString = + "@import\\s+" + cssUrlPatString; + private static Pattern cssImportPattern = + Pattern.compile(cssImportPatString); + + private static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString); + /** * get (and cache) a regex Pattern for locating an HTML attribute value * within a particular tag. 
if found, the pattern will have the attribute @@ -131,6 +148,65 @@ return pc; } + public static void markupCSSImports(StringBuilder page, + ResultURIConverter uriConverter, String captureDate, + String baseUrl) { + markupTagREURIC(page,uriConverter,captureDate,baseUrl,cssImportPattern); + } + + public static void markupStyleUrls(StringBuilder page, + ResultURIConverter uriConverter, String captureDate, + String baseUrl) { + Pattern stylePattern = getPattern(ANY_TAGNAME, STYLE_ATTR_NAME); + Matcher matcher = stylePattern.matcher(page); + + int idx = 0; + while (matcher.find(idx)) { + String attrValue = matcher.group(1); + int origAttrLength = attrValue.length(); + int attrStart = matcher.start(1); + int attrEnd = matcher.end(1); + if (attrValue.charAt(0) == '"') { + attrValue = attrValue.substring(1, origAttrLength - 1); + attrStart += 1; + } else if (attrValue.charAt(0) == '\'') { + attrValue = attrValue.substring(1, origAttrLength - 1); + attrStart += 1; + } else if (attrValue.charAt(0) == '\\') { + attrValue = attrValue.substring(2, origAttrLength - 2); + attrStart += 2; + } + + idx = attrEnd; + Matcher urlMatcher = cssUrlPattern.matcher(attrValue); + int attrIdx = 0; + while(urlMatcher.find(attrIdx)) { + String url = urlMatcher.group(1); + int origUrlLength = url.length(); + int urlStart = urlMatcher.start(1); + int urlEnd = urlMatcher.end(1); + attrIdx = urlEnd; + if (url.charAt(0) == '"') { + url = url.substring(1, origUrlLength - 1); + urlStart += 1; + } else if (url.charAt(0) == '\'') { + url = url.substring(1, origUrlLength - 1); + urlStart += 1; + } else if (url.charAt(0) == '\\') { + url = url.substring(2, origUrlLength - 2); + urlStart += 2; + } + int urlLength = url.length(); + String finalUrl = UrlOperations.resolveUrl(baseUrl,url); + String replayUrl = uriConverter.makeReplayURI(captureDate, finalUrl); + int delta = replayUrl.length() - urlLength; + page.replace(attrStart + urlStart, attrStart + urlStart + urlLength , replayUrl); + idx += delta; + 
attrStart += delta; + } + } + } + /** * Alter the HTML document in page, updating URLs in the attrName attributes * of all tagName tags such that: @@ -152,8 +228,14 @@ String baseUrl, String tagName, String attrName) { Pattern tagPat = getPattern(tagName, attrName); - Matcher matcher = tagPat.matcher(page); + markupTagREURIC(page,uriConverter,captureDate,baseUrl,tagPat); + } + public static void markupTagREURIC(StringBuilder page, + ResultURIConverter uriConverter, String captureDate, + String baseUrl, Pattern pattern) { + Matcher matcher = pattern.matcher(page); + int idx = 0; while (matcher.find(idx)) { String url = matcher.group(1); @@ -163,13 +245,13 @@ String quote = ""; if (url.charAt(0) == '"') { quote = "\""; - url = url.substring(1, url.length() - 1); + url = url.substring(1, origUrlLength - 1); } else if (url.charAt(0) == '\'') { quote = "'"; - url = url.substring(1, url.length() - 1); + url = url.substring(1, origUrlLength - 1); } else if (url.charAt(0) == '\\') { quote = "\\\""; - url = url.substring(2, url.length() - 2); + url = url.substring(2, origUrlLength - 2); } String finalUrl = UrlOperations.resolveUrl(baseUrl,url); String replayUrl = quote This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-31 01:08:51
|
Revision: 2162 http://archive-access.svn.sourceforge.net/archive-access/?rev=2162&view=rev Author: bradtofel Date: 2008-01-30 17:08:52 -0800 (Wed, 30 Jan 2008) Log Message: ----------- TWEAK: made ANY_TAGNAME public. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-01-31 01:07:58 UTC (rev 2161) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-01-31 01:08:52 UTC (rev 2162) @@ -59,7 +59,7 @@ private static String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)"; - private static String ANY_TAGNAME = "[a-z]+"; + public static String ANY_TAGNAME = "[a-z]+"; private static String STYLE_ATTR_NAME = "style"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-04-11 03:58:03
|
Revision: 2229 http://archive-access.svn.sourceforge.net/archive-access/?rev=2229&view=rev Author: bradtofel Date: 2008-04-10 20:58:09 -0700 (Thu, 10 Apr 2008) Log Message: ----------- BUGFIX: ACC-17: Parsing of some style tags that were not really style tags caused a String OOB exception. Now we're checking that the regex-matched substring is long enough to consider. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-04-11 03:56:08 UTC (rev 2228) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-04-11 03:58:09 UTC (rev 2229) @@ -42,6 +42,10 @@ */ public class TagMagix { + // minimum length XXXX in a 'style=XXXX' declaration... mostly handy + // to keep us from trying to mark up javascript generated style code. + private static int MIN_STYLE_LENGTH = 3; + private static HashMap<String, Pattern> pcPatterns = new HashMap<String, Pattern>(); @@ -166,6 +170,11 @@ int origAttrLength = attrValue.length(); int attrStart = matcher.start(1); int attrEnd = matcher.end(1); + idx = attrEnd; + if(origAttrLength < MIN_STYLE_LENGTH) { + continue; + } + if (attrValue.charAt(0) == '"') { attrValue = attrValue.substring(1, origAttrLength - 1); attrStart += 1; @@ -177,7 +186,6 @@ attrStart += 2; } - idx = attrEnd; Matcher urlMatcher = cssUrlPattern.matcher(attrValue); int attrIdx = 0; while(urlMatcher.find(attrIdx)) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-07-15 01:19:52
|
Revision: 2444 http://archive-access.svn.sourceforge.net/archive-access/?rev=2444&view=rev Author: bradtofel Date: 2008-07-14 18:20:01 -0700 (Mon, 14 Jul 2008) Log Message: ----------- BUGFIX(ACC-26): string OOB exception in javascript generated escaped HTML attributes. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-07-15 01:18:56 UTC (rev 2443) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2008-07-15 01:20:01 UTC (rev 2444) @@ -42,9 +42,9 @@ */ public class TagMagix { - // minimum length XXXX in a 'style=XXXX' declaration... mostly handy - // to keep us from trying to mark up javascript generated style code. - private static int MIN_STYLE_LENGTH = 3; + // minimum length XXXX in a 'ATTR=XXXX' declaration... mostly handy + // to keep us from trying to mark up javascript generated HTML/CSS code. + private static int MIN_ATTR_LENGTH = 3; private static HashMap<String, Pattern> pcPatterns = new HashMap<String, Pattern>(); @@ -171,7 +171,7 @@ int attrStart = matcher.start(1); int attrEnd = matcher.end(1); idx = attrEnd; - if(origAttrLength < MIN_STYLE_LENGTH) { + if(origAttrLength < MIN_ATTR_LENGTH) { continue; } @@ -250,6 +250,10 @@ int origUrlLength = url.length(); int attrStart = matcher.start(1); int attrEnd = matcher.end(1); + if(origUrlLength < MIN_ATTR_LENGTH) { + idx = attrEnd; + continue; + } String quote = ""; if (url.charAt(0) == '"') { quote = "\""; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |