From: <bra...@us...> - 2008-02-01 23:53:53
|
Revision: 2170 http://archive-access.svn.sourceforge.net/archive-access/?rev=2170&view=rev Author: bradtofel Date: 2008-02-01 15:53:57 -0800 (Fri, 01 Feb 2008) Log Message: ----------- OPTIMIZ: two major optimizations, now holds URL to run regexs against in a StringBuilder, to reduce String Object construction overhead, and we now do a String compare against a "chooser" string before bothering to test the RegEx against the URLs. BUGFIX: fixed a couple of session ID stripper RegExes that were broken. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2008-02-01 19:34:06 UTC (rev 2169) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2008-02-01 23:53:57 UTC (rev 2170) @@ -51,29 +51,40 @@ * Strip leading 'www.' */ private static final Pattern STRIP_WWW_REGEX = - Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$"); + Pattern.compile("(?i)^(?:https?://)(www[0-9]*\\.)(?:[^/]*/.+)$"); + private static final String STRIP_WWW_CHOOSER = "/www"; +// /** +// * Strip leading 'www44.', 'www3.', etc. +// */ +// private static final Pattern STRIP_WWWN_REGEX = +// Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$"); /** - * Strip leading 'www44.', 'www3.', etc. - */ - private static final Pattern STRIP_WWWN_REGEX = - Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$"); - /** * Strip userinfo. */ private static final Pattern STRIP_USERINFO_REGEX = - Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$", + Pattern.compile("^(?:(?:(?:https?)|(?:ftps?))://)([^/]+@)(?:.*)$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_USERINFO_CHOOSER = "@"; /** - * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A. * Example: PHPSESSID=9682993c8daa2c5497996114facdc805. */ - private static final Pattern STRIP_SESSION_ID_REGEX = - Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" + - "[0-9a-zA-Z]{32})(?:&(.*))?$", + private static final Pattern STRIP_PHPSESSION_ID_REGEX = + Pattern.compile("^(?:.+)(phpsessid=" + + "[0-9a-zA-Z]{32}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_PHPSESSION_ID_CHOOSER = "phpsessid="; + /** + * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A. + */ + private static final Pattern STRIP_JSESSION_ID_REGEX = + Pattern.compile("^.*(jsessionid=[0-9a-zA-Z]{32}&?).*$", + Pattern.CASE_INSENSITIVE); + private static final String STRIP_JSESSION_ID_CHOOSER = "jsessionid="; + + /** * Example: sid=9682993c8daa2c5497996114facdc805. * 'sid=' can be tricky but all sid= followed by 32 byte string * so far seen have been session ids. Sid is a 32 byte string @@ -81,16 +92,18 @@ * so have to have it run after the phpsessid elimination. */ private static final Pattern STRIP_SID_REGEX = - Pattern.compile("^(.+)" + - "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE); + Pattern.compile("^(?:.+)" + + "(sid=[0-9a-zA-Z]{32}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_SID_CHOOSER = "sid="; /** * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM. */ private static final Pattern STRIP_ASPSESSION_REGEX = - Pattern.compile("^(.+)" + - "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", + Pattern.compile("^(?:.+)" + + "(ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24}&?)(?:(?:.*))?$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_ASPSESSION_CHOOSER = "aspsessionid"; /** * Examples: @@ -108,10 +121,10 @@ * */ private static final Pattern STRIP_ASPSESSION2_REGEX = - Pattern.compile("^([^\\?]+/)" + - "(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$", + Pattern.compile(".*/(\\([0-9a-z]{24}\\)/)(?:[^\\?]+\\.aspx.*)$", Pattern.CASE_INSENSITIVE); - + private static final String STRIP_ASPSESSION2_CHOOSER = ".aspx"; + /** * Examples: * @@ -123,12 +136,10 @@ * http://msdn2.microsoft.com/en-us/library/aa479315.aspx * */ - private static final Pattern STRIP_ASPSESSION3_REGEX = - Pattern.compile("^([^\\?]+/" + - "\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" + - "((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$", + Pattern.compile(".*/(\\((?:[a-z]\\([0-9a-z]{24}\\))+\\)/)[^\\?]+\\.aspx.*$", Pattern.CASE_INSENSITIVE); + private static final String STRIP_ASPSESSION3_CHOOSER = ".aspx"; /** * Strip ColdFusion session IDs. Remove sessionids that look like the @@ -137,36 +148,52 @@ * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A */ private static final Pattern STRIP_CFSESSION_REGEX = - Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" + - "(?:&(.*))?$",Pattern.CASE_INSENSITIVE); + Pattern.compile(".+(cfid=[^&]+&cftoken=[^&]+(?:&jsessionid=[^&]+)?&?).*$", + Pattern.CASE_INSENSITIVE); + private static final String STRIP_CFSESSION_CHOOSER = "cftoken="; + + private static final String choosers[] = { + STRIP_USERINFO_CHOOSER, + STRIP_WWW_CHOOSER, + STRIP_PHPSESSION_ID_CHOOSER, + STRIP_JSESSION_ID_CHOOSER, + STRIP_ASPSESSION_CHOOSER, + STRIP_ASPSESSION2_CHOOSER, + STRIP_ASPSESSION3_CHOOSER, + STRIP_SID_CHOOSER, + STRIP_CFSESSION_CHOOSER + }; + private static final Pattern strippers[] = { + STRIP_USERINFO_REGEX, + STRIP_WWW_REGEX, + STRIP_PHPSESSION_ID_REGEX, + STRIP_JSESSION_ID_REGEX, + STRIP_ASPSESSION_REGEX, + STRIP_ASPSESSION2_REGEX, + STRIP_ASPSESSION3_REGEX, + STRIP_SID_REGEX, + STRIP_CFSESSION_REGEX + }; + /** - * Run a regex that strips elements of a string. + * Run a regex against a StringBuilder, removing group 1 if it matches. * * Assumes the regex has a form that wants to strip elements of the passed - * string. Assumes that if a match, appending group 1 - * and group 2 yields desired result. + * string. Assumes that if a match, group 1 should be removed * @param url Url to search in. - * @param matcher Matcher whose form yields a group 1 and group 2 if a - * match (non-null. - * @return Original <code>url</code> else concatenization of group 1 - * and group 2. + * @param matcher Matcher whose form yields a group to remove + * @return true if the StringBuilder was modified */ - protected String doStripRegexMatch(String url, Matcher matcher) { - return (matcher != null && matcher.matches())? - checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)): - url; + protected boolean doStripRegexMatch(StringBuilder url, Matcher matcher) { + if(matcher != null && matcher.matches()) { + url.delete(matcher.start(1), matcher.end(1)); + return true; + } + return false; } /** - * @param string String to check. - * @return <code>string</code> if non-null, else empty string (""). - */ - private String checkForNull(String string) { - return (string != null)? string: ""; - } - - /** * return the canonical string key for the URL argument. * * @param urlString @@ -175,6 +202,9 @@ */ public String urlStringToKey(final String urlString) throws URIException { + if(urlString.startsWith("dns:")) { + return urlString; + } String searchUrl = canonicalize(urlString); // TODO: force https into http for the moment... @@ -195,20 +225,20 @@ searchUrl = "http://" + searchUrl; } - // unescape anythying that can be: + // TODO: These next few lines look crazy -- need to be reworked.. This + // was the only easy way I could find to get the correct unescaping + // out of UURIs, possible a bug. Definitely needs some TLC in any case, + // as building UURIs is *not* a cheap operation. + + // unescape anything that can be: UURI tmpURI = UURIFactory.getInstance(searchUrl); tmpURI.setPath(tmpURI.getPath()); - - // convert to UURI to perform require URI fixup: + // convert to UURI to perform required URI fixup: UURI searchURI = UURIFactory.getInstance(tmpURI.getURI()); - - - // replace ' ' with '+' (this is only to match Alexa's canonicalization) String newPath = searchURI.getEscapedPath().replace("%20","+"); -// String newPath = searchURI.getPath().replace(' ','+'); // replace multiple consecutive '/'s in the path. while(newPath.contains("//")) { @@ -241,12 +271,10 @@ if(searchURI.getEscapedQuery() != null) { sb.append("?").append(searchURI.getEscapedQuery()); } - return sb.toString(); } - /** * Idempotent operation that will determine the 'fuzziest' * form of the url argument. This operation is done prior to adding records @@ -259,19 +287,23 @@ * @return canonicalized version of url argument. */ public String canonicalize(String url) { - url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url)); - url = url.toLowerCase(); + if (url == null || url.length() <= 0) { return url; } + + // hang on, we're about to get aggressive: + url = url.toLowerCase(); + StringBuilder sb = new StringBuilder(url); + boolean changed = false; + for(int i=0; i<choosers.length; i++) { + if(sb.indexOf(choosers[i]) != -1) { + changed |= doStripRegexMatch(sb,strippers[i].matcher(sb)); + } + } + if(changed) { + url = sb.toString(); + } int index = url.lastIndexOf('?'); if (index > 0) { @@ -285,8 +317,8 @@ url = url.substring(0, url.length() - 2); } else { // The '&' is redundant. Strip it. - url = url.substring(0, index + 1) + - url.substring(index + 2); + url = url.substring(0, index + 1) + + url.substring(index + 2); } } else if (url.charAt(url.length() - 1) == '&') { // If we have a lone '&' on end of query str, Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2008-02-01 19:34:06 UTC (rev 2169) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2008-02-01 23:53:57 UTC (rev 2170) @@ -144,7 +144,7 @@ String sid3 = "sid=9682993c8daa2c5497996114facdc805"; String sid4 = "ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM"; String sid5 = "CFID=12412453&CFTOKEN=15501799"; - //String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A"; + String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A"; String fore = "http://foo.com/bar?bo=lo&"; String aft = "&gum=yum"; @@ -158,7 +158,7 @@ checkCanonicalization(fore + sid3 + aft,want); checkCanonicalization(fore + sid4 + aft,want); checkCanonicalization(fore + sid5 + aft,want); - //checkCanonicalization(fore + sid6 + aft,want); + checkCanonicalization(fore + sid6 + aft,want); // Check ASP_SESSIONID2: checkCanonicalization( @@ -173,7 +173,7 @@ // Check ASP_SESSIONID3: checkCanonicalization( "http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules", - "legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules"); + "legislature.mi.gov/mileg.aspx?page=sessionschedules"); // strip port 80 checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |