|
From: <bra...@us...> - 2010-03-20 01:00:57
|
Revision: 2986
http://archive-access.svn.sourceforge.net/archive-access/?rev=2986&view=rev
Author: bradtofel
Date: 2010-03-20 01:00:50 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): was not using correct resolve method, causing extra level of escaping on all GET arguments.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-03-20 00:59:42 UTC (rev 2985)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-03-20 01:00:50 UTC (rev 2986)
@@ -100,7 +100,7 @@
url = url.substring(0,hashIdx);
}
try {
- return baseUrl.resolve(url).toString() + frag;
+ return baseUrl.resolve(url,true).toString() + frag;
} catch (URIException e) {
e.printStackTrace();
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-30 22:35:03
|
Revision: 3008
http://archive-access.svn.sourceforge.net/archive-access/?rev=3008&view=rev
Author: bradtofel
Date: 2010-03-30 22:34:57 +0000 (Tue, 30 Mar 2010)
Log Message:
-----------
BUGFIX: now using (hopefully) correct resolving code: UURIFactory.getInstance(UURI,String) instead of UURI.resolve(String,true)...
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-03-29 21:50:42 UTC (rev 3007)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-03-30 22:34:57 UTC (rev 3008)
@@ -100,7 +100,7 @@
url = url.substring(0,hashIdx);
}
try {
- return baseUrl.resolve(url,true).toString() + frag;
+ return UURIFactory.getInstance(baseUrl, url).toString() + frag;
} catch (URIException e) {
e.printStackTrace();
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-05-18 22:46:13
|
Revision: 3104
http://archive-access.svn.sourceforge.net/archive-access/?rev=3104&view=rev
Author: bradtofel
Date: 2010-05-18 22:46:07 +0000 (Tue, 18 May 2010)
Log Message:
-----------
Translate escaped characters within resolved urls prior to contextualizing.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-05-18 22:44:22 UTC (rev 3103)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-05-18 22:46:07 UTC (rev 3104)
@@ -31,6 +31,7 @@
import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
+import org.htmlparser.util.Translate;
/**
* Class which tracks the context and state involved with parsing an HTML
@@ -93,6 +94,7 @@
* @throws URISyntaxException if the input URL is malformed
*/
public String resolve(String url) throws URISyntaxException {
+ url = Translate.decode(url);
int hashIdx = url.indexOf('#');
String frag = "";
if(hashIdx != -1) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-07-21 00:00:33
|
Revision: 3186
http://archive-access.svn.sourceforge.net/archive-access/?rev=3186&view=rev
Author: bradtofel
Date: 2010-07-21 00:00:27 +0000 (Wed, 21 Jul 2010)
Log Message:
-----------
HACKHACK - if original HREF or base-HREF is not a valid URL, allows identity rewriting of other urls in the page
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-07-20 23:59:24 UTC (rev 3185)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-07-21 00:00:27 UTC (rev 3186)
@@ -106,6 +106,11 @@
url = url.substring(0,hashIdx);
}
try {
+ if(baseUrl == null) {
+ // TODO: log
+ System.err.println("No url to resolve!");
+ return url;
+ }
return baseUrl.resolve(url,false).toString() + frag;
// return UURIFactory.getInstance(baseUrl, url).toString() + frag;
} catch (URIException e) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-09-28 21:22:45
|
Revision: 3259
http://archive-access.svn.sourceforge.net/archive-access/?rev=3259&view=rev
Author: bradtofel
Date: 2010-09-28 21:22:39 +0000 (Tue, 28 Sep 2010)
Log Message:
-----------
Fixed URL canonicalization test case and underlying code
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-09-28 21:22:20 UTC (rev 3258)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-09-28 21:22:39 UTC (rev 3259)
@@ -30,6 +30,7 @@
import java.util.Map;
import org.apache.commons.httpclient.URIException;
+import org.apache.commons.lang.StringEscapeUtils;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.htmlparser.util.Translate;
@@ -98,7 +99,10 @@
* @throws URISyntaxException if the input URL is malformed
*/
public String resolve(String url) throws URISyntaxException {
- url = Translate.decode(url);
+ // BUG in Translate.decode(): "foo?a=b&lang=en" acts as if it
+ // was "&lang;"
+// url = Translate.decode(url);
+ url = StringEscapeUtils.unescapeHtml(url);
int hashIdx = url.indexOf('#');
String frag = "";
if(hashIdx != -1) {
@@ -111,7 +115,8 @@
System.err.println("No url to resolve!");
return url;
}
- return baseUrl.resolve(url,false).toString() + frag;
+ return baseUrl.resolve(url,true).toString() + frag;
+// return baseUrl.resolve(url,false).toString() + frag;
// return UURIFactory.getInstance(baseUrl, url).toString() + frag;
} catch (URIException e) {
e.printStackTrace();
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-10-11 18:56:45
|
Revision: 3271
http://archive-access.svn.sourceforge.net/archive-access/?rev=3271&view=rev
Author: bradtofel
Date: 2010-10-11 18:56:38 +0000 (Mon, 11 Oct 2010)
Log Message:
-----------
BUGFIX: numerous url escaping/resolving issues, by switching back to UURIFactory.getInstance()
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-10-01 23:23:52 UTC (rev 3270)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-10-11 18:56:38 UTC (rev 3271)
@@ -28,7 +28,6 @@
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
-import org.htmlparser.util.Translate;
/**
* Class which tracks the context and state involved with parsing an HTML
@@ -74,6 +73,10 @@
public String getData(String key) {
return data.get(key);
}
+
+ /**
+ * @return the full Map of String to String for this parsing context.
+ */
public Map<String,String> getMap() {
return data;
}
@@ -104,20 +107,21 @@
frag = url.substring(hashIdx);
url = url.substring(0,hashIdx);
}
+
+ if(baseUrl == null) {
+ // TODO: log ?
+ return url + frag;
+ }
+
try {
- if(baseUrl == null) {
- // TODO: log
- System.err.println("No url to resolve!");
- return url;
- }
- return baseUrl.resolve(url,true).toString() + frag;
-// return baseUrl.resolve(url,false).toString() + frag;
-// return UURIFactory.getInstance(baseUrl, url).toString() + frag;
+
+ return UURIFactory.getInstance(baseUrl, url).toString() + frag;
} catch (URIException e) {
e.printStackTrace();
}
return url;
- }
+ }
+
/**
* @param url which should be resolved.
* @return absolute form of input url, or url itself if javascript:
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2011-02-06 14:35:56
|
Revision: 3393
http://archive-access.svn.sourceforge.net/archive-access/?rev=3393&view=rev
Author: bradtofel
Date: 2011-02-06 14:35:50 +0000 (Sun, 06 Feb 2011)
Log Message:
-----------
LOGGING: replaced stacktrace with log message
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2011-02-06 14:35:02 UTC (rev 3392)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2011-02-06 14:35:50 UTC (rev 3393)
@@ -23,12 +23,12 @@
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
+import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
-
/**
* Class which tracks the context and state involved with parsing an HTML
* document via SAX events.
@@ -44,6 +44,9 @@
*/
public class ParseContext {
+ private static final Logger LOGGER = Logger.getLogger(
+ ParseContext.class.getName());
+
protected UURI baseUrl = null;
private boolean inCSS = false;
@@ -114,10 +117,10 @@
}
try {
-
return UURIFactory.getInstance(baseUrl, url).toString() + frag;
} catch (URIException e) {
- e.printStackTrace();
+ LOGGER.warning("FAILED RESOLVE: base(" + baseUrl + ") frag(" + url +
+ ") error(" + e.getMessage() + ")");
}
return url;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|