|
From: <go...@us...> - 2003-09-09 23:12:23
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv10481/src/org/archive/crawler/extractor
Modified Files:
ExtractorHTML.java
Log Message:
handle &amp; in codebase, resource, onEvent attributes
Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** ExtractorHTML.java 6 Sep 2003 02:01:07 -0000 1.13
--- ExtractorHTML.java 9 Sep 2003 23:12:20 -0000 1.14
***************
*** 103,107 ****
// Just in case it's an OBJECT tag
! CharSequence codebase = null;
ArrayList resources = null;
--- 103,107 ----
// Just in case it's an OBJECT tag
! String codebase = null;
ArrayList resources = null;
***************
*** 127,132 ****
} else if (attr.start(6)>-1) {
// CODEBASE
! codebase = value;
! processEmbed(curi,codebase.toString());
} else if (attr.start(7)>-1) {
// CLASSID,DATA
--- 127,133 ----
} else if (attr.start(6)>-1) {
// CODEBASE
! codebase = value.toString();
! codebase = codebase.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
! processEmbed(curi,codebase);
} else if (attr.start(7)>-1) {
// CLASSID,DATA
***************
*** 166,169 ****
--- 167,171 ----
while(iter.hasNext()) {
String res = iter.next().toString();
+ res = res.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
if (codebaseURI != null) {
res = codebaseURI.resolve(res).toString();
***************
*** 194,198 ****
*/
private void processScriptCode(CrawlURI curi, CharSequence cs) {
! Matcher candidates = JAVASCRIPT_LIKELY_URI_EXTRACTOR.matcher(cs);
while (candidates.find()) {
curi.addEmbed(candidates.group(2));
--- 196,202 ----
*/
private void processScriptCode(CrawlURI curi, CharSequence cs) {
! String code = cs.toString();
! code = code.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
! Matcher candidates = JAVASCRIPT_LIKELY_URI_EXTRACTOR.matcher(code);
while (candidates.find()) {
curi.addEmbed(candidates.group(2));
|