From: <go...@us...> - 2003-09-09 23:12:23
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor In directory sc8-pr-cvs1:/tmp/cvs-serv10481/src/org/archive/crawler/extractor Modified Files: ExtractorHTML.java Log Message: handle &amp; in codebase, resource, onEvent attributes Index: ExtractorHTML.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** ExtractorHTML.java 6 Sep 2003 02:01:07 -0000 1.13 --- ExtractorHTML.java 9 Sep 2003 23:12:20 -0000 1.14 *************** *** 103,107 **** // Just in case it's an OBJECT tag ! CharSequence codebase = null; ArrayList resources = null; --- 103,107 ---- // Just in case it's an OBJECT tag ! String codebase = null; ArrayList resources = null; *************** *** 127,132 **** } else if (attr.start(6)>-1) { // CODEBASE ! codebase = value; ! processEmbed(curi,codebase.toString()); } else if (attr.start(7)>-1) { // CLASSID,DATA --- 127,133 ---- } else if (attr.start(6)>-1) { // CODEBASE ! codebase = value.toString(); ! codebase = codebase.replaceAll("&amp;","&"); // TODO: more HTML deescaping? ! processEmbed(curi,codebase); } else if (attr.start(7)>-1) { // CLASSID,DATA *************** *** 166,169 **** --- 167,171 ---- while(iter.hasNext()) { String res = iter.next().toString(); + res = res.replaceAll("&amp;","&"); // TODO: more HTML deescaping? if (codebaseURI != null) { res = codebaseURI.resolve(res).toString(); *************** *** 194,198 **** */ private void processScriptCode(CrawlURI curi, CharSequence cs) { ! Matcher candidates = JAVASCRIPT_LIKELY_URI_EXTRACTOR.matcher(cs); while (candidates.find()) { curi.addEmbed(candidates.group(2)); --- 196,202 ---- */ private void processScriptCode(CrawlURI curi, CharSequence cs) { ! String code = cs.toString(); ! code = code.replaceAll("&amp;","&"); // TODO: more HTML deescaping? ! 
Matcher candidates = JAVASCRIPT_LIKELY_URI_EXTRACTOR.matcher(code); while (candidates.find()) { curi.addEmbed(candidates.group(2)); |