From: <go...@us...> - 2003-09-06 02:01:10
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor In directory sc8-pr-cvs1:/tmp/cvs-serv17531/src/org/archive/crawler/extractor Modified Files: ExtractorHTML.java Log Message: in-attribute '&' handling Index: ExtractorHTML.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** ExtractorHTML.java 3 Sep 2003 01:51:05 -0000 1.12 --- ExtractorHTML.java 6 Sep 2003 02:01:07 -0000 1.13 *************** *** 205,212 **** */ private void processLink(CrawlURI curi, CharSequence value) { ! if(value.toString().matches("(?i)^javascript:.*")) { processScriptCode(curi,value.subSequence(11,value.length())); } else { ! curi.addLink(value.toString()); } } --- 205,214 ---- */ private void processLink(CrawlURI curi, CharSequence value) { ! String link = value.toString(); ! link = link.replaceAll("&amp;","&"); // TODO: more HTML deescaping? ! if(link.matches("(?i)^javascript:.*")) { processScriptCode(curi,value.subSequence(11,value.length())); } else { ! curi.addLink(link); } } *************** *** 219,223 **** */ private void processEmbed(CrawlURI curi, CharSequence value) { ! curi.addEmbed(value.toString()); } --- 221,227 ---- */ private void processEmbed(CrawlURI curi, CharSequence value) { ! String embed = value.toString(); ! embed = embed.replaceAll("&amp;","&"); // TODO: more HTML deescaping? ! curi.addEmbed(embed); } |