Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv17531/src/org/archive/crawler/extractor
Modified Files:
ExtractorHTML.java
Log Message:
in-attribute '&' handling
Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** ExtractorHTML.java 3 Sep 2003 01:51:05 -0000 1.12
--- ExtractorHTML.java 6 Sep 2003 02:01:07 -0000 1.13
***************
*** 205,212 ****
*/
private void processLink(CrawlURI curi, CharSequence value) {
! if(value.toString().matches("(?i)^javascript:.*")) {
processScriptCode(curi,value.subSequence(11,value.length()));
} else {
! curi.addLink(value.toString());
}
}
--- 205,214 ----
*/
private void processLink(CrawlURI curi, CharSequence value) {
! String link = value.toString();
! link = link.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
! if(link.matches("(?i)^javascript:.*")) {
processScriptCode(curi,value.subSequence(11,value.length()));
} else {
! curi.addLink(link);
}
}
***************
*** 219,223 ****
*/
private void processEmbed(CrawlURI curi, CharSequence value) {
! curi.addEmbed(value.toString());
}
--- 221,227 ----
*/
private void processEmbed(CrawlURI curi, CharSequence value) {
! String embed = value.toString();
! embed = embed.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
! curi.addEmbed(embed);
}
|