Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv1585/src/org/archive/crawler/extractor
Modified Files:
ExtractorHTML.java
Log Message:
ignore HTML from paths which suggest non-HTML content (soft 404 protection)
Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** ExtractorHTML.java 12 Aug 2003 00:47:16 -0000 1.10
--- ExtractorHTML.java 26 Aug 2003 00:16:51 -0000 1.11
***************
*** 29,32 ****
--- 29,34 ----
*/
public class ExtractorHTML extends Processor implements CoreAttributeConstants {
+ private boolean ignoreUnexpectedHTML = true; // TODO: add config param to change
+
private static Logger logger = Logger.getLogger("org.archive.crawler.basic.ExtractorHTML");
***************
*** 230,233 ****
--- 232,245 ----
return;
}
+
+ if(ignoreUnexpectedHTML) {
+ if(!expectedHTML(curi)) {
+ // HTML was not expected (eg a GIF was expected) so ignore
+ // (as if a soft 404)
+ return;
+ }
+ }
+
+
GetMethod get = (GetMethod)curi.getAList().getObject(A_HTTP_TRANSACTION);
Header contentType = get.getResponseHeader("Content-Type");
***************
*** 268,272 ****
}
!
/**
* @param curi
--- 280,305 ----
}
!
! static Pattern NON_HTML_PATH_EXTENSION = Pattern.compile(
! "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
! "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)");
! /**
! * @param curi
! * @return
! */
! private boolean expectedHTML(CrawlURI curi) {
! String path = curi.getUURI().getUri().getPath();
! int dot = path.lastIndexOf('.');
! if (dot<0) {
! // no path extension, HTML is fine
! return true;
! }
! if(dot<(path.length()-5)) {
! // extension too long to recognize, HTML is fine
! return true;
! }
! return NON_HTML_PATH_EXTENSION.matcher(path.substring(dot)).matches();
! }
!
/**
* @param curi
|