From: <go...@us...> - 2003-08-26 00:17:14
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor In directory sc8-pr-cvs1:/tmp/cvs-serv1585/src/org/archive/crawler/extractor Modified Files: ExtractorHTML.java Log Message: ignore HTML from paths which suggest non-HTML content (soft 404 protection) Index: ExtractorHTML.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** ExtractorHTML.java 12 Aug 2003 00:47:16 -0000 1.10 --- ExtractorHTML.java 26 Aug 2003 00:16:51 -0000 1.11 *************** *** 29,32 **** --- 29,34 ---- */ public class ExtractorHTML extends Processor implements CoreAttributeConstants { + private boolean ignoreUnexpectedHTML = true; // TODO: add config param to change + private static Logger logger = Logger.getLogger("org.archive.crawler.basic.ExtractorHTML"); *************** *** 230,233 **** --- 232,245 ---- return; } + + if(ignoreUnexpectedHTML) { + if(!expectedHTML(curi)) { + // HTML was not expected (eg a GIF was expected) so ignore + // (as if a soft 404) + return; + } + } + + GetMethod get = (GetMethod)curi.getAList().getObject(A_HTTP_TRANSACTION); Header contentType = get.getResponseHeader("Content-Type"); *************** *** 268,272 **** } ! /** * @param curi --- 280,305 ---- } ! ! static Pattern NON_HTML_PATH_EXTENSION = Pattern.compile( ! "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+ ! "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)"); ! /** ! * @param curi ! * @return ! */ ! private boolean expectedHTML(CrawlURI curi) { ! String path = curi.getUURI().getUri().getPath(); ! int dot = path.lastIndexOf('.'); ! if (dot<0) { ! // no path extension, HTML is fine ! return true; ! } ! if(dot<(path.length()-5)) { ! // extension too long to recognize, HTML is fine ! return true; ! } ! return NON_HTML_PATH_EXTENSION.matcher(path.substring(dot)).matches(); ! } ! /** * @param curi |