From: Michael S. <sta...@us...> - 2005-10-21 01:29:37
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20626/src/java/org/archive/access/nutch
Modified Files:
Arc2Segment.java
Log Message:
Implement '[ 1309781 ] Add in skipping certain types if > size'
* src/java/org/archive/access/nutch/Arc2Segment.java
Test for text/html that is larger than the archive.skip.big.html value.
Log and skip any found.
* conf/nutch-site.xml.nutchwax
Edit.
Index: Arc2Segment.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Arc2Segment.java,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** Arc2Segment.java 18 Oct 2005 23:21:11 -0000 1.30
--- Arc2Segment.java 21 Oct 2005 01:29:29 -0000 1.31
*************** *** 87,90 **** --- 87,101 ---- } } + private static boolean skipBigHtml = false; + private static long bigHtmlMax = -1; + static { + String tmp = NutchConf.get().get("archive.skip.big.html"); + if (tmp != null) { + bigHtmlMax = Long.parseLong(tmp); + if (bigHtmlMax != -1) { + skipBigHtml = true; + } + } + } /** Get the MimeTypes resolver instance. */ *************** *** 195,201 **** metaData.put(header.getName(), header.getValue()); } - String noSpacesMimetype = (mimetype == null)? "null": TextUtils.replaceAll(WHITESPACE, mimetype, "-"); LOG.info("adding " + Long.toString(arcData.getLength()) + " bytes of mimetype " + noSpacesMimetype + " " + url); --- 206,224 ---- metaData.put(header.getName(), header.getValue()); } String noSpacesMimetype = (mimetype == null)? "null": TextUtils.replaceAll(WHITESPACE, mimetype, "-"); + + // New test for Dan. If text/html and > than a certain size, then + // skip completly. 
+ if (skipBigHtml && mimetype != null && + mimetype.startsWith("text/html")) { + if (arcData.getLength() >= bigHtmlMax) { + LOG.info("skipping big html " + + Long.toString(arcData.getLength()) + " bytes of mimetype " + + noSpacesMimetype + " " + url); + return; + } + } + LOG.info("adding " + Long.toString(arcData.getLength()) + " bytes of mimetype " + noSpacesMimetype + " " + url); *************** *** 209,213 **** metaData.put(CONTENT_TYPE_KEY, mimetype); } ! // Collect content bytes // TODO: Skip if unindexable type. --- 232,236 ---- metaData.put(CONTENT_TYPE_KEY, mimetype); } ! // Collect content bytes // TODO: Skip if unindexable type. *************** *** 322,325 **** --- 345,349 ---- Arc2Segment arc2Segment = new Arc2Segment(segmentDir, collectionName, nfs); LOG.info("Index all mimetypes: " + arc2Segment.isIndexAll()); + LOG.info("skipBigHtml " + skipBigHtml + ", cutoff size " + bigHtmlMax); try { |