Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20626/src/java/org/archive/access/nutch
Modified Files:
Arc2Segment.java
Log Message:
Implement '[ 1309781 ] Add in skipping certain types if > size'
* src/java/org/archive/access/nutch/Arc2Segment.java
Test for text/html content whose length is greater than or equal to the
archive.skip.big.html value. Log and skip any such content found.
* conf/nutch-site.xml.nutchwax
Edit.
Index: Arc2Segment.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Arc2Segment.java,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** Arc2Segment.java 18 Oct 2005 23:21:11 -0000 1.30
--- Arc2Segment.java 21 Oct 2005 01:29:29 -0000 1.31
***************
*** 87,90 ****
--- 87,101 ----
}
}
+ private static boolean skipBigHtml = false;
+ private static long bigHtmlMax = -1;
+ static {
+ String tmp = NutchConf.get().get("archive.skip.big.html");
+ if (tmp != null) {
+ bigHtmlMax = Long.parseLong(tmp);
+ if (bigHtmlMax != -1) {
+ skipBigHtml = true;
+ }
+ }
+ }
/** Get the MimeTypes resolver instance. */
***************
*** 195,201 ****
metaData.put(header.getName(), header.getValue());
}
-
String noSpacesMimetype = (mimetype == null)? "null":
TextUtils.replaceAll(WHITESPACE, mimetype, "-");
LOG.info("adding " + Long.toString(arcData.getLength())
+ " bytes of mimetype " + noSpacesMimetype + " " + url);
--- 206,224 ----
metaData.put(header.getName(), header.getValue());
}
String noSpacesMimetype = (mimetype == null)? "null":
TextUtils.replaceAll(WHITESPACE, mimetype, "-");
+
+ // New test for Dan. If text/html and at least a certain size, then
+ // skip completely.
+ if (skipBigHtml && mimetype != null &&
+ mimetype.startsWith("text/html")) {
+ if (arcData.getLength() >= bigHtmlMax) {
+ LOG.info("skipping big html " +
+ Long.toString(arcData.getLength()) + " bytes of mimetype " +
+ noSpacesMimetype + " " + url);
+ return;
+ }
+ }
+
LOG.info("adding " + Long.toString(arcData.getLength())
+ " bytes of mimetype " + noSpacesMimetype + " " + url);
***************
*** 209,213 ****
metaData.put(CONTENT_TYPE_KEY, mimetype);
}
!
// Collect content bytes
// TODO: Skip if unindexable type.
--- 232,236 ----
metaData.put(CONTENT_TYPE_KEY, mimetype);
}
!
// Collect content bytes
// TODO: Skip if unindexable type.
***************
*** 322,325 ****
--- 345,349 ----
Arc2Segment arc2Segment = new Arc2Segment(segmentDir, collectionName, nfs);
LOG.info("Index all mimetypes: " + arc2Segment.isIndexAll());
+ LOG.info("skipBigHtml " + skipBigHtml + ", cutoff size " + bigHtmlMax);
try {
|