Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22127/src/java/org/archive/access/nutch
Modified Files:
Arc2Segment.java
Log Message:
Part of '[ 1312200 ] [nutchwax+wera] Pages at end of redirects not found.'
* conf/nutch-site.xml.template
Turn redirect indexing off by default.
* src/java/org/archive/access/nutch/Arc2Segment.java
Fix logic around redirects (was skipping 30xs). Log state of the index-redirects flag.
(isIndexRedirects): Added.
Index: Arc2Segment.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Arc2Segment.java,v
retrieving revision 1.32
retrieving revision 1.33
diff -C2 -d -r1.32 -r1.33
*** Arc2Segment.java 23 Nov 2005 23:56:16 -0000 1.32
--- Arc2Segment.java 28 Nov 2005 21:23:22 -0000 1.33
***************
*** 144,147 ****
--- 144,151 ----
}
+ public boolean isIndexRedirects() {
+ return Arc2Segment.indexRedirects;
+ }
+
public void addArc(String arcFile) throws IOException {
File f = new File(arcFile);
***************
*** 158,166 ****
for (Iterator i = arc.iterator(); i.hasNext();) {
ARCRecord rec = (ARCRecord)i.next();
! if (rec.getStatusCode() != 200 ||
! (this.indexRedirects &&
! rec.getStatusCode() >= 300 &&
! rec.getStatusCode() < 400)) {
! continue;
}
try {
--- 162,171 ----
for (Iterator i = arc.iterator(); i.hasNext();) {
ARCRecord rec = (ARCRecord)i.next();
! if (rec.getStatusCode() != 200) {
! if (!(this.indexRedirects &&
! (rec.getStatusCode() >= 300 &&
! rec.getStatusCode() < 400))) {
! continue;
! }
}
try {
***************
*** 225,230 ****
if (arcData.getLength() >= bigHtmlMax) {
LOG.info("skipping big html " +
! Long.toString(arcData.getLength()) + " bytes of mimetype " +
! noSpacesMimetype + " " + url);
return;
}
--- 230,235 ----
if (arcData.getLength() >= bigHtmlMax) {
LOG.info("skipping big html " +
! Long.toString(arcData.getLength()) +
! " bytes of mimetype " + noSpacesMimetype + " " + url);
return;
}
***************
*** 244,248 ****
// Collect content bytes
- // TODO: Skip if unindexable type.
rec.skipHttpHeader();
ByteArrayOutputStream contentBuffer = new ByteArrayOutputStream();
--- 249,252 ----
***************
*** 356,359 ****
--- 360,364 ----
LOG.info("Index all mimetypes: " + arc2Segment.isIndexAll());
LOG.info("skipBigHtml " + skipBigHtml + ", cutoff size " + bigHtmlMax);
+ LOG.info("Index redirects " + arc2Segment.isIndexRedirects());
try {
|