From: Michael S. <sta...@us...> - 2005-11-29 05:35:11
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22127/src/java/org/archive/access/nutch Modified Files: Arc2Segment.java Log Message: Part of '[ 1312200 ] [nutchwax+wera] Pages at end of redirects not found.' * conf/nutch-site.xml.template Make redirect be off by default. * src/java/org/archive/access/nutch/Arc2Segment.java Fix logic around redirect (Was skipping 30xs). Log state of redirect flag. (isIndexRedirects): Added. Index: Arc2Segment.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Arc2Segment.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** Arc2Segment.java 23 Nov 2005 23:56:16 -0000 1.32 --- Arc2Segment.java 28 Nov 2005 21:23:22 -0000 1.33 *************** *** 144,147 **** --- 144,151 ---- } + public boolean isIndexRedirects() { + return Arc2Segment.indexRedirects; + } + public void addArc(String arcFile) throws IOException { File f = new File(arcFile); *************** *** 158,166 **** for (Iterator i = arc.iterator(); i.hasNext();) { ARCRecord rec = (ARCRecord)i.next(); ! if (rec.getStatusCode() != 200 || ! (this.indexRedirects && ! rec.getStatusCode() >= 300 && ! rec.getStatusCode() < 400)) { ! continue; } try { --- 162,171 ---- for (Iterator i = arc.iterator(); i.hasNext();) { ARCRecord rec = (ARCRecord)i.next(); ! if (rec.getStatusCode() != 200) { ! if (!(this.indexRedirects && ! (rec.getStatusCode() >= 300 && ! rec.getStatusCode() < 400))) { ! continue; ! } } try { *************** *** 225,230 **** if (arcData.getLength() >= bigHtmlMax) { LOG.info("skipping big html " + ! Long.toString(arcData.getLength()) + " bytes of mimetype " + ! noSpacesMimetype + " " + url); return; } --- 230,235 ---- if (arcData.getLength() >= bigHtmlMax) { LOG.info("skipping big html " + ! 
Long.toString(arcData.getLength()) + ! " bytes of mimetype " + noSpacesMimetype + " " + url); return; } *************** *** 244,248 **** // Collect content bytes - // TODO: Skip if unindexable type. rec.skipHttpHeader(); ByteArrayOutputStream contentBuffer = new ByteArrayOutputStream(); --- 249,252 ---- *************** *** 356,359 **** --- 360,364 ---- LOG.info("Index all mimetypes: " + arc2Segment.isIndexAll()); LOG.info("skipBigHtml " + skipBigHtml + ", cutoff size " + bigHtmlMax); + LOG.info("Index redirects " + arc2Segment.isIndexRedirects()); try { |