From: <bra...@us...> - 2011-12-18 00:07:57
|
Revision: 3575
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3575&view=rev
Author:   bradtofel
Date:     2011-12-18 00:07:50 +0000 (Sun, 18 Dec 2011)
Log Message:
-----------
Added Logging to convert problems - adding Robot Meta instructions field, swapping compressed length/offset fields 9,10 (now offset is 10th) and we now output a (somewhat more) correct CDX header line, with the new "S" column for compressed size.

Modified Paths:
--------------
    trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/cdx/CDXConverterTool.java
    trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/CDXMapper.java

Modified: trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/cdx/CDXConverterTool.java
===================================================================
--- trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/cdx/CDXConverterTool.java	2011-11-29 06:03:59 UTC (rev 3574)
+++ trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/cdx/CDXConverterTool.java	2011-12-18 00:07:50 UTC (rev 3575)
@@ -39,10 +39,12 @@
 				break;
 			}
 			StringPair pair = mapper.convert(cdxLine);
-			pw.print(pair.first);
-			pw.print(" ");
-			pw.print(pair.second);
-			pw.println();
+			if(pair != null) {
+				pw.print(pair.first);
+				pw.print(" ");
+				pw.print(pair.second);
+				pw.println();
+			}
 		}
 		pw.flush();
 		return 0;

Modified: trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/CDXMapper.java
===================================================================
--- trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/CDXMapper.java	2011-11-29 06:03:59 UTC (rev 3574)
+++ trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/CDXMapper.java	2011-12-18 00:07:50 UTC (rev 3575)
@@ -1,6 +1,7 @@
 package org.archive.hadoop.mapreduce;
 
 import java.io.IOException;
+import java.util.logging.Logger;
 
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
@@ -11,7 +12,12 @@
 
 public class CDXMapper extends Mapper<Object, Text, Text, Text>
 implements Configurable {
-
+	private static final Logger LOG =
+		Logger.getLogger(CDXMapper.class.getName());
+
+	// Note the (unbelievably) new "S" for "Size in Compressed Bytes..."
+	public final static String NEW_CDX_HEADER =
+		"CDX N b a m s k r M S V g";
 	private static String TEXT_OUTPUT_DELIM_CONFIG = "text.output.delim";
 	public static int MODE_GLOBAL = 0;
 	public static int MODE_FULL = 1;
@@ -30,8 +36,7 @@
 
 	public StringPair convert(String cdxLine) {
 		if(cdxLine.startsWith(" CDX ")) {
-			return new StringPair("", cdxLine.substring(1));
-//			return null
+			return new StringPair("", NEW_CDX_HEADER);
 		}
 		String[] parts = cdxLine.split(delim);
 		int offsetIdx = 8;
@@ -45,6 +50,9 @@
 					return null;
 				}
 			}
+		} else {
+			LOG.warning("Skipping line:" + cdxLine);
+			return null;
 		}
 
 		// don't care about the old key:
@@ -68,8 +76,9 @@
 		valSB.append(responseCode).append(delim);
 		valSB.append(digest).append(delim);
 		valSB.append(redirect).append(delim);
+		valSB.append(metaInstructions).append(delim);
+		valSB.append(DEFAULT_GZ_LEN).append(delim);
 		valSB.append(offset).append(delim);
-		valSB.append(DEFAULT_GZ_LEN).append(delim);
 		valSB.append(filename);
 		return new StringPair(keySB.toString(), valSB.toString());
 	}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |