From: <bi...@us...> - 2009-03-04 01:18:45
|
Revision: 2689 http://archive-access.svn.sourceforge.net/archive-access/?rev=2689&view=rev Author: binzino Date: 2009-03-04 01:18:44 +0000 (Wed, 04 Mar 2009) Log Message: ----------- Added boolean configuration property nutchwax.import.store.content to determine whether or not the Importer stores the full content in the segment's "content" directory. Removed a useless debug message from the end of the Import job. Removed searcher.max.hits from nutch-site.xml as it actually causes lots of problems with search-time site-based de-dup. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2009-03-03 20:34:38 UTC (rev 2688) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2009-03-04 01:18:44 UTC (rev 2689) @@ -456,8 +456,12 @@ try { - output.collect( key, new NutchWritable( datum ) ); - output.collect( key, new NutchWritable( content ) ); + output.collect( key, new NutchWritable( datum ) ); + + if ( jobConf.getBoolean( "nutchwax.import.store.content", false ) ) + { + output.collect( key, new NutchWritable( content ) ); + } if ( parseResult != null ) { @@ -649,9 +653,6 @@ RunningJob rj = JobClient.runJob( job ); - // Emit job id and status. - System.out.println( "JOB_STATUS: " + rj.getID( ) + ": " + (rj.isSuccessful( ) ? "SUCCESS" : "FAIL" ) ); - return rj.isSuccessful( ) ? 
0 : 1; } catch ( Exception e ) Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-03-03 20:34:38 UTC (rev 2688) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-03-04 01:18:44 UTC (rev 2689) @@ -137,6 +137,25 @@ <value>1048576</value> </property> +<!-- Whether or not we store the full content in the segment's + "content" directory. Most NutchWAX users are also using Wayback + to serve the archived content, so there's no need for NutchWAX to + keep a "cached" copy as well. + + Setting to 'true' yields the same bahavior as in previous + versions of NutchWAX, and as in Nutch. The content is stored in + the segment's "content" directory. + + Setting to 'false' results in an empty "content" directory in the + segment. The content is not stored. + + Default value is 'false'. + --> +<property> + <name>nutchwax.import.store.content</name> + <value>false</value> +</property> + <!-- Enable per-collection segment sub-dirs, e.g. segments/<collectionId>/segment1 /segment2 @@ -156,11 +175,6 @@ </property> <property> - <name>searcher.max.hits</name> - <value>1000</value> -</property> - -<property> <name>searcher.summary.context</name> <value>8</value> </property> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |