[Archive-crawler-cvs] SF.net SVN: archive-crawler:[6965] trunk/heritrix3

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 6965
          http://archive-crawler.svn.sourceforge.net/archive-crawler/?rev=6965&view=rev
Author:   gojomo
Date:     2010-09-30 23:01:25 +0000 (Thu, 30 Sep 2010)

Log Message:
-----------
[HER-1830] H3: improve sheet examples in default configuration CXML
* **/profile-crawler-beans.cxml
    example SurPrefixesSheetAssociation

Modified Paths:
--------------
    trunk/heritrix3/dist/src/main/conf/jobs/profile-defaults/profile-crawler-beans.cxml
    trunk/heritrix3/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml

Modified: trunk/heritrix3/dist/src/main/conf/jobs/profile-defaults/profile-crawler-beans.cxml
===================================================================

--- trunk/heritrix3/dist/src/main/conf/jobs/profile-defaults/profile-crawler-beans.cxml	2010-09-30 19:48:35 UTC (rev 6964)
+++ trunk/heritrix3/dist/src/main/conf/jobs/profile-defaults/profile-crawler-beans.cxml	2010-09-30 23:01:25 UTC (rev 6965)
@@ -6,8 +6,8 @@
    
    Commented-out beans and properties are provided as an example; values
    shown in comments reflect the actual defaults which are in effect
-   without specification. (To change from the default behavior, 
-   uncomment AND alter the shown values.)   
+   if not otherwise specified. (To change from the default 
+   behavior, uncomment AND alter the shown values.)   
  -->
 <beans xmlns="http://www.springframework.org/schema/beans"
 	     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
@@ -93,7 +93,8 @@
 <!-- <property name='sourceTagSeeds' value='false'/> -->
  </bean>
  
- <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file
+ <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in
+      the job directory, similar to the H1 approach. 
       Use either the above, or this, but not both. -->
  <!-- 
  <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
@@ -117,7 +118,7 @@
     <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->
     <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
      <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
-     <!-- <property name="alsoCheckVia" value="true" /> -->
+     <!-- <property name="alsoCheckVia" value="false" /> -->
      <!-- <property name="surtsSourceFile" value="" /> -->
      <!-- <property name="surtsDumpFile" value="surts.dump" /> -->
     </bean>
@@ -134,7 +135,7 @@
     <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
           <property name="decision" value="REJECT"/>
           <property name="seedsAsSurtPrefixes" value="false"/>
-          <property name="surtsDumpFile" value="negative-surts.dump" />
+          <property name="surtsDumpFile" value="negative-surts.dump" /> 
      <!-- <property name="surtsSourceFile" value="" /> -->
     </bean>
     <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... -->
@@ -168,7 +169,7 @@
    PROCESSING CHAINS
     Much of the crawler's work is specified by the sequential 
     application of swappable Processor modules. These Processors
-    are collected into three 'chains. The CandidateChain is applied 
+    are collected into three 'chains'. The CandidateChain is applied 
     to URIs being considered for inclusion, before a URI is enqueued
     for collection. The FetchChain is applied to URIs when their 
     turn for collection comes up. The DispositionChain is applied 
@@ -176,7 +177,7 @@
   -->
   
  <!-- CANDIDATE CHAIN --> 
- <!-- processors declared as named beans -->
+ <!-- first, processors are declared as top-level named beans -->
  <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
  </bean>
  <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
@@ -195,19 +196,20 @@
         <ref bean="costAssignmentPolicy" />
        </property> -->
  </bean>
- <!-- assembled into ordered CandidateChain bean -->
+ <!-- now, processors are assembled into ordered CandidateChain bean -->
  <bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
   <property name="processors">
    <list>
     <!-- apply scoping rules to each individual candidate URI... -->
     <ref bean="candidateScoper"/>
-    <!-- ...then prepare those ACCEPTed for enqueuing to frontier. -->
+    <!-- ...then prepare those ACCEPTed to be enqueued to frontier. -->
+    <ref bean="preparer"/>
    </list>
   </property>
  </bean>
   
  <!-- FETCH CHAIN --> 
- <!-- processors declared as named beans -->
+ <!-- first, processors are declared as top-level named beans -->
  <bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
   <!-- <property name="recheckScope" value="false" /> -->
   <!-- <property name="blockAll" value="false" /> -->
@@ -270,11 +272,11 @@
  </bean>
  <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">
  </bean>    
- <!-- assembled into ordered FetchChain bean -->
+ <!-- now, processors are assembled into ordered FetchChain bean -->
  <bean id="fetchProcessors" class="org.archive.modules.FetchChain">
   <property name="processors">
    <list>
-    <!-- recheck scope, if so enabled... -->
+    <!-- re-check scope, if so enabled... -->
     <ref bean="preselector"/>
     <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->
     <ref bean="preconditions"/>
@@ -282,29 +284,29 @@
     <ref bean="fetchDns"/>
     <!-- ...fetch if HTTP URI... -->
     <ref bean="fetchHttp"/>
-    <!-- ...extract oulinks from HTTP headers... -->
+    <!-- ...extract outlinks from HTTP headers... -->
     <ref bean="extractorHttp"/>
-    <!-- ...extract oulinks from HTML content... -->
+    <!-- ...extract outlinks from HTML content... -->
     <ref bean="extractorHtml"/>
-    <!-- ...extract oulinks from CSS content... -->
+    <!-- ...extract outlinks from CSS content... -->
     <ref bean="extractorCss"/>
-    <!-- ...extract oulinks from Javascript content... -->
+    <!-- ...extract outlinks from Javascript content... -->
     <ref bean="extractorJs"/>
-    <!-- ...extract oulinks from Flash content... -->
+    <!-- ...extract outlinks from Flash content... -->
     <ref bean="extractorSwf"/>
    </list>
   </property>
  </bean>
   
  <!-- DISPOSITION CHAIN -->
- <!-- processors declared as named beans -->
+ <!-- first, processors are declared as top-level named beans  -->
  <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
   <!-- <property name="compress" value="true" /> -->
   <!-- <property name="prefix" value="IAH" /> -->
   <!-- <property name="suffix" value="${HOSTNAME}" /> -->
   <!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
   <!-- <property name="poolMaxActive" value="1" /> -->
-  <!-- <property name="poolMaxWaitMs" value="300000" /> -->
+  <!-- <property name="maxWaitForIdleMs" value="500" /> -->
   <!-- <property name="skipIdenticalDigests" value="false" /> -->
   <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
   <!-- <property name="directory" value="." /> -->
@@ -329,7 +331,7 @@
   <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
   
  </bean>
- <!-- assembled into ordered DispositionChain bean -->
+ <!-- now, processors are assembled into ordered DispositionChain bean -->
  <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
   <property name="processors">
    <list>
@@ -391,6 +393,108 @@
    class="org.archive.crawler.util.BdbUriUniqFilter">
  </bean>
  
+ <!--
+   EXAMPLE SETTINGS OVERLAY SHEETS
+   Sheets allow some settings to vary by context - usually by URI context,
+   so that different sites or sections of sites can be treated differently. 
+   Here are some example Sheets for common purposes. The SheetOverlaysManager
+   (below) automatically collects all Sheet instances declared among the 
+   original beans, but others can be added during the crawl via the scripting 
+   interface.
+  -->
+
+<!-- forceRetire: any URI to which this sheet's settings are applied 
+     will force its containing queue to 'retired' status. -->
+<bean id='forceRetire' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='disposition.forceRetire' value='true'/>
+  </map>
+ </property>
+</bean>
+
+<!-- smallBudget: any URI to which this sheet's settings are applied 
+     will give its containing queue small values for balanceReplenishAmount 
+     (causing it to have shorter 'active' periods while other queues are 
+     waiting) and queueTotalBudget (causing the queue to enter 'retired' 
+     status once that expenditure is reached by URI attempts and errors) -->
+<bean id='smallBudget' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='frontier.balanceReplenishAmount' value='20'/>
+   <entry key='frontier.queueTotalBudget' value='100'/>
+  </map>
+ </property>
+</bean>
+
+<!-- veryPolite: any URI to which this sheet's settings are applied 
+     will cause its queue to take extra-long politeness snoozes -->
+<bean id='veryPolite' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='disposition.delayFactor' value='10'/>
+   <entry key='disposition.minDelayMs' value='10000'/>
+   <entry key='disposition.maxDelayMs' value='1000000'/>
+   <entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/>
+  </map>
+ </property>
+</bean>
+
+<!-- highPrecedence: any URI to which this sheet's settings are applied 
+     will give its containing queue a slightly-higher than default 
+     queue precedence value. That queue will then be preferred over 
+     other queues for active crawling, never waiting behind lower-
+     precedence queues. -->
+<bean id='highPrecedence' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='frontier.balanceReplenishAmount' value='20'/>
+   <entry key='frontier.queueTotalBudget' value='100'/>
+  </map>
+ </property>
+</bean>
+
+<!--
+   EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION
+   A SheetAssociation says certain URIs should have certain overlay Sheets
+   applied. This example applies two sheets to URIs matching two SURT-prefixes.
+   New associations may also be added mid-crawl using the scripting facility.
+  -->
+
+<!--
+<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>
+ <property name='surtPrefixes'>
+  <list>
+   <value>http://(org,example,</value>
+   <value>http://(com,example,www,)/</value>
+  </list>
+ </property>
+ <property name='targetSheetNames'>
+  <list>
+   <value>veryPolite</value>
+   <value>highPrecedence</value>
+  </list>
+ </property>
+</bean>
+-->
+
+<!--
+<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>
+ <property name='surtPrefixes'>
+  <list>
+  <value>http://(com,example,</value>
+  <value>http://(org,example,</value>
+  </list>
+ </property>
+ <property name='targetSheetNames'>
+  <list>
+   <value>veryPolite</value>
+   <value>smallBudget</value>
+  </list>
+ </property>
+</bean>
+  -->
+
  <!-- 
    OPTIONAL BUT RECOMMENDED BEANS
   -->

Modified: trunk/heritrix3/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
===================================================================
--- trunk/heritrix3/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml	2010-09-30 19:48:35 UTC (rev 6964)
+++ trunk/heritrix3/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml	2010-09-30 23:01:25 UTC (rev 6965)
@@ -6,8 +6,8 @@
    
    Commented-out beans and properties are provided as an example; values
    shown in comments reflect the actual defaults which are in effect
-   without specification. (To change from the default behavior, 
-   uncomment AND alter the shown values.)   
+   if not otherwise specified. (To change from the default 
+   behavior, uncomment AND alter the shown values.)   
  -->
 <beans xmlns="http://www.springframework.org/schema/beans"
 	     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
@@ -93,7 +93,8 @@
 <!-- <property name='sourceTagSeeds' value='false'/> -->
  </bean>
  
- <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file
+ <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in
+      the job directory, similar to the H1 approach. 
       Use either the above, or this, but not both. -->
  <!-- 
  <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
@@ -117,7 +118,7 @@
     <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->
     <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
      <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
-     <!-- <property name="alsoCheckVia" value="true" /> -->
+     <!-- <property name="alsoCheckVia" value="false" /> -->
      <!-- <property name="surtsSourceFile" value="" /> -->
      <!-- <property name="surtsDumpFile" value="surts.dump" /> -->
     </bean>
@@ -134,7 +135,7 @@
     <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
           <property name="decision" value="REJECT"/>
           <property name="seedsAsSurtPrefixes" value="false"/>
-          <property name="surtsDumpFile" value="negative-surts.dump" />
+          <property name="surtsDumpFile" value="negative-surts.dump" /> 
      <!-- <property name="surtsSourceFile" value="" /> -->
     </bean>
     <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... -->
@@ -168,7 +169,7 @@
    PROCESSING CHAINS
     Much of the crawler's work is specified by the sequential 
     application of swappable Processor modules. These Processors
-    are collected into three 'chains. The CandidateChain is applied 
+    are collected into three 'chains'. The CandidateChain is applied 
     to URIs being considered for inclusion, before a URI is enqueued
     for collection. The FetchChain is applied to URIs when their 
     turn for collection comes up. The DispositionChain is applied 
@@ -176,7 +177,7 @@
   -->
   
  <!-- CANDIDATE CHAIN --> 
- <!-- processors declared as named beans -->
+ <!-- first, processors are declared as top-level named beans -->
  <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
  </bean>
  <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
@@ -195,20 +196,20 @@
         <ref bean="costAssignmentPolicy" />
        </property> -->
  </bean>
- <!-- assembled into ordered CandidateChain bean -->
+ <!-- now, processors are assembled into ordered CandidateChain bean -->
  <bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
   <property name="processors">
    <list>
     <!-- apply scoping rules to each individual candidate URI... -->
     <ref bean="candidateScoper"/>
-    <!-- ...then prepare those ACCEPTed for enqueuing to frontier. -->
+    <!-- ...then prepare those ACCEPTed to be enqueued to frontier. -->
     <ref bean="preparer"/>
    </list>
   </property>
  </bean>
   
  <!-- FETCH CHAIN --> 
- <!-- processors declared as named beans -->
+ <!-- first, processors are declared as top-level named beans -->
  <bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
   <!-- <property name="recheckScope" value="false" /> -->
   <!-- <property name="blockAll" value="false" /> -->
@@ -271,11 +272,11 @@
  </bean>
  <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">
  </bean>    
- <!-- assembled into ordered FetchChain bean -->
+ <!-- now, processors are assembled into ordered FetchChain bean -->
  <bean id="fetchProcessors" class="org.archive.modules.FetchChain">
   <property name="processors">
    <list>
-    <!-- recheck scope, if so enabled... -->
+    <!-- re-check scope, if so enabled... -->
     <ref bean="preselector"/>
     <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->
     <ref bean="preconditions"/>
@@ -283,29 +284,29 @@
     <ref bean="fetchDns"/>
     <!-- ...fetch if HTTP URI... -->
     <ref bean="fetchHttp"/>
-    <!-- ...extract oulinks from HTTP headers... -->
+    <!-- ...extract outlinks from HTTP headers... -->
     <ref bean="extractorHttp"/>
-    <!-- ...extract oulinks from HTML content... -->
+    <!-- ...extract outlinks from HTML content... -->
     <ref bean="extractorHtml"/>
-    <!-- ...extract oulinks from CSS content... -->
+    <!-- ...extract outlinks from CSS content... -->
     <ref bean="extractorCss"/>
-    <!-- ...extract oulinks from Javascript content... -->
+    <!-- ...extract outlinks from Javascript content... -->
     <ref bean="extractorJs"/>
-    <!-- ...extract oulinks from Flash content... -->
+    <!-- ...extract outlinks from Flash content... -->
     <ref bean="extractorSwf"/>
    </list>
   </property>
  </bean>
   
  <!-- DISPOSITION CHAIN -->
- <!-- processors declared as named beans -->
+ <!-- first, processors are declared as top-level named beans  -->
  <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
   <!-- <property name="compress" value="true" /> -->
   <!-- <property name="prefix" value="IAH" /> -->
   <!-- <property name="suffix" value="${HOSTNAME}" /> -->
   <!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
   <!-- <property name="poolMaxActive" value="1" /> -->
-  <!-- <property name="poolMaxWaitMs" value="300000" /> -->
+  <!-- <property name="maxWaitForIdleMs" value="500" /> -->
   <!-- <property name="skipIdenticalDigests" value="false" /> -->
   <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
   <!-- <property name="directory" value="." /> -->
@@ -330,7 +331,7 @@
   <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
   
  </bean>
- <!-- assembled into ordered DispositionChain bean -->
+ <!-- now, processors are assembled into ordered DispositionChain bean -->
  <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
   <property name="processors">
    <list>
@@ -392,6 +393,108 @@
    class="org.archive.crawler.util.BdbUriUniqFilter">
  </bean>
  
+ <!--
+   EXAMPLE SETTINGS OVERLAY SHEETS
+   Sheets allow some settings to vary by context - usually by URI context,
+   so that different sites or sections of sites can be treated differently. 
+   Here are some example Sheets for common purposes. The SheetOverlaysManager
+   (below) automatically collects all Sheet instances declared among the 
+   original beans, but others can be added during the crawl via the scripting 
+   interface.
+  -->
+
+<!-- forceRetire: any URI to which this sheet's settings are applied 
+     will force its containing queue to 'retired' status. -->
+<bean id='forceRetire' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='disposition.forceRetire' value='true'/>
+  </map>
+ </property>
+</bean>
+
+<!-- smallBudget: any URI to which this sheet's settings are applied 
+     will give its containing queue small values for balanceReplenishAmount 
+     (causing it to have shorter 'active' periods while other queues are 
+     waiting) and queueTotalBudget (causing the queue to enter 'retired' 
+     status once that expenditure is reached by URI attempts and errors) -->
+<bean id='smallBudget' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='frontier.balanceReplenishAmount' value='20'/>
+   <entry key='frontier.queueTotalBudget' value='100'/>
+  </map>
+ </property>
+</bean>
+
+<!-- veryPolite: any URI to which this sheet's settings are applied 
+     will cause its queue to take extra-long politeness snoozes -->
+<bean id='veryPolite' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='disposition.delayFactor' value='10'/>
+   <entry key='disposition.minDelayMs' value='10000'/>
+   <entry key='disposition.maxDelayMs' value='1000000'/>
+   <entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/>
+  </map>
+ </property>
+</bean>
+
+<!-- highPrecedence: any URI to which this sheet's settings are applied 
+     will give its containing queue a slightly-higher than default 
+     queue precedence value. That queue will then be preferred over 
+     other queues for active crawling, never waiting behind lower-
+     precedence queues. -->
+<bean id='highPrecedence' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='frontier.balanceReplenishAmount' value='20'/>
+   <entry key='frontier.queueTotalBudget' value='100'/>
+  </map>
+ </property>
+</bean>
+
+<!--
+   EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION
+   A SheetAssociation says certain URIs should have certain overlay Sheets
+   applied. This example applies two sheets to URIs matching two SURT-prefixes.
+   New associations may also be added mid-crawl using the scripting facility.
+  -->
+
+<!--
+<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>
+ <property name='surtPrefixes'>
+  <list>
+   <value>http://(org,example,</value>
+   <value>http://(com,example,www,)/</value>
+  </list>
+ </property>
+ <property name='targetSheetNames'>
+  <list>
+   <value>veryPolite</value>
+   <value>highPrecedence</value>
+  </list>
+ </property>
+</bean>
+-->
+
+<!--
+<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>
+ <property name='surtPrefixes'>
+  <list>
+  <value>http://(com,example,</value>
+  <value>http://(org,example,</value>
+  </list>
+ </property>
+ <property name='targetSheetNames'>
+  <list>
+   <value>veryPolite</value>
+   <value>smallBudget</value>
+  </list>
+ </property>
+</bean>
+  -->
+
  <!-- 
    OPTIONAL BUT RECOMMENDED BEANS
   -->
@@ -429,19 +532,20 @@
  <!--
  <bean id="canonicalizationPolicy" 
    class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy">
-  <property name="rules">
-   <list>
-    <bean class="org.archive.modules.canonicalize.LowercaseRule" />
-    <bean class="org.archive.modules.canonicalize.StripUserinfoRule" />
-    <bean class="org.archive.modules.canonicalize.StripWWWNRule" />
-    <bean class="org.archive.modules.canonicalize.StripSessionIDs" />
-    <bean class="org.archive.modules.canonicalize.StripSessionCFIDs" />
-    <bean class="org.archive.modules.canonicalize.FixupQueryString" />
-   </list>
+   <property name="rules">
+    <list>
+     <bean class="org.archive.modules.canonicalize.LowercaseRule" />
+     <bean class="org.archive.modules.canonicalize.StripUserinfoRule" />
+     <bean class="org.archive.modules.canonicalize.StripWWWNRule" />
+     <bean class="org.archive.modules.canonicalize.StripSessionIDs" />
+     <bean class="org.archive.modules.canonicalize.StripSessionCFIDs" />
+     <bean class="org.archive.modules.canonicalize.FixupQueryString" />
+    </list>
   </property>
  </bean>
  -->
  
+
  <!-- QUEUE ASSIGNMENT POLICY -->
  <!--
  <bean id="queueAssignmentPolicy" 
@@ -473,7 +577,21 @@
  </bean>
  -->
  
+ <!-- DISK SPACE MONITOR: 
+      Pauses the crawl if disk space at monitored paths falls below minimum threshold -->
  <!-- 
+ <bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor">
+   <property name="pauseThresholdMiB" value="500" />
+   <property name="monitorConfigPaths" value="true" />
+   <property name="monitorPaths">
+     <list>
+       <value>PATH</value>
+     </list>
+   </property>
+ </bean>
+ -->
+ 
+ <!-- 
    REQUIRED STANDARD BEANS
     It will be very rare to replace or reconfigure the following beans.
   -->


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.