From: <go...@us...> - 2010-09-30 23:01:36
|
Revision: 6965 http://archive-crawler.svn.sourceforge.net/archive-crawler/?rev=6965&view=rev Author: gojomo Date: 2010-09-30 23:01:25 +0000 (Thu, 30 Sep 2010) Log Message: ----------- [HER-1830] H3: improve sheet examples in default configuration CXML * **/profile-crawler-beans.xml example SurPrefixesSheetAssociation Modified Paths: -------------- trunk/heritrix3/dist/src/main/conf/jobs/profile-defaults/profile-crawler-beans.cxml trunk/heritrix3/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml Modified: trunk/heritrix3/dist/src/main/conf/jobs/profile-defaults/profile-crawler-beans.cxml =================================================================== --- trunk/heritrix3/dist/src/main/conf/jobs/profile-defaults/profile-crawler-beans.cxml 2010-09-30 19:48:35 UTC (rev 6964) +++ trunk/heritrix3/dist/src/main/conf/jobs/profile-defaults/profile-crawler-beans.cxml 2010-09-30 23:01:25 UTC (rev 6965) @@ -6,8 +6,8 @@ Commented-out beans and properties are provided as an example; values shown in comments reflect the actual defaults which are in effect - without specification. (To change from the default behavior, - uncomment AND alter the shown values.) + if not otherwise specified specification. (To change from the default + behavior, uncomment AND alter the shown values.) --> <beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" @@ -93,7 +93,8 @@ <!-- <property name='sourceTagSeeds' value='false'/> --> </bean> - <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file + <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in + the job directory, similar to the H1 approach. Use either the above, or this, but not both. --> <!-- <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule"> @@ -117,7 +118,7 @@ <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... --> <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> <!-- <property name="seedsAsSurtPrefixes" value="true" /> --> - <!-- <property name="alsoCheckVia" value="true" /> --> + <!-- <property name="alsoCheckVia" value="false" /> --> <!-- <property name="surtsSourceFile" value="" /> --> <!-- <property name="surtsDumpFile" value="surts.dump" /> --> </bean> @@ -134,7 +135,7 @@ <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> <property name="decision" value="REJECT"/> <property name="seedsAsSurtPrefixes" value="false"/> - <property name="surtsDumpFile" value="negative-surts.dump" /> + <property name="surtsDumpFile" value="negative-surts.dump" /> <!-- <property name="surtsSourceFile" value="" /> --> </bean> <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... --> @@ -168,7 +169,7 @@ PROCESSING CHAINS Much of the crawler's work is specified by the sequential application of swappable Processor modules. These Processors - are collected into three 'chains. The CandidateChain is applied + are collected into three 'chains'. The CandidateChain is applied to URIs being considered for inclusion, before a URI is enqueued for collection. The FetchChain is applied to URIs when their turn for collection comes up. The DispositionChain is applied @@ -176,7 +177,7 @@ --> <!-- CANDIDATE CHAIN --> - <!-- processors declared as named beans --> + <!-- first, processors are declared as top-level named beans --> <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper"> </bean> <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer"> @@ -195,19 +196,20 @@ <ref bean="costAssignmentPolicy" /> </property> --> </bean> - <!-- assembled into ordered CandidateChain bean --> + <!-- now, processors are assembled into ordered CandidateChain bean --> <bean id="candidateProcessors" class="org.archive.modules.CandidateChain"> <property name="processors"> <list> <!-- apply scoping rules to each individual candidate URI... --> <ref bean="candidateScoper"/> - <!-- ...then prepare those ACCEPTed for enqueuing to frontier. --> + <!-- ...then prepare those ACCEPTed to be enqueued to frontier. --> + <ref bean="preparer"/> </list> </property> </bean> <!-- FETCH CHAIN --> - <!-- processors declared as named beans --> + <!-- first, processors are declared as top-level named beans --> <bean id="preselector" class="org.archive.crawler.prefetch.Preselector"> <!-- <property name="recheckScope" value="false" /> --> <!-- <property name="blockAll" value="false" /> --> @@ -270,11 +272,11 @@ </bean> <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF"> </bean> - <!-- assembled into ordered FetchChain bean --> + <!-- now, processors are assembled into ordered FetchChain bean --> <bean id="fetchProcessors" class="org.archive.modules.FetchChain"> <property name="processors"> <list> - <!-- recheck scope, if so enabled... --> + <!-- re-check scope, if so enabled... --> <ref bean="preselector"/> <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... --> <ref bean="preconditions"/> @@ -282,29 +284,29 @@ <ref bean="fetchDns"/> <!-- ...fetch if HTTP URI... --> <ref bean="fetchHttp"/> - <!-- ...extract oulinks from HTTP headers... --> + <!-- ...extract outlinks from HTTP headers... --> <ref bean="extractorHttp"/> - <!-- ...extract oulinks from HTML content... --> + <!-- ...extract outlinks from HTML content... --> <ref bean="extractorHtml"/> - <!-- ...extract oulinks from CSS content... --> + <!-- ...extract outlinks from CSS content... --> <ref bean="extractorCss"/> - <!-- ...extract oulinks from Javascript content... --> + <!-- ...extract outlinks from Javascript content... --> <ref bean="extractorJs"/> - <!-- ...extract oulinks from Flash content... --> + <!-- ...extract outlinks from Flash content... --> <ref bean="extractorSwf"/> </list> </property> </bean> <!-- DISPOSITION CHAIN --> - <!-- processors declared as named beans --> + <!-- first, processors are declared as top-level named beans --> <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor"> <!-- <property name="compress" value="true" /> --> <!-- <property name="prefix" value="IAH" /> --> <!-- <property name="suffix" value="${HOSTNAME}" /> --> <!-- <property name="maxFileSizeBytes" value="1000000000" /> --> <!-- <property name="poolMaxActive" value="1" /> --> - <!-- <property name="poolMaxWaitMs" value="300000" /> --> + <!-- <property name="MaxWaitForIdleMs" value="500" /> --> <!-- <property name="skipIdenticalDigests" value="false" /> --> <!-- <property name="maxTotalBytesToWrite" value="0" /> --> <!-- <property name="directory" value="." /> --> @@ -329,7 +331,7 @@ <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> --> </bean> - <!-- assembled into ordered DispositionChain bean --> + <!-- now, processors are assembled into ordered DispositionChain bean --> <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain"> <property name="processors"> <list> @@ -391,6 +393,108 @@ class="org.archive.crawler.util.BdbUriUniqFilter"> </bean> + <!-- + EXAMPLE SETTINGS OVERLAY SHEETS + Sheets allow some settings to vary by context - usually by URI context, + so that different sites or sections of sites can be treated differently. + Here are some example Sheets for common purposes. The SheetOverlaysManager + (below) automatically collects all Sheet instances declared among the + original beans, but others can be added during the crawl via the scripting + interface. + --> + +<!-- forceRetire: any URI to which this sheet's settings are applied + will force its containing queue to 'retired' status. --> +<bean id='forceRetire' class='org.archive.spring.Sheet'> + <property name='map'> + <map> + <entry key='disposition.forceRetire' value='true'/> + </map> + </property> +</bean> + +<!-- smallBudget: any URI to which this sheet's settings are applied + will give its containing queue small values for balanceReplenishAmount + (causing it to have shorter 'active' periods while other queues are + waiting) and queueTotalBudget (causing the queue to enter 'retired' + status once that expenditure is reached by URI attempts and errors) --> +<bean id='smallBudget' class='org.archive.spring.Sheet'> + <property name='map'> + <map> + <entry key='frontier.balanceReplenishAmount' value='20'/> + <entry key='frontier.queueTotalBudget' value='100'/> + </map> + </property> +</bean> + +<!-- veryPolite: any URI to which this sheet's settings are applied + will cause its queue to take extra-long politeness snoozes --> +<bean id='veryPolite' class='org.archive.spring.Sheet'> + <property name='map'> + <map> + <entry key='disposition.delayFactor' value='10'/> + <entry key='disposition.minDelayMs' value='10000'/> + <entry key='disposition.maxDelayMs' value='1000000'/> + <entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/> + </map> + </property> +</bean> + +<!-- highPrecedence: any URI to which this sheet's settings are applied + will give its containing queue a slightly-higher than default + queue precedence value. That queue will then be preferred over + other queues for active crawling, never waiting behind lower- + precedence queues. --> +<bean id='highPrecedence' class='org.archive.spring.Sheet'> + <property name='map'> + <map> + <entry key='frontier.balanceReplenishAmount' value='20'/> + <entry key='frontier.queueTotalBudget' value='100'/> + </map> + </property> +</bean> + +<!-- + EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION + A SheetAssociation says certain URIs should have certain overlay Sheets + applied. This example applies two sheets to URIs matching two SURT-prefixes. + New associations may also be added mid-crawl using the scripting facility. + --> + +<!-- +<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'> + <property name='surtPrefixes'> + <list> + <value>http://(org,example,</value> + <value>http://(com,example,www,)/</value> + </list> + </property> + <property name='targetSheetNames'> + <list> + <value>veryPolite</value> + <value>highPrecedence</value> + </list> + </property> +</bean> +--> + +<!-- +<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'> + <property name='surtPrefixes'> + <list> + <value>http://(com,example,</value> + <value>http://(org,example,</value> + </list> + </property> + <property name='targetSheetNames'> + <list> + <value>veryPolite</value> + <value>smallBudget</value> + </list> + </property> +</bean> + --> + <!-- OPTIONAL BUT RECOMMENDED BEANS --> Modified: trunk/heritrix3/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml =================================================================== --- trunk/heritrix3/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml 2010-09-30 19:48:35 UTC (rev 6964) +++ trunk/heritrix3/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml 2010-09-30 23:01:25 UTC (rev 6965) @@ -6,8 +6,8 @@ Commented-out beans and properties are provided as an example; values shown in comments reflect the actual defaults which are in effect - without specification. (To change from the default behavior, - uncomment AND alter the shown values.) + if not otherwise specified specification. (To change from the default + behavior, uncomment AND alter the shown values.) --> <beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" @@ -93,7 +93,8 @@ <!-- <property name='sourceTagSeeds' value='false'/> --> </bean> - <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file + <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in + the job directory, similar to the H1 approach. Use either the above, or this, but not both. --> <!-- <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule"> @@ -117,7 +118,7 @@ <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... --> <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> <!-- <property name="seedsAsSurtPrefixes" value="true" /> --> - <!-- <property name="alsoCheckVia" value="true" /> --> + <!-- <property name="alsoCheckVia" value="false" /> --> <!-- <property name="surtsSourceFile" value="" /> --> <!-- <property name="surtsDumpFile" value="surts.dump" /> --> </bean> @@ -134,7 +135,7 @@ <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> <property name="decision" value="REJECT"/> <property name="seedsAsSurtPrefixes" value="false"/> - <property name="surtsDumpFile" value="negative-surts.dump" /> + <property name="surtsDumpFile" value="negative-surts.dump" /> <!-- <property name="surtsSourceFile" value="" /> --> </bean> <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... --> @@ -168,7 +169,7 @@ PROCESSING CHAINS Much of the crawler's work is specified by the sequential application of swappable Processor modules. These Processors - are collected into three 'chains. The CandidateChain is applied + are collected into three 'chains'. The CandidateChain is applied to URIs being considered for inclusion, before a URI is enqueued for collection. The FetchChain is applied to URIs when their turn for collection comes up. The DispositionChain is applied @@ -176,7 +177,7 @@ --> <!-- CANDIDATE CHAIN --> - <!-- processors declared as named beans --> + <!-- first, processors are declared as top-level named beans --> <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper"> </bean> <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer"> @@ -195,20 +196,20 @@ <ref bean="costAssignmentPolicy" /> </property> --> </bean> - <!-- assembled into ordered CandidateChain bean --> + <!-- now, processors are assembled into ordered CandidateChain bean --> <bean id="candidateProcessors" class="org.archive.modules.CandidateChain"> <property name="processors"> <list> <!-- apply scoping rules to each individual candidate URI... --> <ref bean="candidateScoper"/> - <!-- ...then prepare those ACCEPTed for enqueuing to frontier. --> + <!-- ...then prepare those ACCEPTed to be enqueued to frontier. --> <ref bean="preparer"/> </list> </property> </bean> <!-- FETCH CHAIN --> - <!-- processors declared as named beans --> + <!-- first, processors are declared as top-level named beans --> <bean id="preselector" class="org.archive.crawler.prefetch.Preselector"> <!-- <property name="recheckScope" value="false" /> --> <!-- <property name="blockAll" value="false" /> --> @@ -271,11 +272,11 @@ </bean> <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF"> </bean> - <!-- assembled into ordered FetchChain bean --> + <!-- now, processors are assembled into ordered FetchChain bean --> <bean id="fetchProcessors" class="org.archive.modules.FetchChain"> <property name="processors"> <list> - <!-- recheck scope, if so enabled... --> + <!-- re-check scope, if so enabled... --> <ref bean="preselector"/> <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... --> <ref bean="preconditions"/> @@ -283,29 +284,29 @@ <ref bean="fetchDns"/> <!-- ...fetch if HTTP URI... --> <ref bean="fetchHttp"/> - <!-- ...extract oulinks from HTTP headers... --> + <!-- ...extract outlinks from HTTP headers... --> <ref bean="extractorHttp"/> - <!-- ...extract oulinks from HTML content... --> + <!-- ...extract outlinks from HTML content... --> <ref bean="extractorHtml"/> - <!-- ...extract oulinks from CSS content... --> + <!-- ...extract outlinks from CSS content... --> <ref bean="extractorCss"/> - <!-- ...extract oulinks from Javascript content... --> + <!-- ...extract outlinks from Javascript content... --> <ref bean="extractorJs"/> - <!-- ...extract oulinks from Flash content... --> + <!-- ...extract outlinks from Flash content... --> <ref bean="extractorSwf"/> </list> </property> </bean> <!-- DISPOSITION CHAIN --> - <!-- processors declared as named beans --> + <!-- first, processors are declared as top-level named beans --> <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor"> <!-- <property name="compress" value="true" /> --> <!-- <property name="prefix" value="IAH" /> --> <!-- <property name="suffix" value="${HOSTNAME}" /> --> <!-- <property name="maxFileSizeBytes" value="1000000000" /> --> <!-- <property name="poolMaxActive" value="1" /> --> - <!-- <property name="poolMaxWaitMs" value="300000" /> --> + <!-- <property name="MaxWaitForIdleMs" value="500" /> --> <!-- <property name="skipIdenticalDigests" value="false" /> --> <!-- <property name="maxTotalBytesToWrite" value="0" /> --> <!-- <property name="directory" value="." /> --> @@ -330,7 +331,7 @@ <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> --> </bean> - <!-- assembled into ordered DispositionChain bean --> + <!-- now, processors are assembled into ordered DispositionChain bean --> <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain"> <property name="processors"> <list> @@ -392,6 +393,108 @@ class="org.archive.crawler.util.BdbUriUniqFilter"> </bean> + <!-- + EXAMPLE SETTINGS OVERLAY SHEETS + Sheets allow some settings to vary by context - usually by URI context, + so that different sites or sections of sites can be treated differently. + Here are some example Sheets for common purposes. The SheetOverlaysManager + (below) automatically collects all Sheet instances declared among the + original beans, but others can be added during the crawl via the scripting + interface. + --> + +<!-- forceRetire: any URI to which this sheet's settings are applied + will force its containing queue to 'retired' status. --> +<bean id='forceRetire' class='org.archive.spring.Sheet'> + <property name='map'> + <map> + <entry key='disposition.forceRetire' value='true'/> + </map> + </property> +</bean> + +<!-- smallBudget: any URI to which this sheet's settings are applied + will give its containing queue small values for balanceReplenishAmount + (causing it to have shorter 'active' periods while other queues are + waiting) and queueTotalBudget (causing the queue to enter 'retired' + status once that expenditure is reached by URI attempts and errors) --> +<bean id='smallBudget' class='org.archive.spring.Sheet'> + <property name='map'> + <map> + <entry key='frontier.balanceReplenishAmount' value='20'/> + <entry key='frontier.queueTotalBudget' value='100'/> + </map> + </property> +</bean> + +<!-- veryPolite: any URI to which this sheet's settings are applied + will cause its queue to take extra-long politeness snoozes --> +<bean id='veryPolite' class='org.archive.spring.Sheet'> + <property name='map'> + <map> + <entry key='disposition.delayFactor' value='10'/> + <entry key='disposition.minDelayMs' value='10000'/> + <entry key='disposition.maxDelayMs' value='1000000'/> + <entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/> + </map> + </property> +</bean> + +<!-- highPrecedence: any URI to which this sheet's settings are applied + will give its containing queue a slightly-higher than default + queue precedence value. That queue will then be preferred over + other queues for active crawling, never waiting behind lower- + precedence queues. --> +<bean id='highPrecedence' class='org.archive.spring.Sheet'> + <property name='map'> + <map> + <entry key='frontier.balanceReplenishAmount' value='20'/> + <entry key='frontier.queueTotalBudget' value='100'/> + </map> + </property> +</bean> + +<!-- + EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION + A SheetAssociation says certain URIs should have certain overlay Sheets + applied. This example applies two sheets to URIs matching two SURT-prefixes. + New associations may also be added mid-crawl using the scripting facility. + --> + +<!-- +<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'> + <property name='surtPrefixes'> + <list> + <value>http://(org,example,</value> + <value>http://(com,example,www,)/</value> + </list> + </property> + <property name='targetSheetNames'> + <list> + <value>veryPolite</value> + <value>highPrecedence</value> + </list> + </property> +</bean> +--> + +<!-- +<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'> + <property name='surtPrefixes'> + <list> + <value>http://(com,example,</value> + <value>http://(org,example,</value> + </list> + </property> + <property name='targetSheetNames'> + <list> + <value>veryPolite</value> + <value>smallBudget</value> + </list> + </property> +</bean> + --> + <!-- OPTIONAL BUT RECOMMENDED BEANS --> @@ -429,19 +532,20 @@ <!-- <bean id="canonicalizationPolicy" class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy"> - <property name="rules"> - <list> - <bean class="org.archive.modules.canonicalize.LowercaseRule" /> - <bean class="org.archive.modules.canonicalize.StripUserinfoRule" /> - <bean class="org.archive.modules.canonicalize.StripWWWNRule" /> - <bean class="org.archive.modules.canonicalize.StripSessionIDs" /> - <bean class="org.archive.modules.canonicalize.StripSessionCFIDs" /> - <bean class="org.archive.modules.canonicalize.FixupQueryString" /> - </list> + <property name="rules"> + <list> + <bean class="org.archive.modules.canonicalize.LowercaseRule" /> + <bean class="org.archive.modules.canonicalize.StripUserinfoRule" /> + <bean class="org.archive.modules.canonicalize.StripWWWNRule" /> + <bean class="org.archive.modules.canonicalize.StripSessionIDs" /> + <bean class="org.archive.modules.canonicalize.StripSessionCFIDs" /> + <bean class="org.archive.modules.canonicalize.FixupQueryString" /> + </list> </property> </bean> --> + <!-- QUEUE ASSIGNMENT POLICY --> <!-- <bean id="queueAssignmentPolicy" @@ -473,7 +577,21 @@ </bean> --> + <!-- DISK SPACE MONITOR: + Pauses the crawl if disk space at monitored paths falls below minimum threshold --> <!-- + <bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor"> + <property name="pauseThresholdMiB" value="500" /> + <property name="monitorConfigPaths" value="true" /> + <property name="monitorPaths"> + <list> + <value>PATH</value> + </list> + </property> + </bean> + --> + + <!-- REQUIRED STANDARD BEANS It will be very rare to replace or reconfigure the following beans. --> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |