From: <go...@us...> - 2003-09-19 19:55:33
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/example-crawl In directory sc8-pr-cvs1:/tmp/cvs-serv14623/example-crawl Modified Files: example-order.xml Log Message: example of seed-extension filtering Index: example-order.xml =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/example-crawl/example-order.xml,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** example-order.xml 6 Aug 2003 01:16:28 -0000 1.12 --- example-order.xml 19 Sep 2003 01:37:41 -0000 1.13 *************** *** 12,21 **** <selector class="org.archive.crawler.basic.SimpleSelector"> ! <seeds src="example-seeds.txt" /> ! <filter ! name="www.loc.gov-only" class="org.archive.crawler.util.URIRegExpFilter" ! modifier="not" ! regexp="http://www\.loc\.gov/.*" /> <filter name="pathological-path" --- 12,46 ---- <selector class="org.archive.crawler.basic.SimpleSelector"> ! <seeds> ! # http://my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/ldep ! # http://dmoz.org ! # http://www.yahoo.com ! # http://www.msnbc.com ! # http://www.lycos.com ! # http://www.drudgereport.com ! # http://www.army.mod.uk ! # http://www.dfid.gov.uk ! # http://www.fco.gov.uk ! # http://www.mod.uk ! # http://www.odpm.gov.uk ! # http://www.pm.gov.uk ! # http://www.raf.mod.uk ! # http://www.royal-navy.mod.uk ! # http://www.sabre.mod.uk ! # http://www.archive.org/.. ! # http://www.yahoo.com/../../movies ! # http://www.creativecommons.org/../ ! http://www.royal-navy.mod.uk/rn/form/form.html?page=1 ! http://www.dfid.gov.uk/../../aboutdfid/files/glossary_l.htm ! #http://directory.google.com/Top/Games/ ! # http://www3.google.com/help/customize.html ! ! </seeds> ! <!-- ! <filter ! name="yahoo" class="org.archive.crawler.util.URIRegExpFilter" ! regexp=".*yahoo\.com.*" /> ! --> <filter name="pathological-path" *************** *** 28,31 **** --- 53,77 ---- modifier="not" regexp="[^/]*?//[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?" /> + + <!-- + <filter + name="problemarea" + class="org.archive.crawler.util.URIRegExpFilter" + regexp="http://www\.royal-navy\.mod\.uk/rn/form/form\.html.*" /> + <filter + name="within-8hosts" + class="org.archive.crawler.util.URIRegExpFilter" + regexp="http://www\.((army\.mod\.uk)|(dfid\.gov\.uk)|(fco\.gov\.uk)|(mod\.uk + )|(odpm\.gov\.uk)|(pm\.gov\.uk)|(raf\.mod\.uk)|(royal-navy\.mod\.uk)|(sabre\.mod + \.uk)).*" /> + --> + + <filter + name="focus" + class="org.archive.crawler.util.SeedExtensionFilter" + mode="domain" + /> + + </selector> *************** *** 39,44 **** class="org.archive.crawler.basic.SimplePreconditionEnforcer" next="DNS"> ! <params delay-factor="5" /> ! <params minimum-delay="100" /> </processor> <processor --- 85,89 ---- class="org.archive.crawler.basic.SimplePreconditionEnforcer" next="DNS"> ! <params delay-factor="3" minimum-delay="100" /> </processor> <processor *************** *** 84,91 **** --- 129,138 ---- <compression use="true"/> <arc-files max-size-bytes="20000000"/> + <!-- <filter name="http-only" class="org.archive.crawler.util.URIRegExpFilter" regexp="^http://.*" /> + --> </processor> <processor *************** *** 98,106 **** <!-- actual enforcement of these limits may depend on choice of SSS/processor instances that read and respect these limits --> ! <max-link-depth value="0" /> <!-- zero means crawl seeds only --> ! <max-pages value="1000" /> ! <max-duration value="1h" /> ! <max-resources-per-site value="1000" /> ! <max-toe-threads value="3" /> </limits> --- 145,151 ---- <!-- actual enforcement of these limits may depend on choice of SSS/processor instances that read and respect these limits --> ! <max-link-depth value="100" /> <!-- zero means crawl seeds only --> ! <max-embed-depth value="5" /> <!-- extra hops that can be taken for embeds --> ! <max-toe-threads value="20" /> </limits> |