From: Doug C. <cu...@us...> - 2005-10-20 23:30:57
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/conf In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27408/conf Modified Files: Tag: mapred nutch-site.xml Log Message: Pre-au fixes. Index: nutch-site.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/conf/nutch-site.xml,v retrieving revision 1.24.2.3 retrieving revision 1.24.2.4 diff -C2 -d -r1.24.2.3 -r1.24.2.4 *** nutch-site.xml 1 Sep 2005 18:45:29 -0000 1.24.2.3 --- nutch-site.xml 20 Oct 2005 23:30:48 -0000 1.24.2.4 *************** *** 9,83 **** <!-- <property> --> <!-- <name>fs.default.name</name> --> ! <!-- <value>ia109102:8009</value> --> <!-- </property> --> - <property> - <name>ndfs.name.dir</name> - <value>/0/nutch/ndfs/names</value> - </property> - - <property> - <name>ndfs.data.dir</name> - <value>/0/nutch/ndfs/doug,/1/nutch/ndfs/doug</value> - </property> - - <property> - <name>ndfs.replication</name> - <value>2</value> - </property> - <!-- MapReduce --> <!-- <property> --> <!-- <name>mapred.job.tracker</name> --> ! <!-- <value>ia109102:8010</value> --> ! <!-- </property> --> ! ! <!-- <property> --> ! <!-- <name>mapred.job.tracker.info.port</name> --> ! <!-- <value>7846</value> --> ! <!-- </property> --> ! ! <!-- <property> --> ! <!-- <name>mapred.local.dir</name> --> ! <!-- <value>/0/nutch/mapred/local</value> --> ! <!-- </property> --> ! ! <!-- <property> --> ! <!-- <name>mapred.system.dir</name> --> ! <!-- <value>/mapred/system</value> --> ! <!-- </property> --> ! ! <!-- <property> --> ! <!-- <name>mapred.task.timeout</name> --> ! <!-- <value>3600000</value> --> <!-- </property> --> <!-- Override a few Nutch defaults --> - - <!-- Enable parse-ext (parse-ext is a parser that calls the 'ext'ernal program - xpdf to parse pdf files. Also enable parse-default and the ia plugins. - --> <property> ! <name>plugin.includes</name> ! <value>urlfilter-regex|parse-(text|html|ext|default)|index-(basic|ia)|query-(basic|site|url|ia)</value> </property> ! <!-- keep all links, not just inter-host --> ! <!-- db updates will be FASTER if set to true. ! Downside is that link text from same site won't be included. ! (More valuable to take anchor text from other hosts). Use this ! if wide variety of sites to index. ! --> <property> ! <name>db.ignore.internal.links</name> ! <value>false</value> </property> - <!-- use in-degree as poor-man's link analysis --> <property> ! <name>indexer.boost.by.link.count</name> ! <value>true</value> </property> --- 9,38 ---- <!-- <property> --> <!-- <name>fs.default.name</name> --> ! <!-- <value>ia109102.archive.org:8009</value> --> <!-- </property> --> <!-- MapReduce --> <!-- <property> --> <!-- <name>mapred.job.tracker</name> --> ! <!-- <value>ia109102.archive.org:8010</value> --> <!-- </property> --> <!-- Override a few Nutch defaults --> <property> ! <name>archive.collection</name> ! <value>au</value> </property> ! <!-- the name of the archive server hosting this archive --> <property> ! <name>archive.host</name> ! <value>crawls.archive.org</value> </property> <property> ! <name>plugin.includes</name> ! <value>urlfilter-regex|parse-(text|html|js|ext)|index-(basic|ia)|query-(basic|site|url|ia)</value> </property> *************** *** 132,160 **** </property> - <!-- the name of the archive server hosting this archive --> - <property> - <name>archive.host</name> - <value>crawls.archive.org</value> - </property> - - <!-- The name of this archive collection. - DEPRECATED. Now search.jsp uses the 'collection' returned by the search - result drawing up the wayback URL and at index time, use the - command-line 'collection' option. - - <property> - <name>archive.collection</name> - <value>be05</value> - </property> - --> - - <!--Optionally, hardcode the nutch datadir location rather - than rely on tomcat startup location. - <property> - <name>searcher.dir</name> - <value>/home/stack/workspace/nutch-datadir</value> - </property> - --> - <!--If set to true, all contenttypes are indexed. Otherwise we only index text/* and application/* --- 87,90 ---- *************** *** 164,166 **** --- 94,97 ---- <value>false</value> </property> + </nutch-conf> |