From: Michael S. <sta...@us...> - 2005-10-21 00:42:27
Update of /cvsroot/archive-access/archive-access/projects/nutch/conf
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9063/conf

Added Files:
	nutch-site.xml.nutchwax
Removed Files:
	nutch-site.xml.all

Log Message:
* maven.xml
  nutch-site.xml.all renamed as nutch-site.xml.nutchwax.
* conf/nutch-site.xml.nutchwax
  Added. Replaces...
* conf/nutch-site.xml.all
  Deleted.

--- nutch-site.xml.all DELETED ---

--- NEW FILE: nutch-site.xml.nutchwax ---
<?xml version="1.0"?>
<!-- Internet Archive Nutch configuration. This configuration is what gets
  built into nutchwax. It overrides a few Nutch defaults and adds
  nutchwax-specific configuration (such options have an 'archive' prefix).
-->
<nutch-conf>

<!-- Enable parse-ext (a parser that calls the external program xpdf to
  parse PDF files). Also enable parse-default and the IA plugins.
-->
<property>
  <name>plugin.includes</name>
  <value>urlfilter-regex|parse-(text|html|ext|default)|index-(basic|ia)|query-(basic|site|url|ia)</value>
</property>

<property>
  <name>db.ignore.internal.links</name>
  <value>false</value>
  <description>Keep all links, not just inter-host links. Database updates
  will be FASTER if this is set to true; the downside is that link text
  from pages on the same site is not included (anchor text from other
  hosts is more valuable). Leave false when indexing a wide variety of
  sites.
  </description>
</property>

<property>
  <name>indexer.boost.by.link.count</name>
  <value>true</value>
  <description>Use in-degree as a poor man's link analysis.</description>
</property>

<property>
  <name>indexer.max.tokens</name>
  <value>100000</value>
  <description>Don't truncate documents as aggressively as the default.
  </description>
</property>

<property>
  <name>http.content.limit</name>
  <value>10000000</value>
</property>

<property>
  <name>io.map.index.skip</name>
  <value>7</value>
  <description>Use less RAM. Index files get read into memory; this setting
  skips seven of every eight index entries, so only a fraction of the index
  is read in at a time. Random access is slower, but less memory is used.
  </description>
</property>

<property>
  <name>indexer.termIndexInterval</name>
  <value>1024</value>
  <description>Determines the fraction of indexed terms that Lucene keeps
  in RAM when searching, to facilitate random access. Smaller values use
  more memory but make searches somewhat faster; larger values use less
  memory but make searches somewhat slower. The Lucene default is 128:
  here an index entry is written every 1024 terms rather than every 128.
  </description>
</property>

<property>
  <name>indexer.maxMergeDocs</name>
  <value>2147483647</value>
  <description>This number determines the maximum number of Lucene
  Documents to be merged into a new Lucene segment. Larger values increase
  indexing speed and reduce the number of Lucene segments, which reduces
  the number of open file handles; however, this also increases RAM usage
  during indexing. Doug says: "There was a bogus value for
  indexer.maxMergeDocs in nutch-default.xml which made indexing really
  slow. The correct value is something really big (like
  Integer.MAX_VALUE)."
  </description>
</property>

<property>
  <name>searcher.summary.context</name>
  <value>20</value>
  <description>The number of context terms to display preceding and
  following matching terms in a hit summary. Makes summaries a little
  longer than the default.
  </description>
</property>

<property>
  <name>searcher.summary.length</name>
  <value>80</value>
  <description>The total number of terms to display in a hit summary.
  </description>
</property>

<property>
  <name>collections.host</name>
  <value>collections.example.org</value>
  <description>The name of the server hosting collections.
  </description>
</property>

<!-- The name of this archive collection. DEPRECATED. search.jsp now uses
  the 'collection' returned in the search result when drawing up the
  wayback URL; at index time, use the command-line 'collection' option.
<property>
  <name>archive.collection</name>
  <value>be05</value>
</property>
-->

<!--
<property>
  <name>searcher.dir</name>
  <value>/home/stack/workspace/nutch-datadir</value>
  <description>Optionally, hardcode the Nutch datadir location rather than
  rely on the tomcat startup location.
  </description>
</property>
-->

<property>
  <name>archive.index.all</name>
  <value>true</value>
  <description>If set to true, all content types are indexed. Otherwise we
  only index text/* and application/*.
  </description>
</property>

<property>
  <name>archive.skip.big.html</name>
  <value>-1</value>
  <description>If a text/html document is larger than this value, skip it
  completely. Use this setting to bypass problematic, massive text/html
  documents (we were seeing the text/html parser hang for hours on bad,
  big HTML docs). The default value of -1 means don't skip any text/html
  docs.</description>
</property>

<property>
  <name>archive.dedup.count.collection</name>
  <value>false</value>
  <description>If true, when deduping, compare collection names in
  addition to URL and content MD5.
  </description>
</property>

</nutch-conf>
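The nutch-conf file above is plain name/value XML. As an illustrative sketch only (Nutch's real configuration loader is its own Java code, not this), the property structure can be read generically like so:

```python
import xml.etree.ElementTree as ET

# A minimal illustration of the nutch-conf format above; this is NOT
# Nutch's own loader, just a sketch of the name/value structure.
SAMPLE = """<?xml version="1.0"?>
<nutch-conf>
  <property>
    <name>indexer.max.tokens</name>
    <value>100000</value>
  </property>
  <property>
    <name>archive.index.all</name>
    <value>true</value>
  </property>
</nutch-conf>"""

def load_props(xml_text):
    """Parse a nutch-conf document into a {name: value} dict."""
    root = ET.fromstring(xml_text)
    return {p.findtext("name"): p.findtext("value")
            for p in root.findall("property")}

props = load_props(SAMPLE)
print(props["indexer.max.tokens"])  # -> 100000
```

Note that all values come back as strings; Nutch itself interprets them as ints or booleans per property.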