<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Recent changes to Documentation</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>Recent changes to Documentation</description><language>en</language><lastBuildDate>Thu, 17 Dec 2015 09:47:45 -0000</lastBuildDate><atom:link href="https://sourceforge.net/p/webcorpus/wiki/Documentation/feed" rel="self" type="application/rss+xml"/><item><title>Documentation modified by Steffen Remus</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v25
+++ v26
@@ -787,7 +787,7 @@
 -----

-# &lt;a name="Shellscript_to_run_the_jobs_40run.sh_41"&gt;&lt;/a&gt; Example Shellscript to run the jobs (run.sh) 
+# &lt;a name="Shellscript_to_run_the_jobs_40run.sh_41"&gt;&lt;/a&gt; Example Shellscript to run the jobs

 &lt;pre&gt;HDFS_DIRECTORY=webcorpus
 JAR=../webcorpus.jar
&lt;/pre&gt;


&lt;/pre&gt;&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Steffen Remus</dc:creator><pubDate>Thu, 17 Dec 2015 09:47:45 -0000</pubDate><guid isPermaLink="false">https://sourceforge.net04875f5b14892858aa9741852ac9f9fcbab65750</guid></item><item><title>Documentation modified by Steffen Remus</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;&lt;/pre&gt;
&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Steffen Remus</dc:creator><pubDate>Thu, 17 Dec 2015 09:47:08 -0000</pubDate><guid isPermaLink="false">https://sourceforge.net9cd393739d75a20da8e971bf36f941c95ce5dbe3</guid></item><item><title>Documentation modified by Steffen Remus</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v23
+++ v24
@@ -744,7 +744,7 @@
 &lt;/pre&gt;

&lt;p&gt;Note that total count is higher than the number of URLs. This means, that a Sentence occurs multiple time in at least one page. &lt;/p&gt;
&lt;p&gt;-&lt;br/&gt;
+&amp;lt;!--&lt;br/&gt;
 # &lt;a name="How_to_run"&gt;&lt;/a&gt; How to run &lt;/p&gt;
&lt;p&gt;@@ -779,10 +779,15 @@&lt;br/&gt;
 *   Edit webcorpus/mycorpus/jobconf.txt according to your requirements. &lt;br/&gt;
 *   &lt;strong&gt;&amp;gt; cd mycorpus&lt;/strong&gt; &lt;br/&gt;
 *   &lt;strong&gt;&amp;gt; sh run.sh&lt;/strong&gt; &lt;br/&gt;
-&lt;br/&gt;
-&lt;br/&gt;
-&lt;br/&gt;
-## &lt;a name="Shellscript_to_run_the_jobs_40run.sh_41"&gt;&lt;/a&gt; Shellscript to run the jobs (run.sh) &lt;br/&gt;
+--&amp;gt;&lt;br/&gt;
+&lt;br/&gt;
+&lt;br/&gt;
+&lt;br/&gt;
+&lt;br/&gt;
+-----&lt;br/&gt;
+&lt;br/&gt;
+&lt;br/&gt;
+# &lt;a name="Shellscript_to_run_the_jobs_40run.sh_41"&gt;&lt;/a&gt; Example Shellscript to run the jobs (run.sh) &lt;/p&gt;
&lt;p&gt;&lt;/p&gt;&lt;pre&gt;HDFS_DIRECTORY=webcorpus&lt;br/&gt;
 JAR=../webcorpus.jar&lt;br/&gt;
&lt;/pre&gt;&lt;p&gt;&lt;/p&gt;&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Steffen Remus</dc:creator><pubDate>Thu, 17 Dec 2015 09:46:38 -0000</pubDate><guid isPermaLink="false">https://sourceforge.net276090de20eb4ebf9ad216756a07d0bd10c5db55</guid></item><item><title>Documentation modified by Johannes Simon</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v22
+++ v23
@@ -491,7 +491,7 @@

 *   Takes optional parameter \[language_name\] (ISO 639-1) to set language to look for. 
-    *   hadoop jar webcorpus.jar webcorpus.hadoopjobs.LanguageJob webcorpus\_data/sentence webcorpus\_data/language [language_name] 
+    *   hadoop jar webcorpus.jar webcorpus.hadoopjobs.LanguageJob webcorpus\_data/sentence webcorpus\_data/language \[language_name\] 
 *   Sentences, where the detected language (lani) matches language\_name, will be labeled with: lang=language\_name, lani=language_name 
 *   Where lani does not match language_name for a sequence of sentences, following rules apply: 
     *   if sequence is at the beginning of a paragraph: lang=lani, lani=lani 
&lt;/pre&gt;
&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Johannes Simon</dc:creator><pubDate>Thu, 20 Mar 2014 21:09:54 -0000</pubDate><guid isPermaLink="false">https://sourceforge.net67b219f8964725cc3f36eb82f8fd26d540df165e</guid></item><item><title>Documentation modified by Johannes Simon</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v21
+++ v22
@@ -404,7 +404,7 @@
 ### &lt;a name="UTF8Job"&gt;&lt;/a&gt; UTF8Job
 Encoding errors can lead to undesirable behaviour in language technology. This job tries to filter at least the obvious appearances of encoding errors.

-*   Labels (*encoding_error:=[true|false]*) or removes documents with defective encoding. 
+*   Labels (*encoding_error:=\[true|false\]*) or removes documents with defective encoding. 
 *   Detects defective encoding just by looking for "�" (unknown glyph). 

@@ -699,9 +699,8 @@

 #### &lt;a name="Data_example_AN9"&gt;&lt;/a&gt; Data example 

-##### &lt;a name="Before_AN9"&gt;&lt;/a&gt; Before  Expects output of 
-
-&lt;a href="/bin/edit/Hiwi/LanguageJob?topicparent=Hiwi.UKPWebCorpus" rel="nofollow" title="Create this topic"&gt;LanguageJob&lt;/a&gt; as input. 
+##### &lt;a name="Before_AN9"&gt;&lt;/a&gt; Before
+Expects output of &lt;a href="/bin/edit/Hiwi/LanguageJob?topicparent=Hiwi.UKPWebCorpus" rel="nofollow" title="Create this topic"&gt;LanguageJob&lt;/a&gt; as input. 
 ##### &lt;a name="Call_AN9"&gt;&lt;/a&gt; Call 

 &lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.SentenceExtractJob webcorpus_data/language webcorpus_data/sentenceExtract
@@ -730,9 +729,8 @@

 #### &lt;a name="Data_example_AN10"&gt;&lt;/a&gt; Data example 

-##### &lt;a name="Before_AN10"&gt;&lt;/a&gt; Before  Expects output of 
-
-&lt;a href="/bin/edit/Hiwi/SentenceExtractJob?topicparent=Hiwi.UKPWebCorpus" rel="nofollow" title="Create this topic"&gt;SentenceExtractJob&lt;/a&gt; as input. 
+##### &lt;a name="Before_AN10"&gt;&lt;/a&gt; Before
+Expects output of &lt;a href="/bin/edit/Hiwi/SentenceExtractJob?topicparent=Hiwi.UKPWebCorpus" rel="nofollow" title="Create this topic"&gt;SentenceExtractJob&lt;/a&gt; as input. 
 ##### &lt;a name="Call_AN10"&gt;&lt;/a&gt; Call 

 &lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.SentenceExtractCompactJob webcorpus_data/sentenceExtract webcorpus_data/sentenceExtractCompact
&lt;/pre&gt;
&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Johannes Simon</dc:creator><pubDate>Thu, 20 Mar 2014 21:08:34 -0000</pubDate><guid isPermaLink="false">https://sourceforge.netf22c5b5253d1bd50708979a973d81b0cf114afa6</guid></item><item><title>Documentation modified by Johannes</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v20
+++ v21
@@ -145,7 +145,7 @@

     &lt;tr&gt;
       &lt;td&gt;
-        &lt;a href="#UIMAJob"&gt;UIMAJob&lt;/a&gt; (n)
+        &lt;a href="#SentenceAnnotateJob"&gt;SentenceAnnotateJob&lt;/a&gt; (n)
       &lt;/td&gt;

       &lt;td&gt;
@@ -529,14 +529,14 @@
 &lt;/td&gt;&lt;/tr&gt;&lt;/pre&gt;
&lt;p&gt;The second sentence is considered to be an english sentence, as it is following an english sentence and is short enough. The last sentence is removed. &lt;/p&gt;
&lt;p&gt;-### &lt;a name="UIMAJob"&gt;&lt;/a&gt; UIMAJob&lt;br /&gt;
+### &lt;a name="SentenceAnnotateJob"&gt;&lt;/a&gt; SentenceAnnotateJob&lt;/p&gt;
&lt;p&gt;Runs arbitrary UIMA components on deduplified sentences.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Counts n-grams based on tokens. &lt;/li&gt;
&lt;li&gt;Expects output of SentenceExtractCompactJob as input&lt;/li&gt;
&lt;li&gt;Writes one XML-serialized CAS per line. Compresses each CAS with GZip.&lt;br /&gt;
-&lt;em&gt;   hadoop jar webcorpus.jar webcorpus.hadoopjobs.UIMAJob webcorpus_data/sentenceExtractCompact webcorpus_data/uima&lt;br /&gt;
+&lt;/em&gt;   hadoop jar webcorpus.jar webcorpus.hadoopjobs.SentenceAnnotateJob webcorpus_data/sentenceExtractCompact webcorpus_data/sentenceAnnotate&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;@@ -556,7 +556,7 @@&lt;/p&gt;
&lt;p&gt;##### &lt;a name="Before_AN7"&gt;&lt;/a&gt; Before &lt;/p&gt;
&lt;p&gt;-Expects the output of &lt;a href="#UIMAJob"&gt;UIMAJob&lt;/a&gt; as input. &lt;br /&gt;
+Expects the output of &lt;a href="#SentenceAnnotateJob"&gt;SentenceAnnotateJob&lt;/a&gt; as input. &lt;br /&gt;
 ##### &lt;a name="Call_AN7"&gt;&lt;/a&gt; Call &lt;/p&gt;
&lt;p&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramCountJob webcorpus_data/token webcorpus_data/ngram -n 3&lt;br /&gt;
@@ -589,7 +589,7 @@&lt;/pre&gt;&lt;/p&gt;
&lt;p&gt;##### &lt;a name="Before_AN7"&gt;&lt;/a&gt; Before &lt;/p&gt;
&lt;p&gt;-Expects the output of &lt;a href="#UIMAJob"&gt;UIMAJob&lt;/a&gt; as input. &lt;br /&gt;
+Expects the output of &lt;a href="#SentenceAnnotateJob"&gt;SentenceAnnotateJob&lt;/a&gt; as input. &lt;br /&gt;
 ##### &lt;a name="Call_AN7"&gt;&lt;/a&gt; Call &lt;/p&gt;
&lt;p&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramWithPOSCountJob webcorpus_data/token webcorpus_data/ngram-with-pos -n 3&lt;br /&gt;
@@ -627,7 +627,7 @@&lt;/pre&gt;&lt;/p&gt;
&lt;p&gt;##### &lt;a name="Before_AN7"&gt;&lt;/a&gt; Before &lt;/p&gt;
&lt;p&gt;-Expects the output of &lt;a href="#UIMAJob"&gt;UIMAJob&lt;/a&gt; as input. &lt;br /&gt;
+Expects the output of &lt;a href="#SentenceAnnotateJob"&gt;SentenceAnnotateJob&lt;/a&gt; as input. &lt;br /&gt;
 ##### &lt;a name="Call_AN7"&gt;&lt;/a&gt; Call &lt;/p&gt;
&lt;p&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.POSNGramCountJob webcorpus_data/token webcorpus_data/pos-ngram -n 3&lt;br /&gt;
&lt;/pre&gt;&lt;/p&gt;&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Johannes</dc:creator><pubDate>Wed, 03 Jul 2013 14:17:06 -0000</pubDate><guid isPermaLink="false">https://sourceforge.net0635753d9b47aebb434901a7ae02c0d4f1c405c9</guid></item><item><title>Documentation modified by Johannes</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v19
+++ v20
@@ -300,7 +300,12 @@
 *   Detects duplicates by same URL, same length and same content in first and last n characters. 
 *   Output one document per line. Format (replace #TAB# with tab): **URL#TAB#sm#TAB#pm#TAB#document**

-
+#### &lt;a name="Configuration"&gt;&lt;/a&gt; Configuration
+There's two configuration options that are of special relevance here:
+
+To only read documents from the archive that are of a specific mime type, you can turn on mime-type filtering using
+&lt;code&gt;conf.set("webcorpus.common.io.warcinputformat.filter-mimetypes", true)&lt;/code&gt;. The default value is &lt;code&gt;false&lt;/code&gt;.
+To specify a list of mime-types to be read, use &lt;code&gt;conf.set("webcorpus.documentjob.content-type-whitelist", "type1, type2, ...")&lt;/code&gt;, e.g. "text/html" to keep only HTML documents.

 #### &lt;a name="Data_example"&gt;&lt;/a&gt; Data example 

&lt;/pre&gt;
&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Johannes</dc:creator><pubDate>Wed, 03 Jul 2013 09:02:19 -0000</pubDate><guid isPermaLink="false">https://sourceforge.netf4d2ae22471408ed3d603b693f0a5cef717682e4</guid></item><item><title>Documentation modified by Johannes</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v18
+++ v19
@@ -109,7 +109,7 @@

     &lt;tr&gt;
       &lt;td&gt;
-        &lt;a href="#SentenceJob"&gt;SentenceJob&lt;/a&gt; (--lang=&amp;lt;language&amp;gt;)
+        &lt;a href="#SentenceJob"&gt;SentenceJob&lt;/a&gt; (language)
       &lt;/td&gt;

       &lt;td&gt;
@@ -127,7 +127,7 @@

     &lt;tr&gt;
       &lt;td&gt;
-        &lt;a href="#LanguageJob"&gt;LanguageJob&lt;/a&gt; (--lang=&amp;lt;language&amp;gt;)
+        &lt;a href="#LanguageJob"&gt;LanguageJob&lt;/a&gt; (language)
       &lt;/td&gt;

       &lt;td&gt;
&lt;/td&gt;&lt;/tr&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/pre&gt;
&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Johannes</dc:creator><pubDate>Fri, 14 Jun 2013 19:53:32 -0000</pubDate><guid isPermaLink="false">https://sourceforge.netd3a9e91476c4a84eb852be95a605c90de47a73de</guid></item><item><title>Documentation modified by Johannes</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v17
+++ v18
@@ -318,7 +318,7 @@

 ##### &lt;a name="Call"&gt;&lt;/a&gt; Call 

-&lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.DocumentJob webcorpus_data/raw webcorpus_data/document &amp;lt;format&amp;gt;
+&lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.DocumentJob webcorpus_data/raw webcorpus_data/document --input-format &amp;lt;format&amp;gt;
 &lt;/pre&gt;

 Here, &lt;code&gt;format&lt;/code&gt; must be one of "warc", "arc" or "leipzig".
@@ -468,7 +468,7 @@

 ##### &lt;a name="Call_AN4"&gt;&lt;/a&gt; Call 

-&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.SentenceJob webcorpus_data/utf8 webcorpus_data/sentence --lang=en
+&lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.SentenceJob webcorpus_data/utf8 webcorpus_data/sentence --lang en
 &lt;/pre&gt;

 ##### &lt;a name="After_AN4"&gt;&lt;/a&gt; After 
@@ -514,7 +514,7 @@

 ##### &lt;a name="Call_AN5"&gt;&lt;/a&gt; Call 

-&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.LanguageJob webcorpus_data/sentence webcorpus_data/language en
+&lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.LanguageJob webcorpus_data/sentence webcorpus_data/language --lang en
 &lt;/pre&gt;

 ##### &lt;a name="After_AN5"&gt;&lt;/a&gt; After 
@@ -543,7 +543,7 @@

 *   Counts n-grams based on tokens. 
 *   Expects a parameter to set n: 
-    *   hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramCountJob webcorpus\_data/token webcorpus\_data/ngram \[n\] 
+    *   hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramCountJob webcorpus\_data/token webcorpus\_data/ngram -n &amp;lt;n&amp;gt; 

@@ -554,7 +554,7 @@
 Expects the output of &lt;a href="#UIMAJob"&gt;UIMAJob&lt;/a&gt; as input. 
 ##### &lt;a name="Call_AN7"&gt;&lt;/a&gt; Call 

-&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramCountJob webcorpus_data/token webcorpus_data/ngram 3
+&lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramCountJob webcorpus_data/token webcorpus_data/ngram -n 3

 &lt;/pre&gt;
&lt;p&gt;Calculate some 3-grams. &lt;/p&gt;
&lt;p&gt;@@ -576,7 +576,7 @@&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Counts n-grams based on tokens. &lt;/li&gt;
&lt;li&gt;Expects a parameter to set n: &lt;/li&gt;
&lt;li&gt;
&lt;ul&gt;
&lt;li&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramWithPOSCountJob webcorpus_data/token webcorpus_data/ngram [n] &lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;ul&gt;
&lt;li&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramWithPOSCountJob webcorpus_data/token webcorpus_data/ngram-with-pos -n &amp;lt;n&amp;lt; &lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;@@ -587,7 +587,7 @@&lt;br /&gt;
 Expects the output of &lt;a href="#UIMAJob"&gt;UIMAJob&lt;/a&gt; as input. &lt;br /&gt;
 ##### &lt;a name="Call_AN7"&gt;&lt;/a&gt; Call &lt;/p&gt;
&lt;p&gt;-&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramWithPOSCountJob webcorpus_data/token webcorpus_data/ngram 3&lt;br /&gt;
+&lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.NGramWithPOSCountJob webcorpus_data/token webcorpus_data/ngram-with-pos -n 3&lt;/pre&gt;&lt;/p&gt;
&lt;p&gt; Calculate some 3-grams. &lt;/p&gt;
&lt;p&gt;@@ -614,7 +614,7 @@&lt;br /&gt;
 ...&lt;br /&gt;
&lt;br /&gt;
 *   Expects a parameter to set n: &lt;br /&gt;
-    *   hadoop jar webcorpus.jar webcorpus.hadoopjobs.POSNGramCountJob webcorpus_data/token webcorpus_data/ngram [n] &lt;br /&gt;
+    *   hadoop jar webcorpus.jar webcorpus.hadoopjobs.POSNGramCountJob webcorpus_data/token webcorpus_data/pos-ngram -n &amp;lt;n&amp;lt; &lt;/p&gt;
&lt;p&gt;@@ -625,7 +625,7 @@&lt;br /&gt;
 Expects the output of &lt;a href="#UIMAJob"&gt;UIMAJob&lt;/a&gt; as input. &lt;br /&gt;
 ##### &lt;a name="Call_AN7"&gt;&lt;/a&gt; Call &lt;/p&gt;
&lt;p&gt;-&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.POSNGramCountJob webcorpus_data/token webcorpus_data/ngram 3&lt;br /&gt;
+&lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.POSNGramCountJob webcorpus_data/token webcorpus_data/pos-ngram -n 3&lt;/pre&gt;&lt;/p&gt;
&lt;p&gt; Calculate some 3-grams. &lt;/p&gt;
&lt;p&gt;@@ -649,7 +649,7 @@&lt;br /&gt;
 *   Counts cooccurrences based on tokens with distance up to n. &lt;br /&gt;
 *   Outputs all cooccurrences for all distances (1, 2, ..., n) at once. &lt;br /&gt;
 *   Expects a parameter to set n: &lt;br /&gt;
-    *   hadoop jar webcorpus.jar webcorpus.hadoopjobs.CooccurrenceJob webcorpus_data/token webcorpus_data/cooccurrence [n] &lt;br /&gt;
+    *   hadoop jar webcorpus.jar webcorpus.hadoopjobs.CooccurrenceJob webcorpus_data/token webcorpus_data/cooccurrence -n &amp;lt;n&amp;lt;&lt;/p&gt;
&lt;p&gt;@@ -660,7 +660,7 @@&lt;br /&gt;
&lt;a href="/bin/edit/Hiwi/CooccurrenceJob?topicparent=Hiwi.UKPWebCorpus" rel="nofollow" title="Create this topic"&gt;CooccurrenceJob&lt;/a&gt; expects the output of &lt;a href="/bin/edit/Hiwi/TokenJob?topicparent=Hiwi.UKPWebCorpus" rel="nofollow" title="Create this topic"&gt;TokenJob&lt;/a&gt; as input. &lt;br /&gt;
 ##### &lt;a name="Call_AN8"&gt;&lt;/a&gt; Call &lt;/p&gt;
&lt;p&gt;-&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.CooccurrenceJob webcorpus_data/token webcorpus_data/cooccurrence 5&lt;br /&gt;
+&lt;/pre&gt;&lt;pre&gt;hadoop jar webcorpus.jar webcorpus.hadoopjobs.CooccurrenceJob webcorpus_data/token webcorpus_data/cooccurrence -n 5&lt;br /&gt;
 &lt;/pre&gt; Calculate cooccurrences up to distance 5. &lt;/p&gt;
&lt;p&gt;##### &lt;a name="After_AN8"&gt;&lt;/a&gt; After &lt;br /&gt;
@@ -790,15 +790,15 @@&lt;br /&gt;
 hadoop jar ${JAR} webcorpus.hadoopjobs.DeduplicationJob ${HDFS_DIRECTORY}/document ${HDFS_DIRECTORY}/deduplication&lt;br /&gt;
 hadoop jar ${JAR} webcorpus.hadoopjobs.DeduplicationByHostJob ${HDFS_DIRECTORY}/deduplication ${HDFS_DIRECTORY}/deduplicationByHost&lt;br /&gt;
 hadoop jar ${JAR} webcorpus.hadoopjobs.UTF8Job ${HDFS_DIRECTORY}/deduplicationByHost ${HDFS_DIRECTORY}/utf8&lt;br /&gt;
-hadoop jar ${JAR} webcorpus.hadoopjobs.SentenceJob ${HDFS_DIRECTORY}/utf8 ${HDFS_DIRECTORY}/sentence&lt;br /&gt;
-hadoop jar ${JAR} webcorpus.hadoopjobs.LanguageJob ${HDFS_DIRECTORY}/sentence ${HDFS_DIRECTORY}/language ${LANGUAGE}&lt;br /&gt;
+hadoop jar ${JAR} webcorpus.hadoopjobs.SentenceJob ${HDFS_DIRECTORY}/utf8 ${HDFS_DIRECTORY}/sentence --lang en&lt;br /&gt;
+hadoop jar ${JAR} webcorpus.hadoopjobs.LanguageJob ${HDFS_DIRECTORY}/sentence --lang en ${HDFS_DIRECTORY}/language ${LANGUAGE}&lt;br /&gt;
 hadoop jar ${JAR} webcorpus.hadoopjobs.TokenJob ${HDFS_DIRECTORY}/language ${HDFS_DIRECTORY}/token&lt;br /&gt;
-hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/1gram 1&lt;br /&gt;
-hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/2gram 2&lt;br /&gt;
-hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/3gram 3&lt;br /&gt;
-hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/4gram 4&lt;br /&gt;
-hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/5gram 5 &lt;br /&gt;
-hadoop jar ${JAR} webcorpus.hadoopjobs.CooccurrenceJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/cooccurrence 5&lt;br /&gt;
+hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/1gram -n 1&lt;br /&gt;
+hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/2gram -n 2&lt;br /&gt;
+hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/3gram -n 3&lt;br /&gt;
+hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/4gram -n 4&lt;br /&gt;
+hadoop jar ${JAR} webcorpus.hadoopjobs.NGramCountJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/5gram -n 5 &lt;br /&gt;
+hadoop jar ${JAR} webcorpus.hadoopjobs.CooccurrenceJob ${HDFS_DIRECTORY}/token ${HDFS_DIRECTORY}/cooccurrence -n 5&lt;br /&gt;
 hadoop jar ${JAR} webcorpus.hadoopjobs.SentenceExtractJob ${HDFS_DIRECTORY}/language ${HDFS_DIRECTORY}/sentenceExtract&lt;br /&gt;
 hadoop jar ${JAR} webcorpus.hadoopjobs.SentenceExtractCompactJob ${HDFS_DIRECTORY}/sentenceExtract ${HDFS_DIRECTORY}/sentenceExtractCompact&lt;br /&gt;
&lt;br /&gt;
@@ -845,8 +845,8 @@&lt;br /&gt;
 dedupBloomVectorSize=1024&lt;/p&gt;
&lt;p&gt;# Bloom filter: number of hashes to consider&lt;br /&gt;
-# int - default: 1024&lt;br /&gt;
-dedupBloomNbHash=1024&lt;br /&gt;
+# int - default: 7&lt;br /&gt;
+dedupBloomNbHash=7&lt;/p&gt;
&lt;p&gt;# Deduplication Bloom Filter hash function&lt;br /&gt;
 # {murmur, jenkins} - default: jenkins&lt;br /&gt;
&lt;/p&gt;&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Johannes</dc:creator><pubDate>Fri, 14 Jun 2013 19:52:24 -0000</pubDate><guid isPermaLink="false">https://sourceforge.neta679bbe22ecedba32af58d31b450160aa47075e9</guid></item><item><title>Documentation modified by Johannes</title><link>https://sourceforge.net/p/webcorpus/wiki/Documentation/</link><description>&lt;div class="markdown_content"&gt;&lt;pre&gt;--- v16
+++ v17
@@ -109,7 +109,7 @@

     &lt;tr&gt;
       &lt;td&gt;
-        &lt;a href="#SentenceJob"&gt;SentenceJob (--lang=en|de)&lt;/a&gt;
+        &lt;a href="#SentenceJob"&gt;SentenceJob&lt;/a&gt; (--lang=&amp;lt;language&amp;gt;)
       &lt;/td&gt;

       &lt;td&gt;
@@ -121,13 +121,13 @@
       &lt;/td&gt;

       &lt;td&gt;
-        Sentences are wrapped with XML-s-tags. If possible, a language-specific sentence segmentation model is used.
-      &lt;/td&gt;
-    &lt;/tr&gt;
-    
-    &lt;tr&gt;
-      &lt;td&gt;
-        &lt;a href="#LanguageJob"&gt;LanguageJob&lt;/a&gt; (lang)
+        Sentences are wrapped with XML-s-tags. If possible, a language-specific sentence segmentation model is used. For the language, use its two-letter ISO 639-2 code.
+      &lt;/td&gt;
+    &lt;/tr&gt;
+    
+    &lt;tr&gt;
+      &lt;td&gt;
+        &lt;a href="#LanguageJob"&gt;LanguageJob&lt;/a&gt; (--lang=&amp;lt;language&amp;gt;)
       &lt;/td&gt;

       &lt;td&gt;
@@ -235,15 +235,15 @@

     &lt;tr&gt;
       &lt;td&gt;
-        &lt;a href="#SentenceExtractJob"&gt;SentenceExtractJob&lt;/a&gt; (lang)
-      &lt;/td&gt;
-      
-      &lt;td&gt;
-        One document per line with sentence annotation (and language annotation). Parameter giving expected language.
-      &lt;/td&gt;
-      
-      &lt;td&gt;
-        Extract sentences with expected language and maximum lenght of 512 characters.
+        &lt;a href="#SentenceExtractJob"&gt;SentenceExtractJob&lt;/a&gt;
+      &lt;/td&gt;
+      
+      &lt;td&gt;
+        One document per line with sentence annotation (and language annotation).
+      &lt;/td&gt;
+      
+      &lt;td&gt;
+        Extract sentences with expected language (specified in LanguageJob run) and maximum length of 512 characters.
       &lt;/td&gt;

       &lt;td&gt;
&lt;/td&gt;&lt;/tr&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/pre&gt;
&lt;/div&gt;</description><dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Johannes</dc:creator><pubDate>Thu, 13 Jun 2013 16:06:27 -0000</pubDate><guid isPermaLink="false">https://sourceforge.nete5f2dc9ea4b4709bf49e252cee06cb58835af0d4</guid></item></channel></rss>