| Name | Modified | Size | Downloads / Week | 
|---|---|---|---|
| Parent folder | | | |
| SingleLineTokens.py | 2012-07-07 | 273 Bytes | |
| Lowercase.py | 2012-07-07 | 156 Bytes | |
| README | 2012-07-07 | 557 Bytes | |
| WikiExtractor.py | 2012-07-05 | 21.3 kB | |
| TrainPunkt.py | 2012-07-05 | 1.6 kB | |
| RawFilter.py | 2012-07-05 | 1.3 kB | |
| Punkt.py | 2012-07-05 | 1.1 kB | |
| Totals: 7 Items | | 26.3 kB | 0 |
# Build the gl corpus files from the glwiki 2012-06-25 pages-articles dump.
# Run from the directory that contains the helper scripts (WikiExtractor.py,
# RawFilter.py, TrainPunkt.py, Punkt.py, SingleLineTokens.py, Lowercase.py);
# every path below is relative to it.

# Fetch the dump; -c resumes a partially downloaded file.
wget -c http://dumps.wikimedia.org/glwiki/20120625/glwiki-20120625-pages-articles.xml.bz2
# Create the extraction directory and the output directory up front, so the
# redirections into corpus/ below cannot fail on a missing directory.
mkdir -p glwiki corpus
# Decompress on the fly and hand the XML to WikiExtractor.py, with glwiki as
# its -o target.
bzcat glwiki-20120625-pages-articles.xml.bz2 | ./WikiExtractor.py -o glwiki
# Concatenate everything found under glwiki/ into one raw file. -type f keeps
# cat away from directories; '{} +' batches files into as few cat calls as
# possible while keeping find's traversal order.
find glwiki -type f -exec cat {} + > corpus/gl.raw
rm -rf glwiki
# gl.raw -> gl.pars through RawFilter.py.
./RawFilter.py < corpus/gl.raw > corpus/gl.pars
# TrainPunkt.py reads gl.pars; its output is stored as gl.punkt, which is then
# passed to Punkt.py as an argument while gl.pars is fed on stdin again.
./TrainPunkt.py < corpus/gl.pars > corpus/gl.punkt
./Punkt.py corpus/gl.punkt < corpus/gl.pars > corpus/gl.sent
# Compress the corpus files inside a subshell so the working directory of the
# remaining commands does not change (the relative paths below depend on it).
(cd corpus && bzip2 gl.*)
# NOTE(review): no step above creates corpus/gl.tokens - presumably it comes
# from a separate tokenisation step; confirm. If it already exists when bzip2
# runs, 'gl.*' compresses it too and it must be decompressed before this line.
./SingleLineTokens.py < corpus/gl.tokens | ./Lowercase.py | /usr/local/irstlm/add-start-end.sh > corpus/gl-20120625.tokens