From: <lor...@us...> - 2013-09-04 09:41:46
|
Revision: 4050 http://sourceforge.net/p/dl-learner/code/4050 Author: lorenz_b Date: 2013-09-04 09:41:41 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Added simple stop word filtering. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java 2013-09-04 09:41:41 UTC (rev 4050) @@ -0,0 +1,56 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.dllearner.algorithms.isle.index.Annotation; + +import com.google.common.base.Charsets; +import com.google.common.io.Files; + +/** + * @author Lorenz Buehmann + * + */ +public class StopWordFilter { + + private Set<String> stopWords; + private static final String stopWordfile = "src/main/resources/stopwords.txt"; + + public StopWordFilter() { + try { + stopWords = new HashSet<String>(Files.readLines(new File(stopWordfile), Charsets.UTF_8)); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public String removeStopWords(String input) { + for (String s : stopWords) { + input = input.replaceAll("\\b" + s + "\\b", ""); + } + return input; + } + + public void removeStopWords(Set<String> words) { + words.removeAll(stopWords); + } + + public void removeStopWordAnnotations(Set<Annotation> annotations) { + for (Iterator<Annotation> iter = annotations.iterator(); iter.hasNext();) { + Annotation annotation = iter.next(); + String content = annotation.getGetReferencedDocument().getContent(); + String token = content.substring(annotation.getOffset(), annotation.getOffset()+annotation.getLength()); + if(stopWords.contains(token)){ + iter.remove(); + } + } + } + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 09:41:35 UTC (rev 4049) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 09:41:41 UTC (rev 4050) @@ -1,22 +1,35 @@ package org.dllearner.algorithms.isle.index; +import java.io.IOException; +import java.io.StringReader; import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.StopWordFilter; + /** * * @author Jens Lehmann * */ public class SimpleLinguisticAnnotator implements LinguisticAnnotator { + + private StopWordFilter stopWordFilter = new StopWordFilter(); @Override public Set<Annotation> annotate(Document document) { String s = document.getRawContent().trim(); + System.out.println("Document:" + s); +// s = stopWordFilter.removeStopWords(s); Set<Annotation> annotations = new HashSet<Annotation>(); - Pattern pattern = Pattern.compile("\\u0020+"); + Pattern pattern = Pattern.compile("(\\u0020)+"); Matcher matcher = pattern.matcher(s); // Check all occurrences int start = 0; @@ -28,7 +41,20 @@ if(start < s.length()-1){ annotations.add(new Annotation(document, start, s.length() - start)); } + stopWordFilter.removeStopWordAnnotations(annotations); return annotations; } + + public static void main(String[] args) throws Exception { + String s = "male person least 1 child"; + Pattern pattern = Pattern.compile("(\\u0020)+"); + Matcher matcher = pattern.matcher(s); + int start = 0; + while (matcher.find()) { + int end = matcher.start(); + System.out.println(s.substring(start, end)); + start = matcher.end(); + } + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |