From: <dfl...@us...> - 2013-10-24 13:40:09
|
Revision: 4127 http://sourceforge.net/p/dl-learner/code/4127 Author: dfleischhacker Date: 2013-10-24 13:40:06 +0000 (Thu, 24 Oct 2013) Log Message: ----------- Fix ContextExtractors * Use processed content instead of raw since annotations link to the former * Fix bug occurring for tokens at index 0 Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-10-22 14:08:14 UTC (rev 4126) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-10-24 13:40:06 UTC (rev 4127) @@ -3,12 +3,6 @@ */ package org.dllearner.algorithms.isle.wsd; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -import org.dllearner.algorithms.isle.index.TextDocument; - import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -16,7 +10,12 @@ import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; +import org.dllearner.algorithms.isle.index.TextDocument; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + /** * @author Lorenz Buehmann * @@ -37,14 +36,14 @@ @Override public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { //split text into sentences - List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); - + 
List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent()); + //find the sentence containing the token of the annotation int tokenStart = annotation.getOffset(); int index = 0; for (CoreMap sentence : sentences) { String s = sentence.toString(); - if (index < tokenStart && s.length() > tokenStart) { + if (index <= tokenStart && s.length() > tokenStart) { List<String> context = new ArrayList<String>(); for (CoreLabel label : sentence.get(TokensAnnotation.class)) { // this is the text of the token
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-10-22 14:08:14 UTC (rev 4126) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-10-24 13:40:06 UTC (rev 4127) @@ -3,12 +3,6 @@ */ package org.dllearner.algorithms.isle.wsd; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -import org.dllearner.algorithms.isle.index.TextDocument; - import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -16,7 +10,12 @@ import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; +import org.dllearner.algorithms.isle.index.TextDocument; + import java.util.ArrayList; import java.util.List; import java.util.Properties; + /** * @author Lorenz Buehmann * */ @@ -43,14 +42,14 @@ @Override public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { // split text into sentences - List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent()); // find the sentence containing the token of the annotation int tokenStart = annotation.getOffset(); int index = 0; for (CoreMap sentence : sentences) { String s = sentence.toString(); - if (index < tokenStart && s.length() > tokenStart) { + if (index <= tokenStart && s.length() > tokenStart) { List<String> context = new ArrayList<String>(); for (CoreLabel label : sentence.get(TokensAnnotation.class)) { // this is the text of the token @@ -62,8 +61,8 @@ } index += s.length(); } - throw new RuntimeException("Token " + annotation.getToken() + " not found in text " - + annotation.getReferencedDocument().getRawContent()); + throw new RuntimeException("Token " + annotation + " not found in text " + + annotation.getReferencedDocument().getContent()); }
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |