From: <dfl...@us...> - 2013-10-07 09:15:23
Revision: 4120
          http://sourceforge.net/p/dl-learner/code/4120
Author:   dfleischhacker
Date:     2013-10-07 09:15:20 +0000 (Mon, 07 Oct 2013)

Log Message:
-----------
Fix bug leading to out of bounds exception

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/StanfordLemmatizer.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java    2013-10-07 07:38:17 UTC (rev 4119)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java    2013-10-07 09:15:20 UTC (rev 4120)
@@ -137,7 +137,7 @@
             else {
                 res.append(" ");
             }
-            res.append(lemmatizeSingleWord(word));
+            res.append(lemmatizeSingleWord(w));
         }
         catch (Exception e) {
             throw new RuntimeException(e);

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java    2013-10-07 07:38:17 UTC (rev 4119)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java    2013-10-07 09:15:20 UTC (rev 4120)
@@ -149,7 +149,8 @@
 
     @Override
     public Set<Entity> getCandidateEntities(String s) {
-        return trie.get(s);
+        Set<Entity> res = trie.get(s);
+        return res == null ? new HashSet<Entity>() : trie.get(s);
     }
 
     @Override
@@ -263,4 +264,34 @@
             return res;
         }
     }
+
+    /**
+     * Pair of the actual word and the word after processing.
+     */
+    public static class ActualModifiedWordPair {
+        private String actualString;
+        private String modifiedString;
+
+        public String getActualString() {
+            return actualString;
+        }
+
+        public void setActualString(String actualString) {
+            this.actualString = actualString;
+        }
+
+        public String getModifiedString() {
+            return modifiedString;
+        }
+
+        public void setModifiedString(String modifiedString) {
+            this.modifiedString = modifiedString;
+        }
+
+        public ActualModifiedWordPair(String actualString, String modifiedString) {
+
+            this.actualString = actualString;
+            this.modifiedString = modifiedString;
+        }
+    }
 }
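For context, a minimal sketch of what the getCandidateEntities() change buys callers (the lookup key and the candidatesTrie variable below are made up for illustration): before this revision a trie miss returned null, so iterating over the result threw a NullPointerException; now a miss yields an empty set and the loop body is simply skipped.

    // Sketch of a typical caller; "no such surface form" is a hypothetical key.
    for (Entity candidate : candidatesTrie.getCandidateEntities("no such surface form")) {
        // With the null guard in place, a miss means zero iterations instead of an NPE.
        System.out.println(candidate);
    }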
Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/StanfordLemmatizer.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/StanfordLemmatizer.java    (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/StanfordLemmatizer.java    2013-10-07 09:15:20 UTC (rev 4120)
@@ -0,0 +1,54 @@
+package org.dllearner.algorithms.isle.index;
+
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+import edu.stanford.nlp.util.CoreMap;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ *
+ */
+class StanfordLemmatizer {
+
+    protected StanfordCoreNLP pipeline;
+
+    public StanfordLemmatizer() {
+        // Create StanfordCoreNLP object properties, with POS tagging
+        // (required for lemmatization), and lemmatization
+        Properties props;
+        props = new Properties();
+        props.put("annotators", "tokenize, ssplit, pos, lemma");
+
+        // StanfordCoreNLP loads a lot of models, so you probably
+        // only want to do this once per execution
+        this.pipeline = new StanfordCoreNLP(props);
+    }
+
+    public String lemmatize(String documentText)
+    {
+        List<String> lemmas = new LinkedList<String>();
+
+        // create an empty Annotation just with the given text
+        edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(documentText);
+
+        // run all Annotators on this text
+        this.pipeline.annotate(document);
+
+        // Iterate over all of the sentences found
+        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
+        for(CoreMap sentence: sentences) {
+            // Iterate over all tokens in a sentence
+            for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) {
+                // Retrieve and add the lemma for each word into the
+                // list of lemmas
+                lemmas.add(token.get(CoreAnnotations.LemmaAnnotation.class));
+            }
+        }
+
+        return lemmas.get(0);
+    }
+}
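A minimal usage sketch for the new lemmatizer (assuming a caller in the same package, since the class is package-private; the sample word is made up). Note that lemmatize() runs the full tokenize/ssplit/pos/lemma pipeline but returns only the first token's lemma, so it is really meant for single-word input:

    // Constructing the lemmatizer loads the CoreNLP models, so reuse one instance.
    StanfordLemmatizer lemmatizer = new StanfordLemmatizer();
    String lemma = lemmatizer.lemmatize("running"); // expected to yield "run"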
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java    2013-10-07 07:38:17 UTC (rev 4119)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java    2013-10-07 09:15:20 UTC (rev 4120)
@@ -37,6 +37,8 @@
             }
             String match = candidatesTrie.getLongestMatch(unparsed);
             if (match != null && !match.isEmpty()) {
+
+                //TODO: here we are losing the original offset and index...
                 Annotation annotation = new Annotation(document, i, match.length());
                 annotations.add(annotation);
                 i += match.length() - 1;
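A possible direction for the TODO above, purely as a sketch: if the trie handed the surface form back together with the processed form, for instance via the new ActualModifiedWordPair, the annotation could keep the original span. getLongestMatchWithOriginal() below is a hypothetical method, not part of this commit:

    // Hypothetical API: the pair carries the surface form as it occurs in the
    // document, so its length gives the original annotation span.
    SimpleEntityCandidatesTrie.ActualModifiedWordPair pair =
            candidatesTrie.getLongestMatchWithOriginal(unparsed);
    if (pair != null && !pair.getActualString().isEmpty()) {
        Annotation annotation = new Annotation(document, i, pair.getActualString().length());
        annotations.add(annotation);
        i += pair.getActualString().length() - 1;
    }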