From: <lor...@us...> - 2013-12-02 15:22:07
|
Revision: 4187 http://sourceforge.net/p/dl-learner/code/4187 Author: lorenz_b Date: 2013-12-02 15:22:04 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Refactoring ISLE. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -16,11 +16,11 @@ /** - * Gets set of candidate entities for an exact given String + * Gets set of candidate entities for a list of tokens * @param s * @return */ - public Set<Entity> getCandidateEntities(String s); + public Set<Entity> getCandidateEntities(List<Token> tokens); /** @@ -31,12 +31,12 @@ * @param s the string to search in the trie * @return string generating the path of the longest match in the trie */ - public String getGeneratingStringForLongestMatch(String s); + public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens); /** * Gets the longest matching string * @param s * @return */ - public String getLongestMatchingText(String s); + public List<Token> getLongestMatchingText(List<Token> tokens); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -15,6 +15,6 @@ * @param document the document to get annotation for * @return set of annotations for the given document */ - Set<Annotation> annotate(Document document); + Set<Annotation> annotate(TextDocument document); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -145,21 +145,18 @@ } @Override - public Set<Entity> getCandidateEntities(String s) { - FullTokenEntitySetPair res = trie.get(s); - return res == null ? new HashSet<Entity>() : trie.get(s).getEntitySet(); + public Set<Entity> getCandidateEntities(List<Token> tokens) { + return tree.get(tokens); } @Override - public String getGeneratingStringForLongestMatch(String s) { - CharSequence match = trie.getLongestMatch(s); - return (match!=null) ? trie.get(match).getFullToken() : null; + public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { + return tree.getOriginalTokensForLongestMatch(tokens); } @Override - public String getLongestMatchingText(String s) { - CharSequence match = trie.getLongestMatch(s); - return (match!=null) ? match.toString() : null; + public List<Token> getLongestMatchingText(List<Token> tokens) { + return tree.getLongestMatch(tokens); } public String toString() { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -28,7 +28,7 @@ } public Set<Entity> getCandidates(Annotation annotation) { - return candidatesTrie.getCandidateEntities(annotation.getMatchedString()); + return candidatesTrie.getCandidateEntities(annotation.getTokens()); } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -1,6 +1,7 @@ package org.dllearner.algorithms.isle.index; import java.util.HashSet; +import java.util.List; import java.util.Set; /** @@ -24,26 +25,17 @@ * @return the set of annotation for the given document */ @Override - public Set<Annotation> annotate(Document document) { + public Set<Annotation> annotate(TextDocument document) { Set<Annotation> annotations = new HashSet<Annotation>(); NormalizedTextMapper mapper = new NormalizedTextMapper(document); String content = mapper.getNormalizedText(); - for (int i = 0; i < content.length(); i++) { - if (Character.isWhitespace(content.charAt(i))) { - continue; - } - String unparsed = content.substring(i); - String match = candidatesTrie.getLongestMatchingText(unparsed); - if (match != null && !match.isEmpty()) { - Annotation annotation = mapper.getOriginalAnnotationForPosition(i, match.length()); - annotation.setMatchedString(match); - annotations.add(annotation); - i += match.length() - 1; - } - while (!Character.isWhitespace(content.charAt(i)) && i < content.length()) { - i++; - } - } + + List<Token> matchedTokens; + for (Token token : document) { + matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); + Annotation annotation = new Annotation(document, matchedTokens); + annotations.add(annotation); + } return annotations; } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -6,6 +6,7 @@ import com.google.common.base.Charsets; import com.google.common.base.Joiner; import com.google.common.io.Files; + import org.dllearner.algorithms.celoe.CELOE; import org.dllearner.algorithms.isle.index.*; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; @@ -31,6 +32,7 @@ import org.junit.Test; import org.semanticweb.owlapi.apibinding.OWLManager; import org.semanticweb.owlapi.model.*; + import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; import java.io.File; @@ -38,6 +40,7 @@ import java.net.URL; import java.text.DecimalFormat; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; @@ -124,7 +127,7 @@ // @Test public void testTextRetrieval() { System.out.println("Text for entity " + cls + ":"); - Map<String, Double> relevantText = textRetriever.getRelevantText(cls); + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(cls); System.out.println(Joiner.on("\n").join(relevantText.entrySet())); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |