From: <dfl...@us...> - 2013-12-02 14:59:39
|
Revision: 4184 http://sourceforge.net/p/dl-learner/code/4184 Author: dfleischhacker Date: 2013-12-02 14:59:36 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Adapt to new Token implementation Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -2,6 +2,7 @@ import org.dllearner.core.owl.Entity; +import java.util.List; import java.util.Set; public interface EntityCandidatesTrie { @@ -11,7 +12,7 @@ * @param s * @param e */ - public void addEntry(String s, Entity e); + public void addEntry(List<Token> s, Entity e); /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -3,7 +3,6 @@ import org.apache.commons.lang.StringUtils; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; -import org.dllearner.utilities.MapUtils; import org.dllearner.utilities.datastructures.PrefixTrie; import org.semanticweb.owlapi.model.OWLOntology; @@ -11,7 +10,7 @@ import java.util.Map.Entry; public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { - + TokenTree tree; PrefixTrie<FullTokenEntitySetPair> trie; EntityTextRetriever entityTextRetriever; @@ -41,7 +40,7 @@ } public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { - this.trie = new PrefixTrie<FullTokenEntitySetPair>(); + this.tree = new TokenTree(); Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); @@ -62,51 +61,45 @@ /** * Adds the subsequences of a test * @param entity - * @param text + * @param tokens */ - private void addSubsequences(Entity entity, String text) { - if (text.contains(" ")) { - String[] tokens = text.split(" "); - for (int size=1; size<tokens.length; size++) { - - for (int start=0; start<tokens.length-size+1; start++) { - String subsequence = ""; - for (int i=0; i<size; i++) { - subsequence += tokens[start+i] + " "; - } - subsequence = subsequence.trim(); - - addEntry(subsequence, entity); - } - - } - } - } - - private void addSubsequencesWordNet(Entity entity, String text) { - if (text.contains(" ")) { - String[] tokens = text.split(" "); - - List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; - - // generate list of lemmatized wordnet synonyms for each token - for (int i = 0; i < tokens.length; i++) { - wordnetTokens[i] = new ArrayList<String>(); - wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i].toLowerCase())); - for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { - wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).toLowerCase()); + private void addSubsequences(Entity entity, List<Token> tokens) { + tree.add(tokens, entity); + for (int size = 1; size < tokens.size(); size++) { + for (int start = 0; start < tokens.size() - size + 1; start++) { + ArrayList<Token> subsequence = new ArrayList<>(); + for (int i = 0; i < size; i++) { + subsequence.add(tokens.get(start + i)); } + addEntry(subsequence, entity); } - - // generate subsequences starting at the given start index of the given size - Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); - - for (String[] s : allPossibleSubsequences) { - addEntry(s[0], entity, s[1]); - } } } +// private void addSubsequencesWordNet(Entity entity, String text) { +// if (text.contains(" ")) { +// String[] tokens = text.split(" "); +// +// List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; +// +// // generate list of lemmatized wordnet synonyms for each token +// for (int i = 0; i < tokens.length; i++) { +// wordnetTokens[i] = new ArrayList<String>(); +// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i].toLowerCase())); +// for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { +// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).toLowerCase()); +// } +// } +// +// // generate subsequences starting at the given start index of the given size +// Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); +// +// for (String[] s : allPossibleSubsequences) { +// addEntry(s[0], entity, s[1]); +// } +// } +// } + private static Set<String[]> getAllPossibleSubsequences(String[] originalTokens, List<String>[] wordnetTokens) { ArrayList<String[]> res = new ArrayList<String[]>(); @@ -143,30 +136,12 @@ } @Override - public void addEntry(String s, Entity e) { - s = s.trim(); - FullTokenEntitySetPair candidates; - if (trie.contains(s)) - candidates = trie.get(s); - else - candidates = new FullTokenEntitySetPair(s); - - candidates.addEntity(e); - - trie.put(s, candidates); + public void addEntry(List<Token> s, Entity e) { + tree.add(s, e); } - public void addEntry(String s, Entity e, String originalString) { - s = s.trim(); - FullTokenEntitySetPair candidates; - if (trie.contains(s)) - candidates = trie.get(s); - else - candidates = new FullTokenEntitySetPair(originalString); - - candidates.addEntity(e); - - trie.put(s, candidates); + public void addEntry(List<Token> s, Entity e, List<Token> originalTokens) { + tree.add(s, e, originalTokens); } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -15,10 +15,12 @@ public class TokenTree { private HashMap<Token, TokenTree> children; private Set<Entity> entities; + private List<Token> originalTokens; public TokenTree() { this.children = new HashMap<>(); this.entities = new HashSet<>(); + this.originalTokens = new ArrayList<>(); } /** @@ -27,7 +29,7 @@ * @param tokens tokens to locate insertion point for entities * @param entities entities to add */ - public void add(List<Token> tokens, Set<Entity> entities) { + public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) { TokenTree curNode = this; for (Token t : tokens) { TokenTree nextNode = curNode.children.get(t); @@ -38,8 +40,13 @@ curNode = nextNode; } curNode.entities.addAll(entities); + curNode.originalTokens = new ArrayList<>(originalTokens); } + public void add(List<Token> tokens, Set<Entity> entities) { + add(tokens, entities, tokens); + } + /** * Adds the given entity to the tree. * @@ -50,6 +57,10 @@ add(tokens, Collections.singleton(entity)); } + public void add(List<Token> tokens, Entity entity, List<Token> originalTokens) { + add(tokens, Collections.singleton(entity), originalTokens); + } + /** * Returns the set of entities located by the given list of tokens. * @@ -112,6 +123,27 @@ return fallback == null ? Collections.<Entity>emptySet() : fallback.entities; } + /** + * Returns the original token for the longest match + */ + public List<Token> getOriginalTokensForLongestMatch(List<Token> tokens) { + TokenTree fallback = this.entities.isEmpty() ? null : this; + TokenTree curNode = this; + + for (Token t : tokens) { + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + return fallback == null ? null : fallback.originalTokens; + } + curNode = nextNode; + if (!curNode.entities.isEmpty()) { + fallback = curNode; + } + } + + return fallback == null ? Collections.<Token>emptyList() : fallback.originalTokens; + } + public static void main(String[] args) throws Exception { List<Token> tokens1 = Lists.newLinkedList(); for (String s : Splitter.on(" ").split("this is a token tree")) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |