From: <dfl...@us...> - 2013-12-10 15:25:17
|
Revision: 4207 http://sourceforge.net/p/dl-learner/code/4207 Author: dfleischhacker Date: 2013-12-10 15:25:13 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Add scoring for hyponyms and token tree Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -13,6 +13,8 @@ public class WordNet { + private static final double SYNONYM_FACTOR = 0.8; + private static final double HYPONYM_FACTOR = 0.4; public Dictionary dict; public WordNet() { @@ -280,6 +282,42 @@ } } + public List<LemmaScorePair> getHyponymsScored(POS pos, String s) { + ArrayList<LemmaScorePair> result = new ArrayList<>(); + try { + IndexWord word = dict.getIndexWord(pos, s); + if (word == null) { + System.err.println("Unable to find index word for " + s); + return result; + } + Synset sense = word.getSense(1); + getHyponymsScoredRecursive(result, sense, 3, SYNONYM_FACTOR); + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + return result; + } + + public void getHyponymsScoredRecursive(List<LemmaScorePair> lemmas, Synset sense, int depthToGo, double score) { + for (Word w : sense.getWords()) { + lemmas.add(new LemmaScorePair(w.getLemma(), score)); + } + if (depthToGo == 0) { + return; + } + try { + PointerTargetNodeList directHyponyms = PointerUtils.getInstance().getDirectHyponyms(sense); + for (Object directHyponym : directHyponyms) { + getHyponymsScoredRecursive(lemmas, ((PointerTargetNode) directHyponym).getSynset(), depthToGo - 1, + score * HYPONYM_FACTOR); + } + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + } + /** * Funktion returns a List of Hypo and Hypernyms of a given string * @@ -356,4 +394,71 @@ return result; } + public static class LemmaScorePair implements Comparable<LemmaScorePair> { + private String lemma; + private Double score; + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + LemmaScorePair that = (LemmaScorePair) o; + + if (lemma != null ? !lemma.equals(that.lemma) : that.lemma != null) { + return false; + } + if (score != null ? !score.equals(that.score) : that.score != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = lemma != null ? lemma.hashCode() : 0; + result = 31 * result + (score != null ? score.hashCode() : 0); + return result; + } + + public String getLemma() { + + return lemma; + } + + public void setLemma(String lemma) { + this.lemma = lemma; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + public LemmaScorePair(String lemma, Double score) { + + this.lemma = lemma; + this.score = score; + } + + @Override + public int compareTo(LemmaScorePair o) { + int val = score.compareTo(o.score); + + if (val == 0) { + val = lemma.compareTo(o.getLemma()); + } + + return val; + } + } + } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -0,0 +1,77 @@ +package org.dllearner.algorithms.isle.index; + +import org.dllearner.core.owl.Entity; + +/** + * Represents a scored entity. The score is produced from the path used to retrieve it from the candidates tree. + * @author Daniel Fleischhacker + */ +public class EntityScorePair implements Comparable<EntityScorePair> { + @Override + public String toString() { + return entity + " : " + score; + } + + private Entity entity; + private Double score; + + @Override + public int compareTo(EntityScorePair o) { + int val = score.compareTo(o.score); + + if (val == 0) { + val = entity.getURI().toString().compareTo(o.entity.getURI().toString()); + } + + return val; + } + + public EntityScorePair(Entity entity, Double score) { + this.entity = entity; + this.score = score; + } + + public Entity getEntity() { + return entity; + } + + public void setEntity(Entity entity) { + this.entity = entity; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + EntityScorePair that = (EntityScorePair) o; + + if (entity != null ? !entity.equals(that.entity) : that.entity != null) { + return false; + } + if (score != null ? !score.equals(that.score) : that.score != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = entity != null ? entity.hashCode() : 0; + result = 31 * result + (score != null ? score.hashCode() : 0); + return result; + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -5,8 +5,7 @@ import net.didion.jwnl.data.POS; import org.dllearner.algorithms.isle.WordNet; -import java.util.ArrayList; -import java.util.Collections; +import java.util.*; /** * Provides shortcuts to commonly used linguistic operations @@ -35,6 +34,26 @@ } } + public Set<WordNet.LemmaScorePair> getScoredHyponyms(String word, POS pos) { + List<WordNet.LemmaScorePair> pairs = wn.getHyponymsScored(pos, word); + HashMap<String, Double> lemmaScores = new HashMap<>(); + for (WordNet.LemmaScorePair p : pairs) { + if (!lemmaScores.containsKey(p.getLemma())) { + lemmaScores.put(p.getLemma(), p.getScore()); + } + else { + lemmaScores.put(p.getLemma(), Math.max(p.getScore(), lemmaScores.get(p.getLemma()))); + } + } + + TreeSet<WordNet.LemmaScorePair> scoredPairs = new TreeSet<>(); + for (Map.Entry<String, Double> e : lemmaScores.entrySet()) { + scoredPairs.add(new WordNet.LemmaScorePair(e.getKey(), e.getValue())); + } + + return scoredPairs; + } + /** * Processes the given string and puts camelCased words into single words. * @param camelCase the word containing camelcase to split Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -1,6 +1,7 @@ package org.dllearner.algorithms.isle.index; import net.didion.jwnl.data.POS; +import org.dllearner.algorithms.isle.WordNet; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; @@ -89,15 +90,16 @@ continue; } //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); - String[] synonyms = LinguisticUtil.getInstance().getAllHyponymsForWord(t.getRawForm(), wordnetPos); + Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance() + .getScoredHyponyms(t.getRawForm(), wordnetPos); - for (String synonym : synonyms) { + for (WordNet.LemmaScorePair synonym : alternativeFormPairs) { // ignore all multi word synonyms - if (synonym.contains("_")) { + if (synonym.getLemma().contains("_")) { continue; } //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); - t.addAlternativeForm(synonym); + t.addAlternativeForm(synonym.getLemma(), synonym.getScore()); } } } @@ -113,9 +115,14 @@ @Override public Set<Entity> getCandidateEntities(List<Token> tokens) { - return tree.getAllEntities(tokens); - } + Set<Entity> res = tree.getAllEntities(tokens); + System.out.println("Unscored: " + res); + Set<EntityScorePair> scored = tree.getAllEntitiesScored(tokens); + System.out.println("Scored: " + scored); + return res; + } + @Override public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { return tree.getOriginalTokensForLongestMatch(tokens); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -7,7 +7,8 @@ import java.io.Serializable; import java.util.Collections; -import java.util.HashSet; +import java.util.HashMap; +import java.util.Map; import java.util.Set; /** @@ -23,7 +24,8 @@ private boolean isStopWord; private boolean isHead; /// for storing alternative forms of this token, e.g., generated by WordNet synonyms - private HashSet<String> alternativeForms; + private HashMap<String, Double> alternativeForms; + public Token(String rawForm) { this.rawForm = rawForm; @@ -35,7 +37,7 @@ this.posTag = posTag; this.isPunctuation = isPunctuation; this.isStopWord = isStopWord; - this.alternativeForms = new HashSet<>(); + this.alternativeForms = new HashMap<>(); } /** @@ -66,15 +68,22 @@ * @return unmodifiable set of alternative surface forms for this token */ public Set<String> getAlternativeForms() { - return Collections.unmodifiableSet(alternativeForms); + return Collections.unmodifiableSet(alternativeForms.keySet()); } /** + * Returns the map storing the scored alternative forms of this token. + */ + public Map<String, Double> getScoredAlternativeForms() { + return Collections.unmodifiableMap(alternativeForms); + } + + /** * Adds a new surface form to the alternative forms of this token. Alternative forms are included in comparison of * two tokens when using the {@link #equalsWithAlternativeForms}. */ - public void addAlternativeForm(String alternativeForm) { - this.alternativeForms.add(alternativeForm); + public void addAlternativeForm(String alternativeForm, Double score) { + this.alternativeForms.put(alternativeForm, score); } /** @@ -120,7 +129,7 @@ } /** - * @param wheteher the token is the head of the containg sequence of tokens + * @param isHead the token is the head of the containg sequence of tokens */ public void setIsHead(boolean isHead) { this.isHead = isHead; @@ -158,8 +167,8 @@ return false; } - if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.contains(stemmedForm) || - alternativeForms.contains(other.stemmedForm)) { + if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.containsKey(stemmedForm) || + alternativeForms.containsKey(other.stemmedForm)) { return true; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -13,6 +13,9 @@ * @author Daniel Fleischhacker */ public class TokenTree { + public static final double WORDNET_FACTOR = 0.3d; + public static final double ORIGINAL_FACTOR = 1.0d; + private LinkedHashMap<Token, TokenTree> children; private Set<Entity> entities; private List<Token> originalTokens; @@ -23,14 +26,15 @@ this.entities = new HashSet<>(); this.originalTokens = new ArrayList<>(); } - + /** * If set to TRUE, stopwords like 'of, on' are ignored during creation and retrieval operations. - * @param ignoreStopWords the ignoreStopWords to set - */ - public void setIgnoreStopWords(boolean ignoreStopWords) { - this.ignoreStopWords = ignoreStopWords; - } + * + * @param ignoreStopWords the ignoreStopWords to set + */ + public void setIgnoreStopWords(boolean ignoreStopWords) { + this.ignoreStopWords = ignoreStopWords; + } /** * Adds all given entities to the end of the path resulting from the given tokens. @@ -41,14 +45,14 @@ public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) { TokenTree curNode = this; for (Token t : tokens) { - if(!ignoreStopWords || (ignoreStopWords && !t.isStopWord())){ - TokenTree nextNode = curNode.children.get(t); + if (!ignoreStopWords || (ignoreStopWords && !t.isStopWord())) { + TokenTree nextNode = curNode.children.get(t); if (nextNode == null) { nextNode = new TokenTree(); curNode.children.put(t, nextNode); } curNode = nextNode; - } + } } curNode.entities.addAll(entities); curNode.originalTokens = new ArrayList<>(originalTokens); @@ -90,6 +94,75 @@ return curNode.entities; } + public Set<EntityScorePair> getAllEntitiesScored(List<Token> tokens) { + HashSet<EntityScorePair> resEntities = new HashSet<>(); + getAllEntitiesScoredRec(tokens, 0, this, resEntities, 1.0); + + // only keep highest confidence for each entity + HashMap<Entity, Double> entityScores = new HashMap<>(); + + for (EntityScorePair p : resEntities) { + if (!entityScores.containsKey(p.getEntity())) { + entityScores.put(p.getEntity(), p.getScore()); + } + else { + entityScores.put(p.getEntity(), Math.max(p.getScore(), entityScores.get(p.getEntity()))); + } + } + + TreeSet<EntityScorePair> result = new TreeSet<>(); + for (Map.Entry<Entity, Double> e : entityScores.entrySet()) { + result.add(new EntityScorePair(e.getKey(), e.getValue())); + } + + return result; + } + + public void getAllEntitiesScoredRec(List<Token> tokens, int curPosition, TokenTree curTree, + HashSet<EntityScorePair> resEntities, Double curScore) { + + if (curPosition == tokens.size()) { + for (Entity e : curTree.entities) { + resEntities.add(new EntityScorePair(e, curScore)); + } + return; + } + Token currentTextToken = tokens.get(curPosition); + for (Map.Entry<Token, TokenTree> treeTokenEntry : curTree.children.entrySet()) { + if (currentTextToken.equals(treeTokenEntry.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR); + } + else { + for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms() + .entrySet()) { + if (currentTextToken.getStemmedForm().equals(treeAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR * treeAlternativeForm.getValue()); + } + } + for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms() + .entrySet()) { + if (treeTokenEntry.getKey().getStemmedForm().equals(textAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR * textAlternativeForm.getValue()); + } + } + + for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms() + .entrySet()) { + for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms() + .entrySet()) { + if (treeAlternativeForm.getKey().equals(textAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * treeAlternativeForm.getValue() * textAlternativeForm.getValue()); + } + } + } + } + } + } + public Set<Entity> getAllEntities(List<Token> tokens) { HashSet<Entity> resEntities = new HashSet<>(); getAllEntitiesRec(tokens, 0, this, resEntities); @@ -145,7 +218,8 @@ /** * Returns the set of entities assigned to the longest matching token subsequence of the given token sequence. - * @param tokens token sequence to search for longest match + * + * @param tokens token sequence to search for longest match * @return set of entities assigned to the longest matching token subsequence of the given token sequence */ public Set<Entity> getEntitiesForLongestMatch(List<Token> tokens) { @@ -188,34 +262,37 @@ } public static void main(String[] args) throws Exception { - List<Token> tokens1 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a token tree")) { - tokens1.add(new Token(s, s, s, false, false)); - }; - - List<Token> tokens2 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a tokenized tree")) { - tokens2.add(new Token(s, s, s, false, false)); - }; - - TokenTree tree = new TokenTree(); - tree.add(tokens1, new NamedClass("TokenTree")); - tree.add(tokens2, new NamedClass("TokenizedTree")); + List<Token> tokens1 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a token tree")) { + tokens1.add(new Token(s, s, s, false, false)); + } + ; + + List<Token> tokens2 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a tokenized tree")) { + tokens2.add(new Token(s, s, s, false, false)); + } + ; + + TokenTree tree = new TokenTree(); + tree.add(tokens1, new NamedClass("TokenTree")); + tree.add(tokens2, new NamedClass("TokenizedTree")); System.out.println(tree); - + System.out.println(tree.getEntitiesForLongestMatch(tokens1)); System.out.println(tree.getLongestMatch(tokens1)); - + List<Token> tokens3 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { - tokens3.add(new Token(s, s, s, false, false)); - }; + for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { + tokens3.add(new Token(s, s, s, false, false)); + } + ; System.out.println(tree.getLongestMatch(tokens3)); } - + public String toString() { - return "TokenTree\n"+ toString(0); + return "TokenTree\n" + toString(0); } public String toString(int indent) { @@ -233,5 +310,5 @@ return sb.toString(); } - + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |