[DL-Learner SVN] SF.net SVN: dl-learner:[4207] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4207
          http://sourceforge.net/p/dl-learner/code/4207
Author:   dfleischhacker
Date:     2013-12-10 15:25:13 +0000 (Tue, 10 Dec 2013)
Log Message:
-----------
Add scoring for hyponyms and token tree

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java	2013-12-10 14:35:02 UTC (rev 4206)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java	2013-12-10 15:25:13 UTC (rev 4207)
@@ -13,6 +13,8 @@
 
 public class WordNet {
 
+    private static final double SYNONYM_FACTOR = 0.8;
+    private static final double HYPONYM_FACTOR = 0.4;
     public Dictionary dict;
 
     public WordNet() {
@@ -280,6 +282,42 @@
         }
     }
 
+    public List<LemmaScorePair> getHyponymsScored(POS pos, String s) {
+        ArrayList<LemmaScorePair> result = new ArrayList<>();
+        try {
+            IndexWord word = dict.getIndexWord(pos, s);
+            if (word == null) {
+                System.err.println("Unable to find index word for " + s);
+                return result;
+            }
+            Synset sense = word.getSense(1);
+            getHyponymsScoredRecursive(result, sense, 3, SYNONYM_FACTOR);
+        }
+        catch (JWNLException e) {
+            e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
+        }
+        return result;
+    }
+
+    public void getHyponymsScoredRecursive(List<LemmaScorePair> lemmas, Synset sense, int depthToGo, double score) {
+        for (Word w : sense.getWords()) {
+            lemmas.add(new LemmaScorePair(w.getLemma(), score));
+        }
+        if (depthToGo == 0) {
+            return;
+        }
+        try {
+            PointerTargetNodeList directHyponyms = PointerUtils.getInstance().getDirectHyponyms(sense);
+            for (Object directHyponym : directHyponyms) {
+                getHyponymsScoredRecursive(lemmas, ((PointerTargetNode) directHyponym).getSynset(), depthToGo - 1,
+                        score * HYPONYM_FACTOR);
+            }
+        }
+        catch (JWNLException e) {
+            e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
+        }
+    }
+
     /**
      * Funktion returns a List of Hypo and Hypernyms of a given string
      *
@@ -356,4 +394,71 @@
         return result;
     }
 
+    public static class LemmaScorePair implements Comparable<LemmaScorePair> {
+        private String lemma;
+        private Double score;
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+
+            LemmaScorePair that = (LemmaScorePair) o;
+
+            if (lemma != null ? !lemma.equals(that.lemma) : that.lemma != null) {
+                return false;
+            }
+            if (score != null ? !score.equals(that.score) : that.score != null) {
+                return false;
+            }
+
+            return true;
+        }
+
+        @Override
+        public int hashCode() {
+            int result = lemma != null ? lemma.hashCode() : 0;
+            result = 31 * result + (score != null ? score.hashCode() : 0);
+            return result;
+        }
+
+        public String getLemma() {
+
+            return lemma;
+        }
+
+        public void setLemma(String lemma) {
+            this.lemma = lemma;
+        }
+
+        public Double getScore() {
+            return score;
+        }
+
+        public void setScore(Double score) {
+            this.score = score;
+        }
+
+        public LemmaScorePair(String lemma, Double score) {
+
+            this.lemma = lemma;
+            this.score = score;
+        }
+
+        @Override
+        public int compareTo(LemmaScorePair o) {
+            int val = score.compareTo(o.score);
+
+            if (val == 0) {
+                val = lemma.compareTo(o.getLemma());
+            }
+
+            return val;
+        }
+    }
+
 }

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java	                        (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java	2013-12-10 15:25:13 UTC (rev 4207)
@@ -0,0 +1,77 @@
+package org.dllearner.algorithms.isle.index;
+
+import org.dllearner.core.owl.Entity;
+
+/**
+ * Represents a scored entity. The score is produced from the path used to retrieve it from the candidates tree.
+ * @author Daniel Fleischhacker
+ */
+public class EntityScorePair implements Comparable<EntityScorePair> {
+    @Override
+    public String toString() {
+        return entity + " : " + score;
+    }
+
+    private Entity entity;
+    private Double score;
+
+    @Override
+    public int compareTo(EntityScorePair o) {
+        int val = score.compareTo(o.score);
+
+        if (val == 0) {
+            val = entity.getURI().toString().compareTo(o.entity.getURI().toString());
+        }
+
+        return val;
+    }
+
+    public EntityScorePair(Entity entity, Double score) {
+        this.entity = entity;
+        this.score = score;
+    }
+
+    public Entity getEntity() {
+        return entity;
+    }
+
+    public void setEntity(Entity entity) {
+        this.entity = entity;
+    }
+
+    public Double getScore() {
+        return score;
+    }
+
+    public void setScore(Double score) {
+        this.score = score;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+
+        EntityScorePair that = (EntityScorePair) o;
+
+        if (entity != null ? !entity.equals(that.entity) : that.entity != null) {
+            return false;
+        }
+        if (score != null ? !score.equals(that.score) : that.score != null) {
+            return false;
+        }
+
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        int result = entity != null ? entity.hashCode() : 0;
+        result = 31 * result + (score != null ? score.hashCode() : 0);
+        return result;
+    }
+}

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java	2013-12-10 14:35:02 UTC (rev 4206)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java	2013-12-10 15:25:13 UTC (rev 4207)
@@ -5,8 +5,7 @@
 import net.didion.jwnl.data.POS;
 import org.dllearner.algorithms.isle.WordNet;
 
-import java.util.ArrayList;
-import java.util.Collections;
+import java.util.*;
 
 /**
  * Provides shortcuts to commonly used linguistic operations
@@ -35,6 +34,26 @@
         }
     }
 
+    public Set<WordNet.LemmaScorePair> getScoredHyponyms(String word, POS pos) {
+        List<WordNet.LemmaScorePair> pairs = wn.getHyponymsScored(pos, word);
+        HashMap<String, Double> lemmaScores = new HashMap<>();
+        for (WordNet.LemmaScorePair p : pairs) {
+            if (!lemmaScores.containsKey(p.getLemma())) {
+                lemmaScores.put(p.getLemma(), p.getScore());
+            }
+            else {
+                lemmaScores.put(p.getLemma(), Math.max(p.getScore(), lemmaScores.get(p.getLemma())));
+            }
+        }
+
+        TreeSet<WordNet.LemmaScorePair> scoredPairs = new TreeSet<>();
+        for (Map.Entry<String, Double> e : lemmaScores.entrySet()) {
+            scoredPairs.add(new WordNet.LemmaScorePair(e.getKey(), e.getValue()));
+        }
+
+        return scoredPairs;
+    }
+
     /**
      * Processes the given string and puts camelCased words into single words.
      * @param camelCase    the word containing camelcase to split

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java	2013-12-10 14:35:02 UTC (rev 4206)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java	2013-12-10 15:25:13 UTC (rev 4207)
@@ -1,6 +1,7 @@
 package org.dllearner.algorithms.isle.index;
 
 import net.didion.jwnl.data.POS;
+import org.dllearner.algorithms.isle.WordNet;
 import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever;
 import org.dllearner.core.owl.Entity;
 import org.semanticweb.owlapi.model.OWLOntology;
@@ -89,15 +90,16 @@
                 continue;
             }
             //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos);
-            String[] synonyms = LinguisticUtil.getInstance().getAllHyponymsForWord(t.getRawForm(), wordnetPos);
+            Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance()
+                    .getScoredHyponyms(t.getRawForm(), wordnetPos);
 
-            for (String synonym : synonyms) {
+            for (WordNet.LemmaScorePair synonym : alternativeFormPairs) {
                 // ignore all multi word synonyms
-                if (synonym.contains("_")) {
+                if (synonym.getLemma().contains("_")) {
                     continue;
                 }
                 //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym));
-                t.addAlternativeForm(synonym);
+                t.addAlternativeForm(synonym.getLemma(), synonym.getScore());
             }
         }
     }
@@ -113,9 +115,14 @@
 
 	@Override
 	public Set<Entity> getCandidateEntities(List<Token> tokens) {
-        return tree.getAllEntities(tokens);
-	}
+        Set<Entity> res = tree.getAllEntities(tokens);
+        System.out.println("Unscored: " + res);
+        Set<EntityScorePair> scored = tree.getAllEntitiesScored(tokens);
+        System.out.println("Scored: " + scored);
 
+        return res;
+    }
+
 	@Override
 	public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) {
 		return tree.getOriginalTokensForLongestMatch(tokens);

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java	2013-12-10 14:35:02 UTC (rev 4206)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java	2013-12-10 15:25:13 UTC (rev 4207)
@@ -7,7 +7,8 @@
 
 import java.io.Serializable;
 import java.util.Collections;
-import java.util.HashSet;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.Set;
 
 /**
@@ -23,7 +24,8 @@
 	private boolean isStopWord;
 	private boolean isHead;
     /// for storing alternative forms of this token, e.g., generated by WordNet synonyms
-    private HashSet<String> alternativeForms;
+    private HashMap<String, Double> alternativeForms;
+
 	
 	public Token(String rawForm) {
 		this.rawForm = rawForm;
@@ -35,7 +37,7 @@
 		this.posTag = posTag;
 		this.isPunctuation = isPunctuation;
 		this.isStopWord = isStopWord;
-        this.alternativeForms = new HashSet<>();
+        this.alternativeForms = new HashMap<>();
 	}
 	
 	/**
@@ -66,15 +68,22 @@
      * @return unmodifiable set of alternative surface forms for this token
      */
     public Set<String> getAlternativeForms() {
-        return Collections.unmodifiableSet(alternativeForms);
+        return Collections.unmodifiableSet(alternativeForms.keySet());
     }
 
     /**
+     * Returns the map storing the scored alternative forms of this token.
+     */
+    public Map<String, Double> getScoredAlternativeForms() {
+        return Collections.unmodifiableMap(alternativeForms);
+    }
+
+    /**
      * Adds a new surface form to the alternative forms of this token. Alternative forms are included in comparison of
      * two tokens when using the {@link #equalsWithAlternativeForms}.
      */
-    public void addAlternativeForm(String alternativeForm) {
-        this.alternativeForms.add(alternativeForm);
+    public void addAlternativeForm(String alternativeForm, Double score) {
+        this.alternativeForms.put(alternativeForm, score);
     }
 
     /**
@@ -120,7 +129,7 @@
 	}
 	
 	/**
-	 * @param wheteher the token is the head of the containg sequence of tokens
+	 * @param isHead the token is the head of the containg sequence of tokens
 	 */
 	public void setIsHead(boolean isHead) {
 		this.isHead = isHead;
@@ -158,8 +167,8 @@
             return false;
         }
 
-        if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.contains(stemmedForm) ||
-                alternativeForms.contains(other.stemmedForm)) {
+        if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.containsKey(stemmedForm) ||
+                alternativeForms.containsKey(other.stemmedForm)) {
             return true;
         }
 

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java	2013-12-10 14:35:02 UTC (rev 4206)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java	2013-12-10 15:25:13 UTC (rev 4207)
@@ -13,6 +13,9 @@
  * @author Daniel Fleischhacker
  */
 public class TokenTree {
+    public static final double WORDNET_FACTOR = 0.3d;
+    public static final double ORIGINAL_FACTOR = 1.0d;
+
     private LinkedHashMap<Token, TokenTree> children;
     private Set<Entity> entities;
     private List<Token> originalTokens;
@@ -23,14 +26,15 @@
         this.entities = new HashSet<>();
         this.originalTokens = new ArrayList<>();
     }
-    
+
     /**
      * If set to TRUE, stopwords like 'of, on' are ignored during creation and retrieval operations.
-	 * @param ignoreStopWords the ignoreStopWords to set
-	 */
-	public void setIgnoreStopWords(boolean ignoreStopWords) {
-		this.ignoreStopWords = ignoreStopWords;
-	}
+     *
+     * @param ignoreStopWords the ignoreStopWords to set
+     */
+    public void setIgnoreStopWords(boolean ignoreStopWords) {
+        this.ignoreStopWords = ignoreStopWords;
+    }
 
     /**
      * Adds all given entities to the end of the path resulting from the given tokens.
@@ -41,14 +45,14 @@
     public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) {
         TokenTree curNode = this;
         for (Token t : tokens) {
-        	if(!ignoreStopWords || (ignoreStopWords && !t.isStopWord())){
-        		TokenTree nextNode = curNode.children.get(t);
+            if (!ignoreStopWords || (ignoreStopWords && !t.isStopWord())) {
+                TokenTree nextNode = curNode.children.get(t);
                 if (nextNode == null) {
                     nextNode = new TokenTree();
                     curNode.children.put(t, nextNode);
                 }
                 curNode = nextNode;
-        	} 
+            }
         }
         curNode.entities.addAll(entities);
         curNode.originalTokens = new ArrayList<>(originalTokens);
@@ -90,6 +94,75 @@
         return curNode.entities;
     }
 
+    public Set<EntityScorePair> getAllEntitiesScored(List<Token> tokens) {
+        HashSet<EntityScorePair> resEntities = new HashSet<>();
+        getAllEntitiesScoredRec(tokens, 0, this, resEntities, 1.0);
+
+        // only keep highest confidence for each entity
+        HashMap<Entity, Double> entityScores = new HashMap<>();
+
+        for (EntityScorePair p : resEntities) {
+            if (!entityScores.containsKey(p.getEntity())) {
+                entityScores.put(p.getEntity(), p.getScore());
+            }
+            else {
+                entityScores.put(p.getEntity(), Math.max(p.getScore(), entityScores.get(p.getEntity())));
+            }
+        }
+
+        TreeSet<EntityScorePair> result = new TreeSet<>();
+        for (Map.Entry<Entity, Double> e : entityScores.entrySet()) {
+            result.add(new EntityScorePair(e.getKey(), e.getValue()));
+        }
+
+        return result;
+    }
+
+    public void getAllEntitiesScoredRec(List<Token> tokens, int curPosition, TokenTree curTree,
+                                        HashSet<EntityScorePair> resEntities, Double curScore) {
+
+        if (curPosition == tokens.size()) {
+            for (Entity e : curTree.entities) {
+                resEntities.add(new EntityScorePair(e, curScore));
+            }
+            return;
+        }
+        Token currentTextToken = tokens.get(curPosition);
+        for (Map.Entry<Token, TokenTree> treeTokenEntry : curTree.children.entrySet()) {
+            if (currentTextToken.equals(treeTokenEntry.getKey())) {
+                getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities,
+                        curScore * ORIGINAL_FACTOR);
+            }
+            else {
+                for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms()
+                        .entrySet()) {
+                    if (currentTextToken.getStemmedForm().equals(treeAlternativeForm.getKey())) {
+                        getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities,
+                                curScore * ORIGINAL_FACTOR * treeAlternativeForm.getValue());
+                    }
+                }
+                for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms()
+                        .entrySet()) {
+                    if (treeTokenEntry.getKey().getStemmedForm().equals(textAlternativeForm.getKey())) {
+                        getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities,
+                                curScore * ORIGINAL_FACTOR * textAlternativeForm.getValue());
+                    }
+                }
+
+                for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms()
+                        .entrySet()) {
+                    for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms()
+                            .entrySet()) {
+                        if (treeAlternativeForm.getKey().equals(textAlternativeForm.getKey())) {
+                            getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities,
+                                    curScore * treeAlternativeForm.getValue() * textAlternativeForm.getValue());
+                        }
+                    }
+                }
+            }
+        }
+    }
+
     public Set<Entity> getAllEntities(List<Token> tokens) {
         HashSet<Entity> resEntities = new HashSet<>();
         getAllEntitiesRec(tokens, 0, this, resEntities);
@@ -145,7 +218,8 @@
 
     /**
      * Returns the set of entities assigned to the longest matching token subsequence of the given token sequence.
-     * @param tokens    token sequence to search for longest match
+     *
+     * @param tokens token sequence to search for longest match
      * @return set of entities assigned to the longest matching token subsequence of the given token sequence
      */
     public Set<Entity> getEntitiesForLongestMatch(List<Token> tokens) {
@@ -188,34 +262,37 @@
     }
 
     public static void main(String[] args) throws Exception {
-    	List<Token> tokens1 = Lists.newLinkedList();
-    	for (String s : Splitter.on(" ").split("this is a token tree")) {
-			tokens1.add(new Token(s, s, s, false, false));
-		};
-		
-		List<Token> tokens2 = Lists.newLinkedList();
-    	for (String s : Splitter.on(" ").split("this is a tokenized tree")) {
-			tokens2.add(new Token(s, s, s, false, false));
-		};
-		
-		TokenTree tree = new TokenTree();
-		tree.add(tokens1, new NamedClass("TokenTree"));
-		tree.add(tokens2, new NamedClass("TokenizedTree"));
+        List<Token> tokens1 = Lists.newLinkedList();
+        for (String s : Splitter.on(" ").split("this is a token tree")) {
+            tokens1.add(new Token(s, s, s, false, false));
+        }
+        ;
+
+        List<Token> tokens2 = Lists.newLinkedList();
+        for (String s : Splitter.on(" ").split("this is a tokenized tree")) {
+            tokens2.add(new Token(s, s, s, false, false));
+        }
+        ;
+
+        TokenTree tree = new TokenTree();
+        tree.add(tokens1, new NamedClass("TokenTree"));
+        tree.add(tokens2, new NamedClass("TokenizedTree"));
         System.out.println(tree);
-        
+
         System.out.println(tree.getEntitiesForLongestMatch(tokens1));
         System.out.println(tree.getLongestMatch(tokens1));
-        
+
         List<Token> tokens3 = Lists.newLinkedList();
-    	for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) {
-			tokens3.add(new Token(s, s, s, false, false));
-		};
+        for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) {
+            tokens3.add(new Token(s, s, s, false, false));
+        }
+        ;
         System.out.println(tree.getLongestMatch(tokens3));
     }
 
-    
+
     public String toString() {
-        return "TokenTree\n"+ toString(0);
+        return "TokenTree\n" + toString(0);
     }
 
     public String toString(int indent) {
@@ -233,5 +310,5 @@
         return sb.toString();
     }
 
-    
+
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[DL-Learner SVN] SF.net SVN: dl-learner:[4207] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

[DL-Learner SVN] SF.net SVN: dl-learner:[4207] trunk/components-core/src/main/java/org/dllearner/algorithms/isle