[DL-Learner SVN] SF.net SVN: dl-learner:[4093] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4093
          http://sourceforge.net/p/dl-learner/code/4093
Author:   dfleischhacker
Date:     2013-09-06 11:36:33 +0000 (Fri, 06 Sep 2013)
Log Message:
-----------
Extend ontology words by synonyms

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java	2013-09-06 10:01:53 UTC (rev 4092)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java	2013-09-06 11:36:33 UTC (rev 4093)
@@ -93,7 +93,7 @@
 //			IndexWord iw = dict.getMorphologicalProcessor().lookupBaseForm(pos, s);
             if (iw != null) {
                 Synset[] synsets = iw.getSenses();
-                for (int i = 0; i < n; i++) {
+                for (int i = 0; i < Math.min(n, synsets.length); i++) {
                     for (Word word : synsets[i].getWords()) {
                         String c = word.getLemma();
                         if (!c.equals(s) && !c.contains(" ")) {

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java	2013-09-06 10:01:53 UTC (rev 4092)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java	2013-09-06 11:36:33 UTC (rev 4093)
@@ -36,9 +36,13 @@
         for (int i = 0; i < camelCase.length(); i++) {
             // we just ignore characters not matching the defined pattern
             char curChar = camelCase.charAt(i);
-            if (!Character.isLetter(curChar)) {
+            if (Character.isWhitespace(curChar)) {
+                sb.append(" ");
                 continue;
             }
+            else if (!Character.isLetter(curChar)) {
+                continue;
+            }
             if (Character.isUpperCase(curChar)) { // found a new upper case letter
                 resultingWords.add(sb.toString());
                 sb = new StringBuilder();

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java	2013-09-06 10:01:53 UTC (rev 4092)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java	2013-09-06 11:36:33 UTC (rev 4093)
@@ -1,5 +1,6 @@
 package org.dllearner.algorithms.isle.index;
 
+import org.apache.commons.lang.StringUtils;
 import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever;
 import org.dllearner.core.owl.Entity;
 import org.dllearner.utilities.datastructures.PrefixTrie;
@@ -11,28 +12,62 @@
 
 	PrefixTrie<Set<Entity>> trie;
 	EntityTextRetriever entityTextRetriever;
-	
+
+    /**
+     * Initialize the trie with strings from the provided ontology using a no-op name generator, i.e., only the
+     * actual ontology strings are added and no expansion is done.
+     *
+     * @param entityTextRetriever the text retriever to use
+     * @param ontology the ontology to get strings from
+     */
 	public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) {
-		this.entityTextRetriever = entityTextRetriever;
-		buildTrie(ontology);
+        this(entityTextRetriever, ontology, new DummyNameGenerator());
 	}
+
+    /**
+     * Initialize the trie with strings from the provided ontology and use the given entity name generator
+     * for generating alternative words.
+     *
+     * @param entityTextRetriever the text retriever to use
+     * @param ontology the ontology to get strings from
+     * @param nameGenerator the name generator to use for generating alternative words
+     */
+    public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology,
+                                      NameGenerator nameGenerator) {
+        this.entityTextRetriever = entityTextRetriever;
+        buildTrie(ontology, nameGenerator);
+    }
 	
-	public void buildTrie(OWLOntology ontology) {	
+	public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) {
 		this.trie = new PrefixTrie<Set<Entity>>();
 		Map<Entity, Set<String>> relevantText = entityTextRetriever.getRelevantText(ontology);
 		
 		for (Entity entity : relevantText.keySet()) {
+
 			for (String text : relevantText.get(entity)) {
-				addEntry(text, entity);
-				// Adds also composing words, e.g. for "has child", "has" and "child" are also added
-				if (text.contains(" ")) {
-					for (String subtext : text.split(" ")) {
-						addEntry(subtext, entity);
-						//System.out.println("trie.add("+subtext+","++")");
-					}
-				}
-			}
-		}
+                text = StringUtils.join(LinguisticUtil.getWordsFromCamelCase(text), " ");
+                text = StringUtils.join(LinguisticUtil.getWordsFromUnderscored(text), " ");
+                if (text.trim().isEmpty()) {
+                    continue;
+                }
+                addEntry(text, entity);
+                for (String alternativeText : nameGenerator.getAlternativeText(text)) {
+//                    System.out.println("New alternative text for " + text + " --> " + alternativeText);
+                    addEntry(alternativeText, entity);
+                }
+                // Adds also composing words, e.g. for "has child", "has" and "child" are also added
+                if (text.contains(" ")) {
+                    for (String subtext : text.split(" ")) {
+                        addEntry(subtext, entity);
+                        for (String alternativeText : nameGenerator.getAlternativeText(subtext)) {
+//                            System.out.println("New alternative text for " + subtext + " --> " + alternativeText);
+                            addEntry(alternativeText, entity);
+                        }
+                        //System.out.println("trie.add("+subtext+","++")");
+                    }
+                }
+            }
+        }
 	}
 	
 	@Override
@@ -62,7 +97,7 @@
 	public String toString() {
 		String output = "";
 		Map<String,Set<Entity>> trieMap = trie.toMap();
-		List<String> termsList = new ArrayList(trieMap.keySet());
+		List<String> termsList = new ArrayList<String>(trieMap.keySet());
 		Collections.sort(termsList);
 		for (String key : termsList) {
 			output += key + ":\n";
@@ -78,4 +113,68 @@
 		
 	}
 
+    public static interface NameGenerator {
+        /**
+         * Returns a list of possible alternative words for the given word
+         *
+         * @param text    the text to return alternative words for
+         * @return alternative words for given word
+         */
+        List<String> getAlternativeText(String text);
+    }
+
+    public static class DummyNameGenerator implements NameGenerator {
+        @Override
+        public List<String> getAlternativeText(String word) {
+            return Collections.singletonList(word);
+        }
+    }
+
+    /**
+     * Generates alternative texts by using WordNet synonyms.
+     */
+    public static class WordNetNameGenerator implements NameGenerator {
+        private int maxNumberOfSenses = 5;
+
+        /**
+         * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses.
+         * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from
+         */
+        public WordNetNameGenerator(int maxNumberOfSenses) {
+            this.maxNumberOfSenses = maxNumberOfSenses;
+        }
+
+        @Override
+        public List<String> getAlternativeText(String word) {
+            return Arrays.asList(LinguisticUtil.getTopSynonymsForWord(word, maxNumberOfSenses));
+        }
+    }
+
+    /**
+     * Generates alternative texts by using WordNet synonym and lemmatizing of the original words
+     */
+    public static class LemmatizingWordNetNameGenerator implements NameGenerator {
+        private int maxNumberOfSenses = 5;
+
+        /**
+         * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses.
+         * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from
+         */
+        public LemmatizingWordNetNameGenerator(int maxNumberOfSenses) {
+            this.maxNumberOfSenses = maxNumberOfSenses;
+        }
+
+        @Override
+        public List<String> getAlternativeText(String word) {
+            ArrayList<String> res = new ArrayList<String>();
+            res.add(LinguisticUtil.getNormalizedForm(word));
+
+            for (String w : LinguisticUtil
+                    .getTopSynonymsForWord(LinguisticUtil.getNormalizedForm(word), maxNumberOfSenses)) {
+                res.add(w.replaceAll("_", " "));
+            }
+
+            return res;
+        }
+    }
 }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java	2013-09-06 10:01:53 UTC (rev 4092)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java	2013-09-06 11:36:33 UTC (rev 4093)
@@ -5,36 +5,36 @@
 
 /**
  * Annotates a document using a prefix trie
+ *
  * @author Andre Melo
- *
  */
 public class TrieLinguisticAnnotator implements LinguisticAnnotator {
-	
-	EntityCandidatesTrie candidatesTrie;
-	
-	public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) {
-		this.candidatesTrie = candidatesTrie;
-	}
-	
-	/**
-	 * Generates annotation based on trie's longest matching strings
-	 * @param document
-	 * @return
-	 */
-	@Override
-	public Set<Annotation> annotate(Document document) {
-		String content = document.getContent();
-		Set<Annotation> annotations = new HashSet<Annotation>();
-		for (int i=0; i<content.length(); i++) {
-			String unparsed = content.substring(i);
-			String match = candidatesTrie.getLongestMatch(unparsed);
-			if (match!=null && !match.isEmpty()) {
-				Annotation annotation = new Annotation(document, i, match.length());
-				annotations.add(annotation);
-				i += match.length()-1;
-			}
-		}
-		return annotations;
-	}
+    EntityCandidatesTrie candidatesTrie;
 
+    public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) {
+        this.candidatesTrie = candidatesTrie;
+    }
+
+    /**
+     * Generates annotation based on trie's longest matching strings
+     *
+     * @param document the document to get annotations for
+     * @return the set of annotation for the given document
+     */
+    @Override
+    public Set<Annotation> annotate(Document document) {
+        String content = document.getContent();
+        Set<Annotation> annotations = new HashSet<Annotation>();
+        for (int i = 0; i < content.length(); i++) {
+            String unparsed = content.substring(i);
+            String match = candidatesTrie.getLongestMatch(unparsed);
+            if (match != null && !match.isEmpty()) {
+                Annotation annotation = new Annotation(document, i, match.length());
+                annotations.add(annotation);
+                i += match.length() - 1;
+            }
+        }
+        return annotations;
+    }
+
 }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java	2013-09-06 10:01:53 UTC (rev 4092)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java	2013-09-06 11:36:33 UTC (rev 4093)
@@ -30,7 +30,8 @@
      */
     public SimpleSemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex) {
         super(ontology);
-        SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology);
+        SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology),
+                ontology, new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5));
 //        trie.printTrie();
         setSemanticAnnotator(new SemanticAnnotator(
                 new SimpleWordSenseDisambiguation(ontology),

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[DL-Learner SVN] SF.net SVN: dl-learner:[4093] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

[DL-Learner SVN] SF.net SVN: dl-learner:[4093] trunk/components-core/src/main/java/org/dllearner/algorithms/isle