[DL-Learner SVN] SF.net SVN: dl-learner:[4095] trunk/components-core/src/main/java/org/dllearner/a

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4095
          http://sourceforge.net/p/dl-learner/code/4095
Author:   dfleischhacker
Date:     2013-09-06 13:31:43 +0000 (Fri, 06 Sep 2013)
Log Message:
-----------
Make LinguisticUtil singleton

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java	2013-09-06 12:48:08 UTC (rev 4094)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java	2013-09-06 13:31:43 UTC (rev 4095)
@@ -6,23 +6,25 @@
 import org.dllearner.algorithms.isle.WordNet;
 
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 
 /**
  * Provides shortcuts to commonly used linguistic operations
  * @author Daniel Fleischhacker
  */
 public class LinguisticUtil {
+    private static LinguisticUtil instance;
+
     private static final WordNet wn = new WordNet();
     private static POS[] RELEVANT_POS = new POS[]{POS.NOUN, POS.VERB};
     private static Lemmatizer lemmatizer;
 
-    static {
-        try {
-            lemmatizer = new DefaultLemmatizer();
+    public static LinguisticUtil getInstance() {
+        if (instance == null) {
+            instance = new LinguisticUtil();
         }
-        catch (Exception e) {
-            e.printStackTrace();
-        }
+        return instance;
     }
 
     /**
@@ -30,7 +32,7 @@
      * @param camelCase    the word containing camelcase to split
      * @return all words as camelcase contained in the given word
      */
-    public static String[] getWordsFromCamelCase(String camelCase) {
+    public String[] getWordsFromCamelCase(String camelCase) {
         ArrayList<String> resultingWords = new ArrayList<String>();
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < camelCase.length(); i++) {
@@ -66,7 +68,7 @@
      * @param underScored    word to split at underscores
      * @return words contained in given word
      */
-    public static String[] getWordsFromUnderscored(String underScored) {
+    public String[] getWordsFromUnderscored(String underScored) {
         return underScored.split("_");
     }
 
@@ -77,7 +79,7 @@
      * @param word the word to retrieve synonyms for
      * @return synonyms for the given word
      */
-    public static String[] getSynonymsForWord(String word) {
+    public String[] getSynonymsForWord(String word) {
         ArrayList<String> synonyms = new ArrayList<String>();
 
         for (POS pos : RELEVANT_POS) {
@@ -94,7 +96,7 @@
      * @param n the number of senses to get lemmas for
      * @return synonyms for the given word
      */
-    public static String[] getTopSynonymsForWord(String word, int n) {
+    public String[] getTopSynonymsForWord(String word, int n) {
         ArrayList<String> synonyms = new ArrayList<String>();
 
         for (POS pos : RELEVANT_POS) {
@@ -104,30 +106,48 @@
     }
 
     /**
-     * Returns the normalized form of the given word. This method is only able to work with single words! If there is an
-     * error normalizing the given word, the word itself is returned.
+     * Returns the normalized form of the given word. If the word contains spaces, each part separated by spaces is
+     * normalized independently and joined afterwards. If there is an error normalizing the given word, the word itself
+     * is returned.
      *
      * @param word the word to get normalized form for
      * @return normalized form of the word or the word itself on an error
      */
-    public static String getNormalizedForm(String word) {
-        try {
-            if (lemmatizer == null) {
-                return word;
+    public String getNormalizedForm(String word) {
+        StringBuilder res = new StringBuilder();
+
+        boolean first = true;
+
+        ArrayList<String> singleWords = new ArrayList<String>();
+        Collections.addAll(singleWords, word.split(" "));
+
+        for (String w : singleWords) {
+            try {
+                if (first) {
+                    first = false;
+                }
+                else {
+                    res.append(" ");
+                }
+                if (lemmatizer == null) {
+                    res.append(w);
+                }
+                else {
+                    res.append(lemmatizer.lemmatize(w));
+                }
             }
-            return lemmatizer.lemmatize(word);
+            catch (Exception e) {
+                e.printStackTrace();
+            }
         }
-        catch (Exception e) {
-            e.printStackTrace();
-        }
-        return word;
+        return res.toString();
     }
 
     public static void main(String[] args) {
-        System.out.println(getNormalizedForm("going"));
-        for (String s : getWordsFromCamelCase("thisIsAClassWith1Name123")) {
+        System.out.println(LinguisticUtil.getInstance().getNormalizedForm("going"));
+        for (String s : LinguisticUtil.getInstance().getWordsFromCamelCase("thisIsAClassWith1Name123")) {
             System.out.println(s);
-            for (String w : getSynonymsForWord(s)) {
+            for (String w : LinguisticUtil.getInstance().getSynonymsForWord(s)) {
                 System.out.println(" --> " + w);
             }
         }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java	2013-09-06 12:48:08 UTC (rev 4094)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java	2013-09-06 13:31:43 UTC (rev 4095)
@@ -42,7 +42,10 @@
     	Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>();
     	for (Annotation annotation : annotations) {
     		Set<Entity> candidateEntities = entityCandidateGenerator.getCandidates(annotation);
-    		SemanticAnnotation semanticAnnotation = wordSenseDisambiguation.disambiguate(annotation, candidateEntities);
+            if (candidateEntities == null || candidateEntities.size() == 0) {
+                continue;
+            }
+            SemanticAnnotation semanticAnnotation = wordSenseDisambiguation.disambiguate(annotation, candidateEntities);
     		if(semanticAnnotation != null){
     			semanticAnnotations.add(semanticAnnotation);
     		}

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java	2013-09-06 12:48:08 UTC (rev 4094)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java	2013-09-06 13:31:43 UTC (rev 4095)
@@ -45,14 +45,14 @@
 		for (Entity entity : relevantText.keySet()) {
 
 			for (String text : relevantText.get(entity)) {
-                text = StringUtils.join(LinguisticUtil.getWordsFromCamelCase(text), " ");
-                text = StringUtils.join(LinguisticUtil.getWordsFromUnderscored(text), " ");
+                text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromCamelCase(text), " ");
+                text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromUnderscored(text), " ");
                 if (text.trim().isEmpty()) {
                     continue;
                 }
                 addEntry(text, entity);
                 for (String alternativeText : nameGenerator.getAlternativeText(text)) {
-//                    System.out.println("New alternative text for " + text + " --> " + alternativeText);
+                    System.out.println("New alternative text for " + text + " --> " + alternativeText);
                     addEntry(alternativeText, entity);
                 }
                 // Adds also composing words, e.g. for "has child", "has" and "child" are also added
@@ -60,7 +60,7 @@
                     for (String subtext : text.split(" ")) {
                         addEntry(subtext, entity);
                         for (String alternativeText : nameGenerator.getAlternativeText(subtext)) {
-//                            System.out.println("New alternative text for " + subtext + " --> " + alternativeText);
+                            System.out.println("New alternative text for " + subtext + " --> " + alternativeText);
                             addEntry(alternativeText, entity);
                         }
                         //System.out.println("trie.add("+subtext+","++")");
@@ -146,7 +146,7 @@
 
         @Override
         public List<String> getAlternativeText(String word) {
-            return Arrays.asList(LinguisticUtil.getTopSynonymsForWord(word, maxNumberOfSenses));
+            return Arrays.asList(LinguisticUtil.getInstance().getTopSynonymsForWord(word, maxNumberOfSenses));
         }
     }
 
@@ -167,10 +167,10 @@
         @Override
         public List<String> getAlternativeText(String word) {
             ArrayList<String> res = new ArrayList<String>();
-            res.add(LinguisticUtil.getNormalizedForm(word));
+            res.add(LinguisticUtil.getInstance().getNormalizedForm(word));
 
-            for (String w : LinguisticUtil
-                    .getTopSynonymsForWord(LinguisticUtil.getNormalizedForm(word), maxNumberOfSenses)) {
+            for (String w : LinguisticUtil.getInstance()
+                    .getTopSynonymsForWord(LinguisticUtil.getInstance().getNormalizedForm(word), maxNumberOfSenses)) {
                 res.add(w.replaceAll("_", " "));
             }
 

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java	2013-09-06 12:48:08 UTC (rev 4094)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java	2013-09-06 13:31:43 UTC (rev 4095)
@@ -27,7 +27,7 @@
         Set<Annotation> annotations = new HashSet<Annotation>();
         for (int i = 0; i < content.length(); i++) {
             String unparsed = content.substring(i);
-            String match = candidatesTrie.getLongestMatch(unparsed);
+            String match = candidatesTrie.getLongestMatch(LinguisticUtil.getInstance().getNormalizedForm(unparsed));
             if (match != null && !match.isEmpty()) {
                 Annotation annotation = new Annotation(document, i, match.length());
                 annotations.add(annotation);

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java	2013-09-06 12:48:08 UTC (rev 4094)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java	2013-09-06 13:31:43 UTC (rev 4095)
@@ -91,8 +91,8 @@
 		
 		if(textWithWeight.isEmpty() && useShortFormFallback){
 			String shortForm = sfp.getShortForm(IRI.create(entity.getURI()));
-			shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromCamelCase(shortForm));
-			shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromUnderscored(shortForm)).trim();
+			shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm));
+			shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim();
 			textWithWeight.put(shortForm, weight);
 		}
 		

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[DL-Learner SVN] SF.net SVN: dl-learner:[4095] trunk/components-core/src/main/java/org/dllearner/a

[DL-Learner SVN] SF.net SVN: dl-learner:[4095] trunk/components-core/src/main/java/org/dllearner/algorithms/isle