Thread: [DL-Learner SVN] SF.net SVN: dl-learner:[4099] trunk/components-core/src/main/java/org/dllearner/

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4099
          http://sourceforge.net/p/dl-learner/code/4099
Author:   dfleischhacker
Date:     2013-09-09 10:12:21 +0000 (Mon, 09 Sep 2013)
Log Message:
-----------
Add possibility to switch off word normalization in annotator

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java	2013-09-09 10:11:41 UTC (rev 4098)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java	2013-09-09 10:12:21 UTC (rev 4099)
@@ -4,19 +4,21 @@
 import java.util.Set;
 
 /**
- * Annotates a document using a prefix trie
+ * Annotates a document using a prefix trie.
  *
  * @author Andre Melo
  */
 public class TrieLinguisticAnnotator implements LinguisticAnnotator {
     EntityCandidatesTrie candidatesTrie;
+    private boolean normalizeWords = true;
 
     public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) {
         this.candidatesTrie = candidatesTrie;
     }
 
     /**
-     * Generates annotation based on trie's longest matching strings
+     * Generates annotation based on trie's longest matching strings. By default, the document's contents are
+     * normalized using a lemmatizer. The normalization step can be disabled using the
      *
      * @param document the document to get annotations for
      * @return the set of annotation for the given document
@@ -26,8 +28,14 @@
         String content = document.getContent();
         Set<Annotation> annotations = new HashSet<Annotation>();
         for (int i = 0; i < content.length(); i++) {
+            if (Character.isWhitespace(content.charAt(i))) {
+                continue;
+            }
             String unparsed = content.substring(i);
-            String match = candidatesTrie.getLongestMatch(LinguisticUtil.getInstance().getNormalizedForm(unparsed));
+            if (normalizeWords) {
+                unparsed = LinguisticUtil.getInstance().getNormalizedForm(unparsed);
+            }
+            String match = candidatesTrie.getLongestMatch(unparsed);
             if (match != null && !match.isEmpty()) {
                 Annotation annotation = new Annotation(document, i, match.length());
                 annotations.add(annotation);
@@ -37,4 +45,11 @@
         return annotations;
     }
 
+    /**
+     * Sets whether the document's contents should be normalized or not.
+     * @param enabled if true normalizing is enabled, otherwise disabled
+     */
+    public void setNormalizeWords(boolean enabled) {
+        normalizeWords = enabled;
+    }
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





Thread: [DL-Learner SVN] SF.net SVN: dl-learner:[4099] trunk/components-core/src/main/java/org/dllearner/

dl-learner-svn