Revision: 4099
http://sourceforge.net/p/dl-learner/code/4099
Author: dfleischhacker
Date: 2013-09-09 10:12:21 +0000 (Mon, 09 Sep 2013)
Log Message:
-----------
Add possibility to switch off word normalization in annotator
Modified Paths:
--------------
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-09 10:11:41 UTC (rev 4098)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-09 10:12:21 UTC (rev 4099)
@@ -4,19 +4,21 @@
import java.util.Set;
/**
- * Annotates a document using a prefix trie
+ * Annotates a document using a prefix trie.
*
* @author Andre Melo
*/
public class TrieLinguisticAnnotator implements LinguisticAnnotator {
EntityCandidatesTrie candidatesTrie;
+ private boolean normalizeWords = true;
public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) {
this.candidatesTrie = candidatesTrie;
}
/**
- * Generates annotation based on trie's longest matching strings
+ * Generates annotation based on trie's longest matching strings. By default, the document's contents are
+ * normalized using a lemmatizer. The normalization step can be disabled using {@link #setNormalizeWords(boolean)}.
*
* @param document the document to get annotations for
* @return the set of annotation for the given document
@@ -26,8 +28,14 @@
String content = document.getContent();
Set<Annotation> annotations = new HashSet<Annotation>();
for (int i = 0; i < content.length(); i++) {
+ if (Character.isWhitespace(content.charAt(i))) {
+ continue;
+ }
String unparsed = content.substring(i);
- String match = candidatesTrie.getLongestMatch(LinguisticUtil.getInstance().getNormalizedForm(unparsed));
+ if (normalizeWords) {
+ unparsed = LinguisticUtil.getInstance().getNormalizedForm(unparsed);
+ }
+ String match = candidatesTrie.getLongestMatch(unparsed);
if (match != null && !match.isEmpty()) {
Annotation annotation = new Annotation(document, i, match.length());
annotations.add(annotation);
@@ -37,4 +45,11 @@
return annotations;
}
+ /**
+ * Sets whether the document's contents should be normalized or not.
+ * @param enabled if {@code true}, normalization is enabled; otherwise it is disabled
+ */
+ public void setNormalizeWords(boolean enabled) {
+ normalizeWords = enabled;
+ }
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|