From: <dfl...@us...> - 2013-09-09 10:12:23
|
Revision: 4099 http://sourceforge.net/p/dl-learner/code/4099 Author: dfleischhacker Date: 2013-09-09 10:12:21 +0000 (Mon, 09 Sep 2013) Log Message: ----------- Add possibility to switch off word normalization in annotator Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-09 10:11:41 UTC (rev 4098) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-09 10:12:21 UTC (rev 4099) @@ -4,19 +4,21 @@ import java.util.Set; /** - * Annotates a document using a prefix trie + * Annotates a document using a prefix trie. * * @author Andre Melo */ public class TrieLinguisticAnnotator implements LinguisticAnnotator { EntityCandidatesTrie candidatesTrie; + private boolean normalizeWords = true; public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { this.candidatesTrie = candidatesTrie; } /** - * Generates annotation based on trie's longest matching strings + * Generates annotation based on trie's longest matching strings. By default, the document's contents are + * normalized using a lemmatizer. 
The normalization step can be disabled using the {@link #setNormalizeWords(boolean)} method. * * @param document the document to get annotations for * @return the set of annotation for the given document @@ -26,8 +28,14 @@ String content = document.getContent(); Set<Annotation> annotations = new HashSet<Annotation>(); for (int i = 0; i < content.length(); i++) { + if (Character.isWhitespace(content.charAt(i))) { + continue; + } String unparsed = content.substring(i); - String match = candidatesTrie.getLongestMatch(LinguisticUtil.getInstance().getNormalizedForm(unparsed)); + if (normalizeWords) { + unparsed = LinguisticUtil.getInstance().getNormalizedForm(unparsed); + } + String match = candidatesTrie.getLongestMatch(unparsed); if (match != null && !match.isEmpty()) { Annotation annotation = new Annotation(document, i, match.length()); annotations.add(annotation); @@ -37,4 +45,11 @@ return annotations; } + /** + * Sets whether the document's contents should be normalized or not. + * @param enabled if true normalizing is enabled, otherwise disabled + */ + public void setNormalizeWords(boolean enabled) { + normalizeWords = enabled; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |