From: <dfl...@us...> - 2013-09-04 09:45:43
|
Revision: 4051 http://sourceforge.net/p/dl-learner/code/4051 Author: dfleischhacker Date: 2013-09-04 09:45:38 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Add n-gram generating annotator Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java 2013-09-04 09:45:38 UTC (rev 4051) @@ -0,0 +1,78 @@ +package org.dllearner.algorithms.isle.index; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Pattern; + +/** + * Generates word n-grams + * @author Daniel Fleischhacker + */ +public class NGramGeneratingAnnotator implements LinguisticAnnotator { + private int length; + + /** + * Initializes the annotator to generate word n-grams of the given length ({@code length} words per n-gram) + * @param length length of the single n-grams + */ + public NGramGeneratingAnnotator(int length) { + this.length = length; + } + + @Override + public Set<Annotation> annotate(Document document) { + String text = document.getContent(); + + + Pattern legalChars = Pattern.compile("[A-Za-z]"); + + // clean up all texts + int curWordStartPosition = 0; + StringBuilder curWord = new StringBuilder(); + ArrayList<String> wordsInText = new ArrayList<String>(); + ArrayList<Integer> wordStart = new ArrayList<Integer>(); + ArrayList<Integer> wordEnd = new ArrayList<Integer>(); + + int i = 0; + while (i < text.length()) { + Character curChar = text.charAt(i); + if (!legalChars.matcher(curChar.toString()).matches()) { + if (curWord.length() == 0) { + curWordStartPosition = i + 1; + i++; + continue; + } + // current word finished + wordsInText.add(curWord.toString()); + wordStart.add(curWordStartPosition); + wordEnd.add(i); + curWord = new StringBuilder(); + curWordStartPosition = i + 1; + } + else { + curWord.append(curChar); + } + i++; + } + + HashSet<Annotation> annotations = new HashSet<Annotation>(); + + i = 0; + while (i < wordsInText.size() - (length-1)) { + StringBuilder sb = new StringBuilder(); + int curStart = wordStart.get(i); + int lastEnd = wordEnd.get(i); + for (int j = 1; j < length; j++) { + sb.append(wordsInText.get(i + j)); + lastEnd = wordEnd.get(i + j); + } + String nGram = sb.toString().trim(); + System.out.println(nGram); + annotations.add(new Annotation(document, curStart, lastEnd - curStart)); + i++; + } + + return annotations; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |