From: <dfl...@us...> - 2013-09-04 09:45:43
|
Revision: 4051
          http://sourceforge.net/p/dl-learner/code/4051
Author:   dfleischhacker
Date:     2013-09-04 09:45:38 +0000 (Wed, 04 Sep 2013)
Log Message:
-----------
Add n-gram generating annotator

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java	(rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java	2013-09-04 09:45:38 UTC (rev 4051)
@@ -0,0 +1,78 @@
+package org.dllearner.algorithms.isle.index;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Generates word n-grams
+ * @author Daniel Fleischhacker
+ */
+public class NGramGeneratingAnnotator implements LinguisticAnnotator {
+    private int length;
+
+    /**
+     * Initializes the annotator to generate word n-grams of the given length ({@code length} words per n-gram)
+     * @param length length of the single n-grams
+     */
+    public NGramGeneratingAnnotator(int length) {
+        this.length = length;
+    }
+
+    @Override
+    public Set<Annotation> annotate(Document document) {
+        String text = document.getContent();
+
+
+        Pattern legalChars = Pattern.compile("[A-Za-z]");
+
+        // clean up all texts
+        int curWordStartPosition = 0;
+        StringBuilder curWord = new StringBuilder();
+        ArrayList<String> wordsInText = new ArrayList<String>();
+        ArrayList<Integer> wordStart = new ArrayList<Integer>();
+        ArrayList<Integer> wordEnd = new ArrayList<Integer>();
+
+        int i = 0;
+        while (i < text.length()) {
+            Character curChar = text.charAt(i);
+            if (!legalChars.matcher(curChar.toString()).matches()) {
+                if (curWord.length() == 0) {
+                    curWordStartPosition = i + 1;
+                    i++;
+                    continue;
+                }
+                // current word finished
+                wordsInText.add(curWord.toString());
+                wordStart.add(curWordStartPosition);
+                wordEnd.add(i);
+                curWord = new StringBuilder();
+                curWordStartPosition = i + 1;
+            }
+            else {
+                curWord.append(curChar);
+            }
+            i++;
+        }
+
+        HashSet<Annotation> annotations = new HashSet<Annotation>();
+
+        i = 0;
+        while (i < wordsInText.size() - (length-1)) {
+            StringBuilder sb = new StringBuilder();
+            int curStart = wordStart.get(i);
+            int lastEnd = wordEnd.get(i);
+            for (int j = 1; j < length; j++) {
+                sb.append(wordsInText.get(i + j));
+                lastEnd = wordEnd.get(i + j);
+            }
+            String nGram = sb.toString().trim();
+            System.out.println(nGram);
+            annotations.add(new Annotation(document, curStart, lastEnd - curStart));
+            i++;
+        }
+
+        return annotations;
+    }
+}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-04 09:55:38
|
Revision: 4053
          http://sourceforge.net/p/dl-learner/code/4053
Author:   dfleischhacker
Date:     2013-09-04 09:55:35 +0000 (Wed, 04 Sep 2013)
Log Message:
-----------
Remove debugging output

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java	2013-09-04 09:52:57 UTC (rev 4052)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java	2013-09-04 09:55:35 UTC (rev 4053)
@@ -67,7 +67,6 @@
                 lastEnd = wordEnd.get(i + j);
             }
             String nGram = sb.toString().trim();
-            System.out.println(nGram);
             annotations.add(new Annotation(document, curStart, lastEnd - curStart));
             i++;
         }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:53:16
|
Revision: 4168
          http://sourceforge.net/p/dl-learner/code/4168
Author:   dfleischhacker
Date:     2013-11-21 13:53:13 +0000 (Thu, 21 Nov 2013)
Log Message:
-----------
Remove unused NGramGeneratingAnnotator

Removed Paths:
-------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java

Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java	2013-11-21 13:52:45 UTC (rev 4167)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java	2013-11-21 13:53:13 UTC (rev 4168)
@@ -1,76 +0,0 @@
-package org.dllearner.algorithms.isle.index;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.regex.Pattern;
-
-/**
- * Generates word n-grams
- * @author Daniel Fleischhacker
- */
-public class NGramGeneratingAnnotator implements LinguisticAnnotator {
-    private int length;
-
-    /**
-     * Initializes the annotator to generate word n-grams of the given length ({@code length} words per n-gram)
-     * @param length length of the single n-grams
-     */
-    public NGramGeneratingAnnotator(int length) {
-        this.length = length;
-    }
-
-    @Override
-    public Set<Annotation> annotate(Document document) {
-        String text = document.getContent();
-
-        Pattern legalChars = Pattern.compile("[A-Za-z]");
-
-        // clean up all texts
-        int curWordStartPosition = 0;
-        StringBuilder curWord = new StringBuilder();
-        ArrayList<String> wordsInText = new ArrayList<String>();
-        ArrayList<Integer> wordStart = new ArrayList<Integer>();
-        ArrayList<Integer> wordEnd = new ArrayList<Integer>();
-
-        int i = 0;
-        while (i < text.length()) {
-            Character curChar = text.charAt(i);
-            if (!legalChars.matcher(curChar.toString()).matches()) {
-                if (curWord.length() == 0) {
-                    curWordStartPosition = i + 1;
-                    i++;
-                    continue;
-                }
-                // current word finished
-                wordsInText.add(curWord.toString());
-                wordStart.add(curWordStartPosition);
-                wordEnd.add(i);
-                curWord = new StringBuilder();
-                curWordStartPosition = i + 1;
-            }
-            else {
-                curWord.append(curChar);
-            }
-            i++;
-        }
-
-        HashSet<Annotation> annotations = new HashSet<Annotation>();
-
-        i = 0;
-        while (i < wordsInText.size() - (length-1)) {
-            StringBuilder sb = new StringBuilder();
-            int curStart = wordStart.get(i);
-            int lastEnd = wordEnd.get(i);
-            for (int j = 1; j < length; j++) {
-                sb.append(wordsInText.get(i + j));
-                lastEnd = wordEnd.get(i + j);
-            }
-            String nGram = sb.toString().trim();
-            annotations.add(new Annotation(document, curStart, lastEnd - curStart));
-            i++;
-        }
-
-        return annotations;
-    }
-}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |