From: <lor...@us...> - 2013-11-21 12:57:14
|
Revision: 4161 http://sourceforge.net/p/dl-learner/code/4161 Author: lorenz_b Date: 2013-11-21 12:57:10 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Cont. text document generator. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 12:51:05 UTC (rev 4160) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 12:57:10 UTC (rev 4161) @@ -19,7 +19,10 @@ public class TextDocumentGenerator { private static TextDocumentGenerator instance; + private StanfordCoreNLP pipeline; + private final String punctuationPattern = "\\p{Punct}"; + private final StopWordFilter stopWordFilter = new StopWordFilter(); private TextDocumentGenerator(){ Properties props = new Properties(); @@ -54,14 +57,22 @@ String pos = label.get(PartOfSpeechAnnotation.class); //this is the POS tag of the token String lemma = label.get(LemmaAnnotation.class); + //check if token is punctuation + boolean isPunctuation = word.matches(punctuationPattern); + //check if it is a stop word + boolean isStopWord = stopWordFilter.isStopWord(word); - Token token = new Token(word); - token.setPOSTag(pos); - token.setStemmedForm(lemma); + Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); + document.add(token); } } return document; } + + public static void main(String[] args) throws Exception { + TextDocument document = TextDocumentGenerator.getInstance().tag("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. "); + System.out.println(document); + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-11-21 12:51:05 UTC (rev 4160) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-11-21 12:57:10 UTC (rev 4161) @@ -12,11 +12,21 @@ private String rawForm; private String stemmedForm; private String posTag; + private boolean isPunctuation; + private boolean isStopWord; public Token(String rawForm) { - posTag = rawForm; + this.rawForm = rawForm; } + public Token(String rawForm, String stemmedForm, String posTag, boolean isPunctuation, boolean isStopWord) { + this.rawForm = rawForm; + this.stemmedForm = stemmedForm; + this.posTag = posTag; + this.isPunctuation = isPunctuation; + this.isStopWord = isStopWord; + } + /** * @return the rawForm */ @@ -39,6 +49,20 @@ } /** + * @return the isPunctuation + */ + public boolean isPunctuation() { + return isPunctuation; + } + + /** + * @return the isStopWord + */ + public boolean isStopWord() { + return isStopWord; + } + + /** * @param stemmedForm the stemmedForm to set */ public void setStemmedForm(String stemmedForm) { @@ -51,14 +75,28 @@ public void setPOSTag(String posTag) { this.posTag = posTag; } + + /** + * @param isPunctuation the isPunctuation to set + */ + public void setIsPunctuation(boolean isPunctuation) { + this.isPunctuation = isPunctuation; + } + + /** + * @param isStopWord the isStopWord to set + */ + public void setIsStopWord(boolean isStopWord) { + this.isStopWord = isStopWord; + } /* (non-Javadoc) * @see java.lang.Object#toString() */ @Override public String toString() { - return "Word: " + rawForm + "\n" + return "\n[Word: " + rawForm + "\n" + "Stemmed word: " + stemmedForm + "\n" - + "POS tag: " + posTag; + + "POS tag: " + posTag + "]"; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |