From: <lor...@us...> - 2013-11-21 11:54:26
|
Revision: 4155 http://sourceforge.net/p/dl-learner/code/4155 Author: lorenz_b Date: 2013-11-21 11:54:23 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Added POS tags to text documents in constructor. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java 2013-11-21 11:54:23 UTC (rev 4155) @@ -0,0 +1,59 @@ +package org.dllearner.algorithms.isle; + +import java.util.List; +import java.util.Properties; + +import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.util.CoreMap; + +public class StanfordPartOfSpeechTagger { + + private static StanfordPartOfSpeechTagger instance; + private StanfordCoreNLP pipeline; + + private StanfordPartOfSpeechTagger(){ + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit, pos"); + pipeline = new StanfordCoreNLP(props); + } + + public static synchronized StanfordPartOfSpeechTagger getInstance(){ + if(instance == null){ + instance = new StanfordPartOfSpeechTagger(); + } + return instance; + } + + public String tag(String text) { + String out = ""; + + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List<CoreMap> sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + for (CoreLabel token: sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = token.get(TextAnnotation.class); + // this is the POS tag of the token + String pos = token.get(PartOfSpeechAnnotation.class); + + out += " " + word + "/" + pos; + } + } + + return out.trim(); + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 11:39:19 UTC (rev 4154) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 11:54:23 UTC (rev 4155) @@ -1,5 +1,7 @@ package org.dllearner.algorithms.isle.index; +import org.dllearner.algorithms.isle.StanfordPartOfSpeechTagger; + /** * A simple text document without further formatting or markup. * @@ -10,7 +12,6 @@ private String rawContent; private String posTaggedContent; - /** * Initializes a text document with the given raw content. Internally, the content is cleaned up so that it only * contains letters adhering to the regular expression pattern [A-Za-z]. @@ -19,26 +20,24 @@ */ public TextDocument(String content) { this.rawContent = content; - this.content = content.toLowerCase(); - this.content = this.content.replaceAll("[^a-z ]", " "); - this.content = this.content.replaceAll("\\s{2,}", " "); - this.content = this.content.trim(); + + //build cleaned content + buildCleanedContent(); + + //build POS tagged content + buildPOSTaggedContent(); } - /** - * Initializes a text document with the given raw content. Internally, the content is cleaned up so that it only - * contains letters adhering to the regular expression pattern [A-Za-z]. - * - * @param content the raw content of this text document - */ - public TextDocument(String content, String posTaggedContent) { - this.rawContent = content; - this.posTaggedContent = posTaggedContent; - this.content = content.toLowerCase(); + private void buildCleanedContent(){ + this.content = content.toLowerCase(); this.content = this.content.replaceAll("[^a-z ]", " "); this.content = this.content.replaceAll("\\s{2,}", " "); this.content = this.content.trim(); } + + private void buildPOSTaggedContent(){ + this.posTaggedContent = StanfordPartOfSpeechTagger.getInstance().tag(rawContent); + } @Override public String getContent() { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |