[DL-Learner SVN] SF.net SVN: dl-learner:[4158] trunk/components-core/src/main/java/org/dllearner/

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4158
          http://sourceforge.net/p/dl-learner/code/4158
Author:   lorenz_b
Date:     2013-11-21 12:36:47 +0000 (Thu, 21 Nov 2013)
Log Message:
-----------
Added text document generator.

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java	                        (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java	2013-11-21 12:36:47 UTC (rev 4158)
@@ -0,0 +1,67 @@
+package org.dllearner.algorithms.isle;
+
+import java.util.List;
+import java.util.Properties;
+
+import org.dllearner.algorithms.isle.index.TextDocument;
+import org.dllearner.algorithms.isle.index.Token;
+
+import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+import edu.stanford.nlp.util.CoreMap;
+
+public class TextDocumentGenerator {
+	// Singleton that tokenizes, POS-tags and lemmatizes text with one shared Stanford CoreNLP pipeline.
+	private static TextDocumentGenerator instance;
+	private StanfordCoreNLP pipeline;
+	// Private: instances are obtained through getInstance(). Configures the pipeline's annotators once.
+	private TextDocumentGenerator(){
+		Properties props = new Properties();
+	    props.put("annotators", "tokenize, ssplit, pos, lemma");
+	    pipeline = new StanfordCoreNLP(props);
+	}
+	// Returns the shared instance, creating it on the first call; synchronized so only one is ever built.
+	public static synchronized TextDocumentGenerator getInstance(){
+		if(instance == null){
+			instance = new TextDocumentGenerator();
+		}
+		return instance;
+	}
+	// Runs the pipeline on the given text and returns a TextDocument with one Token per CoreLabel, in order.
+	public TextDocument tag(String text) {
+		TextDocument document = new TextDocument();
+	    // create an empty Annotation just with the given text
+	    Annotation annotatedDocument = new Annotation(text);
+	    
+	    // run all configured annotators on this text
+	    pipeline.annotate(annotatedDocument);
+	    
+	    // these are all the sentences in this document
+	    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
+	    List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class);
+	    
+	    for(CoreMap sentence: sentences) {
+	    	for (CoreLabel label: sentence.get(TokensAnnotation.class)) {
+	    		// this is the text of the token
+	            String word = label.get(TextAnnotation.class);
+	            // this is the POS tag of the token
+	            String pos = label.get(PartOfSpeechAnnotation.class);
+	            // this is the lemma of the token; it is stored below as the token's stemmed form
+	            String lemma = label.get(LemmaAnnotation.class);
+	           
+	            Token token = new Token(word);
+	            token.setPOSTag(pos);
+	            token.setStemmedForm(lemma);
+	            document.add(token);
+	          }
+	    }
+		
+		return document;
+	}
+}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[DL-Learner SVN] SF.net SVN: dl-learner:[4158] trunk/components-core/src/main/java/org/dllearner/

[DL-Learner SVN] SF.net SVN: dl-learner:[4158] trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java