[DL-Learner SVN] SF.net SVN: dl-learner:[4155] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4155
          http://sourceforge.net/p/dl-learner/code/4155
Author:   lorenz_b
Date:     2013-11-21 11:54:23 +0000 (Thu, 21 Nov 2013)
Log Message:
-----------
Added POS tags to text documents in constructor.

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java	                        (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java	2013-11-21 11:54:23 UTC (rev 4155)
@@ -0,0 +1,59 @@
+package org.dllearner.algorithms.isle;
+
+import java.util.List;
+import java.util.Properties;
+
+import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+import edu.stanford.nlp.util.CoreMap;
+
+/** Singleton that POS-tags plain text through a shared Stanford CoreNLP pipeline. */
+public class StanfordPartOfSpeechTagger {
+	private static StanfordPartOfSpeechTagger instance;
+	private final StanfordCoreNLP pipeline;
+
+	/** Creates the pipeline, configured with the annotators tokenize, ssplit and pos. */
+	private StanfordPartOfSpeechTagger(){
+		Properties props = new Properties();
+		props.put("annotators", "tokenize, ssplit, pos");
+		pipeline = new StanfordCoreNLP(props);
+	}
+
+	/** Returns the shared instance, creating it on the first call. */
+	public static synchronized StanfordPartOfSpeechTagger getInstance(){
+		if(instance == null){
+			instance = new StanfordPartOfSpeechTagger();
+		}
+		return instance;
+	}
+
+	/**
+	 * Runs the pipeline on the given text and labels every token with its POS tag.
+	 *
+	 * @param text the text to annotate
+	 * @return the tokens as space-separated {@code word/TAG} pairs, without leading or trailing whitespace
+	 */
+	public String tag(String text) {
+		// StringBuilder instead of String +=, which re-copies the whole result for every token
+		StringBuilder out = new StringBuilder();
+
+		Annotation document = new Annotation(text);
+		pipeline.annotate(document);
+
+		// a CoreMap is essentially a Map that uses class objects as keys; there is one per sentence
+		List<CoreMap> sentences = document.get(SentencesAnnotation.class);
+		for(CoreMap sentence: sentences) {
+			for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
+				String word = token.get(TextAnnotation.class);
+				String pos = token.get(PartOfSpeechAnnotation.class);
+				out.append(' ').append(word).append('/').append(pos);
+			}
+		}
+		return out.toString().trim();
+	}
+}

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java	2013-11-21 11:39:19 UTC (rev 4154)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java	2013-11-21 11:54:23 UTC (rev 4155)
@@ -1,5 +1,7 @@
 package org.dllearner.algorithms.isle.index;
 
+import org.dllearner.algorithms.isle.StanfordPartOfSpeechTagger;
+
 /**
  * A simple text document without further formatting or markup.
  *
@@ -10,7 +12,6 @@
     private String rawContent;
 	private String posTaggedContent;
 
-
     /**
      * Initializes a text document with the given raw content. Internally, the content is cleaned up so that it only
      * contains letters adhering to the regular expression pattern [A-Za-z].
@@ -19,26 +20,24 @@
      */
     public TextDocument(String content) {
         this.rawContent = content;
-        this.content = content.toLowerCase();
-        this.content = this.content.replaceAll("[^a-z ]", " ");
-        this.content = this.content.replaceAll("\\s{2,}", " ");
-        this.content = this.content.trim();
+		
+		// build cleaned content: lower-cased, only a-z and single spaces, trimmed
+        buildCleanedContent();
+        
+        // build POS tagged content from rawContent via StanfordPartOfSpeechTagger
+        buildPOSTaggedContent();
     }
     
-    /**
-     * Initializes a text document with the given raw content. Internally, the content is cleaned up so that it only
-     * contains letters adhering to the regular expression pattern [A-Za-z].
-     *
-     * @param content the raw content of this text document
-     */
-    public TextDocument(String content, String posTaggedContent) {
-        this.rawContent = content;
-		this.posTaggedContent = posTaggedContent;
-        this.content = content.toLowerCase();
+    private void buildCleanedContent(){ // derives the normalized content from rawContent
+    	this.content = rawContent.toLowerCase(); // not 'content': with no parameter in scope that is the still-unassigned field
         this.content = this.content.replaceAll("[^a-z ]", " ");
         this.content = this.content.replaceAll("\\s{2,}", " ");
         this.content = this.content.trim();
     }
+    
+    private void buildPOSTaggedContent(){ // fills posTaggedContent by tagging rawContent with the shared tagger
+    	this.posTaggedContent = StanfordPartOfSpeechTagger.getInstance().tag(rawContent);
+    }
 
     @Override
     public String getContent() {

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[DL-Learner SVN] SF.net SVN: dl-learner:[4155] trunk/components-core/src/main/java/org/ dllearner/a

[DL-Learner SVN] SF.net SVN: dl-learner:[4155] trunk/components-core/src/main/java/org/ dllearner/algorithms/isle