[DL-Learner SVN] SF.net SVN: dl-learner:[4192] trunk/components-core/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4192
          http://sourceforge.net/p/dl-learner/code/4192
Author:   lorenz_b
Date:     2013-12-03 12:41:34 +0000 (Tue, 03 Dec 2013)
Log Message:
-----------
Made semantic index serializable.

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java
    trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java
    trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java
    trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java	2013-12-03 09:40:14 UTC (rev 4191)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java	2013-12-03 12:41:34 UTC (rev 4192)
@@ -4,6 +4,7 @@
 package org.dllearner.algorithms.isle.index;
 
 
+import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -12,7 +13,7 @@
  * @author Lorenz Buehmann
  *
  */
-public class Annotation {
+public class Annotation implements Serializable{
 	
 	private Document referencedDocument;
     private ArrayList<Token> tokens;

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java	2013-12-03 09:40:14 UTC (rev 4191)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java	2013-12-03 12:41:34 UTC (rev 4192)
@@ -4,6 +4,9 @@
 package org.dllearner.algorithms.isle.index;
 
 import java.io.Serializable;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
 
 import com.google.common.collect.ComparisonChain;
 
@@ -159,9 +162,9 @@
 
         Token token = (Token) o;
 
-//        if (!posTag.equals(token.posTag)) {
-//            return false;
-//        }
+        if (!WordTypeComparator.sameWordType(posTag, token.posTag)) {
+            return false;
+        }
         if (!stemmedForm.equals(token.stemmedForm)) {
             return false;
         }
@@ -172,7 +175,7 @@
     @Override
     public int hashCode() {
         int result = stemmedForm.hashCode();
-//        result = 31 * result + posTag.hashCode();
+        result = 31 * result + WordTypeComparator.hashCode(posTag);
         return result;
     }
 

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java	2013-12-03 09:40:14 UTC (rev 4191)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java	2013-12-03 12:41:34 UTC (rev 4192)
@@ -18,10 +18,28 @@
 	 * @return
 	 */
 	public static boolean sameWordType(String posTag1, String posTag2){
-		if(posTag1.startsWith("NN") && posTag2.startsWith("NN") ||
-				posTag1.startsWith("V") && posTag2.startsWith("V")){
+		if(posTag1.startsWith("NN") && posTag2.startsWith("NN") || //nouns
+			posTag1.startsWith("V") && posTag2.startsWith("V") || //verbs
+			posTag1.startsWith("JJ") && posTag2.startsWith("JJ") || //adjectives
+			posTag1.startsWith("RB") && posTag2.startsWith("RB"))  //adverbs
+		{
 			return true;
+		} else {
+			return posTag1.equals(posTag2);
 		}
-		return false;
 	}
+	
+	public static int hashCode(String posTag){
+		if(posTag.startsWith("NN")){//nouns
+			return "NN".hashCode();
+		} else if(posTag.startsWith("V")){//verbs
+			return "V".hashCode();
+		} else if(posTag.startsWith("JJ")){//adjectives
+			return "JJ".hashCode();
+		} else if(posTag.startsWith("RB")){//adverbs
+			return "RB".hashCode();
+		} else {
+			return posTag.hashCode();
+		}
+	}
 }

Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java
===================================================================
--- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java	2013-12-03 09:40:14 UTC (rev 4191)
+++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java	2013-12-03 12:41:34 UTC (rev 4192)
@@ -4,20 +4,15 @@
 package org.dllearner.algorithms.isle;
 
 import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.ObjectOutputStream;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-import java.util.Properties;
 import java.util.Set;
 import java.util.SortedSet;
 
 import org.dllearner.algorithms.celoe.CELOE;
-import org.dllearner.algorithms.isle.index.TextDocument;
 import org.dllearner.algorithms.isle.index.semantic.SemanticIndex;
 import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator;
 import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric;
@@ -47,15 +42,6 @@
 
 import com.google.common.collect.Sets;
 
-import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
-import edu.stanford.nlp.ling.CoreLabel;
-import edu.stanford.nlp.pipeline.Annotation;
-import edu.stanford.nlp.pipeline.StanfordCoreNLP;
-import edu.stanford.nlp.util.CoreMap;
-
 /**
  * Experimental setup:
  * 
@@ -86,54 +72,21 @@
 	private String testFolder = "experiments/logs/";
 	
 	private OWLOntology ontology;
-	private Set<TextDocument> documents;
+	private Set<String> documents;
 	
 	private boolean initialized = false;
 	private RhoDRDown operator;
-	protected StanfordCoreNLP pipeline;
 
 	
 	protected abstract OWLOntology getOntology();
-	protected abstract Set<TextDocument> getDocuments();
+	protected abstract Set<String> getDocuments();
 	
 	/**
 	 * 
 	 */
 	public Experiment() {
-		// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution 
-	    Properties props = new Properties();
-	    props.put("annotators", "tokenize, ssplit, pos");
-	    pipeline = new StanfordCoreNLP(props);
 	}
 	
-	protected String getPOSTaggedText(String text){
-	    // create an empty Annotation just with the given text
-	    Annotation document = new Annotation(text);
-	    
-	    // run all Annotators on this text
-	    pipeline.annotate(document);
-	    
-	    // these are all the sentences in this document
-	    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
-	    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
-	    
-	    StringBuilder sb = new StringBuilder();
-	    for(CoreMap sentence: sentences) {
-	      // traversing the words in the current sentence
-	      // a CoreLabel is a CoreMap with additional token-specific methods
-	      for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
-	        // this is the text of the token
-	        String word = token.get(TextAnnotation.class);
-	        // this is the POS tag of the token
-	        String pos = token.get(PartOfSpeechAnnotation.class);
-	        
-	        sb.append(word).append("/").append(pos).append(" ");
-	      }
-	      
-	    }
-	    return sb.toString();
-	}
-	
 	private void initIfNecessary() {
 		if(!initialized){
 			ontology = getOntology();
@@ -141,13 +94,6 @@
 			
 			// build semantic index
 			SemanticIndex semanticIndex = SemanticIndexGenerator.generateIndex(documents, ontology, false);
-			try {
-				ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream("semantic-index.ser"));
-				oos.writeObject(semanticIndex);
-				oos.close();
-			} catch (IOException e1) {
-				e1.printStackTrace();
-			}
 			
 			// set the relevance metric
 			relevance = new PMIRelevanceMetric(semanticIndex);

Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java
===================================================================
--- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java	2013-12-03 09:40:14 UTC (rev 4191)
+++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java	2013-12-03 12:41:34 UTC (rev 4192)
@@ -90,14 +90,14 @@
                 new URL("http://gold.linkeddata.org/data/bible/chapter_index.zip"));
     }
 	
-	private Set<TextDocument> createDocuments(){
-		Set<TextDocument> documents = new HashSet<TextDocument>();
+	private Set<String> createDocuments(){
+		Set<String> documents = new HashSet<String>();
 		File folder = new File(testFolder+"corpus/");
 		for (File file  : folder.listFiles()) {
 			if(!file.isDirectory() && !file.isHidden()){
 				try {
 					String text = Files.toString(file, Charsets.UTF_8);
-					documents.add(TextDocumentGenerator.getInstance().generateDocument(text));
+					documents.add(text);
 				} catch (IOException e) {
 					e.printStackTrace();
 				}
@@ -106,8 +106,8 @@
 		return documents;
 	}
 
-    private Set<TextDocument> createBibleDocuments() throws IOException {
-        Set<TextDocument> documents = new HashSet<TextDocument>();
+    private Set<String> createBibleDocuments() throws IOException {
+        Set<String> documents = new HashSet<String>();
         RemoteDataProvider bibleByChapter = new RemoteDataProvider(
                 new URL("http://gold.linkeddata.org/data/bible/split_by_chapter.zip"));
         File folder = bibleByChapter.getLocalDirectory();
@@ -115,7 +115,7 @@
             if(!file.isDirectory() && !file.isHidden()){
                 try {
                     String text = Files.toString(file, Charsets.UTF_8);
-                    documents.add(TextDocumentGenerator.getInstance().generateDocument(text));
+                    documents.add(text);
                 } catch (IOException e) {
                     e.printStackTrace();
                 }
@@ -191,9 +191,9 @@
         EntityCandidateGenerator ecg = new TrieEntityCandidateGenerator(ontology, ect);
         SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wsd, ecg, linguisticAnnotator);
 
-        Set<TextDocument> docs = createDocuments();
-        for (TextDocument doc : docs) {
-            AnnotatedDocument annotated = semanticAnnotator.processDocument(doc);
+        Set<String> docs = createDocuments();
+        for (String doc : docs) {
+            AnnotatedDocument annotated = semanticAnnotator.processDocument(TextDocumentGenerator.getInstance().generateDocument(doc));
             System.out.println(annotated);
         }
     }
@@ -208,9 +208,9 @@
         EntityCandidateGenerator ecg = new TrieEntityCandidateGenerator(ontology, ect);
         SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wsd, ecg, linguisticAnnotator);
 
-        Set<TextDocument> docs = createDocuments();
-        for (TextDocument doc : docs) {
-            AnnotatedDocument annotated = semanticAnnotator.processDocument(doc);
+        Set<String> docs = createDocuments();
+        for (String doc : docs) {
+            AnnotatedDocument annotated = semanticAnnotator.processDocument(TextDocumentGenerator.getInstance().generateDocument(doc));
             System.out.println(annotated);
         }
     }

Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java
===================================================================
--- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java	2013-12-03 09:40:14 UTC (rev 4191)
+++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java	2013-12-03 12:41:34 UTC (rev 4192)
@@ -8,12 +8,9 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.HashSet;
-import java.util.List;
-import java.util.Properties;
 import java.util.Set;
 
 import org.dllearner.algorithms.isle.index.RemoteDataProvider;
-import org.dllearner.algorithms.isle.index.TextDocument;
 import org.dllearner.core.owl.NamedClass;
 import org.semanticweb.owlapi.apibinding.OWLManager;
 import org.semanticweb.owlapi.model.IRI;
@@ -22,23 +19,8 @@
 import org.semanticweb.owlapi.model.OWLOntologyManager;
 
 import com.google.common.base.Charsets;
-import com.google.common.collect.Sets;
 import com.google.common.io.Files;
 
-import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
-import edu.stanford.nlp.ling.CoreLabel;
-import edu.stanford.nlp.pipeline.Annotation;
-import edu.stanford.nlp.pipeline.StanfordCoreNLP;
-import edu.stanford.nlp.trees.Tree;
-import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
-import edu.stanford.nlp.trees.semgraph.SemanticGraph;
-import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
-import edu.stanford.nlp.util.CoreMap;
-
 /**
  * @author Lorenz Buehmann
  *
@@ -73,10 +55,8 @@
 	 * @see org.dllearner.algorithms.isle.Experiment#getDocuments()
 	 */
 	@Override
-	protected Set<TextDocument> getDocuments() {
-		Set<TextDocument> documents = new HashSet<TextDocument>();
-		File taggedFolder = new File("tmp/tagged");
-		taggedFolder.mkdirs();
+	protected Set<String> getDocuments() {
+		Set<String> documents = new HashSet<String>();
         try {
 			RemoteDataProvider bibleByChapter = new RemoteDataProvider(
 			        new URL("http://gold.linkeddata.org/data/bible/split_by_chapter.zip"));
@@ -85,9 +65,10 @@
 			    if(!file.isDirectory() && !file.isHidden()){
 			        try {
 			            String text = Files.toString(file, Charsets.UTF_8);
-//			            String posTagged = getPOSTaggedText(text);
-//			            Files.write(posTagged, new File(taggedFolder, file.getName() + ".tagged"), Charsets.UTF_8);
-//			            documents.add(TextDocumentGenerator.getInstance().generateDocument(text));
+			            text = text.trim();
+			            if(!text.isEmpty()){
+			            	documents.add(text);
+			            }
 			        } catch (IOException e) {
 			            e.printStackTrace();
 			        }
@@ -98,9 +79,8 @@
 		} catch (IOException e) {
 			e.printStackTrace();
 		}
-        documents.clear();
-        TextDocument doc = TextDocumentGenerator.getInstance().generateDocument("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain");
-        documents.add(doc);
+//        documents.clear();
+//        documents.add("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain");
         return documents;
 	}
 	

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.