From: <lor...@us...> - 2013-02-25 11:48:03
Revision: 3902
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3902&view=rev
Author:   lorenz_b
Date:     2013-02-25 11:47:56 +0000 (Mon, 25 Feb 2013)

Log Message:
-----------
Updated Stanford model loading.

Modified Paths:
--------------
    trunk/components-ext/pom.xml
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordLemmatizer.java
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordPartOfSpeechTagger.java
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/util/SPARQLEndpointMetrics.java
    trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/TBSLTest.java

Modified: trunk/components-ext/pom.xml
===================================================================
--- trunk/components-ext/pom.xml	2013-02-18 14:16:54 UTC (rev 3901)
+++ trunk/components-ext/pom.xml	2013-02-25 11:47:56 UTC (rev 3902)
@@ -91,12 +91,18 @@
 		<!--END Logging Dependencies-->
 
+		<dependency>
+			<groupId>edu.stanford.nlp</groupId>
+			<artifactId>stanford-corenlp</artifactId>
+			<version>1.3.3</version>
+		</dependency>
+		<dependency>
+			<groupId>edu.stanford.nlp</groupId>
+			<artifactId>stanford-corenlp</artifactId>
+			<version>1.3.3</version>
+			<classifier>models</classifier>
+		</dependency>
 		<dependency>
-			<groupId>edu.stanford</groupId>
-			<artifactId>postagger</artifactId>
-			<version>3.0.2</version>
-		</dependency>
-		<dependency>
 			<groupId>lbj</groupId>
 			<artifactId>library</artifactId>
 			<version>1.0</version>

Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordLemmatizer.java
===================================================================
--- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordLemmatizer.java	2013-02-18 14:16:54 UTC (rev 3901)
+++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordLemmatizer.java	2013-02-25 11:47:56 UTC (rev 3902)
@@ -26,7 +26,7 @@
 
 	@Override
 	public String stem(String word, String tag) {
-		return stemmer.stem(word, tag).word();
+		return stemmer.lemma(word, tag);
 	}
 
 	@Override
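A note on the one-line change above: it matches the CoreNLP Morphology API, assuming the stemmer field is an edu.stanford.nlp.process.Morphology instance (its declaration is outside this hunk). A minimal sketch under that assumption:

import edu.stanford.nlp.process.Morphology;

public class LemmaSketch {
	public static void main(String[] args) {
		// Morphology.lemma(word, posTag) returns the lemma directly as a String,
		// so the unwrapping via .word() needed with stem(word, tag) goes away.
		Morphology stemmer = new Morphology();
		System.out.println(stemmer.lemma("houses", "NNS")); // house
		System.out.println(stemmer.lemma("bought", "VBD")); // buy
	}
}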
"tbsl/models/bidirectional-distsim-wsj-0-18.tagger"; + private StanfordCoreNLP pipeline; - private MaxentTagger tagger; - public StanfordPartOfSpeechTagger(){ - try { -// String modelPath = this.getClass().getClassLoader().getResource(MODEL).getPath(); - String modelPath = getClass().getResource("/tbsl/models/bidirectional-distsim-wsj-0-18.tagger").getPath(); -// String modelPath = Thread.currentThread().getContextClassLoader().getResource(MODEL).getFile(); - tagger = new MaxentTagger(modelPath); - } catch (IOException e) { - e.printStackTrace(); - } catch (ClassNotFoundException e) { - e.printStackTrace(); - } + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit, pos"); + pipeline = new StanfordCoreNLP(props); } @Override @@ -39,68 +32,94 @@ } @Override - public String tag(String sentence) { + public String tag(String text) { String out = ""; - ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List<CoreMap> sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + for (CoreLabel token: sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = token.get(TextAnnotation.class); + // this is the POS tag of the token + String pos = token.get(PartOfSpeechAnnotation.class); + + out += " " + word + "/" + pos; + } + } - StringReader reader = new StringReader(sentence); - List<List<HasWord>> text = MaxentTagger.tokenizeText(reader); - - if (text.size() == 1) { - tagged = tagger.tagSentence(text.get(0)); - } - - for (TaggedWord t : tagged) { - out += " " + t.toString(); - } return out.trim(); } + + @Override public List<String> tagTopK(String sentence) { return Collections.singletonList(tag(sentence)); } - public List<String> getTags(String sentence){ + public List<String> getTags(String text){ List<String> tags = new ArrayList<String>(); - ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List<CoreMap> sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + for (CoreLabel token: sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = token.get(TextAnnotation.class); + // this is the POS tag of the token + String pos = token.get(PartOfSpeechAnnotation.class); + + tags.add(pos); + } + } - StringReader reader = new StringReader(sentence); - List<List<HasWord>> text = MaxentTagger.tokenizeText(reader); - - if (text.size() == 1) { - tagged = tagger.tagSentence(text.get(0)); - } - - for(TaggedWord tW : tagged){ - tags.add(tW.tag()); - } - return tags; } @Override - public Tagging<String> getTagging(String sentence){ - ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); - - StringReader reader = new StringReader(sentence); - List<List<HasWord>> text = MaxentTagger.tokenizeText(reader); - - if (text.size() == 1) { - tagged = tagger.tagSentence(text.get(0)); - } - + 
Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/util/SPARQLEndpointMetrics.java
===================================================================
--- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/util/SPARQLEndpointMetrics.java	2013-02-18 14:16:54 UTC (rev 3901)
+++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/util/SPARQLEndpointMetrics.java	2013-02-25 11:47:56 UTC (rev 3902)
@@ -9,6 +9,7 @@
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.dllearner.core.owl.Individual;
 import org.dllearner.core.owl.NamedClass;
@@ -34,8 +35,8 @@
 	public SPARQLEndpointMetrics(SparqlEndpoint endpoint, ExtractionDBCache cache) {
 		this.endpoint = endpoint;
 		this.cache = cache;
-		cache.setFreshnessInMilliseconds(31536000000l);
-		cache.setMaxExecutionTimeInSeconds(30);
+		cache.setFreshnessInMilliseconds(Long.MAX_VALUE);//31536000000l);
+		cache.setMaxExecutionTimeInSeconds(300);
 		this.reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint), cache);
 	}
 	
@@ -214,6 +215,32 @@
 	}
 	
 	/**
+	 * Returns the number of triples where the given individual is in subject position(out-going links).
+	 * @param cls
+	 * @return
+	 */
+	public int getOccurencesInSubjectPosition(Individual ind){
+		log.trace(String.format("Computing number of occurences in subject position for %s", ind.getName()));
+		String query = String.format("SELECT (COUNT(*) AS ?cnt) WHERE {<%s> ?p ?o.}", ind.getName());
+		ResultSet rs = SparqlQuery.convertJSONtoResultSet(cache.executeSelectQuery(endpoint, query));
+		int classOccurenceCnt = rs.next().getLiteral("cnt").getInt();
+		return classOccurenceCnt;
+	}
+	
+	/**
+	 * Returns the number of triples where the given individual is in object position(in-going links).
+	 * @param cls
+	 * @return
+	 */
+	public int getOccurencesInObjectPosition(Individual ind){
+		log.trace(String.format("Computing number of occurences in object position for %s", ind.getName()));
+		String query = String.format("SELECT (COUNT(*) AS ?cnt) WHERE {?s ?p <%s>.}", ind.getName());
+		ResultSet rs = SparqlQuery.convertJSONtoResultSet(cache.executeSelectQuery(endpoint, query));
+		int classOccurenceCnt = rs.next().getLiteral("cnt").getInt();
+		return classOccurenceCnt;
+	}
+	
+	/**
 	 * Returns the number triples with the given property as predicate.
 	 * @param prop
 	 * @return
@@ -394,8 +421,9 @@
 	}
 	
 	public static void main(String[] args) {
-		SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia();
-		ExtractionDBCache cache = new ExtractionDBCache("/opt/tbsl/dbpedia_pmi_cache");
+		Logger.getLogger(SPARQLEndpointMetrics.class).setLevel(Level.DEBUG);
+		SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpediaLiveOpenLink();
+		ExtractionDBCache cache = new ExtractionDBCache("/opt/tbsl/dbpedia_pmi_cache_v2");
 		
 		String NS = "http://dbpedia.org/ontology/";
 		String NS_Res = "http://dbpedia.org/resource/";
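The two counting helpers added above are plain SPARQL COUNT queries per triple position. Roughly the same thing as a standalone Jena sketch, without the DL-Learner cache layer (endpoint and resource URI are chosen only for illustration):

import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;

public class TripleCountSketch {
	public static void main(String[] args) {
		// out-going links: triples with the individual in subject position;
		// swap the URI to the object position (?s ?p <...>) for in-going links
		String query = String.format(
				"SELECT (COUNT(*) AS ?cnt) WHERE {<%s> ?p ?o.}",
				"http://dbpedia.org/resource/Oxford");
		QueryExecution qe = QueryExecutionFactory.sparqlService(
				"http://dbpedia.org/sparql", query);
		ResultSet rs = qe.execSelect();
		QuerySolution qs = rs.next();
		System.out.println(qs.getLiteral("cnt").getInt());
		qe.close();
	}
}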
Modified: trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/TBSLTest.java
===================================================================
--- trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/TBSLTest.java	2013-02-18 14:16:54 UTC (rev 3901)
+++ trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/TBSLTest.java	2013-02-25 11:47:56 UTC (rev 3902)
@@ -3,6 +3,7 @@
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
 import java.net.URL;
 import java.util.Collections;
 
@@ -38,9 +39,9 @@
 	@Override
 	protected void setUp() throws Exception {
 		super.setUp();
-		endpoint = new SparqlEndpoint(new URL("http://lgd.aksw.org:8900/sparql"), Collections.singletonList("http://diadem.cs.ox.ac.uk"), Collections.<String>emptyList());
-//		model = ModelFactory.createOntologyModel();
-//		File dir = new File("/home/lorenz/arbeit/papers/question-answering-iswc-2012/examples/data");
+		endpoint = new SparqlEndpoint(new URL("http://[2001:638:902:2010:0:168:35:138]/sparql"), Collections.singletonList("http://diadem.cs.ox.ac.uk"), Collections.<String>emptyList());
+		model = ModelFactory.createOntologyModel();
+//		File dir = new File("/home/me/work/papers/question-answering-iswc-2012/data_v2");
 //		try {
 //			for(File f : dir.listFiles()){
 //				if(f.isFile()){
@@ -53,6 +54,7 @@
 //				}
 //			}
 //		}
+//		model.write(new FileOutputStream(dir.getAbsolutePath() + "/oxford-data.ttl"), "TURTLE", null);
 //		model.read(new FileInputStream(new File("/home/lorenz/arbeit/papers/question-answering-iswc-2012/examples/ontology.ttl")), null, "TURTLE");
 //		} catch (FileNotFoundException e) {
 //			e.printStackTrace();
@@ -88,6 +90,7 @@
 		
 		SPARQLTemplateBasedLearner2 learner = new SPARQLTemplateBasedLearner2(model, resourcesIndex, classesIndex, propertiesIndex);
 		learner.init();
+		learner.setGrammarFiles(new String[]{"tbsl/lexicon/english.lex","tbsl/lexicon/english_oxford.lex"});
 		
 		String question = "Give me all houses with more than 3 bathrooms and more than 2 bedrooms.";
 		
@@ -117,9 +120,10 @@
 		learner.setGrammarFiles(new String[]{"tbsl/lexicon/english.lex","tbsl/lexicon/english_oxford.lex"});
 		
 		String question = "Give me all houses near a school.";
-		question = "Give me all houses with more than 3 bathrooms and more than 2 bedrooms.";
-		question = "Give me all Victorian houses in Oxfordshire";
-		question = "Edwardian houses close to supermarket for less than 1,000,000 in Oxfordshire";
+		question = "Give me all houses with more than 3 bathrooms.";
+		question = "houses at walking distance from a pharmacy";
+//		question = "Give me all Victorian houses in Oxfordshire";
+//		question = "Edwardian houses close to supermarket for less than 1,000,000 in Oxfordshire";
 //		question = "Give me all family houses with more than 2 bathrooms and more than 4 bedrooms";
 		
 		learner.setQuestion(question);
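The new FileOutputStream import backs the commented-out Turtle dump of the merged test data; the underlying call is Jena's Model.write(OutputStream, lang, base). A rough sketch of that step (the output path is illustrative only):

import java.io.FileOutputStream;

import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.rdf.model.ModelFactory;

public class TurtleDumpSketch {
	public static void main(String[] args) throws Exception {
		OntModel model = ModelFactory.createOntologyModel();
		// ... read the per-file data sets into the model here, as in the commented block ...
		// serialize the merged model to a single Turtle file
		model.write(new FileOutputStream("/tmp/oxford-data.ttl"), "TURTLE", null);
	}
}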