From: <lor...@us...> - 2013-02-25 11:48:03
Revision: 3902
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3902&view=rev
Author:   lorenz_b
Date:     2013-02-25 11:47:56 +0000 (Mon, 25 Feb 2013)

Log Message:
-----------
Updated Stanford model loading.

Modified Paths:
--------------
    trunk/components-ext/pom.xml
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordLemmatizer.java
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordPartOfSpeechTagger.java
    trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/util/SPARQLEndpointMetrics.java
    trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/TBSLTest.java

Modified: trunk/components-ext/pom.xml
===================================================================
--- trunk/components-ext/pom.xml	2013-02-18 14:16:54 UTC (rev 3901)
+++ trunk/components-ext/pom.xml	2013-02-25 11:47:56 UTC (rev 3902)
@@ -91,12 +91,18 @@
 		<!--END Logging Dependencies-->
 
+		<dependency>
+			<groupId>edu.stanford.nlp</groupId>
+			<artifactId>stanford-corenlp</artifactId>
+			<version>1.3.3</version>
+		</dependency>
+		<dependency>
+			<groupId>edu.stanford.nlp</groupId>
+			<artifactId>stanford-corenlp</artifactId>
+			<version>1.3.3</version>
+			<classifier>models</classifier>
+		</dependency>
 		<dependency>
-			<groupId>edu.stanford</groupId>
-			<artifactId>postagger</artifactId>
-			<version>3.0.2</version>
-		</dependency>
-		<dependency>
 			<groupId>lbj</groupId>
 			<artifactId>library</artifactId>
 			<version>1.0</version>

Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordLemmatizer.java
===================================================================
--- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordLemmatizer.java	2013-02-18 14:16:54 UTC (rev 3901)
+++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/StanfordLemmatizer.java	2013-02-25 11:47:56 UTC (rev 3902)
@@ -26,7 +26,7 @@
 
 	@Override
 	public String stem(String word, String tag) {
-		return stemmer.stem(word, tag).word();
+		return stemmer.lemma(word, tag);
 	}
 
 	@Override
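A note on the one-line change above: it matches the CoreNLP Morphology API, assuming the stemmer field is an edu.stanford.nlp.process.Morphology instance (its declaration is outside this hunk). A minimal sketch under that assumption:

import edu.stanford.nlp.process.Morphology;

public class LemmaSketch {
	public static void main(String[] args) {
		// Morphology.lemma(word, posTag) returns the lemma directly as a String,
		// so the unwrapping via .word() needed with stem(word, tag) goes away.
		Morphology stemmer = new Morphology();
		System.out.println(stemmer.lemma("houses", "NNS")); // house
		System.out.println(stemmer.lemma("bought", "VBD")); // buy
	}
}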
"tbsl/models/bidirectional-distsim-wsj-0-18.tagger"; + private StanfordCoreNLP pipeline; - private MaxentTagger tagger; - public StanfordPartOfSpeechTagger(){ - try { -// String modelPath = this.getClass().getClassLoader().getResource(MODEL).getPath(); - String modelPath = getClass().getResource("/tbsl/models/bidirectional-distsim-wsj-0-18.tagger").getPath(); -// String modelPath = Thread.currentThread().getContextClassLoader().getResource(MODEL).getFile(); - tagger = new MaxentTagger(modelPath); - } catch (IOException e) { - e.printStackTrace(); - } catch (ClassNotFoundException e) { - e.printStackTrace(); - } + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit, pos"); + pipeline = new StanfordCoreNLP(props); } @Override @@ -39,68 +32,94 @@ } @Override - public String tag(String sentence) { + public String tag(String text) { String out = ""; - ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List<CoreMap> sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + for (CoreLabel token: sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = token.get(TextAnnotation.class); + // this is the POS tag of the token + String pos = token.get(PartOfSpeechAnnotation.class); + + out += " " + word + "/" + pos; + } + } - StringReader reader = new StringReader(sentence); - List<List<HasWord>> text = MaxentTagger.tokenizeText(reader); - - if (text.size() == 1) { - tagged = tagger.tagSentence(text.get(0)); - } - - for (TaggedWord t : tagged) { - out += " " + t.toString(); - } return out.trim(); } + + @Override public List<String> tagTopK(String sentence) { return Collections.singletonList(tag(sentence)); } - public List<String> getTags(String sentence){ + public List<String> getTags(String text){ List<String> tags = new ArrayList<String>(); - ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List<CoreMap> sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + for (CoreLabel token: sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = token.get(TextAnnotation.class); + // this is the POS tag of the token + String pos = token.get(PartOfSpeechAnnotation.class); + + tags.add(pos); + } + } - StringReader reader = new StringReader(sentence); - List<List<HasWord>> text = MaxentTagger.tokenizeText(reader); - - if (text.size() == 1) { - tagged = tagger.tagSentence(text.get(0)); - } - - for(TaggedWord tW : tagged){ - tags.add(tW.tag()); - } - return tags; } @Override - public Tagging<String> getTagging(String sentence){ - ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); - - StringReader reader = new StringReader(sentence); - List<List<HasWord>> text = MaxentTagger.tokenizeText(reader); - - if (text.size() == 1) { - tagged = tagger.tagSentence(text.get(0)); - } - + 
Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/util/SPARQLEndpointMetrics.java
===================================================================
--- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/util/SPARQLEndpointMetrics.java	2013-02-18 14:16:54 UTC (rev 3901)
+++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/util/SPARQLEndpointMetrics.java	2013-02-25 11:47:56 UTC (rev 3902)
@@ -9,6 +9,7 @@
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.dllearner.core.owl.Individual;
 import org.dllearner.core.owl.NamedClass;
@@ -34,8 +35,8 @@
 	public SPARQLEndpointMetrics(SparqlEndpoint endpoint, ExtractionDBCache cache) {
 		this.endpoint = endpoint;
 		this.cache = cache;
-		cache.setFreshnessInMilliseconds(31536000000l);
-		cache.setMaxExecutionTimeInSeconds(30);
+		cache.setFreshnessInMilliseconds(Long.MAX_VALUE);//31536000000l);
+		cache.setMaxExecutionTimeInSeconds(300);
 		this.reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint), cache);
 	}
 	
@@ -214,6 +215,32 @@
 	}
 	
 	/**
+	 * Returns the number of triples where the given individual is in subject position(out-going links).
+	 * @param cls
+	 * @return
+	 */
+	public int getOccurencesInSubjectPosition(Individual ind){
+		log.trace(String.format("Computing number of occurences in subject position for %s", ind.getName()));
+		String query = String.format("SELECT (COUNT(*) AS ?cnt) WHERE {<%s> ?p ?o.}", ind.getName());
+		ResultSet rs = SparqlQuery.convertJSONtoResultSet(cache.executeSelectQuery(endpoint, query));
+		int classOccurenceCnt = rs.next().getLiteral("cnt").getInt();
+		return classOccurenceCnt;
+	}
+	
+	/**
+	 * Returns the number of triples where the given individual is in object position(in-going links).
+	 * @param cls
+	 * @return
+	 */
+	public int getOccurencesInObjectPosition(Individual ind){
+		log.trace(String.format("Computing number of occurences in object position for %s", ind.getName()));
+		String query = String.format("SELECT (COUNT(*) AS ?cnt) WHERE {?s ?p <%s>.}", ind.getName());
+		ResultSet rs = SparqlQuery.convertJSONtoResultSet(cache.executeSelectQuery(endpoint, query));
+		int classOccurenceCnt = rs.next().getLiteral("cnt").getInt();
+		return classOccurenceCnt;
+	}
+	
+	/**
 	 * Returns the number triples with the given property as predicate.
 	 * @param prop
 	 * @return
@@ -394,8 +421,9 @@
 	}
 	
 	public static void main(String[] args) {
-		SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia();
-		ExtractionDBCache cache = new ExtractionDBCache("/opt/tbsl/dbpedia_pmi_cache");
+		Logger.getLogger(SPARQLEndpointMetrics.class).setLevel(Level.DEBUG);
+		SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpediaLiveOpenLink();
+		ExtractionDBCache cache = new ExtractionDBCache("/opt/tbsl/dbpedia_pmi_cache_v2");
 		
 		String NS = "http://dbpedia.org/ontology/";
 		String NS_Res = "http://dbpedia.org/resource/";
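The two counting helpers added above are plain SPARQL COUNT queries per triple position. Roughly the same thing as a standalone Jena sketch, without the DL-Learner cache layer (endpoint and resource URI are chosen only for illustration):

import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;

public class TripleCountSketch {
	public static void main(String[] args) {
		// out-going links: triples with the individual in subject position;
		// swap the URI to the object position (?s ?p <...>) for in-going links
		String query = String.format(
				"SELECT (COUNT(*) AS ?cnt) WHERE {<%s> ?p ?o.}",
				"http://dbpedia.org/resource/Oxford");
		QueryExecution qe = QueryExecutionFactory.sparqlService(
				"http://dbpedia.org/sparql", query);
		ResultSet rs = qe.execSelect();
		QuerySolution qs = rs.next();
		System.out.println(qs.getLiteral("cnt").getInt());
		qe.close();
	}
}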
Modified: trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/TBSLTest.java
===================================================================
--- trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/TBSLTest.java	2013-02-18 14:16:54 UTC (rev 3901)
+++ trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/TBSLTest.java	2013-02-25 11:47:56 UTC (rev 3902)
@@ -3,6 +3,7 @@
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
 import java.net.URL;
 import java.util.Collections;
 
@@ -38,9 +39,9 @@
 	@Override
 	protected void setUp() throws Exception {
 		super.setUp();
-		endpoint = new SparqlEndpoint(new URL("http://lgd.aksw.org:8900/sparql"), Collections.singletonList("http://diadem.cs.ox.ac.uk"), Collections.<String>emptyList());
-//		model = ModelFactory.createOntologyModel();
-//		File dir = new File("/home/lorenz/arbeit/papers/question-answering-iswc-2012/examples/data");
+		endpoint = new SparqlEndpoint(new URL("http://[2001:638:902:2010:0:168:35:138]/sparql"), Collections.singletonList("http://diadem.cs.ox.ac.uk"), Collections.<String>emptyList());
+		model = ModelFactory.createOntologyModel();
+//		File dir = new File("/home/me/work/papers/question-answering-iswc-2012/data_v2");
 //		try {
 //			for(File f : dir.listFiles()){
 //				if(f.isFile()){
@@ -53,6 +54,7 @@
 //				}
 //			}
 //		}
+//		model.write(new FileOutputStream(dir.getAbsolutePath() + "/oxford-data.ttl"), "TURTLE", null);
 //		model.read(new FileInputStream(new File("/home/lorenz/arbeit/papers/question-answering-iswc-2012/examples/ontology.ttl")), null, "TURTLE");
 //		} catch (FileNotFoundException e) {
 //			e.printStackTrace();
@@ -88,6 +90,7 @@
 		
 		SPARQLTemplateBasedLearner2 learner = new SPARQLTemplateBasedLearner2(model, resourcesIndex, classesIndex, propertiesIndex);
 		learner.init();
+		learner.setGrammarFiles(new String[]{"tbsl/lexicon/english.lex","tbsl/lexicon/english_oxford.lex"});
 		
 		String question = "Give me all houses with more than 3 bathrooms and more than 2 bedrooms.";
 		
@@ -117,9 +120,10 @@
 		learner.setGrammarFiles(new String[]{"tbsl/lexicon/english.lex","tbsl/lexicon/english_oxford.lex"});
 		
 		String question = "Give me all houses near a school.";
-		question = "Give me all houses with more than 3 bathrooms and more than 2 bedrooms.";
-		question = "Give me all Victorian houses in Oxfordshire";
-		question = "Edwardian houses close to supermarket for less than 1,000,000 in Oxfordshire";
+		question = "Give me all houses with more than 3 bathrooms.";
+		question = "houses at walking distance from a pharmacy";
+//		question = "Give me all Victorian houses in Oxfordshire";
+//		question = "Edwardian houses close to supermarket for less than 1,000,000 in Oxfordshire";
 //		question = "Give me all family houses with more than 2 bathrooms and more than 4 bedrooms";
 		
 		learner.setQuestion(question);
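The new FileOutputStream import backs the commented-out Turtle dump of the merged test data; the underlying call is Jena's Model.write(OutputStream, lang, base). A rough sketch of that step (the output path is illustrative only):

import java.io.FileOutputStream;

import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.rdf.model.ModelFactory;

public class TurtleDumpSketch {
	public static void main(String[] args) throws Exception {
		OntModel model = ModelFactory.createOntologyModel();
		// ... read the per-file data sets into the model here, as in the commented block ...
		// serialize the merged model to a single Turtle file
		model.write(new FileOutputStream("/tmp/oxford-data.ttl"), "TURTLE", null);
	}
}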