From: <lor...@us...> - 2011-02-26 11:49:36
|
Revision: 2665 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2665&view=rev Author: lorenz_b Date: 2011-02-26 11:49:29 +0000 (Sat, 26 Feb 2011) Log Message: ----------- Continued evaluation script. Added a class to preprocess a given question by extracting words, removing stop words, and stemming them. Modified Paths: -------------- trunk/autosparql/src/main/java/org/dllearner/autosparql/evaluation/EvaluationWithNLQueriesScript.java trunk/autosparql/src/main/java/org/dllearner/autosparql/server/ExampleFinder.java trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/DBpediaLuceneIndexCreator.java trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/LuceneSearch.java trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedStatementFilter.java Added Paths: ----------- trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/QuestionProcessor.java Modified: trunk/autosparql/src/main/java/org/dllearner/autosparql/evaluation/EvaluationWithNLQueriesScript.java =================================================================== --- trunk/autosparql/src/main/java/org/dllearner/autosparql/evaluation/EvaluationWithNLQueriesScript.java 2011-02-25 11:23:58 UTC (rev 2664) +++ trunk/autosparql/src/main/java/org/dllearner/autosparql/evaluation/EvaluationWithNLQueriesScript.java 2011-02-26 11:49:29 UTC (rev 2665) @@ -5,12 +5,15 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; +import java.io.StringReader; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Hashtable; @@ -25,9 +28,17 @@ import javax.xml.parsers.ParserConfigurationException; import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.FileAppender; +import 
org.apache.log4j.Layout; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.PatternLayout; +import org.apache.lucene.analysis.StopAnalyzer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.util.Version; +import org.apache.solr.analysis.StopFilterFactory; import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer; @@ -37,13 +48,18 @@ import org.dllearner.autosparql.client.exception.SPARQLQueryException; import org.dllearner.autosparql.server.ExampleFinder; import org.dllearner.autosparql.server.Generalisation; +import org.dllearner.autosparql.server.NBR; import org.dllearner.autosparql.server.exception.TimeOutException; +import org.dllearner.autosparql.server.search.DBpediaSchemaIndex; +import org.dllearner.autosparql.server.search.LuceneSearch; +import org.dllearner.autosparql.server.search.QuestionProcessor; import org.dllearner.autosparql.server.util.SPARQLEndpointEx; import org.dllearner.kb.sparql.ExtractionDBCache; import org.dllearner.kb.sparql.SparqlEndpoint; import org.dllearner.kb.sparql.SparqlQuery; import org.dllearner.sparqlquerygenerator.operations.lgg.LGGGeneratorImpl; import org.dllearner.sparqlquerygenerator.util.ExactMatchFilter; +import org.dllearner.sparqlquerygenerator.util.QuestionBasedStatementFilter; import org.w3c.dom.DOMException; import org.w3c.dom.Document; import org.w3c.dom.Element; @@ -52,12 +68,19 @@ import com.hp.hpl.jena.query.ResultSet; +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.Sentence; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.tagger.maxent.MaxentTagger; + public class EvaluationWithNLQueriesScript { private static Logger logger = Logger.getLogger(EvaluationWithNLQueriesScript.class); private static final 
boolean USE_SYNONYMS = false; private static final String SOLR_SERVER_URL = "http://139.18.2.164:8983/solr/dbpediaCore/"; private static final String QUERY_ANSWERS_FILE_PATH = "evaluation/dbpedia-train_cleaned.xml"; + private static final String SCHEMA_FILE_PATH = "evaluation/dbpedia_schema.owl"; + private static final String LUCENE_INDEX_DIRECTORY = "/opt/autosparql/index"; private static final SparqlEndpoint ENDPOINT = SparqlEndpoint.getEndpointDBpediaLiveAKSW(); private static final int NR_OF_POS_START_EXAMPLES_COUNT = 3; @@ -76,7 +99,12 @@ private ExampleFinder exFinder; + private DBpediaSchemaIndex schemaIndex; + private LuceneSearch luceneSearch; + private QuestionProcessor qProcessor = new QuestionProcessor(); + + public EvaluationWithNLQueriesScript(){ try { server = new CommonsHttpSolrServer(SOLR_SERVER_URL); @@ -88,6 +116,8 @@ exFinder = new ExampleFinder(new SPARQLEndpointEx( new SparqlEndpoint(new URL("http://lod.openlinksw.com/sparql"), Collections.singletonList("http://dbpedia.org"), Collections.<String>emptyList()), null, null, predicateFilters), selectCache, constructCache); + schemaIndex = new DBpediaSchemaIndex(SCHEMA_FILE_PATH); + luceneSearch = new LuceneSearch(LUCENE_INDEX_DIRECTORY); } catch (MalformedURLException e) { e.printStackTrace(); } @@ -137,13 +167,13 @@ logger.info("Done."); } - private Set<String> getResourcesByNLQuery(String query){ + private Set<String> getResourcesByNLQuery(String question){ logger.info("Getting Top " + TOP_K + " resources related to question with Solr..."); Set<String> resources = new HashSet<String>(); QueryResponse response; try { ModifiableSolrParams params = new ModifiableSolrParams(); - params.set("q", query); + params.set("q", question); params.set("rows", TOP_K); response = server.query(params); for(SolrDocument d : response.getResults()){ @@ -157,6 +187,14 @@ return resources; } + private List<String> getRelevantWords(String question){ + return qProcessor.getRelevantWords(question); +// Properties props 
= new Properties(); +// props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); +// StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + + } + private List<String> getResourcesByWikipedia(String query) { logger.info("Getting Top " + TOP_K + " resources related to question with Wikipedia..."); long startTime = System.currentTimeMillis(); @@ -179,7 +217,6 @@ rd.close(); XMLPagesParser parser = new XMLPagesParser(sb.toString()); parser.parse(); - String title; for (Page page : parser.getPagesList()) { resources.add("http://dbpedia.org/resource/" + page.getTitle().replace(" ", "_")); } @@ -195,6 +232,15 @@ return resources; } + private List<String> getSchemaElementsByQuery(String query){ + logger.info("Getting Top " + TOP_K + " schema elements related to question with Lucene..."); + long startTime = System.currentTimeMillis(); + List<String> elements = schemaIndex.getResources(query); + logger.info("Got " + elements.size() + " elements in " + (System.currentTimeMillis()-startTime) + "ms."); + logger.info(elements); + return elements; + } + private Set<String> getResourcesBySPARQLQuery(String query){ logger.info("Sending query..."); long startTime = System.currentTimeMillis(); @@ -212,6 +258,7 @@ Set<String> answers; List<String> examples; Set<String> relatedResources; + List<String> relevantWords; int i = 1; for(String question : question2Answers.keySet()){ logger.info(getNewQuestionString(i++, question)); @@ -221,12 +268,23 @@ logger.info("Target query: \n" + targetQuery); answers = question2Answers.get(question); logger.info("Answers (" + answers.size() + "): " + answers); - examples = getResourcesByWikipedia(question); - relatedResources = getResourcesByNLQuery(question.substring(0, question.length()-1)); + //preprocess question to extract only relevant words and set them as filter for statements + relevantWords = getRelevantWords(question); + exFinder.setStatementFilter(new QuestionBasedStatementFilter(new HashSet<String>(relevantWords))); 
+ question = ""; + for(String word : relevantWords){ + question += " " + word; + } + question.trim(); + logger.info("Rebuilt question string: " + question); + //get examples + examples = getResourcesByWikipedia(question);//luceneSearch.getResources(question) + //get resources which are relevant for query and add them as filter for objects +// relatedResources = getResourcesByNLQuery(question.substring(0, question.length()-1)); +// relatedResources.addAll(getSchemaElementsByQuery(question.substring(0, question.length()-1))); +// exFinder.setObjectFilter(new ExactMatchFilter(relatedResources)); - exFinder.setObjectFilter(new ExactMatchFilter(relatedResources)); - - //select some positive example and negative examples + //select some positive and negative examples List<String> posExamples = new ArrayList<String>(); List<String> negExamples = new ArrayList<String>(); for(String ex : examples){ @@ -303,8 +361,16 @@ Logger.getLogger(Generalisation.class).setLevel(Level.OFF); Logger.getLogger(LGGGeneratorImpl.class).setLevel(Level.OFF); Logger.getRootLogger().removeAllAppenders(); - ConsoleAppender appender = new ConsoleAppender(new PatternLayout("%m%n")); + Layout layout = new PatternLayout("%m%n"); + ConsoleAppender appender = new ConsoleAppender(layout); Logger.getRootLogger().addAppender(appender); + FileAppender fileAppender = new FileAppender( + layout, "log/evaluation.log", false); + fileAppender.setThreshold(Level.DEBUG); + Logger.getRootLogger().addAppender(fileAppender); + Logger.getLogger(NBR.class).setLevel(Level.DEBUG); + + new EvaluationWithNLQueriesScript().evaluate(); } Modified: trunk/autosparql/src/main/java/org/dllearner/autosparql/server/ExampleFinder.java =================================================================== --- trunk/autosparql/src/main/java/org/dllearner/autosparql/server/ExampleFinder.java 2011-02-25 11:23:58 UTC (rev 2664) +++ trunk/autosparql/src/main/java/org/dllearner/autosparql/server/ExampleFinder.java 2011-02-26 11:49:29 UTC 
(rev 2665) @@ -30,6 +30,7 @@ import com.hp.hpl.jena.query.QuerySolution; import com.hp.hpl.jena.query.ResultSetRewindable; import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.Selector; import com.hp.hpl.jena.vocabulary.RDFS; public class ExampleFinder { @@ -454,6 +455,10 @@ queryTreeCache.setObjectFilter(filter); } + public void setStatementFilter(Selector filter){ + queryTreeCache.setStatementFilter(filter); + } + public String getCurrentQuery(){ return currentQuery; } Modified: trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/DBpediaLuceneIndexCreator.java =================================================================== --- trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/DBpediaLuceneIndexCreator.java 2011-02-25 11:23:58 UTC (rev 2664) +++ trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/DBpediaLuceneIndexCreator.java 2011-02-26 11:49:29 UTC (rev 2665) @@ -178,7 +178,7 @@ ps.setString(1, uri); ResultSet rs = ps.executeQuery(); if(rs.next()){ - pageRank = rs.getInt("pagerank"); + pageRank = rs.getInt("rank"); } } catch (SQLException e) { e.printStackTrace(); Modified: trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/LuceneSearch.java =================================================================== --- trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/LuceneSearch.java 2011-02-25 11:23:58 UTC (rev 2664) +++ trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/LuceneSearch.java 2011-02-26 11:49:29 UTC (rev 2665) @@ -85,7 +85,7 @@ Document d = searcher.doc(doc.doc); String uri = d.get("uri"); String label = d.get("label"); - String comment = d.get("abstract"); + String comment = d.get("comment"); String imageURL = d.get("imageURL"); examples.add(new Example(uri, label, imageURL, comment)); } Added: trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/QuestionProcessor.java 
=================================================================== --- trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/QuestionProcessor.java (rev 0) +++ trunk/autosparql/src/main/java/org/dllearner/autosparql/server/search/QuestionProcessor.java 2011-02-26 11:49:29 UTC (rev 2665) @@ -0,0 +1,110 @@ +package org.dllearner.autosparql.server.search; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.log4j.Logger; + +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.process.Morphology; +import edu.stanford.nlp.tagger.maxent.MaxentTagger; + +public class QuestionProcessor { + + private final Logger logger = Logger.getLogger(QuestionProcessor.class); + + private MaxentTagger tagger; + private final List<String> stopWords = Arrays.asList( + "a", "all", "an", "and", "are", "as", "at", "be", "but", "by", + "for", "he", "if", "in", "into", "is", "it", "me", + "no", "not", "of", "on", "or", "she", "such", + "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with" + ); + + public QuestionProcessor(){ + try { + tagger = new MaxentTagger("src/main/resources/de/simba/ner/models/left3words-wsj-0-18.tagger"); + } catch (IOException e) { + e.printStackTrace(); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + } + + public List<String> getRelevantWords(String question){ + logger.info("Processing question \"" + question + "\"..."); + //tokenize question + List<String> words = getWords(question); + logger.info("Extracted words: " + words); + //remove stop words + removeStopWords(words); + logger.info("After removed stop words: " + words); + //stem words + words = getStemmedWords(words); + logger.info("After stemming: " + words); + + return words; + } + + private List<String> getWords(String question){ + 
List<String> words = new ArrayList<String>(); + List<ArrayList<? extends HasWord>> sentences = tagger.tokenizeText(new BufferedReader(new StringReader(question))); + for (ArrayList<? extends HasWord> sentence : sentences) { + ArrayList<TaggedWord> tSentence = tagger.tagSentence(sentence); + String nounPhrase = ""; + boolean firstWord = true; + for(TaggedWord tWord : tSentence){ + //ignore first word if it is a verb + if(firstWord){ + if(tWord.tag().startsWith("V")){ + continue; + } + firstWord = false; + } + //if words belongs to noun phrase treat them as one single term + if(tWord.tag().equals("NNP")){ + nounPhrase += " " + tWord.word(); + } else { + if(!nounPhrase.isEmpty()){ + words.add(nounPhrase.trim()); + nounPhrase = ""; + } + //ignore punctuation signs + if(!tWord.tag().equals(".")){ + words.add(tWord.word()); + } + } + + } + if(!nounPhrase.isEmpty()){ + words.add(nounPhrase.trim() ); + nounPhrase = ""; + } + } + return words; + } + + private void removeStopWords(List<String> words){ + words.removeAll(stopWords); + } + + private List<String> getStemmedWords(List<String> words) { + List<String> stemmedWords = new ArrayList<String>(); + Morphology morpho = new Morphology(); + for (String w : words) { + if(!(w.indexOf(" ") > 0)){ + stemmedWords.add(morpho.stem(w)); + } else { + stemmedWords.add(w); + } + } + return stemmedWords; + } + +} Modified: trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedStatementFilter.java =================================================================== --- trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedStatementFilter.java 2011-02-25 11:23:58 UTC (rev 2664) +++ trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedStatementFilter.java 2011-02-26 11:49:29 UTC (rev 2665) @@ -1,5 +1,7 @@ package org.dllearner.sparqlquerygenerator.util; +import java.util.Set; + import 
uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; @@ -11,12 +13,13 @@ public class QuestionBasedStatementFilter implements Selector { - private String question; + private Set<String> questionWords; private AbstractStringMetric metric; private double threshold = 0.7; - public QuestionBasedStatementFilter(String question){ - this.question = question; + + public QuestionBasedStatementFilter(Set<String> questionWords){ + this.questionWords = questionWords; metric = new QGramsDistance(); } @@ -24,17 +27,29 @@ @Override public boolean test(Statement s) { String predicate = s.getPredicate().getURI().substring(s.getPredicate().getURI().lastIndexOf("/")); - String object; + String object = null; if(s.getObject().isURIResource()){ object = s.getObject().asResource().getURI(); object = object.substring(object.lastIndexOf("/")); } else if(s.getObject().isLiteral()){ object = s.getObject().asLiteral().getLexicalForm(); } + if(isSimiliar2QuestionWord(object) || isSimiliar2QuestionWord(predicate)){ + return true; + } return false; } + private boolean isSimiliar2QuestionWord(String s){ + for(String word : questionWords){ + if(areSimiliar(word, s)){ + return true; + } + } + return false; + } + private boolean areSimiliar(String s1, String s2){ float sim = metric.getSimilarity(s1, s2); return sim >= threshold; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |