From: <lor...@us...> - 2011-06-14 04:11:29
|
Revision: 2870 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2870&view=rev Author: lorenz_b Date: 2011-06-14 04:11:22 +0000 (Tue, 14 Jun 2011) Log Message: ----------- Switched to new SOLR libs and index built on this version. Made NER case insensitive. Some small changes in Eval script. Set max query execution time to 10 s. Modified Paths: -------------- trunk/components-ext/pom.xml trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Preprocessor.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/Evaluation.java Modified: trunk/components-ext/pom.xml =================================================================== --- trunk/components-ext/pom.xml 2011-06-13 21:32:50 UTC (rev 2869) +++ trunk/components-ext/pom.xml 2011-06-14 04:11:22 UTC (rev 2870) @@ -46,7 +46,7 @@ <dependency> <groupId>org.apache.solr</groupId> <artifactId>solr-core</artifactId> - <version>1.4.1</version> + <version>3.1.0</version> <type>jar</type> <scope>compile</scope> </dependency> Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner.java 2011-06-13 21:32:50 UTC (rev 2869) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner.java 2011-06-14 04:11:22 UTC (rev 2870) @@ -8,6 +8,7 @@ import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -57,12 +58,13 @@ private Monitor mon = MonitorFactory.getTimeMonitor("stbl"); private static final int TOP_K = 5; - private static final String SOLR_SERVER_URL = "http://139.18.2.173:8080/apache-solr-1.4.1"; + private static final String SOLR_SERVER_URL = "http://139.18.2.173:8080/apache-solr-3.1.0"; private static final int RECURSION_DEPTH = 2; private Ranking ranking = Ranking.SIMILARITY; private boolean useRemoteEndpointValidation = true; private boolean stopIfQueryResultNotEmpty = true; + private int maxQueriesPerTemplate = 25; private SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpediaLiveAKSW(); private ExtractionDBCache cache = new ExtractionDBCache("cache"); @@ -83,13 +85,16 @@ private Map<String, List<String>> learnedSPARQLQueries; private Set<Template> templates; - private Collection<? extends Query> sparqlQueryCandidates; + private Collection<Query> sparqlQueryCandidates; + private Map<Template, Collection<? extends Query>> template2Queries; private Map<String, String> prefixMap; private Lemmatizer lemmatizer = new LingPipeLemmatizer();// StanfordLemmatizer(); + private int maxQueryExecutionTimeInSeconds = 10; + public SPARQLTemplateBasedLearner(){ resource_index = new SolrSearch(SOLR_SERVER_URL + "/dbpedia_resources"); resource_index.setHitsPerPage(TOP_K); @@ -113,6 +118,8 @@ modelGenenerator = new ModelGenerator(endpoint, predicateFilters); templateGenerator = new Templator(); + + cache.setMaxExecutionTimeInSeconds(maxQueryExecutionTimeInSeconds); } public void setEndpoint(SparqlEndpoint endpoint){ @@ -131,6 +138,14 @@ this.useRemoteEndpointValidation = useRemoteEndpointValidation; } + public int getMaxQueryExecutionTimeInSeconds() { + return maxQueryExecutionTimeInSeconds; + } + + public void setMaxQueryExecutionTimeInSeconds(int maxQueryExecutionTimeInSeconds) { + this.maxQueryExecutionTimeInSeconds = maxQueryExecutionTimeInSeconds; + } + public void setRanking(Ranking ranking) { this.ranking = ranking; } @@ -154,8 +169,9 @@ logger.info(t); } - //generate SPARQL query candidates - sparqlQueryCandidates = getSPARQLQueryCandidates(templates, ranking); + //generate SPARQL query candidates, but select only a fixed number per template + template2Queries = getSPARQLQueryCandidates(templates, ranking); + sparqlQueryCandidates = getNBestQueryCandidatesForTemplates(template2Queries); //test candidates if(useRemoteEndpointValidation){ //on remote endpoint @@ -183,7 +199,10 @@ //generate SPARQL query candidates logger.info("Generating SPARQL query candidates..."); mon.start(); - sparqlQueryCandidates = getSPARQLQueryCandidates(templates, ranking); + Map<Template, Collection<? extends Query>> template2Queries = getSPARQLQueryCandidates(templates, ranking); + sparqlQueryCandidates = getNBestQueryCandidatesForTemplates(template2Queries); + + mon.stop(); logger.info("Done in " + mon.getLastValue() + "ms."); @@ -208,6 +227,10 @@ return queries; } + public Map<Template, Collection<? extends Query>> getTemplates2SPARQLQueries(){ + return template2Queries; + } + private Model getWorkingModel(List<String> resources){ logger.info("Generating local model..."); mon.start(); @@ -223,7 +246,7 @@ return workingModel; } - private Collection<? extends Query> getSPARQLQueryCandidates(Set<Template> templates, Ranking ranking){ + private Map<Template,Collection<? extends Query>> getSPARQLQueryCandidates(Set<Template> templates, Ranking ranking){ switch(ranking){ case LUCENE: return getSPARQLQueryCandidatesSortedByLucene(templates); case SIMILARITY: return getSPARQLQueryCandidatesSortedBySimilarity(templates); @@ -232,13 +255,15 @@ } } - private Set<Query> getSPARQLQueryCandidates(Set<Template> templates){ + private Map<Template, Collection<? extends Query>> getSPARQLQueryCandidates(Set<Template> templates){ logger.info("Generating candidate SPARQL queries..."); mon.start(); Set<Query> queries = new HashSet<Query>(); - + Map<Template, Collection<? extends Query>> template2Queries = new HashMap<Template, Collection<? extends Query>>(); for(Template template : templates){ + queries = new HashSet<Query>(); queries.add(template.getQuery()); + template2Queries.put(template, queries); for(Slot slot : template.getSlots()){ Set<Query> tmp = new HashSet<Query>(); String var = slot.getAnchor(); @@ -258,7 +283,7 @@ } mon.stop(); logger.info("Done in " + mon.getLastValue() + "ms."); - return queries; + return template2Queries; } private Map<String, Float> getCandidateRatedSPARQLQueries(Set<Template> templates){ @@ -290,15 +315,18 @@ return query2Score; } - private Set<RatedQuery> getSPARQLQueryCandidatesSortedByLucene(Set<Template> templates){ + private Map<Template, Collection<? extends Query>> getSPARQLQueryCandidatesSortedByLucene(Set<Template> templates){ logger.info("Generating candidate SPARQL queries..."); mon.start(); SortedSet<RatedQuery> ratedQueries = new TreeSet<RatedQuery>(); + Map<Template, Collection<? extends Query>> template2Queries = new HashMap<Template, Collection<? extends Query>>(); Query query; for(Template template : templates){ query = template.getQuery(); + ratedQueries = new TreeSet<RatedQuery>(); ratedQueries.add(new RatedQuery(query, 0)); + template2Queries.put(template, ratedQueries); for(Slot slot : template.getSlots()){ Set<RatedQuery> tmp = new HashSet<RatedQuery>(); String var = slot.getAnchor(); @@ -319,16 +347,19 @@ } mon.stop(); logger.info("Done in " + mon.getLastValue() + "ms."); - return ratedQueries; + return template2Queries; } - private List<Query> getSPARQLQueryCandidatesSortedBySimilarity(Set<Template> templates){ + private Map<Template, Collection<? extends Query>> getSPARQLQueryCandidatesSortedBySimilarity(Set<Template> templates){ logger.info("Generating candidate SPARQL queries..."); mon.start(); List<Query> queries = new ArrayList<Query>(); + Map<Template, Collection<? extends Query>> template2Queries = new HashMap<Template, Collection<? extends Query>>(); List<String> uriCandidates; for(Template template : templates){ + queries = new ArrayList<Query>(); queries.add(template.getQuery()); + template2Queries.put(template, queries); for(Slot slot : template.getSlots()){ List<Query> tmp = new ArrayList<Query>(); String var = slot.getAnchor(); @@ -364,7 +395,7 @@ } mon.stop(); logger.info("Done in " + mon.getLastValue() + "ms."); - return queries; + return template2Queries; } private Set<String> getCandidateURIs(Slot slot){ @@ -520,6 +551,22 @@ return uri2Score; } + private List<Query> getNBestQueryCandidatesForTemplates(Map<Template, Collection<? extends Query>> template2Queries){ + List<Query> queries = new ArrayList<Query>(); + for(Entry<Template, Collection<? extends Query>> entry : template2Queries.entrySet()){ + int max = Math.min(maxQueriesPerTemplate, entry.getValue().size()); + int i = 0; + for(Query q : entry.getValue()){ + queries.add(q); + i++; + if(i == max){ + break; + } + } + } + return queries; + } + private void validateAgainstRemoteEndpoint(Collection<? extends Query> queries){ List<String> queryStrings = new ArrayList<String>(); for(Query query : queries){ @@ -609,7 +656,7 @@ // Logger.getLogger(DefaultHttpParams.class).setLevel(Level.OFF); // Logger.getLogger(HttpClient.class).setLevel(Level.OFF); // Logger.getLogger(HttpMethodBase.class).setLevel(Level.OFF); - String question = "Who are the presidents of the United States?"; + String question = "Give me all school types."; // String question = "Give me all films starring Brad Pitt"; SPARQLTemplateBasedLearner learner = new SPARQLTemplateBasedLearner(); SparqlEndpoint endpoint = new SparqlEndpoint(new URL("http://live.dbpedia.org/sparql"), Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Preprocessor.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Preprocessor.java 2011-06-13 21:32:50 UTC (rev 2869) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/ltag/parser/Preprocessor.java 2011-06-14 04:11:22 UTC (rev 2870) @@ -15,7 +15,7 @@ static final String[] genericReplacements = { "\"", "", "'", "", "[!?.,;]", "" }; static final String[] englishReplacements = { "don't", "do not", "doesn't", "does not" }; - static NER ner = new LingPipeNER(); + static NER ner = new LingPipeNER(false);//not case sensitive best solution? public Preprocessor() { } Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java 2011-06-13 21:32:50 UTC (rev 2869) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java 2011-06-14 04:11:22 UTC (rev 2870) @@ -28,7 +28,7 @@ } public LingPipeNER(boolean caseSensitive) { - this(caseSensitive, true); + this(caseSensitive, false); } public LingPipeNER(boolean caseSensitive, boolean allMatches) { Modified: trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/Evaluation.java =================================================================== --- trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/Evaluation.java 2011-06-13 21:32:50 UTC (rev 2869) +++ trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/Evaluation.java 2011-06-14 04:11:22 UTC (rev 2870) @@ -1,15 +1,14 @@ package org.dllearner.algorithm.tbsl; -import java.io.BufferedWriter; import java.io.File; -import java.io.FileWriter; import java.io.IOException; import java.io.UnsupportedEncodingException; -import java.io.Writer; import java.net.URLDecoder; +import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; @@ -29,6 +28,7 @@ import org.apache.log4j.PatternLayout; import org.dllearner.algorithm.tbsl.learning.NoTemplateFoundException; import org.dllearner.algorithm.tbsl.learning.SPARQLTemplateBasedLearner; +import org.dllearner.algorithm.tbsl.sparql.Query; import org.dllearner.algorithm.tbsl.sparql.Template; import org.dllearner.algorithm.tbsl.util.LatexWriter; import org.w3c.dom.DOMException; @@ -148,6 +148,9 @@ answer = endpoint.executeAsk(query); } else { answer = new HashSet<String>(); + if(!query.contains("LIMIT")){ + query = query + " LIMIT 200"; + } ResultSet rs = endpoint.executeSelect(query); String variable; if(rs.getResultVars().size() == 1){ @@ -182,7 +185,7 @@ public void run(){ - int topN2Print = 25; + int topN2Print = 10; int questionId; @@ -193,7 +196,7 @@ latex.beginDocument(); int i = 0; for(Entry<Integer, String> entry : id2Question.entrySet()){ - if(i++ == 1)break; +// if(i++ == 1)break; try { questionId = entry.getKey(); question = entry.getValue(); @@ -226,10 +229,8 @@ if(learnedQuery != null){ learnedAnswer = getAnswerForSPARQLQuery(learnedQuery, "y"); } - //get the generated SPARQL query candidates - List<String> queries = stbl.getGeneratedSPARQLQueries(); //get the used templates - Set<Template> templates = stbl.getTemplates(); + List<Template> templates = new ArrayList<Template>(stbl.getTemplates()); //start output //write templates subsection @@ -242,29 +243,33 @@ } latex.endEnumeration(); + //get the generated SPARQL query candidates + Map<Template, Collection<? extends Query>> template2Queries = stbl.getTemplates2SPARQLQueries(); + //write generated queries subsection - latex.beginSubsection("Top " + topN2Print + " generated queries (max. " + queries.size() + ")"); - logger.info("LEARNED QUERIES(#" + queries.size() + "):\n"); - int cnt = 1; - if(!queries.isEmpty()){ - latex.beginEnumeration(); + latex.beginSubsection("Top " + topN2Print + " generated queries per template"); + int k = 1; + List<Query> queries; + for(Template t : templates){ + latex.beginSubSubsection("Template " + k); + queries = new ArrayList<Query>(template2Queries.get(t)); + if(!queries.isEmpty()){ + latex.beginEnumeration(); + } + //print top n queries to latex file + int max = Math.min(topN2Print, queries.size()); + for(int j = 0; j < max; j++){ + latex.beginEnumerationItem(); + latex.addListing(queries.get(j).toString()); + latex.endEnumerationItem(); + } + if(!queries.isEmpty()){ + latex.endEnumeration(); + } + k++; } - //print queries to log file - for(String q : queries){ - logger.info("QUERY " + cnt++ + ":\n" + q + "\n"); - logger.info("--------"); - } - //print top n queries to latex file - int max = Math.min(topN2Print, queries.size()); - for(int j = 0; j < max; j++){ - latex.beginEnumerationItem(); - latex.addListing(queries.get(j)); - latex.endEnumerationItem(); - } - if(!queries.isEmpty()){ - latex.endEnumeration(); - } + //write solution subsection if exists if(learnedQuery != null){ latex.beginSubsection("Solution"); @@ -299,7 +304,7 @@ latex.beginDocument(); int i = 0; for(Entry<Integer, String> entry : id2Question.entrySet()){ - if(i++ == 1)break; +// if(i++ == 1)break; try { questionId = entry.getKey(); question = entry.getValue(); @@ -340,7 +345,7 @@ latex.endEnumeration(); //write generated queries subsection - latex.beginSubsection("Top " + topN2Print + " generated queries (max. " + queries.size() + ")"); + latex.beginSubsection("Top " + topN2Print + " generated queries per template"); logger.info("LEARNED QUERIES(#" + queries.size() + "):\n"); int cnt = 1; if(!queries.isEmpty()){ @@ -385,13 +390,14 @@ if(target.contains(s)){ s = "\\textcolor{green}{" + s + "}"; } - sb.append(URLDecoder.decode(s, "UTF-8").replace("_", "\\_").replace("http://dbpedia.org/resource/", "")).append(", "); +// sb.append(URLDecoder.decode(s, "UTF-8").replace("_", "\\_").replace("http://dbpedia.org/resource/", "")).append(", "); + sb.append(s.replace("_", "\\_").replace("&", "\\&").replace("%", "\\%").replace("#", "\\#").replace("http://dbpedia.org/resource/", "")).append(", "); if(i % 2 == 0){ sb.append("\n"); } i++; } - } catch (UnsupportedEncodingException e) { + } catch (Exception e) { e.printStackTrace(); } return sb.toString(); @@ -407,13 +413,14 @@ try { int i = 1; for(String s : (Collection<String>)learnedAnswer){ - sb.append(URLDecoder.decode(s, "UTF-8").replace("_", "\\_").replace("http://dbpedia.org/resource/", "")).append(", "); +// sb.append(URLDecoder.decode(s, "UTF-8").replace("_", "\\_").replace("http://dbpedia.org/resource/", "")).append(", "); + sb.append(s.replace("_", "\\_").replace("&", "\\&").replace("%", "\\%").replace("#", "\\#").replace("http://dbpedia.org/resource/", "")).append(", "); if(i % 2 == 0){ sb.append("\n"); } i++; } - } catch (UnsupportedEncodingException e) { + } catch (Exception e) { e.printStackTrace(); } return sb.toString(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |