From: <lor...@us...> - 2011-09-28 16:02:31
|
Revision: 3289 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3289&view=rev Author: lorenz_b Date: 2011-09-28 16:02:20 +0000 (Wed, 28 Sep 2011) Log Message: ----------- Added new index for properties based on NLP pattern extracted with the BOA framework from Daniel and Axel. Modified Paths: -------------- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/WordNet.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/Search.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/SolrSearch.java trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java trunk/components-ext/src/main/resources/tbsl/tbsl.properties Added Paths: ----------- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/HierarchicalSolrSearch.java Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner.java 2011-09-27 13:16:47 UTC (rev 3288) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/learning/SPARQLTemplateBasedLearner.java 2011-09-28 16:02:20 UTC (rev 3289) @@ -82,6 +82,7 @@ private SolrSearch resource_index; private SolrSearch class_index; private SolrSearch property_index; + private SolrSearch boa_pattern_property_index; private ModelGenerator modelGenenerator; private Templator templateGenerator; @@ -127,14 +128,23 @@ private void init(Options options){ String resourcesIndexUrl = options.fetch("solr.resources.url"); - resource_index = new SolrSearch(resourcesIndexUrl); + String resourcesIndexSearchField = options.fetch("solr.resources.searchfield"); + resource_index = new SolrSearch(resourcesIndexUrl, resourcesIndexSearchField); String classesIndexUrl = options.fetch("solr.classes.url"); - class_index = new SolrSearch(classesIndexUrl); + String classesIndexSearchField = options.fetch("solr.classes.searchfield"); + class_index = new SolrSearch(classesIndexUrl, classesIndexSearchField); String propertiesIndexUrl = options.fetch("solr.properties.url"); - property_index = new SolrSearch(propertiesIndexUrl); + String propertiesIndexSearchField = options.fetch("solr.properties.searchfield"); + property_index = new SolrSearch(propertiesIndexUrl, propertiesIndexSearchField); + String boaPatternIndexUrl = options.fetch("solr.boa.properties.url"); + String boaPatternIndexSearchField = options.fetch("solr.boa.properties.searchfield"); + boa_pattern_property_index = new SolrSearch(boaPatternIndexUrl, boaPatternIndexSearchField); + + int maxIndexResults = Integer.parseInt(options.fetch("solr.query.limit"), 10); + maxQueryExecutionTimeInSeconds = Integer.parseInt(options.get("sparql.query.maxExecutionTimeInSeconds", "20")); cache.setMaxExecutionTimeInSeconds(maxQueryExecutionTimeInSeconds); @@ -493,21 +503,47 @@ if(slot.getSlotType() == SlotType.RESOURCE){ words = slot.getWords(); } else { - words = pruneList(slot.getWords());//getLemmatizedWords(slot.getWords()); +// words = pruneList(slot.getWords());//getLemmatizedWords(slot.getWords()); + words = pruneList(slot.getWords()); } - for(String word : words){ - tmp = new TreeSet<String>(new StringSimilarityComparator(word)); - uris = uriCache.get(word); - if(uris == null){ -// uris = index.getResources("label:\"" + word + "\"~0.7"); - uris = index.getResources("label:" + word + "~0.5"); - uriCache.put(word, uris); + if(slot.getSlotType() == SlotType.PROPERTY || slot.getSlotType() == SlotType.SYMPROPERTY){ + for(String word : words){ + tmp = new TreeSet<String>(new StringSimilarityComparator(word)); + uris = uriCache.get(word); + index = boa_pattern_property_index; + if(uris == null){ + uris = index.getResources(word); + uriCache.put(word, uris); + } + index = property_index; + if(uris.size() < 10){ + uris.addAll(index.getResources(word)); + } + if(uris.size() < 10){ + uris.addAll(index.getResources("" + word + "~0.8")); + } + tmp.addAll(uris); + sortedURIs.addAll(tmp); + tmp.clear(); } - tmp.addAll(uris); - sortedURIs.addAll(tmp); - tmp.clear(); + } else { + for(String word : words){ + tmp = new TreeSet<String>(new StringSimilarityComparator(word)); + uris = uriCache.get(word); + if(uris == null){ + uris = index.getResources(word); + uriCache.put(word, uris); + } + if(uris.size() < 10){ + uris.addAll(index.getResources("" + word + "~0.7")); + } + tmp.addAll(uris); + sortedURIs.addAll(tmp); + tmp.clear(); + } } + slot2URI.put(slot, sortedURIs); mon.stop(); logger.info("Done in " + mon.getLastValue() + "ms."); @@ -521,7 +557,7 @@ boolean smallest = true; for(String w2 : words){ if(!w1.equals(w2)){ - if(w2.contains(w1)){ + if(w1.contains(w2)){ smallest = false; break; } @@ -546,7 +582,6 @@ pruned.add(word); } else { String lemWord = lemmatizer.stem(word); - new LingPipeLemmatizer().stem(word); if(!pruned.contains(lemWord)){ pruned.add(lemWord); } @@ -742,7 +777,7 @@ // String question = "Give me all books written by authors influenced by Ernest Hemingway."; // String question = "Give me all cities in Canada."; - String question = "Give me all soccer clubs in Premier League?"; + String question = "Give me all books written by authors influenced by Ernest Hemingway."; SPARQLTemplateBasedLearner learner = new SPARQLTemplateBasedLearner(); SparqlEndpoint endpoint = new SparqlEndpoint(new URL("http://greententacle.techfak.uni-bielefeld.de:5171/sparql"), Collections.<String>singletonList(""), Collections.<String>emptyList()); Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/WordNet.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/WordNet.java 2011-09-27 13:16:47 UTC (rev 3288) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/WordNet.java 2011-09-28 16:02:20 UTC (rev 3289) @@ -34,6 +34,7 @@ try { IndexWord iw = dict.getIndexWord(pos, s);//dict.getMorphologicalProcessor().lookupBaseForm(pos, s) +// IndexWord iw = dict.getMorphologicalProcessor().lookupBaseForm(pos, s); if(iw != null){ Synset[] synsets = iw.getSenses(); Word[] words = synsets[0].getWords(); Added: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/HierarchicalSolrSearch.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/HierarchicalSolrSearch.java (rev 0) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/HierarchicalSolrSearch.java 2011-09-28 16:02:20 UTC (rev 3289) @@ -0,0 +1,36 @@ +package org.dllearner.algorithm.tbsl.search; + +import java.util.ArrayList; +import java.util.List; + +public class HierarchicalSolrSearch extends SolrSearch { + + private SolrSearch primarySearch; + private SolrSearch secondarySearch; + + public HierarchicalSolrSearch(SolrSearch primarySearch, SolrSearch secondarySearch) { + this.primarySearch = primarySearch; + this.secondarySearch = secondarySearch; + } + + @Override + public List<String> getResources(String queryString) { + return getResources(queryString, 10, 0); + } + + @Override + public List<String> getResources(String queryString, int limit) { + return getResources(queryString, limit, 0); + } + + @Override + public List<String> getResources(String queryString, int limit, int offset) { + List<String> resources = new ArrayList<String>(); + resources = primarySearch.getResources(queryString, limit, offset); + if(resources.size() < limit){ + resources.addAll(secondarySearch.getResources(queryString, limit-resources.size(), offset)); + } + return resources; + } + +} Property changes on: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/HierarchicalSolrSearch.java ___________________________________________________________________ Added: svn:mime-type + text/plain Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/Search.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/Search.java 2011-09-27 13:16:47 UTC (rev 3288) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/Search.java 2011-09-28 16:02:20 UTC (rev 3289) @@ -4,7 +4,8 @@ public interface Search { List<String> getResources(String queryString); - List<String> getResources(String queryString, int offset); + List<String> getResources(String queryString, int limit); + List<String> getResources(String queryString, int limit, int offset); int getTotalHits(String queryString); void setHitsPerPage(int hitsPerPage); Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/SolrSearch.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/SolrSearch.java 2011-09-27 13:16:47 UTC (rev 3288) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/search/SolrSearch.java 2011-09-28 16:02:20 UTC (rev 3289) @@ -22,6 +22,12 @@ private int hitsPerPage = 10; private int lastTotalHits = 0; + private String searchField; + + public SolrSearch() { + // TODO Auto-generated constructor stub + } + public SolrSearch(String solrServerURL){ try { server = new CommonsHttpSolrServer(solrServerURL); @@ -30,22 +36,36 @@ e.printStackTrace(); } } + + public SolrSearch(String solrServerURL, String searchField){ + this(solrServerURL); + this.searchField = searchField; + } @Override public List<String> getResources(String queryString) { - return getResources(queryString, 0); + return getResources(queryString, hitsPerPage); } + + @Override + public List<String> getResources(String queryString, int limit) { + return getResources(queryString, limit, 0); + } @Override - public List<String> getResources(String queryString, int offset) { + public List<String> getResources(String queryString, int limit, int offset) { List<String> resources = new ArrayList<String>(); QueryResponse response; try { - ModifiableSolrParams params = new ModifiableSolrParams(); - params.set("q", queryString); - params.set("rows", hitsPerPage); - params.set("start", offset); - response = server.query(params); + SolrQuery q = new SolrQuery((searchField != null) ? searchField + ":" + queryString : queryString); + q.setStart(offset); + q.setRows(limit); + response = server.query(q); +// ModifiableSolrParams params = new ModifiableSolrParams(); +// params.set("q", queryString); +// params.set("rows", hitsPerPage); +// params.set("start", offset); +// response = server.query(params); SolrDocumentList docList = response.getResults(); lastTotalHits = (int) docList.getNumFound(); Modified: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java 2011-09-27 13:16:47 UTC (rev 3288) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/templator/Templator.java 2011-09-28 16:02:20 UTC (rev 3289) @@ -16,6 +16,7 @@ import org.dllearner.algorithm.tbsl.ltag.parser.Parser; import org.dllearner.algorithm.tbsl.ltag.parser.Preprocessor; import org.dllearner.algorithm.tbsl.nlp.ApachePartOfSpeechTagger; +import org.dllearner.algorithm.tbsl.nlp.Lemmatizer; import org.dllearner.algorithm.tbsl.nlp.LingPipeLemmatizer; import org.dllearner.algorithm.tbsl.nlp.PartOfSpeechTagger; import org.dllearner.algorithm.tbsl.nlp.WordNet; @@ -173,8 +174,8 @@ newwords.add(word); newwords.addAll(strings); - newwords.addAll(wordnet.getBestSynonyms(wordnetpos,word)); - for (String att : strings) { + newwords.addAll(wordnet.getBestSynonyms(wordnetpos,getLemmatizedWord(word))); + for (String att : getLemmatizedWords(strings)) { newwords.addAll(wordnet.getBestSynonyms(wordnetpos,att)); } if (newwords.isEmpty()) { @@ -205,6 +206,24 @@ return templates; } + private List<String> getLemmatizedWords(List<String> words){ + List<String> stemmed = new ArrayList<String>(); + for(String word : words){ + //currently only stem single words + if(word.contains(" ")){ + stemmed.add(word); + } else { + stemmed.add(getLemmatizedWord(word)); + } + + } + return stemmed; + } + + private String getLemmatizedWord(String word){ + return lem.stem(word); + } + private boolean containsModuloRenaming(Set<DRS> drses, DRS drs) { for (DRS d : drses) { Modified: trunk/components-ext/src/main/resources/tbsl/tbsl.properties =================================================================== --- trunk/components-ext/src/main/resources/tbsl/tbsl.properties 2011-09-27 13:16:47 UTC (rev 3288) +++ trunk/components-ext/src/main/resources/tbsl/tbsl.properties 2011-09-28 16:02:20 UTC (rev 3289) @@ -1,7 +1,12 @@ solr.server.url = http://139.18.2.173:8080/apache-solr-3.3.0 solr.classes.url = ${solr.server.url}/dbpedia_classes +solr.classes.searchfield = label solr.resources.url = ${solr.server.url}/dbpedia_resources +solr.resources.searchfield = label solr.properties.url = ${solr.server.url}/dbpedia_properties +solr.properties.searchfield = label +solr.boa.properties.url = ${solr.server.url}/boa_pattern +solr.boa.properties.searchfield = nlr solr.query.limit = 20 sparql.endpoint.url = http://live.dbpedia.org/sparql @@ -11,7 +16,7 @@ !remote | local learning.validationType = remote learning.stopAfterFirstNonEmptyQueryResult = true -learning.maxTestedQueriesPerTemplate = 20 +learning.maxTestedQueriesPerTemplate = 50 !similarity | lucene | none learning.ranking = similarity This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |