From: <lor...@us...> - 2011-03-02 08:45:52
|
Revision: 2694 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2694&view=rev Author: lorenz_b Date: 2011-03-02 08:45:43 +0000 (Wed, 02 Mar 2011) Log Message: ----------- Extended evaluation. Integrated always checking if LGG is solution when new positive example was added. Added substring metric to filter. Modified Paths: -------------- trunk/autosparql/src/main/java/org/dllearner/autosparql/evaluation/EvaluationWithNLQueriesScript.java trunk/autosparql/src/main/java/org/dllearner/autosparql/server/NBR.java trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/QueryTreeFactory.java trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/impl/QueryTreeFactoryImpl.java trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedQueryTreeFilter.java trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedStatementFilter.java Added Paths: ----------- trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/I_Sub.java Modified: trunk/autosparql/src/main/java/org/dllearner/autosparql/evaluation/EvaluationWithNLQueriesScript.java =================================================================== --- trunk/autosparql/src/main/java/org/dllearner/autosparql/evaluation/EvaluationWithNLQueriesScript.java 2011-03-01 17:47:13 UTC (rev 2693) +++ trunk/autosparql/src/main/java/org/dllearner/autosparql/evaluation/EvaluationWithNLQueriesScript.java 2011-03-02 08:45:43 UTC (rev 2694) @@ -79,13 +79,14 @@ private static final String LUCENE_INDEX_DIRECTORY = "/home/jl/hdd/other_large_files/index/"; private static final String WORDNET_DICTIONARY = "src/main/resources/de/simba/ner/dictionary"; private static final SparqlEndpoint ENDPOINT = SparqlEndpoint.getEndpointDBpediaLiveAKSW(); + private static final String ENDPOINT_URL = "http://db0.aksw.org:8999/sparql";//"http://live.dbpedia.org/sparql" private static final int NR_OF_POS_START_EXAMPLES_COUNT = 3; private static final int NR_OF_NEG_START_EXAMPLES_COUNT = 3; private static final int TOP_K = 20; - private static final double SIMILARITY_THRESHOLD = 0.4; + private static final double SIMILARITY_THRESHOLD = 0.5; private Map<String, String> question2query = new Hashtable<String, String>(); @@ -127,7 +128,7 @@ prefixes.put("yago","http://dbpedia.org/class/yago/"); prefixes.put("cyc","http://sw.opencyc.org/concept/"); prefixes.put("foaf","http://xmlns.com/foaf/0.1/"); - exFinder = new ExampleFinder(new SPARQLEndpointEx(new URL("http://live.dbpedia.org/sparql"), //new URL("http://lod.openlinksw.com/sparql"), + exFinder = new ExampleFinder(new SPARQLEndpointEx(new URL(ENDPOINT_URL), //new URL("http://lod.openlinksw.com/sparql"), Collections.singletonList("http://dbpedia.org"), Collections.<String>emptyList(), null, baseURI, prefixes, predicateFilters), selectCache, constructCache); // schemaIndex = new DBpediaSchemaIndex(SCHEMA_FILE_PATH); luceneSearch = new LuceneSearch(LUCENE_INDEX_DIRECTORY); @@ -284,7 +285,15 @@ logger.info("Sending query..."); long startTime = System.currentTimeMillis(); Set<String> resources = new HashSet<String>(); - ResultSet rs = SparqlQuery.convertJSONtoResultSet(selectCache.executeSelectQuery(ENDPOINT, query)); + SparqlEndpoint e = null; + try { + e = new SparqlEndpoint(new URL(ENDPOINT_URL), + Collections.singletonList("http://dbpedia.org"), Collections.<String>emptyList()); + } catch (MalformedURLException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + ResultSet rs = SparqlQuery.convertJSONtoResultSet(selectCache.executeSelectQuery(e, query)); while(rs.hasNext()){ resources.add(rs.nextSolution().get(varName).asResource().getURI()); } @@ -299,7 +308,7 @@ query = "SELECT DISTINCT " + query.substring(7); Set<String> resources = getResourcesBySPARQLQuery(query, "x0"); boolean isSolution = resources.equals(answers); - logger.info(isSolution); + logger.info("LGG is already solution:" + isSolution); return isSolution; } @@ -311,7 +320,7 @@ List<String> relevantWords; int i = 1; int learnedQueries = 0; - for(String question : question2Answers.keySet()){question = "Give me all soccer clubs in the Premier League."; + for(String question : question2Answers.keySet()){//question = "Give me all soccer clubs in the Premier League."; logger.debug(getNewQuestionString(i, question)); try { targetQuery = question2query.get(question); @@ -394,18 +403,18 @@ //start learning miniLogger.info("AutoSPARQL: Started learning..."); - if(LGGIsSolution(posExamples, answers)){ - logger.info("Learned successful."); - miniLogger.info("Learning successful."); - logger.info("Learned SPARQL query:\n" + exFinder.getCurrentQuery()); - miniLogger.info("Learned SPARQL query:\n" + exFinder.getCurrentQuery()); - learnedQueries++; - continue; - } + boolean hasToCheckIfLGGIsSolution = true; Set<String> learnedResources; String oldLearnedQuery = ""; boolean learningFailed = false; + do { + if(hasToCheckIfLGGIsSolution){ + if(LGGIsSolution(posExamples, answers)){ + hasToCheckIfLGGIsSolution = false; + break; + } + } // compute new similiar example logger.info("Computing similar example..."); long startTime = System.currentTimeMillis(); @@ -431,6 +440,7 @@ if (answers.contains(example)) { posExamples.add(example); miniLogger.info("User: YES"); + hasToCheckIfLGGIsSolution = true; } else { negExamples.add(example); miniLogger.info("User: NO"); @@ -438,10 +448,13 @@ miniLogger.info("Learned SPARQL query:\n" + learnedQuery); } while (!answers.equals(learnedResources)); if(!learningFailed){ - logger.info("Learned successfully query for question \""+ question + "\"."); + logger.info("Learned successful."); + logger.info("Learned SPARQL query:\n" + exFinder.getCurrentQuery()); miniLogger.info("Learning successful."); + miniLogger.info("Learned SPARQL query:\n" + exFinder.getCurrentQuery()); learnedQueries++; }else { + logger.info("Could not learn query."); miniLogger.info("AutoSPARQL: Could not learn query."); } } catch (TimeOutException e) { @@ -450,7 +463,7 @@ e.printStackTrace(); } catch (Exception e) { logger.error("Something went wrong. Trying next question...", e); - miniLogger.info("AutoSPARQL: Could not learn query."); + miniLogger.info("AutoSPARQL: Could not learn query.", e); } } logger.info("Learned " + learnedQueries + "/" + question2query.keySet().size() + " queries."); @@ -494,6 +507,7 @@ Logger.getRootLogger().removeAllAppenders(); Layout layout = new PatternLayout("%m%n"); ConsoleAppender appender = new ConsoleAppender(layout); + appender.setThreshold(Level.DEBUG); Logger.getRootLogger().addAppender(appender); FileAppender fileAppender = new FileAppender( layout, "log/evaluation.log", false); Modified: trunk/autosparql/src/main/java/org/dllearner/autosparql/server/NBR.java =================================================================== --- trunk/autosparql/src/main/java/org/dllearner/autosparql/server/NBR.java 2011-03-01 17:47:13 UTC (rev 2693) +++ trunk/autosparql/src/main/java/org/dllearner/autosparql/server/NBR.java 2011-03-02 08:45:43 UTC (rev 2694) @@ -29,6 +29,7 @@ import org.dllearner.sparqlquerygenerator.cache.QueryTreeCache; import org.dllearner.sparqlquerygenerator.datastructures.QueryTree; import org.dllearner.sparqlquerygenerator.datastructures.impl.QueryTreeImpl; +import org.dllearner.sparqlquerygenerator.util.Filters; import org.dllearner.sparqlquerygenerator.util.ModelGenerator; import com.hp.hpl.jena.query.QuerySolution; @@ -62,8 +63,10 @@ private List<QueryTree<N>> negTrees; private List<Integer> determiningNodeIds; + private List<List<QueryTreeChange>> noSequences; + private List<QueryTreeChange> lastSequence; + private int negExamplesCount = -1; - private LastQueryTreeChangeComparator comparator = new LastQueryTreeChangeComparator(); private static final Logger logger = Logger.getLogger(NBR.class); @@ -76,6 +79,8 @@ modelGen = new ModelGenerator(endpoint, new HashSet<String>(((SPARQLEndpointEx)endpoint).getPredicateFilters()), constructCache); modelCache = new ModelCache(modelGen); treeCache = new QueryTreeCache(); + + noSequences = new ArrayList<List<QueryTreeChange>>(); } public void setStatementFilter(Filter<Statement> filter){ @@ -306,6 +311,10 @@ return computeQuestionOptimized(lgg, negTrees, knownResources); } + public String getQuestionBetterPerformance(QueryTree<N> lgg, List<QueryTree<N>> negTrees, List<String> knownResources) throws TimeOutException{ + return computeQuestionBetterPerformance(lgg, negTrees, knownResources); + } + private Example computeQuestion(QueryTree<N> lgg, List<QueryTree<N>> negTrees, List<String> knownResources){ lgg = getFilteredTree(lgg); logger.info(lgg.getStringRepresentation()); @@ -412,6 +421,7 @@ if(isTerminationCriteriaReached()){ throw new TimeOutException(maxExecutionTimeInSeconds); } + fSparql(postLGG, tmp.getChanges()); logger.debug("New resource before binary search: " + newResource); if(!(newResource == null)){ logger.debug("binary search for most specific query returning a resource - start"); @@ -428,6 +438,109 @@ return null; } + private String computeQuestionBetterPerformance(QueryTree<N> lgg, List<QueryTree<N>> negTrees, List<String> knownResources) throws TimeOutException{ + startTime = System.currentTimeMillis(); + this.lgg = lgg; + this.negTrees = negTrees; + if(userAnsweredWithNo()){ + noSequences.add(lastSequence); + } + negExamplesCount = negTrees.size(); + determiningNodeIds = getDeterminingNodeIds(lgg, negTrees); + logger.info("Computing next question..."); + postLGG = getFilteredTree(lgg); + PostLGG<N> postGen = new PostLGG<N>((SPARQLEndpointEx) endpoint); + postGen.simplifyTree(postLGG, negTrees); + logger.info("Post LGG(Tree): \n" + TreeHelper.getAbbreviatedTreeRepresentation( + postLGG, endpoint.getBaseURI(), endpoint.getPrefixes())); + logger.info("Post LGG(Query):\n" + postLGG.toSPARQLQueryString()); + logger.info("Post LGG(#Instances):\n" + getAllResources(postLGG.toSPARQLQueryString()).size()); +// logger.debug("Starting generalisation with tree:\n" + postLGG.getStringRepresentation()); + limit = knownResources.size(); + + List<GeneralisedQueryTree<N>> queue = null; + if(generalizeSortedByNegatives){ + queue = getAllowedGeneralisationsSortedByMatrix(new GeneralisedQueryTree<N>(postLGG), negTrees); + } else { + queue = getAllowedGeneralisationsSorted(new GeneralisedQueryTree<N>(postLGG)); + } + logger.debug(getQueueLogInfo(queue)); + + GeneralisedQueryTree<N> tree1; + QueryTree<N> tree2; + GeneralisedQueryTree<N> tmp; + List<GeneralisedQueryTree<N>> gens; + List<GeneralisedQueryTree<N>> neededGeneralisations; + while(!queue.isEmpty()){ + neededGeneralisations = new ArrayList<GeneralisedQueryTree<N>>(); + logger.debug("Selecting first tree from queue"); + tree1 = queue.remove(0); + tmp = tree1; + + if(logger.isDebugEnabled()){ + logger.debug("Changes: " + tmp.getChanges()); + } + boolean coversNegTree = coversNegativeTree(tmp.getQueryTree(), negTrees); + neededGeneralisations.add(tmp); + logger.debug("covers negative tree: " + coversNegTree); + while(!coversNegTree){ + if(generalizeSortedByNegatives){ + gens = getAllowedGeneralisationsSortedByMatrix(tmp, negTrees); + } else { + gens = getAllowedGeneralisationsSorted(tmp); + } + if(gens.isEmpty()){ + if(logger.isDebugEnabled()){ + logger.debug("Couldn't create a generalisation which covers a negative tree."); + } + break; + } + tmp = gens.remove(0); + neededGeneralisations.add(tmp); + if(logger.isDebugEnabled()){ + logger.debug("Changes: " + tmp.getChanges()); + } + queue.addAll(0, gens); + logger.debug(getQueueLogInfo(queue)); + coversNegTree = coversNegativeTree(tmp.getQueryTree(), negTrees); + if(coversNegTree) { + logger.debug("covers negative tree"); + } + } + + int index = neededGeneralisations.size()-1; + if(coversNegTree){ + tree2 = neededGeneralisations.get(index--).getQueryTree(); + } else { + tree2 = tmp.getQueryTree(); + } + +// QueryTree<N> newTree = getNewResource(tree2, knownResources); + String newResource = getNewResource(tree2, knownResources); + if(isTerminationCriteriaReached()){ + throw new TimeOutException(maxExecutionTimeInSeconds); + } + fSparql(postLGG, tmp.getChanges()); + logger.debug("New resource before binary search: " + newResource); + if(!(newResource == null)){ + logger.debug("binary search for most specific query returning a resource - start"); + newResource = findMostSpecificResourceTree2(neededGeneralisations, knownResources, 0, neededGeneralisations.size()-1); + logger.debug("binary search for most specific query returning a resource - completed"); + // TODO: probably the corresponding tree, which resulted in the resource, should also be returned + return newResource; + } else { + if(logger.isDebugEnabled()){ + logger.debug("Query result contains no new resources. Trying next tree from queue..."); + } + } + } + return null; + } + + private boolean userAnsweredWithNo(){ + return (negExamplesCount != -1) && (negTrees.size() > negExamplesCount); + } + private SortedSet<String> getAllResources(String query){ SortedSet<String> resources = new TreeSet<String>(); query = "SELECT DISTINCT " + query.substring(7) + " LIMIT 1000"; @@ -443,7 +556,7 @@ return resources; } - private QueryTree<N> getQueryTree(String resource){ + private QueryTree<N> getQueryTree(String resource){System.err.println(resource); Model model = modelCache.getModel(resource); QueryTree<String> tree = treeCache.getQueryTree(resource, model); return getFilteredTree((QueryTree<N>) tree); @@ -478,6 +591,36 @@ } } + private String findMostSpecificResourceTree2(List<GeneralisedQueryTree<N>> trees, List<String> knownResources, int low, int high) throws TimeOutException { +// if(low==high) { +// return low; +// } + int testIndex = low + (high-low)/2; + // perform SPARQL query + +// QueryTree<N> t = getNewResource(trees.get(testIndex), knownResources); + String t = null; + try { + t = getNewResource(trees.get(testIndex).getQueryTree(), knownResources); + } catch (HTTPException e) { + throw new TimeOutException(maxExecutionTimeInSeconds); + } + if(isTerminationCriteriaReached()){ + throw new TimeOutException(maxExecutionTimeInSeconds); + } + if(testIndex == high){ + lastSequence = trees.get(testIndex).getChanges(); + return t; + } + if(t == null) { + return findMostSpecificResourceTree2(trees,knownResources,testIndex+1,high); + } else { + return findMostSpecificResourceTree2(trees,knownResources,low,testIndex); + } + } + + + // private Queue<QueryTree<N>> gen(QueryTree<N> tree){ // Queue<QueryTree<N>> gens = new LinkedList<QueryTree<N>>(); // @@ -809,6 +952,7 @@ foundResources.removeAll(knownResources); for(String resource : foundResources){ newTree = getQueryTree(resource); + System.out.println(TreeHelper.getAbbreviatedTreeRepresentation(newTree, endpoint.getBaseURI(), endpoint.getPrefixes())); if(!newTree.isSubsumedBy(lgg)){ return resource; } @@ -1053,5 +1197,123 @@ result = totalTimeNeeded >= maxMilliSeconds; return result; } + + private String fSparql(QueryTree<N> tree, List<QueryTreeChange> changes){ + QueryTree<N> copy = new QueryTreeImpl<N>(tree); + StringBuilder query = new StringBuilder(); + StringBuilder triples = new StringBuilder(); + List<String> filters = new ArrayList<String>(); + query.append("SELECT DISTINCT ?x0 WHERE{\n"); +// buildSPARQLQueryString(copy, triples); + buildSPARQLQueryString(copy, changes, triples, filters); + query.append(triples.toString()); + if(filters.size() > 0){ + query.append("FILTER("); + for(int i = 0; i < filters.size()-1; i++){ + query.append("(").append(filters.get(i)).append(") || "); + } + query.append("(").append(filters.get(filters.size()-1)).append(")"); + query.append(")\n"); + } + query.append("}"); + query.append(" LIMIT ").append(limit); + System.err.println("fSparql: \n" + query.toString()); + return query.toString(); + + } + + private void buildSPARQLQueryString(QueryTree<N> tree, StringBuilder triples){ + Object subject = null; + if(tree.getUserObject().equals("?")){ + subject = "?x" + tree.getId(); + } else { + subject = "<" + tree.getUserObject() + ">"; + } + Object predicate; + Object object; + if(!tree.isLeaf()){ + for(QueryTree<N> child : tree.getChildren()){ + predicate = tree.getEdge(child); + object = child.getUserObject(); + boolean objectIsResource = !object.equals("?"); + if(!objectIsResource){ + object = "?x" + child.getId(); + } else if(((String)object).startsWith("http://")){ + object = "<" + object + ">"; + } + triples.append(subject).append(" <").append(predicate).append("> ").append(object).append(".\n"); + if(!objectIsResource){ + buildSPARQLQueryString(child, triples); + } + } + } + } + + private void buildSPARQLQueryString(QueryTree<N> tree, List<QueryTreeChange> changes, StringBuilder triples, List<String> filters){ + Object subject = null; + if(tree.getUserObject().equals("?")){ + subject = "?x" + tree.getId(); + } else { + subject = "<" + tree.getUserObject() + ">"; + } + Object predicate; + Object object; + if(!tree.isLeaf()){ + for(QueryTree<N> child : tree.getChildren()){ + predicate = tree.getEdge(child); + object = child.getUserObject(); + boolean objectIsResource = !object.equals("?"); + boolean addFilter = false; + boolean removed = false; + String uri = null; + if(!objectIsResource){ + object = "?x" + child.getId(); + } else if(((String)object).startsWith("http://")){ + QueryTreeChange c = getChange(changes, child.getId()); + if(c != null){ + if(c.getType() == ChangeType.REPLACE_LABEL){ + uri = (String) object; + child.setUserObject((N)"?"); + object = "?x" + child.getId(); + addFilter = true; + } else { + removed = true; + triples.append("OPTIONAL{").append(subject). + append(" <").append(predicate).append("> ").append("?x").append(child.getId()).append("}\n"); + filters.add("!BOUND(?x" + child.getId() + ")"); + child.getParent().removeChild((QueryTreeImpl<N>) child); + } + + } else { + object = "<" + object + ">"; + } + + } + if(!removed){ + triples.append(subject).append(" <").append(predicate).append("> ").append(object).append(".\n"); + } + if(addFilter){ + filters.add("?x" + child.getId() + "!=<" + uri + ">"); + } + if(!objectIsResource){ + buildSPARQLQueryString(child, changes, triples, filters); + } + } + } + } + + private QueryTreeChange getChange(List<QueryTreeChange> changes, int nodeId){ + QueryTreeChange change = null; + for(QueryTreeChange c : changes){ + if(c.getNodeId() == nodeId){ + if(c.getType() == ChangeType.REMOVE_NODE){ + return c; + } else { + change = c; + } + } + } + return change; + } } Modified: trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/QueryTreeFactory.java =================================================================== --- trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/QueryTreeFactory.java 2011-03-01 17:47:13 UTC (rev 2693) +++ trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/QueryTreeFactory.java 2011-03-02 08:45:43 UTC (rev 2694) @@ -36,6 +36,8 @@ QueryTreeImpl<N> getQueryTree(String example, Model model); + QueryTreeImpl<N> getQueryTree(String example, Model model, int maxEdges); + QueryTreeImpl<N> getQueryTree(Resource example, Model model); QueryTreeImpl<N> getQueryTree(String example); Modified: trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/impl/QueryTreeFactoryImpl.java =================================================================== --- trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/impl/QueryTreeFactoryImpl.java 2011-03-01 17:47:13 UTC (rev 2693) +++ trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/impl/QueryTreeFactoryImpl.java 2011-03-02 08:45:43 UTC (rev 2694) @@ -32,6 +32,7 @@ import org.dllearner.sparqlquerygenerator.datastructures.impl.QueryTreeImpl; import org.dllearner.sparqlquerygenerator.util.Filter; import org.dllearner.sparqlquerygenerator.util.Filters; +import org.dllearner.sparqlquerygenerator.util.QuestionBasedStatementFilter; import org.dllearner.sparqlquerygenerator.util.ZeroFilter; import com.hp.hpl.jena.rdf.model.Literal; @@ -92,6 +93,15 @@ return createTreeOptimized(model.getResource(example), model); } } + + @Override + public QueryTreeImpl<String> getQueryTree(String example, Model model, int maxEdges) { + if(keepFilter == null){ + return createTree(model.getResource(example), model); + } else { + return createTreeOptimized(model.getResource(example), model, maxEdges); + } + } @Override public QueryTreeImpl<String> getQueryTree(Resource example, Model model) { @@ -103,6 +113,46 @@ return new QueryTreeImpl<String>(example); } + private QueryTreeImpl<String> createTreeOptimized(Resource s, Model model, int maxEdges){ + nodeId = 0; + SortedMap<String, SortedSet<Statement>> resource2Statements = new TreeMap<String, SortedSet<Statement>>(); + + fillMap(s, model, resource2Statements); + + QuestionBasedStatementFilter filter = (QuestionBasedStatementFilter)keepFilter; + Set<Statement> statements; + int diff = valueCount(resource2Statements) - maxEdges; + main:while(diff > 0){ + double oldThreshold = filter.getThreshold(); + statements = filter.getStatementsBelowThreshold(oldThreshold+0.1); + for(SortedSet<Statement> set : resource2Statements.values()){ + for(Statement st : statements){ + if(set.remove(st)){ + diff--; + if(diff == 0){ + break main; + } + } + } + } + } + + + QueryTreeImpl<String> tree = new QueryTreeImpl<String>(s.toString()); + fillTree(tree, resource2Statements); + + tree.setUserObject("?"); + return tree; + } + + private int valueCount(SortedMap<String, SortedSet<Statement>> map){ + int cnt = 0; + for(SortedSet<Statement> statements : map.values()){ + cnt += statements.size(); + } + return cnt; + } + private QueryTreeImpl<String> createTreeOptimized(Resource s, Model model){ nodeId = 0; SortedMap<String, SortedSet<Statement>> resource2Statements = new TreeMap<String, SortedSet<Statement>>(); Added: trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/I_Sub.java =================================================================== --- trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/I_Sub.java (rev 0) +++ trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/I_Sub.java 2011-03-02 08:45:43 UTC (rev 2694) @@ -0,0 +1,179 @@ +/* Copyright 2004-2011 by the National and Technical University of Athens + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.dllearner.sparqlquerygenerator.util; + +/** + * @author Giorgos Stoilos (gs...@im...) + * + * This class implements the string matching method proposed in the paper + * "A String Metric For Ontology Alignment", published in ISWC 2005 + * + */ +public class I_Sub { + + /** + * @param s1 input string 1 + * @param s2 input string 2 + * @param normaliseStrings a boolean value that specifies whether the two strings are to be normalised by + * a custom normalisation algorithm that basically removes punctuation symbols and converts both input strings + * to lower-case. Note that without normalisation the method is case sensitive. + * + * @return degree of similarity between the two strings. + */ + public double score(String s1, String s2, boolean normaliseStrings) { + + if( (s1 == null) || (s2 == null) ) + return -1; + + String inputStr1 = s1; + String inputStr2 = s2; + + if( normaliseStrings ){ + s1 = s1.toLowerCase(); + s2 = s2.toLowerCase(); + + s1 = normalizeString( s1 , '.' ); + s2 = normalizeString( s2 , '.' ); + s1 = normalizeString( s1 , '_' ); + s2 = normalizeString( s2 , '_' ); + s1 = normalizeString( s1 , ' ' ); + s2 = normalizeString( s2 , ' ' ); + } + + int l1 = s1.length(); // length of s + int l2 = s2.length(); // length of t + + int L1 = l1; + int L2 = l2; + + if ((L1 == 0) && (L2 == 0)) + return 1; + if ((L1 == 0) || (L2 == 0)) + return -1; + + double common = 0; + int best = 2; + + while( s1.length() > 0 && s2.length() > 0 && best != 0 ) { + best = 0; // the best subs length so far + + l1 = s1.length(); // length of s + l2 = s2.length(); // length of t + + int i = 0; // iterates through s1 + int j = 0; // iterates through s2 + + int startS2 = 0; + int endS2 = 0; + int startS1 = 0; + int endS1 = 0; + int p=0; + + for( i = 0; (i < l1) && (l1 - i > best); i++) { + j = 0; + while (l2 - j > best) { + int k = i; + for(;(j < l2) && (s1.charAt(k) != s2.charAt(j)); j++); + + if (j != l2) { // we have found a starting point + p = j; + for (j++, k++; + (j < l2) && (k < l1) && (s1.charAt(k) == s2.charAt(j)); + j++, k++); + if( k-i > best){ + best = k-i; + startS1 = i; + endS1 = k; + startS2 = p; + endS2 = j; + } + } + } + } + char[] newString = new char[ s1.length() - (endS1 - startS1) ]; + + j=0; + for( i=0 ;i<s1.length() ; i++ ) { + if( i>=startS1 && i< endS1 ) + continue; + newString[j++] = s1.charAt( i ); + } + + s1 = new String( newString ); + + newString = new char[ s2.length() - ( endS2 - startS2 ) ]; + j=0; + for( i=0 ;i<s2.length() ; i++ ) { + if( i>=startS2 && i< endS2 ) + continue; + newString[j++] = s2.charAt( i ); + } + s2 = new String( newString ); + + if( best > 2 ) + common += best; + else + best = 0; + } + + double commonality = 0; + double scaledCommon = (double)(2*common)/(L1+L2); + commonality = scaledCommon; + + double winklerImprovement = winklerImprovement(inputStr1, inputStr2, commonality); + double dissimilarity = 0; + + double rest1 = L1 - common; + double rest2 = L2 - common; + + double unmatchedS1 = Math.max( rest1 , 0 ); + double unmatchedS2 = Math.max( rest2 , 0 ); + unmatchedS1 = rest1/L1; + unmatchedS2 = rest2/L2; + + /** Hamacher Product */ + double suma = unmatchedS1 + unmatchedS2; + double product = unmatchedS1 * unmatchedS2; + double p = 0.6; //For 1 it coincides with the algebraic product + if( (suma-product) == 0 ) + dissimilarity = 0; + else + dissimilarity = (product)/(p+(1-p)*(suma-product)); + + return commonality - dissimilarity + winklerImprovement; + } + + private double winklerImprovement(String s1, String s2, double commonality) { + + int i; + int n = Math.min( s1.length() , s2.length() ); + for( i=0 ; i<n ; i++ ) + if( s1.charAt( i ) != s2.charAt( i ) ) + break; + + return Math.min(4, i)*0.1*(1-commonality); + } + + public String normalizeString(String str, char remo) { + + StringBuffer strBuf = new StringBuffer(); + for( int i=0 ; i<str.length() ; i++ ){ + if( str.charAt( i ) != remo ) + strBuf.append( str.charAt( i ) ); + } + return strBuf.toString(); + } +} \ No newline at end of file Modified: trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedQueryTreeFilter.java =================================================================== --- trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedQueryTreeFilter.java 2011-03-01 17:47:13 UTC (rev 2693) +++ trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedQueryTreeFilter.java 2011-03-02 08:45:43 UTC (rev 2694) @@ -70,6 +70,9 @@ } private boolean areSimiliar(String s1, String s2){//cnt++;System.out.println(cnt); + if(s1.toLowerCase().contains(s2.toLowerCase()) || s2.toLowerCase().contains(s1.toLowerCase())){ + return true; + } float qSim = qGramMetric.getSimilarity(s1, s2); float lSim = levensteinMetric.getSimilarity(s1, s2); // float jSim = jaroWinklerMetric.getSimilarity(s1, s2); Modified: trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedStatementFilter.java =================================================================== --- trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedStatementFilter.java 2011-03-01 17:47:13 UTC (rev 2693) +++ trunk/sparql-query-generator/src/main/java/org/dllearner/sparqlquerygenerator/util/QuestionBasedStatementFilter.java 2011-03-02 08:45:43 UTC (rev 2694) @@ -1,5 +1,9 @@ package org.dllearner.sparqlquerygenerator.util; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; @@ -17,9 +21,12 @@ private AbstractStringMetric qGramMetric; private AbstractStringMetric levensteinMetric; private AbstractStringMetric jaroWinklerMetric; + private I_Sub substringMetric; private double threshold = 0.4; + private Map<Statement, Double> statement2Similarity = new HashMap<Statement, Double>(); + int cnt = 0; public QuestionBasedStatementFilter(Set<String> questionWords){ @@ -27,26 +34,43 @@ qGramMetric = new QGramsDistance(); levensteinMetric = new Levenshtein(); jaroWinklerMetric = new JaroWinkler(); + substringMetric = new I_Sub(); } - private boolean isSimiliar2QuestionWord(String s){ + private boolean isSimiliar2QuestionWord(String s, Statement st){ for(String word : questionWords){ - if(areSimiliar(word, s)){ + if(areSimiliar(word, s, st)){ return true; } } return false; } - private boolean areSimiliar(String s1, String s2){//cnt++;System.out.println(cnt); + private boolean areSimiliar(String s1, String s2, Statement st){//cnt++;System.out.println(cnt); float qSim = qGramMetric.getSimilarity(s1, s2); float lSim = levensteinMetric.getSimilarity(s1, s2); // float jSim = jaroWinklerMetric.getSimilarity(s1, s2); + double subSim = substringMetric.score(s1, s2, true); float sim = Math.max(qSim, lSim); + sim = Math.max(sim, Double.valueOf(subSim).floatValue()); + // sim = Math.max(sim, jSim); - return sim >= threshold; + if(sim >= threshold){ + statement2Similarity.put(st, Double.valueOf(sim)); + return true; + } + return false; } + + private String getFragment(String uri){ + int i = uri.lastIndexOf("#"); + if(i > 0){ + return uri.substring(i+1); + } else { + return uri.substring(uri.lastIndexOf("/")+1); + } + } @Override public boolean accept(Statement s) { @@ -54,11 +78,11 @@ String object = null; if(s.getObject().isURIResource()){ object = s.getObject().asResource().getURI(); - object = object.substring(object.lastIndexOf("/")+1); + object = getFragment(s.getObject().asResource().getURI()); } else if(s.getObject().isLiteral()){ object = s.getObject().asLiteral().getLexicalForm(); } - if(isSimiliar2QuestionWord(object) || isSimiliar2QuestionWord(predicate)){ + if(isSimiliar2QuestionWord(object, s) || isSimiliar2QuestionWord(predicate, s)){ return true; } @@ -68,5 +92,19 @@ public void setThreshold(double threshold){ this.threshold = threshold; } + + public double getThreshold(){ + return threshold; + } + + public Set<Statement> getStatementsBelowThreshold(double threshold){ + Set<Statement> statements = new HashSet<Statement>(); + for(Entry<Statement, Double> entry : statement2Similarity.entrySet()){ + if(entry.getValue().doubleValue() < threshold){ + statements.add(entry.getKey()); + } + } + return statements; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |