From: <jen...@us...> - 2009-04-21 13:06:42
|
Revision: 1721 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=1721&view=rev Author: jenslehmann Date: 2009-04-21 13:06:39 +0000 (Tue, 21 Apr 2009) Log Message: ----------- running matching script (needs a lot more tuning) Modified Paths: -------------- trunk/src/dl-learner/org/dllearner/Info.java trunk/src/dl-learner/org/dllearner/scripts/matching/DBpediaLinkedGeoData.java trunk/src/dl-learner/org/dllearner/scripts/matching/DBpediaPoint.java trunk/src/dl-learner/org/dllearner/scripts/matching/Point.java trunk/src/dl-learner/org/dllearner/test/junit/TestOntologies.java Modified: trunk/src/dl-learner/org/dllearner/Info.java =================================================================== --- trunk/src/dl-learner/org/dllearner/Info.java 2009-04-20 17:32:06 UTC (rev 1720) +++ trunk/src/dl-learner/org/dllearner/Info.java 2009-04-21 13:06:39 UTC (rev 1721) @@ -3,6 +3,6 @@ package org.dllearner; public class Info { - public static final String build = "2009-04-17"; + public static final String build = "2009-04-20"; } \ No newline at end of file Modified: trunk/src/dl-learner/org/dllearner/scripts/matching/DBpediaLinkedGeoData.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/matching/DBpediaLinkedGeoData.java 2009-04-20 17:32:06 UTC (rev 1720) +++ trunk/src/dl-learner/org/dllearner/scripts/matching/DBpediaLinkedGeoData.java 2009-04-21 13:06:39 UTC (rev 1721) @@ -24,13 +24,21 @@ import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.net.URI; +import java.net.URL; +import java.net.URLConnection; +import java.util.Collection; +import java.util.LinkedList; import org.dllearner.kb.sparql.SparqlEndpoint; import org.dllearner.kb.sparql.SparqlQuery; +import org.dllearner.utilities.Files; import com.hp.hpl.jena.query.QuerySolution; import com.hp.hpl.jena.query.ResultSet; +import com.wcohen.ss.Jaro; +import com.wcohen.ss.api.StringDistance; /** * Computes owl:sameAs links between DBpedia and LinkedGeoData @@ -41,16 +49,27 @@ */ public class DBpediaLinkedGeoData { - // chose between nt and csv - private static String dbpediaFileFormat = "csv"; + // chose between nt and dat + private static String dbpediaFileFormat = "dat"; private static File dbpediaFile = new File("log/DBpedia_POIs." + dbpediaFileFormat); private static boolean regenerateFile = false; private static File matchingFile = new File("log/DBpedia_GeoData_Links.nt"); + private static File missesFile = new File("log/DBpedia_GeoData_Misses.dat"); + private static double scoreThreshold = 0.8; + private static StringDistance distance = new Jaro(); private static SparqlEndpoint dbpediaEndpoint = SparqlEndpoint.getEndpointLOCALDBpedia(); private static SparqlEndpoint geoDataEndpoint = SparqlEndpoint.getEndpointLOCALGeoData(); + // read in DBpedia ontology such that we perform taxonomy reasoning +// private static ReasonerComponent reasoner = TestOntologies.getTestOntology(TestOntology.DBPEDIA_OWL); +// private static ClassHierarchy hierarchy = reasoner.getClassHierarchy(); + + // true = SPARQL is used for retrieving close points; + // false = Triplify spatial extension is used + private static boolean useSparqlForGettingNearbyPoints = false; + public static void main(String[] args) throws IOException { // download all objects having geo-coordinates from DBpedia if necessary @@ -58,34 +77,66 @@ createDBpediaFile(); } + Files.clearFile(matchingFile); + Files.clearFile(missesFile); FileOutputStream fos = new FileOutputStream(matchingFile, true); + FileOutputStream fosMiss = new FileOutputStream(missesFile, true); // read file point by point BufferedReader br = new BufferedReader(new FileReader(dbpediaFile)); String line; int counter = 0; int matches = 0; + + // temporary variables needed while reading in file + int itemCount = 0; + URI uri = null; + String label = null; + String[] classes = null; + int decimalCount = 0; + double geoLat = 0; + double geoLong = 0; + while ((line = br.readLine()) != null) { - // read line and convert it into an object - String[] parts = line.split(","); - URI uri = URI.create(parts[0]); - String label = parts[1]; - double geoLat = new Double(parts[2]); - double geoLong = new Double(parts[3]); - DBpediaPoint dp = new DBpediaPoint(uri, label, geoLat, geoLong); - - // find match (we assume there is exactly one match) - URI matchURI = findGeoDataMatch(dp); - if(matchURI != null) { - String matchStr = "<" + uri + "> <http://www.w3.org/2002/07/owl#sameAs> <" + matchURI + "> .\n"; - fos.write(matchStr.getBytes()); - matches++; + if(line.isEmpty()) { + DBpediaPoint dp = new DBpediaPoint(uri, label, classes, geoLat, geoLong, decimalCount); + + // find match (we assume there is exactly one match) + URI matchURI = findGeoDataMatch(dp); + if(matchURI == null) { + String missStr = dp.toString() + "\n"; + fosMiss.write(missStr.getBytes()); + } else { + String matchStr = "<" + dp.getUri() + "> <http://www.w3.org/2002/07/owl#sameAs> <" + matchURI + "> .\n"; + fos.write(matchStr.getBytes()); + matches++; + } + counter++; + + if(counter % 1000 == 0) { + System.out.println(counter + " points processed. " + matches + " matches found."); + } + + itemCount = 0; + } else { + switch(itemCount) { + case 0 : uri = URI.create(line); break; + case 1 : label = line; break; + case 2 : classes = line.substring(1, line.length()).split(","); break; + case 3 : + geoLat = new Double(line); + decimalCount = 0; + String[] tmp = line.split("."); + if(tmp.length == 2) { + decimalCount = tmp[1].length(); + } + break; + case 4: geoLong = new Double(line); + } + + itemCount++; } - counter++; - if(counter % 1000 == 0) { - System.out.println(counter + " points processed. " + matches + " matches found."); - } } br.close(); fos.close(); @@ -99,98 +150,194 @@ int offset = 0; int counter = 0; + int points = 0; FileOutputStream fos = new FileOutputStream(dbpediaFile, true); do { counter = 0; // query DBpedia for all objects having geo-coordinates - String queryStr = "SELECT ?object, ?lat, ?long, ?label WHERE {"; + String queryStr = "SELECT ?object, ?lat, ?long, ?label, ?type WHERE {"; queryStr += "?object <http://www.w3.org/2003/01/geo/wgs84_pos#lat> ?lat ."; queryStr += "?object <http://www.w3.org/2003/01/geo/wgs84_pos#long> ?long ."; - queryStr += "?object rdfs:label ?label . }"; + queryStr += "?object rdfs:label ?label . "; + queryStr += "OPTIONAL { ?object rdf:type ?type . "; + queryStr += "FILTER (!(?type LIKE <http://dbpedia.org/ontology/Resource>)) ."; + queryStr += "FILTER (?type LIKE <http://dbpedia.org/ontology/%>) ."; + queryStr += "} }"; queryStr += "LIMIT " + limit + " OFFSET " + offset; SparqlQuery query = new SparqlQuery(queryStr, dbpediaEndpoint); ResultSet rs = query.send(); + String previousObject = null; + String geoLat = ""; + String geoLong = ""; + String label = ""; + Collection<String> types = new LinkedList<String>(); while(rs.hasNext()) { QuerySolution qs = rs.nextSolution(); String object = qs.get("object").toString(); - String geoLat = qs.getLiteral("lat").getString(); - String geoLong = qs.getLiteral("long").getString(); - String label = qs.getLiteral("label").getString(); - String content = ""; - if(dbpediaFileFormat.equals("nt")) { - content += "<" + object + ">" + " <http://www.w3.org/2000/01/rdf-schema#label> \"" + label + "\" .\n"; - content += "<" + object + ">" + " <http://www.w3.org/2003/01/geo/wgs84_pos#lat> \"" + geoLat + "\"^^<http://www.w3.org/2001/XMLSchema#float> .\n"; - content += "<" + object + ">" + " <http://www.w3.org/2003/01/geo/wgs84_pos#long> \"" + geoLong + "\"^^<http://www.w3.org/2001/XMLSchema#float> .\n"; + if(object.equals(previousObject)) { + // only type has changed compared to previous row + types.add(qs.get("type").toString()); + + // we are only interested in the most special DBpedia class +// NamedClass nc = new NamedClass(typeTmp); +// if(hierarchy.getSubClasses(nc).size()==1) { + // usually there is just one type assigned in the DBpedia ontology +// if(!type.equals("unknown")) { +// throw new Error("two different types for " + object + ": " + type + " and " + typeTmp); +// } +// type = typeTmp; +// } } else { - content += object + ",\"" + label + "\"," + geoLat + "," + geoLong + "\n"; + if(previousObject != null) { + // we have new a new point => write previous point to file + String content = ""; + if(dbpediaFileFormat.equals("nt")) { + content += "<" + previousObject + ">" + " <http://www.w3.org/2000/01/rdf-schema#label> \"" + label + "\" .\n"; + for(String type : types) { + content += "<" + previousObject + ">" + " <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> \"" + type + "\" .\n"; + } + content += "<" + previousObject + ">" + " <http://www.w3.org/2003/01/geo/wgs84_pos#lat> \"" + geoLat + "\"^^<http://www.w3.org/2001/XMLSchema#float> .\n"; + content += "<" + previousObject + ">" + " <http://www.w3.org/2003/01/geo/wgs84_pos#long> \"" + geoLong + "\"^^<http://www.w3.org/2001/XMLSchema#float> .\n"; + } else { + content += previousObject + "\n" + label + "\n" + types.toString().replace(" ", "") + "\n" + geoLat + "\n" + geoLong + "\n\n"; + } + + fos.write(content.getBytes()); + + } + + // reset default values + types.clear(); + + // get new data + geoLat = qs.getLiteral("lat").getString(); + geoLong = qs.getLiteral("long").getString(); + label = qs.getLiteral("label").getString(); + if(qs.contains("type")) { + types.add(qs.get("type").toString()); + + // we are only interested in the most special DBpedia class +// NamedClass nc = new NamedClass(typeTmp); +// if(hierarchy.getSubClasses(nc).size()==1) { + // usually there is just one type assigned in the DBpedia ontology +// if(!type.equals("unknown")) { +// throw new Error("two different types for " + object + ": " + type + " and " + typeTmp); +// } +// type = typeTmp; +// } + } + + previousObject = object; + points++; } - - fos.write(content.getBytes()); - + counter++; } offset += limit; - System.out.println(offset + " points queried."); + System.out.println(points + " points queried."); } while(counter == limit); fos.close(); } - private static URI findGeoDataMatch(DBpediaPoint dbpediaPoint) { + private static URI findGeoDataMatch(DBpediaPoint dbpediaPoint) throws IOException { - // get all GeoData points close to the given point -// SparqlQuery query = new SparqlQuery("", geoDataEndpoint); + // 1 degree is about 111 km (depending on the specific point) + int distanceThresholdMeters = 1000; + boolean quiet = true; - /* - int distanceThresholdMeters = 100; - - // use official DBpedia endpoint (switch to db0 later) - SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); - SPARQLTasks st = new SPARQLTasks(endpoint); - - // query latitude and longitude - String query = "SELECT ?lat ?long WHERE { "; - query += "<" + dbpediaURI + "> <http://www.w3.org/2003/01/geo/wgs84_pos#lat> ?lat ."; - query += "<" + dbpediaURI + "> <http://www.w3.org/2003/01/geo/wgs84_pos#long> ?long . } LIMIT 1"; - - // perform query and read lat and long from results - ResultSet results = st.queryAsResultSet(query); - QuerySolution qs = results.nextSolution(); - String geoLat = qs.getLiteral("lat").getString(); - String geoLong = qs.getLiteral("long").getString(); - - System.out.println("lat: " + geoLat + ", long: " + geoLong); - - URL linkedGeoDataURL = new URL("http://linkedgeodata.org/triplify/near/"+geoLat+","+geoLong+"/"+distanceThresholdMeters); - - // TODO: replace by SPARQL query - - URLConnection conn = linkedGeoDataURL.openConnection(); - BufferedReader rd = new BufferedReader(new InputStreamReader(conn.getInputStream())); - StringBuffer sb = new StringBuffer(); - String line=""; -// int pointID = 0; - while ((line = rd.readLine()) != null) - { - if(line.contains("Auerbach")) { - System.out.println(line); + if(useSparqlForGettingNearbyPoints) { + // TODO: convert from meters to lat/long + double distanceThresholdLat = 0.3; + double distanceThresholdLong = 0.3; + + // create a box around the point + double minLat = dbpediaPoint.getGeoLat() - distanceThresholdLat; + double maxLat = dbpediaPoint.getGeoLat() + distanceThresholdLat; + double minLong = dbpediaPoint.getGeoLong() - distanceThresholdLong; + double maxLong = dbpediaPoint.getGeoLong() + distanceThresholdLong; + + // query all points in the box + String queryStr = "select ?point ?lat ?long ?name where { "; + queryStr += "?point <http://linkedgeodata.org/vocabulary/latitude> ?lat ."; + queryStr += "FILTER (xsd:float(?lat) > " + minLat + ") ."; + queryStr += "FILTER (xsd:float(?lat) < " + maxLat + ") ."; + queryStr += "?point <http://linkedgeodata.org/vocabulary/longitude> ?long ."; + queryStr += "FILTER (xsd:float(?long) > " + minLong + ") ."; + queryStr += "FILTER (xsd:float(?long) < " + maxLong + ") ."; + queryStr += "?point <http://linkedgeodata.org/vocabulary/name> ?name ."; + queryStr += "}"; + + SparqlQuery query = new SparqlQuery(queryStr, geoDataEndpoint); + ResultSet rs = query.send(); + + while(rs.hasNext()) { +// QuerySolution qs = rs.nextSolution(); + + // measure string similarity and proximity + // TODO: incomplete + } + return null; + // use Tripliy spatial extension + } else { + if(dbpediaPoint.getGeoLat() < 0 || dbpediaPoint.getGeoLong() < 0) { + return null; } - sb.append(line); + if(!quiet) + System.out.println(dbpediaPoint.getLabel()); + + URL linkedGeoDataURL = new URL("http://linkedgeodata.org/triplify/near/"+dbpediaPoint.getGeoLat()+","+dbpediaPoint.getGeoLong()+"/"+distanceThresholdMeters); + + double highestScore = 0; + String bestURI = null; + String bestLabel = null; + URLConnection conn = linkedGeoDataURL.openConnection(); + BufferedReader rd = new BufferedReader(new InputStreamReader(conn.getInputStream())); +// StringBuffer sb = new StringBuffer(); + String line=""; + while ((line = rd.readLine()) != null) + { + if(line.contains("<http://linkedgeodata.org/vocabulary#name>") || line.contains("<http://linkedgeodata.org/vocabulary/#name%25en>")) { + int first = line.indexOf("\"") + 1; + int last = line.lastIndexOf("\""); + String label = line.substring(first, last); + + // perform string similarity + // (we can use a variety of string matching heuristics) + double score = distance.score(label, dbpediaPoint.getLabel()); + if(score > highestScore) { + highestScore = score; + bestURI = line.substring(1, line.indexOf(" ")-1); + bestLabel = label; + } + } +// sb.append(line); + } + rd.close(); + + if(!quiet) { + System.out.println(" " + linkedGeoDataURL); + System.out.println(" " + highestScore); + System.out.println(" " + bestURI); + System.out.println(" " + bestLabel); + } + + if(highestScore > scoreThreshold) { +// System.out.println(" match"); + return URI.create(bestURI); + } else { +// System.out.println(" no match"); + return null; + } } - rd.close(); - -// System.out.println(sb.toString()); - - */ - return null; } } Modified: trunk/src/dl-learner/org/dllearner/scripts/matching/DBpediaPoint.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/matching/DBpediaPoint.java 2009-04-20 17:32:06 UTC (rev 1720) +++ trunk/src/dl-learner/org/dllearner/scripts/matching/DBpediaPoint.java 2009-04-21 13:06:39 UTC (rev 1721) @@ -32,11 +32,19 @@ private URI uri; private String label; + + private String[] classes; - public DBpediaPoint(URI uri, String label, double geoLat, double geoLong) { + // decimal count in latitude value => indicator for size of object (no or low + // number of decimals indicates a large object) + private int decimalCount; + + public DBpediaPoint(URI uri, String label, String[] classes, double geoLat, double geoLong, int decimalCount) { super(geoLat,geoLong); this.uri = uri; this.label = label; + this.classes = classes; + this.decimalCount = decimalCount; } /** @@ -53,4 +61,23 @@ return label; } + public String[] getClasses() { + return classes; + } + + /** + * @return the decimalCount + */ + public int getDecimalCount() { + return decimalCount; + } + + @Override + public String toString() { + String str = uri + ", \"" + label + "\", " + geoLat + ", " + geoLong + " (classes: "; + for(String clazz : classes) { + str += clazz + " "; + } + return str + ")"; + } } Modified: trunk/src/dl-learner/org/dllearner/scripts/matching/Point.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/matching/Point.java 2009-04-20 17:32:06 UTC (rev 1720) +++ trunk/src/dl-learner/org/dllearner/scripts/matching/Point.java 2009-04-21 13:06:39 UTC (rev 1721) @@ -27,9 +27,9 @@ */ public class Point { - private double geoLat; + protected double geoLat; - private double geoLong; + protected double geoLong; public Point(double geoLat, double geoLong) { this.geoLat = geoLat; Modified: trunk/src/dl-learner/org/dllearner/test/junit/TestOntologies.java =================================================================== --- trunk/src/dl-learner/org/dllearner/test/junit/TestOntologies.java 2009-04-20 17:32:06 UTC (rev 1720) +++ trunk/src/dl-learner/org/dllearner/test/junit/TestOntologies.java 2009-04-21 13:06:39 UTC (rev 1721) @@ -41,7 +41,7 @@ */ public final class TestOntologies { - public enum TestOntology { EMPTY, SIMPLE, SIMPLE_NO_DR, SIMPLE_NO_DISJOINT, SIMPLE_NO_DR_DISJOINT, SIMPLE2, SIMPLE3, R1SUBR2, DATA1, FIVE_ROLES, FATHER_OE, CARCINOGENESIS, EPC_OE, KRK_ZERO_ONE }; + public enum TestOntology { EMPTY, SIMPLE, SIMPLE_NO_DR, SIMPLE_NO_DISJOINT, SIMPLE_NO_DR_DISJOINT, SIMPLE2, SIMPLE3, R1SUBR2, DATA1, FIVE_ROLES, FATHER_OE, CARCINOGENESIS, EPC_OE, KRK_ZERO_ONE, DBPEDIA_OWL }; public static ReasonerComponent getTestOntology(TestOntology ont) { String kbString = ""; @@ -117,7 +117,9 @@ owlFile = "examples/epc/sap_epc_oe.owl"; } else if(ont.equals(TestOntology.KRK_ZERO_ONE)) { owlFile = "examples/krk/KRK_ZERO_ONE.owl"; - } + } else if(ont.equals(TestOntology.DBPEDIA_OWL)) { + owlFile = "/home/jl/promotion/ontologien/dbpedia.owl"; + } try { ComponentManager cm = ComponentManager.getInstance(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |