From: <lor...@us...> - 2013-12-09 14:36:41
Revision: 4194
          http://sourceforge.net/p/dl-learner/code/4194
Author:   lorenz_b
Date:     2013-12-09 14:36:38 +0000 (Mon, 09 Dec 2013)
Log Message:
-----------
Added DBpedia experiment.

Modified Paths:
--------------
    trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java

Added Paths:
-----------
    trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaCorpusGenerator.java
    trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java
    trunk/components-core/src/test/java/org/dllearner/algorithms/isle/KnowledgebaseSampleGenerator.java

Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaCorpusGenerator.java
===================================================================
--- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaCorpusGenerator.java      (rev 0)
+++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaCorpusGenerator.java      2013-12-09 14:36:38 UTC (rev 4194)
@@ -0,0 +1,169 @@
+/**
+ *
+ */
+package org.dllearner.algorithms.isle;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.sql.SQLException;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import org.aksw.jena_sparql_api.cache.core.QueryExecutionFactoryCacheEx;
+import org.aksw.jena_sparql_api.cache.extra.CacheCoreEx;
+import org.aksw.jena_sparql_api.cache.extra.CacheCoreH2;
+import org.aksw.jena_sparql_api.cache.extra.CacheEx;
+import org.aksw.jena_sparql_api.cache.extra.CacheExImpl;
+import org.aksw.jena_sparql_api.core.QueryExecutionFactory;
+import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.dllearner.core.owl.NamedClass;
+import org.dllearner.kb.sparql.SparqlEndpoint;
+import org.semanticweb.owlapi.apibinding.OWLManager;
+import org.semanticweb.owlapi.model.OWLClass;
+import org.semanticweb.owlapi.model.OWLOntology;
+import org.semanticweb.owlapi.model.OWLOntologyCreationException;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import com.hp.hpl.jena.query.QueryExecution;
+import com.hp.hpl.jena.query.QuerySolution;
+import com.hp.hpl.jena.query.ResultSet;
+
+/**
+ * This class loads a set of English labels for a given number of instances for each class in the DBpedia ontology.
+ * @author Lorenz Buehmann
+ *
+ */
+public class DBpediaCorpusGenerator {
+
+    /**
+     * Loads DBpedia ontology from remote URL.
+     */
+    private static OWLOntology loadDBpediaOntology() {
+        try {
+            URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2");
+            InputStream is = new BufferedInputStream(url.openStream());
+            CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is);
+            OWLOntology ontology = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in);
+            return ontology;
+        } catch (MalformedURLException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (CompressorException e) {
+            e.printStackTrace();
+        } catch (OWLOntologyCreationException e) {
+            e.printStackTrace();
+        }
+        return null;
+    }
+
+    public static Set<String> getDBpediaCorpusSample(String textProperty, int maxNrOfInstancesPerClass) {
+        Set<String> documents = new HashSet<>();
+
+        SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia();
+        String cacheDirectory = "cache";
+        File corpusDirectory = new File("tmp/dbpedia-corpus");
+        corpusDirectory.mkdirs();
+
+        QueryExecutionFactory qef = new QueryExecutionFactoryHttp(endpoint.getURL().toString(), endpoint.getDefaultGraphURIs());
+        try {
+            long timeToLive = TimeUnit.DAYS.toMillis(30);
+            CacheCoreEx cacheBackend = CacheCoreH2.create(cacheDirectory, timeToLive, true);
+            CacheEx cacheFrontend = new CacheExImpl(cacheBackend);
+            qef = new QueryExecutionFactoryCacheEx(qef, cacheFrontend);
+        } catch (ClassNotFoundException e) {
+            e.printStackTrace();
+        } catch (SQLException e) {
+            e.printStackTrace();
+        }
+
+        //load the DBpedia ontology
+        OWLOntology ontology = loadDBpediaOntology();
+
+        //get a random set of instances for each class and their English label
+        for (OWLClass cls : ontology.getClassesInSignature()) {
+            String query = "SELECT ?s ?text WHERE {"
+                    + "?s a <" + cls.toStringID() + ">. "
+                    + "?s <" + textProperty + "> ?text. "
+                    + "FILTER(LANGMATCHES(LANG(?text),'en'))} LIMIT " + maxNrOfInstancesPerClass;
+            QueryExecution qe = qef.createQueryExecution(query);
+            ResultSet rs = qe.execSelect();
+            QuerySolution qs;
+            while (rs.hasNext()) {
+                qs = rs.next();
+
+                String uri = qs.getResource("s").getURI();
+                String text = qs.getLiteral("text").getLexicalForm();
+
+                documents.add(text);
+
+                //save to disk
+                try {
+                    Files.write(text, new File(corpusDirectory, URLEncoder.encode(uri, "UTF-8")), Charsets.UTF_8);
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+        return documents;
+    }
+
+    public static Set<String> getDBpediaCorpusSample(String textProperty, Set<NamedClass> classes, int maxNrOfInstancesPerClass) {
+        Set<String> documents = new HashSet<>();
+
+        SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia();
+        String cacheDirectory = "cache";
+        File corpusDirectory = new File("tmp/dbpedia-corpus");
+        corpusDirectory.mkdirs();
+
+        QueryExecutionFactory qef = new QueryExecutionFactoryHttp(endpoint.getURL().toString(), endpoint.getDefaultGraphURIs());
+        try {
+            long timeToLive = TimeUnit.DAYS.toMillis(30);
+            CacheCoreEx cacheBackend = CacheCoreH2.create(cacheDirectory, timeToLive, true);
+            CacheEx cacheFrontend = new CacheExImpl(cacheBackend);
+            qef = new QueryExecutionFactoryCacheEx(qef, cacheFrontend);
+        } catch (ClassNotFoundException e) {
+            e.printStackTrace();
+        } catch (SQLException e) {
+            e.printStackTrace();
+        }
+
+        //get a random set of instances for each class and their English label
+        for (NamedClass cls : classes) {
+            String query = "SELECT ?s ?text WHERE {"
+                    + "?s a <" + cls.getName() + ">. "
+                    + "?s <" + textProperty + "> ?text. "
" + + "FILTER(LANGMATCHES(LANG(?text),'en'))} LIMIT " + maxNrOfInstancesPerClass; + QueryExecution qe = qef.createQueryExecution(query); + ResultSet rs = qe.execSelect(); + QuerySolution qs; + while(rs.hasNext()){ + qs = rs.next(); + + String uri = qs.getResource("s").getURI(); + String text = qs.getLiteral("text").getLexicalForm(); + + documents.add(text); + + //save to disk + try { + Files.write(text, new File(corpusDirectory, URLEncoder.encode(uri, "UTF-8")), Charsets.UTF_8); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + return documents; + } + +} Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java (rev 0) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2013-12-09 14:36:38 UTC (rev 4194) @@ -0,0 +1,103 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.dllearner.core.owl.NamedClass; +import org.dllearner.kb.sparql.SparqlEndpoint; +import org.semanticweb.owlapi.apibinding.OWLManager; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.OWLOntologyCreationException; +import org.semanticweb.owlapi.model.OWLOntologyManager; + +import com.google.common.collect.Sets; +import com.hp.hpl.jena.rdf.model.Model; + +/** + * @author Lorenz Buehmann + * + */ +public class DBpediaExperiment extends Experiment{ + + final SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); + final int maxNrOfInstancesPerClass = 100; + + + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getOntology() + */ + @Override + protected OWLOntology getOntology() { + //load the DBpedia schema + try { + URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); + InputStream is = new BufferedInputStream(url.openStream()); + CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is); + OWLOntology schema = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); + return schema; + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (OWLOntologyCreationException e) { + e.printStackTrace(); + } + //load some sample data for the machine learning part + Model sample = KnowledgebaseSampleGenerator.createKnowledgebaseSample( + endpoint, + "http://dbpedia.org/ontology/", + Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person")), + maxNrOfInstancesPerClass); + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + sample.write(baos, "TURTLE", null); + OWLOntologyManager man = OWLManager.createOWLOntologyManager(); + OWLOntology ontology = man.loadOntologyFromOntologyDocument(new ByteArrayInputStream(baos.toByteArray())); + return ontology; + } catch (Exception e) { + 
+            e.printStackTrace();
+        }
+
+        return null;
+    }
+
+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.Experiment#getDocuments()
+     */
+    @Override
+    protected Set<String> getDocuments() {
+        Set<String> documents = new HashSet<String>();
+
+        documents.addAll(DBpediaCorpusGenerator.getDBpediaCorpusSample(
+                "http://dbpedia.org/ontology/abstract",
+                Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person")),
+                maxNrOfInstancesPerClass));
+
+        documents.clear();
+        documents.add("Thomas Cruise Mapother IV, widely known as Tom Cruise, is an American film actor and producer. He has been nominated for three Academy Awards and has won three Golden Globe Awards. He started his career at age 19 in the 1981 film Taps. His first leading role was in Risky Business, released in August 1983. Cruise became a full-fledged movie star after starring in Top Gun (1986). He is well known for his role as secret agent Ethan Hunt in the Mission: Impossible film series between 1996 and 2011. Cruise has starred in many Hollywood blockbusters, including Rain Man (1988), A Few Good Men (1992), Jerry Maguire (1996), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Tropic Thunder (2008) and Jack Reacher (2012). As of 2012, Cruise is Hollywood's highest-paid actor. Cruise is known for his Scientologist faith and for his support of the Church of Scientology.");
+
+        return documents;
+    }
+
+    public static void main(String[] args) throws Exception {
+        new DBpediaExperiment().run(new NamedClass("http://dbpedia.org/ontology/Person"));
+    }
+}

Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/KnowledgebaseSampleGenerator.java
===================================================================
--- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/KnowledgebaseSampleGenerator.java      (rev 0)
+++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/KnowledgebaseSampleGenerator.java      2013-12-09 14:36:38 UTC (rev 4194)
@@ -0,0 +1,223 @@
+/**
+ *
+ */
+package org.dllearner.algorithms.isle;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorOutputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.log4j.Logger;
+import org.dllearner.core.owl.Individual;
+import org.dllearner.core.owl.NamedClass;
+import org.dllearner.kb.SparqlEndpointKS;
+import org.dllearner.kb.sparql.ConciseBoundedDescriptionGenerator;
+import org.dllearner.kb.sparql.ConciseBoundedDescriptionGeneratorImpl;
+import org.dllearner.kb.sparql.SparqlEndpoint;
+import org.dllearner.reasoning.SPARQLReasoner;
+
+import com.google.common.base.Charsets;
+import com.google.common.hash.HashCode;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.hp.hpl.jena.rdf.model.Model;
+import com.hp.hpl.jena.rdf.model.ModelFactory;
+
+/**
+ * @author Lorenz Buehmann
+ *
+ */
+public class KnowledgebaseSampleGenerator {
+
+    private static final Logger logger =
+            Logger.getLogger(KnowledgebaseSampleGenerator.class.getName());
+
+    private static String cacheDir = "sparql-cache";
+    private static int maxCBDDepth = 0;
+
+    public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, String namespace, Set<NamedClass> classes, int maxNrOfInstancesPerClass) {
+        Model model = ModelFactory.createDefaultModel();
+
+        //try to load existing sample from file system
+        HashFunction hf = Hashing.md5();
+        HashCode hc = hf.newHasher().putString(endpoint.getURL().toString(), Charsets.UTF_8).
+                putInt(classes.hashCode()).hash();
+        String filename = hc.toString() + "-" + maxNrOfInstancesPerClass + ".ttl.bz2";
+        File file = new File(filename);
+
+        if (!file.exists()) {//if not exists
+            logger.info("Generating sample...");
+            long startTime = System.currentTimeMillis();
+            SPARQLReasoner reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint), cacheDir);
+            ConciseBoundedDescriptionGenerator cbdGen = new ConciseBoundedDescriptionGeneratorImpl(endpoint, cacheDir);
+
+            //get for each class n instances and compute the CBD for each instance
+            for (NamedClass cls : classes) {
+                logger.debug("\t...processing class " + cls + "...");
+                SortedSet<Individual> individuals = reasoner.getIndividuals(cls, maxNrOfInstancesPerClass*2);
+
+                Model cbd;
+                int cnt = 0;
+                for (Individual individual : individuals) {
+                    try {
+                        cbd = cbdGen.getConciseBoundedDescription(individual.getName(), maxCBDDepth);
+                        model.add(cbd);
+                        if (cnt++ == maxNrOfInstancesPerClass) {
+                            break;
+                        }
+                    } catch (Exception e) {
+                        e.printStackTrace();
+                    }
+                }
+            }
+            logger.info("...done in " + (System.currentTimeMillis() - startTime) + "ms");
+            //add schema
+            model.add(reasoner.loadOWLSchema());
+            logger.debug("Writing sample to disk...");
+            startTime = System.currentTimeMillis();
+            try {
+                CompressorOutputStream out = new CompressorStreamFactory()
+                        .createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(file));
+                model.write(out, "TURTLE");
+                out.close();
+            } catch (FileNotFoundException e) {
+                e.printStackTrace();
+            } catch (CompressorException e) {
+                e.printStackTrace();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+            logger.debug("...done in " + (System.currentTimeMillis() - startTime) + "ms");
+        } else {
+            logger.info("Loading sample from disk...");
+            long startTime = System.currentTimeMillis();
+            try {
+                CompressorInputStream in = new CompressorStreamFactory().
+                        createCompressorInputStream(CompressorStreamFactory.BZIP2, new FileInputStream(file));
+                model.read(in, null, "TURTLE");
+                in.close();
+            } catch (FileNotFoundException e) {
+                e.printStackTrace();
+            } catch (CompressorException e) {
+                e.printStackTrace();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+            logger.info("...done in " + (System.currentTimeMillis() - startTime) + "ms");
+        }
+
+        return model;
+    }
+
+    public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, String namespace, int maxNrOfClasses, int maxNrOfInstancesPerClass) {
+        Model model = ModelFactory.createDefaultModel();
+
+        //try to load existing sample from file system
+        HashFunction hf = Hashing.md5();
+        HashCode hc = hf.newHasher().putString(endpoint.getURL().toString(), Charsets.UTF_8).hash();
+        String filename = hc.toString() + ("-" + ((maxNrOfClasses == Integer.MAX_VALUE) ?
"all" : maxNrOfClasses)) + "-" + maxNrOfInstancesPerClass + ".ttl.bz2"; + File file = new File(filename); + + if(!file.exists()){//if not exists + logger.info("Generating sample..."); + long startTime = System.currentTimeMillis(); + SPARQLReasoner reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint), cacheDir); + ConciseBoundedDescriptionGenerator cbdGen = new ConciseBoundedDescriptionGeneratorImpl(endpoint, cacheDir); + + //get all OWL classes + Set<NamedClass> classes = reasoner.getOWLClasses(namespace); + if(maxNrOfClasses != -1 && maxNrOfClasses != Integer.MAX_VALUE){ + List<NamedClass> tmpClasses = new ArrayList<NamedClass>(classes); + Collections.shuffle(tmpClasses); + classes = new HashSet<NamedClass>(tmpClasses.subList(0, Math.min(tmpClasses.size(), maxNrOfClasses))); + } + + //get for each class n instances and compute the CBD for each instance + for (NamedClass cls : classes) { + logger.debug("\t...processing class " + cls + "..."); + SortedSet<Individual> individuals = reasoner.getIndividuals(cls, maxNrOfInstancesPerClass*2); + + Model cbd; + int cnt = 0; + for (Individual individual : individuals) { + try { + cbd = cbdGen.getConciseBoundedDescription(individual.getName(), maxCBDDepth); + model.add(cbd); + if(cnt++ == maxNrOfInstancesPerClass){ + break; + } + } catch (Exception e) { + e.printStackTrace(); + } + } + } + logger.info("...done in " + (System.currentTimeMillis() - startTime) + "ms"); + //add schema + model.add(reasoner.loadOWLSchema()); + logger.debug("Writing sample to disk..."); + startTime = System.currentTimeMillis(); + try { + CompressorOutputStream out = new CompressorStreamFactory() + .createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(file)); + model.write(out,"TURTLE"); + out.close(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + logger.debug("...done in " + (System.currentTimeMillis() - startTime) + "ms"); + } else { + logger.info("Loading sample from disk..."); + long startTime = System.currentTimeMillis(); + try { + CompressorInputStream in = new CompressorStreamFactory(). 
+                        createCompressorInputStream(CompressorStreamFactory.BZIP2, new FileInputStream(file));
+                model.read(in, null, "TURTLE");
+                in.close();
+            } catch (FileNotFoundException e) {
+                e.printStackTrace();
+            } catch (CompressorException e) {
+                e.printStackTrace();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+            logger.info("...done in " + (System.currentTimeMillis() - startTime) + "ms");
+        }
+
+        return model;
+    }
+
+    public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, int maxNrOfClasses, int maxNrOfInstancesPerClass) {
+        return createKnowledgebaseSample(endpoint, null, maxNrOfClasses, maxNrOfInstancesPerClass);
+    }
+
+    public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, Set<NamedClass> classes, int maxNrOfInstancesPerClass) {
+        return createKnowledgebaseSample(endpoint, null, classes, maxNrOfInstancesPerClass);
+    }
+
+    public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, int maxNrOfInstancesPerClass) {
+        return createKnowledgebaseSample(endpoint, Integer.MAX_VALUE, maxNrOfInstancesPerClass);
+    }
+
+    public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, String namespace, int maxNrOfInstancesPerClass) {
+        return createKnowledgebaseSample(endpoint, null, Integer.MAX_VALUE, maxNrOfInstancesPerClass);
+    }
+
+    public static void main(String[] args) throws Exception {
+        Model kb = createKnowledgebaseSample(SparqlEndpoint.getEndpointDBpedia(), "http://dbpedia.org/ontology", 100);
+    }
+}

Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java
===================================================================
--- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java      2013-12-09 14:22:20 UTC (rev 4193)
+++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java      2013-12-09 14:36:38 UTC (rev 4194)
@@ -79,8 +79,8 @@
         } catch (IOException e) {
             e.printStackTrace();
         }
-//        documents.clear();
-//        documents.add("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain");
+        documents.clear();
+        documents.add("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain");

         return documents;
     }
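
For orientation, the snippet below is a minimal usage sketch of how the two new helper classes are meant to be combined, mirroring what DBpediaExperiment does in the diff above. The class name CorpusSampleDemo and the limit of 10 instances per class are illustrative only; the method calls follow the signatures added in this revision.

package org.dllearner.algorithms.isle;

import java.util.Set;

import org.dllearner.core.owl.NamedClass;
import org.dllearner.kb.sparql.SparqlEndpoint;

import com.google.common.collect.Sets;
import com.hp.hpl.jena.rdf.model.Model;

public class CorpusSampleDemo {

    public static void main(String[] args) {
        // Class(es) to sample; dbo:Person is the class used by DBpediaExperiment above.
        Set<NamedClass> classes = Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person"));

        // Fetch up to 10 English abstracts per class; the documents are also written to tmp/dbpedia-corpus.
        Set<String> corpus = DBpediaCorpusGenerator.getDBpediaCorpusSample(
                "http://dbpedia.org/ontology/abstract", classes, 10);
        System.out.println(corpus.size() + " text documents loaded");

        // Build an RDF sample (CBDs of up to 10 instances per class plus the loaded schema);
        // the result is cached on disk as an MD5-named .ttl.bz2 file and reused on subsequent calls.
        Model sample = KnowledgebaseSampleGenerator.createKnowledgebaseSample(
                SparqlEndpoint.getEndpointDBpedia(), "http://dbpedia.org/ontology/", classes, 10);
        System.out.println(sample.size() + " triples in knowledge base sample");
    }
}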