From: <ku...@us...> - 2008-07-31 16:27:45
Revision: 1042
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=1042&view=rev
Author:   kurzum
Date:     2008-07-31 16:27:38 +0000 (Thu, 31 Jul 2008)

Log Message:
-----------
Wikipedia Category cleaner

Modified Paths:
--------------
    trunk/src/dl-learner/org/dllearner/utilities/examples/AutomaticNegativeExampleFinderSPARQL.java
    trunk/src/dl-learner/org/dllearner/utilities/learn/LearnSparql.java

Added Paths:
-----------
    trunk/src/dl-learner/org/dllearner/scripts/WikipediaCategoryCleaner.java
    trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/
    trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSPARQLReEvaluator.java
    trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSelector.java
    trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/WikipediaCategoryTasks.java
    trunk/src/dl-learner/org/dllearner/utilities/learn/LearnSPARQLConfiguration.java
    trunk/src/dl-learner/org/dllearner/utilities/learn/LearnSparqlold.java
    trunk/src/dl-learner/org/dllearner/utilities/learn/SPARQLExtractionEvaluation.java
    trunk/src/dl-learner/org/dllearner/utilities/learn/SPARQLMassLearning.java

Removed Paths:
-------------
    trunk/src/dl-learner/org/dllearner/scripts/SKOS7030.java
    trunk/src/dl-learner/org/dllearner/scripts/SPARQLExtractionEvaluation.java
    trunk/src/dl-learner/org/dllearner/scripts/SPARQLMassLearning.java

Deleted: trunk/src/dl-learner/org/dllearner/scripts/SKOS7030.java
===================================================================
--- trunk/src/dl-learner/org/dllearner/scripts/SKOS7030.java 2008-07-31 16:23:42 UTC (rev 1041)
+++ trunk/src/dl-learner/org/dllearner/scripts/SKOS7030.java 2008-07-31 16:27:38 UTC (rev 1042)
@@ -1,562 +0,0 @@ -/** - * Copyright (C) 2007-2008, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * - */ -package org.dllearner.scripts; - -import java.io.File; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.SortedSet; -import java.util.TreeSet; - -import org.apache.log4j.ConsoleAppender; -import org.apache.log4j.FileAppender; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.SimpleLayout; -import org.dllearner.algorithms.refexamples.ExampleBasedROLComponent; -import org.dllearner.core.ComponentManager; -import org.dllearner.core.EvaluatedDescription; -import org.dllearner.core.KnowledgeSource; -import org.dllearner.core.LearningAlgorithm; -import org.dllearner.core.LearningProblem; -import org.dllearner.core.ReasonerComponent; -import org.dllearner.core.ReasoningService; -import org.dllearner.core.owl.Description; -import org.dllearner.kb.sparql.Cache; -import org.dllearner.kb.sparql.SPARQLTasks; -import org.dllearner.kb.sparql.SparqlEndpoint; -import org.dllearner.kb.sparql.SparqlKnowledgeSource; -import org.dllearner.learningproblems.PosNegDefinitionLP; -import org.dllearner.learningproblems.PosNegLP; -import org.dllearner.reasoning.FastInstanceChecker; -import org.dllearner.utilities.Files; -import org.dllearner.utilities.JamonMonitorLogger; -import org.dllearner.utilities.datastructures.SetManipulation; -import org.dllearner.utilities.examples.AutomaticNegativeExampleFinderSPARQL; -import org.dllearner.utilities.examples.AutomaticPositiveExampleFinderSPARQL; - -public class SKOS7030 { - - public int test = 0; - - private static SPARQLTasks sparqlTasks; - - private static LearningAlgorithm la; - - private static final long wash = 1216800000000L; - - private boolean stable = true; - - // private static long wash = 1216901570168 - - private static Logger logger = Logger.getRootLogger(); - - static boolean local = true; - - static String url = ""; - - // LEARNING - static int recursiondepth = 1; - - static boolean closeAfterRecursion = true; - - static boolean randomizeCache = false; - - static double noise = 15; - - static int maxExecutionTimeInSeconds = 30; - - static int guaranteeXgoodDescriptions = 40; - - // examples - static int sparqlResultSize = 2000; - - static double percentOfSKOSSet = 0.2; - - static double negfactor = 1.0; - - SortedSet<String> posExamples = new TreeSet<String>(); - - SortedSet<String> fullPositiveSet = new TreeSet<String>(); - - SortedSet<String> fullPosSetWithoutPosExamples = new TreeSet<String>(); - - SortedSet<String> negExamples = new TreeSet<String>(); - - /** - * @param args - */ - public static void main(String[] args) { - initLogger(); - logger.info("Start"); - // String resultString=""; - // System.out.println(time()); - // System.out.println(System.currentTimeMillis()); - - - // parameters - - if (local) { - url = "http://139.18.2.37:8890/sparql"; - sparqlTasks = new SPARQLTasks(Cache.getPersistentCache(), - SparqlEndpoint.getEndpointLOCALDBpedia()); - } else { - url = "http://dbpedia.openlinksw.com:8890/sparql"; - sparqlTasks = new SPARQLTasks(Cache.getPersistentCache(), - SparqlEndpoint.getEndpointDBpedia()); - } - - - String st= "http://dbpedia.org/class/yago/Person100007846"; - st = "http://dbpedia.org/class/yago/Leader109623038"; - System.out.println(sparqlTasks.getSuperClasses(st, 2)); - System.out.println(sparqlTasks.getSuperClasses(st, 1)); - System.out.println(sparqlTasks.getSubClasses(st, 0)); - - System.exit(0); - - //System.out.println(sparqlTasks.getDomain( - // "http://dbpedia.org/property/predecessor", 1000)); - - 
String target = "http://dbpedia.org/resource/Category:Prime_Ministers_of_the_United_Kingdom"; - - // String - // award=("http://dbpedia.org/resource/Category:Best_Actor_Academy_Award_winners"); - - SKOS7030 s = new SKOS7030(); - - s.makeExamples(target, percentOfSKOSSet, negfactor, sparqlResultSize); - - // System.exit(0); - List<Description> conceptresults = new ArrayList<Description>(); - List<EvaluatedDescription> conceptresults2 = new ArrayList<EvaluatedDescription>(); - s.learn(); - - recordConceptClasses(); - - System.exit(0); - - // EvaluatedDescription - logger.debug("found nr of concepts: " + conceptresults.size()); - System.out.println(conceptresults); - - int x = 0; - - SortedSet<ResultMostCoveredInRest> res = new TreeSet<ResultMostCoveredInRest>(); - for (Description concept : conceptresults) { - if (x++ == 100) - break; - res.add(s.evaluate(concept, 1000)); - - } - - x = 0; - for (ResultMostCoveredInRest resultMostCoveredInRest : res) { - if (x++ == 10) - break; - System.out.println(resultMostCoveredInRest.concept); - System.out.println(resultMostCoveredInRest.accuracy); - System.out.println(resultMostCoveredInRest.retrievedInstancesSize); - - } - - s.print(res.first().concept, 1000); - - System.out.println("Finished"); - JamonMonitorLogger.printAllSortedByLabel(); - - } - - void print(final Description concept, final int sparqlResultLimit) { - logger.debug("evaluating concept: " + concept); - // SortedSet<String> instances = - // sparqlTasks.retrieveInstancesForConcept(oneConcept.toKBSyntaxString(), - // sparqlResultLimit); - SortedSet<String> instances = sparqlTasks - .retrieveInstancesForClassDescriptionIncludingSubclasses(concept - .toKBSyntaxString(), sparqlResultLimit,1); - - SortedSet<String> coveredInRest = new TreeSet<String>( - fullPosSetWithoutPosExamples); - coveredInRest.retainAll(instances); - - SortedSet<String> coveredTotal = new TreeSet<String>(fullPositiveSet); - coveredTotal.retainAll(instances); - - SortedSet<String> notCoveredInRest = new TreeSet<String>( - fullPosSetWithoutPosExamples); - notCoveredInRest.retainAll(coveredInRest); - System.out.println(notCoveredInRest); - - SortedSet<String> notCoveredTotal = new TreeSet<String>(fullPositiveSet); - notCoveredTotal.retainAll(coveredTotal); - System.out.println(notCoveredTotal); - - } - - ResultMostCoveredInRest evaluate(Description concept, int sparqlResultLimit) { - logger.debug("evaluating concept: " + concept); - // SortedSet<String> instances = - // sparqlTasks.retrieveInstancesForConcept(oneConcept.toKBSyntaxString(), - // sparqlResultLimit); - SortedSet<String> instances = sparqlTasks - .retrieveInstancesForClassDescriptionIncludingSubclasses(concept - .toKBSyntaxString(), sparqlResultLimit, 1 ); - - SortedSet<String> coveredInRest = new TreeSet<String>( - fullPosSetWithoutPosExamples); - coveredInRest.retainAll(instances); - - SortedSet<String> coveredTotal = new TreeSet<String>(fullPositiveSet); - coveredTotal.retainAll(instances); - - SortedSet<String> notCoveredInRest = new TreeSet<String>( - fullPosSetWithoutPosExamples); - notCoveredInRest.retainAll(coveredInRest); - - SortedSet<String> notCoveredTotal = new TreeSet<String>(fullPositiveSet); - notCoveredTotal.retainAll(coveredTotal); - double acc = (double) (coveredInRest.size() / fullPosSetWithoutPosExamples - .size()); - System.out.println("Accuracy: " + acc); - return new ResultMostCoveredInRest(concept, acc, instances.size()); - - } - - private static void initLogger() { - - SimpleLayout layout = new SimpleLayout(); - // create logger (a 
simple logger which outputs - // its messages to the console) - FileAppender fileAppender = null; - try { - fileAppender = new FileAppender(layout, "log/progress/skos" - + time() + ".txt", false); - } catch (Exception e) { - e.printStackTrace(); - } - - ConsoleAppender consoleAppender = new ConsoleAppender(layout); - logger.removeAllAppenders(); - logger.addAppender(consoleAppender); - logger.addAppender(fileAppender); - logger.setLevel(Level.DEBUG); - Logger.getLogger(KnowledgeSource.class).setLevel(Level.WARN); - - } - - /* - * public static SortedSet<String> selectDBpediaConcepts(int number){ - * String query = "SELECT DISTINCT ?concept WHERE { \n" + "[] a ?concept - * .FILTER (regex(str(?concept),'yago'))" + " \n} \n"; //LIMIT "+number+" - * - * String JSON = (c.executeSparqlQuery(new SparqlQuery(query, se))); - * ResultSet rs =SparqlQuery.JSONtoResultSet(JSON); JenaResultSetConvenience - * rsc = new JenaResultSetConvenience(rs); return - * SetManipulation.fuzzyShrink(rsc.getStringListForVariable("concept"),number); } - */ - - public void makeExamples(String SKOSConcept, double percentOfSKOSSet, - double negfactor, int sparqlResultSize) { - - // POSITIVES - AutomaticPositiveExampleFinderSPARQL apos = new AutomaticPositiveExampleFinderSPARQL( - sparqlTasks); - apos.makePositiveExamplesFromSKOSConcept(SKOSConcept); - fullPositiveSet = apos.getPosExamples(); - - // System.exit(0); - - int poslimit = (int) Math.round(percentOfSKOSSet - * fullPositiveSet.size()); - int neglimit = (int) Math.round(poslimit * negfactor); - - posExamples = SetManipulation.fuzzyShrink(fullPositiveSet, poslimit); - - // NEGATIVES - - AutomaticNegativeExampleFinderSPARQL aneg = new AutomaticNegativeExampleFinderSPARQL( - fullPositiveSet, sparqlTasks); - - aneg.makeNegativeExamplesFromParallelClasses(posExamples, - sparqlResultSize); - negExamples = aneg.getNegativeExamples(neglimit, stable); - - logger.debug("POSITIVE EXAMPLES"); - for (String pos : posExamples) { - logger.debug("+" + pos); - } - - logger.debug("NEGATIVE EXAMPLES"); - for (String negs : this.negExamples) { - logger.debug("-" + negs); - } - - fullPosSetWithoutPosExamples = fullPositiveSet; - fullPosSetWithoutPosExamples.removeAll(posExamples); - - logger.debug(fullPositiveSet); - logger.debug(fullPosSetWithoutPosExamples); - } - - public void learn() { - - SortedSet<String> instances = new TreeSet<String>(); - instances.addAll(this.posExamples); - instances.addAll(this.negExamples); - - logger.info("Start Learning with"); - logger.info("positive examples: \t" + posExamples.size()); - logger.info("negative examples: \t" + negExamples.size()); - logger.info("instances \t" + instances.size()); - - ComponentManager cm = ComponentManager.getInstance(); - // LearningAlgorithm la = null; - ReasoningService rs = null; - LearningProblem lp = null; - SparqlKnowledgeSource ks = null; - try { - Set<KnowledgeSource> sources = new HashSet<KnowledgeSource>(); - ks = cm.knowledgeSource(SparqlKnowledgeSource.class); - ReasonerComponent r = new FastInstanceChecker(sources); - rs = new ReasoningService(r); - // System.out.println("satisfy: "+rs.isSatisfiable()); - lp = new PosNegDefinitionLP(rs); - ((PosNegLP) lp).setPositiveExamples(SetManipulation - .stringToInd(this.posExamples)); - ((PosNegLP) lp).setNegativeExamples(SetManipulation - .stringToInd(this.negExamples)); - - la = cm.learningAlgorithm(ExampleBasedROLComponent.class, lp, rs); - - logger.debug("start learning"); - - // KNOWLEDGESOURCE - cm.applyConfigEntry(ks, "instances", instances); - 
cm.applyConfigEntry(ks, "url", url); - cm.applyConfigEntry(ks, "recursionDepth", recursiondepth); - cm.applyConfigEntry(ks, "closeAfterRecursion", closeAfterRecursion); - cm.applyConfigEntry(ks, "predefinedFilter", "YAGO"); - if (local) - cm.applyConfigEntry(ks, "predefinedEndpoint", "LOCALDBPEDIA"); - else { - cm.applyConfigEntry(ks, "predefinedEndpoint", "DBPEDIA"); - } - if (randomizeCache) - cm.applyConfigEntry(ks, "cacheDir", "cache/" - + System.currentTimeMillis() + ""); - else { - cm.applyConfigEntry(ks, "cacheDir", Cache.getDefaultCacheDir()); - } - - // LEARNINGALGORITHM - cm.applyConfigEntry(la, "useAllConstructor", false); - cm.applyConfigEntry(la, "useExistsConstructor", true); - cm.applyConfigEntry(la, "useCardinalityRestrictions", false); - cm.applyConfigEntry(la, "useNegation", false); - cm.applyConfigEntry(la, "minExecutionTimeInSeconds", 0); - cm.applyConfigEntry(la, "maxExecutionTimeInSeconds", - maxExecutionTimeInSeconds); - cm.applyConfigEntry(la, "guaranteeXgoodDescriptions", - guaranteeXgoodDescriptions); - cm.applyConfigEntry(la, "writeSearchTree", false); - cm.applyConfigEntry(la, "searchTreeFile", "log/SKOS.txt"); - cm.applyConfigEntry(la, "replaceSearchTree", true); - cm.applyConfigEntry(la, "noisePercentage", noise); - // cm.applyConfigEntry(la,"guaranteeXgoodDescriptions",999999); - cm.applyConfigEntry(la, "logLevel", "TRACE"); - /* - * if(ignoredConcepts.size()>0) - * cm.applyConfigEntry(la,"ignoredConcepts",ignoredConcepts); - */ - - ks.init(); - sources.add(ks); - r.init(); - lp.init(); - la.init(); - - la.start(); - // Statistics.addTimeCollecting(sc.getTime()); - // Statistics.addTimeLearning(sc.getTime()); - - // return la.getCurrentlyBestDescriptions(); - - } catch (Exception e) { - e.printStackTrace(); - } - // return null; - - } - - // String t="\"http://dbpedia.org/class/yago/Fiction106367107\""; - // t="(\"http://dbpedia.org/class/yago/HeadOfState110164747\" AND - // (\"http://dbpedia.org/class/yago/Negotiator110351874\" AND - // \"http://dbpedia.org/class/yago/Representative110522035\"))"; - // //System.out.println(t); - // //t="\"http://www.w3.org/2004/02/skos/core#subject\""; - // //conceptRewrite(t); - // //getSubClasses(t); - // - // AutomaticExampleFinderSKOSSPARQL ae= new - // AutomaticExampleFinderSKOSSPARQL( se); - // try{ - // System.out.println("oneconcept: "+t); - // SortedSet<String> instances = - // ae.queryConceptAsStringSet(conceptRewrite(t), 200); - // if(instances.size()>=0)System.out.println("size of instances - // "+instances.size()); - // if(instances.size()>=0 && instances.size()<100) - // System.out.println("instances"+instances); - // }catch (Exception e) { - // e.printStackTrace(); - // } - // SortedSet<String> concepts = new TreeSet<String>(); - - // System.out.println(DBpediaSKOS(prim)); - // double acc1=0.0; - // for (int i = 0; i < 5; i++) { - // acc1+=DBpediaSKOS(prim); - // } - // System.out.println("accprim"+(acc1/5)); - // - // double acc2=0.0; - // for (int i = 0; i < 5; i++) { - // acc2+=DBpediaSKOS(award); - // } - // System.out.println("accprim"+(acc2/5)); - - // DBpediaSKOS(concepts.first()); - // DBpediaSKOS(concepts.first()); - // concepts.remove(concepts.first()); - // DBpediaSKOS(concepts.first()); - // DBpediaSKOS(concepts.first()); - // concepts.remove(concepts.first()); - // DBpediaSKOS(concepts.first()); - // DBpediaSKOS(concepts.first()); - // algorithm="refinement"; - // roles(); - - /* - * System.out.println(Level.DEBUG.getClass()); - * System.out.println(Level.toLevel("INFO")); - * 
System.out.println(Level.INFO); - */ - // System.exit(0); - private class ResultCompare implements Comparable<ResultCompare> { - Description concept; - - double accuracy = 0.0; - - int retrievedInstancesSize = 0; - - public int compareTo(ResultCompare o2) { - return 0; - } - - - public boolean equals(ResultCompare o2) { - return this.concept.equals(o2.concept); - } - - public ResultCompare(Description conceptKBSyntax, double accuracy, - int retrievedInstancesSize) { - super(); - this.concept = conceptKBSyntax; - this.accuracy = accuracy; - this.retrievedInstancesSize = retrievedInstancesSize; - } - - } - - private class ResultMostCoveredInRest extends ResultCompare { - - public ResultMostCoveredInRest(Description concept, double accuracy, - int retrievedInstancesSize) { - super(concept, accuracy, retrievedInstancesSize); - - } - - public int compareTo(ResultMostCoveredInRest o2) { - if (this.equals(o2)) - return 0; - - if (this.accuracy > o2.accuracy) { - return 1; - } else if (this.accuracy == o2.accuracy) { - if (this.retrievedInstancesSize < o2.retrievedInstancesSize) - return 1; - else if (this.retrievedInstancesSize > o2.retrievedInstancesSize) { - return -1; - } else - return this.concept.toKBSyntaxString().compareTo( - o2.concept.toKBSyntaxString()); - } else { - return -1; - } - - } - - } - - public static String time() { - return ("" + (System.currentTimeMillis() - wash)).substring(0, 7); - - } - - /** - * - */ - public static void recordConceptClasses() { - StringBuffer result =new StringBuffer(); - StringBuffer result1 =new StringBuffer("\n\n ***********Entity*****\n"); - StringBuffer result2 =new StringBuffer("\n\n ***********OR*****\n"); - int result1count = 1; - int result2count = 1; - List<EvaluatedDescription> conceptresults = la - .getCurrentlyBestEvaluatedDescriptions(5000, .70, true); - - int x = 0; - for (EvaluatedDescription description : conceptresults) { - if (x < 50) { - x++; - result.append(description + "\n"); - } - - if (!description.toString().contains("Entity")) { - result1.append(description + "\n"); - result1count++; - } - if (!description.toString().contains("OR")) { - result2.append(description + "\n"); - result2count++; - } - } - result.append("full size: " + conceptresults.size()); - result.append(result1.toString() + " size: " + result1count + "\n"); - result.append(result2.toString() + " size: " + result2count + "\n"); - - Files.createFile(new File("results/descriptions/concepts" + time() - + ".txt"), result.toString()); - } - -} Deleted: trunk/src/dl-learner/org/dllearner/scripts/SPARQLExtractionEvaluation.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/SPARQLExtractionEvaluation.java 2008-07-31 16:23:42 UTC (rev 1041) +++ trunk/src/dl-learner/org/dllearner/scripts/SPARQLExtractionEvaluation.java 2008-07-31 16:27:38 UTC (rev 1042) @@ -1,251 +0,0 @@ -package org.dllearner.scripts; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.SortedSet; -import java.util.TreeSet; - -import org.apache.log4j.ConsoleAppender; -import org.apache.log4j.FileAppender; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.SimpleLayout; -import org.dllearner.kb.sparql.Cache; -import org.dllearner.kb.sparql.SparqlEndpoint; -import org.dllearner.kb.sparql.SparqlQuery; -import org.dllearner.utilities.learn.LearnSparql; -import org.dllearner.utilities.statistics.SimpleClock; -import 
org.dllearner.utilities.statistics.Statistics; - -import com.hp.hpl.jena.query.ResultSet; - -public class SPARQLExtractionEvaluation { - - static Cache c; - static SparqlEndpoint se; - private static Logger logger = Logger.getRootLogger(); - - //static String standardSettings=""; - //static String algorithm="refexamples"; - - //vars - static boolean useRelated = false; - static boolean useSuperClasses = true; - static boolean useParallelClasses = true; - static int poslimit = 0; - static int neglimit = 0; - static boolean randomizeCache = true; - - /** - * @param args - */ - public static void main(String[] args) { - init(); - System.out.println("Start"); - //logger.setLevel(Level.TRACE); - logger.setLevel(Level.WARN); - //Logger.getLogger(SparqlKnowledgeSource.class).setLevel(Level.WARN); - //Logger.getLogger(KnowledgeSource.class).setLevel(Level.WARN); - //System.out.println(Logger.getLogger(SparqlQuery.class).getLevel()); - SimpleClock sc=new SimpleClock(); - LocalDBpediaEvaluation(); - - sc.printAndSet("Finished"); - - } - - - static void LocalDBpediaEvaluation(){ - boolean local=true; - SimpleClock total =new SimpleClock(); - String url=""; - if(local){ - se = SparqlEndpoint.getEndpointLOCALDBpedia(); - - url = "http://139.18.2.37:8890/sparql"; - - }else{ - se = SparqlEndpoint.getEndpointDBpedia(); - url= "http://dbpedia.openlinksw.com:8890/sparql"; - } - - - SortedSet<String> concepts = new TreeSet<String>(); - SortedSet<String> tmpSet = new TreeSet<String>(); - //System.out.println(selectDBpediaConcepts(10)); - tmpSet=initConcepts(); - int number=tmpSet.size(); - //System.out.println(number); - //concepts.add("\"http://dbpedia.org/class/yago/Flamethrower103356559\""); - for (String string : tmpSet) { - //System.out.println("\""+string+"\","); - concepts.add("\""+string+"\""); - } - - - - SortedSet<String> posExamples = new TreeSet<String>(); - SortedSet<String> negExamples = new TreeSet<String>(); - - for (int a = 0; a < 1; a++) { - - poslimit+=15; - neglimit+=15; - printProgress(0, concepts.size(),0, "beginning",total.getTime()); - - int concount=0; - for (String oneConcept : concepts) { - concount++; - printProgress(concount, concepts.size(),0, oneConcept,total.getTime()); - int recursiondepth=0; - boolean closeAfterRecursion=true; - - System.out.println(oneConcept); - //AutomaticExampleFinderSPARQLold ae= new AutomaticExampleFinderSPARQLold( se); - - //ae.initDBpedia(oneConcept, useRelated, useSuperClasses,useParallelClasses, poslimit, neglimit); - - //posExamples = ae.getPosExamples(); - //negExamples = ae.getNegExamples(); - - for(recursiondepth=0;recursiondepth<4;recursiondepth++) { - - Statistics.setCurrentLabel(recursiondepth+""); - printProgress(concount, concepts.size(),recursiondepth, oneConcept,total.getTime()); - /*if(i==0){;} - else if(closeAfterRecursion) { - closeAfterRecursion=false; - recursiondepth++; - } - else { - closeAfterRecursion=true; - }*/ - - - Statistics.print(number); - - //System.out.println("currently at label "+Statistics.getCurrentLabel()+"||i: "+recursiondepth); - - LearnSparql ls = new LearnSparql(); - TreeSet<String> igno = new TreeSet<String>(); - igno.add(oneConcept.replaceAll("\"", "")); - //igno.add("\""+oneConcept+"\""); - //System.out.println(oneConcept); - - ls.learnDBpedia(posExamples, negExamples, url,igno,recursiondepth, closeAfterRecursion,randomizeCache); - - - } - } - Statistics.print(number); - String pre="log/gnu_"; - int examples=poslimit+neglimit; - String comment1="# "+examples+"examples\n"; - String 
f1=pre+"1avgtrip_"+examples+"example"+concepts.size()+"classes"; - writeToFile(f1, comment1+Statistics.getAVGTriplesForRecursionDepth(number)); - String comment2="# "+examples+"examples\n"; - String f2=pre+"2avgTimeExtraction_"+examples+"example"+concepts.size()+"classes"; - writeToFile(f2, comment2+Statistics.getAVGTimeCollecting(number)); - String comment3="# "+examples+"examples\n"; - String f3=pre+"2avgTimeLearning_"+examples+"example"+concepts.size()+"classes"; - writeToFile(f3, comment3+Statistics.getAVGTimeLearning(number)); - String comment4="# "+examples+"examples\n"; - String f4=pre+"2avgTotalTime_"+examples+"example"+concepts.size()+"classes"; - writeToFile(f4, comment4+Statistics.getAVGtotalTime(number)); - Statistics.reset(); - - }//outer - } - - - - public static void init() { - - SimpleLayout layout = new SimpleLayout(); - // create logger (a simple logger which outputs - // its messages to the console) - FileAppender fileAppender =null; ; - try{ - fileAppender = new FileAppender(layout,"log/sparqleval.txt",false); - }catch (Exception e) {e.printStackTrace();} - - ConsoleAppender consoleAppender = new ConsoleAppender(layout); - logger.removeAllAppenders(); - logger.addAppender(consoleAppender); - logger.addAppender(fileAppender); - - c = new Cache("cachetemp"); - - - } - - //FIXME - public static SortedSet<String> selectDBpediaConcepts(int number){ - String query = "SELECT DISTINCT ?concept WHERE { \n" + - "[] a ?concept .FILTER (regex(str(?concept),'yago'))" + - " \n} LIMIT "+1000+" \n "; // - - String JSON = (c.executeSparqlQuery(new SparqlQuery(query, se))); - ResultSet rs =SparqlQuery.convertJSONtoResultSet(JSON); - if(rs==null); - //JenaResultSetConvenience rsc = new JenaResultSetConvenience(rs); - //return SetManipulation.fuzzyShrink(rsc.getStringListForVariable("concept"),number); - return null; - } - - public static SortedSet<String> initConcepts(){ - SortedSet<String> concepts = new TreeSet<String>(); - concepts.add("http://dbpedia.org/class/yago/AirLane108492546"); - concepts.add("http://dbpedia.org/class/yago/AlphaBlocker102698769"); - concepts.add("http://dbpedia.org/class/yago/Articulation107131854"); - - concepts.add("http://dbpedia.org/class/yago/Ceremony107450842"); - concepts.add("http://dbpedia.org/class/yago/CookingOil107673145"); - concepts.add("http://dbpedia.org/class/yago/Corticosteroid114751417"); - concepts.add("http://dbpedia.org/class/yago/Curlew102033561"); - concepts.add("http://dbpedia.org/class/yago/DataStructure105728493"); - concepts.add("http://dbpedia.org/class/yago/Disappearance100053609"); - concepts.add("http://dbpedia.org/class/yago/Flintstone114871268"); -// concepts.add("http://dbpedia.org/class/yago/Form105930736"); -// concepts.add("http://dbpedia.org/class/yago/Hypochondriac110195487"); -// concepts.add("http://dbpedia.org/class/yago/Industrialist110204177"); -// concepts.add("http://dbpedia.org/class/yago/Lifeboat103662601"); -// concepts.add("http://dbpedia.org/class/yago/Particulate114839439"); -// concepts.add("http://dbpedia.org/class/yago/Patriot110407310"); -// concepts.add("http://dbpedia.org/class/yago/Reservation108587174"); -// concepts.add("http://dbpedia.org/class/yago/Schoolteacher110560352"); -// concepts.add("http://dbpedia.org/class/yago/Singer110599806"); -// concepts.add("http://dbpedia.org/class/yago/SupremeCourt108336188"); - - return concepts; - } - - protected static void writeToFile(String filename, String content) { - // create the file we want to use - File file = new File( filename); - - try { - 
file.createNewFile(); - FileOutputStream fos = new FileOutputStream(filename, false); - // ObjectOutputStream o = new ObjectOutputStream(fos); - fos.write(content.getBytes()); - fos.flush(); - fos.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public static void printProgress(int con, int consize,int recdepth, String conceptname, long needed){ - int ex=poslimit+neglimit; - System.out.println("**********************STAT\n" + - "XXX num ex : "+ex+ " \n" + - "concept : "+con+"/"+consize+ " \n" + - "recursion : "+recdepth+" \n" + - "conceptname : "+conceptname+ "\n" + - "needed total: "+needed); - } - - -} Deleted: trunk/src/dl-learner/org/dllearner/scripts/SPARQLMassLearning.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/SPARQLMassLearning.java 2008-07-31 16:23:42 UTC (rev 1041) +++ trunk/src/dl-learner/org/dllearner/scripts/SPARQLMassLearning.java 2008-07-31 16:27:38 UTC (rev 1042) @@ -1,269 +0,0 @@ -package org.dllearner.scripts; - -import java.net.URLEncoder; -import java.util.SortedSet; -import java.util.TreeSet; - -import org.apache.log4j.ConsoleAppender; -import org.apache.log4j.FileAppender; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.SimpleLayout; -import org.dllearner.kb.sparql.Cache; -import org.dllearner.kb.sparql.SparqlEndpoint; -import org.dllearner.kb.sparql.SparqlKnowledgeSource; -import org.dllearner.kb.sparql.SparqlQuery; -import org.dllearner.utilities.learn.ConfWriter; -import org.dllearner.utilities.learn.LearnSparql; -import org.dllearner.utilities.statistics.SimpleClock; -import org.dllearner.utilities.statistics.Statistics; - -import com.hp.hpl.jena.query.ResultSet; - -public class SPARQLMassLearning { - - - static Cache c; - static SparqlEndpoint se; - private static Logger logger = Logger.getRootLogger(); - - static String standardSettings=""; - static String algorithm="refexamples"; - static String standardSettingsRefexamples = - "refexamples.minExecutionTimeInSeconds = 30;\n" + - "refexamples.maxExecutionTimeInSeconds = 30;\n" + - "//refexamples.guaranteeXgoodDescriptions = 10;\n" + - "refexamples.logLevel=\"TRACE\";\n" + - "refexamples.noisePercentage = 0.10;\n" + - "refexamples.writeSearchTree = false;\n" + - "refexamples.searchTreeFile = \"searchTree.txt\";\n" + - "refexamples.replaceSearchTree = true;\n\n" ; - - static String standardSettingsRefinement = - "refinement.minExecutionTimeInSeconds = 30;\n" + - "refinement.maxExecutionTimeInSeconds = 30;\n" + - "//refinement.guaranteeXgoodDescriptions = 10;\n" + - "refinement.logLevel=\"TRACE\";\n" + - "refinement.writeSearchTree = false;\n" + - "refinement.searchTreeFile = \"searchTree.txt\";\n" + - "refinement.replaceSearchTree = true;\n\n" ; - - - - static String standardDBpedia="" + - "sparql.recursionDepth = 1;\n" + - "sparql.predefinedFilter = \"YAGO\";\n" + - "sparql.predefinedEndpoint = \"DBPEDIA\";\n"; - //"sparql.logLevel = \"INFO\";\n"; - - - //vars - static boolean useRelated = false; - static boolean useSuperClasses = false; - static boolean useParallelClasses = true; - static int poslimit = 10; - static int neglimit = 20; - - /** - * @param args - */ - public static void main(String[] args) { - init(); - //logger.setLevel(Level.TRACE); - Logger.getLogger(SparqlKnowledgeSource.class).setLevel(Level.INFO); - //System.out.println(Logger.getLogger(SparqlQuery.class).getLevel()); - SimpleClock sc=new SimpleClock(); - - 
standardSettings=standardSettingsRefexamples+standardDBpedia; - //standardSettings=standardSettingsRefinement+standardDBpedia; - - DBpedia(); - //algorithm="refinement"; - //roles(); - - /*System.out.println(Level.DEBUG.getClass()); - System.out.println(Level.toLevel("INFO")); - System.out.println(Level.INFO);*/ - //System.exit(0); - - - - sc.printAndSet("Finished"); - - } - - - - - static void roles(){ - - se = SparqlEndpoint.getEndpointDBpedia(); - //se = SparqlEndpoint.EndpointUSCensus(); - SortedSet<String> roles = new TreeSet<String>(); - roles.add("http://dbpedia.org/property/birthPlace"); - //roles.add("http://www.rdfabout.com/rdf/schema/census/landArea"); - standardSettings+=algorithm+".ignoredRoles = {\""+roles.first()+"\"};\n"; - - SortedSet<String> posExamples = new TreeSet<String>(); - SortedSet<String> negExamples = new TreeSet<String>(); - String url = "http://dbpedia.openlinksw.com:8890/sparql"; - //HashMap<String, ResultSet> result = new HashMap<String, ResultSet>(); - //HashMap<String, String> result2 = new HashMap<String, String>(); - //System.out.println(concepts.first()); - //logger.setLevel(Level.TRACE); - //AutomaticExampleFinderRolesSPARQL ae= new AutomaticExampleFinderRolesSPARQL( se); - - //ae.initDomainRange(roles.first(), poslimit, neglimit); - - //posExamples = ae.getPosExamples(); - //negExamples = ae.getNegExamples(); - - System.out.println(posExamples); - System.out.println(negExamples); - //System.exit(0); - String tmp = roles.first().replace("http://dbpedia.org/property/", "").replace("\"",""); - String confname1 = ""; - String confname2 = ""; - try{ - confname1 = URLEncoder.encode(tmp, "UTF-8")+"_domain.conf"; - confname2 = URLEncoder.encode(tmp, "UTF-8")+"_range.conf"; - }catch (Exception e) {e.printStackTrace();} - // - ConfWriter cf=new ConfWriter(); - cf.addToStats("relearned role: "+roles.first()); - - //System.exit(0); - //"relearned concept: "; - cf.writeSPARQL(confname1, negExamples,posExamples, url, new TreeSet<String>(),standardSettings,algorithm); - - cf.writeSPARQL(confname2, posExamples, negExamples, url, new TreeSet<String>(),standardSettings,algorithm); - //new LearnSparql().learn(posExamples, negExamples, "http://dbpedia.openlinksw.com:8890/sparql", new TreeSet<String>()); - - - } - - static void DBpedia(){ - se = SparqlEndpoint.getEndpointLOCALDBpedia(); - //concepts.add("(EXISTS \"monarch\".TOP AND EXISTS \"predecessor\".(\"Knight\" OR \"Secretary\"))"); - - SortedSet<String> concepts = new TreeSet<String>(); - SortedSet<String> tmpSet=selectDBpediaConcepts(20); - System.out.println(concepts.size()); - for (String string : tmpSet) { - concepts.add("\""+string+"\""); - } - concepts.remove(concepts.first()); - concepts.remove(concepts.first()); - concepts.remove(concepts.first()); - concepts.remove(concepts.first()); - concepts.remove(concepts.first()); - concepts.remove(concepts.first()); - concepts.remove(concepts.first()); - //concepts.remove(concepts.first()); - //concepts.add("(\"http://dbpedia.org/class/yago/HeadOfState110164747\" AND (\"http://dbpedia.org/class/yago/Negotiator110351874\" AND \"http://dbpedia.org/class/yago/Representative110522035\"))"); - //concepts.add("\"http://dbpedia.org/class/yago/Person100007846\""); - //concepts.add("\"http://dbpedia.org/class/yago/FieldMarshal110086821\""); - //concepts.add("http://dbpedia.org/resource/Category:Prime_Ministers_of_the_United_Kingdom"); - //concepts.add("http://dbpedia.org/resource/Category:Grammy_Award_winners"); - //concepts.add("EXISTS 
\"http://dbpedia.org/property/grammyawards\".TOP"); - - SortedSet<String> posExamples = new TreeSet<String>(); - SortedSet<String> negExamples = new TreeSet<String>(); - String url = "http://dbpedia.openlinksw.com:8890/sparql"; - url = "http://139.18.2.37:8890/sparql"; - //HashMap<String, ResultSet> result = new HashMap<String, ResultSet>(); - //HashMap<String, String> result2 = new HashMap<String, String>(); - //System.out.println(concepts.first()); - //logger.setLevel(Level.TRACE); - - //String concept=concepts.first(); - //int i=0; - Statistics.setCurrentLabel("0"); - int recursiondepth=0; - boolean closeAfterRecursion=false; - //int numberOfTriples = 0; - for (String oneConcept : concepts) { - //AutomaticExampleFinderSPARQLold ae= new AutomaticExampleFinderSPARQLold( se); - useRelated = true; - useSuperClasses = true; - useParallelClasses = false; - - poslimit=10; - neglimit=10; - //ae.initDBpedia(concept, useRelated, useSuperClasses,useParallelClasses, poslimit, neglimit); - //posExamples = ae.getPosExamples(); - //negExamples = ae.getNegExamples(); - - - /*String tmp = concepts.first().replace("http://dbpedia.org/resource/Category:", "").replace("\"",""); - tmp = tmp.replace("http://dbpedia.org/class/yago/", ""); - tmp = tmp.replace("http://dbpedia.org/property/", ""); - String confname = ""; - try{ - confname = URLEncoder.encode(tmp, "UTF-8")+".conf"; - }catch (Exception e) {e.printStackTrace();}*/ - // - //ConfWriter cf=new ConfWriter(); - //cf.addToStats("relearned concept: "+concepts.first()); - //System.out.println(confname); - LearnSparql ls = new LearnSparql(); - TreeSet<String> igno = new TreeSet<String>(); - System.out.println(oneConcept); - //igno.add(oneConcept.replaceAll("\"", "")); - - ls.learnDBpedia(posExamples, negExamples, url,igno,recursiondepth, closeAfterRecursion,false); - - //System.out.println("AAAAAAAA"); - //System.exit(0); - //"relearned concept: "; - //cf.writeSPARQL(confname, posExamples, negExamples, url, new TreeSet<String>(),standardSettings,algorithm); - // - - } - //Statistics.print(); - } - - - - - - - - - public static void init() { - - SimpleLayout layout = new SimpleLayout(); - // create logger (a simple logger which outputs - // its messages to the console) - FileAppender fileAppender =null; ; - try{ - fileAppender = new FileAppender(layout,"the_log.txt",false); - }catch (Exception e) {e.printStackTrace();} - - ConsoleAppender consoleAppender = new ConsoleAppender(layout); - logger.removeAllAppenders(); - logger.addAppender(consoleAppender); - logger.addAppender(fileAppender); - logger.setLevel(Level.DEBUG); - c = new Cache("cachetemp"); - - - } - - public static SortedSet<String> selectDBpediaConcepts(int number){ - String query = "SELECT DISTINCT ?concept WHERE { \n" + - "[] a ?concept .FILTER (regex(str(?concept),'yago'))" + - " \n} \n"; //LIMIT "+number+" - - - String JSON = (c.executeSparqlQuery(new SparqlQuery(query, se))); - ResultSet rs =SparqlQuery.convertJSONtoResultSet(JSON); - if(rs==null); - //JenaResultSetConvenience rsc = new JenaResultSetConvenience(rs); - //return SetManipulation.fuzzyShrink(rsc.getStringListForVariable("concept"),number); - return null; - } - - - -} Added: trunk/src/dl-learner/org/dllearner/scripts/WikipediaCategoryCleaner.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/WikipediaCategoryCleaner.java (rev 0) +++ trunk/src/dl-learner/org/dllearner/scripts/WikipediaCategoryCleaner.java 2008-07-31 16:27:38 UTC (rev 1042) @@ -0,0 +1,167 @@ +/** + 
* Copyright (C) 2007-2008, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ +package org.dllearner.scripts; + +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.FileAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.SimpleLayout; +import org.dllearner.core.EvaluatedDescription; +import org.dllearner.core.KnowledgeSource; +import org.dllearner.core.owl.Individual; +import org.dllearner.kb.sparql.Cache; +import org.dllearner.kb.sparql.SPARQLTasks; +import org.dllearner.kb.sparql.SparqlEndpoint; +import org.dllearner.kb.sparql.SparqlQuery; +import org.dllearner.scripts.improveWikipedia.ConceptSelector; +import org.dllearner.scripts.improveWikipedia.WikipediaCategoryTasks; +import org.dllearner.utilities.JamonMonitorLogger; +import org.dllearner.utilities.examples.AutomaticNegativeExampleFinderSPARQL; +import org.dllearner.utilities.examples.AutomaticPositiveExampleFinderSPARQL; + +public class WikipediaCategoryCleaner { + + private static SPARQLTasks sparqlTasks; + + private static Cache cache; + + private static Logger logger = Logger.getRootLogger(); + + private static boolean local = true; // localEndpoint switch + + // parameters + public static final int SPARQL_RESULTSET_LIMIT = 1000; + + public static double PERCENT_OF_SKOSSET = 1.0; // the 70/30 strategy was + + // abandoned + + public static double NEGFACTOR = 1.0; // size of randomly choosen negative + + // examples compared to positives + + /** + * @param args + */ + public static void main(String[] args) { + initLogger(); + logger.info("Start"); + + // SETUP cache and sparqltasks + cache = Cache.getPersistentCache(); + + if (local) { + // url = "http://139.18.2.37:8890/sparql"; + sparqlTasks = new SPARQLTasks(cache, SparqlEndpoint + .getEndpointLOCALDBpedia()); + } else { + // url = "http://dbpedia.openlinksw.com:8890/sparql"; + sparqlTasks = new SPARQLTasks(cache, SparqlEndpoint + .getEndpointDBpedia()); + } + + String target = "http://dbpedia.org/resource/Category:Prime_Ministers_of_the_United_Kingdom"; + // target = + // "http://dbpedia.org/resource/Category:Best_Actor_Academy_Award_winners"; + + WikipediaCategoryTasks s = new WikipediaCategoryTasks(sparqlTasks); + // TODO Optimize + s.calculateDefinitelyWrongIndividuals(target, PERCENT_OF_SKOSSET, + NEGFACTOR, SPARQL_RESULTSET_LIMIT); + + logger.info("Found " + s.getDefinitelyWrongIndividuals().size() + + " incorrect individuals"); + logger.debug("incorrect Individuals: " + + s.getDefinitelyWrongIndividuals()); + logger.info("reevaluating " + s.getConceptresults().size() + + " found Concepts"); + logger + .info("END OF PHASE 1 **********************************************"); + + s.reevaluateAndRelearn(); + List<EvaluatedDescription> newEval = 
s.getConceptresults(); + printEvaluatedDescriptionCollection(5, newEval); + + System.out.println("Finished"); + JamonMonitorLogger.printAllSortedByLabel(); + + } + + private static void initLogger() { + + SimpleLayout layout = new SimpleLayout(); + // create logger (a simple logger which outputs + // its messages to the console) + FileAppender fileAppender = null; + try { + fileAppender = new FileAppender(layout, "log/progress/skos" + + ConceptSelector.time() + ".txt", false); + } catch (Exception e) { + e.printStackTrace(); + } + + ConsoleAppender consoleAppender = new ConsoleAppender(layout); + logger.removeAllAppenders(); + logger.addAppender(consoleAppender); + logger.addAppender(fileAppender); + logger.setLevel(Level.DEBUG); + Logger.getLogger(KnowledgeSource.class).setLevel(Level.WARN); + + Logger.getLogger(SparqlQuery.class).setLevel(Level.INFO); + Logger.getLogger(Cache.class).setLevel(Level.INFO); + Logger.getLogger(AutomaticNegativeExampleFinderSPARQL.class).setLevel( + Level.INFO); + Logger.getLogger(AutomaticPositiveExampleFinderSPARQL.class).setLevel( + Level.INFO); + } + + public static void printEvaluatedDescriptionCollection(int howMany, + Collection<EvaluatedDescription> c) { + int x = 0; + Set<Individual> first = null; + Set<Individual> tmp = new HashSet<Individual>(); + for (EvaluatedDescription ed : c) { + if (x == 0) { + first = ed.getNotCoveredPositives(); + } + if (x >= howMany) { + x++; + break; + } + + tmp.addAll(ed.getNotCoveredPositives()); + tmp.removeAll(first); + logger.debug("*************************"); + logger.debug("Concept: " + ed); + logger.debug("accuracy: " + ed.getAccuracy()); + logger.debug("Not Covered compared to First: " + tmp); + logger.debug(ed.getScore()); + tmp.clear(); + + } + } + +} Added: trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSPARQLReEvaluator.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSPARQLReEvaluator.java (rev 0) +++ trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSPARQLReEvaluator.java 2008-07-31 16:27:38 UTC (rev 1042) @@ -0,0 +1,200 @@ +/** + * Copyright (C) 2007-2008, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ +package org.dllearner.scripts.improveWikipedia; + +import java.util.ArrayList; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.log4j.Logger; +import org.dllearner.core.EvaluatedDescription; +import org.dllearner.core.owl.Individual; +import org.dllearner.kb.sparql.SPARQLTasks; +import org.dllearner.utilities.Helper; +import org.dllearner.utilities.owl.EvaluatedDescriptionComparator; + +/** + * @author Sebastian Hellmann + * + * The EvaluatedDescriptions from a fragment are + * validated against the SPARQLendpoint. 
+ * There are different strategies, see the methods; + */ +public class ConceptSPARQLReEvaluator { + + private static Logger logger = Logger + .getLogger(ConceptSPARQLReEvaluator.class); + + List<EvaluatedDescription> descToBeReevaluated; + + SPARQLTasks sparqlTasks; + + int sparqlResultLimit = 1000; + + int depthOfRDFS = 1; + + public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks, + List<EvaluatedDescription> descToBeReevaluated) { + this.descToBeReevaluated = descToBeReevaluated; + this.sparqlTasks = sparqlTasks; + } + + public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks, + List<EvaluatedDescription> descToBeReevaluated, int depthOfRDFS, + int sparqlResultLimit) { + this(sparqlTasks, descToBeReevaluated); + this.depthOfRDFS = depthOfRDFS; + this.sparqlResultLimit = sparqlResultLimit; + } + + + public List<EvaluatedDescription> reevaluateConceptsByDataCoverage( + SortedSet<String> positiveSet, int maxNrOfConcepts) { + List<EvaluatedDescription> tmp = reevaluateConceptsByLowestRecall(positiveSet); + List<EvaluatedDescription> returnSet = new ArrayList<EvaluatedDescription>(); + + while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) { + returnSet.add(tmp.remove(0)); + } + + return returnSet; + } + + /** + * Accuracy is calculated as correct positive classified over (correct + * positive classified + incorrect negative classified) "How many are + * correctly positive classified?" e.g. 50 individuals of a 60-individual + * Category (50/60) + * + * @param positiveSet + * @return + */ + public List<EvaluatedDescription> reevaluateConceptsByDataCoverage( + SortedSet<String> positiveSet) { + + SortedSet<EvaluatedDescription> returnSet = new TreeSet<EvaluatedDescription>( + new EvaluatedDescriptionComparator()); + + SortedSet<String> instances = new TreeSet<String>(); + SortedSet<String> PosAsPos = new TreeSet<String>(); + SortedSet<String> PosAsNeg = new TreeSet<String>(); + + // NegAsPos doesnt exist, because they are supposed to be possible + // candidates + SortedSet<Individual> NegAsPos = new TreeSet<Individual>(); + // NegAsNeg doesnt exist, because all + SortedSet<Individual> NegAsNeg = new TreeSet<Individual>(); + + for (EvaluatedDescription ed : descToBeReevaluated) { + instances = retrieveInstances(ed); + + // PosAsPos + PosAsPos.addAll(positiveSet); + PosAsPos.retainAll(instances); + + // PosAsNeg + PosAsNeg.addAll(positiveSet); + PosAsNeg.removeAll(PosAsPos); + + returnSet.add(new EvaluatedDescription(ed.getDescription(), Helper + .getIndividualSet(PosAsPos), Helper + .getIndividualSet(PosAsNeg), NegAsPos, NegAsNeg)); + + PosAsPos.clear(); + PosAsNeg.clear(); + + } + + return new ArrayList<EvaluatedDescription>(returnSet); + + } + + public List<EvaluatedDescription> reevaluateConceptsByLowestRecall( + SortedSet<String> positiveSet, int maxNrOfConcepts) { + List<EvaluatedDescription> tmp = reevaluateConceptsByLowestRecall(positiveSet); + List<EvaluatedDescription> returnSet = new ArrayList<EvaluatedDescription>(); + + while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) { + returnSet.add(tmp.remove(0)); + } + + return returnSet; + } + + /** + * Accuracy is calculated as correct positive classified over all retrieved + * e.g. 
50 correct out of 400 retrieved (50/400) + * + * @param positiveSet + * @return + */ + public List<EvaluatedDescription> reevaluateConceptsByLowestRecall( + SortedSet<String> positiveSet) { + logger.info("reevaluating by lowest recall " + + descToBeReevaluated.size() + " concepts"); + SortedSet<EvaluatedDescription> returnSet = new TreeSet<EvaluatedDescription>( + new EvaluatedDescriptionComparator()); + + SortedSet<String> instances = new TreeSet<String>(); + + SortedSet<String> PosAsPos = new TreeSet<String>(); + SortedSet<String> PosAsNeg = new TreeSet<String>(); + + SortedSet<Individual> NegAsPos = new TreeSet<Individual>(); + + SortedSet<Individual> NegAsNeg = new TreeSet<Individual>(); + + // elements are immediately removed from the list to save memory + while (!descToBeReevaluated.isEmpty()) { + EvaluatedDescription ed = descToBeReevaluated.remove(0); + + instances = retrieveInstances(ed); + + // PosAsPos + PosAsPos.addAll(positiveSet); + PosAsPos.retainAll(instances); + + // PosAsNeg + PosAsNeg.addAll(instances); + PosAsNeg.removeAll(PosAsPos); + + returnSet.add(new EvaluatedDescription(ed.getDescription(), Helper + .getIndividualSet(PosAsPos), Helper + .getIndividualSet(PosAsNeg), NegAsPos, NegAsNeg)); + + PosAsPos.clear(); + PosAsNeg.clear(); + + } + logger.info("finished reevaluating by lowest recall :" + + returnSet.size() + " concepts"); + return new ArrayList<EvaluatedDescription>(returnSet); + + } + + private SortedSet<String> retrieveInstances(EvaluatedDescription ed) { + String kbsyntax = ed.getDescription().toKBSyntaxString(); + return sparqlTasks + .retrieveInstancesForClassDescriptionIncludingSubclasses( + kbsyntax, sparqlResultLimit, depthOfRDFS); + } + +} Added: trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSelector.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSelector.java (rev 0) +++ trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSelector.java 2008-07-31 16:27:38 UTC (rev 1042) @@ -0,0 +1,130 @@ +/** + * Copyright (C) 2007-2008, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ +package org.dllearner.scripts.improveWikipedia; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import org.dllearner.core.EvaluatedDescription; +import org.dllearner.core.LearningAlgorithm; +import org.dllearner.utilities.Files; + +/** + * This is a simple class, it might be worked into other classes later. 
+ * filters concepts and records some results + * + * @author Sebastian Hellmann + * + */ +public class ConceptSelector { + + private static final long WASH = 1216800000000L; + + List<EvaluatedDescription> concepts; + + public ConceptSelector(LearningAlgorithm la) { + super(); + this.concepts = la.getCurrentlyBestEvaluatedDescriptions(Integer.MAX_VALUE, 0.0, true); + this.recordConceptClasses(); + + } + + + public ConceptSelector(LearningAlgorithm la, int maxNrOfConcepts) { + super(); + this.concepts = la.getCurrentlyBestEvaluatedDescriptions(maxNrOfConcepts); + + } + + public ConceptSelector(LearningAlgorithm la, int maxNrOfConcepts, double acctreshold) { + super(); + this.concepts = la.getCurrentlyBestEvaluatedDescriptions(maxNrOfConcepts, acctreshold, true); + this.recordConceptClasses(); + } + + public List<EvaluatedDescription> getConceptsWithoutOR(){ + return getConceptsNotContainingString("OR"); + } + + public List<EvaluatedDescription> getConceptsNotContainingString(String filterString, int limitSize){ + List<EvaluatedDescription> tmp = getConceptsNotContainingString(filterString); + List<EvaluatedDescription> result = new ArrayList<EvaluatedDescription>(); + + while ((!tmp.isEmpty()) && (result.size() <= limitSize)) { + result.add(tmp.remove(0)); + } + return result; + } + + + public List<EvaluatedDescription> getConceptsNotContainingString(String filterString){ + + List<EvaluatedDescription> result = new ArrayList<EvaluatedDescription>(); + for (EvaluatedDescription description : concepts) { + if (!description.toString().contains(filterString)) { + result.add(description); + } + + } + return result; + } + + + + + public void recordConceptClasses() { + StringBuffer result =new StringBuffer(); + StringBuffer result1 =new StringBuffer("\n\n ***********Entity*****\n"); + StringBuffer result2 =new StringBuffer("\n\n ***********OR*****\n"); + int result1count = 1; + int result2count = 1; + + + int x = 0; + for (EvaluatedDescription description : concepts) { + if (x < 50) { + x++; + result.append(description + "\n"); + } + + if (!description.toString().contains("Entity")) { + result1.append(description + "\n"); + result1count++; + } + if (!description.toString().contains("OR")) { + result2.append(description + "\n"); + result2count++; + } + } + result.append("full size: " + concepts.size()); + result.append(result1.toString() + " size: " + result1count + "\n"); + result.append(result2.toString() + " size: " + result2count + "\n"); + + Files.createFile(new File("results/descriptions/concepts" + time() + + ".txt"), result.toString()); + } + + public static String time() { + return ("" + (System.currentTimeMillis() - WASH)).substring(0, 7); + + } + +} Added: trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/WikipediaCategoryTasks.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/WikipediaCategoryTasks.java (rev 0) +++ trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/WikipediaCategoryTasks.java 2008-07-31 16:27:38 UTC (rev 1042) @@ -0,0 +1,251 @@ +/** + * Copyright (C) 2007-2008, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. 
+ * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ +package org.dllearner.scripts.improveWikipedia; + +import java.util.ArrayList; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.log4j.Logger; +import org.dllearner.core.EvaluatedDescription; +import org.dllearner.core.LearningAlgorithm; +import org.dllearner.kb.sparql.SPARQLTasks; +import org.dllearner.scripts.WikipediaCategoryCleaner; +import org.dllearner.utilities.Helper; +import org.dllearner.utilities.datastructures.SetManipulation; +import org.dllearner.utilities.examples.AutomaticNegativeExampleFinderSPARQL; +import org.dllearner.utilities.examples.AutomaticPositiveExampleFinderSPARQL; +import org.dllearner.utilities.learn.LearnSPARQLConfiguration; +import org.dllearner.utilities.learn.LearnSparql; + +public class WikipediaCategoryTasks { + + private static Logger logger = Logger + .getLogger(WikipediaCategoryTasks.class); + + private static final boolean STABLE = true; // used for developing, same + + // negExamples not random + + private static final int MAXIMUM_NUMBER_OF_CONCEPTS_KEPT = Integer.MAX_VALUE; + + private static final double ACCTRESHOLD = 0.0; + + private SPARQLTasks sparqlTasks; + + private SortedSet<String> posExamples = new TreeSet<String>(); + + private SortedSet<String> fullPositiveSet = new TreeSet<String>(); + + // private SortedSet<String> fullPosSetWithoutPosExamples = new + // TreeSet<String>(); + + private SortedSet<String> negExamples = new TreeSet<String>(); + + private SortedSet<String> definitelyWrongIndividuals = new TreeSet<String>(); + + private List<EvaluatedDescription> conceptresults = new ArrayList<EvaluatedDescription>(); + + public WikipediaCategoryTasks(SPARQLTasks sparqlTasks) { + this.sparqlTasks = sparqlTasks; + } + + /** + * @param SKOSConcept + * @param percentOfSKOSSet + * @param negfactor + * @param sparqlResultLimit + */ + public void calculateDefinitelyWrongIndividuals(String SKOSConcept, + double percentOfSKOSSet, double negfactor, int sparqlResultLimit) { + + makeExamples(SKOSConcept, percentOfSKOSSet, negfactor, + sparqlResultLimit); + + LearnSparql learner = new LearnSparql( + prepareConfigurationToFindWrongIndividuals()); + LearningAlgorithm la = null; + try { + la = learner.learn(posExamples, negExamples); + } catch (Exception e) { + e.printStackTrace(); + } + // TODO maybe not smart here + ConceptSelector cs = new ConceptSelector(la, + MAXIMUM_NUMBER_OF_CONCEPTS_KEPT, ACCTRESHOLD); + conceptresults = cs.getConceptsNotContainingString("Entity", + MAXIMUM_NUMBER_OF_CONCEPTS_KEPT); + if (conceptresults.size() == 0) { + logger.warn("NO GOOD CONCEPTS FOUND"); + } + + definitelyWrongIndividuals = Helper.getStringSet(conceptresults.get(0) + .getNotCoveredPositives()); + + // clean the examples + posExamples.removeAll(definitelyWrongIndividuals); + fullPositiveSet.removeAll(definitelyWrongIndividuals); + // fullPosSetWithoutPosExamples.removeAll(definitelyWrongIndividuals); + + logger.trace("posExamples" + posExamples.size()); + logger.trace("fullPositives" + fullPositiveSet.size()); + + negExamples.clear(); + + } + + public void reevaluateAndRelearn() { + + ConceptSPARQLReEvaluator 
csparql = new ConceptSPARQLReEvaluator( + sparqlTasks, conceptresults); + List<EvaluatedDescription> reEvaluatedDesc; + + // TODO Optimize here + reEvaluatedDesc = csparql.reevaluateConceptsByLowestRecall( + fullPositiveSet, 1); + + // TODO add check if it is correct + WikipediaCategoryCleaner.printEvaluatedDescriptionCollection(10, + reEvaluatedDesc); + EvaluatedDescription newDesc = reEvaluatedDesc.get(0); + logger.info("Best concept: " + newDesc.getDescription()); + + negExamples.clear(); + negExamples.addAll(Helper.getStringSet(newDesc.getCoveredPositives())); + negExamples.addAll(Helper + .getStringSet(newDesc.getNotCoveredPositives())); + negExamples.... [truncated message content] |
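
For orientation, the two-phase flow that the new WikipediaCategoryCleaner wires together (visible in its main() method in the diff above) condenses roughly to the sketch below. It is only a sketch: it uses signatures that appear in this revision (Cache.getPersistentCache(), SPARQLTasks, WikipediaCategoryTasks.calculateDefinitelyWrongIndividuals(...), reevaluateAndRelearn(), getConceptresults(), printEvaluatedDescriptionCollection(...)), leaves out the log4j setup, and the class name WikipediaCategoryCleanerSketch as well as the choice of the public DBpedia endpoint (the script itself defaults to a local mirror) are illustrative assumptions, not part of the commit.

import java.util.List;

import org.dllearner.core.EvaluatedDescription;
import org.dllearner.kb.sparql.Cache;
import org.dllearner.kb.sparql.SPARQLTasks;
import org.dllearner.kb.sparql.SparqlEndpoint;
import org.dllearner.scripts.WikipediaCategoryCleaner;
import org.dllearner.scripts.improveWikipedia.WikipediaCategoryTasks;

public class WikipediaCategoryCleanerSketch {

    public static void main(String[] args) {
        // SPARQL access layer backed by the persistent cache; the real main()
        // switches between a local DBpedia mirror and the public endpoint
        SPARQLTasks sparqlTasks = new SPARQLTasks(Cache.getPersistentCache(),
                SparqlEndpoint.getEndpointDBpedia());

        // the Wikipedia category (SKOS concept) to be cleaned
        String target = "http://dbpedia.org/resource/Category:Prime_Ministers_of_the_United_Kingdom";

        WikipediaCategoryTasks tasks = new WikipediaCategoryTasks(sparqlTasks);

        // Phase 1: sample examples from the category, learn class descriptions
        // and collect the members not covered by the best learned description
        tasks.calculateDefinitelyWrongIndividuals(target,
                WikipediaCategoryCleaner.PERCENT_OF_SKOSSET,
                WikipediaCategoryCleaner.NEGFACTOR,
                WikipediaCategoryCleaner.SPARQL_RESULTSET_LIMIT);
        System.out.println("suspicious members: "
                + tasks.getDefinitelyWrongIndividuals());

        // Phase 2: re-evaluate the learned descriptions against the SPARQL
        // endpoint and relearn with the cleaned example sets
        tasks.reevaluateAndRelearn();

        List<EvaluatedDescription> best = tasks.getConceptresults();
        WikipediaCategoryCleaner.printEvaluatedDescriptionCollection(5, best);
    }
}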