From: <ku...@us...> - 2008-08-02 18:25:46
|
Revision: 1045 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=1045&view=rev Author: kurzum Date: 2008-08-02 18:25:33 +0000 (Sat, 02 Aug 2008) Log Message: ----------- Working category improver. One problem: it has many parameters that determine the outcome; it still has to be tested which values work best. Modified Paths: -------------- trunk/src/dl-learner/org/dllearner/Info.java trunk/src/dl-learner/org/dllearner/kb/extraction/ExtractionAlgorithm.java trunk/src/dl-learner/org/dllearner/kb/extraction/Manager.java trunk/src/dl-learner/org/dllearner/scripts/WikipediaCategoryCleaner.java trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSPARQLReEvaluator.java trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSelector.java trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/WikipediaCategoryTasks.java trunk/src/dl-learner/org/dllearner/utilities/datastructures/SetManipulation.java trunk/src/dl-learner/org/dllearner/utilities/examples/AutomaticNegativeExampleFinderSPARQL.java Modified: trunk/src/dl-learner/org/dllearner/Info.java =================================================================== --- trunk/src/dl-learner/org/dllearner/Info.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/Info.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -3,6 +3,6 @@ package org.dllearner; public class Info { - public static final String build = "2008-05-15"; + public static final String build = "2008-08-01"; } \ No newline at end of file Modified: trunk/src/dl-learner/org/dllearner/kb/extraction/ExtractionAlgorithm.java =================================================================== --- trunk/src/dl-learner/org/dllearner/kb/extraction/ExtractionAlgorithm.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/kb/extraction/ExtractionAlgorithm.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -24,7 +24,6 @@ import java.util.Vector; import org.apache.log4j.Logger; -import 
org.dllearner.core.KnowledgeSource; /** * This class is used to extract the information . @@ -39,7 +38,7 @@ // private boolean getAllSuperClasses = true; // private boolean closeAfterRecursion = true; private static Logger logger = Logger - .getLogger(KnowledgeSource.class); + .getLogger(ExtractionAlgorithm.class); public ExtractionAlgorithm(Configuration Configuration) { this.configuration = Configuration; @@ -75,7 +74,7 @@ //System.out.println(this.configuration); long time = System.currentTimeMillis(); Node n = getFirstNode(uri); - System.out.println(n); + logger.info(n); Vector<Node> v = new Vector<Node>(); v.add(n); logger.info("StartVector: " + v); Modified: trunk/src/dl-learner/org/dllearner/kb/extraction/Manager.java =================================================================== --- trunk/src/dl-learner/org/dllearner/kb/extraction/Manager.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/kb/extraction/Manager.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -25,7 +25,6 @@ import java.util.TreeSet; import org.apache.log4j.Logger; -import org.dllearner.core.KnowledgeSource; import org.dllearner.kb.sparql.SparqlEndpoint; import org.dllearner.kb.sparql.SparqlQueryType; import org.dllearner.utilities.statistics.Statistics; @@ -43,7 +42,7 @@ private ExtractionAlgorithm extractionAlgorithm; private static Logger logger = Logger - .getLogger(KnowledgeSource.class); + .getLogger(Manager.class); public void useConfiguration(SparqlQueryType SparqlQueryType, @@ -77,12 +76,14 @@ public String extract(Set<String> instances) { // this.TypedSparqlQuery.query(uri); // System.out.println(ExtractionAlgorithm.getFirstNode(uri)); - System.out.println("Start extracting"); + logger.info("Start extracting"); SortedSet<String> ret = new TreeSet<String>(); int progress=0; for (String one : instances) { progress++; - logger.info("Progress: "+progress+" of "+instances.size()+" finished"); + //if(progress % 10 == 0) { + logger.info("Progress: "+progress+" of 
"+instances.size()+" finished: "+one); + //} try { Node n = extractionAlgorithm.expandNode(new URI(one), typedSparqlQuery); @@ -91,15 +92,15 @@ e.printStackTrace(); } } - System.out.println("Finished extracting, start conversion"); + logger.info("Finished extracting, start conversion"); StringBuffer nt = new StringBuffer(); Object[] arr = ret.toArray(); for (int i = 0; i < arr.length; i++) { nt.append((String) arr[i] + "\n"); if (i % 1000 == 0) - System.out.println(i + " of " + arr.length + " triples done"); + logger.info(i + " of " + arr.length + " triples done"); } - System.out.println(arr.length + " of " + arr.length + " triples done"); + logger.info(arr.length + " of " + arr.length + " triples done"); /* * String tmp=""; while ( ret.size() > 0) { tmp=ret.first(); nt+=tmp; * ret.remove(tmp); System.out.println(ret.size()); } /*for (String str : Modified: trunk/src/dl-learner/org/dllearner/scripts/WikipediaCategoryCleaner.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/WikipediaCategoryCleaner.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/scripts/WikipediaCategoryCleaner.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -23,24 +23,33 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; import org.apache.log4j.ConsoleAppender; import org.apache.log4j.FileAppender; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.SimpleLayout; +import org.dllearner.algorithms.refexamples.ExampleBasedROLComponent; import org.dllearner.core.EvaluatedDescription; import org.dllearner.core.KnowledgeSource; +import org.dllearner.core.LearningAlgorithm; import org.dllearner.core.owl.Individual; +import org.dllearner.kb.extraction.ExtractionAlgorithm; +import org.dllearner.kb.extraction.Manager; import org.dllearner.kb.sparql.Cache; import org.dllearner.kb.sparql.SPARQLTasks; 
import org.dllearner.kb.sparql.SparqlEndpoint; +import org.dllearner.kb.sparql.SparqlKnowledgeSource; import org.dllearner.kb.sparql.SparqlQuery; +import org.dllearner.scripts.improveWikipedia.ConceptSPARQLReEvaluator; import org.dllearner.scripts.improveWikipedia.ConceptSelector; import org.dllearner.scripts.improveWikipedia.WikipediaCategoryTasks; -import org.dllearner.utilities.JamonMonitorLogger; import org.dllearner.utilities.examples.AutomaticNegativeExampleFinderSPARQL; import org.dllearner.utilities.examples.AutomaticPositiveExampleFinderSPARQL; +import org.dllearner.utilities.learn.LearnSPARQLConfiguration; +import org.dllearner.utilities.learn.LearnSparql; public class WikipediaCategoryCleaner { @@ -50,93 +59,173 @@ private static Logger logger = Logger.getRootLogger(); - private static boolean local = true; // localEndpoint switch + // localEndpoint switch + private static final boolean LOCAL = false; // parameters - public static final int SPARQL_RESULTSET_LIMIT = 1000; + // used for developing, + private static final boolean DEVELOP = true; - public static double PERCENT_OF_SKOSSET = 1.0; // the 70/30 strategy was + public static final int SPARQL_RESULTSET_LIMIT = 500; - // abandoned + // the 70/30 strategy was abandoned + public static double PERCENT_OF_SKOSSET = 1.0; - public static double NEGFACTOR = 1.0; // size of randomly choosen negative + // size of randomly choosen negative examples compared to positives + public static double NEGFACTOR = 1.0; - // examples compared to positives + public static int MAX_NR_CONCEPTS_TO_BE_EVALUATED = Integer.MAX_VALUE; + public static double ACCURACY_THRESHOLD = 0.0; + + public static String FILTER_CONCEPTS_BY = "Entity"; + /** * @param args */ public static void main(String[] args) { initLogger(); + setup(); logger.info("Start"); - // SETUP cache and sparqltasks - cache = Cache.getPersistentCache(); + String test = "http://dbpedia.org/resource/Category:Prime_Ministers_of_the_United_Kingdom"; + test = 
"http://dbpedia.org/resource/Category:Best_Actor_Academy_Award_winners"; + SortedSet<String> wikipediaCategories = new TreeSet<String>(); + wikipediaCategories.add(test); - if (local) { - // url = "http://139.18.2.37:8890/sparql"; - sparqlTasks = new SPARQLTasks(cache, SparqlEndpoint - .getEndpointLOCALDBpedia()); - } else { - // url = "http://dbpedia.openlinksw.com:8890/sparql"; - sparqlTasks = new SPARQLTasks(cache, SparqlEndpoint - .getEndpointDBpedia()); + for (String target : wikipediaCategories) { + + doit(target); + } - String target = "http://dbpedia.org/resource/Category:Prime_Ministers_of_the_United_Kingdom"; - target = "http://dbpedia.org/resource/Category:Best_Actor_Academy_Award_winners"; + System.out.println("Finished"); + // JamonMonitorLogger.printAllSortedByLabel(); - WikipediaCategoryTasks s = new WikipediaCategoryTasks(sparqlTasks); - // TODO Optimize - s.calculateDefinitelyWrongIndividuals(target, PERCENT_OF_SKOSSET, - NEGFACTOR, SPARQL_RESULTSET_LIMIT); + } - logger.info("Found " + s.getDefinitelyWrongIndividuals().size() - + " incorrect individuals"); - logger.debug("incorrect Individuals: " - + s.getDefinitelyWrongIndividuals()); - logger.info("reevaluating " + s.getConceptresults().size() - + " found Concepts"); - logger - .info("END OF PHASE 1 **********************************************"); + private static void doit(String target) { + List<EvaluatedDescription> conceptresults; + SortedSet<String> currentPOSITIVEex = new TreeSet<String>(); + SortedSet<String> currentNEGATIVEex = new TreeSet<String>(); + SortedSet<String> wrongIndividuals; - s.reevaluateAndRelearn(); - List<EvaluatedDescription> newEval = s.getConceptresults(); - printEvaluatedDescriptionCollection(5, newEval); + WikipediaCategoryTasks wikiTasks; + ConceptSPARQLReEvaluator csparql; - System.out.println("Finished"); - JamonMonitorLogger.printAllSortedByLabel(); + wikiTasks = new WikipediaCategoryTasks(sparqlTasks); + csparql = new ConceptSPARQLReEvaluator(sparqlTasks); + // 
PHASE 1 ************* + + wikiTasks.makeInitialExamples(target, PERCENT_OF_SKOSSET, NEGFACTOR, + SPARQL_RESULTSET_LIMIT, DEVELOP); + currentPOSITIVEex.addAll(wikiTasks.getPosExamples()); + currentNEGATIVEex.addAll(wikiTasks.getNegExamples()); + // get wrong individuals and reevaluate concepts + conceptresults = learn(getConfToFindWrongIndividuals(), + currentPOSITIVEex, currentNEGATIVEex); + // TODO select concepts + conceptresults = selectConcepts(conceptresults); + wrongIndividuals = wikiTasks.calculateWrongIndividualsAndNewPosEx( + conceptresults, currentPOSITIVEex); + currentPOSITIVEex.clear(); + currentPOSITIVEex.addAll(wikiTasks.getCleanedPositiveSet()); + + // reevaluate versus the Endpoint + conceptresults = csparql.reevaluateConceptsByLowestRecall( + conceptresults, currentPOSITIVEex); + + WikipediaCategoryCleaner.printEvaluatedDescriptionCollection(2, + conceptresults); + + printIntermediateResults(wikiTasks.getFullPositiveSet(), + wikiTasks.getCleanedPositiveSet(), + wrongIndividuals, conceptresults.size()); + + // PHASE 2 *********************** + logger.info("PHASE 2 ***********************"); + logger.info("making new Negative Examples"); + currentNEGATIVEex = wikiTasks.makeNewNegativeExamples(conceptresults, + currentPOSITIVEex, NEGFACTOR); + + logger.info("learning"); + conceptresults = learn(getConfToRelearn(), currentPOSITIVEex, + currentNEGATIVEex); + // TODO select concepts + logger.info("reducing concept size before evaluating"); + conceptresults = selectConcepts(conceptresults); + // reevaluate versus the Endpoint + conceptresults = csparql.reevaluateConceptsByLowestRecall( + conceptresults, currentPOSITIVEex); + + printEvaluatedDescriptionCollection(2, conceptresults); + collectResults(wikiTasks); + } - private static void initLogger() { + private static void collectResults(WikipediaCategoryTasks wikiTasks) { + System.out.println(wikiTasks.getFullPositiveSet()); + System.out.println(wikiTasks.getCleanedPositiveSet()); + 
System.out.println(wikiTasks.getDefinitelyWrongIndividuals()); + } - SimpleLayout layout = new SimpleLayout(); - // create logger (a simple logger which outputs - // its messages to the console) - FileAppender fileAppender = null; + private static List<EvaluatedDescription> selectConcepts( + List<EvaluatedDescription> concepts) { + // TODO maybe not smart here + ConceptSelector cs = new ConceptSelector(); + concepts = cs.getConceptsNotContainingString(concepts, + FILTER_CONCEPTS_BY, MAX_NR_CONCEPTS_TO_BE_EVALUATED); + if (concepts.size() == 0) { + logger.warn("NO GOOD CONCEPTS FOUND"); + // TODO if this happens there has to be a fallback + } + return concepts; + } + + /** + * All Concepts are returned, filtering these are done separately + * + * @param conf + * @param posExamples + * @param negExamples + * @return + */ + private static List<EvaluatedDescription> learn( + LearnSPARQLConfiguration conf, SortedSet<String> posExamples, + SortedSet<String> negExamples) { + LearnSparql learner = new LearnSparql(getConfToRelearn()); + LearningAlgorithm la = null; try { - fileAppender = new FileAppender(layout, "log/progress/skos" - + ConceptSelector.time() + ".txt", false); + la = learner.learn(posExamples, negExamples); } catch (Exception e) { e.printStackTrace(); } + List<EvaluatedDescription> conceptresults = la + .getCurrentlyBestEvaluatedDescriptions(Integer.MAX_VALUE, 0.0, + true); + return conceptresults; + } - ConsoleAppender consoleAppender = new ConsoleAppender(layout); - logger.removeAllAppenders(); - logger.addAppender(consoleAppender); - logger.addAppender(fileAppender); - logger.setLevel(Level.DEBUG); - Logger.getLogger(KnowledgeSource.class).setLevel(Level.WARN); + private static LearnSPARQLConfiguration getConfToFindWrongIndividuals() { + LearnSPARQLConfiguration lsc = new LearnSPARQLConfiguration(); + lsc.sparqlEndpoint = sparqlTasks.getSparqlEndpoint(); - Logger.getLogger(SparqlQuery.class).setLevel(Level.INFO); - 
Logger.getLogger(Cache.class).setLevel(Level.INFO); - Logger.getLogger(AutomaticNegativeExampleFinderSPARQL.class).setLevel( - Level.INFO); - Logger.getLogger(AutomaticPositiveExampleFinderSPARQL.class).setLevel( - Level.INFO); + lsc.noisePercentage = 15; + lsc.guaranteeXgoodDescriptions = 200; + lsc.maxExecutionTimeInSeconds = 50; + lsc.logLevel = "INFO"; + // lsc.searchTreeFile = "log/WikipediaCleaner.txt"; + + return lsc; + } + private static LearnSPARQLConfiguration getConfToRelearn() { + return getConfToFindWrongIndividuals(); + + } + public static void printEvaluatedDescriptionCollection(int howMany, Collection<EvaluatedDescription> c) { int x = 0; @@ -147,10 +236,10 @@ first = ed.getNotCoveredPositives(); } if (x >= howMany) { - x++; + break; } - + x++; tmp.addAll(ed.getNotCoveredPositives()); tmp.removeAll(first); logger.debug("*************************"); @@ -163,4 +252,74 @@ } } + + + private static void printIntermediateResults( + SortedSet fullSet, + SortedSet correctIndividuals, + SortedSet wrongIndividuals, + int numberOfConcepts) { + printSet("full Individual set: ", fullSet); + + printSet("correct Individuals: ", correctIndividuals); + printSet("incorrect Individuals: ", wrongIndividuals); + logger.info("reevaluated " + numberOfConcepts + " found Concepts"); + logger.info("END OF PHASE 1 **********************"); + } + private static void printSet(String s, SortedSet set) { + if(logger.getLevel().equals(Level.DEBUG)){ + logger.info(s +" ["+ set.size()+"]: "+set); + }else{ + logger.info(s +" ["+ set.size()+"]"); + } + + } + + private static void setup() { + // SETUP cache and sparqltasks + cache = Cache.getPersistentCache(); + + if (LOCAL) { + // url = "http://139.18.2.37:8890/sparql"; + sparqlTasks = new SPARQLTasks(cache, SparqlEndpoint + .getEndpointLOCALDBpedia()); + } else { + // url = "http://dbpedia.openlinksw.com:8890/sparql"; + sparqlTasks = new SPARQLTasks(cache, SparqlEndpoint + .getEndpointDBpedia()); + } + } + + private static void 
initLogger() { + + SimpleLayout layout = new SimpleLayout(); + // create logger (a simple logger which outputs + // its messages to the console) + FileAppender fileAppender = null; + try { + fileAppender = new FileAppender(layout, "log/progress/skos" + + ConceptSelector.time() + ".txt", false); + } catch (Exception e) { + e.printStackTrace(); + } + + ConsoleAppender consoleAppender = new ConsoleAppender(layout); + logger.removeAllAppenders(); + logger.addAppender(consoleAppender); + logger.addAppender(fileAppender); + logger.setLevel(Level.DEBUG); + Logger.getLogger(KnowledgeSource.class).setLevel(Level.WARN); + Logger.getLogger(SparqlKnowledgeSource.class).setLevel(Level.WARN); + Logger.getLogger(Manager.class).setLevel(Level.INFO); + Logger.getLogger(ExtractionAlgorithm.class).setLevel(Level.WARN); + Logger.getLogger(AutomaticNegativeExampleFinderSPARQL.class).setLevel( + Level.WARN); + Logger.getLogger(AutomaticPositiveExampleFinderSPARQL.class).setLevel( + Level.WARN); + Logger.getLogger(ExampleBasedROLComponent.class).setLevel(Level.WARN); + Logger.getLogger(SparqlQuery.class).setLevel(Level.INFO); + Logger.getLogger(Cache.class).setLevel(Level.INFO); + + } + } Modified: trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSPARQLReEvaluator.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSPARQLReEvaluator.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSPARQLReEvaluator.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -32,11 +32,10 @@ import org.dllearner.utilities.owl.EvaluatedDescriptionComparator; /** - * @author Sebastian Hellmann + * @author Sebastian Hellmann * - * The EvaluatedDescriptions from a fragment are - * validated against the SPARQLendpoint. 
- * There are different strategies, see the methods; + * The EvaluatedDescriptions from a fragment are validated against the + * SPARQLendpoint. There are different strategies, see the methods; */ public class ConceptSPARQLReEvaluator { @@ -51,33 +50,29 @@ int depthOfRDFS = 1; - public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks, - List<EvaluatedDescription> descToBeReevaluated) { - this.descToBeReevaluated = descToBeReevaluated; + /** + * Constructor using default settings + * + * @param sparqlTasks + */ + public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks) { this.sparqlTasks = sparqlTasks; } - public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks, - List<EvaluatedDescription> descToBeReevaluated, int depthOfRDFS, + /** + * constructor to manually set parameters + * + * @param sparqlTasks + * @param depthOfRDFS + * @param sparqlResultLimit + */ + public ConceptSPARQLReEvaluator(SPARQLTasks sparqlTasks, int depthOfRDFS, int sparqlResultLimit) { - this(sparqlTasks, descToBeReevaluated); + this(sparqlTasks); this.depthOfRDFS = depthOfRDFS; this.sparqlResultLimit = sparqlResultLimit; } - - public List<EvaluatedDescription> reevaluateConceptsByDataCoverage( - SortedSet<String> positiveSet, int maxNrOfConcepts) { - List<EvaluatedDescription> tmp = reevaluateConceptsByLowestRecall(positiveSet); - List<EvaluatedDescription> returnSet = new ArrayList<EvaluatedDescription>(); - - while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) { - returnSet.add(tmp.remove(0)); - } - - return returnSet; - } - /** * Accuracy is calculated as correct positive classified over (correct * positive classified + incorrect negative classified) "How many are @@ -88,6 +83,7 @@ * @return */ public List<EvaluatedDescription> reevaluateConceptsByDataCoverage( + List<EvaluatedDescription> descToBeReevaluated, SortedSet<String> positiveSet) { SortedSet<EvaluatedDescription> returnSet = new TreeSet<EvaluatedDescription>( @@ -127,18 +123,6 @@ } - public 
List<EvaluatedDescription> reevaluateConceptsByLowestRecall( - SortedSet<String> positiveSet, int maxNrOfConcepts) { - List<EvaluatedDescription> tmp = reevaluateConceptsByLowestRecall(positiveSet); - List<EvaluatedDescription> returnSet = new ArrayList<EvaluatedDescription>(); - - while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) { - returnSet.add(tmp.remove(0)); - } - - return returnSet; - } - /** * Accuracy is calculated as correct positive classified over all retrieved * e.g. 50 correct out of 400 retrieved (50/400) @@ -147,6 +131,7 @@ * @return */ public List<EvaluatedDescription> reevaluateConceptsByLowestRecall( + List<EvaluatedDescription> descToBeReevaluated, SortedSet<String> positiveSet) { logger.info("reevaluating by lowest recall " + descToBeReevaluated.size() + " concepts"); @@ -159,7 +144,6 @@ SortedSet<String> PosAsNeg = new TreeSet<String>(); SortedSet<Individual> NegAsPos = new TreeSet<Individual>(); - SortedSet<Individual> NegAsNeg = new TreeSet<Individual>(); // elements are immediately removed from the list to save memory @@ -197,4 +181,30 @@ kbsyntax, sparqlResultLimit, depthOfRDFS); } + /* + * public List<EvaluatedDescription> reevaluateConceptsByLowestRecall( List<EvaluatedDescription> + * descToBeReevaluated, SortedSet<String> positiveSet, int maxNrOfConcepts) { + * List<EvaluatedDescription> tmp = + * reevaluateConceptsByLowestRecall(descToBeReevaluated, positiveSet); List<EvaluatedDescription> + * returnSet = new ArrayList<EvaluatedDescription>(); + * + * while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) { + * returnSet.add(tmp.remove(0)); } + * + * return returnSet; } + */ + + /* + * public List<EvaluatedDescription> reevaluateConceptsByDataCoverage( List<EvaluatedDescription> + * descToBeReevaluated, SortedSet<String> positiveSet, int maxNrOfConcepts) { + * List<EvaluatedDescription> tmp = + * reevaluateConceptsByLowestRecall(descToBeReevaluated, positiveSet); List<EvaluatedDescription> + * returnSet = 
new ArrayList<EvaluatedDescription>(); + * + * while ((!tmp.isEmpty()) && (returnSet.size() <= maxNrOfConcepts)) { + * returnSet.add(tmp.remove(0)); } + * + * return returnSet; } + */ + } Modified: trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSelector.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSelector.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/ConceptSelector.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -24,79 +24,69 @@ import java.util.List; import org.dllearner.core.EvaluatedDescription; -import org.dllearner.core.LearningAlgorithm; import org.dllearner.utilities.Files; +import org.dllearner.utilities.datastructures.SetManipulation; /** - * This is a simple class, it might be worked into other classes later. - * filters concepts and records some results + * This is a simple class, it might be worked into other classes later. 
filters + * concepts and records some results * * @author Sebastian Hellmann - * + * */ public class ConceptSelector { private static final long WASH = 1216800000000L; - - List<EvaluatedDescription> concepts; - - public ConceptSelector(LearningAlgorithm la) { + + // List<EvaluatedDescription> concepts; + + public ConceptSelector() { super(); - this.concepts = la.getCurrentlyBestEvaluatedDescriptions(Integer.MAX_VALUE, 0.0, true); - this.recordConceptClasses(); - + // this.concepts = concepts; + // this.recordConceptClasses(); + } - - - public ConceptSelector(LearningAlgorithm la, int maxNrOfConcepts) { - super(); - this.concepts = la.getCurrentlyBestEvaluatedDescriptions(maxNrOfConcepts); - + + public List<EvaluatedDescription> getAllConceptsWithoutOR( + List<EvaluatedDescription> concepts) { + return getConceptsNotContainingString(concepts, "OR"); } - - public ConceptSelector(LearningAlgorithm la, int maxNrOfConcepts, double acctreshold) { - super(); - this.concepts = la.getCurrentlyBestEvaluatedDescriptions(maxNrOfConcepts, acctreshold, true); - this.recordConceptClasses(); + + @SuppressWarnings("unchecked") + public List<EvaluatedDescription> getConceptsNotContainingString( + List<EvaluatedDescription> concepts, String filterString, + int limitSize) { + // List<EvaluatedDescription> tmp = + // getConceptsNotContainingString(filterString); + // List<EvaluatedDescription> result = new + // ArrayList<EvaluatedDescription>(); + return SetManipulation.getFirst(getConceptsNotContainingString( + concepts, filterString), limitSize); + /* + * while ((!tmp.isEmpty()) && (result.size() <= limitSize)) { + * result.add(tmp.remove(0)); } return result; + */ } - - public List<EvaluatedDescription> getConceptsWithoutOR(){ - return getConceptsNotContainingString("OR"); - } - - public List<EvaluatedDescription> getConceptsNotContainingString(String filterString, int limitSize){ - List<EvaluatedDescription> tmp = getConceptsNotContainingString(filterString); - 
List<EvaluatedDescription> result = new ArrayList<EvaluatedDescription>(); - - while ((!tmp.isEmpty()) && (result.size() <= limitSize)) { - result.add(tmp.remove(0)); - } - return result; - } - - - public List<EvaluatedDescription> getConceptsNotContainingString(String filterString){ - + + public List<EvaluatedDescription> getConceptsNotContainingString( + List<EvaluatedDescription> concepts, String filterString) { + List<EvaluatedDescription> result = new ArrayList<EvaluatedDescription>(); for (EvaluatedDescription description : concepts) { if (!description.toString().contains(filterString)) { - result.add(description); + result.add(description); } - + } return result; } - - - - public void recordConceptClasses() { - StringBuffer result =new StringBuffer(); - StringBuffer result1 =new StringBuffer("\n\n ***********Entity*****\n"); - StringBuffer result2 =new StringBuffer("\n\n ***********OR*****\n"); + public void recordConceptClasses(List<EvaluatedDescription> concepts) { + StringBuffer result = new StringBuffer(); + StringBuffer result1 = new StringBuffer("\n\n ***********Entity*****\n"); + StringBuffer result2 = new StringBuffer("\n\n ***********OR*****\n"); int result1count = 1; int result2count = 1; - int x = 0; for (EvaluatedDescription description : concepts) { @@ -121,7 +111,7 @@ Files.createFile(new File("results/descriptions/concepts" + time() + ".txt"), result.toString()); } - + public static String time() { return ("" + (System.currentTimeMillis() - WASH)).substring(0, 7); Modified: trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/WikipediaCategoryTasks.java =================================================================== --- trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/WikipediaCategoryTasks.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/scripts/improveWikipedia/WikipediaCategoryTasks.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -19,90 +19,62 @@ */ package 
org.dllearner.scripts.improveWikipedia; -import java.util.ArrayList; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; import org.apache.log4j.Logger; import org.dllearner.core.EvaluatedDescription; -import org.dllearner.core.LearningAlgorithm; import org.dllearner.kb.sparql.SPARQLTasks; -import org.dllearner.scripts.WikipediaCategoryCleaner; import org.dllearner.utilities.Helper; import org.dllearner.utilities.datastructures.SetManipulation; import org.dllearner.utilities.examples.AutomaticNegativeExampleFinderSPARQL; import org.dllearner.utilities.examples.AutomaticPositiveExampleFinderSPARQL; -import org.dllearner.utilities.learn.LearnSPARQLConfiguration; -import org.dllearner.utilities.learn.LearnSparql; public class WikipediaCategoryTasks { private static Logger logger = Logger .getLogger(WikipediaCategoryTasks.class); - private static final boolean STABLE = true; // used for developing, same - - // negExamples not random - - private static final int MAXIMUM_NUMBER_OF_CONCEPTS_KEPT = Integer.MAX_VALUE; - - private static final double ACCTRESHOLD = 0.0; - private SPARQLTasks sparqlTasks; + // these cahnge all the time private SortedSet<String> posExamples = new TreeSet<String>(); - private SortedSet<String> fullPositiveSet = new TreeSet<String>(); + private SortedSet<String> negExamples = new TreeSet<String>(); - // private SortedSet<String> fullPosSetWithoutPosExamples = new - // TreeSet<String>(); + // these dont change, they are for collecting + private SortedSet<String> cleanedPositiveSet = new TreeSet<String>(); - private SortedSet<String> negExamples = new TreeSet<String>(); + private SortedSet<String> fullPositiveSet = new TreeSet<String>(); private SortedSet<String> definitelyWrongIndividuals = new TreeSet<String>(); - private List<EvaluatedDescription> conceptresults = new ArrayList<EvaluatedDescription>(); - public WikipediaCategoryTasks(SPARQLTasks sparqlTasks) { this.sparqlTasks = sparqlTasks; } /** - * @param SKOSConcept 
- * @param percentOfSKOSSet - * @param negfactor - * @param sparqlResultLimit + * The strategy is yet really simple. //TODO take the best concept and the + * notCoveredPositives are the ones definitely wrong these are removed from + * the positives examples. + * + * @param conceptresults + * @param posExamples + * @return */ - public void calculateDefinitelyWrongIndividuals(String SKOSConcept, - double percentOfSKOSSet, double negfactor, int sparqlResultLimit) { + public SortedSet<String> calculateWrongIndividualsAndNewPosEx( + List<EvaluatedDescription> conceptresults, + SortedSet<String> posExamples) { - makeExamples(SKOSConcept, percentOfSKOSSet, negfactor, - sparqlResultLimit); - - LearnSparql learner = new LearnSparql( - prepareConfigurationToFindWrongIndividuals()); - LearningAlgorithm la = null; - try { - la = learner.learn(posExamples, negExamples); - } catch (Exception e) { - e.printStackTrace(); - } - // TODO maybe not smart here - ConceptSelector cs = new ConceptSelector(la, - MAXIMUM_NUMBER_OF_CONCEPTS_KEPT, ACCTRESHOLD); - conceptresults = cs.getConceptsNotContainingString("Entity", - MAXIMUM_NUMBER_OF_CONCEPTS_KEPT); - if (conceptresults.size() == 0) { - logger.warn("NO GOOD CONCEPTS FOUND"); - } - definitelyWrongIndividuals = Helper.getStringSet(conceptresults.get(0) .getNotCoveredPositives()); // clean the examples posExamples.removeAll(definitelyWrongIndividuals); - fullPositiveSet.removeAll(definitelyWrongIndividuals); + this.posExamples.clear(); + this.posExamples.addAll(posExamples); + this.cleanedPositiveSet.addAll(posExamples); // fullPosSetWithoutPosExamples.removeAll(definitelyWrongIndividuals); logger.trace("posExamples" + posExamples.size()); @@ -110,25 +82,24 @@ negExamples.clear(); + return definitelyWrongIndividuals; + } - public void reevaluateAndRelearn() { + /** + * TODO could be more sophisticated + * + * @param reEvaluatedDesc + * @return + */ + public SortedSet<String> makeNewNegativeExamples( + List<EvaluatedDescription> 
reEvaluatedDesc, + SortedSet<String> posExamples, double negFactor) { + negExamples.clear(); - ConceptSPARQLReEvaluator csparql = new ConceptSPARQLReEvaluator( - sparqlTasks, conceptresults); - List<EvaluatedDescription> reEvaluatedDesc; - - // TODO Optimize here - reEvaluatedDesc = csparql.reevaluateConceptsByLowestRecall( - fullPositiveSet, 1); - - // TODO add check if it is correct - WikipediaCategoryCleaner.printEvaluatedDescriptionCollection(10, - reEvaluatedDesc); EvaluatedDescription newDesc = reEvaluatedDesc.get(0); logger.info("Best concept: " + newDesc.getDescription()); - negExamples.clear(); negExamples.addAll(Helper.getStringSet(newDesc.getCoveredPositives())); negExamples.addAll(Helper .getStringSet(newDesc.getNotCoveredPositives())); @@ -137,33 +108,28 @@ .getStringSet(newDesc.getNotCoveredNegatives())); negExamples.removeAll(posExamples); - // TODO could be more negatives - negExamples = SetManipulation.fuzzyShrink(negExamples, posExamples - .size()); - LearnSparql learner = new LearnSparql(prepareConfigurationToRelearn()); - LearningAlgorithm la = null; - try { - la = learner.learn(posExamples, negExamples); - } catch (Exception e) { - e.printStackTrace(); - } - conceptresults = la.getCurrentlyBestEvaluatedDescriptions(500, - ACCTRESHOLD, true); + int neglimit = (int) Math.round(posExamples.size() * negFactor); + negExamples = SetManipulation.fuzzyShrink(negExamples, neglimit); + return negExamples; } /** - * @param SKOSConcept + * makes positive and negative Examples. positives are a simple retrieval of + * the category. negatives are made from parallelclasses. 
+ * + * @param targetCategory * @param percentOfSKOSSet * percentage used from the SKOSSet for training - * @param negfactor + * @param negFactor * size of the negative Examples compared to the posExample size * (1.0 means equal size) * @param sparqlResultLimit */ - public void makeExamples(String SKOSConcept, double percentOfSKOSSet, - double negfactor, int sparqlResultLimit) { + public void makeInitialExamples(String targetCategory, + double percentOfSKOSSet, double negFactor, int sparqlResultLimit, + boolean develop) { fullPositiveSet.clear(); // fullPosSetWithoutPosExamples.clear(); posExamples.clear(); @@ -172,12 +138,12 @@ // POSITIVES AutomaticPositiveExampleFinderSPARQL apos = new AutomaticPositiveExampleFinderSPARQL( sparqlTasks); - apos.makePositiveExamplesFromSKOSConcept(SKOSConcept); - fullPositiveSet = apos.getPosExamples(); + apos.makePositiveExamplesFromSKOSConcept(targetCategory); + fullPositiveSet.addAll(apos.getPosExamples()); int poslimit = (int) Math.round(percentOfSKOSSet * fullPositiveSet.size()); - int neglimit = (int) Math.round(poslimit * negfactor); + int neglimit = (int) Math.round(poslimit * negFactor); posExamples = SetManipulation.fuzzyShrink(fullPositiveSet, poslimit); @@ -188,7 +154,7 @@ aneg.makeNegativeExamplesFromParallelClasses(posExamples, sparqlResultLimit); - negExamples = aneg.getNegativeExamples(neglimit, STABLE); + negExamples = aneg.getNegativeExamples(neglimit, develop); logger.debug("POSITIVE EXAMPLES"); for (String pos : posExamples) { @@ -209,25 +175,6 @@ } - private LearnSPARQLConfiguration prepareConfigurationToFindWrongIndividuals() { - LearnSPARQLConfiguration lsc = new LearnSPARQLConfiguration(); - lsc.sparqlEndpoint = sparqlTasks.getSparqlEndpoint(); - - lsc.noisePercentage = 15; - lsc.guaranteeXgoodDescriptions = 200; - lsc.maxExecutionTimeInSeconds = 50; - lsc.logLevel = "INFO"; - // lsc.searchTreeFile = "log/WikipediaCleaner.txt"; - - return lsc; - - } - - private LearnSPARQLConfiguration 
prepareConfigurationToRelearn() { - return prepareConfigurationToFindWrongIndividuals(); - - } - public SortedSet<String> getPosExamples() { return posExamples; } @@ -244,8 +191,8 @@ return definitelyWrongIndividuals; } - public List<EvaluatedDescription> getConceptresults() { - return conceptresults; + public SortedSet<String> getCleanedPositiveSet() { + return cleanedPositiveSet; } } Modified: trunk/src/dl-learner/org/dllearner/utilities/datastructures/SetManipulation.java =================================================================== --- trunk/src/dl-learner/org/dllearner/utilities/datastructures/SetManipulation.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/utilities/datastructures/SetManipulation.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -1,5 +1,6 @@ package org.dllearner.utilities.datastructures; +import java.util.List; import java.util.Random; import java.util.SortedSet; import java.util.TreeSet; @@ -8,57 +9,75 @@ public class SetManipulation { - /** - * shrinks a set to the limit - * fuzzy here means the elements will be randomly picked + * shrinks a set to the limit fuzzy here means the elements will be randomly + * picked + * * @param set * @param limit * @return */ public static SortedSet<String> fuzzyShrink(SortedSet<String> set, int limit) { - if (set.size()<=limit)return set; + if (set.size() <= limit) + return set; SortedSet<String> ret = new TreeSet<String>(); Random r = new Random(); - double treshold = ((double)limit)/set.size(); - //System.out.println("treshold"+howmany); - //System.out.println("treshold"+allRetrieved.size()); - //System.out.println("treshold"+treshold); - - - while(ret.size()< limit){ + double treshold = ((double) limit) / set.size(); + // System.out.println("treshold"+howmany); + // System.out.println("treshold"+allRetrieved.size()); + // System.out.println("treshold"+treshold); + + while (ret.size() < limit) { for (String oneInd : set) { - if(r.nextDouble()<treshold) { + if (r.nextDouble() 
< treshold) { ret.add(oneInd); - if(ret.size()>= limit)break; + if (ret.size() >= limit) + break; } } } return ret; } - + /** - * shrinks a set to the limit - * takes the first elements up to limit + * shrinks a set to the limit takes the first elements up to limit + * * @param set * @param limit * @return */ - public static SortedSet<String> stableShrink(SortedSet<String> set, int limit) { - if (set.size()<=limit)return set; + public static SortedSet<String> stableShrink(SortedSet<String> set, + int limit) { + if (set.size() <= limit) + return set; SortedSet<String> ret = new TreeSet<String>(); - - + for (String oneInd : set) { ret.add(oneInd); - if(ret.size()>= limit)break; - + if (ret.size() >= limit) + break; + } - + return ret; } - - public static SortedSet<Individual> stringToInd(SortedSet<String> set ){ + + /** + * getFirst n Elements from list. + * + * @param list + * @param nrElements + * @return returns the list shrunken to size. it is an ARRAYLIST now + */ + public static List getFirst(List list, int nrElements) { + int size; + while ((size = list.size()) > nrElements) { + list.remove(size - 1); + } + return list; + } + + public static SortedSet<Individual> stringToInd(SortedSet<String> set) { SortedSet<Individual> ret = new TreeSet<Individual>(); for (String ind : set) { ret.add(new Individual(ind)); Modified: trunk/src/dl-learner/org/dllearner/utilities/examples/AutomaticNegativeExampleFinderSPARQL.java =================================================================== --- trunk/src/dl-learner/org/dllearner/utilities/examples/AutomaticNegativeExampleFinderSPARQL.java 2008-07-31 16:37:22 UTC (rev 1044) +++ trunk/src/dl-learner/org/dllearner/utilities/examples/AutomaticNegativeExampleFinderSPARQL.java 2008-08-02 18:25:33 UTC (rev 1045) @@ -129,16 +129,17 @@ }*/ /** - * makes neg ex from classes, the pos ex belong to + * makes negEx from classes, the posEx belong to. 
+ * Gets all Classes from PosEx, gets Instances from these Classes, returns all * @param positiveSet - * @param resultLimit + * @param sparqlResultLimit */ - public void makeNegativeExamplesFromParallelClasses(SortedSet<String> positiveSet, int resultLimit){ - makeNegativeExamplesFromClassesOfInstances(positiveSet, resultLimit); + public void makeNegativeExamplesFromParallelClasses(SortedSet<String> positiveSet, int sparqlResultLimit){ + makeNegativeExamplesFromClassesOfInstances(positiveSet, sparqlResultLimit); } private void makeNegativeExamplesFromClassesOfInstances(SortedSet<String> positiveSet, - int resultLimit) { + int sparqlResultLimit) { logger.debug("making neg Examples from parallel classes"); SortedSet<String> classes = new TreeSet<String>(); // superClasses.add(concept.replace("\"", "")); @@ -148,7 +149,7 @@ // superclasses"); for (String instance : positiveSet) { - classes.addAll(sparqltasks.getClassesForInstance(instance, resultLimit)); + classes.addAll(sparqltasks.getClassesForInstance(instance, sparqlResultLimit)); } logger.debug("getting negExamples from " + classes.size() + " parallel classes"); for (String oneClass : classes) { @@ -156,7 +157,7 @@ // rsc = new // JenaResultSetConvenience(queryConcept("\""+oneClass+"\"",limit)); this.fromParallelClasses.addAll(sparqltasks.retrieveInstancesForClassDescription("\"" + oneClass - + "\"", resultLimit)); + + "\"", sparqlResultLimit)); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |