From: <ku...@us...> - 2010-02-11 09:16:11
|
Revision: 2010 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2010&view=rev Author: kurzum Date: 2010-02-11 09:16:04 +0000 (Thu, 11 Feb 2010) Log Message: ----------- Modified Paths: -------------- trunk/examples/datatypes/stringtyped.conf trunk/examples/datatypes/stringuntyped.conf trunk/examples/nlp2rdf/passive_vs_active.conf trunk/src/dl-learner/org/dllearner/core/EvaluatedDescription.java trunk/src/dl-learner/org/dllearner/kb/sparql/SPARQLTasks.java trunk/src/dl-learner/org/dllearner/kb/sparql/SparqlQueryDescriptionConvertVisitor.java trunk/src/dl-learner/org/dllearner/server/DLLearnerWS.java trunk/src/dl-learner/org/dllearner/utilities/examples/ExampleDataCollector.java Added Paths: ----------- trunk/src/dl-learner/org/dllearner/scripts/evaluation/cinema.res trunk/src/dl-learner/org/dllearner/utilities/examples/ExMakerFixedSize.java trunk/src/dl-learner/org/dllearner/utilities/examples/ExMakerRandomizer.java Removed Paths: ------------- trunk/src/dl-learner/org/dllearner/utilities/examples/Randomizer.java Property Changed: ---------------- trunk/ trunk/examples/ trunk/examples/nlp2rdf/ Property changes on: trunk ___________________________________________________________________ Modified: svn:ignore - .lastUsedExample .settings .project .classpath classes log cache cachePersistant reports results local rdbtoonto the_log.txt tmp fragmentOntology.owl output ling bin log + .lastUsedExample .settings .project .classpath classes log cache cachePersistant reports results local rdbtoonto the_log.txt tmp fragmentOntology.owl output ling osmdata matching stanley dllearner.jar father.inp lgd.nt Property changes on: trunk/examples ___________________________________________________________________ Added: svn:ignore + nlp nodeExtractionBug.conf Modified: trunk/examples/datatypes/stringtyped.conf =================================================================== --- trunk/examples/datatypes/stringtyped.conf 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/examples/datatypes/stringtyped.conf 2010-02-11 09:16:04 UTC (rev 2010) @@ -8,6 +8,7 @@ // refexamples.writeSearchTree = true; refexamples.searchTreeFile = "log/stringTypedTree.txt"; reasoner = fastInstanceChecker; +refexamples.useDataHasValueConstructor=true; import("string.owl"); Modified: trunk/examples/datatypes/stringuntyped.conf =================================================================== --- trunk/examples/datatypes/stringuntyped.conf 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/examples/datatypes/stringuntyped.conf 2010-02-11 09:16:04 UTC (rev 2010) @@ -7,6 +7,7 @@ algorithm = refexamples; // refexamples.writeSearchTree = true; refexamples.searchTreeFile = "log/stringUntypedTree.txt"; +refexamples.useDataHasValueConstructor=true; reasoner = fastInstanceChecker; import("string.owl"); Property changes on: trunk/examples/nlp2rdf ___________________________________________________________________ Added: svn:ignore + dllearner_last_run_examples.conf output.rdf Modified: trunk/examples/nlp2rdf/passive_vs_active.conf =================================================================== --- trunk/examples/nlp2rdf/passive_vs_active.conf 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/examples/nlp2rdf/passive_vs_active.conf 2010-02-11 09:16:04 UTC (rev 2010) @@ -61,7 +61,7 @@ -"http://nlp2rdf.org/ontology/sentence-structure/s32" -"http://nlp2rdf.org/ontology/sentence-structure/s33" -"http://nlp2rdf.org/ontology/sentence-structure/s34" --"http://nlp2rdf.org/ontology/sentence-structure/s35" ++"http://nlp2rdf.org/ontology/sentence-structure/s35" -"http://nlp2rdf.org/ontology/sentence-structure/s36" -"http://nlp2rdf.org/ontology/sentence-structure/s37" -"http://nlp2rdf.org/ontology/sentence-structure/s38" Modified: trunk/src/dl-learner/org/dllearner/core/EvaluatedDescription.java =================================================================== --- trunk/src/dl-learner/org/dllearner/core/EvaluatedDescription.java 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/src/dl-learner/org/dllearner/core/EvaluatedDescription.java 2010-02-11 09:16:04 UTC (rev 2010) @@ -111,7 +111,7 @@ * @return A SPARQL query of the underlying description. */ public String getSparqlQuery(int limit) { - return SparqlQueryDescriptionConvertVisitor.getSparqlQuery(description, limit, false); + return SparqlQueryDescriptionConvertVisitor.getSparqlQuery(description, limit, false, false); } /** Modified: trunk/src/dl-learner/org/dllearner/kb/sparql/SPARQLTasks.java =================================================================== --- trunk/src/dl-learner/org/dllearner/kb/sparql/SPARQLTasks.java 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/src/dl-learner/org/dllearner/kb/sparql/SPARQLTasks.java 2010-02-11 09:16:04 UTC (rev 2010) @@ -238,7 +238,7 @@ String sparqlQueryString = ""; try { sparqlQueryString = SparqlQueryDescriptionConvertVisitor - .getSparqlQuery(conceptKBSyntax, sparqlResultLimit); + .getSparqlQuery(conceptKBSyntax, sparqlResultLimit, false, false); } catch (Exception e) { logger.warn(e.getMessage()); } Modified: trunk/src/dl-learner/org/dllearner/kb/sparql/SparqlQueryDescriptionConvertVisitor.java =================================================================== --- trunk/src/dl-learner/org/dllearner/kb/sparql/SparqlQueryDescriptionConvertVisitor.java 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/src/dl-learner/org/dllearner/kb/sparql/SparqlQueryDescriptionConvertVisitor.java 2010-02-11 09:16:04 UTC (rev 2010) @@ -19,7 +19,10 @@ */ package org.dllearner.kb.sparql; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.SortedSet; import java.util.Stack; import java.util.TreeSet; @@ -61,49 +64,117 @@ */ public class SparqlQueryDescriptionConvertVisitor implements DescriptionVisitor { - // private SparqlEndpoint se = null; - // private boolean RDFSReasoning = false; - private static int defaultLimit = 5; - + private static Logger logger = Logger.getLogger(ComponentManager.class); private Stack<String> stack = new Stack<String>(); - private String query = ""; - private int currentObject = 0; - + + private int limit = 5; + private boolean labels = false; + private boolean distinct = false; + private Map<String,String> classToSubclassesVirtuoso = null; + private List<String> foundNamedClasses = new ArrayList<String>(); + public SparqlQueryDescriptionConvertVisitor() { stack.push("subject"); } + + public void reset(){ + currentObject = 0; + stack = new Stack<String>(); + stack.push("subject"); + query = ""; + } - /* - * public SparqlQueryDescriptionConvertVisitor(SparqlEndpoint se, boolean - * RDFSReasoning) { stack.push("subject"); this.se = se; this.RDFSReasoning - * = RDFSReasoning; } - */ + public String getSparqlQuery( String descriptionKBSyntax) throws ParseException { + Description description = KBParser.parseConcept(descriptionKBSyntax); + return getSparqlQuery( description); + } + + public String getSparqlQuery( Description description) { + description.accept(this); + expandSubclasses(); + String ret = "SELECT "+distinct()+"?subject "+((labels)?"?label":"")+" { "+labels()+ query + " \n } " + limit(); + this.reset(); + return ret; + } + + private void expandSubclasses(){ + if(classToSubclassesVirtuoso == null){ + return; + } + int counter = 0; + int index = 0; + String filter = ""; + String var = ""; + String uri = ""; + StringBuffer tmp ; + for(String nc: foundNamedClasses){ + index = query.indexOf("<"+nc+">"); + filter = classToSubclassesVirtuoso.get(nc); + if(index == -1){ + logger.warn("named class found before, but not in query?? "+nc); + }else if(filter != null){ + var = "?expanded"+counter; + uri = "<"+nc+">"; + tmp = new StringBuffer(); + tmp.append(query.substring(0, index)); + tmp.append(var); + tmp.append(query.substring(index+(uri.length()))); + tmp.append("\nFILTER ( " +var+ " in (" +filter+ ") ). "); + query = tmp.toString(); +// = query.substring(0, index)+var+query.substring(index+(uri.length())); + +// query += "\nFILTER (?expanded" +counter+ +// " in (" +filter+ +// ") ). "; + }else{ + logger.debug("no mapping found ("+nc+") "+this.getClass().getSimpleName()); + } + counter++; + } + } + + private String limit() { + return (limit > 0) ? " LIMIT " + limit + " " : ""; + } + private String labels() { + return (labels)?"\n?subject rdfs:label ?label . ":""; + } + private String distinct() { + return (distinct)?"DISTINCT ":""; + } - private String getSparqlQuery(int resultLimit, boolean labels) { - return "SELECT ?subject \nWHERE { "+((labels)?" ?subject rdfs:label ?label .":"")+" " + query + " }\n " + limit(resultLimit); + public void setLimit(int limit) { + this.limit = limit; } - public static String getSparqlQuery(String descriptionKBSyntax) throws ParseException { - return getSparqlQuery(descriptionKBSyntax, defaultLimit); + public void setLabels(boolean labels) { + this.labels = labels; } - public static String getSparqlQuery(String descriptionKBSyntax, int limit) throws ParseException { - Description d = KBParser.parseConcept(descriptionKBSyntax); - return getSparqlQuery(d, limit, false); + + public void setDistinct(boolean distinct) { + this.distinct = distinct; } + + public void setClassToSubclassesVirtuoso(Map<String,String> classToSubclassesVirtuoso) { + this.classToSubclassesVirtuoso = classToSubclassesVirtuoso; + } - public static String getSparqlQuery(Description description) { - return getSparqlQuery(description, defaultLimit, false); + public static String getSparqlQuery(String descriptionKBSyntax, int limit, boolean labels, boolean distinct) throws ParseException { + Description d = KBParser.parseConcept(descriptionKBSyntax); + return getSparqlQuery(d, limit, labels, distinct); } - - public static String getSparqlQuery(Description description, int resultLimit, boolean labels) { + + public static String getSparqlQuery(Description description, int limit, boolean labels, boolean distinct) { SparqlQueryDescriptionConvertVisitor visitor = new SparqlQueryDescriptionConvertVisitor(); - description.accept(visitor); - return visitor.getSparqlQuery(resultLimit, labels); + visitor.setDistinct(distinct); + visitor.setLabels(labels); + visitor.setLimit(limit); + return visitor.getSparqlQuery(description); } /** @@ -124,7 +195,7 @@ String rewritten = SparqlQueryDescriptionConvertRDFS .conceptRewrite(descriptionKBSyntax, st, maxDepth); - return getSparqlQuery(rewritten, resultLimit); + return getSparqlQuery(rewritten, resultLimit, false, false); } @@ -137,6 +208,8 @@ try { SortedSet<String> s = new TreeSet<String>(); HashMap<String, String> result = new HashMap<String, String>(); + HashMap<String, String> subclassMap = new HashMap<String, String>(); + subclassMap.put("http://nlp2rdf.org/ontology/Sentence","<http://nlp2rdf.org/ontology/Subsentence>"); String conj = "(\"http://dbpedia.org/class/yago/Person100007846\" AND \"http://dbpedia.org/class/yago/Head110162991\")"; s.add("EXISTS \"http://dbpedia.org/property/disambiguates\".TOP"); @@ -150,8 +223,19 @@ s.add("NOT \"http://dbpedia.org/class/yago/Person100007846\""); s.add("(\"http://dbpedia.org/class/yago/HeadOfState110164747\" AND (\"http://dbpedia.org/class/yago/Negotiator110351874\" AND \"http://dbpedia.org/class/yago/Representative110522035\"))"); + s.clear(); + s.add("(\"http://nlp2rdf.org/ontology/Sentence\" AND (EXISTS \"http://nlp2rdf.org/ontology/syntaxTreeHasPart\".\"http://nachhalt.sfb632.uni-potsdam.de/owl/stts.owl#Pronoun\" AND EXISTS \"http://nlp2rdf.org/ontology/syntaxTreeHasPart\".\"http://nlp2rdf.org/ontology/sentencefinalpunctuation_tag\"))"); + +// <http://nlp2rdf.org/ontology/sentencefinalpunctuation_tag> + String query = ""; + SparqlQueryDescriptionConvertVisitor visit = new SparqlQueryDescriptionConvertVisitor(); + visit.setLabels(true); + visit.setDistinct(true); + visit.setClassToSubclassesVirtuoso(subclassMap); + for (String kbsyntax : s) { - result.put(kbsyntax, SparqlQueryDescriptionConvertVisitor.getSparqlQuery(kbsyntax)); + query = visit.getSparqlQuery(kbsyntax); + result.put(kbsyntax, query); } System.out.println("************************"); for (String string : result.keySet()) { @@ -196,7 +280,7 @@ */ public void visit(ObjectSomeRestriction description) { logger.trace("ObjectSomeRestriction"); - query += "?" + stack.peek() + " <" + description.getRole() + "> ?object" + currentObject + "."; + query += "\n?" + stack.peek() + " <" + description.getRole() + "> ?object" + currentObject + ". "; stack.push("object" + currentObject); currentObject++; description.getChild(0).accept(this); @@ -238,7 +322,7 @@ public void visit(Intersection description) { logger.trace("Intersection"); description.getChild(0).accept(this); - query += "."; + query += ". "; description.getChild(1).accept(this); } @@ -302,7 +386,7 @@ public void visit(ObjectValueRestriction description) { ObjectProperty op = (ObjectProperty) description.getRestrictedPropertyExpression(); Individual ind = description.getIndividual(); - query += "?" + stack.peek() + " <" + op.getName() + "> <" + ind.getName() + ">"; + query += "\n?" + stack.peek() + " <" + op.getName() + "> <" + ind.getName() + "> "; } /* @@ -326,7 +410,8 @@ public void visit(NamedClass description) { logger.trace("NamedClass"); - query += "?" + stack.peek() + " a <" + description.getName() + ">"; + query += "\n?" + stack.peek() + " a <" + description.getName() + "> "; + foundNamedClasses.add(description.getName()); } /* @@ -384,8 +469,6 @@ logger.trace("DatatypeSomeRestriction"); } - private String limit(int resultLimit) { - return (resultLimit > 0) ? " LIMIT " + resultLimit + " " : ""; - } + } Added: trunk/src/dl-learner/org/dllearner/scripts/evaluation/cinema.res =================================================================== (Binary files differ) Property changes on: trunk/src/dl-learner/org/dllearner/scripts/evaluation/cinema.res ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Modified: trunk/src/dl-learner/org/dllearner/server/DLLearnerWS.java =================================================================== --- trunk/src/dl-learner/org/dllearner/server/DLLearnerWS.java 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/src/dl-learner/org/dllearner/server/DLLearnerWS.java 2010-02-11 09:16:04 UTC (rev 2010) @@ -966,7 +966,7 @@ @WebMethod public String SparqlRetrieval(String conceptString,int limit) throws ParseException { // call parser to parse concept - return SparqlQueryDescriptionConvertVisitor.getSparqlQuery(conceptString,limit); + return SparqlQueryDescriptionConvertVisitor.getSparqlQuery(conceptString,limit, false, false); } @WebMethod Added: trunk/src/dl-learner/org/dllearner/utilities/examples/ExMakerFixedSize.java =================================================================== --- trunk/src/dl-learner/org/dllearner/utilities/examples/ExMakerFixedSize.java (rev 0) +++ trunk/src/dl-learner/org/dllearner/utilities/examples/ExMakerFixedSize.java 2010-02-11 09:16:04 UTC (rev 2010) @@ -0,0 +1,96 @@ +/** + * Copyright (C) 2007-2008, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ +package org.dllearner.utilities.examples; + +import java.util.Random; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.log4j.Logger; + +/** + * used to randomize examples and split them into training and test sets + * @author Sebastian Hellmann <hel...@in...> + * + */ +public class ExMakerFixedSize { + private static Logger logger = Logger.getLogger(ExMakerFixedSize.class); + + private final Examples examples; + + public ExMakerFixedSize(Examples examples ){ + this.examples = examples; + } + + public static void main(String[] args) { + Examples ex = new Examples(); + + for (int i = 0; i < 20; i++) { + ex.addPosTrain("p"+i); + ex.addNegTrain("n"+i); + } + + ExMakerFixedSize r = new ExMakerFixedSize(ex); + ex = r.select(5, 5); + System.out.println(ex.toString()); + + } + + public Examples select(int nrOfPos, int nrOfNeg){ + + SortedSet<String> posTrain = new TreeSet<String>(); + SortedSet<String> negTrain = new TreeSet<String>(); + + SortedSet<String> posTest = new TreeSet<String>(); + SortedSet<String> negTest = new TreeSet<String>(); + + SortedSet<String> posOld = new TreeSet<String>(); + SortedSet<String> negOld = new TreeSet<String>(); + posOld.addAll(examples.getPositiveExamples()); + negOld.addAll(examples.getNegativeExamples()); + + while (!posOld.isEmpty() && posTrain.size()< nrOfPos) { + String one = pickOneRandomly(posOld.toArray(new String[] {})); + posOld.remove(one); + posTrain.add(one); + } + posTest.addAll(posOld); + + while (!negOld.isEmpty() && negTrain.size()< nrOfNeg) { + String one = pickOneRandomly(negOld.toArray(new String[] {})); + negOld.remove(one); + negTrain.add(one); + } + negTest.addAll(negOld); + + return new Examples(posTrain, negTrain, posTest, negTest); + } + + public static String pickOneRandomly(String[] from){ + Random r = new Random(); + int index = Math.round((float)(from.length*r.nextFloat())); + try{ + return from[index]; + }catch (Exception e) { + return pickOneRandomly(from); + } + } + +} Copied: trunk/src/dl-learner/org/dllearner/utilities/examples/ExMakerRandomizer.java (from rev 1996, trunk/src/dl-learner/org/dllearner/utilities/examples/Randomizer.java) =================================================================== --- trunk/src/dl-learner/org/dllearner/utilities/examples/ExMakerRandomizer.java (rev 0) +++ trunk/src/dl-learner/org/dllearner/utilities/examples/ExMakerRandomizer.java 2010-02-11 09:16:04 UTC (rev 2010) @@ -0,0 +1,108 @@ +/** + * Copyright (C) 2007-2008, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ +package org.dllearner.utilities.examples; + +import java.util.Random; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.log4j.Logger; + +/** + * used to randomize examples and split them into training and test sets + * @author Sebastian Hellmann <hel...@in...> + * + */ +public class ExMakerRandomizer { + private static Logger logger = Logger.getLogger(ExMakerRandomizer.class); + + private final Examples examples; + + public ExMakerRandomizer(Examples examples ){ + this.examples = examples; + } + + public static void main(String[] args) { + Examples ex = new Examples(); + + for (int i = 0; i < 20; i++) { + ex.addPosTrain("p"+i); + ex.addNegTrain("n"+i); + } + + ExMakerRandomizer r = new ExMakerRandomizer(ex); + ex = r.split(0.7d); + System.out.println(ex.toString()); + + } + + public Examples split(double percentageOfTrainingSet){ +// System.out.println(GlobalConfig.trainingDataPercentage+""); + SortedSet<String> posTrain = new TreeSet<String>(); + SortedSet<String> negTrain = new TreeSet<String>(); + + SortedSet<String> posTest = new TreeSet<String>(); + SortedSet<String> negTest = new TreeSet<String>(); + + SortedSet<String> posOld = new TreeSet<String>(); + SortedSet<String> negOld = new TreeSet<String>(); + posOld.addAll(examples.getPositiveExamples()); + negOld.addAll(examples.getNegativeExamples()); + + int posOldSize = posOld.size(); + int negOldSize = negOld.size(); + + while (!posOld.isEmpty() && (((double)posOld.size()/(double)posOldSize)) > percentageOfTrainingSet) { + String one = pickOneRandomly(posOld.toArray(new String[] {})); + posOld.remove(one); + posTest.add(one); + } + posTrain.addAll(posOld); + + while (!negOld.isEmpty() && (((double)negOld.size()/(double)negOldSize)) > percentageOfTrainingSet) { + String one = pickOneRandomly(negOld.toArray(new String[] {})); + negOld.remove(one); + negTest.add(one); + } + negTrain.addAll(negOld); + + + double posPercent = posTrain.size()/(double)posOldSize; + double negPercent = negTrain.size()/(double)negOldSize; + +// if there is more than a 10% error + if(Math.abs(posPercent - percentageOfTrainingSet)>0.1d || Math.abs(negPercent - percentageOfTrainingSet)>0.1d ){ + logger.info("repeating, unevenly matched"); + return split(percentageOfTrainingSet); + } + return new Examples(posTrain, negTrain, posTest, negTest); + } + + public static String pickOneRandomly(String[] from){ + Random r = new Random(); + int index = Math.round((float)(from.length*r.nextFloat())); + try{ + return from[index]; + }catch (Exception e) { + return pickOneRandomly(from); + } + } + +} Modified: trunk/src/dl-learner/org/dllearner/utilities/examples/ExampleDataCollector.java =================================================================== --- trunk/src/dl-learner/org/dllearner/utilities/examples/ExampleDataCollector.java 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/src/dl-learner/org/dllearner/utilities/examples/ExampleDataCollector.java 2010-02-11 09:16:04 UTC (rev 2010) @@ -119,7 +119,7 @@ } - public File collect(){ + private File collect(){ String from = null; File tmpFile = null; FileWriter fw = null; Deleted: trunk/src/dl-learner/org/dllearner/utilities/examples/Randomizer.java =================================================================== --- trunk/src/dl-learner/org/dllearner/utilities/examples/Randomizer.java 2010-02-11 08:10:50 UTC (rev 2009) +++ trunk/src/dl-learner/org/dllearner/utilities/examples/Randomizer.java 2010-02-11 09:16:04 UTC (rev 2010) @@ -1,108 +0,0 @@ -/** - * Copyright (C) 2007-2008, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - */ -package org.dllearner.utilities.examples; - -import java.util.Random; -import java.util.SortedSet; -import java.util.TreeSet; - -import org.apache.log4j.Logger; - -/** - * used to randomize examples and split them into training and test sets - * @author Sebastian Hellmann <hel...@in...> - * - */ -public class Randomizer { - private static Logger logger = Logger.getLogger(Randomizer.class); - - private final Examples examples; - - public Randomizer(Examples examples ){ - this.examples = examples; - } - - public static void main(String[] args) { - Examples ex = new Examples(); - - for (int i = 0; i < 20; i++) { - ex.addPosTrain("p"+i); - ex.addNegTrain("n"+i); - } - - Randomizer r = new Randomizer(ex); - ex = r.split(0.7d); - System.out.println(ex.toString()); - - } - - public Examples split(double percentageOfTrainingSet){ -// System.out.println(GlobalConfig.trainingDataPercentage+""); - SortedSet<String> posTrain = new TreeSet<String>(); - SortedSet<String> negTrain = new TreeSet<String>(); - - SortedSet<String> posTest = new TreeSet<String>(); - SortedSet<String> negTest = new TreeSet<String>(); - - SortedSet<String> posOld = new TreeSet<String>(); - SortedSet<String> negOld = new TreeSet<String>(); - posOld.addAll(examples.getPositiveExamples()); - negOld.addAll(examples.getNegativeExamples()); - - int posOldSize = posOld.size(); - int negOldSize = negOld.size(); - - while (!posOld.isEmpty() && (((double)posOld.size()/(double)posOldSize)) > percentageOfTrainingSet) { - String one = pickOneRandomly(posOld.toArray(new String[] {})); - posOld.remove(one); - posTest.add(one); - } - posTrain.addAll(posOld); - - while (!negOld.isEmpty() && (((double)negOld.size()/(double)negOldSize)) > percentageOfTrainingSet) { - String one = pickOneRandomly(negOld.toArray(new String[] {})); - negOld.remove(one); - negTest.add(one); - } - negTrain.addAll(negOld); - - - double posPercent = posTrain.size()/(double)posOldSize; - double negPercent = negTrain.size()/(double)negOldSize; - -// if there is more than a 10% error - if(Math.abs(posPercent - percentageOfTrainingSet)>0.1d || Math.abs(negPercent - percentageOfTrainingSet)>0.1d ){ - logger.info("repeating, unevenly matched"); - return split(percentageOfTrainingSet); - } - return new Examples(posTrain, negTrain, posTest, negTest); - } - - public static String pickOneRandomly(String[] from){ - Random r = new Random(); - int index = Math.round((float)(from.length*r.nextFloat())); - try{ - return from[index]; - }catch (Exception e) { - return pickOneRandomly(from); - } - } - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |