From: <lor...@us...> - 2014-01-08 19:37:26
Revision: 4210 http://sourceforge.net/p/dl-learner/code/4210 Author: lorenz_b Date: 2014-01-08 19:37:22 +0000 (Wed, 08 Jan 2014) Log Message: ----------- Added ISLE test cases. Extended QTL2 DBpedia experiment. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java trunk/components-core/src/main/java/org/dllearner/algorithms/properties/ObjectPropertyDomainAxiomLearner2.java trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/QueryTree.java trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/impl/QueryTreeImpl.java trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/impl/QueryTreeFactoryImpl.java trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/LGGGeneratorImpl.java trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/NoiseSensitiveLGG.java trunk/components-core/src/main/java/org/dllearner/kb/sparql/QueryEngineHTTP.java trunk/components-core/src/main/java/org/dllearner/reasoning/PelletReasoner.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/qtl/QALDExperiment.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityTokenizer.java trunk/components-core/src/main/java/org/dllearner/utilities/TriplePatternExtractor.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaPlainExperiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaSemanticIndexExperiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaSyntacticIndexBasedExperiment.java Removed Paths: ------------- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityTokenizer.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityTokenizer.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityTokenizer.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -0,0 +1,31 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.util.HashMap; +import java.util.List; + +import org.dllearner.algorithms.isle.index.Token; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + +/** + * @author Lorenz Buehmann + * + */ +public class EntityTokenizer extends HashMap<Entity, List<Token>>{ + + + + + /* (non-Javadoc) + * @see java.util.HashMap#get(java.lang.Object) + */ + @Override + public List<Token> get(Object key) { + return super.get(key); + } + + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -4,6 +4,7 @@ package org.dllearner.algorithms.isle.index.syntactic; import java.util.Collections; +import 
java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -41,6 +42,8 @@ long totalNumberOfDocuments = -1; + Map<Entity, Long> cache = new HashMap<>(); + public SolrSyntacticIndex(OWLOntology ontology, String solrServerURL, String searchField) { this.searchField = searchField; solr = new HttpSolrServer(solrServerURL); @@ -102,6 +105,9 @@ */ @Override public long getNumberOfDocumentsFor(Entity entity) { + if(cache.containsKey(entity)){ + return cache.get(entity); + } Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); String queryString = "("; @@ -123,6 +129,7 @@ try { QueryResponse response = solr.query(query); SolrDocumentList list = response.getResults(); + cache.put(entity, list.getNumFound()); return list.getNumFound(); } catch (SolrServerException e) { e.printStackTrace(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -3,21 +3,28 @@ */ package org.dllearner.algorithms.isle.wsd; -import com.google.common.base.Joiner; -import com.google.common.collect.Sets; +import java.io.IOException; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + import org.dllearner.algorithms.isle.StructuralEntityContext; import org.dllearner.algorithms.isle.VSMCosineDocumentSimilarity; import org.dllearner.algorithms.isle.index.Annotation; import org.dllearner.algorithms.isle.index.EntityScorePair; import org.dllearner.algorithms.isle.index.SemanticAnnotation; +import org.dllearner.algorithms.isle.index.Token; +import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; -import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import com.google.common.base.Joiner; +import com.google.common.collect.Sets; /** * @author Lorenz Buehmann @@ -26,6 +33,7 @@ public class StructureBasedWordSenseDisambiguation extends WordSenseDisambiguation{ private ContextExtractor contextExtractor; + private AnnotationEntityTextRetriever textRetriever; /** * @param ontology @@ -33,6 +41,8 @@ public StructureBasedWordSenseDisambiguation(ContextExtractor contextExtractor, OWLOntology ontology) { super(ontology); this.contextExtractor = contextExtractor; + + textRetriever = new RDFSLabelEntityTextRetriever(ontology); } /* (non-Javadoc) @@ -40,6 +50,41 @@ */ @Override public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) { + //filter out candidates for which the head noun does not match with the annotated token + for (Iterator<EntityScorePair> iterator = candidateEntities.iterator(); iterator.hasNext();) { + EntityScorePair entityPair = iterator.next(); + Entity entity = entityPair.getEntity(); + + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + boolean 
matched = false; + + for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + + + for (Token token : tokens) { + if(token.isHead()){ + for (Token annotatedToken : annotation.getTokens()) { + if(token.getRawForm().equals(annotatedToken.getRawForm())){ + matched = true; + } + } + } + } + + } + + if(!matched){ + iterator.remove(); + } + } + + System.out.println(annotation); + for (EntityScorePair entityScorePair : candidateEntities) { + System.out.println(entityScorePair); + } + if(!candidateEntities.isEmpty()){ //get the context of the annotated token List<String> tokenContext = contextExtractor.extractContext(annotation); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/properties/ObjectPropertyDomainAxiomLearner2.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/properties/ObjectPropertyDomainAxiomLearner2.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/properties/ObjectPropertyDomainAxiomLearner2.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -232,7 +232,7 @@ private void runSPARQL1_0_Mode() { workingModel = ModelFactory.createDefaultModel(); - int limit = 1000; + int limit = 10000; int offset = 0; String baseQuery = "CONSTRUCT {?s a ?type.} WHERE {?s <%s> ?o. ?s a ?type.} LIMIT %d OFFSET %d"; String query = String.format(baseQuery, propertyToDescribe.getName(), limit, offset); @@ -246,7 +246,7 @@ int all = 1; while (rs.hasNext()) { qs = rs.next(); - all = qs.getLiteral("all").getInt(); + all = qs.getLiteral("all").getInt();System.out.println(all); } // get class and number of instances @@ -318,21 +318,38 @@ ObjectPropertyDomainAxiomLearner2 l = new ObjectPropertyDomainAxiomLearner2(ks); l.setReasoner(reasoner); - for (ObjectProperty p : reasoner.getOWLObjectProperties("http://dbpedia.org/ontology/")) { - System.out.println(p); - l.setPropertyToDescribe(p); - l.setMaxExecutionTimeInSeconds(10); - l.addFilterNamespace("http://dbpedia.org/ontology/"); -// l.setReturnOnlyNewAxioms(true); - l.init(); -// l.start(); - l.run(); - List<EvaluatedAxiom> axioms = l.getCurrentlyBestEvaluatedAxioms(10, 0.5); -// System.out.println(axioms); - System.out.println(l.getBestEvaluatedAxiom()); - } + l.setPropertyToDescribe(new ObjectProperty("http://dbpedia.org/ontology/birthPlace")); + l.setMaxExecutionTimeInSeconds(20); + l.addFilterNamespace("http://dbpedia.org/ontology/"); + l.init(); + l.start(); +// l.run(); + System.out.println(l.getBestEvaluatedAxiom()); + ObjectPropertyDomainAxiomLearner l2 = new ObjectPropertyDomainAxiomLearner(ks); + l2.setReasoner(reasoner); + l2.setPropertyToDescribe(new ObjectProperty("http://dbpedia.org/ontology/birthPlace")); + l2.setMaxExecutionTimeInSeconds(10); + l2.addFilterNamespace("http://dbpedia.org/ontology/"); + l2.init(); + l2.start(); + System.out.println(l2.getCurrentlyBestEvaluatedAxioms(0.2)); + System.out.println(l2.getBestEvaluatedAxiom()); +// for (ObjectProperty p : reasoner.getOWLObjectProperties("http://dbpedia.org/ontology/")) { +// System.out.println(p); +// l.setPropertyToDescribe(p); +// l.setMaxExecutionTimeInSeconds(10); +// l.addFilterNamespace("http://dbpedia.org/ontology/"); +//// l.setReturnOnlyNewAxioms(true); +// l.init(); +//// l.start(); +// l.run(); +// List<EvaluatedAxiom> axioms = l.getCurrentlyBestEvaluatedAxioms(10, 0.5); +//// System.out.println(axioms); +// System.out.println(l.getBestEvaluatedAxiom()); +// } + // 
for(EvaluatedAxiom axiom : axioms){ // printSubset(l.getPositiveExamples(axiom), 10); // printSubset(l.getNegativeExamples(axiom), 10); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/QueryTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/QueryTree.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/QueryTree.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -147,6 +147,6 @@ RDFDatatype getDatatype(); - List<Literal> getLiterals(); + Set<Literal> getLiterals(); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/impl/QueryTreeImpl.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/impl/QueryTreeImpl.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/datastructures/impl/QueryTreeImpl.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -95,7 +95,7 @@ private boolean isResourceNode = false; private boolean isBlankNode = false; - private List<Literal> literals = new ArrayList<Literal>(); + private Set<Literal> literals = new HashSet<Literal>(); public QueryTreeImpl(N userObject) { @@ -107,6 +107,11 @@ public String render(QueryTree<N> object) { String label = object.toString() + "(" + object.getId() + ")"; if(object.isLiteralNode()){ +// if(object.getLiterals().size() == 1){ +// label += object.getLiterals().iterator().next(); +// } else if(object.getLiterals().size() > 1){ +// label += "Values: " + object.getLiterals(); +// } if(!object.getLiterals().isEmpty()){ label += "Values: " + object.getLiterals(); } @@ -809,7 +814,7 @@ } } - private String getFilter(String varName, List<Literal> literals){ + private String getFilter(String varName, Set<Literal> literals){ String filter = "FILTER("; Literal min = getMin(literals); @@ -824,7 +829,7 @@ return filter; } - private Literal getMin(List<Literal> literals){ + private Literal getMin(Set<Literal> literals){ Iterator<Literal> iter = literals.iterator(); Literal min = iter.next(); Literal l; @@ -841,7 +846,7 @@ return min; } - private Literal getMax(List<Literal> literals){ + private Literal getMax(Set<Literal> literals){ Iterator<Literal> iter = literals.iterator(); Literal max = iter.next(); Literal l; @@ -928,7 +933,7 @@ literals.add(l); } - public List<Literal> getLiterals() { + public Set<Literal> getLiterals() { return literals; } @@ -939,7 +944,7 @@ public RDFDatatype getDatatype(){ if(isLiteralNode){ if(!literals.isEmpty()){ - return literals.get(0).getDatatype(); + return literals.iterator().next().getDatatype(); } else { return null; } @@ -972,7 +977,7 @@ if(child.isLiteralNode()){ OWLDataProperty p = df.getOWLDataProperty(IRI.create((String) tree.getEdge(child))); if(childLabel.equals("?")){ - List<Literal> literals = child.getLiterals(); + Set<Literal> literals = child.getLiterals(); Literal lit = literals.iterator().next(); RDFDatatype datatype = lit.getDatatype(); String datatypeURI; @@ -983,7 +988,7 @@ } classExpressions.add(df.getOWLDataSomeValuesFrom(p, df.getOWLDatatype(IRI.create(datatypeURI)))); } else { - List<Literal> literals = child.getLiterals(); + Set<Literal> literals = child.getLiterals(); Literal lit = literals.iterator().next(); RDFDatatype datatype = lit.getDatatype(); OWLLiteral owlLiteral; Modified: 
trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/impl/QueryTreeFactoryImpl.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/impl/QueryTreeFactoryImpl.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/impl/QueryTreeFactoryImpl.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -221,7 +221,12 @@ } }; } - Iterator<Statement> it = model.listStatements(s, null, (RDFNode)null).filterKeep(nsFilter); + Iterator<Statement> it; + if(nsFilter != null){ + it = model.listStatements(s, null, (RDFNode)null).filterKeep(nsFilter); + } else { + it = model.listStatements(s, null, (RDFNode)null); + } Statement st; SortedSet<Statement> statements; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/LGGGeneratorImpl.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/LGGGeneratorImpl.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/LGGGeneratorImpl.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -131,11 +131,11 @@ logger.debug(tree2.getStringRepresentation()); } - QueryTree<N> lgg; //firstly, we check if both root nodes are resource nodes and have the same URI, i.e. the trees describe the same resource //if YES all child nodes should be also the same and we can just return one of the two tree as LGG - if(tree1.isResourceNode() && tree2.isResourceNode() && tree1.getUserObject().equals(tree2.getUserObject())){ + if((tree1.isResourceNode() && tree2.isResourceNode() || tree1.isLiteralNode() && tree2.isLiteralNode()) + && tree1.getUserObject().equals(tree2.getUserObject())){ if(logger.isDebugEnabled()){ logger.debug("Early termination. 
Tree 1(" + tree1 + ") and tree 2(" + tree2 + ") describe the same resource."); } @@ -171,7 +171,7 @@ lgg.setUserObject((N)"?"); lgg.setIsLiteralNode(false); lgg.setIsResourceNode(false); - } + } if(tree1.isLiteralNode() && tree2.isLiteralNode()){ RDFDatatype d1 = tree1.getDatatype(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/NoiseSensitiveLGG.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/NoiseSensitiveLGG.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/qtl/operations/lgg/NoiseSensitiveLGG.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -27,6 +27,10 @@ private SortedSet<EvaluatedQueryTree<N>> solutions; private double currentlyBestScore = 0d; + + private List<QueryTree<N>> posExamples; + + private List<QueryTree<N>> negExamples; public NoiseSensitiveLGG() { } @@ -36,7 +40,11 @@ } public List<EvaluatedQueryTree<N>> computeLGG(List<QueryTree<N>> posExamples, List<QueryTree<N>> negExamples){ + this.posExamples = posExamples; + this.negExamples = negExamples; + currentlyBestScore = 0d; + Monitor subMon = MonitorFactory.getTimeMonitor("subsumption-mon"); Monitor lggMon = MonitorFactory.getTimeMonitor("lgg-mon"); init(posExamples, negExamples); @@ -51,23 +59,14 @@ lggMon.start(); QueryTree<N> lgg = lggGenerator.getLGG(tree, example); lggMon.stop(); - //compute positive examples which are not covered by LGG - Collection<QueryTree<N>> uncoveredPositiveExamples = getUncoveredTrees(lgg, posExamples); - //compute negative examples which are covered by LGG - Collection<QueryTree<N>> coveredNegativeExamples = getCoveredTrees(lgg, negExamples); - //compute score - int coveredPositiveExamples = posExamples.size() - uncoveredPositiveExamples.size(); - double recall = coveredPositiveExamples / (double)posExamples.size(); - double precision = (coveredNegativeExamples.size() + coveredPositiveExamples == 0) - ? 0 - : coveredPositiveExamples / (double)(coveredPositiveExamples + coveredNegativeExamples.size()); - double score = Heuristics.getFScore(recall, precision); - if(score > currentlyBestScore){ + //evaluate the LGG + EvaluatedQueryTree<N> solution = evaluate(lgg); + + if(solution.getScore() > currentlyBestScore){ //add to todo list, if not already contained in todo list or solution list - EvaluatedQueryTree<N> solution = new EvaluatedQueryTree<N>(lgg, uncoveredPositiveExamples, coveredNegativeExamples, score); todo(solution); - currentlyBestScore = score; + currentlyBestScore = solution.getScore(); } } @@ -83,6 +82,25 @@ return new ArrayList<EvaluatedQueryTree<N>>(solutions); } + private EvaluatedQueryTree<N> evaluate(QueryTree<N> lgg){ + //compute positive examples which are not covered by LGG + Collection<QueryTree<N>> uncoveredPositiveExamples = getUncoveredTrees(lgg, posExamples); + //compute negative examples which are covered by LGG + Collection<QueryTree<N>> coveredNegativeExamples = getCoveredTrees(lgg, negExamples); + //compute score + int coveredPositiveExamples = posExamples.size() - uncoveredPositiveExamples.size(); + double recall = coveredPositiveExamples / (double)posExamples.size(); + double precision = (coveredNegativeExamples.size() + coveredPositiveExamples == 0) + ? 
0 + : coveredPositiveExamples / (double)(coveredPositiveExamples + coveredNegativeExamples.size()); + + double score = Heuristics.getFScore(recall, precision); + + EvaluatedQueryTree<N> solution = new EvaluatedQueryTree<N>(lgg, uncoveredPositiveExamples, coveredNegativeExamples, score); + + return solution; + } + /** * Return all trees from the given list {@code allTrees} which are not already subsumed by {@code tree}. * @param tree Modified: trunk/components-core/src/main/java/org/dllearner/kb/sparql/QueryEngineHTTP.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/kb/sparql/QueryEngineHTTP.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/kb/sparql/QueryEngineHTTP.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -268,7 +268,7 @@ private Model execModel(Model model) { HttpQuery httpQuery = makeHttpQuery() ; - httpQuery.setAccept(WebContent.contentTypeTurtleAlt1) ; + httpQuery.setAccept(WebContent.contentTypeRDFXML) ; InputStream in = httpQuery.exec() ; //Don't assume the endpoint actually gives back the content type we asked for @@ -284,7 +284,7 @@ //Try to select language appropriately here based on the model content type Lang lang = WebContent.contentTypeToLang(actualContentType); if (! RDFLanguages.isTriples(lang)) throw new QueryException("Endpoint returned Content Type: " + actualContentType + " which is not a valid RDF Graph syntax"); - model.read(in, null, Lang.TURTLE.getName()) ; + model.read(in, null, Lang.RDFXML.getName()) ; return model ; } Modified: trunk/components-core/src/main/java/org/dllearner/reasoning/PelletReasoner.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/reasoning/PelletReasoner.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/main/java/org/dllearner/reasoning/PelletReasoner.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -1448,7 +1448,7 @@ concepts.add(new Thing()); } else if(concept.isOWLNothing()) { concepts.add(new Nothing()); - } else { + } else if(!concept.isBuiltIn() && !concept.getIRI().isReservedVocabulary()){ concepts.add(new NamedClass(concept.toStringID())); } } Added: trunk/components-core/src/main/java/org/dllearner/utilities/TriplePatternExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/utilities/TriplePatternExtractor.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/utilities/TriplePatternExtractor.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -0,0 +1,249 @@ +package org.dllearner.utilities; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.collections15.ListUtils; + +import com.hp.hpl.jena.graph.Node; +import com.hp.hpl.jena.graph.Triple; +import com.hp.hpl.jena.query.Query; +import com.hp.hpl.jena.sparql.core.TriplePath; +import com.hp.hpl.jena.sparql.core.Var; +import com.hp.hpl.jena.sparql.syntax.Element; +import com.hp.hpl.jena.sparql.syntax.ElementFilter; +import com.hp.hpl.jena.sparql.syntax.ElementGroup; +import com.hp.hpl.jena.sparql.syntax.ElementOptional; +import com.hp.hpl.jena.sparql.syntax.ElementPathBlock; +import com.hp.hpl.jena.sparql.syntax.ElementTriplesBlock; +import com.hp.hpl.jena.sparql.syntax.ElementUnion; +import com.hp.hpl.jena.sparql.syntax.ElementVisitorBase; +import 
com.hp.hpl.jena.sparql.util.VarUtils; + +public class TriplePatternExtractor extends ElementVisitorBase { + + private Set<Triple> triplePattern; + + private Set<Triple> candidates; + + private boolean inOptionalClause = false; + + private int unionCount = 0; + private int optionalCount = 0; + private int filterCount = 0; + + /** + * Returns all triple patterns in given SPARQL query that have the given node in subject position, i.e. the outgoing + * triple patterns. + * @param query The SPARQL query. + * @param node + * @return + */ + public Set<Triple> extractOutgoingTriplePatterns(Query query, Node node){ + Set<Triple> triplePatterns = extractTriplePattern(query, false); + //remove triple patterns not containing triple patterns with given node in subject position + for (Iterator<Triple> iterator = triplePatterns.iterator(); iterator.hasNext();) { + Triple triple = iterator.next(); + if(!triple.subjectMatches(node)){ + iterator.remove(); + } + } + return triplePatterns; + } + + /** + * Returns all triple patterns in given SPARQL query that have the given node in object position, i.e. the ingoing + * triple patterns. + * @param query The SPARQL query. + * @param node + * @return + */ + public Set<Triple> extractIngoingTriplePatterns(Query query, Node node){ + Set<Triple> triplePatterns = extractTriplePattern(query, false); + //remove triple patterns not containing triple patterns with given node in object position + for (Iterator<Triple> iterator = triplePatterns.iterator(); iterator.hasNext();) { + Triple triple = iterator.next(); + if(!triple.objectMatches(node)){ + iterator.remove(); + } + } + return triplePatterns; + } + + /** + * Returns all triple patterns in given SPARQL query that have the given node either in subject or in object position, i.e. + * the ingoing and outgoing triple patterns. + * @param query The SPARQL query. + * @param node + * @return + */ + public Set<Triple> extractTriplePatterns(Query query, Node node){ + Set<Triple> triplePatterns = new HashSet<Triple>(); + triplePatterns.addAll(extractIngoingTriplePatterns(query, node)); + triplePatterns.addAll(extractOutgoingTriplePatterns(query, node)); + return triplePatterns; + } + + /** + * Returns triple patterns for each projection variable v such that v is either in subject or object position. + * @param query The SPARQL query. + * @param node + * @return + */ + public Map<Var,Set<Triple>> extractTriplePatternsForProjectionVars(Query query){ + Map<Var,Set<Triple>> var2TriplePatterns = new HashMap<Var,Set<Triple>>(); + for (Var var : query.getProjectVars()) { + Set<Triple> triplePatterns = new HashSet<Triple>(); + triplePatterns.addAll(extractIngoingTriplePatterns(query, var)); + triplePatterns.addAll(extractOutgoingTriplePatterns(query, var)); + var2TriplePatterns.put(var, triplePatterns); + } + return var2TriplePatterns; + } + + /** + * Returns triple patterns for each projection variable v such that v is in subject position. + * @param query The SPARQL query. + * @param node + * @return + */ + public Map<Var,Set<Triple>> extractOutgoingTriplePatternsForProjectionVars(Query query){ + Map<Var,Set<Triple>> var2TriplePatterns = new HashMap<Var,Set<Triple>>(); + for (Var var : query.getProjectVars()) { + Set<Triple> triplePatterns = new HashSet<Triple>(); + triplePatterns.addAll(extractOutgoingTriplePatterns(query, var)); + var2TriplePatterns.put(var, triplePatterns); + } + return var2TriplePatterns; + } + + /** + * Returns triple patterns for each projection variable v such that v is in object position. 
+ * @param query The SPARQL query. + * @param node + * @return + */ + public Map<Var,Set<Triple>> extractIngoingTriplePatternsForProjectionVars(Query query){ + Map<Var,Set<Triple>> var2TriplePatterns = new HashMap<Var,Set<Triple>>(); + for (Var var : query.getProjectVars()) { + Set<Triple> triplePatterns = new HashSet<Triple>(); + triplePatterns.addAll(extractIngoingTriplePatterns(query, var)); + var2TriplePatterns.put(var, triplePatterns); + } + return var2TriplePatterns; + } + + public Set<Triple> extractTriplePattern(Query query){ + return extractTriplePattern(query, false); + } + + public Set<Triple> extractTriplePattern(Query query, boolean ignoreOptionals){ + triplePattern = new HashSet<Triple>(); + candidates = new HashSet<Triple>(); + + query.getQueryPattern().visit(this); + + //postprocessing: triplepattern in OPTIONAL clause + if(!ignoreOptionals){ + if(query.isSelectType()){ + for(Triple t : candidates){ + if(!ListUtils.intersection(new ArrayList<Var>(VarUtils.getVars(t)), query.getProjectVars()).isEmpty()){ + triplePattern.add(t); + } + } + } + } + + return triplePattern; + } + + public Set<Triple> extractTriplePattern(ElementGroup group){ + return extractTriplePattern(group, false); + } + + public Set<Triple> extractTriplePattern(ElementGroup group, boolean ignoreOptionals){ + triplePattern = new HashSet<Triple>(); + candidates = new HashSet<Triple>(); + + group.visit(this); + + //postprocessing: triplepattern in OPTIONAL clause + if(!ignoreOptionals){ + for(Triple t : candidates){ + triplePattern.add(t); + } + } + + return triplePattern; + } + + @Override + public void visit(ElementGroup el) { + for (Iterator<Element> iterator = el.getElements().iterator(); iterator.hasNext();) { + Element e = iterator.next(); + e.visit(this); + } + } + + @Override + public void visit(ElementOptional el) { + optionalCount++; + inOptionalClause = true; + el.getOptionalElement().visit(this); + inOptionalClause = false; + } + + @Override + public void visit(ElementTriplesBlock el) { + for (Iterator<Triple> iterator = el.patternElts(); iterator.hasNext();) { + Triple t = iterator.next(); + if(inOptionalClause){ + candidates.add(t); + } else { + triplePattern.add(t); + } + } + } + + @Override + public void visit(ElementPathBlock el) { + for (Iterator<TriplePath> iterator = el.patternElts(); iterator.hasNext();) { + TriplePath tp = iterator.next(); + if(inOptionalClause){ + candidates.add(tp.asTriple()); + } else { + triplePattern.add(tp.asTriple()); + } + } + } + + @Override + public void visit(ElementUnion el) { + unionCount++; + for (Iterator<Element> iterator = el.getElements().iterator(); iterator.hasNext();) { + Element e = iterator.next(); + e.visit(this); + } + } + + @Override + public void visit(ElementFilter el) { + filterCount++; + } + + public int getUnionCount() { + return unionCount; + } + + public int getOptionalCount() { + return optionalCount; + } + + public int getFilterCount() { + return filterCount; + } +} Deleted: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2014-01-02 13:07:18 UTC (rev 4209) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -1,209 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle; - -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import 
java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - -import org.apache.commons.compress.compressors.CompressorException; -import org.apache.commons.compress.compressors.CompressorInputStream; -import org.apache.commons.compress.compressors.CompressorStreamFactory; -import org.dllearner.algorithms.isle.index.Index; -import org.dllearner.algorithms.isle.index.syntactic.SolrSyntacticIndex; -import org.dllearner.core.owl.NamedClass; -import org.dllearner.kb.sparql.SparqlEndpoint; -import org.dllearner.utilities.owl.OWLEntityTypeAdder; -import org.semanticweb.owlapi.apibinding.OWLManager; -import org.semanticweb.owlapi.model.AxiomType; -import org.semanticweb.owlapi.model.OWLOntology; -import org.semanticweb.owlapi.model.OWLOntologyCreationException; -import org.semanticweb.owlapi.model.OWLOntologyManager; - -import com.google.common.collect.Sets; -import com.hp.hpl.jena.rdf.model.Literal; -import com.hp.hpl.jena.rdf.model.Model; -import com.hp.hpl.jena.rdf.model.Property; -import com.hp.hpl.jena.rdf.model.RDFNode; -import com.hp.hpl.jena.rdf.model.Statement; -import com.hp.hpl.jena.rdf.model.StmtIterator; -import com.hp.hpl.jena.vocabulary.OWL; -import com.hp.hpl.jena.vocabulary.RDF; -import com.hp.hpl.jena.vocabulary.RDFS; -import com.hp.hpl.jena.vocabulary.XSD; - -/** - * @author Lorenz Buehmann - * - */ -public class DBpediaExperiment extends Experiment{ - - final SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); - final int maxNrOfInstancesPerClass = 10; - static final String solrServerURL = "http://solr.aksw.org/en_dbpedia_resources/"; - static final String searchField = "comment"; - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.Experiment#getIndex() - */ - @Override - protected Index getIndex() { - return new SolrSyntacticIndex(ontology, solrServerURL, searchField); - } - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.Experiment#getOntology() - */ - @Override - protected OWLOntology getOntology() { - //load the DBpedia schema - OWLOntology schema = null; - try { - URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); - InputStream is = new BufferedInputStream(url.openStream()); - CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is); - schema = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); - } catch (MalformedURLException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } catch (CompressorException e) { - e.printStackTrace(); - } catch (OWLOntologyCreationException e) { - e.printStackTrace(); - } - //load some sample data for the machine learning part - Model sample = KnowledgebaseSampleGenerator.createKnowledgebaseSample( - endpoint, - "http://dbpedia.org/ontology/", - Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person")), - maxNrOfInstancesPerClass); - cleanUpModel(sample); - filter(sample, "http://dbpedia.org/ontology/"); - OWLEntityTypeAdder.addEntityTypes(sample); -// StmtIterator iterator = sample.listStatements(); -// while(iterator.hasNext()){ -// System.out.println(iterator.next()); -// } - - try { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - sample.write(baos, "TURTLE", null); - OWLOntologyManager man = OWLManager.createOWLOntologyManager(); - OWLOntology 
ontology = man.loadOntologyFromOntologyDocument(new ByteArrayInputStream(baos.toByteArray())); - man.addAxioms(ontology, schema.getAxioms()); - man.removeAxioms(ontology, ontology.getAxioms(AxiomType.FUNCTIONAL_DATA_PROPERTY)); - man.removeAxioms(ontology, ontology.getAxioms(AxiomType.FUNCTIONAL_OBJECT_PROPERTY)); - man.removeAxioms(ontology, ontology.getAxioms(AxiomType.DATA_PROPERTY_RANGE)); - return ontology; - } catch (Exception e) { - e.printStackTrace(); - } - - return null; - } - - /** - * Filter triples which are not relevant based on the given knowledge base - * namespace. - * - * @param model - * @param namespace - */ - private void filter(Model model, String namespace) { - List<Statement> statementsToRemove = new ArrayList<Statement>(); - for (Iterator<Statement> iter = model.listStatements().toList().iterator(); iter.hasNext();) { - Statement st = iter.next(); - Property predicate = st.getPredicate(); - if (predicate.equals(RDF.type)) { - if (!st.getObject().asResource().getURI().startsWith(namespace)) { - statementsToRemove.add(st); - } else if (st.getObject().equals(OWL.FunctionalProperty.asNode())) { - statementsToRemove.add(st); - } else if (st.getObject().isLiteral() && st.getObject().asLiteral().getDatatypeURI().equals(XSD.gYear.getURI())) { - statementsToRemove.add(st); - } - } else if (!predicate.equals(RDFS.subClassOf) && !predicate.equals(OWL.sameAs) && !predicate.asResource().getURI().startsWith(namespace)) { - statementsToRemove.add(st); - } - } - model.remove(statementsToRemove); - } - - private static void cleanUpModel(Model model) { - // filter out triples with String literals, as therein often occur - // some syntax errors and they are not relevant for learning - List<Statement> statementsToRemove = new ArrayList<Statement>(); - for (Iterator<Statement> iter = model.listStatements().toList().iterator(); iter.hasNext();) { - Statement st = iter.next(); - RDFNode object = st.getObject(); - if (object.isLiteral()) { - // statementsToRemove.add(st); - Literal lit = object.asLiteral(); - if (lit.getDatatype() == null || lit.getDatatype().equals(XSD.xstring)) { - st.changeObject("shortened", "en"); - } else if (lit.getDatatype().getURI().equals(XSD.gYear.getURI())) { - statementsToRemove.add(st); - // System.err.println("REMOVE " + st); - } else if (lit.getDatatype().getURI().equals(XSD.gYearMonth.getURI())) { - statementsToRemove.add(st); -// System.err.println("REMOVE " + st); - } - } - //remove statements like <x a owl:Class> - if (st.getPredicate().equals(RDF.type)) { - if (object.equals(RDFS.Class.asNode()) || object.equals(OWL.Class.asNode()) || object.equals(RDFS.Literal.asNode()) - || object.equals(RDFS.Resource)) { - statementsToRemove.add(st); - } - } - - //remove unwanted properties - String dbo = "http://dbpedia.org/ontology/"; - Set<String> blackList = Sets.newHashSet(dbo + "wikiPageDisambiguates",dbo + "wikiPageExternalLink", - dbo + "wikiPageID", dbo + "wikiPageInterLanguageLink", dbo + "wikiPageRedirects", dbo + "wikiPageRevisionID", - dbo + "wikiPageWikiLink"); - for(String bl: blackList){ - if (st.getPredicate().getURI().equals(bl)) { - statementsToRemove.add(st); - } - } - } - - model.remove(statementsToRemove); - } - - - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.Experiment#getDocuments() - */ - @Override - protected Set<String> getDocuments() { - Set<String> documents = new HashSet<String>(); - - documents.addAll(DBpediaCorpusGenerator.getDBpediaCorpusSample( - "http://dbpedia.org/ontology/abstract", - Sets.newHashSet(new 
NamedClass("http://dbpedia.org/ontology/Person")), - maxNrOfInstancesPerClass)); - - documents.clear(); - documents.add("Thomas Cruise Mapother IV, widely known as Tom Cruise, is an American film player and producer. He has been nominated for three Academy Awards and has won three Golden Globe Awards. He started his career at age 19 in the 1981 film Taps. His first leading role was in Risky Business, released in August 1983. Cruise became a full-fledged movie star after starring in Top Gun (1986). He is well known for his role as secret agent Ethan Hunt in the Mission: Impossible film series between 1996 and 2011. Cruise has starred in many Hollywood blockbusters, including Rain Man (1988), A Few Good Men (1992), Jerry Maguire (1996), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Tropic Thunder (2008) and Jack Reacher (2012). As of 2012, Cruise is Hollywood's highest-paid actor. Cruise is known for his Scientologist faith and for his support of the Church of Scientology."); - - return documents; - } - - public static void main(String[] args) throws Exception { - new DBpediaExperiment().run(new NamedClass("http://dbpedia.org/ontology/Person")); - } -} Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaPlainExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaPlainExperiment.java (rev 0) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaPlainExperiment.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -0,0 +1,200 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.dllearner.algorithms.isle.index.Index; +import org.dllearner.algorithms.isle.index.syntactic.SolrSyntacticIndex; +import org.dllearner.core.owl.NamedClass; +import org.dllearner.kb.sparql.SparqlEndpoint; +import org.dllearner.utilities.owl.OWLEntityTypeAdder; +import org.semanticweb.owlapi.apibinding.OWLManager; +import org.semanticweb.owlapi.model.AxiomType; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.OWLOntologyCreationException; +import org.semanticweb.owlapi.model.OWLOntologyManager; + +import com.google.common.collect.Sets; +import com.hp.hpl.jena.rdf.model.Literal; +import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.Property; +import com.hp.hpl.jena.rdf.model.RDFNode; +import com.hp.hpl.jena.rdf.model.Statement; +import com.hp.hpl.jena.rdf.model.StmtIterator; +import com.hp.hpl.jena.vocabulary.OWL; +import com.hp.hpl.jena.vocabulary.RDF; +import com.hp.hpl.jena.vocabulary.RDFS; +import com.hp.hpl.jena.vocabulary.XSD; + +/** + * @author Lorenz Buehmann + * + */ +public class DBpediaPlainExperiment extends Experiment{ + + final SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); + final int maxNrOfInstancesPerClass = 10; + static final String 
solrServerURL = "http://solr.aksw.org/en_dbpedia_resources/"; + static final String searchField = "comment"; + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getIndex() + */ + @Override + protected Index getIndex() { + return null; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getOntology() + */ + @Override + protected OWLOntology getOntology() { + //load the DBpedia schema + OWLOntology schema = null; + try { + URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); + InputStream is = new BufferedInputStream(url.openStream()); + CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is); + schema = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (OWLOntologyCreationException e) { + e.printStackTrace(); + } + //load some sample data for the machine learning part + Model sample = KnowledgebaseSampleGenerator.createKnowledgebaseSample( + endpoint, + "http://dbpedia.org/ontology/", + Sets.newHashSet(classToDescribe), + maxNrOfInstancesPerClass); + cleanUpModel(sample); + filter(sample, "http://dbpedia.org/ontology/"); + OWLEntityTypeAdder.addEntityTypes(sample); +// StmtIterator iterator = sample.listStatements(); +// while(iterator.hasNext()){ +// System.out.println(iterator.next()); +// } + + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + sample.write(baos, "TURTLE", null); + OWLOntologyManager man = OWLManager.createOWLOntologyManager(); + OWLOntology ontology = man.loadOntologyFromOntologyDocument(new ByteArrayInputStream(baos.toByteArray())); + man.addAxioms(ontology, schema.getAxioms()); + man.removeAxioms(ontology, ontology.getAxioms(AxiomType.FUNCTIONAL_DATA_PROPERTY)); + man.removeAxioms(ontology, ontology.getAxioms(AxiomType.FUNCTIONAL_OBJECT_PROPERTY)); + man.removeAxioms(ontology, ontology.getAxioms(AxiomType.DATA_PROPERTY_RANGE)); + return ontology; + } catch (Exception e) { + e.printStackTrace(); + } + + return null; + } + + /** + * Filter triples which are not relevant based on the given knowledge base + * namespace. 
+ * + * @param model + * @param namespace + */ + private void filter(Model model, String namespace) { + List<Statement> statementsToRemove = new ArrayList<Statement>(); + for (Iterator<Statement> iter = model.listStatements().toList().iterator(); iter.hasNext();) { + Statement st = iter.next(); + Property predicate = st.getPredicate(); + if (predicate.equals(RDF.type)) { + if (!st.getObject().asResource().getURI().startsWith(namespace)) { + statementsToRemove.add(st); + } else if (st.getObject().equals(OWL.FunctionalProperty.asNode())) { + statementsToRemove.add(st); + } else if (st.getObject().isLiteral() && st.getObject().asLiteral().getDatatypeURI().equals(XSD.gYear.getURI())) { + statementsToRemove.add(st); + } + } else if (!predicate.equals(RDFS.subClassOf) && !predicate.equals(OWL.sameAs) && !predicate.asResource().getURI().startsWith(namespace)) { + statementsToRemove.add(st); + } + } + model.remove(statementsToRemove); + } + + private static void cleanUpModel(Model model) { + // filter out triples with String literals, as therein often occur + // some syntax errors and they are not relevant for learning + List<Statement> statementsToRemove = new ArrayList<Statement>(); + for (Iterator<Statement> iter = model.listStatements().toList().iterator(); iter.hasNext();) { + Statement st = iter.next(); + RDFNode object = st.getObject(); + if (object.isLiteral()) { + // statementsToRemove.add(st); + Literal lit = object.asLiteral(); + if (lit.getDatatype() == null || lit.getDatatype().equals(XSD.xstring)) { + st.changeObject("shortened", "en"); + } else if (lit.getDatatype().getURI().equals(XSD.gYear.getURI())) { + statementsToRemove.add(st); + // System.err.println("REMOVE " + st); + } else if (lit.getDatatype().getURI().equals(XSD.gYearMonth.getURI())) { + statementsToRemove.add(st); +// System.err.println("REMOVE " + st); + } + } + //remove statements like <x a owl:Class> + if (st.getPredicate().equals(RDF.type)) { + if (object.equals(RDFS.Class.asNode()) || object.equals(OWL.Class.asNode()) || object.equals(RDFS.Literal.asNode()) + || object.equals(RDFS.Resource)) { + statementsToRemove.add(st); + } + } + + //remove unwanted properties + String dbo = "http://dbpedia.org/ontology/"; + Set<String> blackList = Sets.newHashSet(dbo + "wikiPageDisambiguates",dbo + "wikiPageExternalLink", + dbo + "wikiPageID", dbo + "wikiPageInterLanguageLink", dbo + "wikiPageRedirects", dbo + "wikiPageRevisionID", + dbo + "wikiPageWikiLink"); + for(String bl: blackList){ + if (st.getPredicate().getURI().equals(bl)) { + statementsToRemove.add(st); + } + } + } + + model.remove(statementsToRemove); + } + + + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getDocuments() + */ + @Override + protected Set<String> getDocuments() { + Set<String> documents = new HashSet<String>(); + return documents; + } + + public static void main(String[] args) throws Exception { + new DBpediaPlainExperiment().run(new NamedClass("http://dbpedia.org/ontology/SpaceShuttle")); + } +} Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaSemanticIndexExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaSemanticIndexExperiment.java (rev 0) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaSemanticIndexExperiment.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -0,0 +1,202 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.BufferedInputStream; 
+import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.dllearner.algorithms.isle.index.Index; +import org.dllearner.algorithms.isle.index.syntactic.SolrSyntacticIndex; +import org.dllearner.core.owl.NamedClass; +import org.dllearner.kb.sparql.SparqlEndpoint; +import org.dllearner.utilities.owl.OWLEntityTypeAdder; +import org.semanticweb.owlapi.apibinding.OWLManager; +import org.semanticweb.owlapi.model.AxiomType; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.OWLOntologyCreationException; +import org.semanticweb.owlapi.model.OWLOntologyManager; + +import com.google.common.collect.Sets; +import com.hp.hpl.jena.rdf.model.Literal; +import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.Property; +import com.hp.hpl.jena.rdf.model.RDFNode; +import com.hp.hpl.jena.rdf.model.Statement; +import com.hp.hpl.jena.rdf.model.StmtIterator; +import com.hp.hpl.jena.vocabulary.OWL; +import com.hp.hpl.jena.vocabulary.RDF; +import com.hp.hpl.jena.vocabulary.RDFS; +import com.hp.hpl.jena.vocabulary.XSD; + +/** + * @author Lorenz Buehmann + * + */ +public class DBpediaSemanticIndexExperiment extends Experiment{ + + final SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); + final int maxNrOfInstancesPerClass = 10; + static final String solrServerURL = "http://solr.aksw.org/en_dbpedia_resources/"; + static final String searchField = "comment"; + + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getOntology() + */ + @Override + protected OWLOntology getOntology() { + //load the DBpedia schema + OWLOntology schema = null; + try { + URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); + InputStream is = new BufferedInputStream(url.openStream()); + CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is); + schema = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (OWLOntologyCreationException e) { + e.printStackTrace(); + } + //load some sample data for the machine learning part + Model sample = KnowledgebaseSampleGenerator.createKnowledgebaseSample( + endpoint, + "http://dbpedia.org/ontology/", + Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person")), + maxNrOfInstancesPerClass); + cleanUpModel(sample); + filter(sample, "http://dbpedia.org/ontology/"); + OWLEntityTypeAdder.addEntityTypes(sample); +// StmtIterator iterator = sample.listStatements(); +// while(iterator.hasNext()){ +// System.out.println(iterator.next()); +// } + + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + sample.write(baos, "TURTLE", null); + OWLOntologyManager man = OWLManager.createOWLOntologyManager(); + OWLOntology ontology = man.loadOntologyFromOntologyDocument(new ByteArrayInputStream(baos.toByteArray())); + man.addAxioms(ontology, schema.getAxioms()); + 
man.removeAxioms(ontology, ontology.getAxioms(AxiomType.FUNCTIONAL_DATA_PROPERTY)); + man.removeAxioms(ontology, ontology.getAxioms(AxiomType.FUNCTIONAL_OBJECT_PROPERTY)); + man.removeAxioms(ontology, ontology.getAxioms(AxiomType.DATA_PROPERTY_RANGE)); + return ontology; + } catch (Exception e) { + e.printStackTrace(); + } + + return null; + } + + /** + * Filter triples which are not relevant based on the given knowledge base + * namespace. + * + * @param model + * @param namespace + */ + private void filter(Model model, String namespace) { + List<Statement> statementsToRemove = new ArrayList<Statement>(); + for (Iterator<Statement> iter = model.listStatements().toList().iterator(); iter.hasNext();) { + Statement st = iter.next(); + Property predicate = st.getPredicate(); + if (predicate.equals(RDF.type)) { + if (!st.getObject().asResource().getURI().startsWith(namespace)) { + statementsToRemove.add(st); + } else if (st.getObject().equals(OWL.FunctionalProperty.asNode())) { + statementsToRemove.add(st); + } else if (st.getObject().isLiteral() && st.getObject().asLiteral().getDatatypeURI().equals(XSD.gYear.getURI())) { + statementsToRemove.add(st); + } + } else if (!predicate.equals(RDFS.subClassOf) && !predicate.equals(OWL.sameAs) && !predicate.asResource().getURI().startsWith(namespace)) { + statementsToRemove.add(st); + } + } + model.remove(statementsToRemove); + } + + private static void cleanUpModel(Model model) { + // filter out triples with String literals, as therein often occur + // some syntax errors and they are not relevant for learning + List<Statement> statementsToRemove = new ArrayList<Statement>(); + for (Iterator<Statement> iter = model.listStatements().toList().iterator(); iter.hasNext();) { + Statement st = iter.next(); + RDFNode object = st.getObject(); + if (object.isLiteral()) { + // statementsToRemove.add(st); + Literal lit = object.asLiteral(); + if (lit.getDatatype() == null || lit.getDatatype().equals(XSD.xstring)) { + st.changeObject("shortened", "en"); + } else if (lit.getDatatype().getURI().equals(XSD.gYear.getURI())) { + statementsToRemove.add(st); + // System.err.println("REMOVE " + st); + } else if (lit.getDatatype().getURI().equals(XSD.gYearMonth.getURI())) { + statementsToRemove.add(st); +// System.err.println("REMOVE " + st); + } + } + //remove statements like <x a owl:Class> + if (st.getPredicate().equals(RDF.type)) { + if (object.equals(RDFS.Class.asNode()) || object.equals(OWL.Class.asNode()) || object.equals(RDFS.Literal.asNode()) + || object.equals(RDFS.Resource)) { + statementsToRemove.add(st); + } + } + + //remove unwanted properties + String dbo = "http://dbpedia.org/ontology/"; + Set<String> blackList = Sets.newHashSet(dbo + "wikiPageDisambiguates",dbo + "wikiPageExternalLink", + dbo + "wikiPageID", dbo + "wikiPageInterLanguageLink", dbo + "wikiPageRedirects", dbo + "wikiPageRevisionID", + dbo + "wikiPageWikiLink"); + for(String bl: blackList){ + if (st.getPredicate().getURI().equals(bl)) { + statementsToRemove.add(st); + } + } + } + + model.remove(statementsToRemove); + } + + + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getDocuments() + */ + @Override + protected Set<String> getDocuments() { + Set<String> documents = new HashSet<String>(); + + documents.addAll(DBpediaCorpusGenerator.getDBpediaCorpusSample( + "http://dbpedia.org/ontology/abstract", + Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person")), + maxNrOfInstancesPerClass)); + + documents.clear(); + documents.add("Thomas Cruise Mapother IV, widely 
known as Tom Cruise, is an American film player and producer. He has been nominated for three Academy Awards and has won three Golden Globe Awards. He started his career at age 19 in the 1981 film Taps. His first leading role was in Risky Business, released in August 1983. Cruise became a full-fledged movie star after starring in Top Gun (1986). He is well known for his role as secret agent Ethan Hunt in the Mission: Impossible film series between 1996 and 2011. Cruise has starred in many Hollywood blockbusters, including Rain Man (1988), A Few Good Men (1992), Jerry Maguire (1996), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Tropic Thunder (2008) and Jack Reacher (2012). As of 2012, Cruise is Hollywood's highest-paid actor. Cruise is known for his Scientologist faith and for his support of the Church of Scientology."); + + return documents; + } + + public static void main(String[] args) throws Exception { + new DBpediaSemanticIndexExperiment().run(new NamedClass("http://dbpedia.org/ontology/Person")); + } +} Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaSyntacticIndexBasedExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaSyntacticIndexBasedExperiment.java (rev 0) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaSyntacticIndexBasedExperiment.java 2014-01-08 19:37:22 UTC (rev 4210) @@ -0,0 +1,209 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.dllearner.algorithms.isle.index.Index; +import org.dllearner.algorithms.isle.index.syntactic.SolrSyntacticIndex; +import org.dllearner.core.owl.NamedClass; +import org.dllearner.kb.sparql.SparqlEndpoint; +import org.dllearner.utilities.owl.OWLEntityTypeAdder; +import org.semanticweb.owlapi.apibinding.OWLManager; +import org.semanticweb.owlapi.model.AxiomType; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.OWLOntologyCreationException; +import org.semanticweb.owlapi.model.OWLOntologyManager; + +import com.google.common.collect.Sets; +import com.hp.hpl.jena.rdf.model.Literal; +import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.Property; +import com.hp.hpl.jena.rdf.model.RDFNode; +import com.hp.hpl.jena.rdf.model.Statement; +import com.hp.hpl.jena.rdf.model.StmtIterator; +import com.hp.hpl.jena.vocabulary.OWL; +import com.hp.hpl.jena.vocabulary.RDF; +import com.hp.hpl.jena.vocabulary.RDFS; +import com.hp.hpl.jena.vocabulary.XSD; + +/** + * @author Lorenz Buehmann + * + */ +public class DBpediaSyntacticIndexBasedExperiment extends Experiment{ + + final SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); + final int maxNrOfInstancesPerClass = 10; + static final String solrServerURL = "http://solr.aksw.org/en_dbpedia_resources/"; + static final String 
searchField = "comment"; + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getIndex() + */ + @Override + protected Index getIndex() { + return new SolrSyntacticIndex(ontology, solrServerURL, searchField); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getOntology() + */ + @Override + protected OWLOntology getOntology() { + //load the DBpedia schema + OWLOntology schema = null; + try { + URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); + InputStream is = new BufferedInputStream(url.openStream()); + CompressorInputStream in = new Comp... [truncated message content] |
From: <lor...@us...> - 2014-01-02 13:07:21
|
Revision: 4209 http://sourceforge.net/p/dl-learner/code/4209 Author: lorenz_b Date: 2014-01-02 13:07:18 +0000 (Thu, 02 Jan 2014) Log Message: ----------- Fixed bug. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java Modified: trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java 2013-12-10 15:41:36 UTC (rev 4208) +++ trunk/components-core/src/main/java/org/dllearner/reasoning/SPARQLReasoner.java 2014-01-02 13:07:18 UTC (rev 4209) @@ -408,22 +408,22 @@ } public boolean isFunctional(ObjectProperty property){ - String query = "ASK {<" + property + "> a " + OWL.FunctionalProperty.getURI() + "}"; + String query = "ASK {<" + property + "> a <" + OWL.FunctionalProperty.getURI() + ">}"; return qef.createQueryExecution(query).execAsk(); } public boolean isInverseFunctional(ObjectProperty property){ - String query = "ASK {<" + property + "> a " + OWL.InverseFunctionalProperty.getURI() + "}"; + String query = "ASK {<" + property + "> a <" + OWL.InverseFunctionalProperty.getURI() + ">}"; return qef.createQueryExecution(query).execAsk(); } public boolean isAsymmetric(ObjectProperty property){ - String query = "ASK {<" + property + "> a " + OWL2.AsymmetricProperty.getURI() + "}"; + String query = "ASK {<" + property + "> a <" + OWL2.AsymmetricProperty.getURI() + ">}"; return qef.createQueryExecution(query).execAsk(); } public boolean isIrreflexive(ObjectProperty property){ - String query = "ASK {<" + property + "> a " + OWL2.IrreflexiveProperty.getURI() + "}"; + String query = "ASK {<" + property + "> a <" + OWL2.IrreflexiveProperty.getURI() + ">}"; return qef.createQueryExecution(query).execAsk(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
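The one-line change repeated four times above is easy to misread in the flattened diff: before the fix, the object IRI after the rdf:type shorthand `a` was pasted bare into the query string, which is not valid SPARQL, so every ASK for a property characteristic failed to parse. A minimal sketch of the broken and fixed patterns (the property IRI is a hypothetical example; the vocabulary constant is Jena's):

import com.hp.hpl.jena.vocabulary.OWL;

public class AskQuerySketch {
    public static void main(String[] args) {
        String property = "http://dbpedia.org/ontology/spouse"; // hypothetical example IRI

        // Before rev 4209: expands to
        //   ASK {<...spouse> a http://www.w3.org/2002/07/owl#FunctionalProperty}
        // -- a bare IRI in object position, rejected by SPARQL parsers.
        String broken = "ASK {<" + property + "> a " + OWL.FunctionalProperty.getURI() + "}";

        // After rev 4209: the object IRI is wrapped in angle brackets as well.
        String fixed = "ASK {<" + property + "> a <" + OWL.FunctionalProperty.getURI() + ">}";

        System.out.println(broken);
        System.out.println(fixed);
    }
}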
From: <dfl...@us...> - 2013-12-10 15:41:39
|
Revision: 4208 http://sourceforge.net/p/dl-learner/code/4208 Author: dfleischhacker Date: 2013-12-10 15:41:36 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Adapt WSD interfaces to scored candidates Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/RandomWordSenseDisambiguation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WordSenseDisambiguation.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -3,13 +3,14 @@ */ package org.dllearner.algorithms.isle; -import java.util.HashMap; -import java.util.Set; - import org.dllearner.algorithms.isle.index.Annotation; +import org.dllearner.algorithms.isle.index.EntityScorePair; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; +import java.util.HashMap; +import java.util.Set; + /** * @author Lorenz Buehmann * @@ -22,8 +23,8 @@ this.ontology = ontology; } - public abstract Set<Entity> getCandidates(Annotation annotation); + public abstract Set<EntityScorePair> getCandidates(Annotation annotation); - public abstract HashMap<Annotation,Set<Entity>> getCandidatesMap(Set<Annotation> annotations); + public abstract HashMap<Annotation,Set<EntityScorePair>> getCandidatesMap(Set<Annotation> annotations); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -17,10 +17,9 @@ /** * Gets set of candidate entities for a list of tokens - * @param s * @return */ - public Set<Entity> getCandidateEntities(List<Token> tokens); + public Set<EntityScorePair> getCandidateEntities(List<Token> tokens); /** @@ -28,14 +27,12 @@ * ontology string when the parameter string has been added to the trie after generation by using * WordNet or other additional methods. 
* - * @param s the string to search in the trie * @return string generating the path of the longest match in the trie */ public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens); /** * Gets the longest matching string - * @param s * @return */ public List<Token> getLongestMatchingText(List<Token> tokens); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -1,13 +1,12 @@ package org.dllearner.algorithms.isle.index; +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; + import java.util.HashMap; import java.util.HashSet; import java.util.Set; -import org.dllearner.algorithms.isle.EntityCandidateGenerator; -import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; -import org.dllearner.core.owl.Entity; - /** * Provides methods to annotate documents. * @@ -23,7 +22,6 @@ /** * Initialize this semantic annotator to use the entities from the provided ontology. * - * @param ontology the ontology to use entities from */ public SemanticAnnotator(WordSenseDisambiguation wordSenseDisambiguation, EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) { @@ -41,9 +39,9 @@ public AnnotatedDocument processDocument(TextDocument document){ Set<Annotation> annotations = linguisticAnnotator.annotate(document); Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>(); - HashMap<Annotation,Set<Entity>> candidatesMap = entityCandidateGenerator.getCandidatesMap(annotations); + HashMap<Annotation, Set<EntityScorePair>> candidatesMap = entityCandidateGenerator.getCandidatesMap(annotations); for (Annotation annotation : candidatesMap.keySet()) { - Set<Entity> candidateEntities = candidatesMap.get(annotation); + Set<EntityScorePair> candidateEntities = candidatesMap.get(annotation); if (candidateEntities == null || candidateEntities.size() == 0) { continue; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -3,16 +3,16 @@ */ package org.dllearner.algorithms.isle.index; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; - import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.core.owl.Entity; import org.dllearner.utilities.owl.OWLAPIConverter; import org.semanticweb.owlapi.model.OWLEntity; import org.semanticweb.owlapi.model.OWLOntology; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + /** * @author Lorenz Buehmann * @@ -36,13 +36,17 @@ * @see org.dllearner.algorithms.isle.EntityCandidateGenerator#getCandidates(org.dllearner.algorithms.isle.index.Annotation) */ @Override - public Set<Entity> getCandidates(Annotation annotation) { - return allEntities; - } + public 
Set<EntityScorePair> getCandidates(Annotation annotation) { + HashSet<EntityScorePair> result = new HashSet<>(); + for (Entity e : allEntities) { + result.add(new EntityScorePair(e, 1.0)); + } + return result; + } @Override - public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) { - HashMap<Annotation, Set<Entity>> result = new HashMap<Annotation, Set<Entity>>(); + public HashMap<Annotation, Set<EntityScorePair>> getCandidatesMap(Set<Annotation> annotations) { + HashMap<Annotation, Set<EntityScorePair>> result = new HashMap<Annotation, Set<EntityScorePair>>(); for (Annotation annotation: annotations) result.put(annotation, getCandidates(annotation)); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -1,160 +1,156 @@ -package org.dllearner.algorithms.isle.index; - -import net.didion.jwnl.data.POS; -import org.dllearner.algorithms.isle.WordNet; -import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; -import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLOntology; - -import java.util.*; -import java.util.Map.Entry; - -public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { - TokenTree tree; - EntityTextRetriever entityTextRetriever; - -// /** -// * Initialize the trie with strings from the provided ontology using a no-op name generator, i.e., only the -// * actual ontology strings are added and no expansion is done. -// * -// * @param entityTextRetriever the text retriever to use -// * @param ontology the ontology to get strings from -// */ -// public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { -// this(entityTextRetriever, ontology, new DummyNameGenerator()); -// } - - /** - * Initialize the trie with strings from the provided ontology and use the given entity name generator - * for generating alternative words. 
- * - * @param entityTextRetriever the text retriever to use - * @param ontology the ontology to get strings from - */ - public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { - this.entityTextRetriever = entityTextRetriever; - buildTrie(ontology); - } - - public void buildTrie(OWLOntology ontology) { - this.tree = new TokenTree(); - Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); - - - for (Entry<Entity, Set<List<Token>>> entry : entity2TokenSet.entrySet()) { - Entity entity = entry.getKey(); - Set<List<Token>> tokenSet = entry.getValue(); - for (List<Token> tokens : tokenSet) { - addAlternativeFormsFromWordNet(tokens); - addEntry(tokens, entity); - addSubsequences(entity, tokens); - } - } - } - - /** - * Adds the subsequences of a test - * @param entity - * @param tokens - */ - private void addSubsequences(Entity entity, List<Token> tokens) { - tree.add(tokens, entity); - for (int size = 1; size < tokens.size(); size++) { - for (int start = 0; start < tokens.size() - size + 1; start++) { - ArrayList<Token> subsequence = new ArrayList<>(); - for (int i = 0; i < size; i++) { - subsequence.add(tokens.get(start + i)); - } - addEntry(subsequence, entity); - } - } - } - - private void addAlternativeFormsFromWordNet(List<Token> tokens) { - for (Token t : tokens) { - POS wordnetPos = null; - String posTag = t.getPOSTag(); - if (posTag.startsWith("N")) {//nouns - wordnetPos = POS.NOUN; - } - else if (posTag.startsWith("V")) {//verbs - wordnetPos = POS.VERB; - } - else if (posTag.startsWith("J")) {//adjectives - wordnetPos = POS.ADJECTIVE; - } - else if (posTag.startsWith("R")) {//adverbs - wordnetPos = POS.ADVERB; - } - if (wordnetPos == null) { - continue; - } - //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); - Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance() - .getScoredHyponyms(t.getRawForm(), wordnetPos); - - for (WordNet.LemmaScorePair synonym : alternativeFormPairs) { - // ignore all multi word synonyms - if (synonym.getLemma().contains("_")) { - continue; - } - //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); - t.addAlternativeForm(synonym.getLemma(), synonym.getScore()); - } - } - } - - @Override - public void addEntry(List<Token> s, Entity e) { - tree.add(s, e); - } - - public void addEntry(List<Token> s, Entity e, List<Token> originalTokens) { - tree.add(s, e, originalTokens); - } - - @Override - public Set<Entity> getCandidateEntities(List<Token> tokens) { - Set<Entity> res = tree.getAllEntities(tokens); - System.out.println("Unscored: " + res); - Set<EntityScorePair> scored = tree.getAllEntitiesScored(tokens); - System.out.println("Scored: " + scored); - - return res; - } - - @Override - public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { - return tree.getOriginalTokensForLongestMatch(tokens); - } - - @Override - public List<Token> getLongestMatchingText(List<Token> tokens) { - return tree.getLongestMatch(tokens); - } - - public String toString() { - return tree.toString(); - } - - public static void main(String[] args) { - String[] tokens = "this is a long and very complex text".split(" "); - - List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; - - // generate list of lemmatized wordnet synonyms for each token - for (int i = 0; i < tokens.length; i++) { - wordnetTokens[i] = new ArrayList<String>(); - 
wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i])); - for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { - System.out.println("Adding: " + LinguisticUtil.getInstance().getNormalizedForm(w)); - wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).replaceAll("_", " ")); - } - } - } - - public void printTrie() { - System.out.println(this.toString()); - - } -} +package org.dllearner.algorithms.isle.index; + +import net.didion.jwnl.data.POS; +import org.dllearner.algorithms.isle.WordNet; +import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + +import java.util.*; +import java.util.Map.Entry; + +public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { + TokenTree tree; + EntityTextRetriever entityTextRetriever; + +// /** +// * Initialize the trie with strings from the provided ontology using a no-op name generator, i.e., only the +// * actual ontology strings are added and no expansion is done. +// * +// * @param entityTextRetriever the text retriever to use +// * @param ontology the ontology to get strings from +// */ +// public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { +// this(entityTextRetriever, ontology, new DummyNameGenerator()); +// } + + /** + * Initialize the trie with strings from the provided ontology and use the given entity name generator + * for generating alternative words. + * + * @param entityTextRetriever the text retriever to use + * @param ontology the ontology to get strings from + */ + public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { + this.entityTextRetriever = entityTextRetriever; + buildTrie(ontology); + } + + public void buildTrie(OWLOntology ontology) { + this.tree = new TokenTree(); + Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); + + + for (Entry<Entity, Set<List<Token>>> entry : entity2TokenSet.entrySet()) { + Entity entity = entry.getKey(); + Set<List<Token>> tokenSet = entry.getValue(); + for (List<Token> tokens : tokenSet) { + addAlternativeFormsFromWordNet(tokens); + addEntry(tokens, entity); + addSubsequences(entity, tokens); + } + } + } + + /** + * Adds the subsequences of a test + * @param entity + * @param tokens + */ + private void addSubsequences(Entity entity, List<Token> tokens) { + tree.add(tokens, entity); + for (int size = 1; size < tokens.size(); size++) { + for (int start = 0; start < tokens.size() - size + 1; start++) { + ArrayList<Token> subsequence = new ArrayList<>(); + for (int i = 0; i < size; i++) { + subsequence.add(tokens.get(start + i)); + } + addEntry(subsequence, entity); + } + } + } + + private void addAlternativeFormsFromWordNet(List<Token> tokens) { + for (Token t : tokens) { + POS wordnetPos = null; + String posTag = t.getPOSTag(); + if (posTag.startsWith("N")) {//nouns + wordnetPos = POS.NOUN; + } + else if (posTag.startsWith("V")) {//verbs + wordnetPos = POS.VERB; + } + else if (posTag.startsWith("J")) {//adjectives + wordnetPos = POS.ADJECTIVE; + } + else if (posTag.startsWith("R")) {//adverbs + wordnetPos = POS.ADVERB; + } + if (wordnetPos == null) { + continue; + } + //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); + Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance() + .getScoredHyponyms(t.getRawForm(), 
wordnetPos); + + for (WordNet.LemmaScorePair synonym : alternativeFormPairs) { + // ignore all multi word synonyms + if (synonym.getLemma().contains("_")) { + continue; + } + //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); + t.addAlternativeForm(synonym.getLemma(), synonym.getScore()); + } + } + } + + @Override + public void addEntry(List<Token> s, Entity e) { + tree.add(s, e); + } + + public void addEntry(List<Token> s, Entity e, List<Token> originalTokens) { + tree.add(s, e, originalTokens); + } + + @Override + public Set<EntityScorePair> getCandidateEntities(List<Token> tokens) { + Set<EntityScorePair> res = tree.getAllEntitiesScored(tokens); + return res; + } + + @Override + public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { + return tree.getOriginalTokensForLongestMatch(tokens); + } + + @Override + public List<Token> getLongestMatchingText(List<Token> tokens) { + return tree.getLongestMatch(tokens); + } + + public String toString() { + return tree.toString(); + } + + public static void main(String[] args) { + String[] tokens = "this is a long and very complex text".split(" "); + + List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; + + // generate list of lemmatized wordnet synonyms for each token + for (int i = 0; i < tokens.length; i++) { + wordnetTokens[i] = new ArrayList<String>(); + wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i])); + for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { + System.out.println("Adding: " + LinguisticUtil.getInstance().getNormalizedForm(w)); + wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).replaceAll("_", " ")); + } + } + } + + public void printTrie() { + System.out.println(this.toString()); + + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -3,7 +3,6 @@ import com.google.common.collect.Lists; import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.StopWordFilter; -import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; import java.util.ArrayList; @@ -27,8 +26,8 @@ this.candidatesTrie = candidatesTrie; } - public Set<Entity> getCandidates(Annotation annotation) { - Set<Entity> candidateEntities = candidatesTrie.getCandidateEntities(annotation.getTokens()); + public Set<EntityScorePair> getCandidates(Annotation annotation) { + Set<EntityScorePair> candidateEntities = candidatesTrie.getCandidateEntities(annotation.getTokens()); System.out.println(annotation + " --> " + candidateEntities); return candidateEntities; } @@ -39,7 +38,7 @@ * @param window : maximum distance between the annotations * @return */ - public void postProcess(HashMap<Annotation,Set<Entity>> candidatesMap, int window, StopWordFilter stopWordFilter) { + public void postProcess(HashMap<Annotation,Set<EntityScorePair>> candidatesMap, int window, StopWordFilter stopWordFilter) { Set<Annotation> annotations = candidatesMap.keySet(); List<Annotation> sortedAnnotations = new ArrayList<Annotation>(annotations); //TODO 
refactoring @@ -119,8 +118,8 @@ } @Override - public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) { - HashMap<Annotation, Set<Entity>> candidatesMap = new HashMap<Annotation, Set<Entity>>(); + public HashMap<Annotation, Set<EntityScorePair>> getCandidatesMap(Set<Annotation> annotations) { + HashMap<Annotation, Set<EntityScorePair>> candidatesMap = new HashMap<>(); for (Annotation annotation: annotations) candidatesMap.put(annotation, getCandidates(annotation)); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/RandomWordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/RandomWordSenseDisambiguation.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/RandomWordSenseDisambiguation.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -18,14 +18,15 @@ */ package org.dllearner.algorithms.isle.wsd; -import java.util.Random; -import java.util.Set; - import org.dllearner.algorithms.isle.index.Annotation; +import org.dllearner.algorithms.isle.index.EntityScorePair; import org.dllearner.algorithms.isle.index.SemanticAnnotation; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; +import java.util.Random; +import java.util.Set; + /** * Disambiguation by randomly selecting one of the candidates (baseline method). * @@ -43,17 +44,17 @@ @Override public SemanticAnnotation disambiguate(Annotation annotation, - Set<Entity> candidateEntities) { + Set<EntityScorePair> candidateEntities) { int pos = random.nextInt(candidateEntities.size()); int i = 0; - for(Entity e : candidateEntities) - { - if (i == pos) { - return new SemanticAnnotation(annotation, e); - } - i++; - } - return null; + for(EntityScorePair esp : candidateEntities) { + Entity e = esp.getEntity(); + if (i == pos) { + return new SemanticAnnotation(annotation, e); + } + i++; + } + return null; } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -3,26 +3,20 @@ */ package org.dllearner.algorithms.isle.wsd; -import java.util.HashSet; -import java.util.Set; - import org.apache.log4j.Logger; import org.dllearner.algorithms.isle.index.Annotation; +import org.dllearner.algorithms.isle.index.EntityScorePair; import org.dllearner.algorithms.isle.index.SemanticAnnotation; import org.dllearner.core.owl.Entity; import org.dllearner.utilities.owl.OWLAPIConverter; -import org.semanticweb.owlapi.model.IRI; -import org.semanticweb.owlapi.model.OWLAnnotationAssertionAxiom; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLDataFactory; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLLiteral; -import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.*; import org.semanticweb.owlapi.util.IRIShortFormProvider; import org.semanticweb.owlapi.util.SimpleIRIShortFormProvider; - import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; +import java.util.HashSet; +import 
java.util.Set; + /** * @author Lorenz Buehmann * @@ -47,26 +41,27 @@ * @see org.dllearner.algorithms.isle.WordSenseDisambiguation#disambiguate(org.dllearner.algorithms.isle.index.Annotation, java.util.Set) */ @Override - public SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities) { + public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) { logger.debug("Linguistic annotations:\n" + annotation); logger.debug("Candidate entities:" + candidateEntities); String token = annotation.getString().trim(); //check if annotation token matches label of entity or the part behind #(resp. /) - for (Entity entity : candidateEntities) { - Set<String> labels = getLabels(entity); - for (String label : labels) { - if(label.equals(token)){ - logger.debug("Disambiguated entity: " + entity); - return new SemanticAnnotation(annotation, entity); - } - } - String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); - if(annotation.equals(shortForm)){ - logger.debug("Disambiguated entity: " + entity); - return new SemanticAnnotation(annotation, entity); - } - } - return null; + for (EntityScorePair entityScorePair : candidateEntities) { + Entity entity = entityScorePair.getEntity(); + Set<String> labels = getLabels(entity); + for (String label : labels) { + if (label.equals(token)) { + logger.debug("Disambiguated entity: " + entity); + return new SemanticAnnotation(annotation, entity); + } + } + String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); + if (annotation.equals(shortForm)) { + logger.debug("Disambiguated entity: " + entity); + return new SemanticAnnotation(annotation, entity); + } + } + return null; } private Set<String> getLabels(Entity entity){ Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -3,21 +3,21 @@ */ package org.dllearner.algorithms.isle.wsd; -import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - +import com.google.common.base.Joiner; +import com.google.common.collect.Sets; import org.dllearner.algorithms.isle.StructuralEntityContext; import org.dllearner.algorithms.isle.VSMCosineDocumentSimilarity; import org.dllearner.algorithms.isle.index.Annotation; +import org.dllearner.algorithms.isle.index.EntityScorePair; import org.dllearner.algorithms.isle.index.SemanticAnnotation; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; -import com.google.common.base.Joiner; -import com.google.common.collect.Sets; +import java.io.IOException; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; /** * @author Lorenz Buehmann @@ -39,7 +39,7 @@ * @see org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation#disambiguate(org.dllearner.algorithms.isle.index.Annotation, java.util.Set) */ @Override - public SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities) { + public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) { 
if(!candidateEntities.isEmpty()){ //get the context of the annotated token List<String> tokenContext = contextExtractor.extractContext(annotation); @@ -47,19 +47,20 @@ //compare this context with the context of each entity candidate double maxScore = Double.NEGATIVE_INFINITY; Entity bestEntity = null; - for (Entity entity : candidateEntities) { - //get the context of the entity by analyzing the structure of the ontology - Set<String> entityContext = StructuralEntityContext.getContextInNaturalLanguage(ontology, entity); - //compute the VSM Cosine Similarity - double score = computeScore(tokenContext, entityContext); - //set best entity - if(score > maxScore){ - maxScore = score; - bestEntity = entity; - } - } - - return new SemanticAnnotation(annotation, bestEntity); + for (EntityScorePair entityScorePair : candidateEntities) { + Entity entity = entityScorePair.getEntity(); + //get the context of the entity by analyzing the structure of the ontology + Set<String> entityContext = StructuralEntityContext.getContextInNaturalLanguage(ontology, entity); + //compute the VSM Cosine Similarity + double score = computeScore(tokenContext, entityContext); + //set best entity + if (score > maxScore) { + maxScore = score; + bestEntity = entity; + } + } + + return new SemanticAnnotation(annotation, bestEntity); } return null; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WordSenseDisambiguation.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WordSenseDisambiguation.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -1,12 +1,12 @@ package org.dllearner.algorithms.isle.wsd; -import java.util.Set; - import org.dllearner.algorithms.isle.index.Annotation; +import org.dllearner.algorithms.isle.index.EntityScorePair; import org.dllearner.algorithms.isle.index.SemanticAnnotation; -import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; +import java.util.Set; + /** * Abstract class for the word sense disambiguation component. * @@ -27,9 +27,10 @@ /** * Chooses the correct entity for the given annotation from a set of candidate entities. * + * * @param annotation the annotation to find entity for * @param candidateEntities the set of candidate entities * @return semantic annotation containing the given annotation and the chosen entity */ - public abstract SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities); + public abstract SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
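With rev 4208, every disambiguator receives Set<EntityScorePair> instead of Set<Entity>, so the scores produced by the candidate trie become visible at disambiguation time. A minimal sketch of a disambiguator that exploits this directly by picking the highest-scored candidate; the class is hypothetical and not part of the commit, and it assumes the abstract base's constructor takes the ontology, as the subclasses shown here do:

package org.dllearner.algorithms.isle.wsd;

import java.util.Set;

import org.dllearner.algorithms.isle.index.Annotation;
import org.dllearner.algorithms.isle.index.EntityScorePair;
import org.dllearner.algorithms.isle.index.SemanticAnnotation;
import org.semanticweb.owlapi.model.OWLOntology;

public class MaxScoreWordSenseDisambiguation extends WordSenseDisambiguation {

    public MaxScoreWordSenseDisambiguation(OWLOntology ontology) {
        super(ontology);
    }

    @Override
    public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) {
        EntityScorePair best = null;
        for (EntityScorePair candidate : candidateEntities) {
            // EntityScorePair compares by score first, then by entity URI (see rev 4207),
            // so compareTo yields a deterministic maximum.
            if (best == null || candidate.compareTo(best) > 0) {
                best = candidate;
            }
        }
        return best == null ? null : new SemanticAnnotation(annotation, best.getEntity());
    }
}

Unlike RandomWordSenseDisambiguation, which still ignores the scores, this baseline degrades gracefully: candidates reached only through WordNet expansion carry lower scores and lose against exact label matches.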
From: <dfl...@us...> - 2013-12-10 15:25:17
|
Revision: 4207 http://sourceforge.net/p/dl-learner/code/4207 Author: dfleischhacker Date: 2013-12-10 15:25:13 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Add scoring for hyponyms and token tree Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -13,6 +13,8 @@ public class WordNet { + private static final double SYNONYM_FACTOR = 0.8; + private static final double HYPONYM_FACTOR = 0.4; public Dictionary dict; public WordNet() { @@ -280,6 +282,42 @@ } } + public List<LemmaScorePair> getHyponymsScored(POS pos, String s) { + ArrayList<LemmaScorePair> result = new ArrayList<>(); + try { + IndexWord word = dict.getIndexWord(pos, s); + if (word == null) { + System.err.println("Unable to find index word for " + s); + return result; + } + Synset sense = word.getSense(1); + getHyponymsScoredRecursive(result, sense, 3, SYNONYM_FACTOR); + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + return result; + } + + public void getHyponymsScoredRecursive(List<LemmaScorePair> lemmas, Synset sense, int depthToGo, double score) { + for (Word w : sense.getWords()) { + lemmas.add(new LemmaScorePair(w.getLemma(), score)); + } + if (depthToGo == 0) { + return; + } + try { + PointerTargetNodeList directHyponyms = PointerUtils.getInstance().getDirectHyponyms(sense); + for (Object directHyponym : directHyponyms) { + getHyponymsScoredRecursive(lemmas, ((PointerTargetNode) directHyponym).getSynset(), depthToGo - 1, + score * HYPONYM_FACTOR); + } + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + } + /** * Funktion returns a List of Hypo and Hypernyms of a given string * @@ -356,4 +394,71 @@ return result; } + public static class LemmaScorePair implements Comparable<LemmaScorePair> { + private String lemma; + private Double score; + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + LemmaScorePair that = (LemmaScorePair) o; + + if (lemma != null ? !lemma.equals(that.lemma) : that.lemma != null) { + return false; + } + if (score != null ? !score.equals(that.score) : that.score != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = lemma != null ? lemma.hashCode() : 0; + result = 31 * result + (score != null ? 
score.hashCode() : 0); + return result; + } + + public String getLemma() { + + return lemma; + } + + public void setLemma(String lemma) { + this.lemma = lemma; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + public LemmaScorePair(String lemma, Double score) { + + this.lemma = lemma; + this.score = score; + } + + @Override + public int compareTo(LemmaScorePair o) { + int val = score.compareTo(o.score); + + if (val == 0) { + val = lemma.compareTo(o.getLemma()); + } + + return val; + } + } + } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -0,0 +1,77 @@ +package org.dllearner.algorithms.isle.index; + +import org.dllearner.core.owl.Entity; + +/** + * Represents a scored entity. The score is produced from the path used to retrieve it from the candidates tree. + * @author Daniel Fleischhacker + */ +public class EntityScorePair implements Comparable<EntityScorePair> { + @Override + public String toString() { + return entity + " : " + score; + } + + private Entity entity; + private Double score; + + @Override + public int compareTo(EntityScorePair o) { + int val = score.compareTo(o.score); + + if (val == 0) { + val = entity.getURI().toString().compareTo(o.entity.getURI().toString()); + } + + return val; + } + + public EntityScorePair(Entity entity, Double score) { + this.entity = entity; + this.score = score; + } + + public Entity getEntity() { + return entity; + } + + public void setEntity(Entity entity) { + this.entity = entity; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + EntityScorePair that = (EntityScorePair) o; + + if (entity != null ? !entity.equals(that.entity) : that.entity != null) { + return false; + } + if (score != null ? !score.equals(that.score) : that.score != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = entity != null ? entity.hashCode() : 0; + result = 31 * result + (score != null ? 
score.hashCode() : 0); + return result; + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -5,8 +5,7 @@ import net.didion.jwnl.data.POS; import org.dllearner.algorithms.isle.WordNet; -import java.util.ArrayList; -import java.util.Collections; +import java.util.*; /** * Provides shortcuts to commonly used linguistic operations @@ -35,6 +34,26 @@ } } + public Set<WordNet.LemmaScorePair> getScoredHyponyms(String word, POS pos) { + List<WordNet.LemmaScorePair> pairs = wn.getHyponymsScored(pos, word); + HashMap<String, Double> lemmaScores = new HashMap<>(); + for (WordNet.LemmaScorePair p : pairs) { + if (!lemmaScores.containsKey(p.getLemma())) { + lemmaScores.put(p.getLemma(), p.getScore()); + } + else { + lemmaScores.put(p.getLemma(), Math.max(p.getScore(), lemmaScores.get(p.getLemma()))); + } + } + + TreeSet<WordNet.LemmaScorePair> scoredPairs = new TreeSet<>(); + for (Map.Entry<String, Double> e : lemmaScores.entrySet()) { + scoredPairs.add(new WordNet.LemmaScorePair(e.getKey(), e.getValue())); + } + + return scoredPairs; + } + /** * Processes the given string and puts camelCased words into single words. * @param camelCase the word containing camelcase to split Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -1,6 +1,7 @@ package org.dllearner.algorithms.isle.index; import net.didion.jwnl.data.POS; +import org.dllearner.algorithms.isle.WordNet; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; @@ -89,15 +90,16 @@ continue; } //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); - String[] synonyms = LinguisticUtil.getInstance().getAllHyponymsForWord(t.getRawForm(), wordnetPos); + Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance() + .getScoredHyponyms(t.getRawForm(), wordnetPos); - for (String synonym : synonyms) { + for (WordNet.LemmaScorePair synonym : alternativeFormPairs) { // ignore all multi word synonyms - if (synonym.contains("_")) { + if (synonym.getLemma().contains("_")) { continue; } //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); - t.addAlternativeForm(synonym); + t.addAlternativeForm(synonym.getLemma(), synonym.getScore()); } } } @@ -113,9 +115,14 @@ @Override public Set<Entity> getCandidateEntities(List<Token> tokens) { - return tree.getAllEntities(tokens); - } + Set<Entity> res = tree.getAllEntities(tokens); + System.out.println("Unscored: " + res); + Set<EntityScorePair> scored = tree.getAllEntitiesScored(tokens); + System.out.println("Scored: " + scored); + return res; + } + @Override public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { return 
tree.getOriginalTokensForLongestMatch(tokens); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -7,7 +7,8 @@ import java.io.Serializable; import java.util.Collections; -import java.util.HashSet; +import java.util.HashMap; +import java.util.Map; import java.util.Set; /** @@ -23,7 +24,8 @@ private boolean isStopWord; private boolean isHead; /// for storing alternative forms of this token, e.g., generated by WordNet synonyms - private HashSet<String> alternativeForms; + private HashMap<String, Double> alternativeForms; + public Token(String rawForm) { this.rawForm = rawForm; @@ -35,7 +37,7 @@ this.posTag = posTag; this.isPunctuation = isPunctuation; this.isStopWord = isStopWord; - this.alternativeForms = new HashSet<>(); + this.alternativeForms = new HashMap<>(); } /** @@ -66,15 +68,22 @@ * @return unmodifiable set of alternative surface forms for this token */ public Set<String> getAlternativeForms() { - return Collections.unmodifiableSet(alternativeForms); + return Collections.unmodifiableSet(alternativeForms.keySet()); } /** + * Returns the map storing the scored alternative forms of this token. + */ + public Map<String, Double> getScoredAlternativeForms() { + return Collections.unmodifiableMap(alternativeForms); + } + + /** * Adds a new surface form to the alternative forms of this token. Alternative forms are included in comparison of * two tokens when using the {@link #equalsWithAlternativeForms}. */ - public void addAlternativeForm(String alternativeForm) { - this.alternativeForms.add(alternativeForm); + public void addAlternativeForm(String alternativeForm, Double score) { + this.alternativeForms.put(alternativeForm, score); } /** @@ -120,7 +129,7 @@ } /** - * @param wheteher the token is the head of the containg sequence of tokens + * @param isHead the token is the head of the containg sequence of tokens */ public void setIsHead(boolean isHead) { this.isHead = isHead; @@ -158,8 +167,8 @@ return false; } - if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.contains(stemmedForm) || - alternativeForms.contains(other.stemmedForm)) { + if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.containsKey(stemmedForm) || + alternativeForms.containsKey(other.stemmedForm)) { return true; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -13,6 +13,9 @@ * @author Daniel Fleischhacker */ public class TokenTree { + public static final double WORDNET_FACTOR = 0.3d; + public static final double ORIGINAL_FACTOR = 1.0d; + private LinkedHashMap<Token, TokenTree> children; private Set<Entity> entities; private List<Token> originalTokens; @@ -23,14 +26,15 @@ this.entities = new HashSet<>(); this.originalTokens = new ArrayList<>(); } - + /** * If set to TRUE, stopwords like 'of, on' are ignored during creation and retrieval operations. 
- * @param ignoreStopWords the ignoreStopWords to set - */ - public void setIgnoreStopWords(boolean ignoreStopWords) { - this.ignoreStopWords = ignoreStopWords; - } + * + * @param ignoreStopWords the ignoreStopWords to set + */ + public void setIgnoreStopWords(boolean ignoreStopWords) { + this.ignoreStopWords = ignoreStopWords; + } /** * Adds all given entities to the end of the path resulting from the given tokens. @@ -41,14 +45,14 @@ public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) { TokenTree curNode = this; for (Token t : tokens) { - if(!ignoreStopWords || (ignoreStopWords && !t.isStopWord())){ - TokenTree nextNode = curNode.children.get(t); + if (!ignoreStopWords || (ignoreStopWords && !t.isStopWord())) { + TokenTree nextNode = curNode.children.get(t); if (nextNode == null) { nextNode = new TokenTree(); curNode.children.put(t, nextNode); } curNode = nextNode; - } + } } curNode.entities.addAll(entities); curNode.originalTokens = new ArrayList<>(originalTokens); @@ -90,6 +94,75 @@ return curNode.entities; } + public Set<EntityScorePair> getAllEntitiesScored(List<Token> tokens) { + HashSet<EntityScorePair> resEntities = new HashSet<>(); + getAllEntitiesScoredRec(tokens, 0, this, resEntities, 1.0); + + // only keep highest confidence for each entity + HashMap<Entity, Double> entityScores = new HashMap<>(); + + for (EntityScorePair p : resEntities) { + if (!entityScores.containsKey(p.getEntity())) { + entityScores.put(p.getEntity(), p.getScore()); + } + else { + entityScores.put(p.getEntity(), Math.max(p.getScore(), entityScores.get(p.getEntity()))); + } + } + + TreeSet<EntityScorePair> result = new TreeSet<>(); + for (Map.Entry<Entity, Double> e : entityScores.entrySet()) { + result.add(new EntityScorePair(e.getKey(), e.getValue())); + } + + return result; + } + + public void getAllEntitiesScoredRec(List<Token> tokens, int curPosition, TokenTree curTree, + HashSet<EntityScorePair> resEntities, Double curScore) { + + if (curPosition == tokens.size()) { + for (Entity e : curTree.entities) { + resEntities.add(new EntityScorePair(e, curScore)); + } + return; + } + Token currentTextToken = tokens.get(curPosition); + for (Map.Entry<Token, TokenTree> treeTokenEntry : curTree.children.entrySet()) { + if (currentTextToken.equals(treeTokenEntry.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR); + } + else { + for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms() + .entrySet()) { + if (currentTextToken.getStemmedForm().equals(treeAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR * treeAlternativeForm.getValue()); + } + } + for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms() + .entrySet()) { + if (treeTokenEntry.getKey().getStemmedForm().equals(textAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR * textAlternativeForm.getValue()); + } + } + + for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms() + .entrySet()) { + for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms() + .entrySet()) { + if (treeAlternativeForm.getKey().equals(textAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, 
curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * treeAlternativeForm.getValue() * textAlternativeForm.getValue()); + } + } + } + } + } + } + public Set<Entity> getAllEntities(List<Token> tokens) { HashSet<Entity> resEntities = new HashSet<>(); getAllEntitiesRec(tokens, 0, this, resEntities); @@ -145,7 +218,8 @@ /** * Returns the set of entities assigned to the longest matching token subsequence of the given token sequence. - * @param tokens token sequence to search for longest match + * + * @param tokens token sequence to search for longest match * @return set of entities assigned to the longest matching token subsequence of the given token sequence */ public Set<Entity> getEntitiesForLongestMatch(List<Token> tokens) { @@ -188,34 +262,37 @@ } public static void main(String[] args) throws Exception { - List<Token> tokens1 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a token tree")) { - tokens1.add(new Token(s, s, s, false, false)); - }; - - List<Token> tokens2 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a tokenized tree")) { - tokens2.add(new Token(s, s, s, false, false)); - }; - - TokenTree tree = new TokenTree(); - tree.add(tokens1, new NamedClass("TokenTree")); - tree.add(tokens2, new NamedClass("TokenizedTree")); + List<Token> tokens1 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a token tree")) { + tokens1.add(new Token(s, s, s, false, false)); + } + ; + + List<Token> tokens2 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a tokenized tree")) { + tokens2.add(new Token(s, s, s, false, false)); + } + ; + + TokenTree tree = new TokenTree(); + tree.add(tokens1, new NamedClass("TokenTree")); + tree.add(tokens2, new NamedClass("TokenizedTree")); System.out.println(tree); - + System.out.println(tree.getEntitiesForLongestMatch(tokens1)); System.out.println(tree.getLongestMatch(tokens1)); - + List<Token> tokens3 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { - tokens3.add(new Token(s, s, s, false, false)); - }; + for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { + tokens3.add(new Token(s, s, s, false, false)); + } + ; System.out.println(tree.getLongestMatch(tokens3)); } - + public String toString() { - return "TokenTree\n"+ toString(0); + return "TokenTree\n" + toString(0); } public String toString(int indent) { @@ -233,5 +310,5 @@ return sb.toString(); } - + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
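The scoring introduced in this commit is multiplicative on two levels. In WordNet.getHyponymsScored the recursion starts at SYNONYM_FACTOR = 0.8, so lemmas of the first sense itself score 0.8 and each hyponymy step (up to depth 3) multiplies by HYPONYM_FACTOR = 0.4; in TokenTree.getAllEntitiesScoredRec an exact token match keeps the running score (ORIGINAL_FACTOR = 1.0) while a match through an alternative form multiplies in that form's stored score, and getAllEntitiesScored then keeps only the maximum per entity. A short worked sketch of the arithmetic under those constants:

public class ScoringArithmeticSketch {
    public static void main(String[] args) {
        // Constants as defined in WordNet.java and TokenTree.java above.
        final double SYNONYM_FACTOR = 0.8;
        final double HYPONYM_FACTOR = 0.4;
        final double ORIGINAL_FACTOR = 1.0;

        double senseLemma = SYNONYM_FACTOR;               // 0.8  (lemma of the sense itself)
        double directHypo = senseLemma * HYPONYM_FACTOR;  // ~0.32 (one hyponymy step down)
        double depth2Hypo = directHypo * HYPONYM_FACTOR;  // ~0.128

        // A two-token entity label matched once exactly and once through the
        // ~0.32-scored alternative form ends up with 1.0 * 1.0 * (1.0 * 0.32):
        double entityScore = ORIGINAL_FACTOR * ORIGINAL_FACTOR * (ORIGINAL_FACTOR * directHypo);

        System.out.println(senseLemma + " " + directHypo + " " + depth2Hypo + " " + entityScore);
    }
}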
From: <lor...@us...> - 2013-12-10 14:35:07
|
Revision: 4206 http://sourceforge.net/p/dl-learner/code/4206 Author: lorenz_b Date: 2013-12-10 14:35:02 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Set DBpedia experiment to SOLR index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/fuzzydll/FuzzyCELOE.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceUtils.java trunk/components-core/src/main/java/org/dllearner/reasoning/fuzzydll/FuzzyDLReasonerManager.java trunk/components-core/src/main/java/org/dllearner/refinementoperators/RhoDRDown.java trunk/components-core/src/main/java/org/dllearner/utilities/FuzzyOwl2.java trunk/components-core/src/main/java/org/dllearner/utilities/FuzzyOwl2toFuzzyDL.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetricTest.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/utilities/OWLClassExpression2FuzzyDLConverter.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/fuzzydll/FuzzyCELOE.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/fuzzydll/FuzzyCELOE.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/fuzzydll/FuzzyCELOE.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -175,7 +175,7 @@ private int maxClassDescriptionTests = 0; - private int maxExecutionTimeInSeconds = 100; + private int maxExecutionTimeInSeconds = 200; private boolean terminateOnNoiseReached = false; @@ -390,6 +390,13 @@ else if (learningProblem instanceof FuzzyPosNegLP) { examples = Helper.union(((FuzzyPosNegLP)learningProblem).getPositiveExamples(),((FuzzyPosNegLP)learningProblem).getNegativeExamples()); } + + + //cardinality has to be deactivated as it is not supported by FuzzyDL + if(operator instanceof RhoDRDown){ + ((RhoDRDown) operator).setUseCardinalityRestrictions(false); + } + } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -64,7 +64,6 @@ try { QueryResponse response = solr.query(query); SolrDocumentList list = response.getResults(); - System.out.println(list.getNumFound()); for (SolrDocument doc : list) { String uri = (String) doc.getFieldValue("uri"); String comment = (String) doc.getFieldValue(searchField); @@ -120,7 +119,7 @@ queryString += Joiner.on("OR").join(terms); queryString += ")"; - SolrQuery query = new SolrQuery(searchField + ":" + queryString);System.out.println(query); + SolrQuery query = new SolrQuery(searchField + ":" + queryString);//System.out.println(query); try { QueryResponse response = solr.query(query); SolrDocumentList list = response.getResults(); @@ -162,7 +161,7 @@ String 
queryStringConjuction = "(" + Joiner.on("AND").join(queryStringParts) + ")"; - SolrQuery query = new SolrQuery(searchField + ":" + queryStringConjuction);System.out.println(query); + SolrQuery query = new SolrQuery(searchField + ":" + queryStringConjuction);//System.out.println(query); try { QueryResponse response = solr.query(query); SolrDocumentList list = response.getResults(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -3,14 +3,9 @@ */ package org.dllearner.algorithms.isle.metrics; -import java.util.Set; - -import org.dllearner.algorithms.isle.index.AnnotatedDocument; import org.dllearner.algorithms.isle.index.Index; import org.dllearner.core.owl.Entity; -import com.google.common.collect.Sets; - /** * @author Lorenz Buehmann * @@ -40,18 +35,15 @@ @Override public double getNormalizedRelevance(Entity entityA, Entity entityB){ - Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); - Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); - Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); + long nrOfDocumentsA = index.getNumberOfDocumentsFor(entityA); + long nrOfDocumentsB = index.getNumberOfDocumentsFor(entityB); + long nrOfDocumentsAB = index.getNumberOfDocumentsFor(entityA, entityB); + long nrOfDocuments = index.getTotalNumberOfDocuments(); -// System.out.println("A:" + documentsA.size()); -// System.out.println("B:" + documentsB.size()); -// System.out.println("AB:" + documentsAB.size()); -// System.out.println(nrOfDocuments); - double pA = nrOfDocuments == 0 ? 0 : ((double) documentsA.size() / (double) nrOfDocuments); - double pB = nrOfDocuments == 0 ? 0 : ((double) documentsB.size() / (double) nrOfDocuments); - double pAB = nrOfDocuments == 0 ? 0 : ((double) documentsAB.size() / (double) nrOfDocuments); + double pA = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsA / (double) nrOfDocuments); + double pB = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsB / (double) nrOfDocuments); + double pAB = nrOfDocuments == 0 ? 
0 : ((double) nrOfDocumentsAB / (double) nrOfDocuments); if(pAB == 0 || pA * pB == 0){ return 0; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceUtils.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceUtils.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceUtils.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -8,6 +8,7 @@ import java.util.Map; import java.util.Set; +import org.apache.log4j.Logger; import org.dllearner.core.owl.Entity; import org.dllearner.utilities.owl.OWLAPIConverter; import org.semanticweb.owlapi.model.OWLEntity; @@ -19,6 +20,9 @@ */ public class RelevanceUtils { + + private static final Logger logger = Logger.getLogger(RelevanceUtils.class.getName()); + public static Map<Entity, Double> getRelevantEntities(Entity entity, Set<Entity> otherEntities, RelevanceMetric metric){ Map<Entity, Double> relevantEntities = new HashMap<Entity, Double>(); @@ -31,7 +35,7 @@ } public static Map<Entity, Double> getRelevantEntities(Entity entity, OWLOntology ontology, RelevanceMetric metric){ - System.out.println(entity); + logger.info("Get relevant entities for " + entity); Map<Entity, Double> relevantEntities = new HashMap<Entity, Double>(); Set<OWLEntity> owlEntities = new HashSet<OWLEntity>(); @@ -43,7 +47,7 @@ otherEntities.remove(entity); for (Entity otherEntity : otherEntities) { double relevance = metric.getNormalizedRelevance(entity, otherEntity); - System.out.println(otherEntity + ":" + relevance); + logger.info(otherEntity + ":" + relevance); relevantEntities.put(otherEntity, relevance); } Modified: trunk/components-core/src/main/java/org/dllearner/reasoning/fuzzydll/FuzzyDLReasonerManager.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/reasoning/fuzzydll/FuzzyDLReasonerManager.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/main/java/org/dllearner/reasoning/fuzzydll/FuzzyDLReasonerManager.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -33,6 +33,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.dllearner.utilities.FuzzyOwl2toFuzzyDL; +import org.dllearner.utilities.OWLClassExpression2FuzzyDLConverter; import org.semanticweb.owlapi.model.AxiomType; import org.semanticweb.owlapi.model.OWLAxiom; import org.semanticweb.owlapi.model.OWLClass; @@ -107,6 +108,7 @@ // private int counter2 = 1; private ByteArrayOutputStream baos; + private OWLClassExpression2FuzzyDLConverter classExpression2fuzzyDLConverter; public FuzzyDLReasonerManager(String ontologyFile, OWLOntology ontology, OWLReasonerConfiguration conf, OWLDataFactory factory, String baseURI) throws Exception { @@ -140,21 +142,24 @@ baos = new ByteArrayOutputStream(); fuzzyFileParser.setPrintStream(new PrintStream(baos)); + classExpression2fuzzyDLConverter = new OWLClassExpression2FuzzyDLConverter(fuzzyFileParser); + solveKB(); // errorFile = new FileOutputStream("errorFile.txt")name; } private Concept convert(OWLClassExpression classExpression){ - baos.reset(); - if(classExpression.isOWLThing()){ - return Concept.CONCEPT_TOP; - } else if(classExpression.isOWLNothing()){ - return Concept.CONCEPT_BOTTOM; - } else { - String name = fuzzyFileParser.getClassName(classExpression); - return fuzzyKB.getConcept(name); - } +// baos.reset(); +// if(classExpression.isOWLThing()){ +// return 
Concept.CONCEPT_TOP; +// } else if(classExpression.isOWLNothing()){ +// return Concept.CONCEPT_BOTTOM; +// } else { +// String name = fuzzyFileParser.getClassName(classExpression); +// return fuzzyKB.getConcept(name); +// } + return classExpression2fuzzyDLConverter.convert(classExpression); } private Individual convert(OWLIndividual individual){ @@ -201,12 +206,16 @@ Query q = new MinInstanceQuery(fConcept, fIndividual); KnowledgeBase clonedFuzzyKB = fuzzyKB.clone(); - +// q = new MinInstanceQuery(Concept.some("hasCar", Concept.CONCEPT_TOP), fuzzyKB.getIndividual("east1")); +// System.out.println(fConcept); +// System.out.println(Concept.some("hasCar", Concept.CONCEPT_TOP)); +// q = new MinInstanceQuery(fuzzyKB.getConcept("Train"), fuzzyKB.getIndividual("east1")); // TODO: just for testing, remove // long start = System.nanoTime(); queryResult = q.solve(clonedFuzzyKB); System.out.println(q.toString() + queryResult.getSolution()); +// System.exit(0); // TODO: just for testing, remove // out.println(counter + " * " + (System.nanoTime() - start)); // counter++; Modified: trunk/components-core/src/main/java/org/dllearner/refinementoperators/RhoDRDown.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/refinementoperators/RhoDRDown.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/main/java/org/dllearner/refinementoperators/RhoDRDown.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -542,8 +542,8 @@ // currently inverse roles are not supported ObjectProperty ar = (ObjectProperty) role; // remove reasoner calls -// Set<ObjectProperty> moreSpecialRoles = reasoner.getSubProperties(ar); - Set<ObjectProperty> moreSpecialRoles = objectPropertyHierarchy.getMoreSpecialRoles(ar); + Set<ObjectProperty> moreSpecialRoles = reasoner.getSubProperties(ar); +// Set<ObjectProperty> moreSpecialRoles = objectPropertyHierarchy.getMoreSpecialRoles(ar); for(ObjectProperty moreSpecialRole : moreSpecialRoles) refinements.add(new ObjectSomeRestriction(moreSpecialRole, description.getChild(0))); @@ -587,15 +587,15 @@ // rule 3: ALL r.D => ALL s.D or ALL r^-1.D => ALL s^-1.D // currently inverse roles are not supported ObjectProperty ar = (ObjectProperty) role; -// Set<ObjectProperty> moreSpecialRoles = reasoner.getSubProperties(ar); - Set<ObjectProperty> moreSpecialRoles = objectPropertyHierarchy.getMoreSpecialRoles(ar); + Set<ObjectProperty> moreSpecialRoles = reasoner.getSubProperties(ar); +// Set<ObjectProperty> moreSpecialRoles = objectPropertyHierarchy.getMoreSpecialRoles(ar); for(ObjectProperty moreSpecialRole : moreSpecialRoles) { refinements.add(new ObjectAllRestriction(moreSpecialRole, description.getChild(0))); } // rule 4: ALL r.D => <= (maxFillers-1) r.D // (length increases by 1 so we have to check whether max length is sufficient) - // => commented out because this is acutally not a downward refinement + // => commented out because this is actually not a downward refinement // if(useCardinalityRestrictions) { // if(maxLength > description.getLength() && maxNrOfFillers.get(ar)>1) { // ObjectMaxCardinalityRestriction max = new ObjectMaxCardinalityRestriction(maxNrOfFillers.get(ar)-1,role,description.getChild(0)); Modified: trunk/components-core/src/main/java/org/dllearner/utilities/FuzzyOwl2.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/utilities/FuzzyOwl2.java 2013-12-10 13:47:53 UTC (rev 4205) +++ 
trunk/components-core/src/main/java/org/dllearner/utilities/FuzzyOwl2.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -229,7 +229,7 @@ protected OWLDataFactory dataFactory; protected Hashtable<String, FuzzyConcept> definedConcepts; protected Hashtable<String, FuzzyProperty> definedProperties; - protected Hashtable<String, FuzzyDatatype> fuzzyDatatypes; + public Hashtable<String, FuzzyDatatype> fuzzyDatatypes; protected Hashtable<String, FuzzyModifier> fuzzyModifiers; protected OWLAnnotationProperty label; protected OWLOntologyManager manager; @@ -1187,7 +1187,7 @@ * @param p An OWL 2 object property. * @return A String representation of p. */ - protected String getObjectPropertyName(OWLObjectPropertyExpression p) + public String getObjectPropertyName(OWLObjectPropertyExpression p) { if (p.isOWLTopObjectProperty()) return getTopObjectPropertyName(); @@ -1204,7 +1204,7 @@ * @param p An OWL 2 data property. * @return A String representation of p. */ - protected String getDataPropertyName(OWLDataPropertyExpression p) + public String getDataPropertyName(OWLDataPropertyExpression p) { if (p.isOWLTopDataProperty()) return getTopDataPropertyName(); @@ -1345,7 +1345,7 @@ * @param e An OWL 2 entity. * @return Short name of e. */ - protected String getShortName(OWLEntity e) + public String getShortName(OWLEntity e) { return pm.getShortForm(e); } Modified: trunk/components-core/src/main/java/org/dllearner/utilities/FuzzyOwl2toFuzzyDL.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/utilities/FuzzyOwl2toFuzzyDL.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/main/java/org/dllearner/utilities/FuzzyOwl2toFuzzyDL.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -2,6 +2,7 @@ import java.util.*; +import fuzzydl.Concept; import fuzzyowl2.*; import org.semanticweb.owlapi.model.*; @@ -49,7 +50,7 @@ @Override protected String getTopConceptName() { - return("*top*"); + return Concept.CONCEPT_TOP.toString();//("*top*"); } @@ -92,7 +93,7 @@ @Override protected String getObjectSomeValuesFromName(OWLObjectPropertyExpression p, OWLClassExpression c) { - return "(some " + getObjectPropertyName(p) + " " + getClassName(c) + " )"; + return "(some " + getObjectPropertyName(p) + " " + getClassName(c) + ")"; } @@ -1076,7 +1077,7 @@ @Override - protected String getShortName(OWLEntity e) + public String getShortName(OWLEntity e) { String aux = pm.getShortForm(e); if (isReservedWord(aux)) Added: trunk/components-core/src/main/java/org/dllearner/utilities/OWLClassExpression2FuzzyDLConverter.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/utilities/OWLClassExpression2FuzzyDLConverter.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/utilities/OWLClassExpression2FuzzyDLConverter.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -0,0 +1,230 @@ +/** + * + */ +package org.dllearner.utilities; + +import java.util.ArrayList; +import java.util.Set; + +import org.semanticweb.owlapi.model.DataRangeType; +import org.semanticweb.owlapi.model.OWLClass; +import org.semanticweb.owlapi.model.OWLClassExpression; +import org.semanticweb.owlapi.model.OWLClassExpressionVisitor; +import org.semanticweb.owlapi.model.OWLDataAllValuesFrom; +import org.semanticweb.owlapi.model.OWLDataExactCardinality; +import org.semanticweb.owlapi.model.OWLDataHasValue; +import org.semanticweb.owlapi.model.OWLDataMaxCardinality; +import org.semanticweb.owlapi.model.OWLDataMinCardinality; 
+import org.semanticweb.owlapi.model.OWLDataOneOf; +import org.semanticweb.owlapi.model.OWLDataRange; +import org.semanticweb.owlapi.model.OWLDataSomeValuesFrom; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLObjectAllValuesFrom; +import org.semanticweb.owlapi.model.OWLObjectComplementOf; +import org.semanticweb.owlapi.model.OWLObjectExactCardinality; +import org.semanticweb.owlapi.model.OWLObjectHasSelf; +import org.semanticweb.owlapi.model.OWLObjectHasValue; +import org.semanticweb.owlapi.model.OWLObjectIntersectionOf; +import org.semanticweb.owlapi.model.OWLObjectMaxCardinality; +import org.semanticweb.owlapi.model.OWLObjectMinCardinality; +import org.semanticweb.owlapi.model.OWLObjectOneOf; +import org.semanticweb.owlapi.model.OWLObjectSomeValuesFrom; +import org.semanticweb.owlapi.model.OWLObjectUnionOf; + +import fuzzydl.Concept; + +/** + * @author Lorenz Buehmann + * + */ +public class OWLClassExpression2FuzzyDLConverter implements OWLClassExpressionVisitor{ + + Concept fuzzyConcept; + private FuzzyOwl2 fuzzyOwl2; + + + public OWLClassExpression2FuzzyDLConverter(FuzzyOwl2 fuzzyOwl2) { + this.fuzzyOwl2 = fuzzyOwl2; + } + + public Concept convert(OWLClassExpression expr){ + expr.accept(this); + return fuzzyConcept; + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLClass) + */ + @Override + public void visit(OWLClass cls) { + if(cls.isOWLThing()){ + fuzzyConcept = Concept.CONCEPT_TOP; + } else if(cls.isOWLNothing()){ + fuzzyConcept = Concept.CONCEPT_BOTTOM; + } else { + fuzzyConcept = new Concept(fuzzyOwl2.getClassName(cls)); + } + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectIntersectionOf) + */ + @Override + public void visit(OWLObjectIntersectionOf expr) { + ArrayList<Concept> conjuncts = new ArrayList<>(); + for (OWLClassExpression operand : expr.getOperands()) { + conjuncts.add(convert(operand)); + } + fuzzyConcept = Concept.and(conjuncts); + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectUnionOf) + */ + @Override + public void visit(OWLObjectUnionOf expr) { + ArrayList<Concept> disjuncts = new ArrayList<>(); + for (OWLClassExpression operand : expr.getOperands()) { + disjuncts.add(convert(operand)); + } + fuzzyConcept = Concept.or(disjuncts); + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectComplementOf) + */ + @Override + public void visit(OWLObjectComplementOf expr) { + Concept c = convert(expr.getOperand()); + fuzzyConcept = Concept.complement(c); + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectSomeValuesFrom) + */ + @Override + public void visit(OWLObjectSomeValuesFrom expr) { + Concept filler = convert(expr.getFiller()); + fuzzyConcept = Concept.some(fuzzyOwl2.getObjectPropertyName(expr.getProperty()), filler); + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectAllValuesFrom) + */ + @Override + public void visit(OWLObjectAllValuesFrom expr) { + Concept filler = convert(expr.getFiller()); + fuzzyConcept = Concept.all(fuzzyOwl2.getObjectPropertyName(expr.getProperty()), filler); + } + + /* (non-Javadoc) + * @see 
org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectHasValue) + */ + @Override + public void visit(OWLObjectHasValue arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectMinCardinality) + */ + @Override + public void visit(OWLObjectMinCardinality arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectExactCardinality) + */ + @Override + public void visit(OWLObjectExactCardinality arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectMaxCardinality) + */ + @Override + public void visit(OWLObjectMaxCardinality arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectHasSelf) + */ + @Override + public void visit(OWLObjectHasSelf arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLObjectOneOf) + */ + @Override + public void visit(OWLObjectOneOf arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLDataSomeValuesFrom) + */ + @Override + public void visit(OWLDataSomeValuesFrom expr) { + OWLDataRange range = expr.getFiller(); + DataRangeType type = range.getDataRangeType(); + if (type == DataRangeType.DATATYPE) + { + String datatypeName = fuzzyOwl2.getShortName(range.asOWLDatatype()); + if (fuzzyOwl2.fuzzyDatatypes.containsKey(datatypeName)) + fuzzyConcept = Concept.some(fuzzyOwl2.getDataPropertyName(expr.getProperty()), new Concept(datatypeName)); + } + else if (type == DataRangeType.DATA_ONE_OF) + { + OWLDataOneOf o = (OWLDataOneOf) range; + Set<OWLLiteral> set = o.getValues(); + if (!set.isEmpty()) + { + OWLLiteral lit = set.iterator().next(); + fuzzyConcept = Concept.exactValue(fuzzyOwl2.getDataPropertyName(expr.getProperty()), lit.getLiteral()); + } + } + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLDataAllValuesFrom) + */ + @Override + public void visit(OWLDataAllValuesFrom expr) { + OWLDataRange range = expr.getFiller(); + DataRangeType type = range.getDataRangeType(); + if (type == DataRangeType.DATATYPE) + { + String datatypeName = fuzzyOwl2.getShortName(range.asOWLDatatype()); + if (fuzzyOwl2.fuzzyDatatypes.containsKey(datatypeName)) + fuzzyConcept = Concept.all(fuzzyOwl2.getDataPropertyName(expr.getProperty()), new Concept(datatypeName)); + } + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLDataHasValue) + */ + @Override + public void visit(OWLDataHasValue arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLDataMinCardinality) + */ + @Override + public void visit(OWLDataMinCardinality arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLDataExactCardinality) + */ + @Override + public void visit(OWLDataExactCardinality arg0) { + } + + /* (non-Javadoc) + * @see org.semanticweb.owlapi.model.OWLClassExpressionVisitor#visit(org.semanticweb.owlapi.model.OWLDataMaxCardinality) + */ + @Override + public void 
visit(OWLDataMaxCardinality arg0) { + } + +} Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -19,6 +19,8 @@ import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorInputStream; import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.dllearner.algorithms.isle.index.Index; +import org.dllearner.algorithms.isle.index.syntactic.SolrSyntacticIndex; import org.dllearner.core.owl.NamedClass; import org.dllearner.kb.sparql.SparqlEndpoint; import org.dllearner.utilities.owl.OWLEntityTypeAdder; @@ -48,8 +50,16 @@ final SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); final int maxNrOfInstancesPerClass = 10; + static final String solrServerURL = "http://solr.aksw.org/en_dbpedia_resources/"; + static final String searchField = "comment"; - + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getIndex() + */ + @Override + protected Index getIndex() { + return new SolrSyntacticIndex(ontology, solrServerURL, searchField); + } /* (non-Javadoc) * @see org.dllearner.algorithms.isle.Experiment#getOntology() @@ -81,10 +91,10 @@ cleanUpModel(sample); filter(sample, "http://dbpedia.org/ontology/"); OWLEntityTypeAdder.addEntityTypes(sample); - StmtIterator iterator = sample.listStatements(); - while(iterator.hasNext()){ - System.out.println(iterator.next()); - } +// StmtIterator iterator = sample.listStatements(); +// while(iterator.hasNext()){ +// System.out.println(iterator.next()); +// } try { ByteArrayOutputStream baos = new ByteArrayOutputStream(); Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -12,8 +12,9 @@ import java.util.Set; import java.util.SortedSet; +import org.apache.log4j.Logger; import org.dllearner.algorithms.celoe.CELOE; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; import org.dllearner.algorithms.isle.metrics.RelevanceMetric; @@ -50,6 +51,9 @@ */ public abstract class Experiment { + + private static final Logger logger = Logger.getLogger(Experiment.class.getName()); + /** * */ @@ -71,7 +75,7 @@ private String testFolder = "experiments/logs/"; - private OWLOntology ontology; + protected OWLOntology ontology; private Set<String> documents; private boolean initialized = false; @@ -93,10 +97,11 @@ documents = getDocuments(); // build semantic index -// SemanticIndex semanticIndex = SemanticIndexGenerator.generateIndex(documents, ontology, false); + Index index = getIndex(); + logger.info("Index created."); // // // set the relevance metric -// relevance = new PMIRelevanceMetric(semanticIndex); + relevance = new PMIRelevanceMetric(index); 
try { // set KB KnowledgeSource ks = new OWLAPIOntology(ontology); @@ -149,6 +154,10 @@ } } + protected Index getIndex(){ + return SemanticIndexGenerator.generateIndex(documents, ontology, false); + } + private Description getStartClass(NamedClass cls, boolean isEquivalenceProblem, boolean reuseExistingDescription){ //get instances of class to describe SortedSet<Individual> individuals = reasoner.getIndividuals(cls); @@ -224,6 +233,7 @@ public void run(NamedClass cls) throws ComponentInitException { initIfNecessary(); + logger.info("Learning definiton of class " + cls); // lp.setClassToDescribe(cls); //get the positive examples, here just the instances of the class to describe SortedSet<Individual> individuals = reasoner.getIndividuals(cls); @@ -233,8 +243,8 @@ //get the start class for the learning algorithms Description startClass = getStartClass(cls, equivalence, true); -// Map<Entity, Double> entityRelevance = RelevanceUtils.getRelevantEntities(cls, ontology, relevance); -// NLPHeuristic heuristic = new NLPHeuristic(entityRelevance); + Map<Entity, Double> entityRelevance = RelevanceUtils.getRelevantEntities(cls, ontology, relevance); + NLPHeuristic heuristic = new NLPHeuristic(entityRelevance); ClassLearningProblem clp = new ClassLearningProblem(reasoner); clp.setClassToDescribe(cls); @@ -248,7 +258,7 @@ // perform cross validation with ISLE ISLE isle = new ISLE(clp, reasoner); -// isle.setHeuristic(heuristic); + isle.setHeuristic(heuristic); isle.setMaxNrOfResults(20); isle.setOperator(rop); isle.setMaxExecutionTimeInSeconds(maxExecutionTimeInSeconds); Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetricTest.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetricTest.java 2013-12-10 13:47:53 UTC (rev 4205) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetricTest.java 2013-12-10 14:35:02 UTC (rev 4206) @@ -27,7 +27,7 @@ public class PMIRelevanceMetricTest { AbstractRelevanceMetric metric; - static final String solrServerURL = "http://[2001:638:902:2010:0:168:35:138]:8080/solr/en_dbpedia_resources/"; + static final String solrServerURL = "http://solr.aksw.org/en_dbpedia_resources/"; static final String searchField = "comment"; static final String DBPEDIA_NS = "http://dbpedia.org/ontology/";
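The net effect of r4206 is that the Experiment base class now obtains an Index via getIndex(), the DBpedia subclass returns a SolrSyntacticIndex over the "comment" field, and the previously commented-out relevance and heuristic wiring is active again. A minimal sketch of that wiring follows; the wrapper class and the local schema file path are placeholders, while the DL-Learner class names, constructor signatures, Solr URL and search field are the ones visible in the diffs above:

    import java.io.File;
    import java.util.Map;

    import org.dllearner.algorithms.isle.NLPHeuristic;
    import org.dllearner.algorithms.isle.index.Index;
    import org.dllearner.algorithms.isle.index.syntactic.SolrSyntacticIndex;
    import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric;
    import org.dllearner.algorithms.isle.metrics.RelevanceMetric;
    import org.dllearner.algorithms.isle.metrics.RelevanceUtils;
    import org.dllearner.core.owl.Entity;
    import org.dllearner.core.owl.NamedClass;
    import org.semanticweb.owlapi.apibinding.OWLManager;
    import org.semanticweb.owlapi.model.OWLOntology;

    public class SolrIndexWiringSketch {
        public static void main(String[] args) throws Exception {
            // load the schema (placeholder path to a local copy of dbpedia_3.9.owl)
            OWLOntology ontology = OWLManager.createOWLOntologyManager()
                    .loadOntologyFromOntologyDocument(new File("dbpedia_3.9.owl"));

            // syntactic index backed by the Solr core used in the experiment
            Index index = new SolrSyntacticIndex(ontology,
                    "http://solr.aksw.org/en_dbpedia_resources/", "comment");

            // PMI relevance computed from document counts in that index
            RelevanceMetric relevance = new PMIRelevanceMetric(index);

            // relevance of all ontology entities w.r.t. the class to describe,
            // fed into the NLP-aware search heuristic
            Entity cls = new NamedClass("http://dbpedia.org/ontology/Person");
            Map<Entity, Double> entityRelevance =
                    RelevanceUtils.getRelevantEntities(cls, ontology, relevance);
            NLPHeuristic heuristic = new NLPHeuristic(entityRelevance);
        }
    }

Because the relevance map is computed once per class to describe (RelevanceUtils.getRelevantEntities iterates over every entity in the ontology signature), the expensive index lookups stay out of the per-node scoring loop of the search heuristic.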
From: <lor...@us...> - 2013-12-10 13:47:57
Revision: 4205 http://sourceforge.net/p/dl-learner/code/4205 Author: lorenz_b Date: 2013-12-10 13:47:53 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Added PMI test. Modified Paths: -------------- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java Added Paths: ----------- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetricTest.java Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2013-12-10 13:25:25 UTC (rev 4204) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2013-12-10 13:47:53 UTC (rev 4205) @@ -3,23 +3,42 @@ */ package org.dllearner.algorithms.isle; -import com.google.common.collect.Sets; -import com.hp.hpl.jena.rdf.model.Model; +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorInputStream; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.dllearner.core.owl.NamedClass; import org.dllearner.kb.sparql.SparqlEndpoint; +import org.dllearner.utilities.owl.OWLEntityTypeAdder; import org.semanticweb.owlapi.apibinding.OWLManager; +import org.semanticweb.owlapi.model.AxiomType; import org.semanticweb.owlapi.model.OWLOntology; import org.semanticweb.owlapi.model.OWLOntologyCreationException; import org.semanticweb.owlapi.model.OWLOntologyManager; -import java.io.*; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.HashSet; -import java.util.Set; +import com.google.common.collect.Sets; +import com.hp.hpl.jena.rdf.model.Literal; +import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.Property; +import com.hp.hpl.jena.rdf.model.RDFNode; +import com.hp.hpl.jena.rdf.model.Statement; +import com.hp.hpl.jena.rdf.model.StmtIterator; +import com.hp.hpl.jena.vocabulary.OWL; +import com.hp.hpl.jena.vocabulary.RDF; +import com.hp.hpl.jena.vocabulary.RDFS; +import com.hp.hpl.jena.vocabulary.XSD; /** * @author Lorenz Buehmann @@ -28,7 +47,7 @@ public class DBpediaExperiment extends Experiment{ final SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); - final int maxNrOfInstancesPerClass = 100; + final int maxNrOfInstancesPerClass = 10; @@ -38,12 +57,12 @@ @Override protected OWLOntology getOntology() { //load the DBpedia schema + OWLOntology schema = null; try { URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); InputStream is = new BufferedInputStream(url.openStream()); CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is); - OWLOntology schema = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); - return schema; + schema = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); } 
catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { @@ -59,11 +78,23 @@ "http://dbpedia.org/ontology/", Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person")), maxNrOfInstancesPerClass); + cleanUpModel(sample); + filter(sample, "http://dbpedia.org/ontology/"); + OWLEntityTypeAdder.addEntityTypes(sample); + StmtIterator iterator = sample.listStatements(); + while(iterator.hasNext()){ + System.out.println(iterator.next()); + } + try { ByteArrayOutputStream baos = new ByteArrayOutputStream(); sample.write(baos, "TURTLE", null); OWLOntologyManager man = OWLManager.createOWLOntologyManager(); OWLOntology ontology = man.loadOntologyFromOntologyDocument(new ByteArrayInputStream(baos.toByteArray())); + man.addAxioms(ontology, schema.getAxioms()); + man.removeAxioms(ontology, ontology.getAxioms(AxiomType.FUNCTIONAL_DATA_PROPERTY)); + man.removeAxioms(ontology, ontology.getAxioms(AxiomType.FUNCTIONAL_OBJECT_PROPERTY)); + man.removeAxioms(ontology, ontology.getAxioms(AxiomType.DATA_PROPERTY_RANGE)); return ontology; } catch (Exception e) { e.printStackTrace(); @@ -72,8 +103,78 @@ return null; } + /** + * Filter triples which are not relevant based on the given knowledge base + * namespace. + * + * @param model + * @param namespace + */ + private void filter(Model model, String namespace) { + List<Statement> statementsToRemove = new ArrayList<Statement>(); + for (Iterator<Statement> iter = model.listStatements().toList().iterator(); iter.hasNext();) { + Statement st = iter.next(); + Property predicate = st.getPredicate(); + if (predicate.equals(RDF.type)) { + if (!st.getObject().asResource().getURI().startsWith(namespace)) { + statementsToRemove.add(st); + } else if (st.getObject().equals(OWL.FunctionalProperty.asNode())) { + statementsToRemove.add(st); + } else if (st.getObject().isLiteral() && st.getObject().asLiteral().getDatatypeURI().equals(XSD.gYear.getURI())) { + statementsToRemove.add(st); + } + } else if (!predicate.equals(RDFS.subClassOf) && !predicate.equals(OWL.sameAs) && !predicate.asResource().getURI().startsWith(namespace)) { + statementsToRemove.add(st); + } + } + model.remove(statementsToRemove); + } + private static void cleanUpModel(Model model) { + // filter out triples with String literals, as therein often occur + // some syntax errors and they are not relevant for learning + List<Statement> statementsToRemove = new ArrayList<Statement>(); + for (Iterator<Statement> iter = model.listStatements().toList().iterator(); iter.hasNext();) { + Statement st = iter.next(); + RDFNode object = st.getObject(); + if (object.isLiteral()) { + // statementsToRemove.add(st); + Literal lit = object.asLiteral(); + if (lit.getDatatype() == null || lit.getDatatype().equals(XSD.xstring)) { + st.changeObject("shortened", "en"); + } else if (lit.getDatatype().getURI().equals(XSD.gYear.getURI())) { + statementsToRemove.add(st); + // System.err.println("REMOVE " + st); + } else if (lit.getDatatype().getURI().equals(XSD.gYearMonth.getURI())) { + statementsToRemove.add(st); +// System.err.println("REMOVE " + st); + } + } + //remove statements like <x a owl:Class> + if (st.getPredicate().equals(RDF.type)) { + if (object.equals(RDFS.Class.asNode()) || object.equals(OWL.Class.asNode()) || object.equals(RDFS.Literal.asNode()) + || object.equals(RDFS.Resource)) { + statementsToRemove.add(st); + } + } + //remove unwanted properties + String dbo = "http://dbpedia.org/ontology/"; + Set<String> blackList = Sets.newHashSet(dbo + "wikiPageDisambiguates",dbo + 
"wikiPageExternalLink", + dbo + "wikiPageID", dbo + "wikiPageInterLanguageLink", dbo + "wikiPageRedirects", dbo + "wikiPageRevisionID", + dbo + "wikiPageWikiLink"); + for(String bl: blackList){ + if (st.getPredicate().getURI().equals(bl)) { + statementsToRemove.add(st); + } + } + } + + model.remove(statementsToRemove); + } + + + /* (non-Javadoc) * @see org.dllearner.algorithms.isle.Experiment#getDocuments() */ Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-10 13:25:25 UTC (rev 4204) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-10 13:47:53 UTC (rev 4205) @@ -93,10 +93,10 @@ documents = getDocuments(); // build semantic index - SemanticIndex semanticIndex = SemanticIndexGenerator.generateIndex(documents, ontology, false); - - // set the relevance metric - relevance = new PMIRelevanceMetric(semanticIndex); +// SemanticIndex semanticIndex = SemanticIndexGenerator.generateIndex(documents, ontology, false); +// +// // set the relevance metric +// relevance = new PMIRelevanceMetric(semanticIndex); try { // set KB KnowledgeSource ks = new OWLAPIOntology(ontology); @@ -233,8 +233,8 @@ //get the start class for the learning algorithms Description startClass = getStartClass(cls, equivalence, true); - Map<Entity, Double> entityRelevance = RelevanceUtils.getRelevantEntities(cls, ontology, relevance); - NLPHeuristic heuristic = new NLPHeuristic(entityRelevance); +// Map<Entity, Double> entityRelevance = RelevanceUtils.getRelevantEntities(cls, ontology, relevance); +// NLPHeuristic heuristic = new NLPHeuristic(entityRelevance); ClassLearningProblem clp = new ClassLearningProblem(reasoner); clp.setClassToDescribe(cls); @@ -247,9 +247,9 @@ rop.init(); // perform cross validation with ISLE - ISLE isle = new ISLE(lp, reasoner); - isle.setHeuristic(heuristic); - isle.setMaxNrOfResults(3); + ISLE isle = new ISLE(clp, reasoner); +// isle.setHeuristic(heuristic); + isle.setMaxNrOfResults(20); isle.setOperator(rop); isle.setMaxExecutionTimeInSeconds(maxExecutionTimeInSeconds); isle.setStartClass(startClass); @@ -260,9 +260,10 @@ // isle.setTerminateOnNoiseReached(true); isle.setIgnoredConcepts(Collections.singleton(cls)); isle.setReplaceSearchTree(true); - isle.setMaxExecutionTimeInSeconds(10); + isle.setMaxExecutionTimeInSeconds(maxExecutionTimeInSeconds); isle.init(); - isle.start();System.exit(1); + isle.start(); + System.exit(1); List<? 
extends EvaluatedDescription> currentlyBestDescriptions = isle.getCurrentlyBestEvaluatedDescriptions(20); for (EvaluatedDescription description : currentlyBestDescriptions) { System.out.println(description); Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetricTest.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetricTest.java (rev 0) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetricTest.java 2013-12-10 13:47:53 UTC (rev 4205) @@ -0,0 +1,83 @@ +/** + * + */ +package org.dllearner.algorithms.isle.metrics; + +import static org.junit.Assert.fail; + +import java.io.BufferedInputStream; +import java.io.InputStream; +import java.net.URL; + +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.dllearner.algorithms.isle.index.Index; +import org.dllearner.algorithms.isle.index.syntactic.SolrSyntacticIndex; +import org.dllearner.core.owl.Entity; +import org.dllearner.core.owl.NamedClass; +import org.dllearner.core.owl.ObjectProperty; +import org.junit.Test; +import org.semanticweb.owlapi.apibinding.OWLManager; +import org.semanticweb.owlapi.model.OWLOntology; + +/** + * @author Lorenz Buehmann + * + */ +public class PMIRelevanceMetricTest { + + AbstractRelevanceMetric metric; + static final String solrServerURL = "http://[2001:638:902:2010:0:168:35:138]:8080/solr/en_dbpedia_resources/"; + static final String searchField = "comment"; + static final String DBPEDIA_NS = "http://dbpedia.org/ontology/"; + + /** + * + */ + public PMIRelevanceMetricTest() { + OWLOntology ontology = null; + try { + URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); + InputStream is = new BufferedInputStream(url.openStream()); + CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is); + ontology = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); + } catch (Exception e){ + e.printStackTrace(); + } + Index index = new SolrSyntacticIndex(ontology, solrServerURL, searchField); + metric = new PMIRelevanceMetric(index); + } + + /** + * Test method for {@link org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric#getRelevance(org.dllearner.core.owl.Entity, org.dllearner.core.owl.Entity)}. + */ + @Test + public void testGetRelevance() { + //dbo:Person and dbo:Film + Entity entity1 = new NamedClass(DBPEDIA_NS + "Person"); + Entity entity2 = new NamedClass(DBPEDIA_NS + "Film"); + double relevance = metric.getRelevance(entity1, entity2); + System.out.println(relevance); + + //dbo:Person and dbo:Animal + entity1 = new NamedClass(DBPEDIA_NS + "Person"); + entity2 = new NamedClass(DBPEDIA_NS + "Animal"); + relevance = metric.getRelevance(entity1, entity2); + System.out.println(relevance); + + // dbo:Person and dbo:Animal + entity1 = new NamedClass(DBPEDIA_NS + "Person"); + entity2 = new ObjectProperty(DBPEDIA_NS + "birthPlace"); + relevance = metric.getRelevance(entity1, entity2); + System.out.println(relevance); + } + + /** + * Test method for {@link org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric#getNormalizedRelevance(org.dllearner.core.owl.Entity, org.dllearner.core.owl.Entity)}. 
+ */ + @Test + public void testGetNormalizedRelevance() { + fail("Not yet implemented"); + } + +}
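Besides the new PMI test, the interesting part of r4205 is the sampling cleanup: filter(...) drops triples pointing outside the DBpedia ontology namespace, and cleanUpModel(...) shortens string literals, removes gYear/gYearMonth values and strips wiki-housekeeping properties before the Jena model is converted into an OWL ontology. A compact, self-contained variant of the namespace test for rdf:type triples is sketched below; the isURIResource() guard is an added safety check, since rdf:type objects in a sampled model are not guaranteed to be URI resources:

    import java.util.ArrayList;
    import java.util.List;

    import com.hp.hpl.jena.rdf.model.Model;
    import com.hp.hpl.jena.rdf.model.Statement;
    import com.hp.hpl.jena.vocabulary.RDF;

    public class NamespaceFilterSketch {
        // remove rdf:type triples whose class lies outside the given namespace
        public static void filterTypes(Model model, String namespace) {
            List<Statement> toRemove = new ArrayList<Statement>();
            for (Statement st : model.listStatements().toList()) {
                if (st.getPredicate().equals(RDF.type)
                        && st.getObject().isURIResource()
                        && !st.getObject().asResource().getURI().startsWith(namespace)) {
                    toRemove.add(st);
                }
            }
            model.remove(toRemove);
        }
    }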
From: <lor...@us...> - 2013-12-10 13:25:28
Revision: 4204 http://sourceforge.net/p/dl-learner/code/4204 Author: lorenz_b Date: 2013-12-10 13:25:25 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Added SOLR dep. Modified Paths: -------------- trunk/components-core/pom.xml Modified: trunk/components-core/pom.xml =================================================================== --- trunk/components-core/pom.xml 2013-12-10 13:16:47 UTC (rev 4203) +++ trunk/components-core/pom.xml 2013-12-10 13:25:25 UTC (rev 4204) @@ -325,6 +325,11 @@ <groupId>com.h2database</groupId> <artifactId>h2</artifactId> </dependency> + <dependency> + <groupId>org.apache.solr</groupId> + <artifactId>solr-solrj</artifactId> + <version>4.4.0</version> + </dependency> </dependencies> <dependencyManagement> <dependencies>
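With solr-solrj 4.4.0 on the classpath, the lookups needed by the syntactic index reduce to short SolrJ round trips. A minimal smoke test against such a core might look like the following; the server URL and query term are placeholders:

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.SolrServer;
    import org.apache.solr.client.solrj.impl.HttpSolrServer;
    import org.apache.solr.client.solrj.response.QueryResponse;

    public class SolrSmokeTest {
        public static void main(String[] args) throws Exception {
            // point at the Solr core holding the indexed abstracts/comments
            SolrServer solr = new HttpSolrServer(
                    "http://localhost:8983/solr/en_dbpedia_resources");
            // count documents whose search field mentions a term
            SolrQuery query = new SolrQuery("comment:person");
            query.setRows(0); // hit count only, skip document transfer
            QueryResponse response = solr.query(query);
            System.out.println("matches: " + response.getResults().getNumFound());
        }
    }

The setRows(0) idiom mirrors what the index code itself does when it only needs getNumFound() rather than the documents themselves.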
From: <lor...@us...> - 2013-12-10 13:16:50
Revision: 4203 http://sourceforge.net/p/dl-learner/code/4203 Author: lorenz_b Date: 2013-12-10 13:16:47 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Added SOLR based syntactic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -112,14 +112,14 @@ // OWLClassExpression owlapiDescription = OWLAPIConverter.getOWLAPIDescription(expression); // Set<Entity> entities = OWLAPIConverter.getEntities(owlapiDescription.getSignature()); Set<Entity> entities = expression.getSignature(); - double sum = 0; - for (Entity entity : entities) { - double relevance = entityRelevance.containsKey(entity) ? entityRelevance.get(entity) : 0;//System.out.println(entity + ":" + relevance); - if(!Double.isInfinite(relevance)){ - sum += relevance; - } - } - score += nlpBonusFactor * sum; +// double sum = 0; +// for (Entity entity : entities) { +// double relevance = entityRelevance.containsKey(entity) ?
entityRelevance.get(entity) : 0;//System.out.println(entity + ":" + relevance); +// if(!Double.isInfinite(relevance)){ +// sum += relevance; +// } +// } +// score += nlpBonusFactor * sum; return score; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -74,15 +74,15 @@ // System.out.println(tree.headTerminal(headFinder)); head = tree.headTerminal(headFinder).toString(); - // Create a reusable pattern object - TregexPattern patternMW = TregexPattern.compile("__ >># NP"); - // Run the pattern on one particular tree - TregexMatcher matcher = patternMW.matcher(tree); - // Iterate over all of the subtrees that matched - while (matcher.findNextMatchingNode()) { - Tree match = matcher.getMatch(); - // do what we want to with the subtree - } +// // Create a reusable pattern object +// TregexPattern patternMW = TregexPattern.compile("__ >># NP"); +// // Run the pattern on one particular tree +// TregexMatcher matcher = patternMW.matcher(tree); +// // Iterate over all of the subtrees that matched +// while (matcher.findNextMatchingNode()) { +// Tree match = matcher.getMatch(); +// // do what we want to with the subtree +// } } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -21,11 +21,29 @@ * @return set of documents retrieved based on the given query string */ Set<AnnotatedDocument> getDocuments(Entity entity); + + /** + * Returns a set of documents based on how the underlying index is processing the given + * search string. + * + * @param searchString query specifying the documents to retrieve + * @return set of documents retrieved based on the given query string + */ + long getNumberOfDocumentsFor(Entity entity); + + /** + * Returns a set of documents based on how the underlying index is processing the given + * search string. + * + * @param searchString query specifying the documents to retrieve + * @return set of documents retrieved based on the given query string + */ + long getNumberOfDocumentsFor(Entity... entities); /** * Returns the total number of documents contained in the index. 
* * @return the total number of documents contained in the index */ - int getTotalNumberOfDocuments(); + long getTotalNumberOfDocuments(); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -5,6 +5,7 @@ import java.util.Set; import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.core.owl.Entity; /** @@ -14,7 +15,7 @@ * @author Lorenz Buehmann * @author Daniel Fleischhacker */ -public class SemanticIndex extends HashMap<Entity, Set<AnnotatedDocument>>{ +public class SemanticIndex extends HashMap<Entity, Set<AnnotatedDocument>> implements Index{ private int nrOfDocuments; @@ -49,11 +50,33 @@ this.nrOfDocuments = nrOfDocuments; } - /** - * @return the nrOfDocuments + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() */ - public int getTotalNrOfDocuments() { + @Override + public long getTotalNumberOfDocuments() { return nrOfDocuments; } + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity) + */ + @Override + public long getNumberOfDocumentsFor(Entity entity) { + return getDocuments(entity).size(); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[]) + */ + @Override + public long getNumberOfDocumentsFor(Entity... 
entities) { + + Set<AnnotatedDocument> documents = getDocuments(entities[0]); + for (int i = 1; i < entities.length; i++) { + documents.retainAll(getDocuments(entities[i])); + } + return 0; + } + } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -80,7 +80,7 @@ for (Token token : tokens) { try { Query query = parser.parse(token.getRawForm()); - ScoreDoc[] result = searcher.search(query, getTotalNumberOfDocuments()).scoreDocs; + ScoreDoc[] result = searcher.search(query, indexReader.numDocs()).scoreDocs; for (int i = 0; i < result.length; i++) { Document doc = searcher.doc(result[i].doc); documents.add(new AnnotatedTextDocument( @@ -102,7 +102,7 @@ * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() */ @Override - public int getTotalNumberOfDocuments() { + public long getTotalNumberOfDocuments() { return indexReader.numDocs(); } @@ -120,5 +120,21 @@ return documents; } + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity) + */ + @Override + public long getNumberOfDocumentsFor(Entity entity) { + return 0; + } + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[]) + */ + @Override + public long getNumberOfDocumentsFor(Entity... entities) { + return 0; + } + + } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -0,0 +1,122 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index.syntactic; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RiotReader; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; 
+import org.apache.lucene.util.Version; + +import com.hp.hpl.jena.graph.Triple; + +/** + * Creates a Lucene Index for the labels if classes and properties. + * @author Lorenz Buehmann + * + */ +public class NTriplesFileLuceneSyntacticIndexCreator { + + public NTriplesFileLuceneSyntacticIndexCreator(InputStream nTriplesStream, String indexPath, String searchField) throws IOException { + //setup the index + Directory directory = FSDirectory.open(new File(indexPath)); + + //setup the index analyzer + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); + indexWriterConfig.setRAMBufferSizeMB(1024.0); + indexWriterConfig.setOpenMode(OpenMode.CREATE); + IndexWriter writer = new IndexWriter(directory, indexWriterConfig); + + System.out.println( "Creating index ..." ); + + // setup the index fields, here two fields, for URI and text + FieldType stringType = new FieldType(StringField.TYPE_STORED); + stringType.setStoreTermVectors(false); + FieldType textType = new FieldType(TextField.TYPE_STORED); + textType.setStoreTermVectors(false); + + Set<Document> documents = new HashSet<Document>(); + + Iterator<Triple> iterator = RiotReader.createIteratorTriples(nTriplesStream, Lang.NTRIPLES, null); + + Triple triple; + String text; + String uri; + Document doc; + int i = 0; + while(iterator.hasNext()){ + triple = iterator.next(); + + uri = triple.getSubject().getURI(); + text = triple.getObject().getLiteralLexicalForm(); + + doc = new Document(); + doc.add(new Field("uri", uri, stringType)); + doc.add(new Field(searchField, text, textType)); + + writer.addDocument(doc); + if(i++ % 10000 == 0){ +// writer.commit(); + System.out.println(i); + } + + } + + writer.commit(); + writer.close(); + } + + public static void main(String[] args) throws Exception { + String indexFile = "/home/me/Documents/short_abstracts_en.nt"; +// indexFile = "/tmp/test.nt"; + String indexPath = "/home/me/Documents/dbpedia/short_abstracts_index"; +// indexPath = "/tmp/index"; + String field = "text"; + new NTriplesFileLuceneSyntacticIndexCreator(new FileInputStream(indexFile), indexPath, field); + + IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath))); + IndexSearcher searcher = new IndexSearcher(reader); + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + + QueryParser parser = new QueryParser(Version.LUCENE_43, field, analyzer); + Query query = parser.parse("film AND direction"); + + TopDocs docs = searcher.search(query, 10); + ScoreDoc[] scoreDocs = docs.scoreDocs; + + for (int i = 0; i < scoreDocs.length; i++) { + Document doc = searcher.doc(scoreDocs[i].doc); + System.out.println(doc.get(field)); + + } + } + + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -0,0 +1,176 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index.syntactic; + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServer; 
+import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.impl.HttpSolrServer; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.AnnotatedTextDocument; +import org.dllearner.algorithms.isle.index.Index; +import org.dllearner.algorithms.isle.index.Token; +import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + +import com.google.common.base.Joiner; + +/** + * @author Lorenz Buehmann + * + */ +public class SolrSyntacticIndex implements Index{ + + private SolrServer solr; + private AnnotationEntityTextRetriever textRetriever; + private String searchField; + + long totalNumberOfDocuments = -1; + + public SolrSyntacticIndex(OWLOntology ontology, String solrServerURL, String searchField) { + this.searchField = searchField; + solr = new HttpSolrServer(solrServerURL); + textRetriever = new RDFSLabelEntityTextRetriever(ontology); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getDocuments(org.dllearner.core.owl.Entity) + */ + @Override + public Set<AnnotatedDocument> getDocuments(Entity entity) { + Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(); + + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + for (Token token : tokens) { + SolrQuery query = new SolrQuery(searchField + ":" + token.getRawForm()); + query.setRows(Integer.MAX_VALUE);//can be very slow + try { + QueryResponse response = solr.query(query); + SolrDocumentList list = response.getResults(); + System.out.println(list.getNumFound()); + for (SolrDocument doc : list) { + String uri = (String) doc.getFieldValue("uri"); + String comment = (String) doc.getFieldValue(searchField); + + documents.add(new AnnotatedTextDocument( + TextDocumentGenerator.getInstance().generateDocument((String) doc.getFieldValue(searchField)), + Collections.EMPTY_SET)); + } + } catch (SolrServerException e) { + e.printStackTrace(); + } + } + } + return documents; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() + */ + @Override + public long getTotalNumberOfDocuments() { + if(totalNumberOfDocuments == -1){ + SolrQuery q = new SolrQuery("*:*"); + q.setRows(0); // don't actually request any data + try { + totalNumberOfDocuments = solr.query(q).getResults().getNumFound(); + } catch (SolrServerException e) { + e.printStackTrace(); + } + } + return totalNumberOfDocuments; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity) + */ + @Override + public long getNumberOfDocumentsFor(Entity entity) { + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + String queryString = "("; + Set<String> terms = new HashSet<>(); + for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + String phrase = ""; + for (Token token : tokens) { +// terms.add(token.getRawForm()); + phrase += 
token.getRawForm() + " "; + } + phrase.trim(); + terms.add(phrase); + } + queryString += Joiner.on("OR").join(terms); + queryString += ")"; + + SolrQuery query = new SolrQuery(searchField + ":" + queryString);System.out.println(query); + try { + QueryResponse response = solr.query(query); + SolrDocumentList list = response.getResults(); + return list.getNumFound(); + } catch (SolrServerException e) { + e.printStackTrace(); + } + return -1; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[]) + */ + @Override + public long getNumberOfDocumentsFor(Entity... entities) { + + Set<String> queryStringParts = new HashSet<>(); + + for (Entity entity : entities) { + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + String queryString = "("; + Set<String> terms = new HashSet<>(); + for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + String phrase = ""; + for (Token token : tokens) { +// terms.add(token.getRawForm()); + phrase += token.getRawForm() + " "; + } + phrase.trim(); + terms.add(phrase); + } + queryString += Joiner.on("OR").join(terms); + queryString += ")"; + queryStringParts.add(queryString); + } + + String queryStringConjuction = "(" + Joiner.on("AND").join(queryStringParts) + ")"; + + + SolrQuery query = new SolrQuery(searchField + ":" + queryStringConjuction);System.out.println(query); + try { + QueryResponse response = solr.query(query); + SolrDocumentList list = response.getResults(); + return list.getNumFound(); + } catch (SolrServerException e) { + e.printStackTrace(); + } + return -1; + } + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -6,7 +6,7 @@ import java.util.HashMap; import java.util.Map; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; +import org.dllearner.algorithms.isle.index.Index; import org.semanticweb.owlapi.model.OWLEntity; /** @@ -15,9 +15,9 @@ */ public abstract class AbstractRelevanceMetric implements RelevanceMetric { - protected SemanticIndex index; + protected Index index; - public AbstractRelevanceMetric(SemanticIndex index) { + public AbstractRelevanceMetric(Index index) { this.index = index; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -6,7 +6,7 @@ import java.util.Set; import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.core.owl.Entity; import com.google.common.collect.Sets; @@ -17,21 +17,22 @@ */ public class PMIRelevanceMetric extends AbstractRelevanceMetric { - public PMIRelevanceMetric(SemanticIndex 
index) { + public PMIRelevanceMetric(Index index) { super(index); } @Override public double getRelevance(Entity entityA, Entity entityB){ - Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); - Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); - Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); - int nrOfDocuments = index.getTotalNrOfDocuments(); + long nrOfDocumentsA = index.getNumberOfDocumentsFor(entityA); + long nrOfDocumentsB = index.getNumberOfDocumentsFor(entityB); + long nrOfDocumentsAB = index.getNumberOfDocumentsFor(entityA, entityB); - double pA = nrOfDocuments == 0 ? 0 : ((double) documentsA.size() / (double) nrOfDocuments); - double pB = nrOfDocuments == 0 ? 0 : ((double) documentsB.size() / (double) nrOfDocuments); - double pAB = nrOfDocuments == 0 ? 0 : ((double) documentsAB.size() / (double) nrOfDocuments); + long nrOfDocuments = index.getTotalNumberOfDocuments(); + double pA = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsA / (double) nrOfDocuments); + double pB = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsB / (double) nrOfDocuments); + double pAB = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsAB / (double) nrOfDocuments); + double pmi = Math.log(pAB / pA * pB); return pmi; @@ -42,7 +43,7 @@ Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); - int nrOfDocuments = index.getTotalNrOfDocuments(); + long nrOfDocuments = index.getTotalNumberOfDocuments(); // System.out.println("A:" + documentsA.size()); // System.out.println("B:" + documentsB.size()); // System.out.println("AB:" + documentsAB.size()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
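A review note on the two patches above: in SolrSyntacticIndex, Joiner.on("OR") concatenates terms with no surrounding spaces (yielding e.g. "fooORbar"), and phrase.trim() discards its result since Java strings are immutable. In PMIRelevanceMetric, Math.log(pAB / pA * pB) parses left-to-right as (pAB / pA) * pB, whereas pointwise mutual information is conventionally log(pAB / (pA * pB)). A minimal sketch of the conventional formula from document counts — class and method names here are illustrative, not from the repository:

// Sketch: standard PMI from document counts, as the patched metric
// appears to intend. Illustrative names only, not repository code.
public class PmiSketch {

    public static double pmi(long docsA, long docsB, long docsAB, long totalDocs) {
        if (totalDocs == 0 || docsA == 0 || docsB == 0) {
            return 0d; // undefined for empty corpora or unseen entities
        }
        double pA = (double) docsA / totalDocs;
        double pB = (double) docsB / totalDocs;
        double pAB = (double) docsAB / totalDocs; // 0 here gives NEGATIVE_INFINITY (never co-occur)
        return Math.log(pAB / (pA * pB)); // note the parentheses around pA * pB
    }
}

Joining with Joiner.on(" OR ") (spaces included) would also keep the generated Solr query syntactically valid.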
From: <dfl...@us...> - 2013-12-10 12:52:55
Revision: 4202 http://sourceforge.net/p/dl-learner/code/4202 Author: dfleischhacker Date: 2013-12-10 12:52:52 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Use hyponyms for creating the token tree Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-12-10 09:56:55 UTC (rev 4201) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-12-10 12:52:52 UTC (rev 4202) @@ -3,8 +3,7 @@ import net.didion.jwnl.JWNL; import net.didion.jwnl.JWNLException; import net.didion.jwnl.data.*; -import net.didion.jwnl.data.list.PointerTargetNode; -import net.didion.jwnl.data.list.PointerTargetNodeList; +import net.didion.jwnl.data.list.*; import net.didion.jwnl.dictionary.Dictionary; import java.io.InputStream; @@ -49,6 +48,13 @@ public static void main(String[] args) { System.out.println(new WordNet().getBestSynonyms(POS.VERB, "learn")); System.out.println(new WordNet().getSisterTerms(POS.NOUN, "actress")); + System.out.println("Hypernyms **************************"); + System.out.println(new WordNet().getHypernyms(POS.NOUN, "man")); + System.out.println("Hyponyms ****************************"); + System.out.println(new WordNet().getHyponyms(POS.NOUN, "god")); + System.out.println("Words for first synset **************************"); + System.out.println(new WordNet().getWordsForFirstSynset(POS.NOUN, "man")); + } public List<String> getBestSynonyms(POS pos, String s) { @@ -178,6 +184,103 @@ } /** + * Returns a list of lemmas for the most frequent synset of the given word. + * @param word word to get synonyms for + * @param pos POS of the word to look up + * @return list of lemmas of the most frequent synset + */ + public List<String> getWordsForFirstSynset(POS pos, String word) { + List<String> result = new ArrayList<>(); + IndexWord indexWord = null; + Synset sense = null; + + try { + indexWord = dict.getIndexWord(pos, word); + sense = indexWord.getSense(1); + for (Word w : sense.getWords()) { + result.add(w.getLemma()); + } + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + + return result; + } + + /** + * Returns a list of words being lemmas of a most frequent synset for the given word or one of its hypernyms. 
+ */ + public List<String> getHypernyms(POS pos, String word) { + List<String> result = new ArrayList<>(); + + IndexWord indexWord; + Synset sense; + + try { + indexWord = dict.getIndexWord(pos, word); + if (indexWord == null) { + return result; + } + sense = indexWord.getSense(1); + for (Word w : sense.getWords()) { + result.add(w.getLemma()); + } + PointerTargetNodeList target = PointerUtils.getInstance().getDirectHypernyms(sense); + while (target != null && !target.isEmpty()) { + for (int i = 0; i < target.size(); i++) { + Synset s = ((PointerTargetNode) target.get(i)).getSynset(); + for (Word w : sense.getWords()) { + result.add(w.getLemma()); + } + } + target = PointerUtils.getInstance().getDirectHyponyms(((PointerTargetNode) target.get(0)).getSynset()); + System.out.println(target); + } + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + + return result; + } + + public List<String> getHyponyms(POS pos, String s) { + ArrayList<String> result = new ArrayList<>(); + try { + IndexWord word = dict.getIndexWord(pos, s); + if (word == null) { + System.err.println("Unable to find index word for " + s); + return result; + } + Synset sense = word.getSense(1); + getHyponymsRecursive(result, sense, 3); + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + return result; + } + + public void getHyponymsRecursive(List<String> lemmas, Synset sense, int depthToGo) { + for (Word w : sense.getWords()) { + lemmas.add(w.getLemma()); + } + if (depthToGo == 0) { + return; + } + try { + PointerTargetNodeList directHyponyms = PointerUtils.getInstance().getDirectHyponyms(sense); + for (Object directHyponym : directHyponyms) { + getHyponymsRecursive(lemmas, ((PointerTargetNode) directHyponym).getSynset(), depthToGo - 1); + } + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + } + + /** * Funktion returns a List of Hypo and Hypernyms of a given string * * @param s Word for which you want to get Hypo and Hypersyms Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-10 09:56:55 UTC (rev 4201) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-10 12:52:52 UTC (rev 4202) @@ -97,10 +97,25 @@ } /** - * Returns an array of all synonyms for the given word. Only synonyms for the POS in {@link #RELEVANT_POS} are - * returned. + * Iterates through the hypernym tree for the given word at the given POS and returns a list of all lemmas of the + * most frequent synsets visited during traversing the tree. + * @param word word to get hypernyms for + * @param pos POS to get hypernyms for + * @return list of all lemmas of all hypernyms for the given word + */ + public String[] getAllHyponymsForWord(String word, POS pos) { + ArrayList<String> hyponyms = new ArrayList<>(); + + hyponyms.addAll(wn.getHyponyms(pos, word)); + + return hyponyms.toArray(new String[hyponyms.size()]); + } + + /** + * Returns an array of all synonyms for the given word for the given POS. 
* * @param word the word to retrieve synonyms for + * @param pos POS to retrieve synonyms for * @return synonyms for the given word */ public String[] getSynonymsForWord(String word, POS pos) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 09:56:55 UTC (rev 4201) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 12:52:52 UTC (rev 4202) @@ -88,14 +88,16 @@ if (wordnetPos == null) { continue; } - String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); + //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); + String[] synonyms = LinguisticUtil.getInstance().getAllHyponymsForWord(t.getRawForm(), wordnetPos); for (String synonym : synonyms) { // ignore all multi word synonyms if (synonym.contains("_")) { continue; } - t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); + //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); + t.addAlternativeForm(synonym); } } } @@ -111,7 +113,7 @@ @Override public Set<Entity> getCandidateEntities(List<Token> tokens) { - return tree.get(tokens); + return tree.getAllEntities(tokens); } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-10 09:56:55 UTC (rev 4201) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-10 12:52:52 UTC (rev 4202) @@ -28,7 +28,9 @@ } public Set<Entity> getCandidates(Annotation annotation) { - return candidatesTrie.getCandidateEntities(annotation.getTokens()); + Set<Entity> candidateEntities = candidatesTrie.getCandidateEntities(annotation.getTokens()); + System.out.println(annotation + " --> " + candidateEntities); + return candidateEntities; } /** Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2013-12-10 09:56:55 UTC (rev 4201) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2013-12-10 12:52:52 UTC (rev 4202) @@ -3,16 +3,8 @@ */ package org.dllearner.algorithms.isle; -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.HashSet; -import java.util.Set; - +import com.google.common.collect.Sets; +import com.hp.hpl.jena.rdf.model.Model; import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorInputStream; import org.apache.commons.compress.compressors.CompressorStreamFactory; @@ -23,8 +15,11 @@ import org.semanticweb.owlapi.model.OWLOntologyCreationException; import org.semanticweb.owlapi.model.OWLOntologyManager; -import 
com.google.common.collect.Sets; -import com.hp.hpl.jena.rdf.model.Model; +import java.io.*; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashSet; +import java.util.Set; /** * @author Lorenz Buehmann @@ -92,7 +87,7 @@ maxNrOfInstancesPerClass)); documents.clear(); - documents.add("Thomas Cruise Mapother IV, widely known as Tom Cruise, is an American film actor and producer. He has been nominated for three Academy Awards and has won three Golden Globe Awards. He started his career at age 19 in the 1981 film Taps. His first leading role was in Risky Business, released in August 1983. Cruise became a full-fledged movie star after starring in Top Gun (1986). He is well known for his role as secret agent Ethan Hunt in the Mission: Impossible film series between 1996 and 2011. Cruise has starred in many Hollywood blockbusters, including Rain Man (1988), A Few Good Men (1992), Jerry Maguire (1996), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Tropic Thunder (2008) and Jack Reacher (2012). As of 2012, Cruise is Hollywood's highest-paid actor. Cruise is known for his Scientologist faith and for his support of the Church of Scientology."); + documents.add("Thomas Cruise Mapother IV, widely known as Tom Cruise, is an American film player and producer. He has been nominated for three Academy Awards and has won three Golden Globe Awards. He started his career at age 19 in the 1981 film Taps. His first leading role was in Risky Business, released in August 1983. Cruise became a full-fledged movie star after starring in Top Gun (1986). He is well known for his role as secret agent Ethan Hunt in the Mission: Impossible film series between 1996 and 2011. Cruise has starred in many Hollywood blockbusters, including Rain Man (1988), A Few Good Men (1992), Jerry Maguire (1996), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Tropic Thunder (2008) and Jack Reacher (2012). As of 2012, Cruise is Hollywood's highest-paid actor. Cruise is known for his Scientologist faith and for his support of the Church of Scientology."); return documents; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
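The depth cut-off in getHyponymsRecursive is what keeps this change tractable: without depthToGo, expanding hyponyms of a general noun would drag large parts of WordNet into the trie. (As an aside, getHypernyms above appears to re-add sense.getWords() instead of s.getWords() inside its loop, and to advance via getDirectHyponyms rather than getDirectHypernyms.) The traversal pattern itself, with a toy taxonomy standing in for JWNL so the snippet is self-contained:

import java.util.*;

// Sketch: depth-limited hyponym collection, mirroring getHyponymsRecursive.
// A Map-based toy taxonomy replaces WordNet/JWNL here; in the patch above
// the children come from PointerUtils.getDirectHyponyms(sense).
public class HyponymWalk {

    static final Map<String, List<String>> HYPONYMS = Map.of(
            "entity", List.of("animal", "artifact"),
            "animal", List.of("dog", "cat"),
            "dog", List.of("poodle"));

    static void collect(String word, int depthToGo, List<String> out) {
        out.add(word);          // every visited synset contributes its lemmas
        if (depthToGo == 0) {
            return;             // cut-off keeps the result set manageable
        }
        for (String child : HYPONYMS.getOrDefault(word, List.of())) {
            collect(child, depthToGo - 1, out);
        }
    }

    public static void main(String[] args) {
        List<String> result = new ArrayList<>();
        collect("entity", 2, result);
        System.out.println(result); // [entity, animal, dog, cat, artifact]
    }
}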
From: <dfl...@us...> - 2013-12-10 09:56:58
Revision: 4201 http://sourceforge.net/p/dl-learner/code/4201 Author: dfleischhacker Date: 2013-12-10 09:56:55 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Retrieve entities from all possible leaf nodes in the TokenTree Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-09 15:38:09 UTC (rev 4200) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-10 09:56:55 UTC (rev 4201) @@ -81,7 +81,7 @@ public Set<Entity> get(List<Token> tokens) { TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); + TokenTree nextNode = getNextTokenTree(curNode, t); if (nextNode == null) { return null; } @@ -90,6 +90,25 @@ return curNode.entities; } + public Set<Entity> getAllEntities(List<Token> tokens) { + HashSet<Entity> resEntities = new HashSet<>(); + getAllEntitiesRec(tokens, 0, this, resEntities); + return resEntities; + } + + public void getAllEntitiesRec(List<Token> tokens, int curPosition, TokenTree curTree, HashSet<Entity> resEntities) { + if (curPosition == tokens.size()) { + resEntities.addAll(curTree.entities); + return; + } + Token t = tokens.get(curPosition); + for (Map.Entry<Token, TokenTree> entry : curTree.children.entrySet()) { + if (t.equalsWithAlternativeForms(entry.getKey())) { + getAllEntitiesRec(tokens, curPosition + 1, entry.getValue(), resEntities); + } + } + } + /** * Returns the list of tokens which are the longest match with entities assigned in this tree. * @@ -148,7 +167,7 @@ } /** - * Returns the original token for the longest match + * Returns the original ontology tokens for the longest match */ public List<Token> getOriginalTokensForLongestMatch(List<Token> tokens) { TokenTree fallback = this.entities.isEmpty() ? null : this; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
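The key change in revision 4201 is that lookup no longer follows a single child per token (children.get(t)) but every child matching under equalsWithAlternativeForms, so one token list can reach several entity sets. A stripped-down sketch of that branching walk — plain strings plus a synonym table stand in for the project's Token class:

import java.util.*;

// Sketch: trie lookup that branches on alternative forms, as in
// TokenTree.getAllEntitiesRec above. Strings and a synonym table stand
// in for Token.equalsWithAlternativeForms.
public class BranchingTrie {

    Map<String, BranchingTrie> children = new HashMap<>();
    Set<String> entities = new HashSet<>();

    static final Map<String, Set<String>> ALT = Map.of(
            "movie", Set.of("film"), "film", Set.of("movie"));

    static boolean matches(String a, String b) {
        return a.equals(b) || ALT.getOrDefault(a, Set.of()).contains(b);
    }

    void collect(List<String> tokens, int pos, Set<String> result) {
        if (pos == tokens.size()) {      // consumed all tokens: harvest this node
            result.addAll(entities);
            return;
        }
        for (Map.Entry<String, BranchingTrie> e : children.entrySet()) {
            if (matches(tokens.get(pos), e.getKey())) {
                e.getValue().collect(tokens, pos + 1, result); // follow every match
            }
        }
    }

    public static void main(String[] args) {
        BranchingTrie root = new BranchingTrie();
        BranchingTrie child = new BranchingTrie();
        child.entities.add("dbpedia:Film");
        root.children.put("film", child);
        Set<String> result = new HashSet<>();
        root.collect(List.of("movie"), 0, result);
        System.out.println(result); // [dbpedia:Film] — reached via the alternative form
    }
}

The cost is a fan-out proportional to the number of matching alternatives per level, which stays small once multi-word synonyms are filtered out (revision 4199 below).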
From: <lor...@us...> - 2013-12-09 15:38:13
Revision: 4200 http://sourceforge.net/p/dl-learner/code/4200 Author: lorenz_b Date: 2013-12-09 15:38:09 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Refactored. Modified Paths: -------------- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestNoCorpus.java Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestNoCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestNoCorpus.java 2013-12-09 15:37:39 UTC (rev 4199) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestNoCorpus.java 2013-12-09 15:38:09 UTC (rev 4200) @@ -3,10 +3,10 @@ import java.io.File; import java.util.Map; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.index.syntactic.OWLOntologyLuceneSyntacticIndexCreator; -import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; import org.dllearner.algorithms.isle.metrics.RelevanceMetric; import org.dllearner.algorithms.isle.metrics.RelevanceUtils; @@ -37,7 +37,7 @@ private RelevanceMetric relevance; private String searchField = "label"; private SemanticIndex semanticIndex; - private SyntacticIndex syntacticIndex; + private Index syntacticIndex; // we assume that the ontology is named "ontology.owl" and that all text files // are in a subdirectory called "corpus" This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
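Revision 4200 is mechanical, but it is the piece that lets metric code be written once against the merged Index interface instead of separate semantic and syntactic types. A hypothetical call site in that spirit — the factory method is mine; the PMIRelevanceMetric(Index) constructor is the one introduced in the metric patch above:

import org.dllearner.algorithms.isle.index.Index;
import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric;
import org.dllearner.algorithms.isle.metrics.RelevanceMetric;

// Sketch: after the refactoring, semantic and syntactic indexes are
// interchangeable wherever an Index is expected. Hypothetical helper.
public class MetricFactory {
    public static RelevanceMetric pmiOver(Index anyIndex) {
        return new PMIRelevanceMetric(anyIndex); // works for either index flavour
    }
}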
From: <dfl...@us...> - 2013-12-09 15:37:42
Revision: 4199 http://sourceforge.net/p/dl-learner/code/4199 Author: dfleischhacker Date: 2013-12-09 15:37:39 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Cleanup and show alternative names Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -1,31 +0,0 @@ -package org.dllearner.algorithms.isle.index; - -import org.dllearner.core.owl.Entity; - -import java.util.HashSet; -import java.util.Set; - -/** - * A pair consisting of a full string token and the corresponding entities - */ -public class FullTokenEntitySetPair { - private String fullToken; - private Set<Entity> entitySet; - - public FullTokenEntitySetPair(String fullToken) { - this.fullToken = fullToken; - this.entitySet = new HashSet<Entity>(); - } - - public String getFullToken() { - return fullToken; - } - - public Set<Entity> getEntitySet() { - return entitySet; - } - - public void addEntity(Entity entity) { - entitySet.add(entity); - } -} Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -1,141 +0,0 @@ -package org.dllearner.algorithms.isle.index; - -import java.util.ArrayList; - -/** - * Provides text normalization and mapping of normalized ranges to the original ones. 
- */ -public class NormalizedTextMapper { - private Document originalDocument; - private String originalText; - private String normalizedText; - - private ArrayList<OccurenceMappingPair> normalizedIndexToOriginalIndex; - - public NormalizedTextMapper(Document original) { - this.originalDocument = original; - this.originalText = original.getContent(); - this.normalizedIndexToOriginalIndex = new ArrayList<OccurenceMappingPair>(); - - StringBuilder sb = new StringBuilder(); - int currentOriginalIndex = 0; - for (String originalWord : originalText.split(" ")) { - String normalizedWord = getNormalizedWord(originalWord); - normalizedIndexToOriginalIndex - .add(new OccurenceMappingPair(currentOriginalIndex, originalWord.length(), sb.length(), - normalizedWord.length())); - currentOriginalIndex += originalWord.length() + 1; - sb.append(normalizedWord); - sb.append(" "); - } - normalizedText = sb.toString(); - } - - public String getOriginalText() { - return originalText; - } - - public String getNormalizedText() { - return normalizedText; - } - - /** - * Returns the annotation for the original text matching the given position and length in the normalized - * text. - * - * @param position position in the normalized text to get annotation for - * @param length length of the text to get annotation for - * @return - */ - public Annotation getOriginalAnnotationForPosition(int position, int length) { - int curNormalizedLength = 0; - int originalStart = -1; - int curOriginalLength = 0; - - for (OccurenceMappingPair p : normalizedIndexToOriginalIndex) { - if (p.getNormalizedIndex() == position) { - originalStart = p.getOriginalIndex(); - } - if (originalStart != -1) { - curNormalizedLength += p.getNormalizedLength(); - curOriginalLength += p.getOriginalLength(); - if (curNormalizedLength >= length) { - //TODO refactoring -// return new Annotation(originalDocument, originalStart, curOriginalLength); - } - - // include space - curNormalizedLength += 1; - curOriginalLength += 1; - } - } - - return null; - } - - /** - * Returns the normalized form of the given word. Word must not contain any spaces or the like. - * @param word - * @return - */ - private String getNormalizedWord(String word) { - return LinguisticUtil.getInstance().getNormalizedForm(word); - } - - public static void main(String[] args) { -// NormalizedTextMapper n = new NormalizedTextMapper(new TextDocument("This is a testing text using letters")); -// System.out.println(n.getOriginalText()); -// System.out.println(n.getNormalizedText()); -// for (OccurenceMappingPair p : n.normalizedIndexToOriginalIndex) { -// System.out.println(p); -// } -// System.out.println(n.getOriginalAnnotationForPosition(7,6)); -// System.out.println(n.getOriginalAnnotationForPosition(23,6)); -// System.out.println(n.getOriginalAnnotationForPosition(7,1)); -// System.out.println(n.getOriginalAnnotationForPosition(14,15)); - } - - /** - * Maps words identified by index and length in the normalized texts to the original word. 
- */ - private class OccurenceMappingPair { - private int originalIndex; - private int originalLength; - private int normalizedIndex; - private int normalizedLength; - - private OccurenceMappingPair(int originalIndex, int originalLength, int normalizedIndex, int normalizedLength) { - - this.originalIndex = originalIndex; - this.originalLength = originalLength; - this.normalizedIndex = normalizedIndex; - this.normalizedLength = normalizedLength; - } - - private int getNormalizedIndex() { - return normalizedIndex; - } - - private int getNormalizedLength() { - return normalizedLength; - } - - private int getOriginalLength() { - return originalLength; - } - - private int getOriginalIndex() { - return originalIndex; - } - - @Override - public String toString() { - return "OccurenceMappingPair{" + - "originalIndex=" + originalIndex + - ", originalLength=" + originalLength + - ", normalizedIndex=" + normalizedIndex + - ", normalizedLength=" + normalizedLength + - '}'; - } - } -} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -91,6 +91,10 @@ String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); for (String synonym : synonyms) { + // ignore all multi word synonyms + if (synonym.contains("_")) { + continue; + } t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -138,7 +138,7 @@ */ @Override public String toString() { - return "[Word: " + rawForm + " | Stemmed word: " + stemmedForm + " | POS tag: " + posTag + "]"; + return "[Word: " + rawForm + " | Stemmed word: " + stemmedForm + " | POS tag: " + posTag + " | Alternatives: " + alternativeForms.toString() + "]"; } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -1,15 +1,15 @@ package org.dllearner.algorithms.isle.index; import com.google.common.collect.Lists; -import com.google.common.collect.Sets; - import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.StopWordFilter; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; -import java.util.*; -import java.util.regex.Pattern; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Set; /** * Generates candidates using a entity candidates prefix trie @@ -34,7 +34,6 @@ 
/** * Postprocess the annotations generated by annotate * The objective is to merge annotations which are likely to belong to the same entity - * @param annotations : set of annotations * @param window : maximum distance between the annotations * @return */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
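One detail worth keeping from revision 4199: WordNet encodes multi-word lemmas with underscores ("movie_star"), and the new synonym.contains("_") guard drops them because a multi-word lemma can never match a single token in the trie. The filtering step in isolation — the input array stands in for what LinguisticUtil.getSynonymsForWord returns:

import java.util.ArrayList;
import java.util.List;

// Sketch: keep only single-word WordNet lemmas as alternative forms,
// mirroring the guard added in SimpleEntityCandidatesTrie above.
public class SynonymFilter {

    public static List<String> singleWordOnly(String[] synonyms) {
        List<String> kept = new ArrayList<>();
        for (String s : synonyms) {
            if (!s.contains("_")) { // "movie_star" etc. cannot match one token
                kept.add(s);
            }
        }
        return kept;
    }

    public static void main(String[] args) {
        System.out.println(singleWordOnly(new String[]{"film", "movie_star", "actor"}));
        // -> [film, actor]
    }
}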
From: <lor...@us...> - 2013-12-09 15:36:42
Revision: 4198 http://sourceforge.net/p/dl-learner/code/4198 Author: lorenz_b Date: 2013-12-09 15:36:38 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added syntactic index. Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 15:35:10 UTC (rev 4197) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 15:36:38 UTC (rev 4198) @@ -1,122 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle.index.syntactic; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.SimpleFSDirectory; -import org.apache.lucene.util.Version; -import org.dllearner.algorithms.isle.index.Index; -import org.dllearner.algorithms.isle.index.TextDocument; - -/** - * Creates a syntactic index from text files stored on disk - * - */ -public class TextDocumentSyntacticIndexCreator { - - private Directory indexDirectory; - private final File inputDirectory; - private final static String searchField = "text"; - - public TextDocumentSyntacticIndexCreator(File inputDirectory, File indexDirectory) - throws IOException { - this.indexDirectory = new SimpleFSDirectory(indexDirectory); - this.inputDirectory = inputDirectory; - } - - public Index buildIndex() throws Exception{ - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); - IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); - System.out.println( "Creating index ..." 
); - - Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>(); - FieldType stringType = new FieldType(StringField.TYPE_STORED); - stringType.setStoreTermVectors(false); - FieldType textType = new FieldType(TextField.TYPE_STORED); - textType.setStoreTermVectors(false); - - for (File f : inputDirectory.listFiles()) { - if (!f.getName().endsWith(".txt")) { - continue; - } - org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); - luceneDocument.add(new Field("uri", f.toURI().toString(), stringType)); - - StringBuilder content = new StringBuilder(); - BufferedReader reader = new BufferedReader(new FileReader(f)); - - String line; - while ((line = reader.readLine()) != null) { - content.append(line); - content.append("\n"); - } - reader.close(); - - luceneDocument.add(new Field(searchField, content.toString(), textType)); - luceneDocuments.add(luceneDocument); - } - writer.addDocuments(luceneDocuments); - - System.out.println("Done."); - writer.close(); - - return new LuceneSyntacticIndex(indexDirectory, searchField); - } - - public Index buildIndex(Set<TextDocument> documents) throws Exception{ - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); - IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); - System.out.println( "Creating index ..." ); - - Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>(); - FieldType stringType = new FieldType(StringField.TYPE_STORED); - stringType.setStoreTermVectors(false); - FieldType textType = new FieldType(TextField.TYPE_STORED); - textType.setStoreTermVectors(false); - - int id = 1; - for (TextDocument document : documents) { - org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); - luceneDocument.add(new Field("uri", Integer.toString(id++), stringType)); - luceneDocument.add(new Field(searchField, document.getContent(), textType)); - luceneDocuments.add(luceneDocument); - } - writer.addDocuments(luceneDocuments); - - System.out.println("Done."); - writer.close(); - - return new LuceneSyntacticIndex(indexDirectory, searchField); - } - - public static Index loadIndex(File indexDirectory) throws Exception { - return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField); - } - - public static void main(String[] args) throws Exception { - if (args.length != 2) { - System.err.println("Usage: <input directory> <index directory>"); - System.exit(1); - return; - } - new TextDocumentSyntacticIndexCreator(new File(args[0]), new File(args[1])).buildIndex(); - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
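The deleted TextDocumentSyntacticIndexCreator is still a useful template for Lucene 4.x bulk indexing: one IndexWriter, stored fields without term vectors, a single batched addDocuments call, then close. Its indexing core, minus the file handling — field names are illustrative; the API calls are the ones used in the original:

import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

// Sketch: the indexing core of the deleted class, with an in-memory
// directory instead of disk and plain strings instead of files.
public class MiniIndexer {

    public static Directory index(Set<String> texts) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43,
                new StandardAnalyzer(Version.LUCENE_43));
        IndexWriter writer = new IndexWriter(dir, config);

        FieldType textType = new FieldType(TextField.TYPE_STORED);
        textType.setStoreTermVectors(false); // stored and analyzed, no term vectors

        Set<Document> docs = new HashSet<>();
        int id = 1;
        for (String text : texts) {
            Document doc = new Document();
            doc.add(new Field("uri", Integer.toString(id++), StringField.TYPE_STORED));
            doc.add(new Field("text", text, textType));
            docs.add(doc);
        }
        writer.addDocuments(docs); // one batched call, as in the original
        writer.close();
        return dir;
    }
}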
Revision: 4197 http://sourceforge.net/p/dl-learner/code/4197 Author: lorenz_b Date: 2013-12-09 15:35:10 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added syntactic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-12-09 15:34:15 UTC (rev 4196) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-12-09 15:35:10 UTC (rev 4197) @@ -93,7 +93,7 @@ System.out.println("Done."); writer.close(); - return new LuceneSyntacticIndex(directory, searchField); + return new LuceneSyntacticIndex(ontology, directory, searchField); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
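Revision 4197 is a one-line follow-up to the constructor change: LuceneSyntacticIndex now needs the ontology so it can resolve entities to label text through its internal RDFSLabelEntityTextRetriever (see revision 4196 below). From the caller's side — "label" as search field matches the setup in ISLETestNoCorpus; the wrapper class is illustrative:

import java.util.Set;

import org.apache.lucene.store.Directory;
import org.dllearner.algorithms.isle.index.AnnotatedDocument;
import org.dllearner.algorithms.isle.index.Index;
import org.dllearner.algorithms.isle.index.syntactic.LuceneSyntacticIndex;
import org.dllearner.core.owl.Entity;
import org.semanticweb.owlapi.model.OWLOntology;

// Sketch: the post-4197 call shape. The index resolves the entity to its
// label tokens internally, so callers pass entities instead of query strings.
public class IndexWiring {
    public static Set<AnnotatedDocument> documentsFor(OWLOntology ontology,
            Directory dir, Entity entity) throws Exception {
        Index index = new LuceneSyntacticIndex(ontology, dir, "label");
        return index.getDocuments(entity);
    }
}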
From: <lor...@us...> - 2013-12-09 15:34:19
Revision: 4196 http://sourceforge.net/p/dl-learner/code/4196 Author: lorenz_b Date: 2013-12-09 15:34:15 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added syntactic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -17,6 +17,8 @@ import edu.stanford.nlp.trees.CollinsHeadFinder; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; import edu.stanford.nlp.util.CoreMap; public class TextDocumentGenerator { @@ -41,6 +43,10 @@ } public TextDocument generateDocument(String text) { + return generateDocument(text, false); + } + + public TextDocument generateDocument(String text, boolean determineHead) { TextDocument document = new TextDocument(); // create an empty Annotation just with the given text Annotation annotatedDocument = new Annotation(text); @@ -53,6 +59,33 @@ List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class); for(CoreMap sentence: sentences) { + + //determine the head noun + String head = null; + if(determineHead){ + //if phrase only contains one single token, the task is trivial + if(sentence.get(TokensAnnotation.class).size() == 1){ + head = sentence.get(TokensAnnotation.class).get(0).get(TextAnnotation.class); + } else { + Tree tree = sentence.get(TreeAnnotation.class); + CollinsHeadFinder headFinder = new CollinsHeadFinder(); +// Tree head = headFinder.determineHead(tree); +// System.out.println(sentence); +// System.out.println(tree.headTerminal(headFinder)); + head = tree.headTerminal(headFinder).toString(); + + // Create a reusable pattern object + TregexPattern patternMW = TregexPattern.compile("__ >># NP"); + // Run the pattern on one particular tree + TregexMatcher matcher = patternMW.matcher(tree); + // Iterate over all of the subtrees that matched + while (matcher.findNextMatchingNode()) { + Tree match = matcher.getMatch(); + // do what we want to with the subtree + } + } + } + for (CoreLabel label: sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = 
label.get(TextAnnotation.class); @@ -71,10 +104,9 @@ Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); - //determine the head noun - Tree tree = sentence.get(TreeAnnotation.class); - CollinsHeadFinder headFinder = new CollinsHeadFinder(); - Tree head = headFinder.determineHead(tree); + if(determineHead && word.equals(head)){ + token.setIsHead(true); + } document.add(token); } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -0,0 +1,31 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +import org.dllearner.core.owl.Entity; + +/** + * @author Lorenz Buehmann + * + */ +public interface Index { + + /** + * Returns a set of documents based on how the underlying index is processing the given + * search string. + * + * @param searchString query specifying the documents to retrieve + * @return set of documents retrieved based on the given query string + */ + Set<AnnotatedDocument> getDocuments(Entity entity); + + /** + * Returns the total number of documents contained in the index. + * + * @return the total number of documents contained in the index + */ + int getTotalNumberOfDocuments(); +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -21,6 +21,7 @@ private String posTag; private boolean isPunctuation; private boolean isStopWord; + private boolean isHead; /// for storing alternative forms of this token, e.g., generated by WordNet synonyms private HashSet<String> alternativeForms; @@ -36,7 +37,7 @@ this.isStopWord = isStopWord; this.alternativeForms = new HashSet<>(); } - + /** * @return the rawForm */ @@ -117,6 +118,20 @@ public void setIsStopWord(boolean isStopWord) { this.isStopWord = isStopWord; } + + /** + * @param wheteher the token is the head of the containg sequence of tokens + */ + public void setIsHead(boolean isHead) { + this.isHead = isHead; + } + + /** + * @return the isHead + */ + public boolean isHead() { + return isHead; + } /* (non-Javadoc) * @see java.lang.Object#toString() Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,15 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import 
org.apache.lucene.index.DirectoryReader; @@ -12,71 +21,88 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TotalHitCountCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.AnnotatedTextDocument; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.Token; +import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * @author Lorenz Buehmann * */ -public class LuceneSyntacticIndex implements SyntacticIndex { +public class LuceneSyntacticIndex implements Index { private IndexSearcher searcher; private QueryParser parser; private IndexReader indexReader; private String searchField; + + AnnotationEntityTextRetriever textRetriever; - public LuceneSyntacticIndex(IndexReader indexReader, String searchField) throws Exception { + public LuceneSyntacticIndex(OWLOntology ontology, IndexReader indexReader, String searchField) throws Exception { this.indexReader = indexReader; this.searchField = searchField; searcher = new IndexSearcher(indexReader); StandardAnalyzer analyzer = new StandardAnalyzer( Version.LUCENE_43); parser = new QueryParser( Version.LUCENE_43, searchField, analyzer ); + + textRetriever = new RDFSLabelEntityTextRetriever(ontology); } - public LuceneSyntacticIndex(Directory directory, String searchField) throws Exception { - this(DirectoryReader.open(directory), searchField); + public LuceneSyntacticIndex(OWLOntology ontology, Directory directory, String searchField) throws Exception { + this(ontology, DirectoryReader.open(directory), searchField); } - public LuceneSyntacticIndex(String indexDirectory, String searchField) throws Exception { - this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField); + public LuceneSyntacticIndex(OWLOntology ontology, String indexDirectory, String searchField) throws Exception { + this(ontology, DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField); } /* (non-Javadoc) * @see org.dllearner.algorithms.isle.SyntacticIndex#getDocuments(java.lang.String) */ @Override - public Set<org.dllearner.algorithms.isle.index.Document> getDocuments(String searchString) { - Set<org.dllearner.algorithms.isle.index.Document> documents = new HashSet<org.dllearner.algorithms.isle.index.Document>(); - try { - Query query = parser.parse(searchString); - ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; - for (int i = 0; i < result.length; i++) { - Document doc = searcher.doc(result[i].doc); - documents.add(TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField))); + public Set<AnnotatedDocument> getDocuments(Entity entity) { + Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(); + + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + for (Entry<List<Token>, Double> entry : 
relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + for (Token token : tokens) { + try { + Query query = parser.parse(token.getRawForm()); + ScoreDoc[] result = searcher.search(query, getTotalNumberOfDocuments()).scoreDocs; + for (int i = 0; i < result.length; i++) { + Document doc = searcher.doc(result[i].doc); + documents.add(new AnnotatedTextDocument( + TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField)), + Collections.EMPTY_SET)); + } + } catch (ParseException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } } - } catch (ParseException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); } + return documents; } /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SyntacticIndex#getSize() + * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() */ @Override - public int getSize() { + public int getTotalNumberOfDocuments() { return indexReader.numDocs(); } @@ -94,22 +120,5 @@ return documents; } - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SyntacticIndex#count(java.lang.String) - */ - @Override - public int count(String searchString) { - try { - Query query = parser.parse(searchString); - TotalHitCountCollector results = new TotalHitCountCollector(); - searcher.search(query, results); - return results.getTotalHits(); - } catch (ParseException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - return -1; - } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,10 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; @@ -14,14 +18,17 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; -import org.semanticweb.owlapi.model.*; +import org.dllearner.algorithms.isle.index.Index; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLDataFactory; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLOntology; import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; + import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * Creates a Lucene Index for the labels if classes and properties. 
* @author Lorenz Buehmann @@ -49,7 +56,7 @@ schemaEntities.addAll(ontology.getDataPropertiesInSignature()); } - public SyntacticIndex buildIndex() throws Exception{ + public Index buildIndex() throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(directory, indexWriterConfig); Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -1,43 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle.index.syntactic; - -import org.dllearner.algorithms.isle.index.Document; - -import java.util.Set; - -/** - * Interface for a syntactic index, e.g., a basic string-based inverted index. - * - * @author Lorenz Buehmann - * @author Daniel Fleischhacker - */ -public interface SyntacticIndex { - - /** - * Returns a set of documents based on how the underlying index is processing the given - * search string. - * - * @param searchString query specifying the documents to retrieve - * @return set of documents retrieved based on the given query string - */ - Set<Document> getDocuments(String searchString); - - /** - * Returns the number of documents based on how the underlying index is processing the - * given search string. - * - * @param searchString query specifying the documents to include in the number of documents - * @return number of documents retrieved based on the given query string - */ - int count(String searchString); - - /** - * Returns the total number of documents contained in the index. 
- * - * @return the total number of documents contained in the index - */ - int getSize(); - -} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,13 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; @@ -14,15 +21,9 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.algorithms.isle.index.TextDocument; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * Creates a syntactic index from text files stored on disk * @@ -39,7 +40,7 @@ this.inputDirectory = inputDirectory; } - public SyntacticIndex buildIndex() throws Exception{ + public Index buildIndex() throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); @@ -79,7 +80,7 @@ return new LuceneSyntacticIndex(indexDirectory, searchField); } - public SyntacticIndex buildIndex(Set<TextDocument> documents) throws Exception{ + public Index buildIndex(Set<TextDocument> documents) throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); @@ -106,7 +107,7 @@ return new LuceneSyntacticIndex(indexDirectory, searchField); } - public static SyntacticIndex loadIndex(File indexDirectory) throws Exception { + public static Index loadIndex(File indexDirectory) throws Exception { return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -46,6 +46,7 @@ private boolean useShortFormFallback = true; private IRIShortFormProvider sfp = new SimpleIRIShortFormProvider(); + protected boolean determineHeadNoun = false; private OWLAnnotationProperty[] properties; @@ -97,7 +98,7 @@ } //remove content in brackets like (...) 
label = label.replaceAll("\\s?\\((.*?)\\)", ""); - textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label, determineHeadNoun), weight); } } } @@ -107,7 +108,7 @@ String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm)); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim(); - textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(shortForm), weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(shortForm, determineHeadNoun), weight); } return textWithWeight; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -34,10 +34,12 @@ public RDFSLabelEntityTextRetriever(OWLOntology ontology) { super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI())); + determineHeadNoun = true; } public RDFSLabelEntityTextRetriever(OWLAPIOntology ontology) { super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI())); + determineHeadNoun = true; } public static void main(String[] args) throws Exception { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
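Revision 4196 folds head-noun detection into TextDocumentGenerator, but two strategies sit interleaved there: Collins head rules over the whole parse, and a Tregex pattern matching nodes that head an NP. Untangled into a standalone helper — the method names are mine, the pattern "__ >># NP" is taken from the patch, and the input Tree is the TreeAnnotation the pipeline already produces:

import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;

// Sketch: the two head-finding strategies from the patch above, separated.
public class HeadNoun {

    // Strategy 1: Collins head rules applied to the whole parse tree.
    public static String collinsHead(Tree parse) {
        return parse.headTerminal(new CollinsHeadFinder()).value();
    }

    // Strategy 2: Tregex — match any node that (transitively) heads an NP.
    private static final TregexPattern NP_HEAD = TregexPattern.compile("__ >># NP");

    public static void printNpHeads(Tree parse) {
        TregexMatcher m = NP_HEAD.matcher(parse);
        while (m.findNextMatchingNode()) {
            System.out.println(m.getMatch().value());
        }
    }
}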
From: <dfl...@us...> - 2013-12-09 14:40:08
Revision: 4195 http://sourceforge.net/p/dl-learner/code/4195 Author: dfleischhacker Date: 2013-12-09 14:40:04 +0000 (Mon, 09 Dec 2013) Log Message: ----------- WordNet alternative forms Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -97,6 +97,20 @@ } /** + * Returns an array of all synonyms for the given word. Only synonyms for the POS in {@link #RELEVANT_POS} are + * returned. + * + * @param word the word to retrieve synonyms for + * @return synonyms for the given word + */ + public String[] getSynonymsForWord(String word, POS pos) { + ArrayList<String> synonyms = new ArrayList<String>(); + + synonyms.addAll(wn.getAllSynonyms(pos, word)); + return synonyms.toArray(new String[synonyms.size()]); + } + + /** * Returns an array of the lemmas of the top {@code n} synonyms for the given word. Only synonyms for the POS in * {@link #RELEVANT_POS} are returned. 
* Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -1,9 +1,8 @@ package org.dllearner.algorithms.isle.index; -import org.apache.commons.lang.StringUtils; +import net.didion.jwnl.data.POS; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; -import org.dllearner.utilities.datastructures.PrefixTrie; import org.semanticweb.owlapi.model.OWLOntology; import java.util.*; @@ -11,7 +10,6 @@ public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { TokenTree tree; - PrefixTrie<FullTokenEntitySetPair> trie; EntityTextRetriever entityTextRetriever; // /** @@ -31,15 +29,13 @@ * * @param entityTextRetriever the text retriever to use * @param ontology the ontology to get strings from - * @param nameGenerator the name generator to use for generating alternative words */ - public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology, - NameGenerator nameGenerator) { + public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { this.entityTextRetriever = entityTextRetriever; - buildTrie(ontology, nameGenerator); + buildTrie(ontology); } - public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { + public void buildTrie(OWLOntology ontology) { this.tree = new TokenTree(); Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); @@ -48,12 +44,9 @@ Entity entity = entry.getKey(); Set<List<Token>> tokenSet = entry.getValue(); for (List<Token> tokens : tokenSet) { + addAlternativeFormsFromWordNet(tokens); addEntry(tokens, entity); addSubsequences(entity, tokens); -// addSubsequencesWordNet(entity, text); -// for (String alternativeText : nameGenerator.getAlternativeText(text)) { -// addEntry(alternativeText.toLowerCase(), entity, text); -// } } } } @@ -76,65 +69,33 @@ } } -// private void addSubsequencesWordNet(Entity entity, String text) { -// if (text.contains(" ")) { -// String[] tokens = text.split(" "); -// -// List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; -// -// // generate list of lemmatized wordnet synonyms for each token -// for (int i = 0; i < tokens.length; i++) { -// wordnetTokens[i] = new ArrayList<String>(); -// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i].toLowerCase())); -// for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { -// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).toLowerCase()); -// } -// } -// -// // generate subsequences starting at the given start index of the given size -// Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); -// -// for (String[] s : allPossibleSubsequences) { -// addEntry(s[0], entity, s[1]); -// } -// } -// } + private void addAlternativeFormsFromWordNet(List<Token> tokens) { + for (Token t : tokens) { + POS wordnetPos = null; + String posTag = t.getPOSTag(); + if (posTag.startsWith("N")) {//nouns + wordnetPos = POS.NOUN; + } + else if (posTag.startsWith("V")) {//verbs + wordnetPos = 
POS.VERB; + } + else if (posTag.startsWith("J")) {//adjectives + wordnetPos = POS.ADJECTIVE; + } + else if (posTag.startsWith("R")) {//adverbs + wordnetPos = POS.ADVERB; + } + if (wordnetPos == null) { + continue; + } + String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); - private static Set<String[]> getAllPossibleSubsequences(String[] originalTokens, List<String>[] wordnetTokens) { - ArrayList<String[]> res = new ArrayList<String[]>(); - - for (int size = 1; size < wordnetTokens.length + 1; size++) { - for (int start = 0; start < wordnetTokens.length - size + 1; start++) { - getPossibleSubsequencesRec(originalTokens, res, new ArrayList<String>(), new ArrayList<String>(), - wordnetTokens, 0, size); + for (String synonym : synonyms) { + t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); } } - - return new HashSet<String[]>(res); } - - private static void getPossibleSubsequencesRec(String[] originalTokens, List<String[]> allSubsequences, - List<String> currentSubsequence, - List<String> currentOriginalSubsequence, - List<String>[] wordnetTokens, - int curStart, int maxLength) { - - if (currentSubsequence.size() == maxLength) { - allSubsequences.add(new String[]{StringUtils.join(currentSubsequence, " ").toLowerCase(), StringUtils - .join(currentOriginalSubsequence, " ").toLowerCase()}); - return; - } - for (String w : wordnetTokens[curStart]) { - ArrayList<String> tmpSequence = new ArrayList<String>(currentSubsequence); - ArrayList<String> tmpOriginalSequence = new ArrayList<String>(currentOriginalSubsequence); - tmpSequence.add(w); - tmpOriginalSequence.add(originalTokens[curStart]); - getPossibleSubsequencesRec(originalTokens, allSubsequences, tmpSequence, tmpOriginalSequence, wordnetTokens, - curStart + 1, maxLength); - } - } - @Override public void addEntry(List<Token> s, Entity e) { tree.add(s, e); @@ -177,111 +138,10 @@ wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).replaceAll("_", " ")); } } - - // generate subsequences starting at the given start index of the given size - Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); - - for (String[] s : allPossibleSubsequences) { - System.out.println(String.format("%s - %s", s[0], s[1])); - } } public void printTrie() { System.out.println(this.toString()); - + } - - public static interface NameGenerator { - /** - * Returns a list of possible alternative words for the given word - * - * @param text the text to return alternative words for - * @return alternative words for given word - */ - List<String> getAlternativeText(String text); - } - - public static class DummyNameGenerator implements NameGenerator { - @Override - public List<String> getAlternativeText(String word) { - return Collections.singletonList(word); - } - } - - /** - * Generates alternative texts by using WordNet synonyms. - */ - public static class WordNetNameGenerator implements NameGenerator { - private int maxNumberOfSenses = 5; - - /** - * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses. 
- * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from - */ - public WordNetNameGenerator(int maxNumberOfSenses) { - this.maxNumberOfSenses = maxNumberOfSenses; - } - - @Override - public List<String> getAlternativeText(String word) { - return Arrays.asList(LinguisticUtil.getInstance().getTopSynonymsForWord(word, maxNumberOfSenses)); - } - } - - /** - * Generates alternative texts by using WordNet synonym and lemmatizing of the original words - */ - public static class LemmatizingWordNetNameGenerator implements NameGenerator { - private int maxNumberOfSenses = 5; - - /** - * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses. - * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from - */ - public LemmatizingWordNetNameGenerator(int maxNumberOfSenses) { - this.maxNumberOfSenses = maxNumberOfSenses; - } - - @Override - public List<String> getAlternativeText(String word) { - ArrayList<String> res = new ArrayList<String>(); - res.add(LinguisticUtil.getInstance().getNormalizedForm(word)); - - for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(word, maxNumberOfSenses)) { - res.add(LinguisticUtil.getInstance().getNormalizedForm(w.replaceAll("_", " "))); - } - - return res; - } - } - - /** - * Pair of the actual word and the word after processing. - */ - public static class ActualModifiedWordPair { - private String actualString; - private String modifiedString; - - public String getActualString() { - return actualString; - } - - public void setActualString(String actualString) { - this.actualString = actualString; - } - - public String getModifiedString() { - return modifiedString; - } - - public void setModifiedString(String modifiedString) { - this.modifiedString = modifiedString; - } - - public ActualModifiedWordPair(String actualString, String modifiedString) { - - this.actualString = actualString; - this.modifiedString = modifiedString; - } - } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -3,13 +3,13 @@ */ package org.dllearner.algorithms.isle.index; +import com.google.common.collect.ComparisonChain; + import java.io.Serializable; import java.util.Collections; import java.util.HashSet; import java.util.Set; -import com.google.common.collect.ComparisonChain; - /** * @author Lorenz Buehmann * Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -13,13 +13,13 @@ * @author Daniel Fleischhacker */ public class TokenTree { - private HashMap<Token, TokenTree> children; + private LinkedHashMap<Token, TokenTree> children; private Set<Entity> entities; private List<Token> originalTokens; private boolean ignoreStopWords = true; public TokenTree() { - this.children = new HashMap<>(); + this.children = new LinkedHashMap<>(); this.entities = new HashSet<>(); 
this.originalTokens = new ArrayList<>(); } @@ -73,7 +73,7 @@ } /** - * Returns the set of entities located by the given list of tokens. + * Returns the set of entities located by the given list of tokens. This method does not consider alternative forms. * * @param tokens tokens to locate the information to get * @return located set of entities or null if token sequence not contained in tree @@ -101,7 +101,7 @@ TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); + TokenTree nextNode = getNextTokenTree(curNode, t); if (nextNode == null) { return fallbackTokenList; } @@ -111,6 +111,19 @@ return fallbackTokenList; } + private TokenTree getNextTokenTree(TokenTree current, Token t) { + TokenTree next = current.children.get(t); + if (next != null) { + return next; + } + for (Map.Entry<Token, TokenTree> child : current.children.entrySet()) { + if (child.getKey().equalsWithAlternativeForms(t)) { + return child.getValue(); + } + } + return null; + } + /** * Returns the set of entities assigned to the longest matching token subsequence of the given token sequence. * @param tokens token sequence to search for longest match @@ -121,7 +134,7 @@ TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); + TokenTree nextNode = getNextTokenTree(curNode, t); if (nextNode == null) { return fallback == null ? null : fallback.entities; } @@ -142,7 +155,7 @@ TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); + TokenTree nextNode = getNextTokenTree(curNode, t); if (nextNode == null) { return fallback == null ? null : fallback.originalTokens; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -1,38 +1,22 @@ package org.dllearner.algorithms.isle.index.semantic; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.util.HashSet; -import java.util.Set; - +import com.google.common.hash.HashCode; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; import org.apache.log4j.Logger; import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.TextDocumentGenerator; -import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.LinguisticAnnotator; -import org.dllearner.algorithms.isle.index.SemanticAnnotator; -import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; -import org.dllearner.algorithms.isle.index.TextDocument; -import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; -import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator; +import org.dllearner.algorithms.isle.index.*; import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation; import org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor; import 
org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLAnnotation; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLLiteral; -import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.*; -import com.google.common.hash.HashCode; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; +import java.io.*; +import java.util.HashSet; +import java.util.Set; /** * Interface for an index which is able to resolve a given entity's URI to the set of documents containing @@ -86,14 +70,8 @@ public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, boolean useWordNormalization){ SimpleEntityCandidatesTrie trie; - if (useWordNormalization) { - trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), - ontology, new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5)); - } - else { - trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), - ontology, new SimpleEntityCandidatesTrie.DummyNameGenerator()); - } + trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology); trie.printTrie(); TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie); @@ -142,7 +120,10 @@ logger.info("Creating semantic index..."); SemanticIndex index = new SemanticIndex(); for (String document : documents) { - TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document); + if (document.isEmpty()) { + continue; + } + TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document); logger.debug("Processing document:" + textDocument); AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(textDocument); for (Entity entity : annotatedDocument.getContainedEntities()) { Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-09 14:36:38 UTC (rev 4194) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-09 14:40:04 UTC (rev 4195) @@ -3,26 +3,11 @@ */ package org.dllearner.algorithms.isle; -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.text.DecimalFormat; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - +import com.google.common.base.Charsets; +import com.google.common.base.Joiner; +import com.google.common.io.Files; import org.dllearner.algorithms.celoe.CELOE; -import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.EntityCandidatesTrie; -import org.dllearner.algorithms.isle.index.LinguisticAnnotator; -import org.dllearner.algorithms.isle.index.RemoteDataProvider; -import org.dllearner.algorithms.isle.index.SemanticAnnotator; -import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; -import org.dllearner.algorithms.isle.index.TextDocument; -import org.dllearner.algorithms.isle.index.Token; -import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; -import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator; +import org.dllearner.algorithms.isle.index.*; import 
org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; @@ -43,17 +28,17 @@ import org.junit.Before; import org.junit.Test; import org.semanticweb.owlapi.apibinding.OWLManager; -import org.semanticweb.owlapi.model.IRI; -import org.semanticweb.owlapi.model.OWLDataFactory; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLOntology; -import org.semanticweb.owlapi.model.OWLOntologyManager; - +import org.semanticweb.owlapi.model.*; import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; -import com.google.common.base.Charsets; -import com.google.common.base.Joiner; -import com.google.common.io.Files; +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.text.DecimalFormat; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; /** * Some tests for the ISLE algorithm. @@ -184,8 +169,7 @@ @Test public void testEntityLinkingWithLemmatizing() throws Exception { - EntityCandidatesTrie ect = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology, - new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5)); + EntityCandidatesTrie ect = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology); LinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(ect); WordSenseDisambiguation wsd = new SimpleWordSenseDisambiguation(ontology); EntityCandidateGenerator ecg = new TrieEntityCandidateGenerator(ontology, ect); @@ -200,8 +184,7 @@ @Test public void testEntityLinkingWithSimpleStringMatching() throws Exception { - EntityCandidatesTrie ect = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology, - new SimpleEntityCandidatesTrie.DummyNameGenerator()); + EntityCandidatesTrie ect = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology); TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(ect); linguisticAnnotator.setNormalizeWords(false); WordSenseDisambiguation wsd = new SimpleWordSenseDisambiguation(ontology); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
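The r4195 change above attaches WordNet synonyms to tokens as alternative forms, keyed on a coarse POS mapping (N* to NOUN, V* to VERB, J* to ADJECTIVE, R* to ADVERB). A usage sketch, assuming a JWNL WordNet dictionary is configured behind LinguisticUtil; the token values are illustrative:

import java.util.Arrays;

import net.didion.jwnl.data.POS;
import org.dllearner.algorithms.isle.index.LinguisticUtil;
import org.dllearner.algorithms.isle.index.Token;

// A noun token such as "film" collects normalized WordNet synonyms (e.g.
// "movie"); the TokenTree can then match either surface form through
// equalsWithAlternativeForms() in getNextTokenTree().
public class AlternativeFormsDemo {
    public static void main(String[] args) {
        Token token = new Token("film", "film", "NN", false, false);
        // "NN" starts with "N", so addAlternativeFormsFromWordNet maps it to POS.NOUN
        String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(token.getRawForm(), POS.NOUN);
        for (String synonym : synonyms) {
            token.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym));
        }
        System.out.println("alternative forms attached to 'film': " + Arrays.toString(synonyms));
    }
}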
From: <lor...@us...> - 2013-12-09 14:36:41
Revision: 4194 http://sourceforge.net/p/dl-learner/code/4194 Author: lorenz_b Date: 2013-12-09 14:36:38 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added DBpedia experiment. Modified Paths: -------------- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java Added Paths: ----------- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaCorpusGenerator.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/KnowledgebaseSampleGenerator.java Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaCorpusGenerator.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaCorpusGenerator.java (rev 0) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaCorpusGenerator.java 2013-12-09 14:36:38 UTC (rev 4194) @@ -0,0 +1,169 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLEncoder; +import java.sql.SQLException; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import org.aksw.jena_sparql_api.cache.core.QueryExecutionFactoryCacheEx; +import org.aksw.jena_sparql_api.cache.extra.CacheCoreEx; +import org.aksw.jena_sparql_api.cache.extra.CacheCoreH2; +import org.aksw.jena_sparql_api.cache.extra.CacheEx; +import org.aksw.jena_sparql_api.cache.extra.CacheExImpl; +import org.aksw.jena_sparql_api.core.QueryExecutionFactory; +import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.dllearner.core.owl.NamedClass; +import org.dllearner.kb.sparql.SparqlEndpoint; +import org.semanticweb.owlapi.apibinding.OWLManager; +import org.semanticweb.owlapi.model.OWLClass; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.OWLOntologyCreationException; + +import com.google.common.base.Charsets; +import com.google.common.io.Files; +import com.hp.hpl.jena.query.QueryExecution; +import com.hp.hpl.jena.query.QuerySolution; +import com.hp.hpl.jena.query.ResultSet; + +/** + * This class loads a set of English labels for a given number of instances for each class in the DBpedia ontology. + * @author Lorenz Buehmann + * + */ +public class DBpediaCorpusGenerator { + + /** + * Loads DBpedia ontology from remote URL. 
+ */ + private static OWLOntology loadDBpediaOntology(){ + try { + URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); + InputStream is = new BufferedInputStream(url.openStream()); + CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is); + OWLOntology ontology = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); + return ontology; + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (OWLOntologyCreationException e) { + e.printStackTrace(); + } + return null; + } + + public static Set<String> getDBpediaCorpusSample(String textProperty, int maxNrOfInstancesPerClass){ + Set<String> documents = new HashSet<>(); + + SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); + String cacheDirectory = "cache"; + File corpusDirectory = new File("tmp/dbpedia-corpus"); + corpusDirectory.mkdirs(); + + QueryExecutionFactory qef = new QueryExecutionFactoryHttp(endpoint.getURL().toString(), endpoint.getDefaultGraphURIs()); + try { + long timeToLive = TimeUnit.DAYS.toMillis(30); + CacheCoreEx cacheBackend = CacheCoreH2.create(cacheDirectory, timeToLive, true); + CacheEx cacheFrontend = new CacheExImpl(cacheBackend); + qef = new QueryExecutionFactoryCacheEx(qef, cacheFrontend); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } catch (SQLException e) { + e.printStackTrace(); + } + + //load the DBpedia ontology + OWLOntology ontology = loadDBpediaOntology(); + + //get a random set of instances for each class and their English label + for (OWLClass cls : ontology.getClassesInSignature()) { + String query = "SELECT ?s ?text WHERE {" + + "?s a <" + cls.toStringID() + ">. " + + "?s <" + textProperty + "> ?text. " + + "FILTER(LANGMATCHES(LANG(?text),'en'))} LIMIT " + maxNrOfInstancesPerClass; + QueryExecution qe = qef.createQueryExecution(query); + ResultSet rs = qe.execSelect(); + QuerySolution qs; + while(rs.hasNext()){ + qs = rs.next(); + + String uri = qs.getResource("s").getURI(); + String text = qs.getLiteral("text").getLexicalForm(); + + documents.add(text); + + //save to disk + try { + Files.write(text, new File(corpusDirectory, URLEncoder.encode(uri, "UTF-8")), Charsets.UTF_8); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + return documents; + } + + public static Set<String> getDBpediaCorpusSample(String textProperty, Set<NamedClass> classes, int maxNrOfInstancesPerClass){ + Set<String> documents = new HashSet<>(); + + SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); + String cacheDirectory = "cache"; + File corpusDirectory = new File("tmp/dbpedia-corpus"); + corpusDirectory.mkdirs(); + + QueryExecutionFactory qef = new QueryExecutionFactoryHttp(endpoint.getURL().toString(), endpoint.getDefaultGraphURIs()); + try { + long timeToLive = TimeUnit.DAYS.toMillis(30); + CacheCoreEx cacheBackend = CacheCoreH2.create(cacheDirectory, timeToLive, true); + CacheEx cacheFrontend = new CacheExImpl(cacheBackend); + qef = new QueryExecutionFactoryCacheEx(qef, cacheFrontend); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } catch (SQLException e) { + e.printStackTrace(); + } + + //get a random set of instances for each class and their English label + for (NamedClass cls : classes) { + String query = "SELECT ?s ?text WHERE {" + + "?s a <" + cls.getName() + ">. " + + "?s <" + textProperty + "> ?text. 
" + + "FILTER(LANGMATCHES(LANG(?text),'en'))} LIMIT " + maxNrOfInstancesPerClass; + QueryExecution qe = qef.createQueryExecution(query); + ResultSet rs = qe.execSelect(); + QuerySolution qs; + while(rs.hasNext()){ + qs = rs.next(); + + String uri = qs.getResource("s").getURI(); + String text = qs.getLiteral("text").getLexicalForm(); + + documents.add(text); + + //save to disk + try { + Files.write(text, new File(corpusDirectory, URLEncoder.encode(uri, "UTF-8")), Charsets.UTF_8); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + return documents; + } + +} Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java (rev 0) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/DBpediaExperiment.java 2013-12-09 14:36:38 UTC (rev 4194) @@ -0,0 +1,103 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.dllearner.core.owl.NamedClass; +import org.dllearner.kb.sparql.SparqlEndpoint; +import org.semanticweb.owlapi.apibinding.OWLManager; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.OWLOntologyCreationException; +import org.semanticweb.owlapi.model.OWLOntologyManager; + +import com.google.common.collect.Sets; +import com.hp.hpl.jena.rdf.model.Model; + +/** + * @author Lorenz Buehmann + * + */ +public class DBpediaExperiment extends Experiment{ + + final SparqlEndpoint endpoint = SparqlEndpoint.getEndpointDBpedia(); + final int maxNrOfInstancesPerClass = 100; + + + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getOntology() + */ + @Override + protected OWLOntology getOntology() { + //load the DBpedia schema + try { + URL url = new URL("http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl.bz2"); + InputStream is = new BufferedInputStream(url.openStream()); + CompressorInputStream in = new CompressorStreamFactory().createCompressorInputStream("bzip2", is); + OWLOntology schema = OWLManager.createOWLOntologyManager().loadOntologyFromOntologyDocument(in); + return schema; + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (OWLOntologyCreationException e) { + e.printStackTrace(); + } + //load some sample data for the machine learning part + Model sample = KnowledgebaseSampleGenerator.createKnowledgebaseSample( + endpoint, + "http://dbpedia.org/ontology/", + Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person")), + maxNrOfInstancesPerClass); + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + sample.write(baos, "TURTLE", null); + OWLOntologyManager man = OWLManager.createOWLOntologyManager(); + OWLOntology ontology = man.loadOntologyFromOntologyDocument(new ByteArrayInputStream(baos.toByteArray())); + return ontology; + } catch (Exception e) { + 
e.printStackTrace(); + } + + return null; + } + + + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.Experiment#getDocuments() + */ + @Override + protected Set<String> getDocuments() { + Set<String> documents = new HashSet<String>(); + + documents.addAll(DBpediaCorpusGenerator.getDBpediaCorpusSample( + "http://dbpedia.org/ontology/abstract", + Sets.newHashSet(new NamedClass("http://dbpedia.org/ontology/Person")), + maxNrOfInstancesPerClass)); + + documents.clear(); + documents.add("Thomas Cruise Mapother IV, widely known as Tom Cruise, is an American film actor and producer. He has been nominated for three Academy Awards and has won three Golden Globe Awards. He started his career at age 19 in the 1981 film Taps. His first leading role was in Risky Business, released in August 1983. Cruise became a full-fledged movie star after starring in Top Gun (1986). He is well known for his role as secret agent Ethan Hunt in the Mission: Impossible film series between 1996 and 2011. Cruise has starred in many Hollywood blockbusters, including Rain Man (1988), A Few Good Men (1992), Jerry Maguire (1996), Vanilla Sky (2001), Minority Report (2002), The Last Samurai (2003), Collateral (2004), War of the Worlds (2005), Tropic Thunder (2008) and Jack Reacher (2012). As of 2012, Cruise is Hollywood's highest-paid actor. Cruise is known for his Scientologist faith and for his support of the Church of Scientology."); + + return documents; + } + + public static void main(String[] args) throws Exception { + new DBpediaExperiment().run(new NamedClass("http://dbpedia.org/ontology/Person")); + } +} Added: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/KnowledgebaseSampleGenerator.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/KnowledgebaseSampleGenerator.java (rev 0) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/KnowledgebaseSampleGenerator.java 2013-12-09 14:36:38 UTC (rev 4194) @@ -0,0 +1,223 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.SortedSet; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorOutputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.log4j.Logger; +import org.dllearner.core.owl.Individual; +import org.dllearner.core.owl.NamedClass; +import org.dllearner.kb.SparqlEndpointKS; +import org.dllearner.kb.sparql.ConciseBoundedDescriptionGenerator; +import org.dllearner.kb.sparql.ConciseBoundedDescriptionGeneratorImpl; +import org.dllearner.kb.sparql.SparqlEndpoint; +import org.dllearner.reasoning.SPARQLReasoner; + +import com.google.common.base.Charsets; +import com.google.common.hash.HashCode; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.ModelFactory; + +/** + * @author Lorenz Buehmann + * + */ +public class KnowledgebaseSampleGenerator { + + private static final Logger logger = 
Logger.getLogger(KnowledgebaseSampleGenerator.class.getName()); + + private static String cacheDir = "sparql-cache"; + private static int maxCBDDepth = 0; + + public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, String namespace, Set<NamedClass> classes, int maxNrOfInstancesPerClass){ + Model model = ModelFactory.createDefaultModel(); + + //try to load existing sample from file system + HashFunction hf = Hashing.md5(); + HashCode hc = hf.newHasher().putString(endpoint.getURL().toString(), Charsets.UTF_8). + putInt(classes.hashCode()).hash(); + String filename = hc.toString() + "-" + maxNrOfInstancesPerClass + ".ttl.bz2"; + File file = new File(filename); + + if(!file.exists()){//if not exists + logger.info("Generating sample..."); + long startTime = System.currentTimeMillis(); + SPARQLReasoner reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint), cacheDir); + ConciseBoundedDescriptionGenerator cbdGen = new ConciseBoundedDescriptionGeneratorImpl(endpoint, cacheDir); + + //get for each class n instances and compute the CBD for each instance + for (NamedClass cls : classes) { + logger.debug("\t...processing class " + cls + "..."); + SortedSet<Individual> individuals = reasoner.getIndividuals(cls, maxNrOfInstancesPerClass*2); + + Model cbd; + int cnt = 0; + for (Individual individual : individuals) { + try { + cbd = cbdGen.getConciseBoundedDescription(individual.getName(), maxCBDDepth); + model.add(cbd); + if(cnt++ == maxNrOfInstancesPerClass){ + break; + } + } catch (Exception e) { + e.printStackTrace(); + } + } + } + logger.info("...done in " + (System.currentTimeMillis() - startTime) + "ms"); + //add schema + model.add(reasoner.loadOWLSchema()); + logger.debug("Writing sample to disk..."); + startTime = System.currentTimeMillis(); + try { + CompressorOutputStream out = new CompressorStreamFactory() + .createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(file)); + model.write(out,"TURTLE"); + out.close(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + logger.debug("...done in " + (System.currentTimeMillis() - startTime) + "ms"); + } else { + logger.info("Loading sample from disk..."); + long startTime = System.currentTimeMillis(); + try { + CompressorInputStream in = new CompressorStreamFactory(). + createCompressorInputStream(CompressorStreamFactory.BZIP2, new FileInputStream(file)); + model.read(in, null, "TURTLE"); + in.close(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + logger.info("...done in " + (System.currentTimeMillis() - startTime) + "ms"); + } + + return model; + } + + public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, String namespace, int maxNrOfClasses, int maxNrOfInstancesPerClass){ + Model model = ModelFactory.createDefaultModel(); + + //try to load existing sample from file system + HashFunction hf = Hashing.md5(); + HashCode hc = hf.newHasher().putString(endpoint.getURL().toString(), Charsets.UTF_8).hash(); + String filename = hc.toString() + ("-" + ((maxNrOfClasses == Integer.MAX_VALUE) ? 
"all" : maxNrOfClasses)) + "-" + maxNrOfInstancesPerClass + ".ttl.bz2"; + File file = new File(filename); + + if(!file.exists()){//if not exists + logger.info("Generating sample..."); + long startTime = System.currentTimeMillis(); + SPARQLReasoner reasoner = new SPARQLReasoner(new SparqlEndpointKS(endpoint), cacheDir); + ConciseBoundedDescriptionGenerator cbdGen = new ConciseBoundedDescriptionGeneratorImpl(endpoint, cacheDir); + + //get all OWL classes + Set<NamedClass> classes = reasoner.getOWLClasses(namespace); + if(maxNrOfClasses != -1 && maxNrOfClasses != Integer.MAX_VALUE){ + List<NamedClass> tmpClasses = new ArrayList<NamedClass>(classes); + Collections.shuffle(tmpClasses); + classes = new HashSet<NamedClass>(tmpClasses.subList(0, Math.min(tmpClasses.size(), maxNrOfClasses))); + } + + //get for each class n instances and compute the CBD for each instance + for (NamedClass cls : classes) { + logger.debug("\t...processing class " + cls + "..."); + SortedSet<Individual> individuals = reasoner.getIndividuals(cls, maxNrOfInstancesPerClass*2); + + Model cbd; + int cnt = 0; + for (Individual individual : individuals) { + try { + cbd = cbdGen.getConciseBoundedDescription(individual.getName(), maxCBDDepth); + model.add(cbd); + if(cnt++ == maxNrOfInstancesPerClass){ + break; + } + } catch (Exception e) { + e.printStackTrace(); + } + } + } + logger.info("...done in " + (System.currentTimeMillis() - startTime) + "ms"); + //add schema + model.add(reasoner.loadOWLSchema()); + logger.debug("Writing sample to disk..."); + startTime = System.currentTimeMillis(); + try { + CompressorOutputStream out = new CompressorStreamFactory() + .createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(file)); + model.write(out,"TURTLE"); + out.close(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + logger.debug("...done in " + (System.currentTimeMillis() - startTime) + "ms"); + } else { + logger.info("Loading sample from disk..."); + long startTime = System.currentTimeMillis(); + try { + CompressorInputStream in = new CompressorStreamFactory(). 
+ createCompressorInputStream(CompressorStreamFactory.BZIP2, new FileInputStream(file)); + model.read(in, null, "TURTLE"); + in.close(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (CompressorException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + logger.info("...done in " + (System.currentTimeMillis() - startTime) + "ms"); + } + + return model; + } + + public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, int maxNrOfClasses, int maxNrOfInstancesPerClass){ + return createKnowledgebaseSample(endpoint, null, maxNrOfClasses, maxNrOfInstancesPerClass); + } + + public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, Set<NamedClass> classes, int maxNrOfInstancesPerClass){ + return createKnowledgebaseSample(endpoint, null, classes, maxNrOfInstancesPerClass); + } + + public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, int maxNrOfInstancesPerClass){ + return createKnowledgebaseSample(endpoint, Integer.MAX_VALUE, maxNrOfInstancesPerClass); + } + + public static Model createKnowledgebaseSample(SparqlEndpoint endpoint, String namespace, int maxNrOfInstancesPerClass){ + return createKnowledgebaseSample(endpoint, null, Integer.MAX_VALUE, maxNrOfInstancesPerClass); + } + + public static void main(String[] args) throws Exception { + Model kb = createKnowledgebaseSample(SparqlEndpoint.getEndpointDBpedia(), "http://dbpedia.org/ontology", 100); + } +} Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-12-09 14:22:20 UTC (rev 4193) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-12-09 14:36:38 UTC (rev 4194) @@ -79,8 +79,8 @@ } catch (IOException e) { e.printStackTrace(); } -// documents.clear(); -// documents.add("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain"); + documents.clear(); + documents.add("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have 
washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain"); return documents; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
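The per-class corpus query at the heart of DBpediaCorpusGenerator can be tried in isolation. A minimal sketch against the public DBpedia endpoint using plain Jena instead of the cached jena-sparql-api factory used in the class itself; the class URI and LIMIT are example values:

import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;

// Fetches up to 10 English abstracts for instances of dbo:Person and prints
// the subject URI plus the beginning of each abstract.
public class CorpusQueryDemo {
    public static void main(String[] args) {
        String cls = "http://dbpedia.org/ontology/Person";
        String textProperty = "http://dbpedia.org/ontology/abstract";
        String query = "SELECT ?s ?text WHERE {"
                + "?s a <" + cls + ">. "
                + "?s <" + textProperty + "> ?text. "
                + "FILTER(LANGMATCHES(LANG(?text),'en'))} LIMIT 10";
        QueryExecution qe = QueryExecutionFactory.sparqlService("http://dbpedia.org/sparql", query);
        ResultSet rs = qe.execSelect();
        while (rs.hasNext()) {
            QuerySolution qs = rs.next();
            String text = qs.getLiteral("text").getLexicalForm();
            System.out.println(qs.getResource("s").getURI() + ": "
                    + text.substring(0, Math.min(60, text.length())));
        }
        qe.close();
    }
}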
From: <lor...@us...> - 2013-12-09 14:22:22
Revision: 4193 http://sourceforge.net/p/dl-learner/code/4193 Author: lorenz_b Date: 2013-12-09 14:22:20 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added generator class for semantic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -14,6 +14,9 @@ import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.trees.CollinsHeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; import edu.stanford.nlp.util.CoreMap; public class TextDocumentGenerator { @@ -26,7 +29,7 @@ private TextDocumentGenerator(){ Properties props = new Properties(); - props.put("annotators", "tokenize, ssplit, pos, lemma"); + props.put("annotators", "tokenize, ssplit, pos, lemma, parse"); pipeline = new StanfordCoreNLP(props); } @@ -58,12 +61,21 @@ //this is the POS tag of the token String lemma = label.get(LemmaAnnotation.class); //check if token is punctuation - boolean isPunctuation = word.matches(punctuationPattern); + boolean isPunctuation = word.matches(punctuationPattern) + || pos.equalsIgnoreCase("-lrb-") + || pos.equalsIgnoreCase("-rrb-") + || word.startsWith("'") + ; //check if it is a stop word - boolean isStopWord = stopWordFilter.isStopWord(word); + boolean isStopWord = stopWordFilter.isStopWord(word.toLowerCase()); Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); - + + //determine the head noun + Tree tree = sentence.get(TreeAnnotation.class); + CollinsHeadFinder headFinder = new CollinsHeadFinder(); + Tree head = headFinder.determineHead(tree); + document.add(token); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -4,6 +4,8 @@ import java.util.LinkedList; import java.util.List; +import org.dllearner.algorithms.isle.TextDocumentGenerator; + /** * A simple text document without further formatting or markup. 
* @@ -11,13 +13,10 @@ */ public class TextDocument extends LinkedList<Token> implements Document { public static void main(String[] args) { - TextDocument t = new TextDocument(); String s = "This is a very long, nice text for testing our new implementation of TextDocument."; - for (String e : s.split(" ")) { - t.add(new Token(e)); - } + TextDocument doc = TextDocumentGenerator.getInstance().generateDocument(s); - System.out.println(t.getRawContent()); + System.out.println(doc.getRawContent()); } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -12,6 +12,8 @@ public class TrieLinguisticAnnotator implements LinguisticAnnotator { EntityCandidatesTrie candidatesTrie; private boolean normalizeWords = true; + + private boolean ignoreStopWords = true; public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { this.candidatesTrie = candidatesTrie; @@ -30,11 +32,13 @@ List<Token> matchedTokens; for (Token token : document) { - matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); - if(matchedTokens != null && !matchedTokens.isEmpty()){ - Annotation annotation = new Annotation(document, matchedTokens); - annotations.add(annotation); - } + if(!(token.isPunctuation() ||token.isStopWord())){ + matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); + if(matchedTokens != null && !matchedTokens.isEmpty()){ + Annotation annotation = new Annotation(document, matchedTokens); + annotations.add(annotation); + } + } } return annotations; } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -0,0 +1,163 @@ +package org.dllearner.algorithms.isle.index.semantic; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.HashSet; +import java.util.Set; + +import org.apache.log4j.Logger; +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.LinguisticAnnotator; +import org.dllearner.algorithms.isle.index.SemanticAnnotator; +import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; +import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; +import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation; +import 
org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor; +import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLOntology; + +import com.google.common.hash.HashCode; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; + +/** + * Interface for an index which is able to resolve a given entity's URI to the set of documents containing + * this entity, i.e., documents which contain words disambiguated to the given entity. + * + * @author Lorenz Buehmann + * @author Daniel Fleischhacker + */ +public abstract class SemanticIndexGenerator { + + static HashFunction hf = Hashing.md5(); + private static final Logger logger = Logger.getLogger(SemanticIndexGenerator.class.getName()); + private static boolean useCache = false; + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, WordSenseDisambiguation wordSenseDisambiguation, + EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator){ + SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wordSenseDisambiguation, entityCandidateGenerator, linguisticAnnotator); + return generateIndex(documents, ontology, semanticAnnotator); + } + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, SemanticAnnotator semanticAnnotator){ + SemanticIndex semanticIndex; + //try to load serialized version + HashCode hc = hf.newHasher().putInt(documents.hashCode()).putInt(ontology.hashCode()).hash(); + File file = new File(hc.toString() + ".ser"); + if(useCache && file.exists()){ + try { + logger.info("Loading semantic index from disk..."); + ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file)); + semanticIndex = (SemanticIndex) ois.readObject(); + ois.close(); + logger.info("...done."); + } catch (Exception e) { + e.printStackTrace(); + semanticIndex = buildIndex(semanticAnnotator, documents); + } + } else { + logger.info("Building semantic index..."); + semanticIndex = buildIndex(semanticAnnotator, documents); + try { + ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(file)); + oos.writeObject(semanticIndex); + oos.close(); + } catch (IOException e1) { + e1.printStackTrace(); + } + logger.info("...done."); + } + return semanticIndex; + } + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, boolean useWordNormalization){ + SimpleEntityCandidatesTrie trie; + if (useWordNormalization) { + trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology, new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5)); + } + else { + trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology, new SimpleEntityCandidatesTrie.DummyNameGenerator()); + } + trie.printTrie(); + + TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie); + linguisticAnnotator.setNormalizeWords(useWordNormalization); + + SemanticAnnotator semanticAnnotator = new SemanticAnnotator( + new StructureBasedWordSenseDisambiguation(new WindowBasedContextExtractor(), ontology), + new TrieEntityCandidateGenerator(ontology, trie), + linguisticAnnotator); + return generateIndex(documents, ontology, 
semanticAnnotator); + } + + public static SemanticIndex generateIndex(OWLOntology ontology, OWLAnnotationProperty annotationProperty, String language, boolean useWordNormalization){ + Set<OWLEntity> schemaEntities = new HashSet<OWLEntity>(); + schemaEntities.addAll(ontology.getClassesInSignature()); + schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); + schemaEntities.addAll(ontology.getDataPropertiesInSignature()); + Set<String> documents = new HashSet<String>(); + for (OWLEntity entity : schemaEntities) { + String label = null; + Set<OWLAnnotation> annotations = entity.getAnnotations(ontology, annotationProperty); + for (OWLAnnotation annotation : annotations) { + if (annotation.getValue() instanceof OWLLiteral) { + OWLLiteral val = (OWLLiteral) annotation.getValue(); + if (language != null) { + if (val.hasLang(language)) { + label = val.getLiteral(); + } + } + else { + label = val.getLiteral(); + } + } + } + if (label != null) { + documents.add(label); + } + } + return generateIndex(documents, ontology, useWordNormalization); + } + + /** + * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. + */ + private static SemanticIndex buildIndex(SemanticAnnotator semanticAnnotator, Set<String> documents) { + logger.info("Creating semantic index..."); + SemanticIndex index = new SemanticIndex(); + for (String document : documents) { + TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document); + logger.debug("Processing document:" + textDocument); + AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(textDocument); + for (Entity entity : annotatedDocument.getContainedEntities()) { + Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity); + if (existingAnnotatedDocuments == null) { + existingAnnotatedDocuments = new HashSet<AnnotatedDocument>(); + index.put(entity, existingAnnotatedDocuments); + } + existingAnnotatedDocuments.add(annotatedDocument); + } + logger.debug("Annotated document:" + annotatedDocument); + } + int size = documents.size(); + index.setTotalNrOfDocuments(size); + logger.info("...done."); + return index; + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -90,10 +90,13 @@ if (annotation.getValue() instanceof OWLLiteral) { OWLLiteral val = (OWLLiteral) annotation.getValue(); if (val.hasLang(language)) { + //trim String label = val.getLiteral().trim(); if(entity instanceof NamedClass){ label = label.toLowerCase(); } + //remove content in brackets like (...) + label = label.replaceAll("\\s?\\((.*?)\\)", ""); textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
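SemanticIndexGenerator caches the serialized index under a content-derived file name, so a rebuild is skipped only when both the document set and the ontology are unchanged. A self-contained sketch of that key scheme; the ontology hash code is replaced by a constant stand-in here:

import java.io.File;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

// Mirrors the key computation in generateIndex(): an MD5 hash over the hash
// codes of the inputs names the ".ser" file used for the serialized index.
public class IndexCacheKeyDemo {
    public static void main(String[] args) {
        Set<String> documents = new HashSet<>(Arrays.asList("doc one", "doc two"));
        int ontologyHash = 42; // stand-in for ontology.hashCode()

        HashFunction hf = Hashing.md5();
        HashCode hc = hf.newHasher().putInt(documents.hashCode()).putInt(ontologyHash).hash();
        File cacheFile = new File(hc.toString() + ".ser");
        System.out.println("index would be cached as: " + cacheFile.getName());
    }
}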
From: <lor...@us...> - 2013-12-03 12:41:40
Revision: 4192 http://sourceforge.net/p/dl-learner/code/4192 Author: lorenz_b Date: 2013-12-03 12:41:34 +0000 (Tue, 03 Dec 2013) Log Message: ----------- Made semantic index serializable. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -4,6 +4,7 @@ package org.dllearner.algorithms.isle.index; +import java.io.Serializable; import java.util.ArrayList; import java.util.List; @@ -12,7 +13,7 @@ * @author Lorenz Buehmann * */ -public class Annotation { +public class Annotation implements Serializable{ private Document referencedDocument; private ArrayList<Token> tokens; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -4,6 +4,9 @@ package org.dllearner.algorithms.isle.index; import java.io.Serializable; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; import com.google.common.collect.ComparisonChain; @@ -159,9 +162,9 @@ Token token = (Token) o; -// if (!posTag.equals(token.posTag)) { -// return false; -// } + if (!WordTypeComparator.sameWordType(posTag, token.posTag)) { + return false; + } if (!stemmedForm.equals(token.stemmedForm)) { return false; } @@ -172,7 +175,7 @@ @Override public int hashCode() { int result = stemmedForm.hashCode(); -// result = 31 * result + posTag.hashCode(); + result = 31 * result + WordTypeComparator.hashCode(posTag); return result; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -18,10 +18,28 @@ * @return */ public static boolean sameWordType(String posTag1, String posTag2){ - if(posTag1.startsWith("NN") && posTag2.startsWith("NN") || - posTag1.startsWith("V") && posTag2.startsWith("V")){ + if(posTag1.startsWith("NN") && posTag2.startsWith("NN") || //nouns + posTag1.startsWith("V") && posTag2.startsWith("V") || //verbs + posTag1.startsWith("JJ") && posTag2.startsWith("JJ") || //adjectives + posTag1.startsWith("RB") && posTag2.startsWith("RB")) //adverbs + { return true; + 
} else { + return posTag1.equals(posTag2); } - return false; } + + public static int hashCode(String posTag){ + if(posTag.startsWith("NN")){//nouns + return "NN".hashCode(); + } else if(posTag.startsWith("V")){//verbs + return "V".hashCode(); + } else if(posTag.startsWith("JJ")){//adjectives + return "JJ".hashCode(); + } else if(posTag.startsWith("RB")){//adverbs + return "RB".hashCode(); + } else { + return posTag.hashCode(); + } + } } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -4,20 +4,15 @@ package org.dllearner.algorithms.isle; import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectOutputStream; import java.util.Collections; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import java.util.SortedSet; import org.dllearner.algorithms.celoe.CELOE; -import org.dllearner.algorithms.isle.index.TextDocument; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; @@ -47,15 +42,6 @@ import com.google.common.collect.Sets; -import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.StanfordCoreNLP; -import edu.stanford.nlp.util.CoreMap; - /** * Experimental setup: * @@ -86,54 +72,21 @@ private String testFolder = "experiments/logs/"; private OWLOntology ontology; - private Set<TextDocument> documents; + private Set<String> documents; private boolean initialized = false; private RhoDRDown operator; - protected StanfordCoreNLP pipeline; protected abstract OWLOntology getOntology(); - protected abstract Set<TextDocument> getDocuments(); + protected abstract Set<String> getDocuments(); /** * */ public Experiment() { - // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution - Properties props = new Properties(); - props.put("annotators", "tokenize, ssplit, pos"); - pipeline = new StanfordCoreNLP(props); } - protected String getPOSTaggedText(String text){ - // create an empty Annotation just with the given text - Annotation document = new Annotation(text); - - // run all Annotators on this text - pipeline.annotate(document); - - // these are all the sentences in this document - // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types - List<CoreMap> sentences = document.get(SentencesAnnotation.class); - - StringBuilder sb = new StringBuilder(); - for(CoreMap sentence: sentences) { - // traversing the words in the current sentence - // a CoreLabel is a CoreMap with additional token-specific methods - for (CoreLabel token: sentence.get(TokensAnnotation.class)) { - // this is the text of the token - String word = 
token.get(TextAnnotation.class); - // this is the POS tag of the token - String pos = token.get(PartOfSpeechAnnotation.class); - - sb.append(word).append("/").append(pos).append(" "); - } - - } - return sb.toString(); - } - private void initIfNecessary() { if(!initialized){ ontology = getOntology(); @@ -141,13 +94,6 @@ // build semantic index SemanticIndex semanticIndex = SemanticIndexGenerator.generateIndex(documents, ontology, false); - try { - ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream("semantic-index.ser")); - oos.writeObject(semanticIndex); - oos.close(); - } catch (IOException e1) { - e1.printStackTrace(); - } // set the relevance metric relevance = new PMIRelevanceMetric(semanticIndex); Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -90,14 +90,14 @@ new URL("http://gold.linkeddata.org/data/bible/chapter_index.zip")); } - private Set<TextDocument> createDocuments(){ - Set<TextDocument> documents = new HashSet<TextDocument>(); + private Set<String> createDocuments(){ + Set<String> documents = new HashSet<String>(); File folder = new File(testFolder+"corpus/"); for (File file : folder.listFiles()) { if(!file.isDirectory() && !file.isHidden()){ try { String text = Files.toString(file, Charsets.UTF_8); - documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); + documents.add(text); } catch (IOException e) { e.printStackTrace(); } @@ -106,8 +106,8 @@ return documents; } - private Set<TextDocument> createBibleDocuments() throws IOException { - Set<TextDocument> documents = new HashSet<TextDocument>(); + private Set<String> createBibleDocuments() throws IOException { + Set<String> documents = new HashSet<String>(); RemoteDataProvider bibleByChapter = new RemoteDataProvider( new URL("http://gold.linkeddata.org/data/bible/split_by_chapter.zip")); File folder = bibleByChapter.getLocalDirectory(); @@ -115,7 +115,7 @@ if(!file.isDirectory() && !file.isHidden()){ try { String text = Files.toString(file, Charsets.UTF_8); - documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); + documents.add(text); } catch (IOException e) { e.printStackTrace(); } @@ -191,9 +191,9 @@ EntityCandidateGenerator ecg = new TrieEntityCandidateGenerator(ontology, ect); SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wsd, ecg, linguisticAnnotator); - Set<TextDocument> docs = createDocuments(); - for (TextDocument doc : docs) { - AnnotatedDocument annotated = semanticAnnotator.processDocument(doc); + Set<String> docs = createDocuments(); + for (String doc : docs) { + AnnotatedDocument annotated = semanticAnnotator.processDocument(TextDocumentGenerator.getInstance().generateDocument(doc)); System.out.println(annotated); } } @@ -208,9 +208,9 @@ EntityCandidateGenerator ecg = new TrieEntityCandidateGenerator(ontology, ect); SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wsd, ecg, linguisticAnnotator); - Set<TextDocument> docs = createDocuments(); - for (TextDocument doc : docs) { - AnnotatedDocument annotated = semanticAnnotator.processDocument(doc); + Set<String> docs = createDocuments(); + for (String doc : docs) { + AnnotatedDocument annotated = 
semanticAnnotator.processDocument(TextDocumentGenerator.getInstance().generateDocument(doc)); System.out.println(annotated); } } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-12-03 09:40:14 UTC (rev 4191) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/SemanticBibleExperiment.java 2013-12-03 12:41:34 UTC (rev 4192) @@ -8,12 +8,9 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.HashSet; -import java.util.List; -import java.util.Properties; import java.util.Set; import org.dllearner.algorithms.isle.index.RemoteDataProvider; -import org.dllearner.algorithms.isle.index.TextDocument; import org.dllearner.core.owl.NamedClass; import org.semanticweb.owlapi.apibinding.OWLManager; import org.semanticweb.owlapi.model.IRI; @@ -22,23 +19,8 @@ import org.semanticweb.owlapi.model.OWLOntologyManager; import com.google.common.base.Charsets; -import com.google.common.collect.Sets; import com.google.common.io.Files; -import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.StanfordCoreNLP; -import edu.stanford.nlp.trees.Tree; -import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; -import edu.stanford.nlp.trees.semgraph.SemanticGraph; -import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation; -import edu.stanford.nlp.util.CoreMap; - /** * @author Lorenz Buehmann * @@ -73,10 +55,8 @@ * @see org.dllearner.algorithms.isle.Experiment#getDocuments() */ @Override - protected Set<TextDocument> getDocuments() { - Set<TextDocument> documents = new HashSet<TextDocument>(); - File taggedFolder = new File("tmp/tagged"); - taggedFolder.mkdirs(); + protected Set<String> getDocuments() { + Set<String> documents = new HashSet<String>(); try { RemoteDataProvider bibleByChapter = new RemoteDataProvider( new URL("http://gold.linkeddata.org/data/bible/split_by_chapter.zip")); @@ -85,9 +65,10 @@ if(!file.isDirectory() && !file.isHidden()){ try { String text = Files.toString(file, Charsets.UTF_8); -// String posTagged = getPOSTaggedText(text); -// Files.write(posTagged, new File(taggedFolder, file.getName() + ".tagged"), Charsets.UTF_8); -// documents.add(TextDocumentGenerator.getInstance().generateDocument(text)); + text = text.trim(); + if(!text.isEmpty()){ + documents.add(text); + } } catch (IOException e) { e.printStackTrace(); } @@ -98,9 +79,8 @@ } catch (IOException e) { e.printStackTrace(); } - documents.clear(); - TextDocument doc = TextDocumentGenerator.getInstance().generateDocument("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy 
every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain"); - documents.add(doc); +// documents.clear(); +// documents.add("and in that day seven women shall take hold of one man saying we will eat our own bread and wear our own apparel only let us be called by thy name to take away our reproach in that day shall the branch of the lord be beautiful and glorious and the fruit of the earth excellent and comely for them that are escaped of israel and it shall come to pass left in zion and remaineth in jerusalem shall be called holy every one that is written among the living in jerusalem when the lord shall have washed away the filth of the daughters of zion and shall have purged the blood of jerusalem from the midst thereof by the spirit of judgment and by the spirit of burning and the lord will create upon every dwelling place of mount zion and upon her assemblies a cloud and smoke by day and the shining of a flaming fire by night for upon all the glory a defence and there shall be a tabernacle for a shadow in the daytime from the heat and for a place of refuge and for a covert from storm and from rain"); return documents; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
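The r4192 change above ties Token equality and hashing to coarse word types instead of exact POS tags: two tags count as equal when both are nouns (NN*), verbs (V*), adjectives (JJ*) or adverbs (RB*), and hashCode() must collapse each group to a single bucket so the equals/hashCode contract still holds. A minimal self-contained sketch of that contract; the method bodies mirror the diff, while the class name and the Penn Treebank tag values in main are illustrative only:

public class WordTypeComparatorSketch {

    // Mirrors WordTypeComparator.sameWordType: equal iff same coarse word type,
    // falling back to exact tag comparison for all other tags (prepositions etc.).
    static boolean sameWordType(String tag1, String tag2) {
        if (tag1.startsWith("NN") && tag2.startsWith("NN")       // nouns
                || tag1.startsWith("V") && tag2.startsWith("V")  // verbs
                || tag1.startsWith("JJ") && tag2.startsWith("JJ")// adjectives
                || tag1.startsWith("RB") && tag2.startsWith("RB")) { // adverbs
            return true;
        }
        return tag1.equals(tag2);
    }

    // Mirrors WordTypeComparator.hashCode: every tag of one word type must land
    // in the same bucket, otherwise Token.equals and Token.hashCode would disagree.
    static int wordTypeHash(String tag) {
        if (tag.startsWith("NN")) return "NN".hashCode(); // nouns
        if (tag.startsWith("V"))  return "V".hashCode();  // verbs
        if (tag.startsWith("JJ")) return "JJ".hashCode(); // adjectives
        if (tag.startsWith("RB")) return "RB".hashCode(); // adverbs
        return tag.hashCode();
    }

    public static void main(String[] args) {
        System.out.println(sameWordType("NN", "NNS"));                 // true: both nouns
        System.out.println(wordTypeHash("NN") == wordTypeHash("NNS")); // true: same bucket
        System.out.println(sameWordType("NN", "VBZ"));                 // false: noun vs. verb
    }
}

The practical effect is that a singular and a plural noun with the same stemmed form now compare equal as tokens, which is what the relaxed trie matching relies on.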
From: <lor...@us...> - 2013-12-03 09:40:18
Revision: 4191 http://sourceforge.net/p/dl-learner/code/4191 Author: lorenz_b Date: 2013-12-03 09:40:14 +0000 (Tue, 03 Dec 2013) Log Message: ----------- Made semantic index serializable. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestNoCorpus.java Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/ Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -5,13 +5,14 @@ import org.dllearner.core.owl.Entity; +import java.io.Serializable; import java.util.Set; /** * @author Lorenz Buehmann * */ -public interface AnnotatedDocument extends Document { +public interface AnnotatedDocument extends Document, Serializable{ /** * Returns a set of entities which are contained in the document. 
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -3,15 +3,15 @@ */ package org.dllearner.algorithms.isle.index; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; +import java.io.Serializable; +import com.google.common.collect.ComparisonChain; + /** * @author Lorenz Buehmann * */ -public class Token { +public class Token implements Comparable<Token>, Serializable{ private String rawForm; private String stemmedForm; @@ -159,9 +159,9 @@ Token token = (Token) o; - if (!posTag.equals(token.posTag)) { - return false; - } +// if (!posTag.equals(token.posTag)) { +// return false; +// } if (!stemmedForm.equals(token.stemmedForm)) { return false; } @@ -172,31 +172,18 @@ @Override public int hashCode() { int result = stemmedForm.hashCode(); - result = 31 * result + posTag.hashCode(); +// result = 31 * result + posTag.hashCode(); return result; } - public static void main(String[] args) { - Token t1 = new Token("requirement", "requirement", "NN", false, false); - t1.addAlternativeForm("demand"); - t1.addAlternativeForm("condition"); - - Token t2 = new Token("demand", "demand", "NN", false, false); - t2.addAlternativeForm("must"); - - - Token t3 = new Token("must", "must", "NN", false, false); - t1.addAlternativeForm("condition"); - - Token t4 = new Token("mustache", "mustache", "NN", false, false); - - - Token[] tokens = new Token[]{t1, t2, t3, t4}; - - for (Token t : tokens) { - for (Token o : tokens) { - System.out.println(t + " - " + o + " --> " + t.equalsWithAlternativeForms(o)); - } - } - } + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(Token other) { + return ComparisonChain.start() + .compare(this.rawForm, other.rawForm) + .compare(this.posTag, other.posTag) + .result(); + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -48,9 +48,7 @@ curNode.children.put(t, nextNode); } curNode = nextNode; - } else { - System.out.println("ignored " + t); - } + } } curNode.entities.addAll(entities); curNode.originalTokens = new ArrayList<>(originalTokens); @@ -195,7 +193,7 @@ } String indentString = indentStringBuilder.toString(); StringBuilder sb = new StringBuilder(); - for (Map.Entry<Token, TokenTree> e : children.entrySet()) { + for (Map.Entry<Token, TokenTree> e : new TreeMap<>(children).entrySet()) { sb.append(indentString).append(e.getKey().toString()); sb.append("\n"); sb.append(e.getValue().toString(indent + 1)); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-12-03 09:13:46 UTC (rev 4190) +++ 
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -2,25 +2,10 @@ import java.util.HashMap; import java.util.HashSet; -import java.util.Map; import java.util.Set; -import org.apache.log4j.Logger; -import org.dllearner.algorithms.isle.EntityCandidateGenerator; -import org.dllearner.algorithms.isle.TextDocumentGenerator; import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.LinguisticAnnotator; -import org.dllearner.algorithms.isle.index.SemanticAnnotator; -import org.dllearner.algorithms.isle.index.TextDocument; -import org.dllearner.algorithms.isle.index.Token; -import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; -import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLAnnotation; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLLiteral; -import org.semanticweb.owlapi.model.OWLOntology; /** * Interface for an index which is able to resolve a given entity's URI to the set of documents containing @@ -29,106 +14,18 @@ * @author Lorenz Buehmann * @author Daniel Fleischhacker */ -public abstract class SemanticIndex { +public class SemanticIndex extends HashMap<Entity, Set<AnnotatedDocument>>{ + private int nrOfDocuments; - private static final Logger logger = Logger.getLogger(SemanticIndex.class.getName()); - - private SemanticAnnotator semanticAnnotator; - private SyntacticIndex syntacticIndex; - private Map<Entity, Set<AnnotatedDocument>> index; - private OWLOntology ontology; - - private int size = 0; - - public SemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex, WordSenseDisambiguation wordSenseDisambiguation, - EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) { - this.ontology = ontology; - this.syntacticIndex = syntacticIndex; - semanticAnnotator = new SemanticAnnotator(wordSenseDisambiguation, entityCandidateGenerator, linguisticAnnotator); - } - - public SemanticIndex(OWLOntology ontology) { - this.ontology = ontology; - - } - /** - * @param semanticAnnotator the semanticAnnotator to set - */ - public void setSemanticAnnotator(SemanticAnnotator semanticAnnotator) { - this.semanticAnnotator = semanticAnnotator; - } - - /** - * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. 
- */ - public void buildIndex(Set<TextDocument> documents) { - if (semanticAnnotator == null) { - throw new RuntimeException("No semantic annotator defined, must be set using the setSemanticAnnotator method"); - } - logger.info("Creating semantic index..."); - index = new HashMap<Entity, Set<AnnotatedDocument>>(); - for (TextDocument document : documents) { - logger.debug("Processing document:" + document); - AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(document); - for (Entity entity : annotatedDocument.getContainedEntities()) { - Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity); - if (existingAnnotatedDocuments == null) { - existingAnnotatedDocuments = new HashSet<AnnotatedDocument>(); - index.put(entity, existingAnnotatedDocuments); - } - existingAnnotatedDocuments.add(annotatedDocument); - } - logger.debug("Annotated document:" + annotatedDocument); - } - size = documents.size(); - logger.info("...done."); - } - - public void buildIndex(OWLAnnotationProperty annotationProperty, String language) { - Set<OWLEntity> schemaEntities = new HashSet<OWLEntity>(); - schemaEntities.addAll(ontology.getClassesInSignature()); - schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); - schemaEntities.addAll(ontology.getDataPropertiesInSignature()); - Set<TextDocument> documents = new HashSet<TextDocument>(); - for (OWLEntity entity : schemaEntities) { - String label = null; - Set<OWLAnnotation> annotations = entity.getAnnotations(ontology, annotationProperty); - for (OWLAnnotation annotation : annotations) { - if (annotation.getValue() instanceof OWLLiteral) { - OWLLiteral val = (OWLLiteral) annotation.getValue(); - if (language != null) { - if (val.hasLang(language)) { - label = val.getLiteral(); - } - - } - else { - label = val.getLiteral(); - } - } - } - if (label != null) { - documents.add(TextDocumentGenerator.getInstance().generateDocument(label)); - } - } - buildIndex(documents); - } - - /** * Returns the set of annotated documents which reference the given entity using one of its surface forms. * * @param entity entity to retrieve documents * @return documents referencing given entity */ public Set<AnnotatedDocument> getDocuments(Entity entity) { - if (index == null) { - System.err.println("You have to prebuild the index before you can use this method."); - System.exit(1); - } - - Set<AnnotatedDocument> annotatedDocuments = index.get(entity); + Set<AnnotatedDocument> annotatedDocuments = get(entity); if (annotatedDocuments == null) { annotatedDocuments = new HashSet<AnnotatedDocument>(); } @@ -141,16 +38,22 @@ * @param entity entity to return number of referencing documents for * @return number of documents for the given entity in this index */ - public int count(Entity entity) { - return index.get(entity).size(); + public int getNrOfDocumentsFor(Entity entity) { + return get(entity).size(); } + + /** + * @param nrOfDocuments the nrOfDocuments to set + */ + public void setTotalNrOfDocuments(int nrOfDocuments) { + this.nrOfDocuments = nrOfDocuments; + } + + /** + * @return the nrOfDocuments + */ + public int getTotalNrOfDocuments() { + return nrOfDocuments; + } - /** - * Returns the total number of documents contained in the index. 
- * - * @return the total number of documents contained in the index - */ - public int getSize() { - return size; - } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -3,12 +3,12 @@ */ package org.dllearner.algorithms.isle.metrics; +import java.util.HashMap; +import java.util.Map; + import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.semanticweb.owlapi.model.OWLEntity; -import java.util.HashMap; -import java.util.Map; - /** * @author Lorenz Buehmann * Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -26,7 +26,7 @@ Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); - int nrOfDocuments = index.getSize(); + int nrOfDocuments = index.getTotalNrOfDocuments(); double pA = nrOfDocuments == 0 ? 0 : ((double) documentsA.size() / (double) nrOfDocuments); double pB = nrOfDocuments == 0 ? 
0 : ((double) documentsB.size() / (double) nrOfDocuments); @@ -42,7 +42,7 @@ Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); - int nrOfDocuments = index.getSize(); + int nrOfDocuments = index.getTotalNrOfDocuments(); // System.out.println("A:" + documentsA.size()); // System.out.println("B:" + documentsB.size()); // System.out.println("AB:" + documentsAB.size()); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -13,6 +13,7 @@ import org.dllearner.algorithms.isle.index.LinguisticUtil; import org.dllearner.algorithms.isle.index.Token; import org.dllearner.core.owl.Entity; +import org.dllearner.core.owl.NamedClass; import org.dllearner.kb.OWLAPIOntology; import org.dllearner.utilities.owl.OWLAPIConverter; import org.semanticweb.owlapi.model.IRI; @@ -90,6 +91,9 @@ OWLLiteral val = (OWLLiteral) annotation.getValue(); if (val.hasLang(language)) { String label = val.getLiteral().trim(); + if(entity instanceof NamedClass){ + label = label.toLowerCase(); + } textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); } } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/Experiment.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -4,6 +4,9 @@ package org.dllearner.algorithms.isle; import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; import java.util.Collections; import java.util.HashSet; import java.util.LinkedList; @@ -16,7 +19,7 @@ import org.dllearner.algorithms.celoe.CELOE; import org.dllearner.algorithms.isle.index.TextDocument; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; -import org.dllearner.algorithms.isle.index.semantic.simple.SimpleSemanticIndex; +import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; import org.dllearner.algorithms.isle.metrics.RelevanceMetric; import org.dllearner.algorithms.isle.metrics.RelevanceUtils; @@ -42,14 +45,13 @@ import org.semanticweb.owlapi.model.OWLClass; import org.semanticweb.owlapi.model.OWLOntology; -import com.clarkparsia.sparqlowl.parser.antlr.SparqlOwlParser.whereClause_return; import com.google.common.collect.Sets; -import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import 
edu.stanford.nlp.util.CoreMap; @@ -138,8 +140,15 @@ documents = getDocuments(); // build semantic index - SemanticIndex semanticIndex = new SimpleSemanticIndex(ontology, null, false); - semanticIndex.buildIndex(documents); + SemanticIndex semanticIndex = SemanticIndexGenerator.generateIndex(documents, ontology, false); + try { + ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream("semantic-index.ser")); + oos.writeObject(semanticIndex); + oos.close(); + } catch (IOException e1) { + e1.printStackTrace(); + } + // set the relevance metric relevance = new PMIRelevanceMetric(semanticIndex); try { Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -3,16 +3,28 @@ */ package org.dllearner.algorithms.isle; -import com.google.common.base.Charsets; -import com.google.common.base.Joiner; -import com.google.common.io.Files; +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.text.DecimalFormat; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; import org.dllearner.algorithms.celoe.CELOE; -import org.dllearner.algorithms.isle.index.*; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.EntityCandidatesTrie; +import org.dllearner.algorithms.isle.index.LinguisticAnnotator; +import org.dllearner.algorithms.isle.index.RemoteDataProvider; +import org.dllearner.algorithms.isle.index.SemanticAnnotator; +import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; +import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.Token; +import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; +import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; -import org.dllearner.algorithms.isle.index.semantic.simple.SimpleSemanticIndex; -import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; -import org.dllearner.algorithms.isle.index.syntactic.TextDocumentSyntacticIndexCreator; +import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; import org.dllearner.algorithms.isle.metrics.RelevanceMetric; import org.dllearner.algorithms.isle.metrics.RelevanceUtils; @@ -31,18 +43,17 @@ import org.junit.Before; import org.junit.Test; import org.semanticweb.owlapi.apibinding.OWLManager; -import org.semanticweb.owlapi.model.*; +import org.semanticweb.owlapi.model.IRI; +import org.semanticweb.owlapi.model.OWLDataFactory; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.OWLOntologyManager; import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.text.DecimalFormat; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; +import com.google.common.base.Charsets; +import com.google.common.base.Joiner; +import com.google.common.io.Files; /** * Some tests for the ISLE algorithm. 
@@ -59,7 +70,6 @@ private RelevanceMetric relevance; private String searchField = "label"; private SemanticIndex semanticIndex; - private SyntacticIndex syntacticIndex; // we assume that the ontology is named "ontology.owl" and that all text files // are in a subdirectory called "corpus" @@ -78,7 +88,6 @@ textRetriever = new RDFSLabelEntityTextRetriever(ontology); RemoteDataProvider chapterIndexProvider = new RemoteDataProvider( new URL("http://gold.linkeddata.org/data/bible/chapter_index.zip")); - syntacticIndex = TextDocumentSyntacticIndexCreator.loadIndex(chapterIndexProvider.getLocalDirectory()); } private Set<TextDocument> createDocuments(){ @@ -139,18 +148,8 @@ } @Test - public void testSemanticIndexAnnotationProperty(){ - semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex); - semanticIndex.buildIndex(df.getRDFSLabel(), null); -// NamedClass nc = new NamedClass("http://example.com/father#father"); - Set<AnnotatedDocument> documents = semanticIndex.getDocuments(cls); - System.out.println("Documents for " + cls + ":\n" + documents); - } - - @Test public void testSemanticIndexCorpus(){ - semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex); - semanticIndex.buildIndex(createDocuments()); + semanticIndex = SemanticIndexGenerator.generateIndex(createDocuments(), ontology, false); Set<AnnotatedDocument> documents = semanticIndex.getDocuments(cls); System.out.println(documents); relevance = new PMIRelevanceMetric(semanticIndex); @@ -169,8 +168,7 @@ lp.setClassToDescribe(cls); lp.init(); - semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex); - semanticIndex.buildIndex(createBibleDocuments()); + semanticIndex = SemanticIndexGenerator.generateIndex(createBibleDocuments(), ontology, false); relevance = new PMIRelevanceMetric(semanticIndex); @@ -227,8 +225,7 @@ lp.setClassToDescribe(cls); lp.init(); - semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex, false); - semanticIndex.buildIndex(createBibleDocuments()); + semanticIndex = SemanticIndexGenerator.generateIndex(createBibleDocuments(), ontology, false); relevance = new PMIRelevanceMetric(semanticIndex); Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestNoCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestNoCorpus.java 2013-12-03 09:13:46 UTC (rev 4190) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestNoCorpus.java 2013-12-03 09:40:14 UTC (rev 4191) @@ -4,7 +4,7 @@ import java.util.Map; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; -import org.dllearner.algorithms.isle.index.semantic.simple.SimpleSemanticIndex; +import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator; import org.dllearner.algorithms.isle.index.syntactic.OWLOntologyLuceneSyntacticIndexCreator; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric; @@ -65,8 +65,7 @@ lp.setClassToDescribe(cls); lp.init(); - semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex); - semanticIndex.buildIndex(df.getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_COMMENT.getIRI()), null); + semanticIndex = SemanticIndexGenerator.generateIndex(ontology, df.getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_COMMENT.getIRI()), null, false); // semanticIndex = new SimpleSemanticIndex(ontology, syntacticIndex); // semanticIndex.buildIndex(createDocuments()); This was 
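With r4191 the index is reduced to a plain map from entities to their annotated documents, so PMIRelevanceMetric only needs three document counts plus the corpus size (getTotalNrOfDocuments). The diff shows the counts being gathered but not the final formula; the following standalone method is a sketch of the standard PMI arithmetic it presumably computes, an assumption rather than a copy of the DL-Learner code:

// Pointwise mutual information from document counts: docsA and docsB are the
// numbers of documents mentioning each entity, docsAB the size of their
// intersection, n the total number of indexed documents.
static double pmi(int docsA, int docsB, int docsAB, int n) {
    if (n == 0 || docsA == 0 || docsB == 0 || docsAB == 0) {
        return 0; // no evidence; also avoids log(0) and division by zero
    }
    double pA  = (double) docsA  / n;
    double pB  = (double) docsB  / n;
    double pAB = (double) docsAB / n;
    return Math.log(pAB / (pA * pB)); // > 0 iff A and B co-occur more than by chance
}

For example, pmi(10, 5, 4, 100) = log(0.04 / (0.1 * 0.05)) = log(8) ≈ 2.08, i.e. the two entities co-occur eight times more often than independence would predict.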
From: <dfl...@us...> - 2013-12-03 09:13:48
Revision: 4190 http://sourceforge.net/p/dl-learner/code/4190 Author: dfleischhacker Date: 2013-12-03 09:13:46 +0000 (Tue, 03 Dec 2013) Log Message: ----------- Add alternative forms to Tokens and corresponding equalsWithAlternativeForms Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-02 18:15:02 UTC (rev 4189) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-03 09:13:46 UTC (rev 4190) @@ -3,6 +3,10 @@ */ package org.dllearner.algorithms.isle.index; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + /** * @author Lorenz Buehmann * @@ -14,6 +18,8 @@ private String posTag; private boolean isPunctuation; private boolean isStopWord; + /// for storing alternative forms of this token, e.g., generated by WordNet synonyms + private HashSet<String> alternativeForms; public Token(String rawForm) { this.rawForm = rawForm; @@ -25,6 +31,7 @@ this.posTag = posTag; this.isPunctuation = isPunctuation; this.isStopWord = isStopWord; + this.alternativeForms = new HashSet<>(); } /** @@ -47,8 +54,26 @@ public String getPOSTag() { return posTag; } - - /** + + /** + * Returns the unmodifiable list of alternative surface forms for this token. These alternative forms might be + * generated by, e.g., WordNet synonym expansion. + * + * @return unmodifiable set of alternative surface forms for this token + */ + public Set<String> getAlternativeForms() { + return Collections.unmodifiableSet(alternativeForms); + } + + /** + * Adds a new surface form to the alternative forms of this token. Alternative forms are included in comparison of + * two tokens when using the {@link #equalsWithAlternativeForms}. + */ + public void addAlternativeForm(String alternativeForm) { + this.alternativeForms.add(alternativeForm); + } + + /** * @return the isPunctuation */ public boolean isPunctuation() { @@ -98,6 +123,31 @@ return "[Word: " + rawForm + " | Stemmed word: " + stemmedForm + " | POS tag: " + posTag + "]"; } + /** + * Compares the given token to this one including alternative forms. This means that tokens are considered to be + * equal iff the POS tags is the same and if the intersection of all surface forms (stemmed forms + alternative + * forms) is not empty. 
+ * + * @param other token to compare this token to + * @return true if tokens are equal considering alternative forms, otherwise false + */ + public boolean equalsWithAlternativeForms(Token other) { + if (this == other) { + return true; + } + + if (!posTag.equals(other.posTag)) { + return false; + } + + if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.contains(stemmedForm) || + alternativeForms.contains(other.stemmedForm)) { + return true; + } + + return false; + } + @Override public boolean equals(Object o) { if (this == o) { @@ -125,4 +175,28 @@ result = 31 * result + posTag.hashCode(); return result; } + + public static void main(String[] args) { + Token t1 = new Token("requirement", "requirement", "NN", false, false); + t1.addAlternativeForm("demand"); + t1.addAlternativeForm("condition"); + + Token t2 = new Token("demand", "demand", "NN", false, false); + t2.addAlternativeForm("must"); + + + Token t3 = new Token("must", "must", "NN", false, false); + t1.addAlternativeForm("condition"); + + Token t4 = new Token("mustache", "mustache", "NN", false, false); + + + Token[] tokens = new Token[]{t1, t2, t3, t4}; + + for (Token t : tokens) { + for (Token o : tokens) { + System.out.println(t + " - " + o + " --> " + t.equalsWithAlternativeForms(o)); + } + } + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
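Two things in the r4190 diff are easy to miss. First, the t1.addAlternativeForm("condition") call placed after the construction of t3 was presumably meant to be t3.addAlternativeForm(...), so t3 ends up with no alternative forms at all. Second, equalsWithAlternativeForms is symmetric but deliberately not transitive: two tokens compare equal as soon as one stemmed form occurs among the other's surface forms. A trimmed version of the demo that makes this explicit, using the constructor and methods exactly as introduced in the diff:

import org.dllearner.algorithms.isle.index.Token;

public class AlternativeFormsSketch {
    public static void main(String[] args) {
        Token t1 = new Token("requirement", "requirement", "NN", false, false);
        t1.addAlternativeForm("demand");

        Token t2 = new Token("demand", "demand", "NN", false, false);
        t2.addAlternativeForm("must");

        Token t3 = new Token("must", "must", "NN", false, false);

        System.out.println(t1.equalsWithAlternativeForms(t2)); // true:  t1 lists "demand"
        System.out.println(t2.equalsWithAlternativeForms(t3)); // true:  t2 lists "must"
        System.out.println(t1.equalsWithAlternativeForms(t3)); // false: no shared surface form
    }
}

Non-transitivity is the intended behavior here: synonym expansion should link a token to its direct alternatives without merging whole synonym chains into one equivalence class.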
From: <lor...@us...> - 2013-12-02 15:43:51
Revision: 4188 http://sourceforge.net/p/dl-learner/code/4188 Author: lorenz_b Date: 2013-12-02 15:43:48 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Refactoring ISLE. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 15:22:04 UTC (rev 4187) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 15:43:48 UTC (rev 4188) @@ -160,18 +160,7 @@ } public String toString() { - StringBuilder output = new StringBuilder(); - Map<String,FullTokenEntitySetPair> trieMap = trie.toMap(); - - for (Entry<String, FullTokenEntitySetPair> entry : trieMap.entrySet()) { - String key = entry.getKey(); - FullTokenEntitySetPair pair = entry.getValue(); - output.append(key + " (" + pair.getFullToken() + ") :\n"); - for (Entity candidate: pair.getEntitySet()) { - output.append("\t"+candidate+"\n"); - } - } - return output.toString(); + return tree.toString(); } public static void main(String[] args) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 15:22:04 UTC (rev 4187) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 15:43:48 UTC (rev 4188) @@ -16,12 +16,21 @@ private HashMap<Token, TokenTree> children; private Set<Entity> entities; private List<Token> originalTokens; + private boolean ignoreStopWords = true; public TokenTree() { this.children = new HashMap<>(); this.entities = new HashSet<>(); this.originalTokens = new ArrayList<>(); } + + /** + * If set to TRUE, stopwords like 'of, on' are ignored during creation and retrieval operations. + * @param ignoreStopWords the ignoreStopWords to set + */ + public void setIgnoreStopWords(boolean ignoreStopWords) { + this.ignoreStopWords = ignoreStopWords; + } /** * Adds all given entities to the end of the path resulting from the given tokens. 
@@ -32,12 +41,16 @@ public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) { TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); - if (nextNode == null) { - nextNode = new TokenTree(); - curNode.children.put(t, nextNode); - } - curNode = nextNode; + if(!ignoreStopWords || (ignoreStopWords && !t.isStopWord())){ + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + nextNode = new TokenTree(); + curNode.children.put(t, nextNode); + } + curNode = nextNode; + } else { + System.out.println("ignored " + t); + } } curNode.entities.addAll(entities); curNode.originalTokens = new ArrayList<>(originalTokens); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-02 15:22:04 UTC (rev 4187) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-02 15:43:48 UTC (rev 4188) @@ -27,14 +27,14 @@ @Override public Set<Annotation> annotate(TextDocument document) { Set<Annotation> annotations = new HashSet<Annotation>(); - NormalizedTextMapper mapper = new NormalizedTextMapper(document); - String content = mapper.getNormalizedText(); List<Token> matchedTokens; for (Token token : document) { matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); - Annotation annotation = new Annotation(document, matchedTokens); - annotations.add(annotation); + if(matchedTokens != null && !matchedTokens.isEmpty()){ + Annotation annotation = new Annotation(document, matchedTokens); + annotations.add(annotation); + } } return annotations; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
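The effect of the ignoreStopWords flag added to TokenTree.add in r4188 is that label paths are stored without function words, so a label like "city of refuge" and the query tokens "city refuge" collapse onto the same trie path. A minimal sketch of that insertion logic; this is a nested-map trie over plain strings whose names are illustrative, not the TokenTree API:

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;

// Minimal token trie: each node maps a token to a child node; stop words are
// skipped on insertion, so labels differing only in "of", "the", ... share a path.
class TrieNodeSketch {
    final Map<String, TrieNodeSketch> children = new HashMap<>();
    final Set<String> entities = new HashSet<>();

    void add(List<String> tokens, Set<String> entityUris, Predicate<String> isStopWord) {
        TrieNodeSketch cur = this;
        for (String t : tokens) {
            if (isStopWord.test(t)) {
                continue; // ignoreStopWords == true: drop the token entirely
            }
            cur = cur.children.computeIfAbsent(t, k -> new TrieNodeSketch());
        }
        cur.entities.addAll(entityUris); // entities live at the end of the path
    }
}

Skipping stop words at insertion time means the lookup side must skip them symmetrically, which is why the flag governs both creation and retrieval in the real class.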
From: <lor...@us...> - 2013-12-02 15:22:07
Revision: 4187 http://sourceforge.net/p/dl-learner/code/4187 Author: lorenz_b Date: 2013-12-02 15:22:04 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Refactoring ISLE. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -16,11 +16,11 @@ /** - * Gets set of candidate entities for an exact given String + * Gets set of candidate entities for a list of tokens * @param s * @return */ - public Set<Entity> getCandidateEntities(String s); + public Set<Entity> getCandidateEntities(List<Token> tokens); /** @@ -31,12 +31,12 @@ * @param s the string to search in the trie * @return string generating the path of the longest match in the trie */ - public String getGeneratingStringForLongestMatch(String s); + public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens); /** * Gets the longest matching string * @param s * @return */ - public String getLongestMatchingText(String s); + public List<Token> getLongestMatchingText(List<Token> tokens); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -15,6 +15,6 @@ * @param document the document to get annotation for * @return set of annotations for the given document */ - Set<Annotation> annotate(Document document); + Set<Annotation> annotate(TextDocument document); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -145,21 +145,18 @@ } @Override - public Set<Entity> getCandidateEntities(String s) { - FullTokenEntitySetPair res = trie.get(s); - return res == null ? 
new HashSet<Entity>() : trie.get(s).getEntitySet(); + public Set<Entity> getCandidateEntities(List<Token> tokens) { + return tree.get(tokens); } @Override - public String getGeneratingStringForLongestMatch(String s) { - CharSequence match = trie.getLongestMatch(s); - return (match!=null) ? trie.get(match).getFullToken() : null; + public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { + return tree.getOriginalTokensForLongestMatch(tokens); } @Override - public String getLongestMatchingText(String s) { - CharSequence match = trie.getLongestMatch(s); - return (match!=null) ? match.toString() : null; + public List<Token> getLongestMatchingText(List<Token> tokens) { + return tree.getLongestMatch(tokens); } public String toString() { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -28,7 +28,7 @@ } public Set<Entity> getCandidates(Annotation annotation) { - return candidatesTrie.getCandidateEntities(annotation.getMatchedString()); + return candidatesTrie.getCandidateEntities(annotation.getTokens()); } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-02 15:20:16 UTC (rev 4186) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -1,6 +1,7 @@ package org.dllearner.algorithms.isle.index; import java.util.HashSet; +import java.util.List; import java.util.Set; /** @@ -24,26 +25,17 @@ * @return the set of annotation for the given document */ @Override - public Set<Annotation> annotate(Document document) { + public Set<Annotation> annotate(TextDocument document) { Set<Annotation> annotations = new HashSet<Annotation>(); NormalizedTextMapper mapper = new NormalizedTextMapper(document); String content = mapper.getNormalizedText(); - for (int i = 0; i < content.length(); i++) { - if (Character.isWhitespace(content.charAt(i))) { - continue; - } - String unparsed = content.substring(i); - String match = candidatesTrie.getLongestMatchingText(unparsed); - if (match != null && !match.isEmpty()) { - Annotation annotation = mapper.getOriginalAnnotationForPosition(i, match.length()); - annotation.setMatchedString(match); - annotations.add(annotation); - i += match.length() - 1; - } - while (!Character.isWhitespace(content.charAt(i)) && i < content.length()) { - i++; - } - } + + List<Token> matchedTokens; + for (Token token : document) { + matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); + Annotation annotation = new Annotation(document, matchedTokens); + annotations.add(annotation); + } return annotations; } Modified: trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java =================================================================== --- trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-02 15:20:16 UTC (rev 
4186) +++ trunk/components-core/src/test/java/org/dllearner/algorithms/isle/ISLETestCorpus.java 2013-12-02 15:22:04 UTC (rev 4187) @@ -6,6 +6,7 @@ import com.google.common.base.Charsets; import com.google.common.base.Joiner; import com.google.common.io.Files; + import org.dllearner.algorithms.celoe.CELOE; import org.dllearner.algorithms.isle.index.*; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; @@ -31,6 +32,7 @@ import org.junit.Test; import org.semanticweb.owlapi.apibinding.OWLManager; import org.semanticweb.owlapi.model.*; + import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; import java.io.File; @@ -38,6 +40,7 @@ import java.net.URL; import java.text.DecimalFormat; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; @@ -124,7 +127,7 @@ // @Test public void testTextRetrieval() { System.out.println("Text for entity " + cls + ":"); - Map<String, Double> relevantText = textRetriever.getRelevantText(cls); + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(cls); System.out.println(Joiner.on("\n").join(relevantText.entrySet())); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
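After the r4187 interface change (String to List&lt;Token&gt; throughout EntityCandidatesTrie), the annotator no longer scans character offsets; starting at each token of the document it asks the trie for the longest token prefix that is a known label. Continuing the TrieNodeSketch from the sketch after the r4188 message above, the retrieval side could look like this; it is an assumption about the unshown TokenTree internals, not a copy of them:

// Longest-match retrieval: walk down the trie along the token sequence and
// remember the deepest node that actually stores entities. Returns the length
// of the longest matching prefix, or 0 if no prefix is a known label.
static int longestMatch(TrieNodeSketch root, List<String> tokens) {
    TrieNodeSketch cur = root;
    int best = 0;
    for (int i = 0; i < tokens.size(); i++) {
        cur = cur.children.get(tokens.get(i));
        if (cur == null) {
            break; // no stored label continues with this token
        }
        if (!cur.entities.isEmpty()) {
            best = i + 1; // tokens[0..i] is a stored label
        }
    }
    return best;
}

The null/empty guard later added to TrieLinguisticAnnotator.annotate (only create an Annotation when matchedTokens is non-empty) corresponds to the best == 0 case here.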
From: <dfl...@us...> - 2013-12-02 15:20:20
Revision: 4186 http://sourceforge.net/p/dl-learner/code/4186 Author: dfleischhacker Date: 2013-12-02 15:20:16 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Re-enabled ignorePunctuation Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-02 15:19:06 UTC (rev 4185) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-02 15:20:16 UTC (rev 4186) @@ -112,13 +112,20 @@ public List<Token> getTokensStartingAtToken(Token start, boolean ignorePunctuation) { ArrayList<Token> tokens = new ArrayList<Token>(); + int relevantTokens = 0; boolean found = false; - for (int i = 0; i < this.size(); i++) { - Token t = this.get(i); - if (t == start) { - return this.subList(i, this.size()); + for (Token t : this) { + if (found) { + tokens.add(t); + if (!ignorePunctuation || !t.isPunctuation()) { + relevantTokens++; + } } + else if (t == start) { + found = true; + tokens.add(t); + } } return tokens; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
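The r4186 hunk is cut off before relevantTokens is used, but the shape is clear: locate the start token by identity (t == start, i.e. this exact occurrence, not any equal token) and collect the remainder of the document, treating punctuation specially. A hedged sketch of that suffix extraction, simplified relative to the diff in that it drops punctuation tokens instead of merely counting them, and assuming the project's Token class with its isPunctuation() accessor:

import java.util.ArrayList;
import java.util.List;

// Collect all tokens from `start` to the end of the document. The start token
// is found by identity so that repeated words select the right occurrence.
static List<Token> tokensStartingAt(List<Token> document, Token start, boolean ignorePunctuation) {
    List<Token> result = new ArrayList<>();
    boolean found = false;
    for (Token t : document) {
        if (!found && t == start) {
            found = true; // from here on, every token belongs to the suffix
        }
        if (found && (!ignorePunctuation || !t.isPunctuation())) {
            result.add(t);
        }
    }
    return result;
}

This is the suffix that TrieLinguisticAnnotator feeds into getLongestMatchingText for each document position, so skipping punctuation here is what lets labels match across commas in the text.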