From: <lor...@us...> - 2013-09-03 16:48:19
|
Revision: 4040 http://sourceforge.net/p/dl-learner/code/4040 Author: lorenz_b Date: 2013-09-03 16:48:16 +0000 (Tue, 03 Sep 2013) Log Message: ----------- Added first implementation of semantic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndexFactory.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -6,7 +6,6 @@ import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.WordSenseDisambiguation; import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLOntology; /** * Provides methods to annotate documents. @@ -15,7 +14,6 @@ */ public class SemanticAnnotator { - private OWLOntology ontology; private WordSenseDisambiguation wordSenseDisambiguation; private EntityCandidateGenerator entityCandidateGenerator; private LinguisticAnnotator linguisticAnnotator; @@ -26,9 +24,8 @@ * * @param ontology the ontology to use entities from */ - public SemanticAnnotator(OWLOntology ontology, WordSenseDisambiguation wordSenseDisambiguation, + public SemanticAnnotator(WordSenseDisambiguation wordSenseDisambiguation, EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) { - this.ontology = ontology; this.wordSenseDisambiguation = wordSenseDisambiguation; this.entityCandidateGenerator = entityCandidateGenerator; this.linguisticAnnotator = linguisticAnnotator; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -1,11 +1,19 @@ package org.dllearner.algorithms.isle.index.semantic; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.WordSenseDisambiguation; import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.Document; +import org.dllearner.algorithms.isle.index.LinguisticAnnotator; +import org.dllearner.algorithms.isle.index.SemanticAnnotator; +import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; -import java.util.Set; - /** * Interface for an index which is able to resolve a given entity's URI to the set of documents containing * this entity, i.e., documents which contain words disambiguated to the given entity. @@ -13,14 +21,51 @@ * @author Lorenz Buehmann * @author Daniel Fleischhacker */ -public interface SemanticIndex { +public abstract class SemanticIndex { + + private SemanticAnnotator semanticAnnotator; + private SyntacticIndex syntacticIndex; + private Map<Entity, Set<AnnotatedDocument>> index; + private OWLOntology ontology; + + public SemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex, WordSenseDisambiguation wordSenseDisambiguation, + EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) { + this.ontology = ontology; + this.syntacticIndex = syntacticIndex; + semanticAnnotator = new SemanticAnnotator(wordSenseDisambiguation, entityCandidateGenerator, linguisticAnnotator); + } + + public SemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex, SemanticAnnotator semanticAnnotator) { + this.semanticAnnotator = semanticAnnotator; + } + + /** + * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. + */ + public void buildIndex(Set<TextDocument> documents){ + for (TextDocument document : documents) { + AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(document); + for (Entity entity : annotatedDocument.getContainedEntities()) { + Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity); + if(existingAnnotatedDocuments == null){ + existingAnnotatedDocuments = new HashSet<AnnotatedDocument>(); + index.put(entity, existingAnnotatedDocuments); + } + existingAnnotatedDocuments.add(annotatedDocument); + } + } + } + /** * Returns the set of annotated documents which reference the given entity using one of its surface forms. * * @param entity entity to retrieve documents * @return documents referencing given entity */ - public Set<AnnotatedDocument> getDocuments(Entity entity); + public Set<AnnotatedDocument> getDocuments(Entity entity){ + Set<AnnotatedDocument> annotatedDocuments = index.get(entity); + return annotatedDocuments; + } /** * Returns the number of documents for the given entity. @@ -28,12 +73,16 @@ * @param entity entity to return number of referencing documents for * @return number of documents for the given entity in this index */ - public int count(Entity entity); + public int count(Entity entity){ + return index.get(entity).size(); + } /** * Returns the total number of documents contained in the index. * * @return the total number of documents contained in the index */ - public int getSize(); + public int getSize(){ + return index.size(); + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -3,28 +3,20 @@ */ package org.dllearner.algorithms.isle.index.semantic.simple; -import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.Document; +import org.dllearner.algorithms.isle.RandomWordSenseDisambiguation; +import org.dllearner.algorithms.isle.index.SimpleEntityCandidateGenerator; +import org.dllearner.algorithms.isle.index.SimpleLinguisticAnnotator; +import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; -import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; -import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; -import java.util.HashSet; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - /** * A semantic index which returns all documents which contain at least one of the labels assigned to a specific * entity in a provided ontology. * * @author Lorenz Buehmann */ -public class SimpleSemanticIndex implements SemanticIndex { - private SyntacticIndex syntacticIndex; - private RDFSLabelEntityTextRetriever labelRetriever; +public class SimpleSemanticIndex extends SemanticIndex { /** * Initializes the semantic index to use {@code ontology} for finding all labels of an entity and @@ -34,41 +26,11 @@ * @param syntacticIndex index to query for documents containing the labels */ public SimpleSemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex) { - this.syntacticIndex = syntacticIndex; - labelRetriever = new RDFSLabelEntityTextRetriever(ontology); + super(ontology, + syntacticIndex, + new RandomWordSenseDisambiguation(ontology), + new SimpleEntityCandidateGenerator(ontology), + new SimpleLinguisticAnnotator()); } - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SemanticIndex#getDocuments(org.dllearner.core.owl.Entity) - */ - @Override - public Set<AnnotatedDocument> getDocuments(Entity entity) { - Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(); - Map<String, Double> relevantText = labelRetriever.getRelevantText(entity); - - for (Entry<String, Double> entry : relevantText.entrySet()) { - String label = entry.getKey(); - documents.addAll(syntacticIndex.getDocuments(label)); - } - - return documents; - } - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SemanticIndex#count(java.lang.String) - */ - @Override - public int count(Entity entity) { - return getDocuments(entity).size(); - } - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SemanticIndex#getSize() - */ - @Override - public int getSize() { - return syntacticIndex.getSize(); - } - - } Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndexFactory.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndexFactory.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndexFactory.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -1,40 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle.index.semantic.simple; - -import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndexFactory; -import org.semanticweb.owlapi.model.OWLOntology; - -import java.io.File; - -/** - * This gets a syntactic index and returns a semantic index by applying WSD etc. - * - * @author Lorenz Buehmann - * @author Daniel Fleischhacker - */ -public class SimpleSemanticIndexFactory implements SemanticIndexFactory { - private OWLOntology ontology; - private SyntacticIndex syntacticIndex; - - /** - * Initializes a semantic index factory for creating simple semantic indexes. Simple semantic indexes use - * the labels assigned to an entity in {@code ontology} as its surface forms and return the all documents - * from the given syntactic index which contain at least one of these surface forms. - * - * @param syntacticIndex the syntactic index in which occurrences of the labels are searched - * @param ontology the ontology retrieve the entities' labels from - */ - public SimpleSemanticIndexFactory(SyntacticIndex syntacticIndex, OWLOntology ontology) { - this.syntacticIndex = syntacticIndex; - this.ontology = ontology; - } - - @Override - public SemanticIndex createIndex(File inputDirectory) { - return new SimpleSemanticIndex(ontology, syntacticIndex); - } -} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -3,12 +3,13 @@ */ package org.dllearner.algorithms.isle.metrics; -import com.google.common.collect.Sets; -import org.dllearner.algorithms.isle.index.Document; +import java.util.Set; + +import org.dllearner.algorithms.isle.index.AnnotatedDocument; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.core.owl.Entity; -import java.util.Set; +import com.google.common.collect.Sets; /** * @author Lorenz Buehmann @@ -22,9 +23,9 @@ @Override public double getRelevance(Entity entityA, Entity entityB){ - Set<Document> documentsA = index.getDocuments(entityA); - Set<Document> documentsB = index.getDocuments(entityB); - Set<Document> documentsAB = Sets.intersection(documentsA, documentsB); + Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); + Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); + Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); int nrOfDocuments = index.getSize(); double dPClass = nrOfDocuments == 0 ? 0 : ((double) documentsA.size() / (double) nrOfDocuments); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |