From: <dfl...@us...> - 2013-08-19 09:53:00
|
Revision: 4024 http://sourceforge.net/p/dl-learner/code/4024 Author: dfleischhacker Date: 2013-08-19 09:52:57 +0000 (Mon, 19 Aug 2013) Log Message: ----------- TR API: Document instead of String for documents Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -0,0 +1,24 @@ +package org.dllearner.algorithms.isle.index; + +/** + * Interface for classes representing documents. + * + * @author Daniel Fleischhacker + */ +public interface Document { + /** + * Returns the cleaned content of this document represented as a string. This returns the cleaned content, + * thus markup and other structure is removed. The raw content can be retrieved using {@link #getRawContent}. + * Methods for retrieving more specialized content formats might be implemented by the actual implementations. + * + * @return this document's text content + */ + public String getContent(); + + /** + * Returns the uncleaned content, i.e., as originally retrieved, of this document represented as string. + * + * @return uncleaned content of this document + */ + public String getRawContent(); +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -3,11 +3,6 @@ */ package org.dllearner.algorithms.isle.index; -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; @@ -22,6 +17,11 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + /** * @author Lorenz Buehmann * @@ -41,26 +41,26 @@ parser = new QueryParser( Version.LUCENE_43, searchField, analyzer ); } - public LuceneSyntacticIndex(Directory directory, String seachField) throws Exception { - this(DirectoryReader.open(directory), seachField); + public LuceneSyntacticIndex(Directory directory, String searchField) throws Exception { + this(DirectoryReader.open(directory), searchField); } - public LuceneSyntacticIndex(String indexDirectory, String seachField) throws Exception { - this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), seachField); + public LuceneSyntacticIndex(String indexDirectory, String searchField) throws Exception { + this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField); } /* (non-Javadoc) * @see org.dllearner.algorithms.isle.SyntacticIndex#getDocuments(java.lang.String) */ @Override - public Set<String> getDocuments(String searchString) { - Set<String> documents = new HashSet<String>(); + public Set<org.dllearner.algorithms.isle.index.Document> getDocuments(String searchString) { + Set<org.dllearner.algorithms.isle.index.Document> documents = new HashSet<org.dllearner.algorithms.isle.index.Document>(); try { Query query = parser.parse(searchString); ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; for (int i = 0; i < result.length; i++) { Document doc = searcher.doc(result[i].doc); - documents.add(doc.get(searchField)); + documents.add(new TextDocument(doc.get(searchField))); } } catch (ParseException e) { e.printStackTrace(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -3,13 +3,8 @@ */ package org.dllearner.algorithms.isle.index; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; @@ -19,16 +14,14 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; -import org.semanticweb.owlapi.model.OWLAnnotation; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLDataFactory; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLLiteral; -import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.*; import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; - import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + /** * Creates a Lucene Index for the labels if classes and properties. * @author Lorenz Buehmann @@ -61,8 +54,8 @@ IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(directory, indexWriterConfig); System.out.println( "Creating index ..." ); - - Set<Document> luceneDocuments = new HashSet<Document>(); + + Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>(); FieldType stringType = new FieldType(StringField.TYPE_STORED); stringType.setStoreTermVectors(false); FieldType textType = new FieldType(TextField.TYPE_STORED); @@ -81,7 +74,7 @@ } if(label != null){ - Document luceneDocument = new Document(); + org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); luceneDocument.add(new Field("uri", entity.toStringID(), stringType)); luceneDocument.add(new Field(searchField, label, textType)); luceneDocuments.add(luceneDocument); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -1,35 +1,37 @@ -/** - * - */ package org.dllearner.algorithms.isle.index; +import org.dllearner.core.owl.Entity; + import java.util.Set; -import org.dllearner.core.owl.Entity; - /** - * This class + * Interface for an index which is able to resolve a given entity's URI to the set of documents containing + * this entity, i.e., documents which contain words disambiguated to the given entity. + * * @author Lorenz Buehmann - * + * @author Daniel Fleischhacker */ public interface SemanticIndex { + /** + * Returns the set of documents which reference the given entity using one of its surface forms. + * + * @param entity entity to retrieve documents + * @return documents referencing given entity + */ + public Set<Document> getDocuments(Entity entity); - /** - * This method returns a set of documents for the given entity. - * @param entity - * @return - */ - Set<String> getDocuments(Entity entity); - /** - * This method returns the number of documents for the given entity. - * @param entity - * @return - */ - int count(Entity entity); - /** - * This methods returns the total number of documents contained in the index. - * @return the total number of documents contained in the index - */ - int getSize(); + /** + * Returns the number of documents for the given entity. + * + * @param entity entity to return number of referencing documents for + * @return number of documents for the given entity in this index + */ + public int count(Entity entity); + /** + * Returns the total number of documents contained in the index. + * + * @return the total number of documents contained in the index + */ + public int getSize(); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -3,20 +3,20 @@ */ package org.dllearner.algorithms.isle.index; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + import java.util.HashSet; import java.util.Map; import java.util.Map.Entry; import java.util.Set; -import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; -import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLOntology; - /** * @author Lorenz Buehmann * */ -public class SimpleSemanticIndex implements SemanticIndex{ +public class SimpleSemanticIndex implements SemanticIndex { private SyntacticIndex syntacticIndex; private RDFSLabelEntityTextRetriever labelRetriever; @@ -34,8 +34,8 @@ * @see org.dllearner.algorithms.isle.SemanticIndex#getDocuments(org.dllearner.core.owl.Entity) */ @Override - public Set<String> getDocuments(Entity entity) { - Set<String> documents = new HashSet<String>(); + public Set<Document> getDocuments(Entity entity) { + Set<Document> documents = new HashSet<Document>(); Map<String, Double> relevantText = labelRetriever.getRelevantText(entity); for (Entry<String, Double> entry : relevantText.entrySet()) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -1,32 +1,41 @@ /** - * + * */ package org.dllearner.algorithms.isle.index; import java.util.Set; /** + * Interface for a syntactic index, e.g., a basic string-based inverted index. + * * @author Lorenz Buehmann - * + * @author Daniel Fleischhacker */ public interface SyntacticIndex { - /** - * This method returns a set of documents based on how the underlying index is processing the given search string. - * @param searchString - * @return - */ - Set<String> getDocuments(String searchString); - /** - * This method returns the number of documents based on how the underlying index is processing the given search string. - * @param searchString - * @return - */ - int count(String searchString); - /** - * This methods returns the total number of documents contained in the index. - * @return the total number of documents contained in the index - */ - int getSize(); - + /** + * Returns a set of documents based on how the underlying index is processing the given + * search string. + * + * @param searchString query specifying the documents to retrieve + * @return set of documents retrieved based on the given query string + */ + Set<Document> getDocuments(String searchString); + + /** + * Returns the number of documents based on how the underlying index is processing the + * given search string. + * + * @param searchString query specifying the documents to include in the number of documents + * @return number of documents retrieved based on the given query string + */ + int count(String searchString); + + /** + * Returns the total number of documents contained in the index. + * + * @return the total number of documents contained in the index + */ + int getSize(); + } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -0,0 +1,29 @@ +package org.dllearner.algorithms.isle.index; + +/** + * A simple text document without further formatting or markup. + * + * @author Daniel Fleischhacker + */ +public class TextDocument implements Document { + private String content; + + public TextDocument(String content) { + this.content = content; + } + + @Override + public String getContent() { + return content; + } + + /** + * The text content of this document. Returns the same data as {@link #getContent()}. + * + * @return text content of this document + */ + @Override + public String getRawContent() { + return content; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |