Thread: [DL-Learner SVN] SF.net SVN: dl-learner:[4024] trunk/components-core/src/main/java/org/dllearner/a

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4024
          http://sourceforge.net/p/dl-learner/code/4024
Author:   dfleischhacker
Date:     2013-08-19 09:52:57 +0000 (Mon, 19 Aug 2013)
Log Message:
-----------
TR API: Document instead of String for documents

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java	                        (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java	2013-08-19 09:52:57 UTC (rev 4024)
@@ -0,0 +1,24 @@
+package org.dllearner.algorithms.isle.index;
+
+/**
+ * Interface for classes representing documents.
+ *
+ * @author Daniel Fleischhacker
+ */
+public interface Document {
+    /**
+     * Returns the cleaned content of this document represented as a string, i.e., the content with markup and
+     * other structure removed. The raw content can be retrieved using {@link #getRawContent}.
+     * Implementations may provide additional methods for retrieving more specialized content formats.
+     *
+     * @return this document's text content
+     */
+    public String getContent();
+
+    /**
+     * Returns the uncleaned content of this document, i.e., the content as originally retrieved, as a string.
+     *
+     * @return uncleaned content of this document
+     */
+    public String getRawContent();
+}

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java	2013-08-15 09:42:17 UTC (rev 4023)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java	2013-08-19 09:52:57 UTC (rev 4024)
@@ -3,11 +3,6 @@
  */
 package org.dllearner.algorithms.isle.index;
 
-import java.io.File;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
-
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
@@ -22,6 +17,11 @@
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Version;
 
+import java.io.File;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
 /**
  * @author Lorenz Buehmann
  *
@@ -41,26 +41,26 @@
 		parser = new QueryParser( Version.LUCENE_43, searchField, analyzer );
 	}
 	
-	public LuceneSyntacticIndex(Directory directory, String seachField) throws Exception {
-		this(DirectoryReader.open(directory), seachField);
+	public LuceneSyntacticIndex(Directory directory, String searchField) throws Exception {
+		this(DirectoryReader.open(directory), searchField);
 	}
 	
-	public LuceneSyntacticIndex(String indexDirectory, String seachField) throws Exception {
-		this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), seachField);
+	public LuceneSyntacticIndex(String indexDirectory, String searchField) throws Exception {
+		this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField);
 	}
 
 	/* (non-Javadoc)
 	 * @see org.dllearner.algorithms.isle.SyntacticIndex#getDocuments(java.lang.String)
 	 */
 	@Override
-	public Set<String> getDocuments(String searchString) {
-		Set<String> documents = new HashSet<String>();
+	public Set<org.dllearner.algorithms.isle.index.Document> getDocuments(String searchString) {
+		Set<org.dllearner.algorithms.isle.index.Document> documents = new HashSet<org.dllearner.algorithms.isle.index.Document>();
 		try {
 			Query query = parser.parse(searchString);
 			ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs;
 			for (int i = 0; i < result.length; i++) {
 				Document doc = searcher.doc(result[i].doc);
-				documents.add(doc.get(searchField));
+				documents.add(new TextDocument(doc.get(searchField)));
 			}
 		} catch (ParseException e) {
 			e.printStackTrace();

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java	2013-08-15 09:42:17 UTC (rev 4023)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java	2013-08-19 09:52:57 UTC (rev 4024)
@@ -3,13 +3,8 @@
  */
 package org.dllearner.algorithms.isle.index;
 
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
@@ -19,16 +14,14 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
-import org.semanticweb.owlapi.model.OWLAnnotation;
-import org.semanticweb.owlapi.model.OWLAnnotationProperty;
-import org.semanticweb.owlapi.model.OWLDataFactory;
-import org.semanticweb.owlapi.model.OWLEntity;
-import org.semanticweb.owlapi.model.OWLLiteral;
-import org.semanticweb.owlapi.model.OWLOntology;
+import org.semanticweb.owlapi.model.*;
 import org.semanticweb.owlapi.vocab.OWLRDFVocabulary;
-
 import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl;
 
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
 /**
  * Creates a Lucene Index for the labels if classes and properties.
  * @author Lorenz Buehmann
@@ -61,8 +54,8 @@
 		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
 		IndexWriter writer = new IndexWriter(directory, indexWriterConfig);
 		System.out.println( "Creating index ..." );
-		
-		Set<Document> luceneDocuments = new HashSet<Document>();
+
+        Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>();
         FieldType stringType = new FieldType(StringField.TYPE_STORED);
         stringType.setStoreTermVectors(false);
         FieldType textType = new FieldType(TextField.TYPE_STORED);
@@ -81,7 +74,7 @@
 			}
 			
 			if(label != null){
-				Document luceneDocument = new Document();
+                org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document();
 	            luceneDocument.add(new Field("uri", entity.toStringID(), stringType));
 	            luceneDocument.add(new Field(searchField, label, textType));
 	            luceneDocuments.add(luceneDocument);

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java	2013-08-15 09:42:17 UTC (rev 4023)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java	2013-08-19 09:52:57 UTC (rev 4024)
@@ -1,35 +1,37 @@
-/**
- * 
- */
 package org.dllearner.algorithms.isle.index;
 
+import org.dllearner.core.owl.Entity;
+
 import java.util.Set;
 
-import org.dllearner.core.owl.Entity;
-
 /**
- * This class 
+ * Interface for an index which is able to resolve a given entity's URI to the set of documents containing
+ * this entity, i.e., documents which contain words disambiguated to the given entity.
+ *
  * @author Lorenz Buehmann
- *
+ * @author Daniel Fleischhacker
  */
 public interface SemanticIndex {
+    /**
+     * Returns the set of documents which reference the given entity using one of its surface forms.
+     *
+     * @param entity entity to retrieve documents for
+     * @return documents referencing given entity
+     */
+    public Set<Document> getDocuments(Entity entity);
 
-	/**
-	 * This method returns a set of documents for the given entity.
-	 * @param entity
-	 * @return
-	 */
-	Set<String> getDocuments(Entity entity);
-	/**
-	 * This method returns the number of documents for the given entity.
-	 * @param entity
-	 * @return
-	 */
-	int count(Entity entity);
-	/**
-	 * This methods returns the total number of documents contained in the index.
-	 * @return the total number of documents contained in the index
-	 */
-	int getSize();
+    /**
+     * Returns the number of documents for the given entity.
+     *
+     * @param entity entity to return number of referencing documents for
+     * @return number of documents for the given entity in this index
+     */
+    public int count(Entity entity);
 
+    /**
+     * Returns the total number of documents contained in the index.
+     *
+     * @return the total number of documents contained in the index
+     */
+    public int getSize();
 }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java	2013-08-15 09:42:17 UTC (rev 4023)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java	2013-08-19 09:52:57 UTC (rev 4024)
@@ -3,20 +3,20 @@
  */
 package org.dllearner.algorithms.isle.index;
 
+import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
+import org.dllearner.core.owl.Entity;
+import org.semanticweb.owlapi.model.OWLOntology;
+
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 
-import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
-import org.dllearner.core.owl.Entity;
-import org.semanticweb.owlapi.model.OWLOntology;
-
 /**
  * @author Lorenz Buehmann
  *
  */
-public class SimpleSemanticIndex implements SemanticIndex{
+public class SimpleSemanticIndex implements SemanticIndex {
 	
 	private SyntacticIndex syntacticIndex;
 	private RDFSLabelEntityTextRetriever labelRetriever;
@@ -34,8 +34,8 @@
 	 * @see org.dllearner.algorithms.isle.SemanticIndex#getDocuments(org.dllearner.core.owl.Entity)
 	 */
 	@Override
-	public Set<String> getDocuments(Entity entity) {
-		Set<String> documents = new HashSet<String>();
+	public Set<Document> getDocuments(Entity entity) {
+		Set<Document> documents = new HashSet<Document>();
 		Map<String, Double> relevantText = labelRetriever.getRelevantText(entity);
 		
 		for (Entry<String, Double> entry : relevantText.entrySet()) {

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java	2013-08-15 09:42:17 UTC (rev 4023)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java	2013-08-19 09:52:57 UTC (rev 4024)
@@ -1,32 +1,41 @@
 /**
- * 
+ *
  */
 package org.dllearner.algorithms.isle.index;
 
 import java.util.Set;
 
 /**
+ * Interface for a syntactic index, e.g., a basic string-based inverted index.
+ *
  * @author Lorenz Buehmann
- *
+ * @author Daniel Fleischhacker
  */
 public interface SyntacticIndex {
 
-	/**
-	 * This method returns a set of documents based on how the underlying index is processing the given search string.
-	 * @param searchString
-	 * @return
-	 */
-	Set<String> getDocuments(String searchString);
-	/**
-	 * This method returns the number of documents based on how the underlying index is processing the given search string.
-	 * @param searchString
-	 * @return
-	 */
-	int count(String searchString);
-	/**
-	 * This methods returns the total number of documents contained in the index.
-	 * @return the total number of documents contained in the index
-	 */
-	int getSize();
-	
+    /**
+     * Returns a set of documents based on how the underlying index is processing the given
+     * search string.
+     *
+     * @param searchString query specifying the documents to retrieve
+     * @return set of documents retrieved based on the given query string
+     */
+    Set<Document> getDocuments(String searchString);
+
+    /**
+     * Returns the number of documents based on how the underlying index is processing the
+     * given search string.
+     *
+     * @param searchString query specifying the documents to include in the number of documents
+     * @return number of documents retrieved based on the given query string
+     */
+    int count(String searchString);
+
+    /**
+     * Returns the total number of documents contained in the index.
+     *
+     * @return the total number of documents contained in the index
+     */
+    int getSize();
+
 }

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java	                        (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java	2013-08-19 09:52:57 UTC (rev 4024)
@@ -0,0 +1,29 @@
+package org.dllearner.algorithms.isle.index;
+
+/**
+ * A simple text document without further formatting or markup.
+ *
+ * @author Daniel Fleischhacker
+ */
+public class TextDocument implements Document {
+    private String content;
+
+    public TextDocument(String content) {
+        this.content = content;
+    }
+
+    @Override
+    public String getContent() {
+        return content;
+    }
+
+    /**
+     * Returns the text content of this document. As a plain text document has no markup, this is the same data as {@link #getContent()}.
+     *
+     * @return text content of this document
+     */
+    @Override
+    public String getRawContent() {
+        return content;
+    }
+}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





Thread: [DL-Learner SVN] SF.net SVN: dl-learner:[4024] trunk/components-core/src/main/java/org/dllearner/a

dl-learner-svn