Thread: [DL-Learner SVN] SF.net SVN: dl-learner:[4126] trunk/components-core/src/main/java/org/dllearner/

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4126
          http://sourceforge.net/p/dl-learner/code/4126
Author:   dfleischhacker
Date:     2013-10-22 14:08:14 +0000 (Tue, 22 Oct 2013)
Log Message:
-----------
Add TextDocumentSyntacticIndexCreator for creating Lucene indexes from text files

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java

Copied: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java (from rev 4123, trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java)
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java	                        (rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java	2013-10-22 14:08:14 UTC (rev 4126)
@@ -0,0 +1,93 @@
+/**
+ * 
+ */
+package org.dllearner.algorithms.isle.index.syntactic;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.SimpleFSDirectory;
+import org.apache.lucene.util.Version;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Creates a syntactic index from text files stored on disk
+ *
+ */
+public class TextDocumentSyntacticIndexCreator {
+
+	private Directory indexDirectory;
+    private final File inputDirectory;
+    private final static String searchField = "text";
+
+    public TextDocumentSyntacticIndexCreator(File inputDirectory, File indexDirectory)
+            throws IOException {
+        this.indexDirectory = new SimpleFSDirectory(indexDirectory);
+        this.inputDirectory = inputDirectory;
+    }
+
+    public SyntacticIndex buildIndex() throws Exception{
+		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
+		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
+		IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig);
+		System.out.println( "Creating index ..." );
+
+        Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>();
+        FieldType stringType = new FieldType(StringField.TYPE_STORED);
+        stringType.setStoreTermVectors(false);
+        FieldType textType = new FieldType(TextField.TYPE_STORED);
+        textType.setStoreTermVectors(false);
+		
+		for (File f : inputDirectory.listFiles()) {
+            if (!f.getName().endsWith(".txt")) {
+                continue;
+            }
+            org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document();
+            luceneDocument.add(new Field("uri", f.toURI().toString(), stringType));
+
+            StringBuilder content = new StringBuilder();
+            BufferedReader reader = new BufferedReader(new FileReader(f));
+
+            String line;
+            while ((line = reader.readLine()) != null) {
+                content.append(line);
+                content.append("\n");
+            }
+            reader.close();
+
+            luceneDocument.add(new Field(searchField, content.toString(), textType));
+            luceneDocuments.add(luceneDocument);
+        }
+        writer.addDocuments(luceneDocuments);
+		
+		System.out.println("Done.");
+		writer.close();
+		
+		return new LuceneSyntacticIndex(indexDirectory, searchField);
+	}
+
+    public static SyntacticIndex loadIndex(File indexDirectory) throws Exception {
+        return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField);
+    }
+
+    public static void main(String[] args) throws Exception {
+        if (args.length != 2) {
+            System.err.println("Usage: <input director> <index directory>");
+            System.exit(1);
+            return;
+        }
+        new TextDocumentSyntacticIndexCreator(new File(args[0]), new File(args[1])).buildIndex();
+    }
+}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





Thread: [DL-Learner SVN] SF.net SVN: dl-learner:[4126] trunk/components-core/src/main/java/org/dllearner/

dl-learner-svn