From: <dfl...@us...> - 2013-10-22 14:08:18
|
/*
 * Revision: 4126 http://sourceforge.net/p/dl-learner/code/4126
 * Author: dfleischhacker
 * Date: 2013-10-22 14:08:14 +0000 (Tue, 22 Oct 2013)
 *
 * Log Message:
 * -----------
 * Add TextDocumentSyntacticIndexCreator for creating Lucene indexes from text files
 *
 * Added Paths:
 * -----------
 * trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java
 *
 * Copied: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java
 * (from rev 4123, trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java)
 */
package org.dllearner.algorithms.isle.index.syntactic;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Creates a syntactic index from text files stored on disk.
 *
 * <p>Every regular file in the input directory whose name ends with {@code .txt}
 * becomes one Lucene document with two stored fields: {@code uri} (the file's URI,
 * indexed as a single token) and {@code text} (the file's content, analyzed).
 */
public class TextDocumentSyntacticIndexCreator {

    /** Name of the Lucene field that holds the analyzed file content. */
    private final static String searchField = "text";

    private final Directory indexDirectory;
    private final File inputDirectory;

    /**
     * @param inputDirectory directory that is scanned (non-recursively) for {@code .txt} files
     * @param indexDirectory directory on disk in which the Lucene index is stored
     * @throws IOException if the Lucene directory for {@code indexDirectory} cannot be opened
     */
    public TextDocumentSyntacticIndexCreator(File inputDirectory, File indexDirectory)
            throws IOException {
        this.indexDirectory = new SimpleFSDirectory(indexDirectory);
        this.inputDirectory = inputDirectory;
    }

    /**
     * Reads all {@code .txt} files of the input directory, writes them to the index
     * and returns a {@link SyntacticIndex} backed by that index.
     *
     * @return an index over the {@code text} field of the written documents
     * @throws IOException if the input directory cannot be listed, a file cannot be
     *                     read, or the index cannot be written
     * @throws Exception   declared for backward compatibility with existing callers
     */
    public SyntacticIndex buildIndex() throws Exception {
        // File.listFiles() returns null (not an empty array) when the path is not a
        // directory or cannot be read; fail with a clear message before the index
        // writer takes the write lock.
        File[] candidates = inputDirectory.listFiles();
        if (candidates == null) {
            throw new IOException("Cannot list input directory: " + inputDirectory);
        }

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
        IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig);
        boolean success = false;
        try {
            System.out.println( "Creating index ..." );

            FieldType stringType = new FieldType(StringField.TYPE_STORED);
            stringType.setStoreTermVectors(false);
            FieldType textType = new FieldType(TextField.TYPE_STORED);
            textType.setStoreTermVectors(false);

            // A list keeps the documents in the order the files were listed.
            List<org.apache.lucene.document.Document> luceneDocuments =
                    new ArrayList<org.apache.lucene.document.Document>();
            for (File f : candidates) {
                // Skip sub-directories and anything that is not a .txt file.
                if (!f.isFile() || !f.getName().endsWith(".txt")) {
                    continue;
                }
                org.apache.lucene.document.Document luceneDocument =
                        new org.apache.lucene.document.Document();
                luceneDocument.add(new Field("uri", f.toURI().toString(), stringType));
                luceneDocument.add(new Field(searchField, readContent(f), textType));
                luceneDocuments.add(luceneDocument);
            }
            writer.addDocuments(luceneDocuments);

            System.out.println("Done.");
            success = true;
        } finally {
            // Always release the writer (and with it the index write lock): commit on
            // success, discard uncommitted changes on failure.
            if (success) {
                writer.close();
            } else {
                writer.rollback();
            }
        }

        return new LuceneSyntacticIndex(indexDirectory, searchField);
    }

    /**
     * Reads a whole text file into a string, terminating every line with {@code "\n"}.
     *
     * <p>NOTE(review): {@link FileReader} decodes with the platform default charset;
     * confirm whether the input files should instead be read with a fixed encoding.
     */
    private static String readContent(File file) throws IOException {
        StringBuilder content = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
                content.append("\n");
            }
        } finally {
            // Close the file handle even when reading fails part-way.
            reader.close();
        }
        return content.toString();
    }

    /**
     * Opens an already existing index.
     *
     * @param indexDirectory directory on disk that contains the Lucene index
     * @return an index over the {@code text} field of the stored documents
     * @throws Exception if the index cannot be opened
     */
    public static SyntacticIndex loadIndex(File indexDirectory) throws Exception {
        return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField);
    }

    /**
     * Command line entry point: {@code <input directory> <index directory>}.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: <input directory> <index directory>");
            System.exit(1);
            return;
        }
        new TextDocumentSyntacticIndexCreator(new File(args[0]), new File(args[1])).buildIndex();
    }
}

// This was sent by the SourceForge.net collaborative development platform,
// the world's largest Open Source development site.