From: <dfl...@us...> - 2013-10-22 14:08:18
|
Revision: 4126 http://sourceforge.net/p/dl-learner/code/4126 Author: dfleischhacker Date: 2013-10-22 14:08:14 +0000 (Tue, 22 Oct 2013) Log Message: ----------- Add TextDocumentSyntacticIndexCreator for creating Lucene indexes from text files Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java Copied: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java (from rev 4123, trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java) =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-10-22 14:08:14 UTC (rev 4126) @@ -0,0 +1,93 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index.syntactic; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.SimpleFSDirectory; +import org.apache.lucene.util.Version; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +/** + * Creates a syntactic index from text files stored on disk + * + */ +public class TextDocumentSyntacticIndexCreator { + + private Directory indexDirectory; + private final File inputDirectory; + private final static String searchField = "text"; + + public TextDocumentSyntacticIndexCreator(File inputDirectory, File indexDirectory) + throws IOException { + this.indexDirectory = new SimpleFSDirectory(indexDirectory); + this.inputDirectory = inputDirectory; + } + + public SyntacticIndex buildIndex() throws Exception{ + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); + IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); + System.out.println( "Creating index ..." ); + + Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>(); + FieldType stringType = new FieldType(StringField.TYPE_STORED); + stringType.setStoreTermVectors(false); + FieldType textType = new FieldType(TextField.TYPE_STORED); + textType.setStoreTermVectors(false); + + for (File f : inputDirectory.listFiles()) { + if (!f.getName().endsWith(".txt")) { + continue; + } + org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); + luceneDocument.add(new Field("uri", f.toURI().toString(), stringType)); + + StringBuilder content = new StringBuilder(); + BufferedReader reader = new BufferedReader(new FileReader(f)); + + String line; + while ((line = reader.readLine()) != null) { + content.append(line); + content.append("\n"); + } + reader.close(); + + luceneDocument.add(new Field(searchField, content.toString(), textType)); + luceneDocuments.add(luceneDocument); + } + writer.addDocuments(luceneDocuments); + + System.out.println("Done."); + writer.close(); + + return new LuceneSyntacticIndex(indexDirectory, searchField); + } + + public static SyntacticIndex loadIndex(File indexDirectory) throws Exception { + return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField); + } + + public static void main(String[] args) throws Exception { + if (args.length != 2) { + System.err.println("Usage: <input director> <index directory>"); + System.exit(1); + return; + } + new TextDocumentSyntacticIndexCreator(new File(args[0]), new File(args[1])).buildIndex(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-12-09 15:36:42
|
Revision: 4198 http://sourceforge.net/p/dl-learner/code/4198 Author: lorenz_b Date: 2013-12-09 15:36:38 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added syntactic index. Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 15:35:10 UTC (rev 4197) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 15:36:38 UTC (rev 4198) @@ -1,122 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle.index.syntactic; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.SimpleFSDirectory; -import org.apache.lucene.util.Version; -import org.dllearner.algorithms.isle.index.Index; -import org.dllearner.algorithms.isle.index.TextDocument; - -/** - * Creates a syntactic index from text files stored on disk - * - */ -public class TextDocumentSyntacticIndexCreator { - - private Directory indexDirectory; - private final File inputDirectory; - private final static String searchField = "text"; - - public TextDocumentSyntacticIndexCreator(File inputDirectory, File indexDirectory) - throws IOException { - this.indexDirectory = new SimpleFSDirectory(indexDirectory); - this.inputDirectory = inputDirectory; - } - - public Index buildIndex() throws Exception{ - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); - IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); - System.out.println( "Creating index ..." ); - - Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>(); - FieldType stringType = new FieldType(StringField.TYPE_STORED); - stringType.setStoreTermVectors(false); - FieldType textType = new FieldType(TextField.TYPE_STORED); - textType.setStoreTermVectors(false); - - for (File f : inputDirectory.listFiles()) { - if (!f.getName().endsWith(".txt")) { - continue; - } - org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); - luceneDocument.add(new Field("uri", f.toURI().toString(), stringType)); - - StringBuilder content = new StringBuilder(); - BufferedReader reader = new BufferedReader(new FileReader(f)); - - String line; - while ((line = reader.readLine()) != null) { - content.append(line); - content.append("\n"); - } - reader.close(); - - luceneDocument.add(new Field(searchField, content.toString(), textType)); - luceneDocuments.add(luceneDocument); - } - writer.addDocuments(luceneDocuments); - - System.out.println("Done."); - writer.close(); - - return new LuceneSyntacticIndex(indexDirectory, searchField); - } - - public Index buildIndex(Set<TextDocument> documents) throws Exception{ - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); - IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); - System.out.println( "Creating index ..." ); - - Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>(); - FieldType stringType = new FieldType(StringField.TYPE_STORED); - stringType.setStoreTermVectors(false); - FieldType textType = new FieldType(TextField.TYPE_STORED); - textType.setStoreTermVectors(false); - - int id = 1; - for (TextDocument document : documents) { - org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); - luceneDocument.add(new Field("uri", Integer.toString(id++), stringType)); - luceneDocument.add(new Field(searchField, document.getContent(), textType)); - luceneDocuments.add(luceneDocument); - } - writer.addDocuments(luceneDocuments); - - System.out.println("Done."); - writer.close(); - - return new LuceneSyntacticIndex(indexDirectory, searchField); - } - - public static Index loadIndex(File indexDirectory) throws Exception { - return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField); - } - - public static void main(String[] args) throws Exception { - if (args.length != 2) { - System.err.println("Usage: <input directory> <index directory>"); - System.exit(1); - return; - } - new TextDocumentSyntacticIndexCreator(new File(args[0]), new File(args[1])).buildIndex(); - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |