Revision: 12911
http://genoviz.svn.sourceforge.net/genoviz/?rev=12911&view=rev
Author: hiralv
Date: 2012-09-19 15:58:42 +0000 (Wed, 19 Sep 2012)
Log Message:
-----------
Adding LuceneIndexing tool.
Added Paths:
-----------
trunk/tools/
trunk/tools/LuceneIndexing/
trunk/tools/LuceneIndexing/META-INF/
trunk/tools/LuceneIndexing/META-INF/MANIFEST.MF
trunk/tools/LuceneIndexing/build.properties
trunk/tools/LuceneIndexing/build.xml
trunk/tools/LuceneIndexing/conf/
trunk/tools/LuceneIndexing/conf/config.properties
trunk/tools/LuceneIndexing/docs/
trunk/tools/LuceneIndexing/docs/readme.html
trunk/tools/LuceneIndexing/lib/
trunk/tools/LuceneIndexing/lib/lucene-core-3.6.0.jar
trunk/tools/LuceneIndexing/resources/
trunk/tools/LuceneIndexing/resources/build.xml
trunk/tools/LuceneIndexing/resources/index.properties
trunk/tools/LuceneIndexing/src/
trunk/tools/LuceneIndexing/src/com/
trunk/tools/LuceneIndexing/src/com/gene/
trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/
trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/Activator.java
trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/FileUtil.java
trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/IndexFiles.java
Property changes on: trunk/tools/LuceneIndexing
___________________________________________________________________
Added: svn:ignore
+ .classpath
.project
Added: trunk/tools/LuceneIndexing/META-INF/MANIFEST.MF
===================================================================
--- trunk/tools/LuceneIndexing/META-INF/MANIFEST.MF (rev 0)
+++ trunk/tools/LuceneIndexing/META-INF/MANIFEST.MF 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,21 @@
+Manifest-Version: 1.0
+Bundle-ManifestVersion: 2
+Bundle-Name: LuceneIndexing
+Bundle-SymbolicName: com.gene.luceneindexing.LuceneIndexing
+Bundle-Version: 1.0.0
+Bundle-Activator: com.gene.luceneindexing.Activator
+Bundle-ActivationPolicy: lazy
+Bundle-RequiredExecutionEnvironment: JavaSE-1.6
+Import-Package: com.affymetrix.common;version="6.7.0",
+ com.affymetrix.genometryImpl,
+ com.affymetrix.genometryImpl.parsers,
+ com.affymetrix.genometryImpl.symloader,
+ com.affymetrix.genometryImpl.symmetry,
+ org.apache.commons.lang3,
+ org.apache.lucene.analysis,
+ org.apache.lucene.analysis.standard,
+ org.apache.lucene.document,
+ org.apache.lucene.index,
+ org.apache.lucene.store,
+ org.apache.lucene.util,
+ org.osgi.framework
Property changes on: trunk/tools/LuceneIndexing/META-INF/MANIFEST.MF
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/build.properties
===================================================================
--- trunk/tools/LuceneIndexing/build.properties (rev 0)
+++ trunk/tools/LuceneIndexing/build.properties 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,6 @@
+source.. = src/
+output.. = build/
+bin.includes = META-INF/,\
+ .,\
+ lib/lucene-core-3.6.0.jar
+
\ No newline at end of file
Property changes on: trunk/tools/LuceneIndexing/build.properties
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/build.xml
===================================================================
--- trunk/tools/LuceneIndexing/build.xml (rev 0)
+++ trunk/tools/LuceneIndexing/build.xml 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,66 @@
+<project name="LuceneIndexing" default="full_dist" basedir=".">
+	<property environment="env" />
+	<fail unless="env.IGB_WORKSPACE">Please set the IGB_WORKSPACE env variable to the IGB home directory</fail>
+	<property name="workspace" value="${env.IGB_WORKSPACE}" />
+	<property name="plugin" value="LuceneIndexing" />
+	<property name="extra-classpath" value="lib/lucene-core-3.6.0.jar;${env.IGB_WORKSPACE}/bundles/genometryImpl.jar" />
+	<!-- build-plugin.xml is expected to provide the "dist" target that full_dist depends on -->
+	<import file="${env.IGB_WORKSPACE}/build-plugin.xml" />
+	<target name="clean_full" description="clean full distribution">
+		<delete dir="temp" />
+		<delete dir="full_dist" />
+	</target>
+	<!-- Stages a runnable Felix layout (bin/, bundle/, conf/, resources/) under temp/
+	     and packs it into full_dist/LuceneIndexing.zip. -->
+	<target name="full_dist" depends="clean_full,dist" description="full distribution">
+		<property name="_basedir" location="${env.IGB_WORKSPACE}" />
+		<property name="main" value="${_basedir}/core/main" />
+		<property name="lib.dir" value="${_basedir}/ext" />
+		<property name="bundle.dir" value="${_basedir}/bundles" />
+		<mkdir dir="temp/bin" />
+		<mkdir dir="temp/bundle" />
+		<mkdir dir="temp/conf" />
+		<mkdir dir="temp/resources" />
+		<!-- run script, documentation and runtime configuration -->
+		<copy file="resources/build.xml" todir="temp" />
+		<copy file="docs/readme.html" todir="temp" />
+		<copy file="resources/index.properties" todir="temp/resources" />
+		<copy file="${lib.dir}/felix.jar" todir="temp/bin" />
+		<copy file="${main}/resources/config.properties" todir="temp/conf" />
+		<!-- the jar(s) of this plugin found in dist/ -->
+		<copy todir="temp/bundle">
+			<fileset dir="dist">
+				<include name="LuceneIndexing*.jar" />
+			</fileset>
+		</copy>
+		<!-- bundles taken from the IGB workspace; a missing file fails the build -->
+		<copy file="${bundle.dir}/affx_fusion.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/colt.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/commons-codec-1.4.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/commons-net-2.0.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/commons-lang3-3.1.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/image4j.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/jlfgr-1_0.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/log4j-1.2.11.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/picard.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/sam--igbext.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/tribble-0.1.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/common.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/genometry.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/genoviz.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/igb_service.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/icons--igbext.bnd.jar" todir="temp/bundle" />
+		<copy file="${bundle.dir}/jdom-1.1.3.bnd.jar" todir="temp/bundle" />
+		<!-- wrap the plain lucene jar with bnd; the move below expects bnd to have written a .bar file -->
+		<copy file="lib/lucene-core-3.6.0.jar" todir="temp/bundle" />
+		<java jar="${lib.dir}/biz.aQute.bnd.jar" fork="true" dir="temp/bundle" >
+			<arg value="wrap"/>
+			<arg value="lucene-core-3.6.0.jar"/>
+		</java>
+		<move file="temp/bundle/lucene-core-3.6.0.bar" tofile="temp/bundle/lucene-core-3.6.0.bnd.jar" />
+		<delete file="temp/bundle/lucene-core-3.6.0.jar" />
+		<zip destfile="temp/LuceneIndexing.zip" basedir="temp" />
+		<mkdir dir="full_dist" />
+		<move file="temp/LuceneIndexing.zip" todir="full_dist" />
+	</target>
+</project>
Property changes on: trunk/tools/LuceneIndexing/build.xml
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/conf/config.properties
===================================================================
--- trunk/tools/LuceneIndexing/conf/config.properties (rev 0)
+++ trunk/tools/LuceneIndexing/conf/config.properties 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#
+# Framework config properties.
+#
+
+# To override the packages the framework exports by default from the
+# class path, set this variable.
+#org.osgi.framework.system.packages=
+
+# To append packages to the default set of exported system packages,
+# set this value.
+#org.osgi.framework.system.packages.extra=
+org.osgi.framework.system.packages.extra=sun.reflect,com.sun.java.swing.plaf.windows,com.apple.eawt,org.w3c.dom,org.w3c.dom.traversal,org.w3c.dom.xpath
+
+# The following property makes specified packages from the class path
+# available to all bundles. You should avoid using this property.
+#org.osgi.framework.bootdelegation=sun.*,com.sun.*
+#org.osgi.framework.bootdelegation=org.netbeans.lib.profiler,org.netbeans.lib.profiler.*,org.netbeans.lib.profiler.server.*
+
+# Felix tries to guess when to implicitly boot delegate in certain
+# situations to ease integration without outside code. This feature
+# is enabled by default, uncomment the following line to disable it.
+#felix.bootdelegation.implicit=false
+
+# The following property explicitly specifies the location of the bundle
+# cache, which defaults to "felix-cache" in the current working directory.
+# If this value is not absolute, then the felix.cache.rootdir controls
+# how the absolute location is calculated. (See next property)
+#org.osgi.framework.storage=${felix.cache.rootdir}/felix-cache
+
+# The following property is used to convert a relative bundle cache
+# location into an absolute one by specifying the root to prepend to
+# the relative cache path. The default for this property is the
+# current working directory.
+#felix.cache.rootdir=${user.dir}
+
+# The following property controls whether the bundle cache is flushed
+# the first time the framework is initialized. Possible values are
+# "none" and "onFirstInit"; the default is "none".
+#org.osgi.framework.storage.clean=onFirstInit
+
+# The following property determines which actions are performed when
+# processing the auto-deploy directory. It is a comma-delimited list of
+# the following values: 'install', 'start', 'update', and 'uninstall'.
+# An undefined or blank value is equivalent to disabling auto-deploy
+# processing.
+felix.auto.deploy.action=install,start
+
+# The following property specifies the directory to use as the bundle
+# auto-deploy directory; the default is 'bundle' in the working directory.
+#felix.auto.deploy.dir=bundle
+
+# The following property is a space-delimited list of bundle URLs
+# to install when the framework starts. The ending numerical component
+# is the target start level. Any number of these properties may be
+# specified for different start levels.
+#felix.auto.install.1=
+
+# The following property is a space-delimited list of bundle URLs
+# to install and start when the framework starts. The ending numerical
+# component is the target start level. Any number of these properties
+# may be specified for different start levels.
+#felix.auto.start.1=
+
+felix.log.level=1
+
+# Sets the initial start level of the framework upon startup.
+#org.osgi.framework.startlevel.beginning=1
+
+# Sets the start level of newly installed bundles.
+#felix.startlevel.bundle=1
+
+# Felix installs a stream and content handler factories by default,
+# uncomment the following line to not install them.
+#felix.service.urlhandlers=false
+
+# The launcher registers a shutdown hook to cleanly stop the framework
+# by default, uncomment the following line to disable it.
+#felix.shutdown.hook=false
+
+#
+# Bundle config properties.
+#
+
+#org.osgi.service.http.port=8080
+#obr.repository.url=http://felix.apache.org/obr/releases.xml
Property changes on: trunk/tools/LuceneIndexing/conf/config.properties
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/docs/readme.html
===================================================================
--- trunk/tools/LuceneIndexing/docs/readme.html (rev 0)
+++ trunk/tools/LuceneIndexing/docs/readme.html 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,41 @@
+<h1>Lucene Indexing</h1>
+<br/>
+This is a batch utility that is used to create <a href="http://lucene.apache.org">Apache Lucene</a> indexes
+on a directory (or single file) of data files for IGB. The indexes
+are to be used with the <a href="http://research-pub.gene.com/igb_plugins/docs/LuceneIndexing.html">SearchModeLucene IGB plugin</a>.<br/>
+This only needs to be run once, or when the files are modified. To get the LuceneIndexing,
+unzip the <a href="http://research-pub.gene.com/igb_plugins/util/LuceneIndexing.zip">LuceneIndexing.zip</a> file to a new directory.<br/>
+<br/>
+Requirements:<br/>
+<ul>
+<li><a href="http://www.oracle.com/technetwork/java/javase/downloads/index.html">java</a> must be installed</li>
+<li><a href="http://ant.apache.org/">Apache ant</a> must be installed</li>
+</ul>
+<br/>
+If you want to customize the indexing, open the index.properties file. Each line lists the properties to
+index for one file type, identified by its extension. The _default= line applies its properties to all file types. The _ignore=
+line excludes its properties for all file types. Below these, add any file type extensions and the properties to index;
+* means all properties. For example:<br/>
+bed=score,description<br/>
+psl=*<br/>
+<br/>
+Next you need to set the parameters<br/>
+<ul>
+<li>lucene_index_dir - points to the directory containing the data files (or it can be a file)</li>
+<li>dump - an optional parameter to print out the index values instead of creating the index (dump=yes)</li>
+</ul>
+you can set the parameters either by setting the environment variable<br/>
+set lucene_index_dir=c:\mydir<br/>
+or by passing the value in the command line when you run ant<br/>
+ant -Denv.lucene_index_dir=/data/bed -Denv.dump=yes<br/>
+The program must have write access to the directory<br/>
+<br/>
+Run ant to start the indexing. The program will take a while, depending on the amount of
+data. It skips files whose type is unrecognized or is not an annotation type. For each file
+that it indexes, it creates both an index directory .&lt;filename&gt;.index and a file with the
+directory listing, .&lt;filename&gt;.index.dir.<br/>
+You should see the message "Indexing completed successfully" when the program finishes.
+<br/>
+<br/>
+Note - it may go much faster if you copy all the files to a local disk, and run the indexing
+there. Then copy the .index and .dir files back to the server.
Property changes on: trunk/tools/LuceneIndexing/docs/readme.html
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/lib/lucene-core-3.6.0.jar
===================================================================
(Binary files differ)
Property changes on: trunk/tools/LuceneIndexing/lib/lucene-core-3.6.0.jar
___________________________________________________________________
Added: svn:executable
+ *
Added: svn:mime-type
+ application/octet-stream
Added: trunk/tools/LuceneIndexing/resources/build.xml
===================================================================
--- trunk/tools/LuceneIndexing/resources/build.xml (rev 0)
+++ trunk/tools/LuceneIndexing/resources/build.xml 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,14 @@
+<project name="LuceneIndexing" default="index" basedir=".">
+	<!-- Load the process environment into env.* properties, so that an exported
+	     lucene_index_dir / dump variable satisfies the check below just like
+	     -Denv.lucene_index_dir=... given on the ant command line does. -->
+	<property environment="env" />
+	<target name="index" description="lucene indexing using felix gogo">
+		<fail unless="env.lucene_index_dir">Please set the lucene_index_dir env variable to the directory to be indexed</fail>
+		<java jar="bin/felix.jar" fork="true" dir="." classpath="resources/">
+			<jvmarg value="-Xmx4096m"/>
+			<jvmarg value="-Dlucene_index_dir=${env.lucene_index_dir}"/>
+			<jvmarg value="-Ddump=${env.dump}"/>
+		</java>
+	</target>
+</project>
Property changes on: trunk/tools/LuceneIndexing/resources/build.xml
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/resources/index.properties
===================================================================
--- trunk/tools/LuceneIndexing/resources/index.properties (rev 0)
+++ trunk/tools/LuceneIndexing/resources/index.properties 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,5 @@
+_default=id,name,gene name
+_ignore=method,source,type,seq
+gff=*
+gff3=*
+
Property changes on: trunk/tools/LuceneIndexing/resources/index.properties
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/Activator.java
===================================================================
--- trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/Activator.java (rev 0)
+++ trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/Activator.java 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,75 @@
+package com.gene.luceneindexing;
+
+import java.io.IOException;
+
+import org.osgi.framework.BundleActivator;
+import org.osgi.framework.BundleContext;
+
+/**
+ * Runs the Lucene indexing when the bundle is started and then stops bundle 0
+ * (the framework itself), so the batch run ends without user interaction.
+ * <p>
+ * Each setting is read from the environment first, then from the system
+ * properties:
+ * <ul>
+ * <li>{@code lucene_index_dir} - file or directory to index (required)</li>
+ * <li>{@code dump} - when it starts with "y" or equals "true" (ignoring case),
+ * the values are printed instead of being written to an index (optional)</li>
+ * </ul>
+ */
+public class Activator implements BundleActivator {
+	/**
+	 * Performs the indexing run and always stops the framework afterwards.
+	 *
+	 * @param bundleContext context used to reach bundle 0
+	 * @throws Exception if stopping the framework fails, or an unchecked
+	 *         exception escapes the indexing
+	 */
+	public void start(BundleContext bundleContext) throws Exception {
+		try {
+			runIndexing();
+		}
+		finally {
+			// stop the framework even when indexing dies with an unchecked
+			// exception or error; otherwise the framework would keep running
+			bundleContext.getBundle(0).stop();
+		}
+	}
+
+	public void stop(BundleContext bundleContext) throws Exception {}
+
+	/** Reads the settings, reports them and indexes; I/O failures are printed, not rethrown. */
+	private void runIndexing() {
+		System.out.println("**************************");
+		String docsPath = getSetting("lucene_index_dir");
+		if (docsPath == null) {
+			System.err.println("please set the \"lucene_index_dir\" environment variable to the directory to be indexed. " +
+				"Optionally, also set the \"dump\" environment variable to \"yes\" to trial run the indexing.");
+			return;
+		}
+		boolean dump = isDumpRequested(getSetting("dump"));
+		System.out.println("Indexing directory " + docsPath + ", dump = " + dump + ", classpath="+System.getProperty("java.class.path"));
+		try {
+			new IndexFiles().createIndex(docsPath, dump);
+		} catch (IOException e) {
+			System.out.println("!!!!!!!!!!!!!!!!!!!!!");
+			e.printStackTrace(System.out);
+			System.out.println("Lucene Indexing caught a " + e.getClass() + "\n with message: " + e.getMessage());
+		}
+	}
+
+	/** @return the environment variable called name, or else the system property, or null */
+	private static String getSetting(String name) {
+		String value = System.getenv(name);
+		return (value != null) ? value : System.getProperty(name);
+	}
+
+	/** @return true when value asks for a dump: starts with "y" or is "true", ignoring case */
+	private static boolean isDumpRequested(String value) {
+		if (value == null) {
+			return false;
+		}
+		String normalized = value.toLowerCase();
+		return normalized.startsWith("y") || normalized.equals("true");
+	}
+}
Property changes on: trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/Activator.java
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/FileUtil.java
===================================================================
--- trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/FileUtil.java (rev 0)
+++ trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/FileUtil.java 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,35 @@
+package com.gene.luceneindexing;
+
+public class FileUtil {
+ private static final FileUtil instance = new FileUtil();
+ private FileUtil() {
+ super();
+ }
+ public static final FileUtil getInstance() {
+ return instance;
+ }
+
+ private int getNamePosition(String uri) {
+ // String separator = (uri.toLowerCase().startsWith(HTTP_PREFIX) || uri.toLowerCase().startsWith(HTTPS_PREFIX)) ? HTTP_SEPARATOR : FILE_SEPARATOR;
+ int pos = Math.max(uri.lastIndexOf("/"), uri.lastIndexOf("\\")) + 1;
+ return pos;
+ }
+
+ public String getIndexName(String uri) {
+ int pos = getNamePosition(uri);
+ if (pos >= uri.length()) {
+ return "";
+ }
+ return uri.substring(0, pos) + "." + uri.substring(pos) + ".index";
+ }
+
+ public boolean isIndexName(String uri) {
+ int pos = getNamePosition(uri);
+ return pos < uri.length() && uri.charAt(pos) == '.' && uri.endsWith(".index");
+ }
+
+ public boolean isDirName(String uri) {
+ int pos = getNamePosition(uri);
+ return pos < uri.length() && uri.charAt(pos) == '.' && uri.endsWith(".index.dir");
+ }
+}
Property changes on: trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/FileUtil.java
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/IndexFiles.java
===================================================================
--- trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/IndexFiles.java (rev 0)
+++ trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/IndexFiles.java 2012-09-19 15:58:42 UTC (rev 12911)
@@ -0,0 +1,327 @@
+package com.gene.luceneindexing;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+
+import com.affymetrix.genometryImpl.AnnotatedSeqGroup;
+import com.affymetrix.genometryImpl.BioSeq;
+import com.affymetrix.genometryImpl.SeqSpan;
+import com.affymetrix.genometryImpl.parsers.FileTypeCategory;
+import com.affymetrix.genometryImpl.parsers.FileTypeHandler;
+import com.affymetrix.genometryImpl.parsers.FileTypeHolder;
+import com.affymetrix.genometryImpl.symloader.SymLoader;
+import com.affymetrix.genometryImpl.symmetry.SeqSymmetry;
+import com.affymetrix.genometryImpl.symmetry.SymWithProps;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.lang.reflect.Array;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
+import java.util.MissingResourceException;
+import java.util.ResourceBundle;
+
+/**
+ * Index all text files under a directory.
+ * <p>
+ * This is a command-line application demonstrating simple Lucene indexing. Run
+ * it with no command-line arguments for usage information.
+ */
+public class IndexFiles {
+ private static final PrintStream DUMP_STREAM = System.err;
+ private static final int SYMS_PER_DOT = 64 * 64;
+ private static final int DOTS_PER_LINE = 64;
+ private static final int SYMS_PER_LINE = SYMS_PER_DOT * DOTS_PER_LINE;
+ private static final ResourceBundle BUNDLE = ResourceBundle.getBundle("index");
+ private static final List<String> DEFAULT_PROPS = new ArrayList<String>();
+ static {
+ Collections.addAll(DEFAULT_PROPS, BUNDLE.getString("_default").split(","));
+ }
+ private static final List<String> IGNORE_PROPS = new ArrayList<String>();
+ static {
+ Collections.addAll(IGNORE_PROPS, BUNDLE.getString("_ignore").split(","));
+ }
+ private static final Analyzer analyzer = new KeywordAnalyzer();
+
+ public IndexFiles() {
+ super();
+ }
+
+ // main method for testing
+ public static void main(final String[] args) {
+ try {
+ new IndexFiles().createIndex(args[0], args.length > 1 && "dump".equals(args[1]));
+ } catch (IOException e) {
+ e.printStackTrace(System.out);
+ System.out.println("Lucene Indexing caught a " + e.getClass() + "\n with message: " + e.getMessage());
+ }
+ }
+
+ /**
+ * @param docsPath the path of the directory to index
+ */
+ public void createIndex(String docsPath, boolean dump) throws IOException {
+ final File docDir = new File(docsPath);
+ if (!docDir.exists() || !docDir.canRead()) {
+ DUMP_STREAM.println("Document directory '" +
+ docDir.getAbsolutePath() +
+ "' does not exist or is not readable, please check the path");
+ return;
+ }
+
+ Date start = new Date();
+ indexDocs(docDir, dump);
+ Date end = new Date();
+ DUMP_STREAM.println("Indexing completed successfully - " + (end.getTime() - start.getTime()) / 1000 + " total seconds");
+
+ }
+
+ // Deletes all files and subdirectories under dir.
+ // Returns true if all deletions were successful.
+ // If a deletion fails, the method stops attempting to delete and returns
+ // false.
+ public static boolean deleteDir(File dir) {
+ if (dir.isDirectory()) {
+ String[] children = dir.list();
+ for (int i = 0; i < children.length; i++) {
+ boolean success = deleteDir(new File(dir, children[i]));
+ if (!success) {
+ return false;
+ }
+ }
+ }
+
+ // The directory is now empty so delete it
+ return dir.delete();
+ }
+
+ private void addProp(Document doc, String name, Object value, boolean index) {
+ if (doc == null) {
+ DUMP_STREAM.println(">>>index string name=" + name + ";value=" + ArrayUtils.toString(value) + ";index=" + index);
+ }
+ else {
+ Field field = new Field(name, ArrayUtils.toString(value), Field.Store.YES, Field.Index.NO);
+ doc.add(field);
+ if (index) {
+ Field field2 = new Field(name, ArrayUtils.toString(value).toLowerCase(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
+ field2.setIndexOptions(IndexOptions.DOCS_ONLY);
+ doc.add(field2);
+ }
+ }
+ }
+
+ private void addProp(Document doc, String name, int value) {
+ if (doc == null) {
+ DUMP_STREAM.println(">>>index int name=" + name + ";value=" + value);
+ }
+ else {
+ NumericField field = new NumericField(name, Field.Store.YES, false);
+ field.setIntValue(value);
+ doc.add(field);
+ }
+ }
+
+ private void processSym(IndexWriter writer, SeqSymmetry sym, List<String> props, BioSeq seq) throws IOException {
+ if (sym.getChildCount() > 0) {
+ for (int i = 0; i < sym.getChildCount(); i++) {
+ processSym(writer, sym.getChild(i), props, seq);
+ }
+ }
+ Document doc = (writer == null) ? null : new Document();
+ if (sym.getID() != null) {
+ addProp(doc, "id", sym.getID(), true);
+ }
+ SeqSpan span = sym.getSpan(seq);
+ addProp(doc, "seq", span.getBioSeq().getID(), false);
+ addProp(doc, "start", span.getStart());
+ addProp(doc, "end", span.getEnd());
+ if (sym instanceof SymWithProps) {
+ Map<String, Object> symProps = ((SymWithProps)sym).getProperties();
+ if (props.contains("*")) {
+ props = new ArrayList<String>(symProps.keySet());
+ }
+ for (String prop : props) {
+ if (!IGNORE_PROPS.contains(prop.toLowerCase())) {
+ if ("id".equals(prop) && symProps.get(prop) != null && sym.getID() != null) {
+ if (!sym.getID().equals(symProps.get(prop))) {
+ DUMP_STREAM.println("!!!!! ERROR !!!!! - seq symmetry sym.getID() = \"" + sym.getID() + "\" is not the same as sym.getProperties().get(\"" + prop + "\") = \"" + symProps.get(prop));
+ }
+ }
+ addProp(doc, prop, symProps.get(prop), symProps.get(prop) != null);
+ }
+ }
+ }
+ if (writer == null) {
+ DUMP_STREAM.println("==============================================================");
+ }
+ else {
+ writer.addDocument(doc);
+ }
+ }
+
+ /**
+ * Indexes the given file using the given writer, or if a directory is
+ * given, recurses over files and directories found under the given
+ * directory.
+ *
+ * NOTE: This method indexes one document per input file. This is slow. For
+ * good throughput, put multiple documents into your input file(s). An
+ * example of this is in the benchmark module, which can create "line doc"
+ * files, one document per line, using the <a href=
+ * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
+ * >WriteLineDocTask</a>.
+ *
+ * @param writer
+ * Writer to the index where the given file/dir info will be
+ * stored
+ * @param file
+ * The file to index, or the directory to recurse into to find
+ * files to index
+ * @throws IOException
+ */
+ private void indexDocs(File file, boolean dump) throws IOException {
+
+ // do not try to index files that cannot be read
+ if (file.canRead()) {
+ if (file.isDirectory()) {
+ String[] files = file.list();
+ // an IO error could occur
+ if (files != null) {
+ if (!dump) {
+ // first delete old indexes
+ for (String dirName : file.list()) {
+ File dir = new File(file, dirName);
+ if (dir.isDirectory() && FileUtil.getInstance().isIndexName(dirName)) {
+ deleteDir(dir);
+ File dirFile = new File(dir.getAbsolutePath() + ".dir");
+ if (dirFile.exists()) {
+ dirFile.delete();
+ }
+ }
+ }
+ }
+ if (!FileUtil.getInstance().isIndexName(file.getAbsolutePath())) {
+ for (int i = 0; i < files.length; i++) {
+ indexDocs(new File(file, files[i]), dump);
+ }
+ }
+ }
+ }
+ else {
+ FileInputStream fis;
+ try {
+ fis = new FileInputStream(file);
+ }
+ catch (FileNotFoundException fnfe) {
+ // at least on windows, some temporary files raise this
+ // exception with an "access denied" message
+ // checking if the file can be read doesn't help
+ return;
+ }
+
+ String indexPath = FileUtil.getInstance().getIndexName(file.getAbsolutePath());
+ Directory dir = FSDirectory.open(new File(indexPath));
+ try {
+ URI uri = file.toURI();
+ String extension = FileTypeHolder.getInstance().getExtensionForURI(uri.toString());
+ FileTypeHandler fth = FileTypeHolder.getInstance().getFileTypeHandler(extension);
+ if (FileUtil.getInstance().isDirName(uri.toString()) || fth == null || fth.getFileTypeCategory() != FileTypeCategory.Annotation) {
+ DUMP_STREAM.println("skipping " + file);
+ return;
+ }
+ // get properties to index
+ List<String> props = new ArrayList<String>(DEFAULT_PROPS);
+ try {
+ Collections.addAll(props, BUNDLE.getString(extension).split(","));
+ }
+ catch (MissingResourceException x) {}
+ String featureName = ""; // dummy
+ AnnotatedSeqGroup group = new AnnotatedSeqGroup(""); // dummy
+ DUMP_STREAM.println("loading " + file);
+ SymLoader symL = fth.createSymLoader(uri, featureName, group);
+ IndexWriter writer = null;
+ if (!dump) {
+// IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, analyzer);
+// iwc.setOpenMode(OpenMode.CREATE);
+ writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
+ }
+ // get all seq symmetries and index them.
+ List<? extends SeqSymmetry> syms;// = symL.getGenome();
+ int count = 0;
+ for (BioSeq seq : symL.getChromosomeList()) {
+ syms = symL.getChromosome(seq);
+ DUMP_STREAM.println("processing " + syms.size() + " records for " + file + " on sequence " + seq.getID());
+ Date start = new Date();
+ for (SeqSymmetry sym : syms) {
+ processSym(writer, sym, props, seq);
+ count++;
+ if (count % SYMS_PER_DOT == 0) {
+ DUMP_STREAM.print(".");
+ DUMP_STREAM.flush();
+ }
+ if (count % SYMS_PER_LINE == 0) {
+ Date now = new Date();
+ long elapsedSeconds = (now.getTime() - start.getTime()) / 1000;
+ double secsPerSym = (double) elapsedSeconds / (double) count;
+ long remaining = (long) ((syms.size() - count) * secsPerSym);
+ long hours = remaining / 3600;
+ remaining = remaining % 3600;
+ long minutes = remaining / 60;
+ long seconds = remaining % 60;
+ DUMP_STREAM.println(" " + (hours < 10 ? "0" : "") + hours + ":" + (minutes < 10 ? "0" : "") + minutes + ":" + (seconds < 10 ? "0" : "") + seconds + " remaining");
+ DUMP_STREAM.flush();
+ }
+ }
+ if (count % SYMS_PER_LINE > 0) {
+ DUMP_STREAM.println();
+ }
+ }
+
+
+
+
+ if (writer != null) {
+ writer.close();
+ // now get a directory listing in the .dir file
+ File listFile = new File(indexPath + ".dir");
+ listFile.delete();
+ BufferedWriter bw = new BufferedWriter(new FileWriter(listFile));
+ File fileDir = new File(indexPath);
+ for (String fileName : fileDir.list()) {
+ bw.write(fileName);
+ bw.newLine();
+ }
+ bw.close();
+ }
+ DUMP_STREAM.println("finished indexing " + file);
+ }
+ catch(Exception x) {
+ x.printStackTrace(System.out);
+ }
+ finally {
+ fis.close();
+ }
+ }
+ }
+ }
+}
Property changes on: trunk/tools/LuceneIndexing/src/com/gene/luceneindexing/IndexFiles.java
___________________________________________________________________
Added: svn:executable
+ *
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|