Update of /cvsroot/rira/rira/lucene-java/java/ir/rira/search
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26497/java/ir/rira/search
Added Files:
HighFreqTerms.class HighFreqTerms.java IndexObjects.class
IndexObjects.java Makefile SearchObjects.class
SearchObjects.java TermInfo.class TermInfoQueue.class
Log Message:
Change org.rira.search package to ir.rira.search.
--- NEW FILE: HighFreqTerms.class ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: Makefile ---
# Compile every .java file in this directory to its .class file.
JAVAS = $(wildcard *.java)
# Suffix substitution: only a trailing ".java" is rewritten to ".class".
CLASSES = $(JAVAS:.java=.class)

# "all" and "clean" name actions, not files; keep them runnable even if a
# file with one of those names appears in the directory.
.PHONY: all clean

all: $(CLASSES)

# javac is launched through the project's "classpath" wrapper four levels up.
%.class: %.java
	../../../../classpath javac $<

# Also removes classes generated from secondary top-level types
# (e.g. TermInfo.class, TermInfoQueue.class).
clean:
	$(RM) *.class
--- NEW FILE: TermInfo.class ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: HighFreqTerms.java ---
package ir.rira.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.OutputStreamWriter;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
/**
 * Command-line tool that lists the terms with the highest document
 * frequency in a Lucene index.
 *
 * <p>Usage: <code>HighFreqTerms &lt;index-directory&gt;</code>. At most
 * {@link #numTerms} terms are written to standard output as UTF-8, one per
 * line in the form <code>term docFreq</code>, lowest frequency first.
 */
class HighFreqTerms {
  /** Maximum number of terms reported. */
  public static int numTerms = 100;

  /**
   * Scans every term of the index named by <code>args[0]</code> and prints
   * the most frequent ones.
   *
   * @param args <code>args[0]</code> is the path of the index to open
   * @throws Exception if the index cannot be read or the output cannot be written
   */
  public static void main(String[] args) throws Exception {
    OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8");
    IndexReader reader = IndexReader.open(args[0]);
    try {
      // The loop below inserts a candidate first and evicts the smallest
      // entry afterwards, so the queue briefly holds numTerms + 1 entries.
      // PriorityQueue.put() overruns its backing array when asked to hold
      // more than the size given to initialize(), hence the spare slot.
      TermInfoQueue tiq = new TermInfoQueue(numTerms + 1);
      TermEnum terms = reader.terms();
      int minFreq = 0;
      while (terms.next()) {
        if (terms.docFreq() > minFreq) {
          tiq.put(new TermInfo(terms.term(), terms.docFreq()));
          if (tiq.size() > numTerms) {            // if tiq overfull
            tiq.pop();                            // remove lowest in tiq
            // top() is null when the queue is empty (numTerms == 0).
            TermInfo lowest = (TermInfo) tiq.top();
            if (lowest != null) {
              minFreq = lowest.docFreq;           // reset minFreq
            }
          }
        }
      }
      // pop() yields the entry with the smallest docFreq first.
      while (tiq.size() != 0) {
        TermInfo termInfo = (TermInfo) tiq.pop();
        out.write(termInfo.term + " " + termInfo.docFreq + "\n");
      }
      out.flush();
    } finally {
      // Release the index even if scanning or writing failed.
      reader.close();
    }
  }
}
/** Simple holder pairing an index term with its document frequency. */
final class TermInfo {
  /** The term this entry describes. */
  Term term;
  /** Document frequency recorded for {@link #term}. */
  int docFreq;

  /**
   * @param t  the term
   * @param df the document frequency to record for <code>t</code>
   */
  TermInfo(Term t, int df) {
    this.term = t;
    this.docFreq = df;
  }
}
/**
 * Priority queue of {@link TermInfo} entries ordered by document frequency,
 * so the entry with the smallest <code>docFreq</code> compares lowest.
 */
final class TermInfoQueue extends PriorityQueue {
  /**
   * @param size capacity passed to <code>initialize</code>
   */
  TermInfoQueue(int size) {
    initialize(size);
  }

  /**
   * Orders two {@link TermInfo} objects by their document frequency.
   *
   * @return <code>true</code> when <code>a</code> has a strictly smaller
   *         docFreq than <code>b</code>
   */
  protected final boolean lessThan(Object a, Object b) {
    int leftFreq = ((TermInfo) a).docFreq;
    int rightFreq = ((TermInfo) b).docFreq;
    return rightFreq > leftFreq;
  }
}
--- NEW FILE: TermInfoQueue.class ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: SearchObjects.java ---
package ir.rira.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import org.apache.lucene.analysis.Analyzer;
import com.cybermehr.lucene.analysis.persian.PersianAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import com.cybermehr.lucene.queryParser.QueryParser;
/**
 * Line-oriented search front end for the index named by <code>args[0]</code>.
 *
 * <p>Requests are read from standard input (UTF-8) in groups of four lines:
 * <ol>
 *   <li>limit - maximum number of hits to return</li>
 *   <li>start - offset of the first hit to return</li>
 *   <li>query against the "idn" field (may be empty)</li>
 *   <li>query against the "contents" field (may be empty)</li>
 * </ol>
 * For each request the following is written to standard output (UTF-8):
 * the parsed contents query (or an empty line), the total hit count, the
 * number of hits returned, and then the "idn" and "contents" values of each
 * returned hit on separate lines.
 */
class SearchObjects {
  /**
   * Serves search requests until standard input is exhausted.
   *
   * @param args <code>args[0]</code> is the path of the index to search
   * @throws Exception on I/O failure, a malformed number, or a query that
   *                   cannot be parsed
   */
  public static void main(String[] args) throws Exception {
    Searcher searcher = new IndexSearcher(args[0]);
    try {
      Analyzer analyzer = new PersianAnalyzer();
      BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
      OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8");
      String l1, l2, l3, l4;
      int i = 0;
      while ((l1 = in.readLine()) != null && (l2 = in.readLine()) != null
          && (l3 = in.readLine()) != null && (l4 = in.readLine()) != null) {
        int limit = Integer.parseInt(l1);
        int start = Integer.parseInt(l2);
        Query q = null;
        Query textquery = null;
        Query idnquery = null;
        if (!l3.equals("")) {
          idnquery = QueryParser.parse(l3, "idn", analyzer);
          // An idn-only request must search with the idn query; textquery
          // is still null at this point.
          q = idnquery;
        }
        if (!l4.equals("")) {
          textquery = QueryParser.parse(l4, "contents", analyzer);
          q = textquery;
        }
        if (idnquery != null && textquery != null) {
          // Both parts given: a document has to match each of them.
          BooleanQuery bQuery = new BooleanQuery();
          bQuery.add(idnquery, true, false); // REQUIRED
          bQuery.add(textquery, true, false); // REQUIRED
          q = bQuery;
        }
        if (textquery != null) {
          out.write(textquery.toString("contents") + "\n");
        } else {
          out.write("\n");
        }
        if (q == null) {
          // Neither query line was filled in: report zero hits instead of
          // handing a null query to the searcher.
          out.write("0\n");
          out.write("0\n");
        } else {
          Hits hits = searcher.search(q);
          out.write(hits.length() + "\n");
          int end = Math.min(hits.length(), start + limit);
          // A start offset past the last hit yields an empty page, not a negative count.
          out.write(Math.max(0, (end - start)) + "\n");
          for (int c = start; c < end; c++) {
            Document doc = hits.doc(c);
            String idn = doc.get("idn");
            String text = doc.get("contents");
            out.write(idn + "\n" + text + "\n");
          }
        }
        // Flush per request so a consumer reading our output sees the full reply.
        out.flush();
        if (++i % 10000 == 0) {
          System.gc();
        }
      }
    } finally {
      // Release the index even when a request failed.
      searcher.close();
    }
  }
}
--- NEW FILE: SearchObjects.class ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: IndexObjects.java ---
package ir.rira.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.*;
import java.util.*;
import java.sql.Connection;
import java.sql.Statement;
import java.sql.ResultSet;
import java.sql.DriverManager;
import com.cybermehr.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/**
 * Builds a Lucene index at <code>args[0]</code> from rows stored in a
 * PostgreSQL database.
 *
 * <p>Standard input (UTF-8) first supplies, one per line, the database host,
 * database name, user name and password. Every following line is a command:
 * <ul>
 *   <li><code>module NAME</code> - sets the search path to NAME (then
 *       public) and uses NAME in the idn of subsequently indexed rows;</li>
 *   <li><code>object NAME</code> - indexes the <code>idn</code> and
 *       <code>text</code> columns of every row in <code>NAME_index</code>.</li>
 * </ul>
 * Anything else is reported as ignored.
 */
public class IndexObjects {
  /**
   * Runs the indexer.
   *
   * @param args <code>args[0]</code> is the directory in which the index is
   *             created (an existing index there is overwritten)
   * @throws Exception on database, index or I/O failure
   */
  public static void main(String args[]) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.println("Database Host");
    String host = in.readLine();
    System.out.println("Database Name");
    String dbname = in.readLine();
    System.out.println("User Name");
    String user = in.readLine();
    System.out.println("Password");
    String password = in.readLine();
    String url = "jdbc:postgresql://" + host + "/" + dbname;
    Class.forName("org.postgresql.Driver");
    Connection db = DriverManager.getConnection(url, user, password);
    try {
      System.out.println("Connected");
      Statement stmt = db.createStatement();
      try {
        stmt.execute("set client_encoding to 'UNICODE'");
        Date start = new Date();
        IndexWriter writer = new IndexWriter(args[0], new StandardAnalyzer(), true);
        try {
          String line;
          String module = "public";
          while ((line = in.readLine()) != null) {
            System.out.println(line);
            StreamTokenizer st = new StreamTokenizer(new StringReader(line));
            // NOTE(review): '_' is not a word character by default, so this
            // call may be a no-op; possibly wordChars('_', '_') was meant - confirm.
            st.ordinaryChar('_');
            st.nextToken();
            // sval is null when the token is not a word or quoted string
            // (blank line, number, punctuation).
            String com = st.sval;
            st.nextToken();
            String arg = st.sval;
            // NOTE(review): arg is spliced into SQL text below; identifiers
            // cannot be bound as statement parameters, so the command stream
            // is assumed to come from a trusted operator - confirm.
            if ("module".equals(com) && arg != null) {
              stmt.execute("set search_path to " + arg + ", public");
              module = arg;
            } else if ("object".equals(com) && arg != null) {
              String selectSQL = "select idn, text from " + arg + "_index";
              ResultSet res = stmt.executeQuery(selectSQL);
              try {
                int i = 0;
                while (res.next()) {
                  Document doc = new Document();
                  String idn = "x " + module + " " + res.getString(1) + " x";
                  // Use Text field, to be stored and searchable.
                  doc.add(Field.Text("idn", idn));
                  // Use UnStored if you don't want contents to be stored.
                  doc.add(Field.Text("contents", res.getString(2)));
                  writer.addDocument(doc);
                  if (++i % 10000 == 0) {
                    System.gc();
                  }
                  // One progress dot per indexed row.
                  System.out.print(".");
                }
                System.out.println("done");
                System.gc();
              } finally {
                res.close();
              }
            } else {
              // Unknown command, or a known command without a usable argument.
              System.out.println("ignoring command " + com);
            }
          }
          writer.optimize();
        } finally {
          // Always close the writer, including after a failure.
          writer.close();
        }
        Date end = new Date();
        System.out.print(end.getTime() - start.getTime());
        System.out.println(" total milliseconds");
      } finally {
        stmt.close();
      }
    } finally {
      db.close();
    }
  }
}
--- NEW FILE: IndexObjects.class ---
(This appears to be a binary file; contents omitted.)
|