|
From: Hellen <he...@es...> - 2008-10-31 15:02:26
|
Hi,
I am a C/C++ beginner, and what I need to do is create an ATL COM object
implementing the CLucene sources, to be used by another application based on
VB. For that, I have to create interface methods for indexing files and
searching them by the document id provided as a parameter... I have been
studying this project for some months and I've already had a hard time
trying to understand and define the required classes for indexing files and
searching them (I mean the classes' functionality). After a long time, I think
I have made all the customizations I needed to the original demo project,
and now I have the component object created and working... But I have some
points to review.
One of these customizations is that when I do a search by the provided
document id, I want to return the Hits containing the found documents (the
search results) to the Hits property of the component interface. Another method
will read that already-loaded Hits property and return the documents stored
in it, providing the id as the return value of the interface method.
The problem is that I have to instantiate each document from Hits in
SearchFiles before returning it to the Hits property of the component. Like
that it works properly, but if I don't do it, when I try to get the documents
from the Hits property I get null documents.
My question is: Why? This is
so abstract for me and does not make sense, because if I get the documents
in my SearchFiles, those instances will exist only during the SearchFiles
runtime.
And I do not use those document instances for anything - I just
call them before returning the Hits container. So, when GetFoundDocument
reads the documents from the Hits property, it should return the
loaded documents from the Hits property without instantiating them before the
loading (copying from Hits to Hits must carry all the Hits contents, right?!).
I am using Visual Studio 2008 with the ATL Project type. Below follow the
LuceneComponent.cpp class, which is the implementation of the methods, to give
the general idea of how the client will use the component,
and the
LuceneFunctions.cpp class, to show the Index and Search process.
Thanks a lot in advance!
-Components Interface Methods :: LuceneComponent.cpp-
#include "LuceneActiveX.h"
#include "LuceneComponent.h"
// External Functions (declarations)
// Prototypes of the helper functions whose definitions appear in the
// LuceneFunctions.cpp listing.

// Builds a document for the file `path` with the given `id` and adds it to
// the index at `target`; `clearIndex` forces creation of a new index.
void IndexFiles(char* path, char* id, char* target, const bool
clearIndex);
// Parses `line` as a query on the "contents" field and searches the index.
Hits* SearchFiles(const char* index, char* line);
// `long` overload: returns the "path" field of hit number `idoc`.
BSTR SearchFiles(Hits* hit, long idoc);
// `int` overload: returns the "id" field of hit number `iDoc`.
BSTR SearchFiles(Hits* hHit, int iDoc);
// Marks every document of the index as deleted, then optimizes the index.
long DeleteFiles(const char* dir);
// Deletes document number `iDoc` from the index at `target`.
int DeleteFile(const char* target, int iDoc);
// Body is elided in the listing; declared here as returning int while the
// definition is written with int32_t.
int GetStats(const char* directory);
// Looks up a document number by its stored "id" field. NOTE(review): the
// definition names the first parameter `id`, not `path`.
int GetiDoc(char* path, char* target);
// Internal Variables (Object Properties)
char lpIndex[_MAX_PATH]; //Stores the path of the Index.
int lcIniciado; //Stores the status of the inicialization
process.
Hits* hHit; //Stores the Search results.
//Methods
//StartupTest
// The body of this method was elided ("[...]") in the listing, so its exact
// behavior cannot be documented from here.
// NOTE(review): presumably reports the initialization state through Status -
// confirm against the full source.
STDMETHODIMP CLuceneComponent::TestaIniciar (BSTR* Status)
{
[...]
// Initialization test
}
//Startup
// The body of this method was elided ("[...]") in the listing; only the
// author's summary of it is kept below.
STDMETHODIMP CLuceneComponent::Iniciar(LPSTR Chave, LPSTR IndiceDir, LPSTR *Config, long *Status)
{
[...]
// A successful startup loads the path of the Index (IndiceDir) into
// lpIndex and returns 1. The other parameters set the component's own
// properties.
}
//IndexFile
// Adds one file to the index at lpIndex and reports the index size.
//   ArquivoDir - path of the file to index.
//   DocId      - id stored with the document.
//   Tipo       - not used by this method.
//   nDocs      - [out] receives IndexSearcher::maxDoc() for the index.
// Returns S_FALSE when the component has not been started (lcIniciado != 1),
// E_POINTER when nDocs is NULL, S_OK otherwise.
STDMETHODIMP CLuceneComponent::IndexarArquivo(LPSTR ArquivoDir, LPSTR DocId, LPSTR Tipo, long* nDocs)
{
    if (lcIniciado != 1) //Startup Test
        return S_FALSE;
    if (nDocs == NULL)
        return E_POINTER;

    //IndexFiles is an internal function - see LuceneFunctions.cpp
    IndexFiles(ArquivoDir, DocId, lpIndex, false);

    //Returns the number of stored documents in the Index.
    IndexSearcher s(lpIndex);
    *nDocs = s.maxDoc();
    return S_OK;
}
//GetIndexSize
// Writes to *nDocs the document count reported by the reader of the index at
// lpIndex, or -1 when no index exists in that directory. Always returns S_OK.
STDMETHODIMP CLuceneComponent::ObterTamanhoIndice(long* nDocs)
{
    const bool hasIndex = IndexReader::indexExists(lpIndex);
    if (hasIndex)
    {
        // An index is present: ask its reader how many documents it holds.
        IndexSearcher searcher(lpIndex);
        *nDocs = searcher.getReader()->numDocs();
    }
    else
    {
        // The directory holds no index segments.
        *nDocs = -1;
    }
    return S_OK;
}
//SearchFiles
// Runs a query against the index at lpIndex and stores the results in the
// hHit component property.
//   StringReferencia - query text, handed to SearchFiles unchanged.
//   nArquivos        - [out] number of documents found (0 on failure).
// Returns E_POINTER when nArquivos is NULL, E_FAIL when the search throws or
// yields no Hits object, S_OK otherwise.
STDMETHODIMP CLuceneComponent::Pesquisar(LPSTR StringReferencia, long *nArquivos)
{
    if (nArquivos == NULL)
        return E_POINTER;
    *nArquivos = 0;

    //SearchFiles is an internal function and has 3 overloads!
    //See LuceneFunctions.cpp
    Hits* found = NULL;
    try
    {
        found = SearchFiles(lpIndex, StringReferencia);
    }
    catch (...)
    {
        // A C++ exception must not escape a COM method.
        found = NULL;
    }

    // Always replace the property, so a failed search never leaves results
    // of an earlier search behind.
    // NOTE(review): the previous Hits object is deliberately not deleted
    // here, because its ownership is not established by this method.
    hHit = found;
    if (found == NULL)
        return E_FAIL;

    //Returns the number of found documents.
    //Note: it is obtained from the Hits component property loaded by the
    //SearchFiles return!
    *nArquivos = hHit->length();
    return S_OK;
}
//GetFoundDocumentPath
// Returns the "path" field of result number iDoc of the last search.
//   iDoc   - zero-based position inside the hHit results.
//   docDir - [out] newly allocated BSTR owned by the caller.
// Returns E_POINTER when docDir is NULL and E_INVALIDARG when iDoc is outside
// the results. When there are no results at all, S_OK is returned with a
// message string in *docDir.
STDMETHODIMP CLuceneComponent::ObterDocDir(long iDoc, BSTR *docDir)
{
    if (docDir == NULL)
        return E_POINTER;
    *docDir = NULL;

    CComBSTR bsCom;
    //Test if the search has result (hHit stays NULL until a search ran).
    if (hHit != NULL && hHit->length() != 0)
    {
        if (iDoc < 0 || iDoc >= hHit->length())
            return E_INVALIDARG;

        //SearchFiles is an internal function and has 3 overloads!
        //See LuceneFunctions.cpp
        //The long argument selects the overload that reads the "path" field.
        //Attach takes over the returned BSTR; AppendBSTR would copy it and
        //leak the original.
        bsCom.Attach(SearchFiles(hHit, iDoc));
    }
    else
    {
        //Returns a message informing that there is no search result.
        bsCom.Append("A pesquisa não retornou resultados!");
    }
    *docDir = bsCom.Detach();
    return S_OK;
}
//GetFoundDocumentID
// Returns the "id" field of result number iDoc of the last search.
//   iDoc  - zero-based position inside the hHit results.
//   docID - [out] newly allocated BSTR owned by the caller.
// Returns E_POINTER when docID is NULL and E_INVALIDARG when iDoc is outside
// the results. When there are no results at all, S_OK is returned with a
// message string in *docID.
STDMETHODIMP CLuceneComponent::ObterDocID(long iDoc, BSTR *docID)
{
    if (docID == NULL)
        return E_POINTER;
    *docID = NULL;

    CComBSTR bsCom;
    //Test if the search has result (hHit stays NULL until a search ran).
    if (hHit != NULL && hHit->length() != 0)
    {
        if (iDoc < 0 || iDoc >= hHit->length())
            return E_INVALIDARG;

        //SearchFiles is an internal function and has 3 overloads!
        //See LuceneFunctions.cpp
        //The int argument selects the overload that reads the "id" field.
        //Attach takes over the returned BSTR; AppendBSTR would copy it and
        //leak the original.
        bsCom.Attach(SearchFiles(hHit, static_cast<int>(iDoc)));
    }
    else
    {
        //Returns a message informing that there is no search result.
        bsCom.Append("A pesquisa não retornou resultados!");
    }
    *docID = bsCom.Detach();
    return S_OK;
}
//GetFoundDocumentScore
// Returns the score of result number iDoc of the last search.
//   iDoc     - zero-based position inside the hHit results.
//   docScore - [out] the score, or 0 when there are no results.
// Returns E_POINTER when docScore is NULL, E_INVALIDARG when iDoc is outside
// the results, S_OK otherwise.
STDMETHODIMP CLuceneComponent::ObterDocScore(long iDoc, float *docScore)
{
    if (docScore == NULL)
        return E_POINTER;
    *docScore = 0;

    //Test if the search has result (hHit stays NULL until a search ran).
    if (hHit == NULL || hHit->length() == 0)
        return S_OK; //Abort it

    if (iDoc < 0 || iDoc >= hHit->length())
        return E_INVALIDARG;

    //Returns the document score obtained by the Hits property.
    //Note: it is obtained from the Hits component property loaded by the
    //SearchFiles return!
    *docScore = hHit->score(static_cast<int>(iDoc));
    return S_OK;
}
//ClearIndex (DeleteAllDocuments)
// Deletes every document of the index at lpIndex.
//   nDocs - [out] receives the value returned by DeleteFiles.
// Returns E_POINTER when nDocs is NULL, S_OK otherwise.
STDMETHODIMP CLuceneComponent::LimparIndice(long *nDocs)
{
    if (nDocs == NULL)
        return E_POINTER;

    //DeleteFiles is an internal function - see LuceneFunctions.cpp
    *nDocs = DeleteFiles(lpIndex);
    return S_OK;
}
//GetIndexStatistics
// The body of this method was elided ("[ ]") in the listing, so how
// Resultado is filled in cannot be documented from here.
STDMETHODIMP CLuceneComponent::ObterEstatisticas(long *Resultado)
{
//Return Index Statistics
[
]
}
//DeleteDocument
// Deletes the document whose stored id equals DocId from the index at lpIndex.
//   DocId  - id of the document to delete.
//   Status - [out] the value returned by DeleteFile, or -1 when no document
//            with that id was found.
// Returns E_POINTER when Status is NULL, S_OK otherwise.
STDMETHODIMP CLuceneComponent::ExcluirDoc(LPSTR DocId, int *Status)
{
    if (Status == NULL)
        return E_POINTER;

    //GetiDoc and DeleteFile are internal functions - see LuceneFunctions.cpp
    const int docNumber = GetiDoc(DocId, lpIndex);
    if (docNumber < 0)
    {
        // Unknown id: there is nothing to delete, so do not hand the
        // sentinel on to DeleteFile.
        *Status = -1;
        return S_OK;
    }

    *Status = DeleteFile(lpIndex, docNumber);
    return S_OK;
}
-Lucene Functions :: LuceneFunctions.cpp-
#include <iostream>
#include <fstream>
#include "stdafx.h"
#include "CLucene.h"
#include "CLucene/util/Reader.h"
#include "CLucene/util/Misc.h"
#include "CLucene/util/dirent.h"
#include "CLucene/index/Term.h"
#include "CLucene/index/SegmentTermEnum.h"
using namespace std;
using namespace lucene::index;
using namespace lucene::analysis;
using namespace lucene::util;
using namespace lucene::search;
using namespace lucene::store;
using namespace lucene::document;
using namespace lucene::queryParser;
//Creates a Document defining the fields and fill them with the file
information contents
Document* FileDocument(const char* f, const char* id){
Document* doc = _CLNEW Document();
TCHAR tbuf[CL_MAX_DIR];
STRCPY_AtoT(tbuf,f,CL_MAX_DIR);
doc->add( *_CLNEW Field(_T("path"), tbuf, true,true,false ) );
STRCPY_AtoT(tbuf,id,CL_MAX_DIR);
doc->add( *_CLNEW Field(_T("id"), tbuf, true,true,false ) );
FILE* fh = fopen(f,"r");
if ( fh != NULL ){
StringBuffer str;
int fn = fileno(fh);
struct stat filestat;
fstat(fn, &filestat);
str.reserve(filestat.st_size);
char abuf[1024];
TCHAR tbuf[1024];
size_t r;
do{
r = fread(abuf,1,1023,fh);
abuf[r]=0;
STRCPY_AtoT(tbuf,abuf,r);
tbuf[r]=0;
str.append(tbuf);
}while(r>0);
fclose(fh);
doc->add( *_CLNEW
Field(_T("contents"),str.getBuffer(),true,true,true));
}
return doc;
}
//Add to the Index the Document created by FileDocument
//  path       - file to index.
//  id         - id stored with the document.
//  target     - directory of the index.
//  clearIndex - true to create a new index even when one already exists.
void IndexFiles(char* path, char* id, char* target, const bool clearIndex){
    IndexWriter* writer = NULL;
    lucene::analysis::standard::StandardAnalyzer an;

    if ( !clearIndex && IndexReader::indexExists(target) ){
        // Append to the existing index, clearing a leftover lock first.
        if ( IndexReader::isLocked(target) ){
            IndexReader::unlock(target);
        }
        writer = _CLNEW IndexWriter( target, &an, false);
    }else{
        writer = _CLNEW IndexWriter( target ,&an, true);
    }
    writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);

    Document* doc = FileDocument(path,id);
    writer->addDocument(doc);
    // The Document allocated by FileDocument was never released before;
    // free it once the writer has consumed it.
    _CLDELETE(doc);

    writer->optimize();
    writer->close();
    _CLDELETE(writer);
}
// The body of this function was elided ("[ ]") in the listing, so what it
// computes for `directory` cannot be documented from here.
int32_t GetStats(const char* directory)
{
[
]
}
//Search for files which have the provided term in the contents field
Hits* SearchFiles(const char* index, char* line)
{
TCHAR tline [256];
standard::StandardAnalyzer analyzer;
IndexSearcher s(index);
STRCPY_AtoT(tline, line, strlen(line));
tline[strlen(line)]=0;
Query* q = QueryParser::parse(tline,_T("contents"), &analyzer);
Hits* hit = s.search(q);
//Here_ I have to do it before returning Hits
int size = hit->length();
for(int i=0; i<size; i++)
{
Document *doc = &hit->doc(i);
CComBSTR bstr = doc->get(_T("path"));
}
//_CLDELETE(hit);
_CLDELETE(q);
s.close();
return hit;
}
//Returns the document path field value
//Note: I have to create this function here because if I call the document
by the component interface from Hits property that will be null
BSTR SearchFiles(Hits* hit, long idoc)
{
Document *doc = &hit->doc((int)idoc);
CComBSTR bstr = doc->get(_T("path"));
return bstr.Detach();
}
//Returns the id field value
//  hit  - search results to read from.
//  idoc - zero-based position of the document inside hit.
//Returns a newly allocated BSTR owned by the caller, or NULL when hit is
//NULL or idoc is outside the results.
BSTR SearchFiles(Hits* hit, int idoc)
{
    if (hit == NULL || idoc < 0 || idoc >= hit->length())
        return NULL;

    Document& doc = hit->doc(idoc);
    CComBSTR bstr = doc.get(_T("id"));
    return bstr.Detach();
}
//Delete all files from the index
long DeleteFiles(const char *dir) {
IndexReader* reader = IndexReader::open(dir);
long count = 0;
for (int32_t i = 0; i < reader->maxDoc(); i++)
{
reader->deleteDocument (i);
count ++;
}
reader->close();
_CLDELETE(reader);
//OPTIMIZE
if ( IndexReader::indexExists(dir) )
{
lucene::analysis::SimpleAnalyzer an;
if ( IndexReader::isLocked(dir) )
{
IndexReader::unlock(dir);
}
IndexWriter* writer = _CLNEW IndexWriter( dir, &an,
false);
writer->optimize();
_CLDELETE(writer);
}
return count;
}
//Delete one file
//  target - directory of the index.
//  iDoc   - number of the document to delete.
//Returns iDoc when the document was deleted and the index optimized, -1
//when the index is empty, iDoc is out of range, or the index no longer
//exists after the deletion.
int DeleteFile(const char *target, int iDoc)
{
    IndexReader* reader = IndexReader::open(target);

    // Document numbers run from 0 to maxDoc()-1. numDocs() excludes deleted
    // documents, so it is only used for the "index is empty" test.
    const bool inRange = (iDoc > -1) && (iDoc < reader->maxDoc());
    if (reader->numDocs() == 0 || !inRange)
    {
        // Every exit path releases the reader. The empty-index return used
        // to leak it, and the out-of-range case ended without a return.
        reader->close();
        _CLDELETE(reader);
        return -1;
    }

    reader->deleteDocument (iDoc);
    reader->close();
    _CLDELETE(reader);

    // The reader is already released here; the old code closed it a second
    // time through the pointer that _CLDELETE had reset.
    if ( !IndexReader::indexExists(target) )
        return -1;

    //OPTIMIZE
    lucene::analysis::SimpleAnalyzer an;
    if ( IndexReader::isLocked(target) )
    {
        IndexReader::unlock(target);
    }
    IndexWriter* writer = _CLNEW IndexWriter( target, &an, false);
    writer->optimize();
    // Close explicitly before deleting, as IndexFiles does.
    writer->close();
    _CLDELETE(writer);
    return iDoc;
}
//Returns the document position in the index by the document id
//  id     - value to match against the stored "id" field.
//  target - directory of the index.
//Returns the value of getNumDocByTermDocs for the matching id, or -1 when
//an argument is NULL, the index is empty, or no document has that id.
int32_t GetiDoc(char* id, char* target){
    if (id == NULL || target == NULL)
        return -1;

    IndexReader* reader = IndexReader::open(target);
    int32_t docid = -1;

    if (reader->numDocs() != 0)
    {
        // Convert the id once. _T() only works on string literals, so it
        // cannot be applied to the id variable.
        TCHAR tid[CL_MAX_DIR];
        size_t len = strlen(id);
        if (len > CL_MAX_DIR - 1)
            len = CL_MAX_DIR - 1;
        STRCPY_AtoT(tid, id, len);
        tid[len] = 0;

        const int32_t total = reader->maxDoc();
        for (int32_t i = 0; i < total; i++)
        {
            // Deleted slots have no document to load.
            if (reader->isDeleted(i))
                continue;

            Document* doc = reader->document(i);
            if (doc == NULL)
                continue;
            // CComBSTR copies the field value, so the Document can be
            // released right away; it used to leak on every iteration.
            CComBSTR str = doc->get(_T("id"));
            _CLDELETE(doc);

            if(str == id){
                Term term = Term(_T("id"), tid);
                //getNumDocByTermDocs(Term*) is a method the author added to
                //IndexReader to get the document position.
                //NOTE(review): the stock termDocs(term) lookup would
                //presumably give the same number without scanning every
                //document - confirm before replacing.
                docid = reader->getNumDocByTermDocs(&term);
                break;
            }
        }
    }

    // Single exit: the reader is released on every path. The empty-index
    // return used to leak it.
    reader->close();
    _CLDELETE(reader);
    return docid;
}
|