[Opentrep-svn] SF.net SVN: opentrep:[129] trunk/opentrep
Status: Beta
Brought to you by:
denis_arnaud
From: <den...@us...> - 2009-07-15 14:45:49
|
Revision: 129 http://opentrep.svn.sourceforge.net/opentrep/?rev=129&view=rev Author: denis_arnaud Date: 2009-07-15 14:45:43 +0000 (Wed, 15 Jul 2009) Log Message: ----------- [Indexer] Fixed a bug in the indexer (where terms were inserted with spaces). Modified Paths: -------------- trunk/opentrep/opentrep/command/IndexBuilder.cpp trunk/opentrep/refdata/data/ref_place_names.csv trunk/opentrep/test/xapian/simple_search.cpp trunk/opentrep/test/xapian/string_search.cpp Modified: trunk/opentrep/opentrep/command/IndexBuilder.cpp =================================================================== --- trunk/opentrep/opentrep/command/IndexBuilder.cpp 2009-07-14 22:34:14 UTC (rev 128) +++ trunk/opentrep/opentrep/command/IndexBuilder.cpp 2009-07-15 14:45:43 UTC (rev 129) @@ -7,6 +7,8 @@ #include <string> #include <vector> #include <exception> +// Boost +#include <boost/tokenizer.hpp> // OpenTrep #include <opentrep/bom/World.hpp> #include <opentrep/bom/Place.hpp> @@ -21,6 +23,30 @@ #include <xapian.h> namespace OPENTREP { + + // ////////////////////////////////////////////////////////////////////// + void tokeniseAndAddToDocument (const std::string& iPhrase, + Xapian::Document& ioDocument, + Xapian::WritableDatabase& ioDatabase) { + + // Boost Tokeniser + typedef boost::tokenizer<boost::char_separator<char> > Tokeniser_T; + + // Define the separators + boost::char_separator<char> lSepatorList(" .,;:|+-*/_=!@#$%`~^&(){}[]?'<>\""); + + // Initialise the phrase to be tokenised + Tokeniser_T lTokens (iPhrase, lSepatorList); + for (Tokeniser_T::const_iterator tok_iter = lTokens.begin(); + tok_iter != lTokens.end(); ++tok_iter) { + const std::string& lTerm = *tok_iter; + + ioDatabase.add_spelling (lTerm); + ioDocument.add_term (lTerm); + + OPENTREP_LOG_DEBUG ("Added term: " << lTerm); + } + } // ////////////////////////////////////////////////////////////////////// void IndexBuilder:: @@ -90,8 +116,9 @@ // extended, alternate, etc.) if (lName.empty() == false) { // OPENTREP_LOG_DEBUG ("Added name: " << lName); - lDocument.add_term (lName); ++idx; - ioDatabase.add_spelling (lName); + // lDocument.add_term (lName); ++idx; + // ioDatabase.add_spelling (lName); + tokeniseAndAddToDocument (lName, lDocument, ioDatabase); } } } Modified: trunk/opentrep/refdata/data/ref_place_names.csv =================================================================== --- trunk/opentrep/refdata/data/ref_place_names.csv 2009-07-14 22:34:14 UTC (rev 128) +++ trunk/opentrep/refdata/data/ref_place_names.csv 2009-07-15 14:45:43 UTC (rev 129) @@ -1826,7 +1826,7 @@ en,jbt,bethel jbt,bethel jbt,bethel/ak/us:city landing en,jca,cannes jca,cannes jca,cannes/fr:croisette hpt en,jcb,joacaba,joacaba,joacaba/sc/br -en,jcc,sanfrancisco jcc,sanfrancisco jc,san francisco/ca/us:china hpt +en,jcc,san francisco jcc,san francisco jc,san francisco/ca/us:china hpt en,jcd,st croix is jcd,st croix is jcd,st croix is/vi:downtown hpt en,jce,convention,convention,convention/ca/us:heliport en,jch,qasigiannguit,qasigiannguit,qasigiannguit/gl Modified: trunk/opentrep/test/xapian/simple_search.cpp =================================================================== --- trunk/opentrep/test/xapian/simple_search.cpp 2009-07-14 22:34:14 UTC (rev 128) +++ trunk/opentrep/test/xapian/simple_search.cpp 2009-07-15 14:45:43 UTC (rev 129) @@ -1,5 +1,6 @@ // STL #include <iostream> +#include <string> // Xapian #include <xapian.h> @@ -7,46 +8,59 @@ int main (int argc, char* argv[]) { // Simplest possible options parsing: we just require two or more - // parameters. - if (argc < 3) { - std::cout << "Usage: " << argv[0] - << " <path to database> <search terms>" << std::endl; - return -1; - } + // parameters. + if (argc < 3) { + std::cout << "Usage: " << argv[0] + << " <path to database> <search terms>" << std::endl; + return -1; + } - // Catch any Xapian::Error exceptions thrown - try { + // Catch any Xapian::Error exceptions thrown + try { - // Make the database - Xapian::Database db (argv[1]); + // Open the database for searching. + Xapian::Database db (argv[1]); - // Start an enquire session - Xapian::Enquire enquire (db); + // Start an enquire session + Xapian::Enquire enquire (db); - // Build the query object - Xapian::Query query (Xapian::Query::OP_AND, argv + 2, argv + argc); - std::cout << "Performing query `" << query.get_description() << "'" - << std::endl; - - // Give the query object to the enquire session - enquire.set_query (query); + // Combine the rest of the command line arguments with spaces between + // them, so that simple queries don't have to be quoted at the shell + // level. + std::string query_string (argv[2]); + argv += 3; + while (*argv) { + query_string += ' '; + query_string += *argv++; + } - // Get the top 10 results of the query - Xapian::MSet matches = enquire.get_mset (0, 10); + // Parse the query string to produce a Xapian::Query object. + Xapian::QueryParser qp; + Xapian::Stem stemmer ("english"); + qp.set_stemmer (stemmer); + qp.set_database (db); + qp.set_stemming_strategy (Xapian::QueryParser::STEM_SOME); + Xapian::Query query = qp.parse_query (query_string); + std::cout << "Parsed query is: " << query.get_description() << std::endl; - // Display the results - std::cout << matches.size() << " results found" << std::endl; + // Find the top 10 results for the query. + enquire.set_query (query); + Xapian::MSet matches = enquire.get_mset(0, 10); - for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) { - Xapian::Document doc = i.get_document(); - std::cout << "Document ID " << *i << "\t" << - i.get_percent() << "% [" << - doc.get_data() << "]" << std::endl; - } + // Display the results. + std::cout << matches.get_matches_estimated() << " results found." + << std::endl; + std::cout << "Matches 1-" << matches.size() << ":" << std::endl << std::endl; + + for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) { + std::cout << i.get_rank() + 1 << ": " << i.get_percent() << "% docid=" + << *i << " [" << i.get_document().get_data() << "]" + << std::endl << std::endl; + } - } catch (const Xapian::Error& error) { - std::cerr << "Exception: " << error.get_msg() << std::endl; - } + } catch (const Xapian::Error& error) { + std::cerr << "Exception: " << error.get_msg() << std::endl; + } - return 0; + return 0; } Modified: trunk/opentrep/test/xapian/string_search.cpp =================================================================== --- trunk/opentrep/test/xapian/string_search.cpp 2009-07-14 22:34:14 UTC (rev 128) +++ trunk/opentrep/test/xapian/string_search.cpp 2009-07-15 14:45:43 UTC (rev 129) @@ -41,7 +41,11 @@ // Build the query object Xapian::QueryParser lQueryParser; lQueryParser.set_database (db); - lQueryParser.set_default_op (Xapian::Query::OP_NEAR); + // As explained in http://www.xapian.org/docs/queryparser.html, + // Xapian::Query::OP_ADJ is better than Xapian::Query::OP_PHRASE, + // but only available from version 1.0.13 of Xapian + // lQueryParser.set_default_op (Xapian::Query::OP_ADJ); + lQueryParser.set_default_op (Xapian::Query::OP_PHRASE); std::cout << "Query parser `" << lQueryParser.get_description() << "'" << std::endl; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |