[Opentrep-svn] SF.net SVN: opentrep:[131] trunk/opentrep/test/xapian/string_search.cpp
Status: Beta
Brought to you by:
denis_arnaud
From: <den...@us...> - 2009-07-15 23:42:10
|
Revision: 131 http://opentrep.svn.sourceforge.net/opentrep/?rev=131&view=rev Author: denis_arnaud Date: 2009-07-15 23:42:07 +0000 (Wed, 15 Jul 2009) Log Message: ----------- [Dev] Improved the string search, by allowing several cities at once. There is still some work to do. Modified Paths: -------------- trunk/opentrep/test/xapian/string_search.cpp Modified: trunk/opentrep/test/xapian/string_search.cpp =================================================================== --- trunk/opentrep/test/xapian/string_search.cpp 2009-07-15 16:59:19 UTC (rev 130) +++ trunk/opentrep/test/xapian/string_search.cpp 2009-07-15 23:42:07 UTC (rev 131) @@ -1,140 +1,386 @@ +// C +#include <cassert> // STL #include <iostream> #include <sstream> +#include <string> +#include <list> +// Boost +#include <boost/tokenizer.hpp> // Xapian #include <xapian.h> -// ////////////// M A I N ////////////// -int main (int argc, char* argv[]) { +// /////////////////////// Type definitions ///////////////////////////// +/** List of simple words (STL strings). */ +typedef std::list<std::string> WordList_T; - // Simplest possible options parsing: we just require two or more - // parameters. - if (argc < 3) { - std::cout << "Usage: " << argv[0] - << " <path to database> <search terms>" << std::endl; - return -1; +/** List of Xapian documents. 
*/ +typedef std::list<Xapian::Document> DocumentList_T; + + +// ////////////////////////////////////////////////////////////////////// +void tokeniseAndAddToDocument (const std::string& iPhrase, + WordList_T& ioWordList) { + // Empty the word list + ioWordList.clear(); + + // Boost Tokeniser + typedef boost::tokenizer<boost::char_separator<char> > Tokeniser_T; + + // Define the separators + const boost::char_separator<char> lSepatorList(" .,;:|+-*/_=!@#$%`~^&(){}[]?'<>\""); + + // Initialise the phrase to be tokenised + Tokeniser_T lTokens (iPhrase, lSepatorList); + for (Tokeniser_T::const_iterator tok_iter = lTokens.begin(); + tok_iter != lTokens.end(); ++tok_iter) { + const std::string& lTerm = *tok_iter; + ioWordList.push_back (lTerm); + + // OPENTREP_LOG_DEBUG ("Added term: " << lTerm); + } +} + +// ////////////////////////////////////////////////////////////////////// +std::string createStringFromWordList (const WordList_T& iWordList) { + std::ostringstream oStr; + + unsigned short idx = iWordList.size(); + for (WordList_T::const_iterator itWord = iWordList.begin(); + itWord != iWordList.end(); ++itWord, --idx) { + const std::string& lWord = *itWord; + oStr << lWord; + if (idx > 1) { + oStr << " "; } + } + + return oStr.str(); +} - // Catch any Xapian::Error exceptions thrown - try { +// ////////////////////////////////////////////////////////////////////// +void createCorrectedWordList (const WordList_T& iOriginalWordList, + WordList_T& ioCorrectedWordList, + const Xapian::Database& iDatabase) { + // Empty the target list + ioCorrectedWordList.clear(); + + // Catch any Xapian::Error exceptions thrown + try { - // Make the database - Xapian::Database db (argv[1]); + for (WordList_T::const_iterator itWord = iOriginalWordList.begin(); + itWord != iOriginalWordList.end(); ++itWord) { + const std::string& lOriginalWord = *itWord; + const std::string& lSuggestedWord = + iDatabase.get_spelling_suggestion (lOriginalWord, 3); - // Start an enquire session - 
Xapian::Enquire enquire (db); + if (lSuggestedWord.empty() == true) { + ioCorrectedWordList.push_back (lOriginalWord); - std::ostringstream oOriginalStr; - std::ostringstream oCorrectedStr; - for (int idx=2; idx != argc; ++idx) { - if (idx != 2) { - oOriginalStr << " "; - oCorrectedStr << " "; - } - const std::string lWord (argv[idx]); - const std::string lSuggestedWord = db.get_spelling_suggestion(lWord, 3); - std::cout << "Word `" << lWord << "' ==> Suggested word `" - << lSuggestedWord << "'" << std::endl; - oOriginalStr << lWord; + } else { + ioCorrectedWordList.push_back (lSuggestedWord); + } - if (lSuggestedWord.empty() == true) { - oCorrectedStr << lWord; - - } else { - oCorrectedStr << lSuggestedWord; - } - } + // DEBUG + std::cout << "Original word: `" << lOriginalWord + << "' ==> corrected word: `" << lSuggestedWord << "'" + << std::endl; + } + + } catch (const Xapian::Error& error) { + std::cerr << "Exception: " << error.get_msg() << std::endl; + } +} + +// /////////////////////////////////////////////////////////////////// +void searchString (Xapian::MSet& ioMatchingSet, + const std::string& iSearchString, + Xapian::Database& ioDatabase) { + + // Catch any Xapian::Error exceptions thrown + try { - const std::string lOriginalQueryString = oOriginalStr.str(); - const std::string lCorrectedQueryString = oCorrectedStr.str(); - const std::string lFullWordCorrectedString = - db.get_spelling_suggestion (lOriginalQueryString, 4); + /** + Build another string, in addition to the original one. 
Overall, + there are thus two strings: + <br><ul> + <li>One with the original words given by the user</li> + <li>One with the orthographic-corrected words, wherever + relevant (otherwise, the original word is taken)</li> + </ul> + <br>For instance, 'sna francicso' would give the following + two strings: + <br><ul> + <li>'sna francicso' (original)</li> + <li>'sna francisco' (corrected, where relevant, word by word)</li> + </ul> + <br>Note that, as 'sna' exists in the dictionary (Santa Ana, CA, USA), + it is not replaced. We shall take care of the whole string in a + further step below. + */ + WordList_T lOriginalWordList; + tokeniseAndAddToDocument (iSearchString, lOriginalWordList); - std::cout << "Query string `" << lOriginalQueryString - << "' ==> corrected query string: `" << lCorrectedQueryString - << "' and correction for the full query string: `" - << lFullWordCorrectedString << "'" << std::endl; + const std::string lOriginalQueryString = + createStringFromWordList (lOriginalWordList); + + WordList_T lCorrectedWordList; + createCorrectedWordList (lOriginalWordList, lCorrectedWordList, ioDatabase); + + const std::string lCorrectedQueryString = + createStringFromWordList (lCorrectedWordList); - // Build the query object - Xapian::QueryParser lQueryParser; - lQueryParser.set_database (db); - // As explained in http://www.xapian.org/docs/queryparser.html, - // Xapian::Query::OP_ADJ is better than Xapian::Query::OP_PHRASE, - // but only available from version 1.0.13 of Xapian. - // lQueryParser.set_default_op (Xapian::Query::OP_ADJ); - lQueryParser.set_default_op (Xapian::Query::OP_PHRASE); + /** + Try to find, if relevant, an orthographic suggestion for the whole + phrase/string. With the above example, 'sna francicso' yields the + suggestion 'san francisco'. 
+ */ + const std::string lFullWordCorrectedString = + ioDatabase.get_spelling_suggestion (lOriginalQueryString, 3); - std::cout << "Query parser `" << lQueryParser.get_description() << "'" - << std::endl; + std::cout << "Query string `" << lOriginalQueryString + << "' ==> corrected query string: `" << lCorrectedQueryString + << "' and correction for the full query string: `" + << lFullWordCorrectedString << "'" << std::endl; - Xapian::Query lQuery = - lQueryParser.parse_query (lOriginalQueryString, - Xapian::QueryParser::FLAG_BOOLEAN - | Xapian::QueryParser::FLAG_PHRASE - | Xapian::QueryParser::FLAG_LOVEHATE - | Xapian::QueryParser::FLAG_SPELLING_CORRECTION); - //Xapian::Query lCorrectedQuery= lQueryParser.get_corrected_query_string(); - Xapian::Query lCorrectedQuery = - lQueryParser.parse_query (lCorrectedQueryString, - Xapian::QueryParser::FLAG_BOOLEAN - | Xapian::QueryParser::FLAG_PHRASE - | Xapian::QueryParser::FLAG_LOVEHATE); + // Build the query object + Xapian::QueryParser lQueryParser; + lQueryParser.set_database (ioDatabase); + /** + As explained in http://www.xapian.org/docs/queryparser.html, + Xapian::Query::OP_ADJ is better than Xapian::Query::OP_PHRASE, + but only available from version 1.0.13 of Xapian. + */ + // lQueryParser.set_default_op (Xapian::Query::OP_ADJ); + lQueryParser.set_default_op (Xapian::Query::OP_PHRASE); + + std::cout << "Query parser `" << lQueryParser.get_description() << "'" + << std::endl; - Xapian::Query lFullQueryCorrected = - lQueryParser.parse_query (lFullWordCorrectedString, - Xapian::QueryParser::FLAG_BOOLEAN - | Xapian::QueryParser::FLAG_PHRASE - | Xapian::QueryParser::FLAG_LOVEHATE); + /** + The Xapian::QueryParser::parse_query() method aggregates all the words + with operators in between them (here, the "PHRASE" operator). + With the above example ('sna francicso'), it yields + "sna PHRASE 2 francicso". 
+ */ + Xapian::Query lQuery = + lQueryParser.parse_query (lOriginalQueryString, + Xapian::QueryParser::FLAG_BOOLEAN + | Xapian::QueryParser::FLAG_PHRASE + | Xapian::QueryParser::FLAG_LOVEHATE + | Xapian::QueryParser::FLAG_SPELLING_CORRECTION); + /** + Strangely enough (is it?), the corrected query given by the Xapian + QueryParser corresponds to the full original string, where words + have been corrected one by one, but considered as a single block. + With the above example, 'sna francicso' yields (wrongly) + 'sna francisco', instead of "sna PHRASE 2 francisco", as generated + by the following code. + */ + // Xapian::Query lCorrectedQuery = + // lQueryParser.get_corrected_query_string(); + Xapian::Query lCorrectedQuery = + lQueryParser.parse_query (lCorrectedQueryString, + Xapian::QueryParser::FLAG_BOOLEAN + | Xapian::QueryParser::FLAG_PHRASE + | Xapian::QueryParser::FLAG_LOVEHATE); - std::cout << "Query `" << lQuery.get_description() - << "', corrected query `" << lCorrectedQuery.get_description() - << "' and corrected for full query `" - << lFullQueryCorrected.get_description() << "' " << std::endl; + /** + As, with the above example, the full corrected string is + 'san francisco', it yields the query "san PHRASE 2 francisco", + which is eventually right. 
+ */ + Xapian::Query lFullQueryCorrected = + lQueryParser.parse_query (lFullWordCorrectedString, + Xapian::QueryParser::FLAG_BOOLEAN + | Xapian::QueryParser::FLAG_PHRASE + | Xapian::QueryParser::FLAG_LOVEHATE); + + std::cout << "Query `" << lQuery.get_description() + << "', corrected query `" << lCorrectedQuery.get_description() + << "' and corrected for full query `" + << lFullQueryCorrected.get_description() << "' " << std::endl; - // Give the query object to the enquire session - enquire.set_query (lQuery); + // Start an enquire session + Xapian::Enquire enquire (ioDatabase); - // Get the top 10 results of the query - Xapian::MSet matches = enquire.get_mset (0, 10); + // Give the query object to the enquire session + enquire.set_query (lQuery); + // Get the top 10 results of the query + ioMatchingSet = enquire.get_mset (0, 10); + + // Display the results + int nbMatches = ioMatchingSet.size(); + std::cout << nbMatches << " results found" << std::endl; + + /** + When no match is found, we search on the corrected phrase/string + (where the words have been corrected one by one). + */ + if (nbMatches == 0) { + enquire.set_query (lCorrectedQuery); + ioMatchingSet = enquire.get_mset (0, 10); + // Display the results - int nbMatches = matches.size(); - std::cout << nbMatches << " results found" << std::endl; + nbMatches = ioMatchingSet.size(); + std::cout << nbMatches << " results found on corrected string" + << std::endl; + } - if (nbMatches == 0) { - enquire.set_query (lCorrectedQuery); - matches = enquire.get_mset (0, 10); + /** + If there is still no match, we search on the string corrected + as a whole. 
+ */ + if (nbMatches == 0) { + enquire.set_query (lFullQueryCorrected); + ioMatchingSet = enquire.get_mset (0, 10); + + // Display the results + nbMatches = ioMatchingSet.size(); + std::cout << nbMatches << " results found on corrected full string" + << std::endl; + } - // Display the results - nbMatches = matches.size(); - std::cout << nbMatches << " results found on corrected string" - << std::endl; + const Xapian::Query& lActualQuery = enquire.get_query(); + std::cout << "Actual query `" << lActualQuery.get_description() + << "'" << std::endl; + + } catch (const Xapian::Error& error) { + std::cerr << "Exception: " << error.get_msg() << std::endl; + } +} - if (nbMatches == 0) { - enquire.set_query (lFullQueryCorrected); - matches = enquire.get_mset (0, 10); +// ////////////////////////////////////////////////////////////////////// +void createDocumentListFromMSet (const Xapian::MSet& iMatchingSet, + DocumentList_T& ioDocumentList) { - // Display the results - nbMatches = matches.size(); - std::cout << nbMatches << " results found on corrected full string" - << std::endl; - } + for (Xapian::MSetIterator itDoc = iMatchingSet.begin(); + itDoc != iMatchingSet.end(); ++itDoc) { + const Xapian::Document& lDocument = itDoc.get_document(); + ioDocumentList.push_back (lDocument); + } +} + +// ////////////////////////////////////////////////////////////////////// +void removeOneWord (std::string& ioQueryString) { + assert (ioQueryString.empty() == false); + + WordList_T lWordList; + tokeniseAndAddToDocument (ioQueryString, lWordList); + assert (lWordList.empty() == false); + + // Remove the furthest right word + lWordList.pop_back(); + + const std::string& lReducedString = createStringFromWordList (lWordList); + ioQueryString = lReducedString; +} + +// ////////////////////////////////////////////////////////////////////// +std::string display (const Xapian::MSet& iMatchingSet) { + std::ostringstream oStr; + + for (Xapian::MSetIterator itDoc = iMatchingSet.begin(); + itDoc 
!= iMatchingSet.end(); ++itDoc) { + const Xapian::Document& lDocument = itDoc.get_document(); + const Xapian::docid& lDocID = lDocument.get_docid(); + + oStr << "Document ID " << lDocID << "\t" << itDoc.get_percent() + << "% [" << lDocument.get_data() << "]" << std::endl; + } + + return oStr.str(); +} + +// ////////////////////////////////////////////////////////////////////// +std::string display (const DocumentList_T& iDocumentList) { + std::ostringstream oStr; + + for (DocumentList_T::const_iterator itDoc = iDocumentList.begin(); + itDoc != iDocumentList.end(); ++itDoc) { + const Xapian::Document& lDocument = *itDoc; + const Xapian::docid& lDocID = lDocument.get_docid(); + + oStr << "Document ID " << lDocID << "\t[" + << lDocument.get_data() << "]" << std::endl; + } + + return oStr.str(); +} + +// //////////////////////////// M A I N ////////////////////////////// +int main (int argc, char* argv[]) { + + // Simplest possible options parsing: we just require two or more + // parameters. + if (argc < 3) { + std::cout << "Usage: " << argv[0] + << " <path to database> <search terms>" << std::endl; + return -1; + } + + // Catch any Xapian::Error exceptions thrown + try { + + // Make the database + Xapian::Database lDatabase (argv[1]); + + /** + Build a query string from the command line parameters. + That way, any other front end producing a query string will + be fine. 
+ */ + std::ostringstream lQueryStringStr; + for (unsigned int idx = 2; idx != argc; ++idx) { + if (idx != 2) { + lQueryStringStr << " "; } + const std::string lWord (argv[idx]); + lQueryStringStr << lWord; + } + const std::string& lCommandLineQueryString = lQueryStringStr.str(); - const Xapian::Query& lActualQuery = enquire.get_query(); - std::cout << "Actual query `" << lActualQuery.get_description() - << "'" << std::endl; + /** + Search with the initial full string, then by removing a word if + there was no result, then by removing another word if there was + again no result, until either a result is found or the + resulting string gets empty. + */ + std::string lQueryString (lCommandLineQueryString); + bool shouldStop = false; + while (shouldStop == false) { + // DEBUG + std::cout << std::endl << "================================" << std::endl + << "New query string: `" << lQueryString << "'" << std::endl; - for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) { - Xapian::Document doc = i.get_document(); - std::cout << "Document ID " << *i << "\t" << - i.get_percent() << "% [" << - doc.get_data() << "]" << std::endl; + // Retrieve the list of documents matching the query string + Xapian::MSet lMatchingSet; + searchString (lMatchingSet, lQueryString, lDatabase); + std::cout << display (lMatchingSet); + + // Create the corresponding list of documents + DocumentList_T lDocumentList; + createDocumentListFromMSet (lMatchingSet, lDocumentList); + + // Stop if a result is found. + if (lDocumentList.empty() == false) { + shouldStop = true; + break; } - } catch (const Xapian::Error& error) { - std::cerr << "Exception: " << error.get_msg() << std::endl; + // Remove a word from the query string + removeOneWord (lQueryString); + + // Stop when the resulting string gets empty. 
+ if (lQueryString.empty() == true) { + shouldStop = true; + } } + + } catch (const Xapian::Error& error) { + std::cerr << "Exception: " << error.get_msg() << std::endl; + } - return 0; + return 0; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |