From: <ust...@us...> - 2009-07-08 09:56:43
|
Revision: 3015 http://clucene.svn.sourceforge.net/clucene/?rev=3015&view=rev Author: ustramooner Date: 2009-07-08 09:56:41 +0000 (Wed, 08 Jul 2009) Log Message: ----------- Added FuzzyQuery tests - and removed old ones from TestWildcard.cpp Modified Paths: -------------- branches/lucene2_3_2/src/test/search/TestQueries.cpp branches/lucene2_3_2/src/test/search/TestWildcard.cpp Modified: branches/lucene2_3_2/src/test/search/TestQueries.cpp =================================================================== --- branches/lucene2_3_2/src/test/search/TestQueries.cpp 2009-07-08 09:55:52 UTC (rev 3014) +++ branches/lucene2_3_2/src/test/search/TestQueries.cpp 2009-07-08 09:56:41 UTC (rev 3015) @@ -7,46 +7,368 @@ #include "test.h" - void testPrefixQuery(CuTest *tc){ - WhitespaceAnalyzer analyzer; +/// Java PrefixQuery test, 2009-06-02 +void testPrefixQuery(CuTest *tc){ + WhitespaceAnalyzer analyzer; + RAMDirectory directory; + const TCHAR* categories[] = {_T("/Computers"), _T("/Computers/Mac"), _T("/Computers/Windows")}; + + IndexWriter writer( &directory, &analyzer, true); + for (int i = 0; i < 3; i++) { + Document *doc = _CLNEW Document(); + doc->add(*_CLNEW Field(_T("category"), categories[i], Field::STORE_YES | Field::INDEX_UNTOKENIZED)); + writer.addDocument(doc); + _CLDELETE(doc); + } + writer.close(); + + Term* t = _CLNEW Term(_T("category"), _T("/Computers")); + PrefixQuery *query = _CLNEW PrefixQuery(t); + IndexSearcher searcher(&directory); + Hits *hits = searcher.search(query); + CLUCENE_ASSERT(3 == hits->length()); // All documents in /Computers category and below + _CLDELETE(query); + _CLDELETE(t); + _CLDELETE(hits); + + t = _CLNEW Term(_T("category"), _T("/Computers/Mac")); + query = _CLNEW PrefixQuery(t); + hits = searcher.search(query); + CLUCENE_ASSERT(1 == hits->length()); // One in /Computers/Mac + _CLDELETE(query); + _CLDELETE(t); + _CLDELETE(hits); +} + +#ifndef NO_FUZZY_QUERY + +class TestFuzzyQuery { +private: + CuTest *tc; + + void addDoc(const TCHAR* text, IndexWriter* writer) { + Document* doc = _CLNEW Document(); + doc->add(*_CLNEW Field(_T("field"), text, Field::STORE_YES, Field::INDEX_TOKENIZED)); + writer->addDocument(doc); + _CLLDELETE(doc); + } + + Hits* searchQuery(IndexSearcher* searcher, const TCHAR* field, const TCHAR* text, + float_t minSimilarity=FuzzyQuery::defaultMinSimilarity, size_t prefixLen=0){ + + Term* t = _CLNEW Term(field, text); + FuzzyQuery* query = _CLNEW FuzzyQuery(t, minSimilarity, prefixLen); + Hits* hits = searcher->search(query); + _CLLDELETE(query); + _CLLDECDELETE(t); + return hits; + } + + size_t getHitsLength(IndexSearcher* searcher, const TCHAR* field, const TCHAR* text, + float_t minSimilarity=FuzzyQuery::defaultMinSimilarity, size_t prefixLen=0){ + + Hits* hits = searchQuery(searcher, field, text, minSimilarity, prefixLen); + size_t ret = hits->length(); + _CLLDELETE(hits); + return ret; + } +public: + TestFuzzyQuery(CuTest *_tc):tc(_tc){ + } + ~TestFuzzyQuery(){ + } + + void testFuzziness() { RAMDirectory directory; - const TCHAR* categories[] = {_T("/Computers"), _T("/Computers/Mac"), _T("/Computers/Windows")}; - - IndexWriter writer( &directory, &analyzer, true); - for (int i = 0; i < 3; i++) { - Document *doc = _CLNEW Document(); - doc->add(*_CLNEW Field(_T("category"), categories[i], Field::STORE_YES | Field::INDEX_UNTOKENIZED)); - writer.addDocument(doc); - _CLDELETE(doc); - } + WhitespaceAnalyzer a; + IndexWriter writer(&directory, &a, true); + addDoc(_T("aaaaa"), &writer); + addDoc(_T("aaaab"), &writer); + addDoc(_T("aaabb"), &writer); + addDoc(_T("aabbb"), &writer); + addDoc(_T("abbbb"), &writer); + addDoc(_T("bbbbb"), &writer); + addDoc(_T("ddddd"), &writer); + writer.optimize(); writer.close(); - - Term* t = _CLNEW Term(_T("category"), _T("/Computers")); - PrefixQuery *query = _CLNEW PrefixQuery(t); IndexSearcher searcher(&directory); - Hits *hits = searcher.search(query); - CLUCENE_ASSERT(3 == hits->length()); // All documents in /Computers category and below - _CLDELETE(query); - _CLDELETE(t); - _CLDELETE(hits); - t = _CLNEW Term(_T("category"), _T("/Computers/Mac")); - query = _CLNEW PrefixQuery(t); - hits = searcher.search(query); - CLUCENE_ASSERT(1 == hits->length()); // One in /Computers/Mac - _CLDELETE(query); - _CLDELETE(t); - _CLDELETE(hits); + CLUCENE_ASSERT( getHitsLength(&searcher, _T("field"), _T("aaaaa")) == 3); + + // same with prefix + CLUCENE_ASSERT( getHitsLength(&searcher, _T("field"), _T("aaaaa"),FuzzyQuery::defaultMinSimilarity,1) == 3); + CLUCENE_ASSERT( getHitsLength(&searcher, _T("field"), _T("aaaaa"),FuzzyQuery::defaultMinSimilarity,2) == 3); + CLUCENE_ASSERT( getHitsLength(&searcher, _T("field"), _T("aaaaa"),FuzzyQuery::defaultMinSimilarity,3) == 3); + CLUCENE_ASSERT( getHitsLength(&searcher, _T("field"), _T("aaaaa"),FuzzyQuery::defaultMinSimilarity,4) == 2); + CLUCENE_ASSERT( getHitsLength(&searcher, _T("field"), _T("aaaaa"),FuzzyQuery::defaultMinSimilarity,5) == 1); + CLUCENE_ASSERT( getHitsLength(&searcher, _T("field"), _T("aaaaa"),FuzzyQuery::defaultMinSimilarity,6) == 1); + + // not similar enough: + CuAssertTrue(tc, getHitsLength(&searcher, _T("field"), _T("xxxxx")) == 0); + CuAssertTrue(tc, getHitsLength(&searcher, _T("field"), _T("aaccc")) == 0); // edit distance to "aaaaa" = 3 + + // query identical to a word in the index: + Hits* hits = searchQuery(&searcher, _T("field"), _T("aaaaa")); + CLUCENE_ASSERT( hits->length() == 3); + CuAssertStrEquals(tc, NULL, _T("aaaaa"), hits->doc(0).get(_T("field"))); + // default allows for up to two edits: + CuAssertStrEquals(tc, NULL, _T("aaaab"), hits->doc(1).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaabb"), hits->doc(2).get(_T("field"))); + _CLLDELETE(hits); + + // query similar to a word in the index: + hits = searchQuery(&searcher, _T("field"), _T("aaaac")); + CLUCENE_ASSERT( hits->length() == 3); + CuAssertStrEquals(tc, NULL, _T("aaaaa"), hits->doc(0).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaaab"), hits->doc(1).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaabb"), hits->doc(2).get(_T("field"))); + _CLLDELETE(hits); + + // now with prefix + hits = searchQuery(&searcher, _T("field"), _T("aaaac"), FuzzyQuery::defaultMinSimilarity, 1); + CLUCENE_ASSERT( hits->length() == 3); + CuAssertStrEquals(tc, NULL, _T("aaaaa"), hits->doc(0).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaaab"), hits->doc(1).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaabb"), hits->doc(2).get(_T("field"))); + _CLLDELETE(hits); + + hits = searchQuery(&searcher, _T("field"), _T("aaaac"), FuzzyQuery::defaultMinSimilarity, 2); + CLUCENE_ASSERT( hits->length() == 3); + CuAssertStrEquals(tc, NULL, _T("aaaaa"), hits->doc(0).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaaab"), hits->doc(1).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaabb"), hits->doc(2).get(_T("field"))); + _CLLDELETE(hits); + + hits = searchQuery(&searcher, _T("field"), _T("aaaac"), FuzzyQuery::defaultMinSimilarity, 3); + CLUCENE_ASSERT( hits->length() == 3); + CuAssertStrEquals(tc, NULL, _T("aaaaa"), hits->doc(0).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaaab"), hits->doc(1).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaabb"), hits->doc(2).get(_T("field"))); + _CLLDELETE(hits); + + hits = searchQuery(&searcher, _T("field"), _T("aaaac"), FuzzyQuery::defaultMinSimilarity, 4); + CLUCENE_ASSERT( hits->length() == 2); + CuAssertStrEquals(tc, NULL, _T("aaaaa"), hits->doc(0).get(_T("field"))); + CuAssertStrEquals(tc, NULL, _T("aaaab"), hits->doc(1).get(_T("field"))); + _CLLDELETE(hits); + + hits = searchQuery(&searcher, _T("field"), _T("aaaac"), FuzzyQuery::defaultMinSimilarity, 5); + CLUCENE_ASSERT( hits->length() == 0); + CuAssertStrEquals(tc, NULL, _T("aaaaa"), hits->doc(0).get(_T("field"))); + _CLLDELETE(hits); + + + hits = searchQuery(&searcher, _T("field"), _T("ddddX")); + CLUCENE_ASSERT( hits->length() == 1); + CuAssertStrEquals(tc, NULL, _T("ddddd"), hits->doc(0).get(_T("field"))); + _CLLDELETE(hits); + + // now with prefix + hits = searchQuery(&searcher, _T("field"), _T("ddddX"), FuzzyQuery::defaultMinSimilarity, 1); + CLUCENE_ASSERT( hits->length() == 1); + CuAssertStrEquals(tc, NULL, _T("ddddd"), hits->doc(0).get(_T("field"))); + _CLLDELETE(hits); + + hits = searchQuery(&searcher, _T("field"), _T("ddddX"), FuzzyQuery::defaultMinSimilarity, 2); + CLUCENE_ASSERT( hits->length() == 1); + CuAssertStrEquals(tc, NULL, _T("ddddd"), hits->doc(0).get(_T("field"))); + _CLLDELETE(hits); + + hits = searchQuery(&searcher, _T("field"), _T("ddddX"), FuzzyQuery::defaultMinSimilarity, 3); + CLUCENE_ASSERT( hits->length() == 1); + CuAssertStrEquals(tc, NULL, _T("ddddd"), hits->doc(0).get(_T("field"))); + _CLLDELETE(hits); + + hits = searchQuery(&searcher, _T("field"), _T("ddddX"), FuzzyQuery::defaultMinSimilarity, 4); + CLUCENE_ASSERT( hits->length() == 1); + CuAssertStrEquals(tc, NULL, _T("ddddd"), hits->doc(0).get(_T("field"))); + _CLLDELETE(hits); + + hits = searchQuery(&searcher, _T("field"), _T("ddddX"), FuzzyQuery::defaultMinSimilarity, 5); + CLUCENE_ASSERT( hits->length() == 0); + _CLLDELETE(hits); + + // different field = no match: + hits = searchQuery(&searcher, _T("anotherfield"), _T("ddddX")); + CLUCENE_ASSERT( hits->length() == 0); + _CLLDELETE(hits); + + searcher.close(); + directory.close(); } + /* + void testFuzzinessLong() { + RAMDirectory directory; + IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true); + addDoc("aaaaaaa", writer); + addDoc("segment", writer); + writer.optimize(); + writer.close(); + IndexSearcher searcher = new IndexSearcher(directory); + FuzzyQuery query; + // not similar enough: + query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0); + Hits hits = searcher.search(query); + assertEquals(0, hits.length()); + // edit distance to "aaaaaaa" = 3, this matches because the string is longer than + // in testDefaultFuzziness so a bigger difference is allowed: + query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query); + assertEquals(1, hits.length()); + assertEquals(hits.doc(0).get("field"), ("aaaaaaa")); + + // now with prefix + query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.search(query); + assertEquals(1, hits.length()); + assertEquals(hits.doc(0).get("field"), ("aaaaaaa")); + query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4); + hits = searcher.search(query); + assertEquals(1, hits.length()); + assertEquals(hits.doc(0).get("field"), ("aaaaaaa")); + query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5); + hits = searcher.search(query); + assertEquals(0, hits.length()); + + // no match, more than half of the characters is wrong: + query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query); + assertEquals(0, hits.length()); + + // now with prefix + query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.search(query); + assertEquals(0, hits.length()); + + // "student" and "stellent" are indeed similar to "segment" by default: + query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query); + assertEquals(1, hits.length()); + query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query); + assertEquals(1, hits.length()); + + // now with prefix + query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.search(query); + assertEquals(1, hits.length()); + query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.search(query); + assertEquals(1, hits.length()); + query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.search(query); + assertEquals(0, hits.length()); + query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.search(query); + assertEquals(0, hits.length()); + + // "student" doesn't match anymore thanks to increased minimum similarity: + query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0); + hits = searcher.search(query); + assertEquals(0, hits.length()); + + try { + query = new FuzzyQuery(new Term("field", "student"), 1.1f); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expecting exception + } + try { + query = new FuzzyQuery(new Term("field", "student"), -0.1f); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expecting exception + } + + searcher.close(); + directory.close(); + } + */ +}; + +void testFuzzyQuery(CuTest *tc){ + + /// Run Java Lucene tests + TestFuzzyQuery tester(tc); + tester.testFuzziness(); + + /// Legacy CLucene tests + RAMDirectory ram; + + //--- + WhitespaceAnalyzer an; + IndexWriter* writer = _CLNEW IndexWriter(&ram, &an, true); + + //--- + Document *doc = 0; + //**** + doc = _CLNEW Document(); + doc->add(*_CLNEW Field(_T("body"),_T("test"),Field::STORE_NO | Field::INDEX_TOKENIZED)); + writer->addDocument(doc); + _CLDELETE(doc); + //**** + doc = _CLNEW Document(); + doc->add(*_CLNEW Field(_T("body"),_T("home"),Field::STORE_NO | Field::INDEX_TOKENIZED)); + writer->addDocument(doc); + _CLDELETE(doc); + //**** + doc = _CLNEW Document(); + doc->add(*_CLNEW Field(_T("body"), _T("pc linux"),Field::STORE_NO | Field::INDEX_TOKENIZED)); + writer->addDocument(doc); + _CLDELETE(doc); + //**** + doc = _CLNEW Document(); + doc->add(*_CLNEW Field(_T("body"), _T("tested"),Field::STORE_NO | Field::INDEX_TOKENIZED)); + writer->addDocument(doc); + _CLDELETE(doc); + //**** + doc = _CLNEW Document(); + doc->add(*_CLNEW Field(_T("body"), _T("source"),Field::STORE_NO | Field::INDEX_TOKENIZED)); + writer->addDocument(doc); + _CLDELETE(doc); + + //--- + writer->close(); + _CLDELETE(writer); + + //--- + IndexSearcher searcher (&ram); + + //--- + Term* term = _CLNEW Term(_T("body"), _T("test~")); + Query* query = _CLNEW FuzzyQuery(term); + Hits* result = searcher.search(query); + + CLUCENE_ASSERT(result && result->length() > 0); + + //--- + _CLDELETE(result); + _CLDELETE(query); + _CLDECDELETE(term); + searcher.close(); + ram.close(); +} +#else + void _NO_FUZZY_QUERY(CuTest *tc){ + CuNotImpl(tc,_T("Fuzzy")); + } +#endif + CuSuite *testqueries(void) { CuSuite *suite = CuSuiteNew(_T("CLucene Queries Test")); - SUITE_ADD_TEST(suite, testPrefixQuery); + SUITE_ADD_TEST(suite, testPrefixQuery); + #ifndef NO_FUZZY_QUERY + SUITE_ADD_TEST(suite, testFuzzyQuery); + #else + SUITE_ADD_TEST(suite, _NO_FUZZY_QUERY); + #endif - return suite; + + return suite; } -//EOF +//EOF Modified: branches/lucene2_3_2/src/test/search/TestWildcard.cpp =================================================================== --- branches/lucene2_3_2/src/test/search/TestWildcard.cpp 2009-07-08 09:55:52 UTC (rev 3014) +++ branches/lucene2_3_2/src/test/search/TestWildcard.cpp 2009-07-08 09:56:41 UTC (rev 3015) @@ -29,64 +29,9 @@ _CLDECDELETE(term); } - void testFuzzyQuery(CuTest *tc){ - RAMDirectory ram; - - //--- - WhitespaceAnalyzer an; - IndexWriter* writer = _CLNEW IndexWriter(&ram, &an, true); - //--- - Document *doc = 0; - //**** - doc = _CLNEW Document(); - doc->add(*_CLNEW Field(_T("body"),_T("test"),Field::STORE_NO | Field::INDEX_TOKENIZED)); - writer->addDocument(doc); - _CLDELETE(doc); - //**** - doc = _CLNEW Document(); - doc->add(*_CLNEW Field(_T("body"),_T("home"),Field::STORE_NO | Field::INDEX_TOKENIZED)); - writer->addDocument(doc); - _CLDELETE(doc); - //**** - doc = _CLNEW Document(); - doc->add(*_CLNEW Field(_T("body"), _T("pc linux"),Field::STORE_NO | Field::INDEX_TOKENIZED)); - writer->addDocument(doc); - _CLDELETE(doc); - //**** - doc = _CLNEW Document(); - doc->add(*_CLNEW Field(_T("body"), _T("tested"),Field::STORE_NO | Field::INDEX_TOKENIZED)); - writer->addDocument(doc); - _CLDELETE(doc); - //**** - doc = _CLNEW Document(); - doc->add(*_CLNEW Field(_T("body"), _T("source"),Field::STORE_NO | Field::INDEX_TOKENIZED)); - writer->addDocument(doc); - _CLDELETE(doc); - //--- - writer->close(); - _CLDELETE(writer); - //--- - IndexSearcher searcher (&ram); - - //--- - Term* term = _CLNEW Term(_T("body"), _T("test~")); - Query* query = _CLNEW FuzzyQuery(term); - Hits* result = searcher.search(query); - - CLUCENE_ASSERT(result && result->length() > 0); - - //--- - _CLDELETE(result); - _CLDELETE(query); - _CLDECDELETE(term); - searcher.close(); - ram.close(); - } - - void testAsterisk(CuTest *tc){ RAMDirectory indexStore; SimpleAnalyzer an; @@ -168,9 +113,6 @@ void _NO_WILDCARD_QUERY(CuTest *tc){ CuNotImpl(tc,_T("Wildcard")); } - void _NO_FUZZY_QUERY(CuTest *tc){ - CuNotImpl(tc,_T("Fuzzy")); - } #endif @@ -178,11 +120,6 @@ { CuSuite *suite = CuSuiteNew(_T("CLucene Wildcard Test")); - #ifndef NO_FUZZY_QUERY - SUITE_ADD_TEST(suite, testFuzzyQuery); - #else - SUITE_ADD_TEST(suite, _NO_FUZZY_QUERY); - #endif #ifndef NO_WILDCARD_QUERY SUITE_ADD_TEST(suite, testQuestionmark); SUITE_ADD_TEST(suite, testAsterisk); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |