From: <ke...@us...> - 2006-11-26 19:34:30
|
Revision: 3711 http://svn.sourceforge.net/mailmanager/?rev=3711&view=rev Author: kevca Date: 2006-11-26 11:34:29 -0800 (Sun, 26 Nov 2006) Log Message: ----------- More restructure Added Paths: ----------- MailManager/branches/RELENG_2_2/support/searching/__init__.py MailManager/branches/RELENG_2_2/support/searching/mysql.py MailManager/branches/RELENG_2_2/support/searching/postgres.py MailManager/branches/RELENG_2_2/support/searching/xapian.py MailManager/branches/RELENG_2_2/support/searching/zope.py Removed Paths: ------------- MailManager/branches/RELENG_2_2/support/searching/searching.py Added: MailManager/branches/RELENG_2_2/support/searching/__init__.py =================================================================== Copied: MailManager/branches/RELENG_2_2/support/searching/mysql.py (from rev 3710, MailManager/branches/RELENG_2_2/support/searching/searching.py) =================================================================== --- MailManager/branches/RELENG_2_2/support/searching/mysql.py (rev 0) +++ MailManager/branches/RELENG_2_2/support/searching/mysql.py 2006-11-26 19:34:29 UTC (rev 3711) @@ -0,0 +1,490 @@ +import os +import re + +from Products.MailManager.support.logger import log +from DateTime import DateTime +from email.Utils import formataddr, parseaddr, mktime_tz, parsedate_tz + +import xapian +import logging + +class ISearch: + """ + Interface for a MailManager search class. + + Any search technology that is to be utilised within MailManager + should implement all of the following methods. + """ + def clearIndex(self): + """ + Clear the search database completely. + """ + pass + + def rebuildIndex(self): + """ + Rebuild the search index i.e. if something gets corrupted it should + be possible to recreate the index from existing messages in the system + for searching. + """ + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + """ + Add a message to the search index. + """ + pass + + def deleteMessageFromIndex(self, message_id): + """ + Add a message to the search index. + """ + pass + + def searchIndex(self, searchParams): + """ + Search the index + """ + pass + +class PostgresSearch(ISearch): + + def __init__(self, mmobj): + self.mmobj = mmobj + + def clearIndex(self): + pass + + def rebuildIndex(self): + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + pass + + def searchIndex(self, searchParams): + searchText = searchParams.get('searchText') + if searchText: + regex = re.compile('([() ])AND([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>&\g<2>', searchText) + regex = re.compile('([() ])OR([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>|\g<2>', searchText) + regex = re.compile('([() ]|^)NOT([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>!\g<2>', searchText) + searchi['searchText'] = searchText + + try: + self.sql.testQuery(sqv_query=searchText) + except: + REQUEST.set('error', 'The supplied query is not correct. ' + 'Words must be separated by AND or OR. Check the help ' + 'page for more information on what queries are valid') + REQUEST.set('flag_searchText', 1) + return self.Search(self, REQUEST) + + searchParams['sort_on'] = 'rank' + + if searchParams.get('from_name'): + from_name = '%' + searchParams.get('from_name') + '%' + else: + from_name = '' + +class MySQLSearch(ISearch): + + def __init__(self, mmobj): + self.mmobj = mmobj + + def clearIndex(self): + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + self.sql.addToMySQLSearchIndex(message_id=message_id, + ticket_id=ticket_id, + body=body, + html_body=body) + + def searchIndex(self, searchParams): + if searchParams.get('from_name'): + from_name = '%' + searchParams.get('from_name') + '%' + else: + from_name = '' + + +class XapianSearch: + """ + A search class that uses Xapian - the open source search engine for very + fast retrievel of documents. + + Each message that goes into MailManager is indexed just before it hits the + database into a special index used by Xapian for + """ + + def __init__(self, databasePath, mmobj): + """ + Creates a new xapian search object + + @param databasePath: filesystem path to store Xapian index + @type databasePath: str + + @param mmobj: a Mailmanager instance + @type mmobj: MailManager + """ + + self.mmobj = mmobj + self.databasePath = databasePath + + def clearIndex(self): + """ + Clears the Xapian database simply by removing the directory on disk. + """ + try: + for root, dirs, files in os.walk(self.databasePath, topdown=False): + for name in files: + print os.path.join(root, name) + os.remove(os.path.join(root, name)) + for name in dirs: + print os.path.join(root, name) + os.rmdir(os.path.join(root, name)) + except Exception, e: + log('%sunable to clear database at %s' \ + % (self.mmobj.getLogName(), self.databasePath),\ + logging.INFO, 'search.indexing') + + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + """ + Stores the given message into the xapian index + + parameters all sanitised + strings supplied in unicode + + @param message_id:the unique message id in the relational database that the message is to be allocated. + @type message_id: int + + @param ticket_id: unique ticket id in the relational database of the ticket the message is being added to. + @type ticket_id: int + + @param ticket: ticket that the message belongs to + @type ticket: ticket + + @param body: the plain text body of the messsage + @type body: str + + @param html_body: the html body of the message + @type html_body: str + + @param from_name: sender of the message is from + @type from_name: str + + @param msg_date: Date of message obtained from database + @type msg_date: DateTime + """ + + log('%s updating index for ticket %s message %s'\ + % (self.mmobj.getLogName(), str(ticket_id), str(message_id)),\ + logging.INFO, 'search.indexing') + + try: + database = xapian.WritableDatabase(self.databasePath,\ + xapian.DB_CREATE_OR_OPEN) + except Exception, e: + log('%s failed to get lock on database for writing'\ + % (self.mmobj.getLogName()),\ + logging.DEBUG, 'search.indexing') + + fname, femail = self.mmobj.getEmailAddrPair(msg, 'from') + subject = self.mmobj.getHeader(msg, 'subject') + + doc = xapian.Document() + + doc.add_value(1, str(ticket_id)) + doc.add_value(2, str(message_id)) + + doc.add_term('XACID' + ticket.account_id.lower().encode('utf-8')) + log('%s==> indexing account id %s' \ + % (self.mmobj.getLogName(), ticket.account_id.lower()),\ + logging.INFO, 'search.indexing') + + doc.add_term('XASN' + ticket.assigned.lower().encode('utf-8')) + log('%s==> indexing assigned %s'\ + % (self.mmobj.getLogName(), ticket.assigned.lower().encode('utf-8')),\ + logging.INFO, 'search.indexing') + + doc.add_term('XST' + ticket.state.lower().encode('utf-8')) + log('%s==> indexing state %s'\ + % (self.mmobj.getLogName(), ticket.state.lower().encode('utf-8')),\ + logging.INFO, 'search.indexing') + + doc.add_term( 'XFE' + femail.lower().encode('utf-8') ) + log('%s==> indexing from email %s'\ + % (self.mmobj.getLogName(), femail.lower()),\ + logging.INFO, 'search.indexing') + + doc.add_term( 'XPRI' + str(ticket.priority)) + log('%s==> indexing priority %s'\ + % (self.mmobj.getLogName(), str(ticket.priority)),\ + logging.INFO, 'search.indexing') + + i = 1 + for subcat in ticket.category0.split(): + doc.add_posting('XCAT0' + subcat.lower().encode('utf-8'), i) + log('%s==> indexing category 0 %s'\ + % (self.mmobj.getLogName(), subcat.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for subcat in ticket.category1.split(): + doc.add_posting('XCAT1' + subcat.lower().encode('utf-8'), i) + log('%s==> indexing category 1 %s'\ + % (self.mmobj.getLogName(), subcat.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for subcat in ticket.category2.split(): + doc.add_posting('XCAT2' + subcat.lower().encode('utf-8'), i) + log('%s==> indexing category 2 %s'\ + % (self.mmobj.getLogName(), subcat.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for subname in fname.split(): + doc.add_posting('XFN' + subname.lower().encode('utf-8'), i) + log('%s==> indexing from name %s'\ + % (self.mmobj.getLogName(), subname.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for term in subject.split(): + doc.add_posting('XSB' + term.lower().encode('utf-8'), i) + log('%s==> indexing subject %s'\ + % (self.mmobj.getLogName(), term.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for term in body.split(): + doc.add_posting('XBY' + term.lower().encode('utf-8'), i) + log('%s==> indexing body %s'\ + % (self.mmobj.getLogName(), term.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + day = str(msg_date.day()) + month = str(msg_date.month()) + year = str(msg_date.year()) + + doc.add_term('XFD' + year + month + day) + log('%s==> indexing from date %s'\ + % (self.mmobj.getLogName(), year + month + day),\ + logging.INFO, 'search.indexing') + + doc.add_term('XDYDM' + year + month) + log('%s==> indexing date year and month %s'\ + % (self.mmobj.getLogName(), year + month),\ + logging.INFO, 'search.indexing') + + doc.add_term('XDY' + year) + log('%s==> indexing date year %s'\ + % (self.mmobj.getLogName(), year),\ + logging.INFO, 'search.indexing') + + try: + database.add_document(doc) + except Exception,e: + log('%s unable to add document to database'\ + % (self.mmobj.getLogName()),\ + logging.DEBUG, 'searching.indexing') + + def searchIndex(self, searchParams): + """ + Search for matching tickets in the xapian index + + @param searchParams: values to search for on given field + @type searchParams: dictionary with str keys and values + """ + + # hack for testing + self.rebuildIndex() + + try: + sb = xapian.Database(self.databasePath) + except Exception, e: + log('%s unable to access to xapian database'\ + % (self.mmobj.getLogName()),\ + logging.DEBUG, 'searching.indexing') + + + e = xapian.Enquire(sb) + + # Add the query parameters, one at a time. Multiple options for + # one type are or'd together, others are anded. + + qp = xapian.QueryParser() + + qp.set_database(sb) + qp.set_stemming_strategy(xapian.QueryParser.STEM_NONE) + qp.set_default_op(xapian.Query.OP_OR) + + qp.add_boolean_prefix('account', 'XACID') + qp.add_boolean_prefix('assigned' , 'XASN') + qp.add_boolean_prefix('state' , 'XST') + qp.add_boolean_prefix('fromemail', 'XFE') + qp.add_boolean_prefix('priority' , 'XPRI') + + qp.add_prefix('category0', 'XCAT0') + qp.add_prefix('fromname', 'XFN') + qp.add_prefix('subject' , 'XSB') + qp.add_prefix('body' , 'XBY') + + query = "" + + keymap = { + 'account_id' : 'account', + 'assigned' : 'assigned', + 'state' : 'state', + 'priority' : 'priority', + 'category0' : 'category0' + } + + for key in searchParams: + + value = searchParams[key] + + if key in ('account_id', 'assigned', 'state', 'priority','category0') and value: + items = ['%s:%s ' % (keymap[key], subterm.lower().encode('utf-8')) for subterm in value] + query += '(' + ' OR '.join(items) + ') ' + + if key == 'from_email_lower' and value: + query += 'fromemail:'\ + + value.lower().encode('utf-8')\ + + " " + + # if key == 'from_date' and value: + # value = DateTime(value) + # day = str(value.day()) + # month = str(value.month()) + # year = str(value.year()) + + # query += ' fromdate:' + year + month + day + + # if key == 'to_date' and value: + # value = DateTime(value) + # day = str(value.day()) + # month = str(value.month()) + # year = str(value.year()) + + # query += ' todate:' + year + month + day + + if key == 'from_name' and value: + for subname in value.split(): + query += 'fromname:'\ + + subname.lower().encode('utf8')\ + + " " + + if key == 'subject' and value: + for subterm in value.split(): + query += 'subject:'\ + + subterm.lower().encode('utf8')\ + + " " + + if key == 'searchText' and value: + for subterm in value.split(): + query += 'body:'\ + + subterm.lower().encode('utf8')\ + + " " + + log("%s raw query is %s"\ + % (self.mmobj.getLogName(), query),\ + logging.INFO, 'searching.searching') + + query = query.strip() + + xapianquery = qp.parse_query(query.encode('utf-8'),\ + # Support quoted phrases. + xapian.QueryParser.FLAG_PHRASE |\ + # Support AND, OR, etc and + # bracketted subexpressions. + xapian.QueryParser.FLAG_BOOLEAN |\ + # Support right truncation (e.g. Xap*). + xapian.QueryParser.FLAG_WILDCARD |\ + # Support AND, OR, etc even if + # they aren't in ALLCAPS. + xapian.QueryParser.FLAG_BOOLEAN_ANY_CASE |\ + # Support + and -. + xapian.QueryParser.FLAG_LOVEHATE) + + + + log("%s performing query '%s'"\ + % (self.mmobj.getLogName(), xapianquery.get_description()),\ + logging.INFO, 'searching.searching') + + e.set_query(xapianquery) + matches = e.get_mset(0,100) + + log("%s number of matches '%s'"\ + % (self.mmobj.getLogName(), str(matches.get_matches_estimated())),\ + logging.INFO, 'searching.searching') + + results = [] + tickets = [] + for match in matches: + message_id = match[xapian.MSET_DOCUMENT].get_value(2) + ticket_id = match[xapian.MSET_DOCUMENT].get_value(1) + score = match[xapian.MSET_PERCENT] + + log("%s message: %s ticket: %s score: %f"\ + % (self.mmobj.getLogName(), message_id, ticket_id, score),\ + logging.INFO, 'searching.searching') + + if not ticket_id in tickets: + tickets.append(int(ticket_id)) + results.append((int(ticket_id), score)) + + return results + + sb = None + + + def rebuildIndex(self): + """ + Makes sure the index is clear and then rebuilds the Xapian index + from current messages in the relational database. + """ + log("%s Rebuilding search index %s"\ + % (self.mmobj.getLogName(), self.databasePath),\ + logging.INFO, 'searching.searching') + + print "k" + + messages = self.mmobj.sql.listMessages() + + self.clearIndex() + + print "And..." + + messages = self.mmobj.sql.listMessages() + + print "ik" + + for message in messages: + message_id = message.id + ticket_id = message.ticket_id + ticket = self.ticket(id=ticket_id)[0] + fname = message.from_name + + log("%s adding message"\ + % (self.mmobj.getLogName()), + logging.INFO, 'searching.searching') + + self.addMessageToIndex(message_id, + ticket_id, + ticket, + msg=message, + body=message.body, + html_body=message.html_body, + from_name=fname, + msg_date=message.msg_date) Copied: MailManager/branches/RELENG_2_2/support/searching/postgres.py (from rev 3710, MailManager/branches/RELENG_2_2/support/searching/searching.py) =================================================================== --- MailManager/branches/RELENG_2_2/support/searching/postgres.py (rev 0) +++ MailManager/branches/RELENG_2_2/support/searching/postgres.py 2006-11-26 19:34:29 UTC (rev 3711) @@ -0,0 +1,490 @@ +import os +import re + +from Products.MailManager.support.logger import log +from DateTime import DateTime +from email.Utils import formataddr, parseaddr, mktime_tz, parsedate_tz + +import xapian +import logging + +class ISearch: + """ + Interface for a MailManager search class. + + Any search technology that is to be utilised within MailManager + should implement all of the following methods. + """ + def clearIndex(self): + """ + Clear the search database completely. + """ + pass + + def rebuildIndex(self): + """ + Rebuild the search index i.e. if something gets corrupted it should + be possible to recreate the index from existing messages in the system + for searching. + """ + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + """ + Add a message to the search index. + """ + pass + + def deleteMessageFromIndex(self, message_id): + """ + Add a message to the search index. + """ + pass + + def searchIndex(self, searchParams): + """ + Search the index + """ + pass + +class PostgresSearch(ISearch): + + def __init__(self, mmobj): + self.mmobj = mmobj + + def clearIndex(self): + pass + + def rebuildIndex(self): + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + pass + + def searchIndex(self, searchParams): + searchText = searchParams.get('searchText') + if searchText: + regex = re.compile('([() ])AND([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>&\g<2>', searchText) + regex = re.compile('([() ])OR([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>|\g<2>', searchText) + regex = re.compile('([() ]|^)NOT([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>!\g<2>', searchText) + searchi['searchText'] = searchText + + try: + self.sql.testQuery(sqv_query=searchText) + except: + REQUEST.set('error', 'The supplied query is not correct. ' + 'Words must be separated by AND or OR. Check the help ' + 'page for more information on what queries are valid') + REQUEST.set('flag_searchText', 1) + return self.Search(self, REQUEST) + + searchParams['sort_on'] = 'rank' + + if searchParams.get('from_name'): + from_name = '%' + searchParams.get('from_name') + '%' + else: + from_name = '' + +class MySQLSearch(ISearch): + + def __init__(self, mmobj): + self.mmobj = mmobj + + def clearIndex(self): + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + self.sql.addToMySQLSearchIndex(message_id=message_id, + ticket_id=ticket_id, + body=body, + html_body=body) + + def searchIndex(self, searchParams): + if searchParams.get('from_name'): + from_name = '%' + searchParams.get('from_name') + '%' + else: + from_name = '' + + +class XapianSearch: + """ + A search class that uses Xapian - the open source search engine for very + fast retrievel of documents. + + Each message that goes into MailManager is indexed just before it hits the + database into a special index used by Xapian for + """ + + def __init__(self, databasePath, mmobj): + """ + Creates a new xapian search object + + @param databasePath: filesystem path to store Xapian index + @type databasePath: str + + @param mmobj: a Mailmanager instance + @type mmobj: MailManager + """ + + self.mmobj = mmobj + self.databasePath = databasePath + + def clearIndex(self): + """ + Clears the Xapian database simply by removing the directory on disk. + """ + try: + for root, dirs, files in os.walk(self.databasePath, topdown=False): + for name in files: + print os.path.join(root, name) + os.remove(os.path.join(root, name)) + for name in dirs: + print os.path.join(root, name) + os.rmdir(os.path.join(root, name)) + except Exception, e: + log('%sunable to clear database at %s' \ + % (self.mmobj.getLogName(), self.databasePath),\ + logging.INFO, 'search.indexing') + + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + """ + Stores the given message into the xapian index + + parameters all sanitised + strings supplied in unicode + + @param message_id:the unique message id in the relational database that the message is to be allocated. + @type message_id: int + + @param ticket_id: unique ticket id in the relational database of the ticket the message is being added to. + @type ticket_id: int + + @param ticket: ticket that the message belongs to + @type ticket: ticket + + @param body: the plain text body of the messsage + @type body: str + + @param html_body: the html body of the message + @type html_body: str + + @param from_name: sender of the message is from + @type from_name: str + + @param msg_date: Date of message obtained from database + @type msg_date: DateTime + """ + + log('%s updating index for ticket %s message %s'\ + % (self.mmobj.getLogName(), str(ticket_id), str(message_id)),\ + logging.INFO, 'search.indexing') + + try: + database = xapian.WritableDatabase(self.databasePath,\ + xapian.DB_CREATE_OR_OPEN) + except Exception, e: + log('%s failed to get lock on database for writing'\ + % (self.mmobj.getLogName()),\ + logging.DEBUG, 'search.indexing') + + fname, femail = self.mmobj.getEmailAddrPair(msg, 'from') + subject = self.mmobj.getHeader(msg, 'subject') + + doc = xapian.Document() + + doc.add_value(1, str(ticket_id)) + doc.add_value(2, str(message_id)) + + doc.add_term('XACID' + ticket.account_id.lower().encode('utf-8')) + log('%s==> indexing account id %s' \ + % (self.mmobj.getLogName(), ticket.account_id.lower()),\ + logging.INFO, 'search.indexing') + + doc.add_term('XASN' + ticket.assigned.lower().encode('utf-8')) + log('%s==> indexing assigned %s'\ + % (self.mmobj.getLogName(), ticket.assigned.lower().encode('utf-8')),\ + logging.INFO, 'search.indexing') + + doc.add_term('XST' + ticket.state.lower().encode('utf-8')) + log('%s==> indexing state %s'\ + % (self.mmobj.getLogName(), ticket.state.lower().encode('utf-8')),\ + logging.INFO, 'search.indexing') + + doc.add_term( 'XFE' + femail.lower().encode('utf-8') ) + log('%s==> indexing from email %s'\ + % (self.mmobj.getLogName(), femail.lower()),\ + logging.INFO, 'search.indexing') + + doc.add_term( 'XPRI' + str(ticket.priority)) + log('%s==> indexing priority %s'\ + % (self.mmobj.getLogName(), str(ticket.priority)),\ + logging.INFO, 'search.indexing') + + i = 1 + for subcat in ticket.category0.split(): + doc.add_posting('XCAT0' + subcat.lower().encode('utf-8'), i) + log('%s==> indexing category 0 %s'\ + % (self.mmobj.getLogName(), subcat.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for subcat in ticket.category1.split(): + doc.add_posting('XCAT1' + subcat.lower().encode('utf-8'), i) + log('%s==> indexing category 1 %s'\ + % (self.mmobj.getLogName(), subcat.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for subcat in ticket.category2.split(): + doc.add_posting('XCAT2' + subcat.lower().encode('utf-8'), i) + log('%s==> indexing category 2 %s'\ + % (self.mmobj.getLogName(), subcat.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for subname in fname.split(): + doc.add_posting('XFN' + subname.lower().encode('utf-8'), i) + log('%s==> indexing from name %s'\ + % (self.mmobj.getLogName(), subname.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for term in subject.split(): + doc.add_posting('XSB' + term.lower().encode('utf-8'), i) + log('%s==> indexing subject %s'\ + % (self.mmobj.getLogName(), term.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + i = 1 + for term in body.split(): + doc.add_posting('XBY' + term.lower().encode('utf-8'), i) + log('%s==> indexing body %s'\ + % (self.mmobj.getLogName(), term.lower()),\ + logging.INFO, 'search.indexing') + i += 1 + + day = str(msg_date.day()) + month = str(msg_date.month()) + year = str(msg_date.year()) + + doc.add_term('XFD' + year + month + day) + log('%s==> indexing from date %s'\ + % (self.mmobj.getLogName(), year + month + day),\ + logging.INFO, 'search.indexing') + + doc.add_term('XDYDM' + year + month) + log('%s==> indexing date year and month %s'\ + % (self.mmobj.getLogName(), year + month),\ + logging.INFO, 'search.indexing') + + doc.add_term('XDY' + year) + log('%s==> indexing date year %s'\ + % (self.mmobj.getLogName(), year),\ + logging.INFO, 'search.indexing') + + try: + database.add_document(doc) + except Exception,e: + log('%s unable to add document to database'\ + % (self.mmobj.getLogName()),\ + logging.DEBUG, 'searching.indexing') + + def searchIndex(self, searchParams): + """ + Search for matching tickets in the xapian index + + @param searchParams: values to search for on given field + @type searchParams: dictionary with str keys and values + """ + + # hack for testing + self.rebuildIndex() + + try: + sb = xapian.Database(self.databasePath) + except Exception, e: + log('%s unable to access to xapian database'\ + % (self.mmobj.getLogName()),\ + logging.DEBUG, 'searching.indexing') + + + e = xapian.Enquire(sb) + + # Add the query parameters, one at a time. Multiple options for + # one type are or'd together, others are anded. + + qp = xapian.QueryParser() + + qp.set_database(sb) + qp.set_stemming_strategy(xapian.QueryParser.STEM_NONE) + qp.set_default_op(xapian.Query.OP_OR) + + qp.add_boolean_prefix('account', 'XACID') + qp.add_boolean_prefix('assigned' , 'XASN') + qp.add_boolean_prefix('state' , 'XST') + qp.add_boolean_prefix('fromemail', 'XFE') + qp.add_boolean_prefix('priority' , 'XPRI') + + qp.add_prefix('category0', 'XCAT0') + qp.add_prefix('fromname', 'XFN') + qp.add_prefix('subject' , 'XSB') + qp.add_prefix('body' , 'XBY') + + query = "" + + keymap = { + 'account_id' : 'account', + 'assigned' : 'assigned', + 'state' : 'state', + 'priority' : 'priority', + 'category0' : 'category0' + } + + for key in searchParams: + + value = searchParams[key] + + if key in ('account_id', 'assigned', 'state', 'priority','category0') and value: + items = ['%s:%s ' % (keymap[key], subterm.lower().encode('utf-8')) for subterm in value] + query += '(' + ' OR '.join(items) + ') ' + + if key == 'from_email_lower' and value: + query += 'fromemail:'\ + + value.lower().encode('utf-8')\ + + " " + + # if key == 'from_date' and value: + # value = DateTime(value) + # day = str(value.day()) + # month = str(value.month()) + # year = str(value.year()) + + # query += ' fromdate:' + year + month + day + + # if key == 'to_date' and value: + # value = DateTime(value) + # day = str(value.day()) + # month = str(value.month()) + # year = str(value.year()) + + # query += ' todate:' + year + month + day + + if key == 'from_name' and value: + for subname in value.split(): + query += 'fromname:'\ + + subname.lower().encode('utf8')\ + + " " + + if key == 'subject' and value: + for subterm in value.split(): + query += 'subject:'\ + + subterm.lower().encode('utf8')\ + + " " + + if key == 'searchText' and value: + for subterm in value.split(): + query += 'body:'\ + + subterm.lower().encode('utf8')\ + + " " + + log("%s raw query is %s"\ + % (self.mmobj.getLogName(), query),\ + logging.INFO, 'searching.searching') + + query = query.strip() + + xapianquery = qp.parse_query(query.encode('utf-8'),\ + # Support quoted phrases. + xapian.QueryParser.FLAG_PHRASE |\ + # Support AND, OR, etc and + # bracketted subexpressions. + xapian.QueryParser.FLAG_BOOLEAN |\ + # Support right truncation (e.g. Xap*). + xapian.QueryParser.FLAG_WILDCARD |\ + # Support AND, OR, etc even if + # they aren't in ALLCAPS. + xapian.QueryParser.FLAG_BOOLEAN_ANY_CASE |\ + # Support + and -. + xapian.QueryParser.FLAG_LOVEHATE) + + + + log("%s performing query '%s'"\ + % (self.mmobj.getLogName(), xapianquery.get_description()),\ + logging.INFO, 'searching.searching') + + e.set_query(xapianquery) + matches = e.get_mset(0,100) + + log("%s number of matches '%s'"\ + % (self.mmobj.getLogName(), str(matches.get_matches_estimated())),\ + logging.INFO, 'searching.searching') + + results = [] + tickets = [] + for match in matches: + message_id = match[xapian.MSET_DOCUMENT].get_value(2) + ticket_id = match[xapian.MSET_DOCUMENT].get_value(1) + score = match[xapian.MSET_PERCENT] + + log("%s message: %s ticket: %s score: %f"\ + % (self.mmobj.getLogName(), message_id, ticket_id, score),\ + logging.INFO, 'searching.searching') + + if not ticket_id in tickets: + tickets.append(int(ticket_id)) + results.append((int(ticket_id), score)) + + return results + + sb = None + + + def rebuildIndex(self): + """ + Makes sure the index is clear and then rebuilds the Xapian index + from current messages in the relational database. + """ + log("%s Rebuilding search index %s"\ + % (self.mmobj.getLogName(), self.databasePath),\ + logging.INFO, 'searching.searching') + + print "k" + + messages = self.mmobj.sql.listMessages() + + self.clearIndex() + + print "And..." + + messages = self.mmobj.sql.listMessages() + + print "ik" + + for message in messages: + message_id = message.id + ticket_id = message.ticket_id + ticket = self.ticket(id=ticket_id)[0] + fname = message.from_name + + log("%s adding message"\ + % (self.mmobj.getLogName()), + logging.INFO, 'searching.searching') + + self.addMessageToIndex(message_id, + ticket_id, + ticket, + msg=message, + body=message.body, + html_body=message.html_body, + from_name=fname, + msg_date=message.msg_date) Deleted: MailManager/branches/RELENG_2_2/support/searching/searching.py =================================================================== --- MailManager/branches/RELENG_2_2/support/searching/searching.py 2006-11-26 19:33:21 UTC (rev 3710) +++ MailManager/branches/RELENG_2_2/support/searching/searching.py 2006-11-26 19:34:29 UTC (rev 3711) @@ -1,490 +0,0 @@ -import os -import re - -from Products.MailManager.support.logger import log -from DateTime import DateTime -from email.Utils import formataddr, parseaddr, mktime_tz, parsedate_tz - -import xapian -import logging - -class ISearch: - """ - Interface for a MailManager search class. - - Any search technology that is to be utilised within MailManager - should implement all of the following methods. - """ - def clearIndex(self): - """ - Clear the search database completely. - """ - pass - - def rebuildIndex(self): - """ - Rebuild the search index i.e. if something gets corrupted it should - be possible to recreate the index from existing messages in the system - for searching. - """ - pass - - def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): - """ - Add a message to the search index. - """ - pass - - def deleteMessageFromIndex(self, message_id): - """ - Add a message to the search index. - """ - pass - - def searchIndex(self, searchParams): - """ - Search the index - """ - pass - -class PostgresSearch(ISearch): - - def __init__(self, mmobj): - self.mmobj = mmobj - - def clearIndex(self): - pass - - def rebuildIndex(self): - pass - - def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): - pass - - def searchIndex(self, searchParams): - searchText = searchParams.get('searchText') - if searchText: - regex = re.compile('([() ])AND([() ])', re.IGNORECASE) - searchText = regex.sub('\g<1>&\g<2>', searchText) - regex = re.compile('([() ])OR([() ])', re.IGNORECASE) - searchText = regex.sub('\g<1>|\g<2>', searchText) - regex = re.compile('([() ]|^)NOT([() ])', re.IGNORECASE) - searchText = regex.sub('\g<1>!\g<2>', searchText) - searchi['searchText'] = searchText - - try: - self.sql.testQuery(sqv_query=searchText) - except: - REQUEST.set('error', 'The supplied query is not correct. ' - 'Words must be separated by AND or OR. Check the help ' - 'page for more information on what queries are valid') - REQUEST.set('flag_searchText', 1) - return self.Search(self, REQUEST) - - searchParams['sort_on'] = 'rank' - - if searchParams.get('from_name'): - from_name = '%' + searchParams.get('from_name') + '%' - else: - from_name = '' - -class MySQLSearch(ISearch): - - def __init__(self, mmobj): - self.mmobj = mmobj - - def clearIndex(self): - pass - - def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): - self.sql.addToMySQLSearchIndex(message_id=message_id, - ticket_id=ticket_id, - body=body, - html_body=body) - - def searchIndex(self, searchParams): - if searchParams.get('from_name'): - from_name = '%' + searchParams.get('from_name') + '%' - else: - from_name = '' - - -class XapianSearch: - """ - A search class that uses Xapian - the open source search engine for very - fast retrievel of documents. - - Each message that goes into MailManager is indexed just before it hits the - database into a special index used by Xapian for - """ - - def __init__(self, databasePath, mmobj): - """ - Creates a new xapian search object - - @param databasePath: filesystem path to store Xapian index - @type databasePath: str - - @param mmobj: a Mailmanager instance - @type mmobj: MailManager - """ - - self.mmobj = mmobj - self.databasePath = databasePath - - def clearIndex(self): - """ - Clears the Xapian database simply by removing the directory on disk. - """ - try: - for root, dirs, files in os.walk(self.databasePath, topdown=False): - for name in files: - print os.path.join(root, name) - os.remove(os.path.join(root, name)) - for name in dirs: - print os.path.join(root, name) - os.rmdir(os.path.join(root, name)) - except Exception, e: - log('%sunable to clear database at %s' \ - % (self.mmobj.getLogName(), self.databasePath),\ - logging.INFO, 'search.indexing') - - - def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): - """ - Stores the given message into the xapian index - - parameters all sanitised - strings supplied in unicode - - @param message_id:the unique message id in the relational database that the message is to be allocated. - @type message_id: int - - @param ticket_id: unique ticket id in the relational database of the ticket the message is being added to. - @type ticket_id: int - - @param ticket: ticket that the message belongs to - @type ticket: ticket - - @param body: the plain text body of the messsage - @type body: str - - @param html_body: the html body of the message - @type html_body: str - - @param from_name: sender of the message is from - @type from_name: str - - @param msg_date: Date of message obtained from database - @type msg_date: DateTime - """ - - log('%s updating index for ticket %s message %s'\ - % (self.mmobj.getLogName(), str(ticket_id), str(message_id)),\ - logging.INFO, 'search.indexing') - - try: - database = xapian.WritableDatabase(self.databasePath,\ - xapian.DB_CREATE_OR_OPEN) - except Exception, e: - log('%s failed to get lock on database for writing'\ - % (self.mmobj.getLogName()),\ - logging.DEBUG, 'search.indexing') - - fname, femail = self.mmobj.getEmailAddrPair(msg, 'from') - subject = self.mmobj.getHeader(msg, 'subject') - - doc = xapian.Document() - - doc.add_value(1, str(ticket_id)) - doc.add_value(2, str(message_id)) - - doc.add_term('XACID' + ticket.account_id.lower().encode('utf-8')) - log('%s==> indexing account id %s' \ - % (self.mmobj.getLogName(), ticket.account_id.lower()),\ - logging.INFO, 'search.indexing') - - doc.add_term('XASN' + ticket.assigned.lower().encode('utf-8')) - log('%s==> indexing assigned %s'\ - % (self.mmobj.getLogName(), ticket.assigned.lower().encode('utf-8')),\ - logging.INFO, 'search.indexing') - - doc.add_term('XST' + ticket.state.lower().encode('utf-8')) - log('%s==> indexing state %s'\ - % (self.mmobj.getLogName(), ticket.state.lower().encode('utf-8')),\ - logging.INFO, 'search.indexing') - - doc.add_term( 'XFE' + femail.lower().encode('utf-8') ) - log('%s==> indexing from email %s'\ - % (self.mmobj.getLogName(), femail.lower()),\ - logging.INFO, 'search.indexing') - - doc.add_term( 'XPRI' + str(ticket.priority)) - log('%s==> indexing priority %s'\ - % (self.mmobj.getLogName(), str(ticket.priority)),\ - logging.INFO, 'search.indexing') - - i = 1 - for subcat in ticket.category0.split(): - doc.add_posting('XCAT0' + subcat.lower().encode('utf-8'), i) - log('%s==> indexing category 0 %s'\ - % (self.mmobj.getLogName(), subcat.lower()),\ - logging.INFO, 'search.indexing') - i += 1 - - i = 1 - for subcat in ticket.category1.split(): - doc.add_posting('XCAT1' + subcat.lower().encode('utf-8'), i) - log('%s==> indexing category 1 %s'\ - % (self.mmobj.getLogName(), subcat.lower()),\ - logging.INFO, 'search.indexing') - i += 1 - - i = 1 - for subcat in ticket.category2.split(): - doc.add_posting('XCAT2' + subcat.lower().encode('utf-8'), i) - log('%s==> indexing category 2 %s'\ - % (self.mmobj.getLogName(), subcat.lower()),\ - logging.INFO, 'search.indexing') - i += 1 - - i = 1 - for subname in fname.split(): - doc.add_posting('XFN' + subname.lower().encode('utf-8'), i) - log('%s==> indexing from name %s'\ - % (self.mmobj.getLogName(), subname.lower()),\ - logging.INFO, 'search.indexing') - i += 1 - - i = 1 - for term in subject.split(): - doc.add_posting('XSB' + term.lower().encode('utf-8'), i) - log('%s==> indexing subject %s'\ - % (self.mmobj.getLogName(), term.lower()),\ - logging.INFO, 'search.indexing') - i += 1 - - i = 1 - for term in body.split(): - doc.add_posting('XBY' + term.lower().encode('utf-8'), i) - log('%s==> indexing body %s'\ - % (self.mmobj.getLogName(), term.lower()),\ - logging.INFO, 'search.indexing') - i += 1 - - day = str(msg_date.day()) - month = str(msg_date.month()) - year = str(msg_date.year()) - - doc.add_term('XFD' + year + month + day) - log('%s==> indexing from date %s'\ - % (self.mmobj.getLogName(), year + month + day),\ - logging.INFO, 'search.indexing') - - doc.add_term('XDYDM' + year + month) - log('%s==> indexing date year and month %s'\ - % (self.mmobj.getLogName(), year + month),\ - logging.INFO, 'search.indexing') - - doc.add_term('XDY' + year) - log('%s==> indexing date year %s'\ - % (self.mmobj.getLogName(), year),\ - logging.INFO, 'search.indexing') - - try: - database.add_document(doc) - except Exception,e: - log('%s unable to add document to database'\ - % (self.mmobj.getLogName()),\ - logging.DEBUG, 'searching.indexing') - - def searchIndex(self, searchParams): - """ - Search for matching tickets in the xapian index - - @param searchParams: values to search for on given field - @type searchParams: dictionary with str keys and values - """ - - # hack for testing - self.rebuildIndex() - - try: - sb = xapian.Database(self.databasePath) - except Exception, e: - log('%s unable to access to xapian database'\ - % (self.mmobj.getLogName()),\ - logging.DEBUG, 'searching.indexing') - - - e = xapian.Enquire(sb) - - # Add the query parameters, one at a time. Multiple options for - # one type are or'd together, others are anded. - - qp = xapian.QueryParser() - - qp.set_database(sb) - qp.set_stemming_strategy(xapian.QueryParser.STEM_NONE) - qp.set_default_op(xapian.Query.OP_OR) - - qp.add_boolean_prefix('account', 'XACID') - qp.add_boolean_prefix('assigned' , 'XASN') - qp.add_boolean_prefix('state' , 'XST') - qp.add_boolean_prefix('fromemail', 'XFE') - qp.add_boolean_prefix('priority' , 'XPRI') - - qp.add_prefix('category0', 'XCAT0') - qp.add_prefix('fromname', 'XFN') - qp.add_prefix('subject' , 'XSB') - qp.add_prefix('body' , 'XBY') - - query = "" - - keymap = { - 'account_id' : 'account', - 'assigned' : 'assigned', - 'state' : 'state', - 'priority' : 'priority', - 'category0' : 'category0' - } - - for key in searchParams: - - value = searchParams[key] - - if key in ('account_id', 'assigned', 'state', 'priority','category0') and value: - items = ['%s:%s ' % (keymap[key], subterm.lower().encode('utf-8')) for subterm in value] - query += '(' + ' OR '.join(items) + ') ' - - if key == 'from_email_lower' and value: - query += 'fromemail:'\ - + value.lower().encode('utf-8')\ - + " " - - # if key == 'from_date' and value: - # value = DateTime(value) - # day = str(value.day()) - # month = str(value.month()) - # year = str(value.year()) - - # query += ' fromdate:' + year + month + day - - # if key == 'to_date' and value: - # value = DateTime(value) - # day = str(value.day()) - # month = str(value.month()) - # year = str(value.year()) - - # query += ' todate:' + year + month + day - - if key == 'from_name' and value: - for subname in value.split(): - query += 'fromname:'\ - + subname.lower().encode('utf8')\ - + " " - - if key == 'subject' and value: - for subterm in value.split(): - query += 'subject:'\ - + subterm.lower().encode('utf8')\ - + " " - - if key == 'searchText' and value: - for subterm in value.split(): - query += 'body:'\ - + subterm.lower().encode('utf8')\ - + " " - - log("%s raw query is %s"\ - % (self.mmobj.getLogName(), query),\ - logging.INFO, 'searching.searching') - - query = query.strip() - - xapianquery = qp.parse_query(query.encode('utf-8'),\ - # Support quoted phrases. - xapian.QueryParser.FLAG_PHRASE |\ - # Support AND, OR, etc and - # bracketted subexpressions. - xapian.QueryParser.FLAG_BOOLEAN |\ - # Support right truncation (e.g. Xap*). - xapian.QueryParser.FLAG_WILDCARD |\ - # Support AND, OR, etc even if - # they aren't in ALLCAPS. - xapian.QueryParser.FLAG_BOOLEAN_ANY_CASE |\ - # Support + and -. - xapian.QueryParser.FLAG_LOVEHATE) - - - - log("%s performing query '%s'"\ - % (self.mmobj.getLogName(), xapianquery.get_description()),\ - logging.INFO, 'searching.searching') - - e.set_query(xapianquery) - matches = e.get_mset(0,100) - - log("%s number of matches '%s'"\ - % (self.mmobj.getLogName(), str(matches.get_matches_estimated())),\ - logging.INFO, 'searching.searching') - - results = [] - tickets = [] - for match in matches: - message_id = match[xapian.MSET_DOCUMENT].get_value(2) - ticket_id = match[xapian.MSET_DOCUMENT].get_value(1) - score = match[xapian.MSET_PERCENT] - - log("%s message: %s ticket: %s score: %f"\ - % (self.mmobj.getLogName(), message_id, ticket_id, score),\ - logging.INFO, 'searching.searching') - - if not ticket_id in tickets: - tickets.append(int(ticket_id)) - results.append((int(ticket_id), score)) - - return results - - sb = None - - - def rebuildIndex(self): - """ - Makes sure the index is clear and then rebuilds the Xapian index - from current messages in the relational database. - """ - log("%s Rebuilding search index %s"\ - % (self.mmobj.getLogName(), self.databasePath),\ - logging.INFO, 'searching.searching') - - print "k" - - messages = self.mmobj.sql.listMessages() - - self.clearIndex() - - print "And..." - - messages = self.mmobj.sql.listMessages() - - print "ik" - - for message in messages: - message_id = message.id - ticket_id = message.ticket_id - ticket = self.ticket(id=ticket_id)[0] - fname = message.from_name - - log("%s adding message"\ - % (self.mmobj.getLogName()), - logging.INFO, 'searching.searching') - - self.addMessageToIndex(message_id, - ticket_id, - ticket, - msg=message, - body=message.body, - html_body=message.html_body, - from_name=fname, - msg_date=message.msg_date) Copied: MailManager/branches/RELENG_2_2/support/searching/xapian.py (from rev 3710, MailManager/branches/RELENG_2_2/support/searching/searching.py) =================================================================== --- MailManager/branches/RELENG_2_2/support/searching/xapian.py (rev 0) +++ MailManager/branches/RELENG_2_2/support/searching/xapian.py 2006-11-26 19:34:29 UTC (rev 3711) @@ -0,0 +1,490 @@ +import os +import re + +from Products.MailManager.support.logger import log +from DateTime import DateTime +from email.Utils import formataddr, parseaddr, mktime_tz, parsedate_tz + +import xapian +import logging + +class ISearch: + """ + Interface for a MailManager search class. + + Any search technology that is to be utilised within MailManager + should implement all of the following methods. + """ + def clearIndex(self): + """ + Clear the search database completely. + """ + pass + + def rebuildIndex(self): + """ + Rebuild the search index i.e. if something gets corrupted it should + be possible to recreate the index from existing messages in the system + for searching. + """ + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + """ + Add a message to the search index. + """ + pass + + def deleteMessageFromIndex(self, message_id): + """ + Add a message to the search index. + """ + pass + + def searchIndex(self, searchParams): + """ + Search the index + """ + pass + +class PostgresSearch(ISearch): + + def __init__(self, mmobj): + self.mmobj = mmobj + + def clearIndex(self): + pass + + def rebuildIndex(self): + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + pass + + def searchIndex(self, searchParams): + searchText = searchParams.get('searchText') + if searchText: + regex = re.compile('([() ])AND([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>&\g<2>', searchText) + regex = re.compile('([() ])OR([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>|\g<2>', searchText) + regex = re.compile('([() ]|^)NOT([() ])', re.IGNORECASE) + searchText = regex.sub('\g<1>!\g<2>', searchText) + searchi['searchText'] = searchText + + try: + self.sql.testQuery(sqv_query=searchText) + except: + REQUEST.set('error', 'The supplied query is not correct. ' + 'Words must be separated by AND or OR. Check the help ' + 'page for more information on what queries are valid') + REQUEST.set('flag_searchText', 1) + return self.Search(self, REQUEST) + + searchParams['sort_on'] = 'rank' + + if searchParams.get('from_name'): + from_name = '%' + searchParams.get('from_name') + '%' + else: + from_name = '' + +class MySQLSearch(ISearch): + + def __init__(self, mmobj): + self.mmobj = mmobj + + def clearIndex(self): + pass + + def addMessageToIndex(self, message_id, ticket_id, ticket, msg, body, html_body, from_name, msg_date): + self.sql.addToMySQLSearchIndex(message_id=message_id, + ticket_id=ticket_id, + body=body, + html_body=body) + + def searchIndex(self, searchParams): + if searchParams.get('from_name'): + from_name = '%' + searchParams.get('from_name') + '%' + else: + from_name = '' + + +class XapianSearch: + """ + A search class that uses Xapian - the open source search engine for very + fast retrievel of documents. + + Each message that goes into MailManager is indexed just before it hits the + database into a special index used by Xapian for + """ + + def __init__(self, databasePath, mmobj): + """ + Creates a new xapian search object + + @param databasePath: filesystem path to store Xapian index + @type databasePath: str + + @param mmobj: a Mailmanager instance + @type mmobj: MailManager + """ + + self.mmobj = mmobj + self.databasePath = databasePath + + def clearIndex(self): + """ + Clears the Xapian database simply by removing the directory on disk. + """ + try: + for root, dirs, files in os.walk(self.databasePath, topdown=False): + for name in files: + print os.path.join(root, name) + os.remove(os.path.join(root, name)) + for name in dirs: + print os.path.join(root, name) + os.rmdir(os.path.join(root, name)) + except Exception, e: + log('%sunable to clear database at %s' \ + % (self.mmobj.getLogName(), self.databasePath),\ + logging.INFO, 'search.indexing') + + + def addMessageToIndex(self, message_id,... [truncated message content] |