Thread: [Assorted-commits] SF.net SVN: assorted:[1420] mailing-list-filter/trunk/src/mlf.py

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 1420
          http://assorted.svn.sourceforge.net/assorted/?rev=1420&view=rev
Author:   yangzhang
Date:     2009-05-15 16:47:28 +0000 (Fri, 15 May 2009)

Log Message:
-----------
first major update in a long time
- switched to new sqlhash storage system
- added --no-fetch, --refresh

Modified Paths:
--------------
    mailing-list-filter/trunk/src/mlf.py

Modified: mailing-list-filter/trunk/src/mlf.py
===================================================================

--- mailing-list-filter/trunk/src/mlf.py	2009-05-15 07:01:14 UTC (rev 1419)
+++ mailing-list-filter/trunk/src/mlf.py	2009-05-15 16:47:28 UTC (rev 1420)
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 
+# See RFC 3501.
+
 """
 Given a Gmail IMAP mailbox, star all messages in which you were a participant
 (either a sender or an explicit recipient in To: or Cc:), where thread grouping
@@ -7,43 +9,36 @@
 """
 
 from __future__ import with_statement
-from collections import defaultdict
-from email import message_from_string
-from getpass import getpass
-from imaplib import IMAP4_SSL
-from argparse import ArgumentParser
 from path import path
-from re import match
 from functools import partial
-from itertools import count
-from commons.decs import pickle_memoized
-from commons.files import cleanse_filename, soft_makedirs
 from commons.log import *
-from commons.misc import default_if_none, seq
-from commons.networking import logout
-from commons.seqs import concat, grouper
-from commons.startup import run_main
 from contextlib import closing
-import logging
-from commons import log
+import getpass, logging, shelve, email, re, os, imaplib, itertools, argparse, collections
+from commons import log, startup, seqs, networking, files, sqlhash
 
 info    = partial(log.info,    'main')
 debug   = partial(log.debug,   'main')
 warning = partial(log.warning, 'main')
 error   = partial(log.error,   'main')
 die     = partial(log.die,     'main')
+exception = partial(log.exception, 'main')
 
-def thread_dfs(msg, tid, tid2msgs):
+def opendb(dbpath):
+  return sqlhash.Shelf(sqlhash.SQLhash(dbpath, flags = 'w'),
+      protocol = 2, writeback = True)
+
+def thread_dfs(msg, tid, mid2msg, tid2msgs):
   assert msg.tid is None
   msg.tid = tid
   tid2msgs[tid].append(msg)
-  for ref in msg.refs:
-    if ref.tid is None:
-      thread_dfs(ref, tid, tid2msgs)
+  for refmid in msg.refs:
+    refmsg = mid2msg[refmid]
+    if refmsg.tid is None:
+      thread_dfs(refmsg, tid, mid2msg, tid2msgs)
     else:
-      assert ref.tid == tid
+      assert refmsg.tid == tid
 
-def getmail(imap):
+def getmaxuid(imap):
   info( 'finding max UID' )
   # We use UIDs rather than the default of sequence numbers because UIDs are
   # guaranteed to be persistent across sessions.  This means that we can, for
@@ -52,8 +47,10 @@
   ok, [uids] = imap.uid('SEARCH', None, 'ALL')
   maxuid = int( uids.split()[-1] )
   del uids
+  return maxuid
 
-  info( 'actually fetching the messages in chunks up to max', maxuid )
+def getmail(imap, minuid, maxuid):
+  info( 'fetching messages', minuid, 'to', maxuid )
   # The syntax/fields of the FETCH command is documented in RFC 2060.  Also,
   # this article contains a brief overview:
   # http://www.devshed.com/c/a/Python/Python-Email-Libraries-part-2-IMAP/3/
@@ -61,19 +58,32 @@
   query = '(FLAGS BODY.PEEK[HEADER.FIELDS ' \
           '(Message-ID References In-Reply-To From To Cc Subject)])'
   step = 1000
-  return list( concat(
-    seq( lambda: info('fetching', start, 'to', start + step - 1),
-         lambda: imap.uid('FETCH', '%d:%d' % (start, start + step - 1),
-                          query)[1] )
-    for start in xrange(1, maxuid + 1, step) ) )
+  for start in xrange(minuid, maxuid + 1, step):
+    range = '%d:%d' % (start, min(maxuid, start + step - 1))
+    while True:
+      try:
+        info('fetching', range)
+        ok, chunk = imap.uid('FETCH', range, query)
+      except imaplib.abort, ex:
+        error('fetch failed:', ex.message)
+        if 'System Error' not in ex.message: raise
+      except:
+        exception('fetch failed')
+        raise
+      else:
+        break
+    for row in chunk:
+      yield row
 
 def main(argv):
-  p = ArgumentParser(description = __doc__)
+  p = argparse.ArgumentParser(description = __doc__)
   p.add_argument('--credfile', default = path( '~/.mlf.auth' ).expanduser(),
       help = """File containing your login credentials, with the username on the
       first line and the password on the second line.  Ignored iff --prompt.""")
   p.add_argument('--cachedir', default = path( '~/.mlf.cache' ).expanduser(),
       help = "Directory to use for caching our data.")
+  p.add_argument('--refresh', action = 'store_true',
+      help = "Re-fetch all messages, wiping out existing cache.")
   p.add_argument('--prompt', action = 'store_true',
       help = "Interactively prompt for the username and password.")
   p.add_argument('--pretend', action = 'store_true',
@@ -83,6 +93,8 @@
       help = "Do not mark newly revelant threads as unread.")
   p.add_argument('--no-mark-seen', action = 'store_true',
       help = "Do not mark newly irrevelant threads as read.")
+  p.add_argument('--no-fetch', action = 'store_true',
+      help = "Do not fetch new messages; just process already-fetched messages.")
   p.add_argument('--debug', action = 'append', default = [],
       help = """Enable logging for messages of the given flags. Flags include:
       refs (references to missing Message-IDs), dups (duplicate Message-IDs),
@@ -101,136 +113,147 @@
     print "username:",
     cfg.user = raw_input()
     print "password:",
-    cfg.passwd = getpass()
+    cfg.passwd = getpass.getpass()
   else:
     with file(cfg.credfile) as f:
       [cfg.user, cfg.passwd] = map(lambda x: x.strip('\r\n'), f.readlines())
 
   try:
-    m = match( r'(?P<host>[^:/]+)(:(?P<port>\d+))?(/(?P<mailbox>.+))?$',
+    m = re.match( r'(?P<host>[^:/]+)(:(?P<port>\d+))?(/(?P<mailbox>.+))?$',
                cfg.server )
     cfg.host = m.group('host')
-    cfg.port = int( default_if_none(m.group('port'), 993) )
-    cfg.mailbox = default_if_none(m.group('mailbox'), 'INBOX')
+    cfg.port = int( m.group('port') or 993 )
+    cfg.mailbox = m.group('mailbox') or 'INBOX'
   except:
     p.error('Need to specify the server in the correct format.')
 
-  soft_makedirs(cfg.cachedir)
+  files.soft_makedirs(cfg.cachedir)
 
-  with logout(IMAP4_SSL(cfg.host, cfg.port)) as imap:
-    imap.login(cfg.user, cfg.passwd)
+  info('connecting and logging in')
+
+  if True:
+  ###with networking.logout(imaplib.IMAP4_SSL(cfg.host, cfg.port)) as imap:
+    ###imap.login(cfg.user, cfg.passwd)
     # Close is only valid in the authenticated state.
-    with closing(imap) as imap:
-      # Select the main mailbox (INBOX).
-      imap.select(cfg.mailbox)
+    ###with closing(imap) as imap:
 
-      # Fetch message IDs, references, and senders.
-      xs = pickle_memoized \
-          (lambda imap: cfg.cachedir / cleanse_filename(cfg.sender)) \
-          (getmail) \
-          (imap)
+      info('selecting mailbox')
+      ###imap.select(cfg.mailbox)
 
-      log.debug('fetched', xs)
+      dbpath = cfg.cachedir / files.cleanse_filename(cfg.sender)
 
-      info('building message-id map and determining the set of messages sent '
-           'by you or addressed to you (the "source set")')
+      #
+      # Fetch message IDs, references, and senders into persistent store.
+      #
 
-      srcs = []
-      mid2msg = {}
-      # Every second item is just a closing paren.
-      # Example data:
-      # [('13300 (BODY[HEADER.FIELDS (Message-ID References In-Reply-To)] {67}',
-      #   'Message-ID: <mai...@py...>\r\n\r\n'),
-      #  ')',
-      #  ('13301 (BODY[HEADER.FIELDS (Message-ID References In-Reply-To)] {59}',
-      #   'Message-Id: <200...@hv...>\r\n\r\n'),
-      #  ')',
-      #  ('13302 (BODY[HEADER.FIELDS (Message-ID References In-Reply-To)] {92}',
-      #   'Message-ID: <C43EAFC0.2E3AE%ni...@ya...>\r\nIn-Reply-To: <481...@gm...>\r\n\r\n')]
-      for (envelope, data), paren in grouper(2, xs):
-        # Parse the body.
-        msg = message_from_string(data)
+      if cfg.refresh:
+        try: os.remove(dbpath)
+        except: pass
 
-        # Parse the envelope.
-        m = match(
-            r"(?P<seqno>\d+) \(UID (?P<uid>\d+) FLAGS \((?P<flags>[^)]+)\)",
-            envelope )
-        msg.seqno = m.group('seqno')
-        msg.uid   = m.group('uid')
-        msg.flags = m.group('flags').split()
+      if not cfg.no_fetch:
+        with closing(opendb(dbpath)) as mid2msg:
 
-        # Prepare a container for references to other msgs, and initialize the
-        # thread ID.
-        msg.refs = []
-        msg.tid = None
+          minuid = mid2msg.get('maxuid', 1)
+          maxuid = getmaxuid(imap)
 
-        # Add these to the map.
-        if msg['Message-ID'] in mid2msg:
-          log.warning( 'dups', 'duplicate message IDs:',
-                        msg['Message-ID'], msg['Subject'] )
-        mid2msg[ msg['Message-ID'] ] = msg
+          # Every second item is just a closing paren.
+          # Example data:
+          # [('13300 (BODY[HEADER.FIELDS (Message-ID References In-Reply-To)] {67}',
+          #   'Message-ID: <mai...@py...>\r\n\r\n'),
+          #  ')',
+          #  ('13301 (BODY[HEADER.FIELDS (Message-ID References In-Reply-To)] {59}',
+          #   'Message-Id: <200...@hv...>\r\n\r\n'),
+          #  ')',
+          #  ('13302 (BODY[HEADER.FIELDS (Message-ID References In-Reply-To)] {92}',
+          #   'Message-ID: <C43EAFC0.2E3AE%ni...@ya...>\r\nIn-Reply-To: <481...@gm...>\r\n\r\n')]
+          pat = re.compile(r"(?P<seqno>\d+) \(UID (?P<uid>\d+) FLAGS \((?P<flags>[^)]+)\)")
+          for i, ((envelope, data), paren) in enumerate(seqs.grouper(2, getmail(imap, minuid, maxuid))):
+            # Parse the body.
+            msg = email.message_from_string(data)
 
-        # Add to "srcs" set if sent by us or addressed to us.
-        if ( cfg.sender in default_if_none( msg['From'], '' ) or
-             cfg.sender in default_if_none( msg['To'],   '' ) or
-             cfg.sender in default_if_none( msg['Cc'],   '' ) ):
-          srcs.append( msg )
+            # Parse the envelope.
+            m = pat.match(envelope)
+            if m is None: raise Exception('envelope: %r' % envelope)
+            msg.seqno = m.group('seqno')
+            msg.uid   = m.group('uid')
+            msg.flags = m.group('flags').split()
 
-      info( 'constructing undirected graph' )
+            # Prepare a container for references to other msgs, and initialize the
+            # thread ID.
+            msg.refs = set()
+            msg.tid = None
 
-      for mid, msg in mid2msg.iteritems():
-        # Extract any references.
-        irt   = default_if_none( msg.get_all('In-Reply-To'), [] )
-        refs  = default_if_none( msg.get_all('References'), [] )
-        refs  = set( ' '.join( irt + refs ).replace('><', '> <').split() )
+            # Add these to the map.
+            if msg['Message-ID'] in mid2msg:
+              log.warning( 'dups', 'duplicate message IDs:',
+                            msg['Message-ID'], msg['Subject'] )
+            mid2msg[ msg['Message-ID'] ] = msg
 
-        # Connect nodes in graph bidirectionally.  Ignore references to MIDs
-        # that don't exist.
-        for ref in refs:
-          try:
-            refmsg = mid2msg[ref]
-            # We can use lists/append (not worry about duplicates) because the
-            # original sources should be acyclic.  If a -> b, then there is no b ->
-            # a, so when crawling a we can add a <-> b without worrying that later
-            # we may re-add b -> a.
-            msg.refs.append(refmsg)
-            refmsg.refs.append(msg)
-          except:
-            log.warning( 'refs', ref )
+            # Periodically sync to disk.
+            if len(mid2msg.cache) > 1000: mid2msg.sync()
 
-      info('finding connected components (grouping the messages into threads)')
+          mid2msg['maxuid'] = maxuid
 
-      tids = count()
-      tid2msgs = defaultdict(list)
-      for mid, msg in mid2msg.iteritems():
-        if msg.tid is None:
-          thread_dfs(msg, tids.next(), tid2msgs)
+      with closing(opendb(dbpath)) as mid2msg:
 
-      info( 'starring the relevant threads, in which I am a participant' )
+        info( 'maxuid', mid2msg['maxuid'] )
 
-      rel_tids = set()
-      for srcmsg in srcs:
-        if srcmsg.tid not in rel_tids:
-          rel_tids.add(srcmsg.tid)
-          for msg in tid2msgs[srcmsg.tid]:
-            if r'\Flagged' not in msg.flags:
-              log.info( 'star', '\n', msg )
-              if not cfg.pretend:
-                imap.uid('STORE', msg.uid, '+FLAGS', r'\Flagged')
-                if not cfg.no_mark_unseen and r'\Seen' in msg.flags:
-                  imap.uid('STORE', msg.uid, '-FLAGS', r'\Seen')
+        info( 'constructing undirected graph' )
 
-      info( 'unstarring irrelevant threads, in which I am not a participant' )
+        for i, (mid, msg) in enumerate(mid2msg.iteritems()):
+          # Extract any references.
+          irt  = msg.get_all('In-Reply-To', [])
+          refs = msg.get_all('References', [])
+          msg.refs.update( ' '.join( irt + refs ).replace('><', '> <').split() )
 
-      all_tids = set( tid2msgs.iterkeys() )
-      irrel_tids = all_tids - rel_tids
-      for tid in irrel_tids:
-        for msg in tid2msgs[tid]:
-          if r'\Flagged' in msg.flags:
-            log.info( 'unstar', '\n', msg )
-            if not cfg.pretend:
-              imap.uid('STORE', msg.uid, '-FLAGS', r'\Flagged')
-              if not cfg.no_mark_seen and r'\Seen' not in msg.flags:
-                imap.uid('STORE', msg.uid, '+FLAGS', r'\Seen')
+          # Connect nodes in graph bidirectionally.  Ignore references to MIDs
+          # that don't exist.
+          for ref in msg.refs:
+            try: mid2msg[ref].refs.add(msg['Message-ID'])
+            except KeyError: log.warning( 'no message with id', ref )
 
-run_main()
+          # Periodically sync to disk.
+          if len(mid2msg.cache) > 10000:
+            info( 'syncing; now at', i )
+            mid2msg.sync()
+
+        info('looking for relevant (grouping the messages into threads)')
+
+        # Look for messages sent by us or addressed to us, and add their
+        # connected components into tid2msgs.
+        tids = itertools.count()
+        tid2msgs = collections.defaultdict(list)
+        for mid, msg in mid2msg.iteritems():
+          if ( cfg.sender in msg.get('From', '' ) or
+               cfg.sender in msg.get('To',   '' ) or
+               cfg.sender in msg.get('Cc',   '' ) ):
+            thread_dfs(msg, tids.next(), mid2msg, tid2msgs)
+
+        info( 'starring the relevant threads, in which I am a participant' )
+
+        rel_tids = set()
+        for srcmsg in srcs:
+          if srcmsg.tid not in rel_tids:
+            rel_tids.add(srcmsg.tid)
+            for msg in tid2msgs[srcmsg.tid]:
+              if r'\Flagged' not in msg.flags:
+                log.info( 'star', '\n', msg )
+                if not cfg.pretend:
+                  imap.uid('STORE', msg.uid, '+FLAGS', r'\Flagged')
+                  if not cfg.no_mark_unseen and r'\Seen' in msg.flags:
+                    imap.uid('STORE', msg.uid, '-FLAGS', r'\Seen')
+
+        info( 'unstarring irrelevant threads, in which I am not a participant' )
+
+        all_tids = set( tid2msgs.iterkeys() )
+        irrel_tids = all_tids - rel_tids
+        for tid in irrel_tids:
+          for msg in tid2msgs[tid]:
+            if r'\Flagged' in msg.flags:
+              log.info( 'unstar', '\n', msg )
+              if not cfg.pretend:
+                imap.uid('STORE', msg.uid, '-FLAGS', r'\Flagged')
+                if not cfg.no_mark_seen and r'\Seen' not in msg.flags:
+                  imap.uid('STORE', msg.uid, '+FLAGS', r'\Seen')
+
+startup.run_main()


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




Thread: [Assorted-commits] SF.net SVN: assorted:[1420] mailing-list-filter/trunk/src/mlf.py

assorted-commits