From: <di...@us...> - 2008-03-18 15:47:06
|
Revision: 592 http://safekeep.svn.sourceforge.net/safekeep/?rev=592&view=rev Author: dimi Date: 2008-03-18 08:47:01 -0700 (Tue, 18 Mar 2008) Log Message: ----------- Frank Crawford <fr...@cr...> * Added a cleanup option to client and server modes to remove safekeep LVM snapshots and mounts after a crash or problem. * Added new communications tag "SCRUB" to do a full remote cleanup. * Added a warning if there is a mismatch in the communications protocol minor level. * Append specific paths (/sbin, /usr/sbin and /usr/local/sbin) to the client path when run in cleanup mode, to cover any path issues. * Fixed a couple of issues with pass client exceptions back to the server, and strip off excess newlines. * Add test and abort run on client if there are any existing safekeep LVM snapshots. Modified Paths: -------------- safekeep/trunk/doc/safekeep.txt safekeep/trunk/safekeep Modified: safekeep/trunk/doc/safekeep.txt =================================================================== --- safekeep/trunk/doc/safekeep.txt 2008-03-01 22:34:36 UTC (rev 591) +++ safekeep/trunk/doc/safekeep.txt 2008-03-18 15:47:01 UTC (rev 592) @@ -7,13 +7,13 @@ SYNOPSIS -------- -'safekeep' --server [-q] [-v] [--noemail] [--force] [-c file] <clientid>* +'safekeep' --server [-q] [-v] [--noemail] [--force] [-c file] [--cleanup] <clientid>* 'safekeep' --keys [-q] [-v] [--noemail] [-c file] [-i file] [--status] [--print] [--deploy] <clientid>* 'safekeep' --list [-q] [-v] [--noemail] [-c file] [--increments] [--parsable-output] [--sizes] [--changed=<time>] [--at-time=<time>] <clientid>* -'safekeep' --client +'safekeep' --client [--cleanup] 'safekeep' -h | -V @@ -40,6 +40,11 @@ Note that the client mode of SafeKeep should never be invoked manually, this mode is meant to be used only by the server mode of SafeKeep. 
+The only exception to this is if run with the `--cleanup` option, which +is used to remove LVM snapshots and mounts created by Safekeep, after a +crash or some other failure, without a connection to the server. +Normally this cleanup would be performed through the server command +`safekeep --server --cleanup`. The SSH key management mode is a helper mode for deploying or verifying the setup of the SSH authentification keys. @@ -111,6 +116,25 @@ backup directory becomes corrupt, and `rdiff-backup` error logs tells you to use this option. +--cleanup:: + Remove LVM snapshots and mounts left by Safekeep after a + crash or other failure. This will also run the standard + cleanup processes, such as the removal of any DB dumps, and + force a consistency check of the `rdiff-backup` destination + directory. This is the preferred cleanup procedure and can + be run with no danger of corrupting the system if there is + nothing to clean up. + +CLIENT OPTIONS +-------------- +--cleanup:: + Remove LVM snapshots and mounts left after a crash or other + failure from the local system. Unlike the equivalent `--server` + option, it does not do any of the other standard cleanups. + This option should only be used when it is not possible to + refer to the server, for example, when the network connection + to the server is no longer available. + KEYS OPTIONS ------------ -i FILE:: Modified: safekeep/trunk/safekeep =================================================================== --- safekeep/trunk/safekeep 2008-03-01 22:34:36 UTC (rev 591) +++ safekeep/trunk/safekeep 2008-03-18 15:47:01 UTC (rev 592) @@ -16,7 +16,7 @@ # along with Safekeep. If not, see <http://www.gnu.org/licenses/>. 
from __future__ import generators -import getopt, os, os.path, popen2, re, sys +import getopt, os, os.path, popen2, re, sys, fnmatch import commands, tempfile, time, traceback import getpass, pwd, xml.dom.minidom import socket, smtplib @@ -53,7 +53,7 @@ home_dir = None base_dir = None -PROTOCOL = "1.0" +PROTOCOL = "1.1" VERSION = "1.0.4" VEBOSITY_BY_CLASS = {'DBG': 3, 'INFO': 2, 'WARN': 1, 'ERR': 0} @@ -394,19 +394,48 @@ warn('Unable to remove dump file: %s for database %s because: %s' % (dump['file'], dump['db'], e)) -def gather_lvm_information(device): - device = device.replace('/mapper','').replace('-','/') - (group, volume) = device.split('/')[-2:] +def lvm_snap_information(): + (cin, cout) = os.popen4(['lvs', '--separator', ':', '--noheadings']) + lines = cout.readlines() + cout.close() + cin.close() + lvms = [] + for line in lines: + if line.count(':') > 3: + (volume, group, attr, blah1) = line.lstrip().split(':', 3) + if fnmatch.fnmatch(volume, '*_snap_safekeep-*') and attr[0].lower() == 's': + lvms.append([volume, group]) + return lvms + +def mount_information(reverse = False): (cin, cout) = os.popen4('mount') lines = cout.readlines() cout.close() cin.close() + mounts = [] + if reverse: + lines.reverse() for line in lines: - (device, blah1, mountpoint, blah2, mounttype, blah3) = line.split(' ', 5) - if line.startswith('/dev/mapper/' + group + '-' + volume + ' '): + (device, blah1, mountpoint, blah2, mounttype, mountoptions) = line.split() + mounts.append([device, mountpoint, mounttype, mountoptions[1:-1]]) + return mounts + +def map_lvm_device(device): + device = device.replace('/mapper','').replace('-','/') + return device.split('/')[-2:] + +def check_lvm_information(device): + (group, volume) = map_lvm_device(device) + for (lvm_volume, lvm_group) in lvm_snap_information(): + if lvm_group == group and lvm_volume.startswith(volume): + return True + return False + +def gather_lvm_information(device): + (group, volume) = map_lvm_device(device) + for (device, 
mountpoint, mounttype, mountoptions) in mount_information(False): + if [group, volume] == map_lvm_device(device): return (group, volume, mountpoint, mounttype) - elif line.startswith('/dev/' + group + '/' + volume + ' '): - return (group, volume, mountpoint, mounttype) return (None, None, None, None) def gather_snap_information(device, bdir): @@ -486,6 +515,12 @@ do_client_dbdump(cfg) if len(cfg['snaps']) > 0: + debug('Checking FS snapshots') + for snap in cfg['snaps']: + device = snap['device'] + if check_lvm_information(device): + raise Exception("Previous snapshots found for %s: run 'safekeep --server --cleanup' to correct" % device) + ret = spawn(['modprobe', 'dm-snapshot']) if ret: warn('modprobe dm-snapshot failed, continuing') @@ -525,6 +560,76 @@ def do_client_compat(server_versions): debug('Server versions: %s' % server_versions) +def do_client_scrub(): + debug("Do client scrub loop") + + if os.getuid(): + if is_client: + raise Exception('client not running as root') + else: + error("--cleanup must be run as root") + sys.exit(2) + + scrubbed = False + if os.environ['PATH'][-1] == ':': + os.environ['PATH'] += '/sbin:/usr/sbin:/usr/local/sbin:' + else: + os.environ['PATH'] += ':/sbin:/usr/sbin:/usr/local/sbin' + + # Go through and unmount anythings that are still hanging around + + debug("Cleaning up existing mounts") + for (device, mountpoint, mounttype, mountoptions) in mount_information(True): + if mountpoint.startswith('/mnt/safekeep-'): + info("Removing mount %s" % mountpoint) + if device == '/' and 'bind' in mountoptions.split(','): + info("Removing rbind directory %s" % mountpoint) + ret = spawn(['umount', '-l', mountpoint]) + if ret: + warn('Failed to unmount: ' + mountpoint) + else: + try: + os.rmdir(mountpoint) + except Exception, e: + warn('Failed to remove: ' + mountpoint) + else: + ret = spawn(['umount', mountpoint]) + if ret: + warn('Can not unmount the snapshot: %s' % mountpoint) + if fnmatch.fnmatch(device, '*_snap_safekeep-*'): + 
info("Removing snapshot %s" % device) + ret = spawn(['lvremove', '--force', device]) + if ret: + warn('Can not tear down snapshot: ' + device) + scrubbed = True + + # Now cleanup any snapshots still hanging around + + debug("Cleaning up remaining snapshots") + for (volume, group) in lvm_snap_information(): + device = os.path.join('/dev', group, volume) + info("Removing snapshot %s" % device) + ret = spawn(['lvremove', '--force', device]) + if ret: + warn('Can not tear down snapshot: ' + device) + scrubbed = True + + # Now cleanup any safekeep directories still hanging around + + debug("Cleaning up remaining safekeep directories") + if os.path.isdir('/mnt'): + for ent in os.listdir('/mnt'): + mountpoint = os.path.join('/mnt', ent) + if ent.startswith('safekeep-') and os.path.isdir(mountpoint): + info("Removing rbind directory %s" % mountpoint) + try: + os.rmdir(mountpoint) + except Exception, e: + warn('Failed to remove: ' + mountpoint) + + if not scrubbed: + info('No cleanup required') + def do_client(): debug("Do client main loop") should_cleanup = True @@ -547,6 +652,9 @@ if dir == bdir: should_cleanup = False do_client_cleanup(cfg, dir) send('OK') + elif line.startswith('SCRUB'): + do_client_scrub() + send('OK') elif not line: break else: @@ -554,7 +662,7 @@ break except Exception, e: traceback.print_exc(file=sys.stdout) - send('ERROR ' + e) + send('ERROR %s' % e) finally: if should_cleanup: do_client_cleanup(cfg, bdir) @@ -570,7 +678,7 @@ if line.startswith('OK'): return line[2:-1].strip() elif line.startswith('ERROR'): - raise Exception(line[5:]) + raise Exception(line[5:].strip()) elif not line: raise Exception('client died unexpectedly') else: @@ -604,6 +712,12 @@ if ret: raise Exception('Failed to run rdiff-backup') +def do_server_rdiff_cleanup(cfg): + args = ['rdiff-backup', '--check-destination-dir', cfg['dir']] + ret = spawn(args) + if ret: + warn('Failed to cleanup old data, please fix the problem manually') + def do_server_data_cleanup(cfg): args = 
['rdiff-backup', '--force', '--remove-older-than', cfg['retention'], cfg['dir']] ret = spawn(args) @@ -616,8 +730,10 @@ (server_major, server_minor) = PROTOCOL.split('.') if server_major != client_major: raise Exception('Incompatible protocols: %s <> %s' % (PROTOCOL, client_protocol)) + elif server_minor > client_minor: + warn('Protocol mismatch: %s <> %s' % (PROTOCOL, client_protocol)) -def do_server(cfgs, ids, force): +def do_server(cfgs, ids, force, cleanup): debug("Do server main loop") for cfg in cfgs.itervalues(): id = cfg['id'] @@ -640,7 +756,7 @@ raise Exception('Can not create data store dir: %s' % datadir) rdiff_logdir = os.path.join(datadir, 'rdiff-backup-data') - if cfg['retention'] and os.path.isdir(rdiff_logdir): + if cfg['retention'] and os.path.isdir(rdiff_logdir) and not cleanup: do_server_data_cleanup(cfg) if cfg['host']: @@ -660,36 +776,44 @@ cin.flush() do_server_getanswer(cout) - cin.write('SETUP\n') - cin.flush() - bdir = do_server_getanswer(cout) - - if os.path.isdir(rdiff_logdir): - rdiff_logpre = os.listdir(rdiff_logdir) + if cleanup: + cin.write('SCRUB\n') + cin.flush() + do_server_getanswer(cout) + bdir = '/' # Fake directory for the rest of the cleanup + do_server_rdiff_cleanup(cfg) + errs = 0 else: - rdiff_logpre = [] + cin.write('SETUP\n') + cin.flush() + bdir = do_server_getanswer(cout) - backup_log = os.path.join(rdiff_logdir, 'backup.log') - if os.path.isfile(backup_log): - backup_marker = '=== Backup session on %s ===' % time.asctime() - fbm = open(backup_log, 'a') - fbm.write(backup_marker + '\n') - fbm.close() - else: - backup_marker = None + if os.path.isdir(rdiff_logdir): + rdiff_logpre = os.listdir(rdiff_logdir) + else: + rdiff_logpre = [] - do_server_rdiff(cfg, bdir, force) + backup_log = os.path.join(rdiff_logdir, 'backup.log') + if os.path.isfile(backup_log): + backup_marker = '=== Backup session on %s ===' % time.asctime() + fbm = open(backup_log, 'a') + fbm.write(backup_marker + '\n') + fbm.close() + else: + backup_marker 
= None - errs = 0 - if os.path.isdir(rdiff_logdir): - info_file(backup_log, backup_marker) - rdiff_logpost = os.listdir(rdiff_logdir) - for lfn in rdiff_logpost: - if lfn.startswith('session_statistics.') and lfn.endswith('.data') and lfn not in rdiff_logpre: - errs += info_file(os.path.join(rdiff_logdir, lfn)) - else: - warn('Log dir does not exist.') + do_server_rdiff(cfg, bdir, force) + errs = 0 + if os.path.isdir(rdiff_logdir): + info_file(backup_log, backup_marker) + rdiff_logpost = os.listdir(rdiff_logdir) + for lfn in rdiff_logpost: + if lfn.startswith('session_statistics.') and lfn.endswith('.data') and lfn not in rdiff_logpre: + errs += info_file(os.path.join(rdiff_logdir, lfn)) + else: + warn('Log dir does not exist.') + cin.write('CLEANUP %s\n' % bdir) cin.flush() do_server_getanswer(cout) @@ -902,6 +1026,7 @@ print print 'server options:' print '--force force backup destination overwriting, dangerous!' + print '--cleanup perform cleanup actions after a failure' print print 'keys options:' print '-i FILE use FILE as identity for RSA/DSA authentication' @@ -924,7 +1049,7 @@ 'email=', 'force', 'help', 'keys', 'list', 'increments', 'sizes', 'parsable-output', 'changed=', 'at-time=', - 'noemail', + 'noemail', 'cleanup', 'print', 'quiet', 'server', 'smtp=', 'status', 'verbose', 'version']) except getopt.GetoptError: @@ -939,6 +1064,7 @@ verbosity = 0 clientid = None force = 0 + cleanup = 0 noemail = 0 list_type = None list_parsable = 0 @@ -981,6 +1107,8 @@ mode = 'keys' elif o in ('--force', ): force = 1 + elif o in ('--cleanup', ): + cleanup = 1 elif o in ('--noemail', ): noemail = 1 elif o in ('--increments', ): @@ -1027,6 +1155,9 @@ if mode is not 'server' and (email or smtp): usage(2) + if not mode in ['server', 'client'] and cleanup: + usage(2) + if mode is 'client' and cfglocs: usage(2) @@ -1088,7 +1219,7 @@ if mode is 'server': is_client = False verbosity_level = 1 + verbosity - do_server(cfgs, args, force) + do_server(cfgs, args, force, cleanup) elif 
mode is 'list': if list_type is None: list_type = 'increments' @@ -1096,9 +1227,14 @@ verbosity_level = 2 + verbosity do_list(cfgs, args, list_type, list_date, list_parsable) elif mode is 'client': - is_client = True - verbosity_level = 3 + verbosity - do_client() + if cleanup: + is_client = False + verbosity_level = 1 + verbosity + do_client_scrub() + else: + is_client = True + verbosity_level = 3 + verbosity + do_client() elif mode is 'keys': is_client = False verbosity_level = 1 + verbosity This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |