1. Summary
  2. Files
  3. Support
  4. Report Spam
  5. Create account
  6. Log in

Changeset 3429

Show
Ignore:
Timestamp:
10/07/11 19:32:18 (20 months ago)
Author:
chrfranke
Message:

smartd: Resend warning emails if problem reappears (ticket #167).

Location:
trunk/smartmontools
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • trunk/smartmontools/CHANGELOG

    r3428 r3429  
    4141<DEVELOPERS: ADDITIONS TO THE CHANGE LOG GO JUST BELOW HERE, PLEASE> 
    4242 
     43  [CF] smartd: Resend warning emails if problem reappears (ticket #167). 
     44 
    4345  [CF] smartd: Add separate directives '-l offlinests' and '-l selfteststs' 
    4446       to enable tracking of status changes.  Disable '-l offlinests' by 
  • trunk/smartmontools/NEWS

    r3428 r3429  
    99Summary: smartmontools release 5.42 
    1010----------------------------------------------------------- 
     11- smartd resends warning emails if problem reappears. 
    1112- smartd directives '-l offlinests' and '-l selfteststs'. 
    1213- Platform-specific man pages. 
  • trunk/smartmontools/smartd.conf.5.in

    r3428 r3429  
    597597 
    598598[ATA only] Failed self-tests outdated by a newer successful extended 
    599 self\-test are ignored. 
     599self\-test are ignored.  The warning email counter is reset if the 
     600number of failed self tests dropped to 0.  This typically happens when 
     601an extended self\-test is run after all bad sectors have been reallocated. 
    600602 
    601603.I offlinests 
     
    847849type of disk problem detected. Each interval is twice as long as the 
    848850previous interval. 
     851 
     852If a disk problem is no longer detected, the internal email counter is 
     853reset.  If the problem reappears a new warning email is sent immediately. 
    849854 
    850855In addition, one may add zero or more of the following Directives: 
     
    11311136See also \'\-v 197,increasing\' below. 
    11321137 
     1138The warning email counter is reset if the number of pending sectors 
     1139dropped to 0.  This typically happens when all pending sectors have 
     1140been reallocated or could be read again. 
     1141 
    11331142A pending sector is a disk sector (containing 512 bytes of your data) 
    11341143which the device would like to mark as ``bad" and reallocate. 
     
    11591168See also \'\-v 198,increasing\' below. 
    11601169 
     1170The warning email counter is reset if the number of offline uncorrectable 
     1171sectors dropped to 0.  This typically happens when all offline uncorrectable 
     1172sectors have been reallocated or could be read again. 
     1173 
    11611174An offline uncorrectable sector is a disk sector which was not 
    11621175readable during an off\-line scan or a self\-test.  This is important 
     
    11741187will be sent if '-m' is specified. If only the limit \fBINFO\fP is 
    11751188reached, a message with loglevel \fB\'LOG_INFO\'\fP will be logged. 
     1189 
     1190The warning email counter is reset if the temperature drops below 
     1191\fBINFO\fP, or below \fBCRIT\fP-5 if \fBINFO\fP is not specified. 
    11761192 
    11771193If this directive is used in conjunction with state persistence 
  • trunk/smartmontools/smartd.cpp

    r3428 r3429  
    942942    return; 
    943943  } 
    944    
     944 
    945945  // Return if a single warning mail has been sent. 
    946946  if ((cfg.emailfreq==1) && mail->logged) 
     
    12431243  // increment mail sent counter 
    12441244  mail->logged++; 
     1245} 
     1246 
     1247static void reset_warning_mail(const dev_config & cfg, dev_state & state, int which, const char *fmt, ...) 
     1248                               __attribute__ ((format (printf, 4, 5))); 
     1249 
     1250static void reset_warning_mail(const dev_config & cfg, dev_state & state, int which, const char *fmt, ...) 
     1251{ 
     1252  if (!(0 <= which && which < SMARTD_NMAIL)) 
     1253    return; 
     1254 
     1255  // Return if no mail sent yet 
     1256  mailinfo & mi = state.maillog[which]; 
     1257  if (!mi.logged) 
     1258    return; 
     1259 
     1260  // Format & print message 
     1261  char msg[256]; 
     1262  va_list ap; 
     1263  va_start(ap, fmt); 
     1264  vsnprintf(msg, sizeof(msg), fmt, ap); 
     1265  va_end(ap); 
     1266 
     1267  PrintOut(LOG_INFO, "Device: %s, %s, warning condition reset after %d email%s\n", cfg.name.c_str(), 
     1268           msg, mi.logged, (mi.logged==1 ? "" : "s")); 
     1269 
     1270  // Clear mail counter and timestamps 
     1271  mi = mailinfo(); 
     1272  state.must_write = true; 
    12451273} 
    12461274 
     
    22962324    // command failed 
    22972325    MailWarning(cfg, state, 8, "Device: %s, Read SMART Self-Test Log Failed", name); 
    2298   else {       
     2326  else { 
     2327    reset_warning_mail(cfg, state, 8, "Read SMART Self-Test Log worked again"); 
     2328 
    22992329    // old and new error counts 
    23002330    int oldc=state.selflogcount; 
     
    23292359 
    23302360    // Print info if error entries have disappeared 
    2331     if (oldc > newc) 
     2361    // or a newer successful extended self-test exists 
     2362    if (oldc > newc) { 
    23322363      PrintOut(LOG_INFO, "Device: %s, Self-Test Log error count decreased from %d to %d\n", 
    23332364               name, oldc, newc); 
     2365      if (newc == 0) 
     2366        reset_warning_mail(cfg, state, 3, "Self-Test Log does no longer report errors"); 
     2367    } 
    23342368 
    23352369    // Needed since self-test error count may DECREASE.  Hour might 
     
    26732707  // No report if no sectors pending. 
    26742708  uint64_t rawval = ata_get_attr_raw_value(smartval.vendor_attributes[i], cfg.attribute_defs); 
    2675   if (rawval == 0) 
     2709  if (rawval == 0) { 
     2710    reset_warning_mail(cfg, state, mailtype, "No more %s", msg); 
    26762711    return; 
     2712  } 
    26772713 
    26782714  // If attribute is not reset, report only sector count increases. 
     
    27682804      cfg.name.c_str(), currtemp, cfg.tempinfo, fmt_temp(state.tempmin, buf), minchg, state.tempmax, maxchg); 
    27692805  } 
     2806  else if (cfg.tempcrit) { 
     2807    unsigned char limit = (cfg.tempinfo ? cfg.tempinfo : cfg.tempcrit-5); 
     2808    if (currtemp < limit) 
     2809      reset_warning_mail(cfg, state, 12, "Temperature %u Celsius dropped below %u Celsius", currtemp, limit); 
     2810  } 
    27702811} 
    27712812 
     
    28852926    MailWarning(cfg, state, 9, "Device: %s, unable to open device", name); 
    28862927    return 1; 
    2887   } else if (debugmode) 
     2928  } 
     2929  if (debugmode) 
    28882930    PrintOut(LOG_INFO,"Device: %s, opened ATA device\n", name); 
     2931  reset_warning_mail(cfg, state, 9, "open device worked again"); 
    28892932 
    28902933  // user may have requested (with the -n Directive) to leave the disk 
     
    29893032    } 
    29903033    else { 
     3034      reset_warning_mail(cfg, state, 6, "read SMART Attribute Data worked again"); 
     3035 
    29913036      // look for current or offline pending sectors 
    29923037      if (cfg.curr_pending_id)