[Kgdb-bugreport] [PATCH 1/2] prevent Slave CPUs hang on exit
Status: Beta
Brought to you by:
jwessel
From: Konstantin B. <kba...@ru...> - 2008-03-17 18:29:28
|
Problem: Sometimes (after a remote gdb has been connected) an x86 SMP kernel (with KGDB and the NMI watchdog enabled) hangs when kernel modules are automatically loaded. Root Cause: A Slave CPU hangs in kgdb_wait() when the Master CPU leaves KGDB, causing the whole system to hang. If a watchdog NMI occurs after the Slave CPU has already exited kgdb_wait() but before the Master CPU has cleared debugger_active, the Slave CPU can re-enter kgdb_wait(). Because procindebug[atomic_read(&debugger_active) - 1] is zero (the Master CPU has already set procindebug[MasterCPU] to zero before exiting), the Slave loops in kgdb_wait(): ... /* Wait till master processor goes completely into the debugger. */ while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) { int i = 10; /* an arbitrary number */ while (--i) cpu_relax(); } ... The Slave CPU loops until the Master CPU completely exits KGDB and sets debugger_active to zero. But when debugger_active becomes zero, the Slave CPU does not leave the loop; instead it hangs in the while loop, because atomic_read(&debugger_active) is now 0 and it starts checking procindebug[-1] (an out-of-bounds read one element before the array): ... while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])){...} ... For me procindebug[-1] is always zero, so the Slave CPU hangs in the NMI handler and stops accepting NMIs. This leads to a hang of the whole system. How Solved: A new atomic variable, debugger_exiting, was added. It is set when the Master CPU starts waiting for the Slave CPUs, and is reset after debugger_active is set to zero. debugger_exiting is checked in kgdb_notify(), and kgdb_nmihook() will not be called until debugger_exiting equals zero. So debugger_exiting guarantees that a Slave CPU won't re-enter kgdb_wait() until the Master CPU has completely left KGDB. Patch against kernel 2.6.24.3. 
Signed-off-by: Konstantin Baydarov <kba...@ru...> arch/x86/kernel/kgdb_32.c | 9 ++++++--- arch/x86/kernel/kgdb_64.c | 9 ++++++--- include/linux/kgdb.h | 1 + kernel/kgdb.c | 4 ++++ 4 files changed, 17 insertions(+), 6 deletions(-) Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c =================================================================== --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_32.c +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c @@ -326,14 +326,16 @@ static int kgdb_notify(struct notifier_b switch (cmd) { case DIE_NMI: - if (atomic_read(&debugger_active)) { + if (atomic_read(&debugger_active) && + !atomic_read(&debugger_exiting)) { /* KGDB CPU roundup */ kgdb_nmihook(raw_smp_processor_id(), regs); return NOTIFY_STOP; } return NOTIFY_DONE; case DIE_NMI_IPI: - if (atomic_read(&debugger_active)) { + if (atomic_read(&debugger_active) && + !atomic_read(&debugger_exiting)) { /* KGDB CPU roundup */ if (kgdb_nmihook(raw_smp_processor_id(), regs)) return NOTIFY_DONE; @@ -341,7 +343,8 @@ static int kgdb_notify(struct notifier_b } return NOTIFY_DONE; case DIE_NMIWATCHDOG: - if (atomic_read(&debugger_active)) { + if (atomic_read(&debugger_active) && + !atomic_read(&debugger_exiting)) { /* KGDB CPU roundup */ kgdb_nmihook(raw_smp_processor_id(), regs); return NOTIFY_STOP; Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c =================================================================== --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_64.c +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c @@ -406,14 +406,16 @@ static int kgdb_notify(struct notifier_b switch (cmd) { case DIE_NMI: - if (atomic_read(&debugger_active)) { + if (atomic_read(&debugger_active) && + !atomic_read(&debugger_exiting)) { /* KGDB CPU roundup */ kgdb_nmihook(raw_smp_processor_id(), regs); return NOTIFY_STOP; } return NOTIFY_DONE; case DIE_NMI_IPI: - if (atomic_read(&debugger_active)) { + if (atomic_read(&debugger_active) && + !atomic_read(&debugger_exiting)) { /* KGDB CPU roundup */ if 
(kgdb_nmihook(raw_smp_processor_id(), regs)) return NOTIFY_DONE; @@ -421,7 +423,8 @@ static int kgdb_notify(struct notifier_b } return NOTIFY_DONE; case DIE_NMIWATCHDOG: - if (atomic_read(&debugger_active)) { + if (atomic_read(&debugger_active) && + !atomic_read(&debugger_exiting)) { /* KGDB CPU roundup */ kgdb_nmihook(raw_smp_processor_id(), regs); return NOTIFY_STOP; Index: ko_2_6_24_3_kgdb/include/linux/kgdb.h =================================================================== --- ko_2_6_24_3_kgdb.orig/include/linux/kgdb.h +++ ko_2_6_24_3_kgdb/include/linux/kgdb.h @@ -281,6 +281,7 @@ extern int kgdb_handle_exception(int ex_ extern int kgdb_nmihook(int cpu, void *regs); extern int debugger_step; extern atomic_t debugger_active; +extern atomic_t debugger_exiting; #else /* Stubs for when KGDB is not set. */ static const atomic_t debugger_active = ATOMIC_INIT(0); Index: ko_2_6_24_3_kgdb/kernel/kgdb.c =================================================================== --- ko_2_6_24_3_kgdb.orig/kernel/kgdb.c +++ ko_2_6_24_3_kgdb/kernel/kgdb.c @@ -117,6 +117,8 @@ int debugger_step; static atomic_t kgdb_sync = ATOMIC_INIT(-1); atomic_t debugger_active; EXPORT_SYMBOL(debugger_active); +atomic_t debugger_exiting = ATOMIC_INIT(0); +EXPORT_SYMBOL(debugger_exiting); /* Our I/O buffers. */ static char remcom_in_buffer[BUFMAX]; @@ -1526,6 +1528,7 @@ default_handle: atomic_set(&procindebug[processor], 0); if (!debugger_step || !kgdb_contthread) { + atomic_set(&debugger_exiting, 1); for (i = 0; i < NR_CPUS; i++) spin_unlock(&slavecpulocks[i]); /* Wait till all the processors have quit @@ -1557,6 +1560,7 @@ default_handle: kgdb_restore: /* Free debugger_active */ atomic_set(&debugger_active, 0); + atomic_set(&debugger_exiting, 0); atomic_set(&kgdb_sync, -1); clocksource_touch_watchdog(); kgdb_softlock_skip[processor] = 1; |