You can subscribe to this list here.
| 2009 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(32) |
Jun
(66) |
Jul
(102) |
Aug
(78) |
Sep
(106) |
Oct
(137) |
Nov
(147) |
Dec
(147) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2010 |
Jan
(71) |
Feb
(139) |
Mar
(86) |
Apr
(76) |
May
(57) |
Jun
(10) |
Jul
(12) |
Aug
(6) |
Sep
(8) |
Oct
(12) |
Nov
(12) |
Dec
(18) |
| 2011 |
Jan
(16) |
Feb
(19) |
Mar
(3) |
Apr
(1) |
May
(16) |
Jun
(17) |
Jul
(74) |
Aug
(22) |
Sep
(18) |
Oct
(24) |
Nov
(21) |
Dec
(30) |
| 2012 |
Jan
(31) |
Feb
(16) |
Mar
(22) |
Apr
(25) |
May
(18) |
Jun
(13) |
Jul
(83) |
Aug
(49) |
Sep
(20) |
Oct
(60) |
Nov
(35) |
Dec
(28) |
| 2013 |
Jan
(39) |
Feb
(61) |
Mar
(35) |
Apr
(21) |
May
(45) |
Jun
(56) |
Jul
(20) |
Aug
(9) |
Sep
(10) |
Oct
(31) |
Nov
(8) |
Dec
(4) |
| 2014 |
Jan
(6) |
Feb
(7) |
Mar
(7) |
Apr
(6) |
May
(4) |
Jun
(8) |
Jul
(5) |
Aug
(2) |
Sep
(4) |
Oct
(4) |
Nov
(11) |
Dec
(5) |
| 2015 |
Jan
(4) |
Feb
(4) |
Mar
(3) |
Apr
(4) |
May
(9) |
Jun
(4) |
Jul
(15) |
Aug
(8) |
Sep
(16) |
Oct
(18) |
Nov
(15) |
Dec
(7) |
| 2016 |
Jan
(20) |
Feb
(9) |
Mar
(15) |
Apr
(24) |
May
(16) |
Jun
(28) |
Jul
(22) |
Aug
(23) |
Sep
(18) |
Oct
(30) |
Nov
(40) |
Dec
(9) |
| 2017 |
Jan
(1) |
Feb
(8) |
Mar
(37) |
Apr
(26) |
May
(25) |
Jun
(46) |
Jul
(24) |
Aug
(9) |
Sep
|
Oct
|
Nov
|
Dec
|
|
From: H. P. A. <hp...@zy...> - 2012-09-28 02:25:32
|
On 09/27/2012 03:33 PM, Seiji Aguchi wrote:
> Hi,
>
>> ... except the cost can be reduced to zero *AND* be made into a more general mechanism by simply hooking the IDT.
>
> Thank you for giving me the comment.
> In my understanding, we can introduce a more general mechanism by sandwiching an existing handler between tracepoints.
> The pseudo code is like this:
>
> @@ -17,7 +18,7 @@ static void default_threshold_interrupt(void)
>
> void (*mce_threshold_vector)(void) = default_threshold_interrupt;
>
> -asmlinkage void smp_threshold_interrupt(void)
> +static void do_smp_threshold_interrupt(void)
> {
> irq_enter();
> exit_idle();
> @@ -27,3 +28,10 @@ asmlinkage void smp_threshold_interrupt(void)
> /* Ack only at the end to avoid potential reentry */
> ack_APIC_irq();
> }
> +
> +asmlinkage void smp_threshold_interrupt(void) {
> + trace_arch_irq_vector_entry(THRESHOLD_APIC_VECTOR);
> + do_smp_threshold_interrupt();
> + trace_arch_irq_vector_exit(THRESHOLD_APIC_VECTOR);
> +}
>
> If I misunderstand something, please let me know.
>
Quite.
These functions are being invoked from the IDT, which is an indirect
pointer structure. When not being traced, there is absolutely no reason
why it should go through a thunk with tracepoints.
-hpa
|
|
From: Satoru M. <sat...@hd...> - 2012-09-27 23:21:58
|
This is a first time for me to post a patch to qemu-devel.
If there is something missing/wrong, please let me know.
We have some plans to migrate old enterprise systems which require
low latency (msec order) to kvm virtualized environment. Usually,
we uses mlock to preallocate and pin down process memory in order
to avoid page allocation in latency critical path. On the other
hand, in kvm environment, mlocking in guests is not effective
because it can't avoid page reclaim in host. Actually, to avoid
guest memory reclaim, qemu has "mem-path" option that is actually
for using hugepage. But a memory region of qemu is not allocated
on hugepage, so it may be reclaimed. That may cause a latency
problem.
To avoid guest and qemu memory reclaim, this patch introduces
a new "mlock" option. With this option, we can preallocate and
pin down guest and qemu memory before booting guest OS.
Tested on Linux, x86_64 (fedora 17).
Signed-off-by: Satoru Moriya <sat...@hd...>
---
cpu-all.h | 1 +
exec.c | 3 +++
qemu-options.hx | 8 ++++++++
vl.c | 4 ++++
4 files changed, 16 insertions(+)
diff --git a/cpu-all.h b/cpu-all.h
index 74d3681..e12e5d5 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -503,6 +503,7 @@ extern RAMList ram_list;
extern const char *mem_path;
extern int mem_prealloc;
+extern int mem_lock;
/* Flags stored in the low bits of the TLB virtual address. These are
defined so that fast path ram access is all zeros. */
diff --git a/exec.c b/exec.c
index bb6aa4a..de13bc9 100644
--- a/exec.c
+++ b/exec.c
@@ -2572,6 +2572,9 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
}
memory_try_enable_merging(new_block->host, size);
}
+ if (mem_lock && mlockall(MCL_CURRENT | MCL_FUTURE)) {
+ perror("mlockall");
+ }
}
new_block->length = size;
diff --git a/qemu-options.hx b/qemu-options.hx
index 7d97f96..9d82f15 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -427,6 +427,14 @@ Preallocate memory when using -mem-path.
ETEXI
#endif
+DEF("mlock", 0, QEMU_OPTION_mlock,
+ "-mlock mlock guest and qemu memory\n",
+ QEMU_ARCH_ALL)
+STEXI
+@item -mlock
+mlock guest and qemu memory.
+ETEXI
+
DEF("k", HAS_ARG, QEMU_OPTION_k,
"-k language use keyboard layout (for example 'fr' for French)\n",
QEMU_ARCH_ALL)
diff --git a/vl.c b/vl.c
index 8d305ca..c902084 100644
--- a/vl.c
+++ b/vl.c
@@ -187,6 +187,7 @@ const char *mem_path = NULL;
#ifdef MAP_POPULATE
int mem_prealloc = 0; /* force preallocation of physical target memory */
#endif
+int mem_lock;
int nb_nics;
NICInfo nd_table[MAX_NICS];
int autostart;
@@ -2770,6 +2771,9 @@ int main(int argc, char **argv, char **envp)
mem_prealloc = 1;
break;
#endif
+ case QEMU_OPTION_mlock:
+ mem_lock = 1;
+ break;
case QEMU_OPTION_d:
log_mask = optarg;
break;
--
1.7.11.4
|
|
From: Seiji A. <sei...@hd...> - 2012-09-27 22:34:12
|
Hi,
> ... except the cost can be reduced to zero *AND* be made into a more general mechanism by simply hooking the IDT.
Thank you for giving me the comment.
In my understanding, we can introduce a more general mechanism by sandwiching an existing handler between tracepoints.
The pseudo code is like this:
@@ -17,7 +18,7 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-asmlinkage void smp_threshold_interrupt(void)
+static void do_smp_threshold_interrupt(void)
{
irq_enter();
exit_idle();
@@ -27,3 +28,10 @@ asmlinkage void smp_threshold_interrupt(void)
/* Ack only at the end to avoid potential reentry */
ack_APIC_irq();
}
+
+asmlinkage void smp_threshold_interrupt(void) {
+ trace_arch_irq_vector_entry(THRESHOLD_APIC_VECTOR);
+ do_smp_threshold_interrupt();
+ trace_arch_irq_vector_exit(THRESHOLD_APIC_VECTOR);
+}
If I misunderstand something, please let me know.
Seiji
|
|
From: H. P. A. <hp...@zy...> - 2012-09-25 03:38:44
|
On 09/21/2012 05:40 PM, Seiji Aguchi wrote: > > Those x86 specific ones are not really frequently raised vectors, so > enabling them all won't affect performance and readability of the > traces too much. > ... except the cost can be reduced to zero *AND* be made into a more general mechanism by simply hooking the IDT. -hpa -- H. Peter Anvin, Intel Open Source Technology Center I work for Intel. I don't speak on their behalf. |
|
From: Seiji A. <sei...@hd...> - 2012-09-22 00:40:46
|
Change log v3 -> v4 - Add a latency measurement of each tracepoint - Rebased to 3.6-rc6 v2 -> v3 - Remove an invalidate_tlb_vector event because it was replaced by a call function vector in a following commit. http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=52aec3308db85f4e9f5c8b9f5dc4fbd0138c6fa4 v1 -> v2 - Modify variable name from irq to vector. - Merge arch-specific tracepoints below to an arch_irq_vector_entry/exit. - error_apic_vector - thermal_apic_vector - threshold_apic_vector - spurious_apic_vector - x86_platform_ipi_vector As Vaibhav explained in the thread below, tracepoints for irq vectors are useful. http://www.spinics.net/lists/mm-commits/msg85707.html <snip> The current interrupt traces from irq_handler_entry and irq_handler_exit provide when an interrupt is handled. They provide good data about when the system has switched to kernel space and how it affects the currently running processes. There are some IRQ vectors which trigger the system into kernel space, which are not handled in generic IRQ handlers. Tracing such events gives us the information about IRQ interaction with other system events. The trace also tells where the system is spending its time. We want to know which cores are handling interrupts and how they are affecting other processes in the system. Also, the trace provides information about when the cores are idle and which interrupts are changing that state. <snip> On the other hand, my usecase is tracing just local timer event and getting a value of instruction pointer. I suggested to add an argument local timer event to get instruction pointer before. But there is another way to get it with external module like systemtap. So, I don't need to add any argument to irq vector tracepoints now. Vaibhav's patch shared a trace point ,irq_vector_entry/irq_vector_exit, in all events. But there is an above use case to trace specific irq_vector rather than tracing all events. In this case, we are concerned about overhead due to unwanted events. This patch modifies Vaibhav's one as follows. - Separate generic, and across-architecture tracepoints to enable independently. - nmi_vector - local_timer_vector - reschedule_vector - call_function_vector - call_function_single_vector - irq_work_entry_vector - Rename architecture-specific tracepoints from irq_vector_entry/exit to arch_irq_vector_entry/exit. - error_apic_vector - thermal_apic_vector - threshold_apic_vector - spurious_apic_vector - x86_platform_ipi_vector Those x86 specific ones are not really frequently raised vectors, so enabling them all won't affect performance and readability of the traces too much. Latencies of each event are measured by enabling a latency tracer of ftrace because some of tracepoints are in performace critical pathes. In my machine, all tracepoints are an usec order in case where they are enabled. CPU: Xeon X3430 @ 2.40GHz kernel: 3.6-rc6 arch: x86_64 (1) local timer (1-1) local_timer_entry - 3.6-rc6 original <...>-2788 2dNh. 894834337us : exit_idle <-smp_apic_timer_interrupt <...>-2788 2dNh. 894834337us : hrtimer_interrupt <-smp_apic_timer_interrupt - 3.6-rc6 + this patch + trace off <...>-1981 0d.h. 210538us : exit_idle <-smp_apic_timer_interrupt <...>-1981 0d.h. 210538us : hrtimer_interrupt <-smp_apic_timer_interrupt - 3.6-rc6 + this patch + trace on cat-3158 0d.h. 169673us : exit_idle <-smp_apic_timer_interrupt *cat-3158 0d.h. 169674us : local_timer_entry: vector=239* cat-3158 0d.h. 169674us : hrtimer_interrupt <-smp_apic_timer_interrupt (1-2) local_timer_exit - 3.6-rc6 original <...>-2788 2dNh. 894834361us : lapic_next_event <-clockevents_program_event <...>-2788 2dNh. 894834361us : irq_exit <-smp_apic_timer_interrupt - 3.6-rc6 + this patch + trace off <...>-1981 0d.h. 210550us : lapic_next_event <-clockevents_program_event <...>-1981 0d.h. 210550us : irq_exit <-smp_apic_timer_interrupt - 3.6-rc6 + this patch + trace on cat-3158 0d.h. 169695us : lapic_next_event <-clockevents_program_event *cat-3158 0d.h. 169695us : local_timer_exit: vector=239* cat-3158 0d.h. 169695us : irq_exit <-smp_apic_timer_interrupt (2) reschedule (2-1) reschedule_entry - 3.6-rc6 original <idle>-0 2d... 897118426us : smp_reschedule_interrupt <-reschedule_interrupt <idle>-0 2d... 897118426us : scheduler_ipi <-smp_reschedule_interrupt - 3.6-rc6 + this patch + trace off <idle>-0 0d... 212565us : smp_reschedule_interrupt <-reschedule_interrupt <idle>-0 0d... 212565us : scheduler_ipi <-smp_reschedule_interrupt - 3.6-rc6 + this patch + trace on <idle>-0 0d... 181656us : smp_reschedule_interrupt <-reschedule_interrupt *<idle>-0 0d... 181656us : reschedule_entry: vector=253* <idle>-0 0d... 181657us : scheduler_ipi <-smp_reschedule_interrupt (2-2) reschedule_exit - 3.6-rc6 original <idle>-0 2d... 897118464us : rcu_idle_enter_common <-rcu_irq_exit <idle>-0 2d... 897118465us : trace_hardirqs_on_thunk <-restore_args - 3.6-rc6 + this patch + trace off <idle>-0 0d... 212703us : rcu_idle_enter_common <-rcu_irq_exit <idle>-0 0d... 212703us : trace_hardirqs_on_thunk <-restore_args - 3.6-rc6 + this patch + trace on <idle>-0 0d... 182711us : rcu_idle_enter_common <-rcu_irq_exit *<idle>-0 0d... 182712us : reschedule_exit: vector=253* <idle>-0 0d... 182712us : trace_hardirqs_on_thunk <-restore_args (3) call_function - 3.6-rc6 original <idle>-0 0dNs. 158960us : __local_bh_enable <-_local_bh_enable <idle>-0 0dNh. 158960us : generic_smp_call_function_interrupt <-smp_call_function_interrupt <idle>-0 0dNh. 158961us : irq_exit <-smp_call_function_interrupt - 3.6-rc6 + this patch + trace off <idle>-0 0d.s. 209117us : __local_bh_enable <-_local_bh_enable <idle>-0 0d.h. 209117us : generic_smp_call_function_interrupt <-smp_call_function_interrupt <idle>-0 0d.h. 209117us : irq_exit <-smp_call_function_interrupt - 3.6-rc6 + trace on <idle>-0 0d.s. 213068us : __local_bh_enable <-_local_bh_enable *<idle>-0 0d.h. 213069us : call_function_entry: vector=252* <idle>-0 0d.h. 213069us : generic_smp_call_function_interrupt <-smp_call_function_interrupt *<idle>-0 0d.h. 213069us : call_function_exit: vector=252* <idle>-0 0d.h. 213070us : irq_exit <-smp_call_function_interrupt (4) call_function_single - 3.6-rc6 original <idle>-0 0dNs. 158963us : __local_bh_enable <-_local_bh_enable <idle>-0 0dNh. 158963us : generic_smp_call_function_single_interrupt <-smp_call_function_single_interrupt <idle>-0 0dNh. 158963us : _raw_spin_lock <-generic_smp_call_function_single_interrupt <idle>-0 0dNh. 158963us : irq_exit <-smp_call_function_single_interrupt - 3.6-rc6 + this patch + trace off <idle>-0 0d.s. 209117us : __local_bh_enable <-_local_bh_enable <idle>-0 0d.h. 209117us : generic_smp_call_function_interrupt <-smp_call_function_interrupt <idle>-0 0d.h. 209117us : irq_exit <-smp_call_function_interrupt - 3.6-rc6 + this patch + trace on <idle>-0 0d.s. 213095us : __local_bh_enable <-_local_bh_enable *<idle>-0 0d.h. 213095us : call_function_single_entry: vector=251* <idle>-0 0d.h. 213095us : generic_smp_call_function_single_interrupt <-smp_call_function_single_interrupt <idle>-0 0d.h. 213095us : _raw_spin_lock <-generic_smp_call_function_single_interrupt *<idle>-0 0d.h. 213095us : call_function_single_exit: vector=251* <idle>-0 0d.h. 213096us : irq_exit <-smp_call_function_single_interrupt (5) irq_work - 3.6-rc6 original <idle>-0 0dNs. 158976us : __local_bh_enable <-_local_bh_enable <idle>-0 0dNh. 158976us : irq_exit <-smp_irq_work_interrupt - 3.6-rc6 + this patch + trace off <idle>-0 0d.s. 209163us : __local_bh_enable <-_local_bh_enable <idle>-0 0d.h. 209163us : irq_exit <-smp_irq_work_interrupt - 3.6-rc6 + this patch + trace on <idle>-0 0dNs. 181414us : __local_bh_enable <-_local_bh_enable *<idle>-0 0dNh. 181415us : irq_work_entry: vector=246* *<idle>-0 0dNh. 181415us : irq_work_exit: vector=246* <idle>-0 0dNh. 181416us : irq_exit <-smp_irq_work_interrupt (6) nmi - 3.6-rc6 original <...>-2584 0d... 54448291us : trace_hardirqs_on_caller <-sysret_check <...>-2584 0d.h. 54448303us : local_clock <-perf_event_update_userpage <...>-2584 0d.h. 54448303us : intel_pmu_enable_all <-intel_pmu_handle_irq <...>-2584 0d.h. 54448304us : intel_pmu_pebs_enable_all <-intel_pmu_enable_all <...>-2584 0d.h. 54448304us : intel_pmu_lbr_enable_all <-intel_pmu_enable_all <...>-2584 0d.h. 54448304us : arch_trigger_all_cpu_backtrace_handler <-nmi_handle <...>-2584 0d.h. 54448304us : rcu_nmi_exit <-do_nmi - 3.6-rc6 + this patch + trace off <...>-2581 1d.h. 46380362us : local_clock <-perf_event_update_userpage <...>-2581 1d.h. 46380363us : intel_pmu_enable_all <-intel_pmu_handle_irq <...>-2581 1d.h. 46380363us : intel_pmu_pebs_enable_all <-intel_pmu_enable_all <...>-2581 1d.h. 46380363us : intel_pmu_lbr_enable_all <-intel_pmu_enable_all <...>-2581 1d.h. 46380364us : arch_trigger_all_cpu_backtrace_handler <-nmi_handle <...>-2581 1d.h. 46380364us : rcu_nmi_exit <-do_nmi - 3.6-rc6 + this patch + trace on grep-2619 1d... 176854482us : trace_hardirqs_on_caller <-sysret_check *grep-2619 1d.h. 176854528us : nmi_entry: vector=2* grep-2619 1d.h. 176854531us : local_clock <-perf_event_update_userpage grep-2619 1d.h. 176854531us : intel_pmu_enable_all <-intel_pmu_handle_irq grep-2619 1d.h. 176854531us : intel_pmu_pebs_enable_all <-intel_pmu_enable_all grep-2619 1d.h. 176854532us : intel_pmu_lbr_enable_all <-intel_pmu_enable_all grep-2619 1d.h. 176854532us : arch_trigger_all_cpu_backtrace_handler <-nmi_handle *grep-2619 1d.h. 176854532us : nmi_exit: vector=2* grep-2619 1d.h. 176854532us : rcu_nmi_exit <-do_nmi (7) arch_irq_vector (7-1) arch_irq_vector_entry - 3.6-rc6 original <idle>-0 0dNh. 158949us : notifier_call_chain <-atomic_notifier_call_chain <idle>-0 0dNh. 158949us : printk <-smp_spurious_interrupt - 3.6-rc6 + this patch + trace off <idle>-0 0d.h. 209125us : notifier_call_chain <-atomic_notifier_call_chain <idle>-0 0d.h. 209125us : printk <-smp_spurious_interrupt - 3.6-rc6 + this patch + trace on <idle>-0 0d.h. 213076us : notifier_call_chain <-atomic_notifier_call_chain *<idle>-0 0d.h. 213076us : arch_irq_vector_entry: vector=255 name=SPURIOUS_APIC_VECTOR* <idle>-0 0d.h. 213076us : printk <-smp_spurious_interrupt (7-2) arch_irq_vector_exit - 3.6-rc6 original <idle>-0 0dNh. 158955us : _raw_spin_unlock_irqrestore <-console_unlock <idle>-0 0dNh. 158955us : irq_exit <-smp_spurious_interrupt - 3.6-rc6 + this patch + trace off <idle>-0 0d.h. 209131us : _raw_spin_unlock_irqrestore <-console_unlock <idle>-0 0d.h. 209131us : irq_exit <-smp_spurious_interrupt - 3.6-rc6 + this patch + trace on <idle>-0 0d.h. 213082us : _raw_spin_unlock_irqrestore <-console_unlock *<idle>-0 0d.h. 213082us : arch_irq_vector_exit: vector=255 name=SPURIOUS_APIC_VECTOR* <idle>-0 0d.h. 213083us : irq_exit <-smp_spurious_interrupt Signed-off-by: Seiji Aguchi <sei...@hd...> --- arch/x86/include/asm/irq_vectors.h | 9 ++ arch/x86/kernel/apic/apic.c | 7 + arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 + arch/x86/kernel/cpu/mcheck/threshold.c | 3 + arch/x86/kernel/irq.c | 5 + arch/x86/kernel/irq_work.c | 3 + arch/x86/kernel/nmi.c | 3 + arch/x86/kernel/smp.c | 7 + include/trace/events/irq_vectors.h | 209 ++++++++++++++++++++++++++++++ 9 files changed, 249 insertions(+), 0 deletions(-) create mode 100644 include/trace/events/irq_vectors.h diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 1508e51..510ced5 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -158,4 +158,13 @@ static inline int invalid_vm86_irq(int irq) # define NR_IRQS NR_IRQS_LEGACY #endif +#define irq_vector_name(vector) { vector, #vector } + +#define irq_vector_name_table \ + irq_vector_name(ERROR_APIC_VECTOR), \ + irq_vector_name(THERMAL_APIC_VECTOR), \ + irq_vector_name(THRESHOLD_APIC_VECTOR), \ + irq_vector_name(SPURIOUS_APIC_VECTOR), \ + irq_vector_name(X86_PLATFORM_IPI_VECTOR) + #endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 24deb30..b9cdd8f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -34,6 +34,7 @@ #include <linux/dmi.h> #include <linux/smp.h> #include <linux/mm.h> +#include <trace/events/irq_vectors.h> #include <asm/irq_remapping.h> #include <asm/perf_event.h> @@ -895,7 +896,9 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) */ irq_enter(); exit_idle(); + trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); + trace_local_timer_exit(LOCAL_TIMER_VECTOR); irq_exit(); set_irq_regs(old_regs); @@ -1881,6 +1884,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) irq_enter(); exit_idle(); + trace_arch_irq_vector_entry(SPURIOUS_APIC_VECTOR); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1895,6 +1899,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); + trace_arch_irq_vector_exit(SPURIOUS_APIC_VECTOR); irq_exit(); } @@ -1918,6 +1923,7 @@ void smp_error_interrupt(struct pt_regs *regs) irq_enter(); exit_idle(); + trace_arch_irq_vector_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); @@ -1938,6 +1944,7 @@ void smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); + trace_arch_irq_vector_exit(ERROR_APIC_VECTOR); irq_exit(); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 47a1870..63c2cc8 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <trace/events/irq_vectors.h> #include <asm/processor.h> #include <asm/apic.h> @@ -382,8 +383,10 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); exit_idle(); + trace_arch_irq_vector_entry(THERMAL_APIC_VECTOR); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); + trace_arch_irq_vector_exit(THERMAL_APIC_VECTOR); irq_exit(); /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index aa578ca..de74768 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -3,6 +3,7 @@ */ #include <linux/interrupt.h> #include <linux/kernel.h> +#include <trace/events/irq_vectors.h> #include <asm/irq_vectors.h> #include <asm/apic.h> @@ -21,8 +22,10 @@ asmlinkage void smp_threshold_interrupt(void) { irq_enter(); exit_idle(); + trace_arch_irq_vector_entry(THRESHOLD_APIC_VECTOR); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); + trace_arch_irq_vector_exit(THRESHOLD_APIC_VECTOR); irq_exit(); /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 1f5f1d5..f4d7344 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -18,6 +18,9 @@ #include <asm/mce.h> #include <asm/hw_irq.h> +#define CREATE_TRACE_POINTS +#include <trace/events/irq_vectors.h> + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ @@ -218,11 +221,13 @@ void smp_x86_platform_ipi(struct pt_regs *regs) exit_idle(); + trace_arch_irq_vector_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) x86_platform_ipi_callback(); + trace_arch_irq_vector_exit(X86_PLATFORM_IPI_VECTOR); irq_exit(); set_irq_regs(old_regs); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index ca8f703..2cf7505 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -8,13 +8,16 @@ #include <linux/irq_work.h> #include <linux/hardirq.h> #include <asm/apic.h> +#include <trace/events/irq_vectors.h> void smp_irq_work_interrupt(struct pt_regs *regs) { irq_enter(); ack_APIC_irq(); + trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); + trace_irq_work_exit(IRQ_WORK_VECTOR); irq_exit(); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f84f5c5..cc57aba 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -28,6 +28,7 @@ #include <asm/mach_traps.h> #include <asm/nmi.h> #include <asm/x86_init.h> +#include <trace/events/irq_vectors.h> struct nmi_desc { spinlock_t lock; @@ -482,12 +483,14 @@ do_nmi(struct pt_regs *regs, long error_code) nmi_nesting_preprocess(regs); nmi_enter(); + trace_nmi_entry(NMI_VECTOR); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); + trace_nmi_exit(NMI_VECTOR); nmi_exit(); /* On i386, may loop back to preprocess */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 48d2b7d..5b2d6de 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -23,6 +23,7 @@ #include <linux/interrupt.h> #include <linux/cpu.h> #include <linux/gfp.h> +#include <trace/events/irq_vectors.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> @@ -252,8 +253,10 @@ finish: void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); /* * KVM uses this interrupt to force a cpu out of guest mode */ @@ -263,8 +266,10 @@ void smp_call_function_interrupt(struct pt_regs *regs) { ack_APIC_irq(); irq_enter(); + trace_call_function_entry(CALL_FUNCTION_VECTOR); generic_smp_call_function_interrupt(); inc_irq_stat(irq_call_count); + trace_call_function_exit(CALL_FUNCTION_VECTOR); irq_exit(); } @@ -272,8 +277,10 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) { ack_APIC_irq(); irq_enter(); + trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); + trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); irq_exit(); } diff --git a/include/trace/events/irq_vectors.h b/include/trace/events/irq_vectors.h new file mode 100644 index 0000000..fffe0c0 --- /dev/null +++ b/include/trace/events/irq_vectors.h @@ -0,0 +1,209 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq_vectors + +#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IRQ_VECTORS_H + +#include <linux/tracepoint.h> +#include <asm/irq.h> + +#ifndef irq_vector_name_table +#define irq_vector_name_table { -1, NULL } +#endif + + +/* + * This class is used by generic ,cross-architecture tracepoints. + */ +DECLARE_EVENT_CLASS(irq_vector, + + TP_PROTO(int vector), + + TP_ARGS(vector), + + TP_STRUCT__entry( + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + ), + + TP_printk("vector=%d", __entry->vector) +); + +/* + * nmi_entry - called before enterring a nmi vector handler + */ +DEFINE_EVENT(irq_vector, nmi_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * nmi_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, nmi_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * local_timer_entry - called before enterring a local timer interrupt + * vector handler + */ +DEFINE_EVENT(irq_vector, local_timer_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * local_timer_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, local_timer_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * reschedule_entry - called before enterring a reschedule vector handler + */ +DEFINE_EVENT(irq_vector, reschedule_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * reschedule_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, reschedule_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * call_function_entry - called before enterring a call function + * vector handler + */ +DEFINE_EVENT(irq_vector, call_function_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * call_function_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, call_function_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * call_function_single_entry - called before enterring a call function + * single vector handler + */ +DEFINE_EVENT(irq_vector, call_function_single_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * call_function_single_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, call_function_single_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * irq_work_entry - called before enterring an irq work vector handler + */ +DEFINE_EVENT(irq_vector, irq_work_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * irq_work_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, irq_work_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * This class is used by arch-specific tracepoints. + */ +DECLARE_EVENT_CLASS(arch_irq_vector, + + TP_PROTO(int vector), + + TP_ARGS(vector), + + TP_STRUCT__entry( + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + ), + + TP_printk("vector=%d name=%s", __entry->vector, + __print_symbolic(__entry->vector, irq_vector_name_table)) +); + +/* + * arch_irq_vector_entry - called before enterring a interrupt vector handler + */ +DEFINE_EVENT(arch_irq_vector, arch_irq_vector_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * arch_irq_vector_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(arch_irq_vector, arch_irq_vector_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +#endif /* _TRACE_IRQ_VECTORS_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> -- 1.7.1 |
|
From: Seiji A. <sei...@hd...> - 2012-09-12 13:38:05
|
Hi, I will submit some performance testing result. Thanks! Seiji > -----Original Message----- > From: H. Peter Anvin [mailto:hp...@zy...] > Sent: Wednesday, September 12, 2012 12:18 AM > To: Seiji Aguchi > Cc: Thomas Gleixner (tg...@li...); lin...@vg...; ro...@go...; 'mi...@el...' (mi...@el...); > x8...@ke...; dle...@li...; Satoru Moriya > Subject: Re: [Resend][PATCH V3] trace,x86: add x86 irq vector tracepoints > > On 09/11/2012 05:00 PM, Seiji Aguchi wrote: > > Thomas, > > > > Please review my patch as we talked in Plumbers. > > > > Is there any measurable latency added here? These are some of the most > performance- (or at least latency-)critical paths in the kernel. > > -hpa > > -- > H. Peter Anvin, Intel Open Source Technology Center I work for Intel. I don't speak on their behalf. |
|
From: H. P. A. <hp...@zy...> - 2012-09-12 04:18:44
|
On 09/11/2012 05:00 PM, Seiji Aguchi wrote: > Thomas, > > Please review my patch as we talked in Plumbers. > Is there any measurable latency added here? These are some of the most performance- (or at least latency-)critical paths in the kernel. -hpa -- H. Peter Anvin, Intel Open Source Technology Center I work for Intel. I don't speak on their behalf. |
|
From: Seiji A. <sei...@hd...> - 2012-09-12 00:00:54
|
Thomas, Please review my patch as we talked in Plumbers. Seiji > -----Original Message----- > From: Seiji Aguchi > Sent: Friday, August 24, 2012 11:22 AM > To: Thomas Gleixner (tg...@li...) > Cc: lin...@vg...; ro...@go...; 'mi...@el...' (mi...@el...); x8...@ke...; dle- > de...@li...; Satoru Moriya > Subject: [Resend][PATCH V3] trace,x86: add x86 irq vector tracepoints > > Tomas, > > It is helpful if you can review this patch. > > Change log > v2 -> v3 > - Remove an invalidate_tlb_vector event because it was replaced by a call function vector > in a following commit. > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=52aec3308db85f4e9f5c8b9f5dc4fbd0138c6fa4 > > v1 -> v2 > - Modify variable name from irq to vector. > - Merge arch-specific tracepoints below to an arch_irq_vector_entry/exit. > - error_apic_vector > - thermal_apic_vector > - threshold_apic_vector > - spurious_apic_vector > - x86_platform_ipi_vector > > As Vaibhav explained in the thread below, tracepoints for irq vectors are useful. > > http://www.spinics.net/lists/mm-commits/msg85707.html > > <snip> > The current interrupt traces from irq_handler_entry and irq_handler_exit provide when an interrupt is handled. They provide good > data about when the system has switched to kernel space and how it affects the currently running processes. > > There are some IRQ vectors which trigger the system into kernel space, which are not handled in generic IRQ handlers. Tracing such > events gives us the information about IRQ interaction with other system events. > > The trace also tells where the system is spending its time. We want to know which cores are handling interrupts and how they are > affecting other processes in the system. Also, the trace provides information about when the cores are idle and which interrupts are > changing that state. > <snip> > > On the other hand, my usecase is tracing just local timer event and getting a value of instruction pointer. > > I suggested to add an argument local timer event to get instruction pointer before. > But there is another way to get it with external module like systemtap. > So, I don't need to add any argument to irq vector tracepoints now. > > Vaibhav's patch shared a trace point ,irq_vector_entry/irq_vector_exit, in all events. > But there is an above use case to trace specific irq_vector rather than tracing all events. > In this case, we are concerned about overhead due to unwanted events. > > This patch modifies Vaibhav's one as follows. > - Separate generic, and across-architecture tracepoints to enable independently. > - nmi_vector > - local_timer_vector > - reschedule_vector > - call_function_vector > - call_function_single_vector > - irq_work_entry_vector > > - Rename architecture-specific tracepoints from irq_vector_entry/exit to > arch_irq_vector_entry/exit. > - error_apic_vector > - thermal_apic_vector > - threshold_apic_vector > - spurious_apic_vector > - x86_platform_ipi_vector > > Those x86 specific ones are not really frequently raised vectors, so > enabling them all won't affect performance and readability of the > traces too much. > > Signed-off-by: Seiji Aguchi <sei...@hd...> > > --- > arch/x86/include/asm/irq_vectors.h | 9 ++ > arch/x86/kernel/apic/apic.c | 7 + > arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 + > arch/x86/kernel/cpu/mcheck/threshold.c | 3 + > arch/x86/kernel/irq.c | 5 + > arch/x86/kernel/irq_work.c | 3 + > arch/x86/kernel/nmi.c | 3 + > arch/x86/kernel/smp.c | 7 + > include/trace/events/irq_vectors.h | 209 ++++++++++++++++++++++++++++++ > 9 files changed, 249 insertions(+), 0 deletions(-) create mode 100644 include/trace/events/irq_vectors.h > > diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h > index 1508e51..510ced5 100644 > --- a/arch/x86/include/asm/irq_vectors.h > +++ b/arch/x86/include/asm/irq_vectors.h > @@ -158,4 +158,13 @@ static inline int invalid_vm86_irq(int irq) > # define NR_IRQS NR_IRQS_LEGACY > #endif > > +#define irq_vector_name(vector) { vector, #vector } > + > +#define irq_vector_name_table \ > + irq_vector_name(ERROR_APIC_VECTOR), \ > + irq_vector_name(THERMAL_APIC_VECTOR), \ > + irq_vector_name(THRESHOLD_APIC_VECTOR), \ > + irq_vector_name(SPURIOUS_APIC_VECTOR), \ > + irq_vector_name(X86_PLATFORM_IPI_VECTOR) > + > #endif /* _ASM_X86_IRQ_VECTORS_H */ > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 24deb30..b9cdd8f 100644 > --- a/arch/x86/kernel/apic/apic.c > +++ b/arch/x86/kernel/apic/apic.c > @@ -34,6 +34,7 @@ > #include <linux/dmi.h> > #include <linux/smp.h> > #include <linux/mm.h> > +#include <trace/events/irq_vectors.h> > > #include <asm/irq_remapping.h> > #include <asm/perf_event.h> > @@ -895,7 +896,9 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) > */ > irq_enter(); > exit_idle(); > + trace_local_timer_entry(LOCAL_TIMER_VECTOR); > local_apic_timer_interrupt(); > + trace_local_timer_exit(LOCAL_TIMER_VECTOR); > irq_exit(); > > set_irq_regs(old_regs); > @@ -1881,6 +1884,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) > > irq_enter(); > exit_idle(); > + trace_arch_irq_vector_entry(SPURIOUS_APIC_VECTOR); > /* > * Check if this really is a spurious interrupt and ACK it > * if it is a vectored one. Just in case... > @@ -1895,6 +1899,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) > /* see sw-dev-man vol 3, chapter 7.4.13.5 */ > pr_info("spurious APIC interrupt on CPU#%d, " > "should never happen.\n", smp_processor_id()); > + trace_arch_irq_vector_exit(SPURIOUS_APIC_VECTOR); > irq_exit(); > } > > @@ -1918,6 +1923,7 @@ void smp_error_interrupt(struct pt_regs *regs) > > irq_enter(); > exit_idle(); > + trace_arch_irq_vector_entry(ERROR_APIC_VECTOR); > /* First tickle the hardware, only then report what went on. -- REW */ > v0 = apic_read(APIC_ESR); > apic_write(APIC_ESR, 0); > @@ -1938,6 +1944,7 @@ void smp_error_interrupt(struct pt_regs *regs) > > apic_printk(APIC_DEBUG, KERN_CONT "\n"); > > + trace_arch_irq_vector_exit(ERROR_APIC_VECTOR); > irq_exit(); > } > > diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c > index 47a1870..63c2cc8 100644 > --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c > +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c > @@ -23,6 +23,7 @@ > #include <linux/init.h> > #include <linux/smp.h> > #include <linux/cpu.h> > +#include <trace/events/irq_vectors.h> > > #include <asm/processor.h> > #include <asm/apic.h> > @@ -382,8 +383,10 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { > irq_enter(); > exit_idle(); > + trace_arch_irq_vector_entry(THERMAL_APIC_VECTOR); > inc_irq_stat(irq_thermal_count); > smp_thermal_vector(); > + trace_arch_irq_vector_exit(THERMAL_APIC_VECTOR); > irq_exit(); > /* Ack only at the end to avoid potential reentry */ > ack_APIC_irq(); > diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c > index aa578ca..de74768 100644 > --- a/arch/x86/kernel/cpu/mcheck/threshold.c > +++ b/arch/x86/kernel/cpu/mcheck/threshold.c > @@ -3,6 +3,7 @@ > */ > #include <linux/interrupt.h> > #include <linux/kernel.h> > +#include <trace/events/irq_vectors.h> > > #include <asm/irq_vectors.h> > #include <asm/apic.h> > @@ -21,8 +22,10 @@ asmlinkage void smp_threshold_interrupt(void) { > irq_enter(); > exit_idle(); > + trace_arch_irq_vector_entry(THRESHOLD_APIC_VECTOR); > inc_irq_stat(irq_threshold_count); > mce_threshold_vector(); > + trace_arch_irq_vector_exit(THRESHOLD_APIC_VECTOR); > irq_exit(); > /* Ack only at the end to avoid potential reentry */ > ack_APIC_irq(); > diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 1f5f1d5..f4d7344 100644 > --- a/arch/x86/kernel/irq.c > +++ b/arch/x86/kernel/irq.c > @@ -18,6 +18,9 @@ > #include <asm/mce.h> > #include <asm/hw_irq.h> > > +#define CREATE_TRACE_POINTS > +#include <trace/events/irq_vectors.h> > + > atomic_t irq_err_count; > > /* Function pointer for generic interrupt vector handling */ @@ -218,11 +221,13 @@ void smp_x86_platform_ipi(struct pt_regs *regs) > > exit_idle(); > > + trace_arch_irq_vector_entry(X86_PLATFORM_IPI_VECTOR); > inc_irq_stat(x86_platform_ipis); > > if (x86_platform_ipi_callback) > x86_platform_ipi_callback(); > > + trace_arch_irq_vector_exit(X86_PLATFORM_IPI_VECTOR); > irq_exit(); > > set_irq_regs(old_regs); > diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index ca8f703..2cf7505 100644 > --- a/arch/x86/kernel/irq_work.c > +++ b/arch/x86/kernel/irq_work.c > @@ -8,13 +8,16 @@ > #include <linux/irq_work.h> > #include <linux/hardirq.h> > #include <asm/apic.h> > +#include <trace/events/irq_vectors.h> > > void smp_irq_work_interrupt(struct pt_regs *regs) { > irq_enter(); > ack_APIC_irq(); > + trace_irq_work_entry(IRQ_WORK_VECTOR); > inc_irq_stat(apic_irq_work_irqs); > irq_work_run(); > + trace_irq_work_exit(IRQ_WORK_VECTOR); > irq_exit(); > } > > diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f84f5c5..cc57aba 100644 > --- a/arch/x86/kernel/nmi.c > +++ b/arch/x86/kernel/nmi.c > @@ -28,6 +28,7 @@ > #include <asm/mach_traps.h> > #include <asm/nmi.h> > #include <asm/x86_init.h> > +#include <trace/events/irq_vectors.h> > > struct nmi_desc { > spinlock_t lock; > @@ -482,12 +483,14 @@ do_nmi(struct pt_regs *regs, long error_code) > nmi_nesting_preprocess(regs); > > nmi_enter(); > + trace_nmi_entry(NMI_VECTOR); > > inc_irq_stat(__nmi_count); > > if (!ignore_nmis) > default_do_nmi(regs); > > + trace_nmi_exit(NMI_VECTOR); > nmi_exit(); > > /* On i386, may loop back to preprocess */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index > 48d2b7d..5b2d6de 100644 > --- a/arch/x86/kernel/smp.c > +++ b/arch/x86/kernel/smp.c > @@ -23,6 +23,7 @@ > #include <linux/interrupt.h> > #include <linux/cpu.h> > #include <linux/gfp.h> > +#include <trace/events/irq_vectors.h> > > #include <asm/mtrr.h> > #include <asm/tlbflush.h> > @@ -252,8 +253,10 @@ finish: > void smp_reschedule_interrupt(struct pt_regs *regs) { > ack_APIC_irq(); > + trace_reschedule_entry(RESCHEDULE_VECTOR); > inc_irq_stat(irq_resched_count); > scheduler_ipi(); > + trace_reschedule_exit(RESCHEDULE_VECTOR); > /* > * KVM uses this interrupt to force a cpu out of guest mode > */ > @@ -263,8 +266,10 @@ void smp_call_function_interrupt(struct pt_regs *regs) { > ack_APIC_irq(); > irq_enter(); > + trace_call_function_entry(CALL_FUNCTION_VECTOR); > generic_smp_call_function_interrupt(); > inc_irq_stat(irq_call_count); > + trace_call_function_exit(CALL_FUNCTION_VECTOR); > irq_exit(); > } > > @@ -272,8 +277,10 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) { > ack_APIC_irq(); > irq_enter(); > + trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); > generic_smp_call_function_single_interrupt(); > inc_irq_stat(irq_call_count); > + trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); > irq_exit(); > } > > diff --git a/include/trace/events/irq_vectors.h b/include/trace/events/irq_vectors.h > new file mode 100644 > index 0000000..fffe0c0 > --- /dev/null > +++ b/include/trace/events/irq_vectors.h > @@ -0,0 +1,209 @@ > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM irq_vectors > + > +#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ) > +#define _TRACE_IRQ_VECTORS_H > + > +#include <linux/tracepoint.h> > +#include <asm/irq.h> > + > +#ifndef irq_vector_name_table > +#define irq_vector_name_table { -1, NULL } #endif > + > + > +/* > + * This class is used by generic ,cross-architecture tracepoints. > + */ > +DECLARE_EVENT_CLASS(irq_vector, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector), > + > + TP_STRUCT__entry( > + __field( int, vector ) > + ), > + > + TP_fast_assign( > + __entry->vector = vector; > + ), > + > + TP_printk("vector=%d", __entry->vector) ); > + > +/* > + * nmi_entry - called before enterring a nmi vector handler */ > +DEFINE_EVENT(irq_vector, nmi_entry, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * nmi_exit - called immediately after the interrupt vector > + * handler returns > + */ > +DEFINE_EVENT(irq_vector, nmi_exit, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * local_timer_entry - called before enterring a local timer interrupt > + * vector handler > + */ > +DEFINE_EVENT(irq_vector, local_timer_entry, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * local_timer_exit - called immediately after the interrupt vector > + * handler returns > + */ > +DEFINE_EVENT(irq_vector, local_timer_exit, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * reschedule_entry - called before enterring a reschedule vector > +handler */ DEFINE_EVENT(irq_vector, reschedule_entry, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * reschedule_exit - called immediately after the interrupt vector > + * handler returns > + */ > +DEFINE_EVENT(irq_vector, reschedule_exit, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * call_function_entry - called before enterring a call function > + * vector handler > + */ > +DEFINE_EVENT(irq_vector, call_function_entry, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * call_function_exit - called immediately after the interrupt vector > + * handler returns > + */ > +DEFINE_EVENT(irq_vector, call_function_exit, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * call_function_single_entry - called before enterring a call function > + * single vector handler > + */ > +DEFINE_EVENT(irq_vector, call_function_single_entry, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * call_function_single_exit - called immediately after the interrupt > +vector > + * handler returns > + */ > +DEFINE_EVENT(irq_vector, call_function_single_exit, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * irq_work_entry - called before enterring an irq work vector handler > +*/ DEFINE_EVENT(irq_vector, irq_work_entry, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * irq_work_exit - called immediately after the interrupt vector > + * handler returns > + */ > +DEFINE_EVENT(irq_vector, irq_work_exit, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * This class is used by arch-specific tracepoints. > + */ > +DECLARE_EVENT_CLASS(arch_irq_vector, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector), > + > + TP_STRUCT__entry( > + __field( int, vector ) > + ), > + > + TP_fast_assign( > + __entry->vector = vector; > + ), > + > + TP_printk("vector=%d name=%s", __entry->vector, > + __print_symbolic(__entry->vector, irq_vector_name_table)) ); > + > +/* > + * arch_irq_vector_entry - called before enterring a interrupt vector > +handler */ DEFINE_EVENT(arch_irq_vector, arch_irq_vector_entry, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +/* > + * arch_irq_vector_exit - called immediately after the interrupt vector > + * handler returns > + */ > +DEFINE_EVENT(arch_irq_vector, arch_irq_vector_exit, > + > + TP_PROTO(int vector), > + > + TP_ARGS(vector) > +); > + > +#endif /* _TRACE_IRQ_VECTORS_H */ > + > +/* This part must be outside protection */ #include > +<trace/define_trace.h> > -- 1.7.1 |
|
From: Seiji A. <sei...@hd...> - 2012-09-04 22:14:50
|
Matthew, Could you please review this patchset? I talked to Tony about it in Plumbers conference last week and he told that your review is needed because it is efi-specific. Seiji > -----Original Message----- > From: lin...@vg... [mailto:lin...@vg...] On Behalf Of Seiji Aguchi > Sent: Tuesday, September 04, 2012 6:06 PM > To: Mike Waychison; lin...@vg...; Luck, Tony (ton...@in...); Matthew Garrett (mj...@re...); > dz...@re...; lin...@vg... > Cc: dle...@li...; Satoru Moriya > Subject: [RESEND][PATCH v4 0/3] make efivars/efi_pstore interrupt-safe > > Changelog > v3 -> v4 > - Patch 2/3 > Move cancel_work_sync() above an efi_enabled test in efivars_exit(). > > v2 -> v3 > - Patch 1/3 > Replace spin_lock_irqsave/spin_unlock_irqrestore with spin_lock_irq/spin_unlock_irq in efivars_unregister(), > efivar_create(), efivar_store_raw() and efivar_delete() which are called in a process context. > > - Patch 2/3 > Change a name of delete_sysfs_entry() to delete_all_stale_sysfs_entries(). > Also, don't release an efivar->lock while searching efivar->list in delete_all_stale_sysfs_entries(). > > - Patch 3/3 > Remove a logic in efi_pstore_erase() which freshly created in patch v2. > > v1 -> v2 > - Patch 1/3 > Add spin_lock_irq/spin_unlock_irq to open/close callbacks of efi_pstore > instead of moving spin_locks to a read callback. > > - Patch 2/3 > Replace a periodical timer with schedule_work(). > > - Patch 3/3 > freshly create to kick a workqueue in oops case only. > > [Problem] > There are following problems related to an interrupt context in efivar/efi_pstore. > > Currently, efivars enables interrupt while taking efivars->lock. > So, there is a risk to be deadlocking in a write callback of efi_pstore if kernel panics in interrupt context while taking efi_lock. > > Also, efi_pstore creates sysfs entries ,which enable users to access to NVRAM, in a write callback. > If a kernel panic happens in interrupt contexts, pstore may fail because it could sleep due to dynamic memory allocations during > creating sysfs entries. > > To resolve the problems above, a goal of this patchset is making efivars/efi_pstore interrupt-safe. > > [Patch Description] > Patch 1/3 efivars: Disable external interrupt while holding efivars->lock > This patch replaces spin_lock/spin_unlock with spin_lock_irqsave/spin_lock_irqrestore to make efivars interrupt safe > > Patch 2/3 efi_pstore: Introducing workqueue updating sysfs entries > This patch removes sysfs operations from write callback by introducing a workqueue updating sysfs entries > > Patch 3/3 efi_pstore: Skiping scheduling a workqueue in cases other than oops > This patch restricts a schedule of a workqueue in case where users erase entries or oops happen which is truly needed for users. > > drivers/firmware/efivars.c | 167 +++++++++++++++++++++++++++++++++++-------- > include/linux/efi.h | 3 +- > 2 files changed, 138 insertions(+), 32 deletions(-) > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to maj...@vg... More > majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ |
|
From: Seiji A. <sei...@hd...> - 2012-09-04 22:08:28
|
[Problem] efi_pstore creates sysfs files when logging kernel messages to NVRAM. Currently, the sysfs files are updated in a workqueue which is registered in a write callback. On the other hand, situations which users need the sysfs files are when they erase entries or oops happen because a system will be down and users can't access to sysfs files in other cases like panic, reboot or emergency_restart. Also, if kernel panics due to a bug of workqueue operations and a write callback of efi_pstore is called in panic case, efi_pstore may fail due to a failure of schedule_work(). And panic_notifier_chain()/emergency_restart() is not kicked if efi_pstore fails. This may cause user's unwanted result. [Patch Description] This patch registers a workqueue updating sysfs entries in cases where users erase entries or oops happen only, and skips it in other cases like panic, reboot or emergency_start. Signed-off-by: Seiji Aguchi <sei...@hd...> --- drivers/firmware/efivars.c | 7 ++++++- 1 files changed, 6 insertions(+), 1 deletions(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index cd16ea6..d5911fd 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -746,7 +746,12 @@ static int efi_pstore_write(enum pstore_type_id type, spin_unlock_irqrestore(&efivars->lock, flags); - schedule_work(&efivar_work); + /* + * The user may want to update sysfs for this write + * when they erase an entry via /dev/pstore or oops happen. + */ + if (!size || reason == KMSG_DUMP_OOPS) + schedule_work(&efivar_work); *id = part; return ret; -- 1.7.1 |
|
From: Seiji A. <sei...@hd...> - 2012-09-04 22:07:51
|
[Problem]
efi_pstore creates sysfs entries ,which enable users to access to NVRAM,
in a write callback. If a kernel panic happens in interrupt contexts, pstore may
fail because it could sleep due to dynamic memory allocations during creating sysfs entries.
[Patch Description]
This patch removes sysfs operations from write callback by introducing a workqueue
updating sysfs entries which is kicked after a write callback of efi_pstore is called.
efi_pstore will be robust against a kernel panic in an interrupt context with it.
The workqueue works as follows.
- A timer is registered during an initialization of efivars module.
- A flag, update_sysfs_entries, is checked periodically with the timer.
- The flag is set in a write callback, efi_pstore_write().
- Operation updating sysfs entries is kicked if the flag is set.
Any operations ,like registering a timer, are not added in a write callback because it may run
in panic case and fail due to operations related to workqueue.
To make efi_pstore work in panic case, a write callback should just log to NVRAM.
Signed-off-by: Seiji Aguchi <sei...@hd...>
---
drivers/firmware/efivars.c | 119 +++++++++++++++++++++++++++++++++++++++----
include/linux/efi.h | 3 +-
2 files changed, 110 insertions(+), 12 deletions(-)
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index bd1df01..cd16ea6 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -146,6 +146,13 @@ efivar_create_sysfs_entry(struct efivars *efivars,
efi_char16_t *variable_name,
efi_guid_t *vendor_guid);
+/*
+ * Prototype for workqueue functions updating sysfs entry
+ */
+
+static void efivar_update_sysfs_entry(struct work_struct *);
+static DECLARE_WORK(efivar_work, efivar_update_sysfs_entry);
+
/* Return the number of unicode characters in data */
static unsigned long
utf16_strnlen(efi_char16_t *s, size_t maxlength)
@@ -731,9 +738,6 @@ static int efi_pstore_write(enum pstore_type_id type,
0, NULL);
}
- if (found)
- list_del(&found->list);
-
for (i = 0; i < DUMP_NAME_LEN; i++)
efi_name[i] = name[i];
@@ -742,14 +746,7 @@ static int efi_pstore_write(enum pstore_type_id type,
spin_unlock_irqrestore(&efivars->lock, flags);
- if (found)
- efivar_unregister(found);
-
- if (size)
- ret = efivar_create_sysfs_entry(efivars,
- utf16_strsize(efi_name,
- DUMP_NAME_LEN * 2),
- efi_name, &vendor);
+ schedule_work(&efivar_work);
*id = part;
return ret;
@@ -1200,6 +1197,104 @@ EXPORT_SYMBOL_GPL(register_efivars);
static struct efivars __efivars;
static struct efivar_operations ops;
+static void delete_all_stale_sysfs_entries(void)
+{
+ struct efivars *efivars = &__efivars;
+ struct efivar_entry *entry, *n, *found;
+ efi_status_t status;
+ unsigned long flags;
+
+ while (1) {
+ found = NULL;
+ spin_lock_irqsave(&efivars->lock, flags);
+ list_for_each_entry_safe(entry, n, &efivars->list, list) {
+ status = get_var_data_locked(efivars, &entry->var);
+ if (status != EFI_SUCCESS) {
+ found = entry;
+ list_del(&entry->list);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&efivars->lock, flags);
+ if (found)
+ efivar_unregister(entry);
+ else
+ break;
+ }
+}
+
+static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor)
+{
+ struct efivar_entry *entry, *n;
+ struct efivars *efivars = &__efivars;
+ unsigned long strsize1, strsize2;
+ bool found = false;
+
+ strsize1 = utf16_strsize(variable_name, 1024);
+ list_for_each_entry_safe(entry, n, &efivars->list, list) {
+ strsize2 = utf16_strsize(entry->var.VariableName, 1024);
+ if (strsize1 == strsize2 &&
+ !memcmp(variable_name, &(entry->var.VariableName),
+ strsize2) &&
+ !efi_guidcmp(entry->var.VendorGuid,
+ *vendor)) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
+static void efivar_update_sysfs_entry(struct work_struct *work)
+{
+ struct efivars *efivars = &__efivars;
+ efi_guid_t vendor;
+ efi_char16_t *variable_name;
+ unsigned long variable_name_size = 1024, flags;
+ efi_status_t status = EFI_NOT_FOUND;
+ bool found;
+
+ /* Delete stale sysfs entries */
+ delete_all_stale_sysfs_entries();
+
+ /* Add new sysfs entries */
+ while (1) {
+ variable_name = kzalloc(variable_name_size, GFP_KERNEL);
+ if (!variable_name) {
+ pr_err("efivars: Memory allocation failed.\n");
+ return;
+ }
+
+ spin_lock_irqsave(&efivars->lock, flags);
+ found = false;
+ while (1) {
+ variable_name_size = 1024;
+ status = efivars->ops->get_next_variable(
+ &variable_name_size,
+ variable_name,
+ &vendor);
+ if (status != EFI_SUCCESS) {
+ break;
+ } else {
+ if (!variable_is_present(variable_name,
+ &vendor)) {
+ found = true;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&efivars->lock, flags);
+
+ if (!found) {
+ kfree(variable_name);
+ break;
+ } else
+ efivar_create_sysfs_entry(efivars,
+ variable_name_size,
+ variable_name, &vendor);
+ }
+}
+
/*
* For now we register the efi subsystem with the firmware subsystem
* and the vars subsystem with the efi subsystem. In the future, it
@@ -1254,6 +1349,8 @@ err_put:
static void __exit
efivars_exit(void)
{
+ cancel_work_sync(&efivar_work);
+
if (efi_enabled) {
unregister_efivars(&__efivars);
kobject_put(efi_kobj);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index ec45ccd..8e03cc0 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -643,7 +643,8 @@ struct efivars {
* 1) ->list - adds, removals, reads, writes
* 2) ops.[gs]et_variable() calls.
* It must not be held when creating sysfs entries or calling kmalloc.
- * ops.get_next_variable() is only called from register_efivars(),
+ * ops.get_next_variable() is only called from register_efivars()
+ * or efivar_update_sysfs_entry(),
* which is protected by the BKL, so that path is safe.
*/
spinlock_t lock;
--
1.7.1
|
|
From: Seiji A. <sei...@hd...> - 2012-09-04 22:07:25
|
efivars: Disable external interrupt while holding efivars->lock
[Problem]
Currently, efivars doesn't disable interrupt while taking efivars->lock.
So, there is a risk to be deadlocking in a write callback of efi_pstore
if kernel panics in interrupt context while taking efivars->lock.
[Patch Description]
This patch disables an external interruption while holding efivars->lock by
replacing spin_lock/spin_unlock with spin_lock_irqsave/spin_unlock_irqrestore.
As for spin_locks in open/close callbacks of efi_pstore, they moved to a read callback
to pass a flags argument to spin_lock_irqsave/spin_unlock_irqrestore.
Even if spin_locks in open/close callbacks are erased, a whole process of open/read/close
is still serialized by psinfo->read_mutex which is taken in a pstore layer.
Also, initialization of efivars->walk_entry moved to a read callback to protect list_first_entry()
with efivars->lock. And some comments about efivars->walk_entry are added to avoid using it
Signed-off-by: Seiji Aguchi <sei...@hd...>
---
drivers/firmware/efivars.c | 43 +++++++++++++++++++++++--------------------
1 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 47408e8..bd1df01 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -393,10 +393,11 @@ static efi_status_t
get_var_data(struct efivars *efivars, struct efi_variable *var)
{
efi_status_t status;
+ unsigned long flags;
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
status = get_var_data_locked(efivars, var);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: get_variable() failed 0x%lx!\n",
@@ -514,14 +515,14 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count)
return -EINVAL;
}
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
status = efivars->ops->set_variable(new_var->VariableName,
&new_var->VendorGuid,
new_var->Attributes,
new_var->DataSize,
new_var->Data);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
@@ -632,7 +633,7 @@ static int efi_pstore_open(struct pstore_info *psi)
{
struct efivars *efivars = psi->data;
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
efivars->walk_entry = list_first_entry(&efivars->list,
struct efivar_entry, list);
return 0;
@@ -642,7 +643,7 @@ static int efi_pstore_close(struct pstore_info *psi)
{
struct efivars *efivars = psi->data;
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return 0;
}
@@ -696,11 +697,12 @@ static int efi_pstore_write(enum pstore_type_id type,
struct efivars *efivars = psi->data;
struct efivar_entry *entry, *found = NULL;
int i, ret = 0;
+ unsigned long flags;
sprintf(stub_name, "dump-type%u-%u-", type, part);
sprintf(name, "%s%lu", stub_name, get_seconds());
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
for (i = 0; i < DUMP_NAME_LEN; i++)
efi_name[i] = stub_name[i];
@@ -738,7 +740,7 @@ static int efi_pstore_write(enum pstore_type_id type,
efivars->ops->set_variable(efi_name, &vendor, PSTORE_EFI_ATTRIBUTES,
size, psi->buf);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
if (found)
efivar_unregister(found);
@@ -822,7 +824,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
return -EINVAL;
}
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
/*
* Does this variable already exist?
@@ -840,7 +842,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
}
}
if (found) {
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EINVAL;
}
@@ -854,10 +856,10 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
status);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EIO;
}
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
/* Create the entry in sysfs. Locking is not required here */
status = efivar_create_sysfs_entry(efivars,
@@ -885,7 +887,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
/*
* Does this variable already exist?
@@ -903,7 +905,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
}
}
if (!found) {
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EINVAL;
}
/* force the Attributes/DataSize to 0 to ensure deletion */
@@ -919,12 +921,12 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
status);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EIO;
}
list_del(&search_efivar->list);
/* We need to release this lock before unregistering. */
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
efivar_unregister(search_efivar);
/* It's dead Jim.... */
@@ -993,6 +995,7 @@ efivar_create_sysfs_entry(struct efivars *efivars,
int i, short_name_size = variable_name_size / sizeof(efi_char16_t) + 38;
char *short_name;
struct efivar_entry *new_efivar;
+ unsigned long flags;
short_name = kzalloc(short_name_size + 1, GFP_KERNEL);
new_efivar = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
@@ -1032,9 +1035,9 @@ efivar_create_sysfs_entry(struct efivars *efivars,
kfree(short_name);
short_name = NULL;
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
list_add(&new_efivar->list, &efivars->list);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
return 0;
}
@@ -1103,9 +1106,9 @@ void unregister_efivars(struct efivars *efivars)
struct efivar_entry *entry, *n;
list_for_each_entry_safe(entry, n, &efivars->list, list) {
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
list_del(&entry->list);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
efivar_unregister(entry);
}
if (efivars->new_var)
--
1.7.1
|
|
From: Seiji A. <sei...@hd...> - 2012-09-04 22:06:10
|
Changelog
v3 -> v4
- Patch 2/3
Move cancel_work_sync() above an efi_enabled test in efivars_exit().
v2 -> v3
- Patch 1/3
Replace spin_lock_irqsave/spin_unlock_irqrestore with spin_lock_irq/spin_unlock_irq in efivars_unregister(),
efivar_create(), efivar_store_raw() and efivar_delete() which are called in a process context.
- Patch 2/3
Change a name of delete_sysfs_entry() to delete_all_stale_sysfs_entries().
Also, don't release an efivar->lock while searching efivar->list in delete_all_stale_sysfs_entries().
- Patch 3/3
Remove a logic in efi_pstore_erase() which freshly created in patch v2.
v1 -> v2
- Patch 1/3
Add spin_lock_irq/spin_unlock_irq to open/close callbacks of efi_pstore
instead of moving spin_locks to a read callback.
- Patch 2/3
Replace a periodical timer with schedule_work().
- Patch 3/3
freshly create to kick a workqueue in oops case only.
[Problem]
There are following problems related to an interrupt context in efivar/efi_pstore.
Currently, efivars enables interrupt while taking efivars->lock.
So, there is a risk to be deadlocking in a write callback of efi_pstore if kernel panics
in interrupt context while taking efi_lock.
Also, efi_pstore creates sysfs entries ,which enable users to access to NVRAM, in a write callback.
If a kernel panic happens in interrupt contexts, pstore may fail because it could sleep due to dynamic
memory allocations during creating sysfs entries.
To resolve the problems above, a goal of this patchset is making efivars/efi_pstore interrupt-safe.
[Patch Description]
Patch 1/3 efivars: Disable external interrupt while holding efivars->lock
This patch replaces spin_lock/spin_unlock with spin_lock_irqsave/spin_lock_irqrestore to make efivars interrupt safe
Patch 2/3 efi_pstore: Introducing workqueue updating sysfs entries
This patch removes sysfs operations from write callback by introducing a workqueue updating sysfs entries
Patch 3/3 efi_pstore: Skiping scheduling a workqueue in cases other than oops
This patch restricts a schedule of a workqueue in case where users erase entries or oops happen which is truly needed for users.
drivers/firmware/efivars.c | 167 +++++++++++++++++++++++++++++++++++--------
include/linux/efi.h | 3 +-
2 files changed, 138 insertions(+), 32 deletions(-)
|
|
From: Seiji A. <sei...@hd...> - 2012-08-24 19:19:18
|
Hi, I'm sending an email to discuss how to remove spinlock from a whole process of open/read/close of efi_pstore. [Problem] Current efi_pstore calls kmalloc() in a read callback while holding a spinlock, efivar->lock, in an open callback. This means efi_pstore may deadlock if it sleeps in kmalloc(). Also, there is a comment that an efivar->lock shouldn't be held when calling kmalloc(). [Idea] In a process of open/read/close, pstore creates some files in the directory of /dev/pstore. So, memory allocations like kmalloc() are not avoidable. My idea fixing this issue is removing spinlock from a whole process of open/read/close of efi_pstore. Currently, efivar->lock protects two things, efivar->list and ops.[gs]et_variable(). But if we can revise locks of efivar as follows, we can fix this issue. - Introduce a new efivar->list_lock to protect efivar->list and RCU is used while searching it. - An existing efivar>lock protects just ops.[gs]et_variable() calls. Any comments are welcome. Seiji |
|
From: Seiji A. <sei...@hd...> - 2012-08-24 15:22:40
|
Tomas, It is helpful if you can review this patch. Change log v2 -> v3 - Remove an invalidate_tlb_vector event because it was replaced by a call function vector in a following commit. http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=52aec3308db85f4e9f5c8b9f5dc4fbd0138c6fa4 v1 -> v2 - Modify variable name from irq to vector. - Merge arch-specific tracepoints below to an arch_irq_vector_entry/exit. - error_apic_vector - thermal_apic_vector - threshold_apic_vector - spurious_apic_vector - x86_platform_ipi_vector As Vaibhav explained in the thread below, tracepoints for irq vectors are useful. http://www.spinics.net/lists/mm-commits/msg85707.html <snip> The current interrupt traces from irq_handler_entry and irq_handler_exit provide when an interrupt is handled. They provide good data about when the system has switched to kernel space and how it affects the currently running processes. There are some IRQ vectors which trigger the system into kernel space, which are not handled in generic IRQ handlers. Tracing such events gives us the information about IRQ interaction with other system events. The trace also tells where the system is spending its time. We want to know which cores are handling interrupts and how they are affecting other processes in the system. Also, the trace provides information about when the cores are idle and which interrupts are changing that state. <snip> On the other hand, my usecase is tracing just local timer event and getting a value of instruction pointer. I suggested to add an argument local timer event to get instruction pointer before. But there is another way to get it with external module like systemtap. So, I don't need to add any argument to irq vector tracepoints now. Vaibhav's patch shared a trace point ,irq_vector_entry/irq_vector_exit, in all events. But there is an above use case to trace specific irq_vector rather than tracing all events. In this case, we are concerned about overhead due to unwanted events. This patch modifies Vaibhav's one as follows. - Separate generic, and across-architecture tracepoints to enable independently. - nmi_vector - local_timer_vector - reschedule_vector - call_function_vector - call_function_single_vector - irq_work_entry_vector - Rename architecture-specific tracepoints from irq_vector_entry/exit to arch_irq_vector_entry/exit. - error_apic_vector - thermal_apic_vector - threshold_apic_vector - spurious_apic_vector - x86_platform_ipi_vector Those x86 specific ones are not really frequently raised vectors, so enabling them all won't affect performance and readability of the traces too much. Signed-off-by: Seiji Aguchi <sei...@hd...> --- arch/x86/include/asm/irq_vectors.h | 9 ++ arch/x86/kernel/apic/apic.c | 7 + arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 + arch/x86/kernel/cpu/mcheck/threshold.c | 3 + arch/x86/kernel/irq.c | 5 + arch/x86/kernel/irq_work.c | 3 + arch/x86/kernel/nmi.c | 3 + arch/x86/kernel/smp.c | 7 + include/trace/events/irq_vectors.h | 209 ++++++++++++++++++++++++++++++ 9 files changed, 249 insertions(+), 0 deletions(-) create mode 100644 include/trace/events/irq_vectors.h diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 1508e51..510ced5 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -158,4 +158,13 @@ static inline int invalid_vm86_irq(int irq) # define NR_IRQS NR_IRQS_LEGACY #endif +#define irq_vector_name(vector) { vector, #vector } + +#define irq_vector_name_table \ + irq_vector_name(ERROR_APIC_VECTOR), \ + irq_vector_name(THERMAL_APIC_VECTOR), \ + irq_vector_name(THRESHOLD_APIC_VECTOR), \ + irq_vector_name(SPURIOUS_APIC_VECTOR), \ + irq_vector_name(X86_PLATFORM_IPI_VECTOR) + #endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 24deb30..b9cdd8f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -34,6 +34,7 @@ #include <linux/dmi.h> #include <linux/smp.h> #include <linux/mm.h> +#include <trace/events/irq_vectors.h> #include <asm/irq_remapping.h> #include <asm/perf_event.h> @@ -895,7 +896,9 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) */ irq_enter(); exit_idle(); + trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); + trace_local_timer_exit(LOCAL_TIMER_VECTOR); irq_exit(); set_irq_regs(old_regs); @@ -1881,6 +1884,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) irq_enter(); exit_idle(); + trace_arch_irq_vector_entry(SPURIOUS_APIC_VECTOR); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1895,6 +1899,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); + trace_arch_irq_vector_exit(SPURIOUS_APIC_VECTOR); irq_exit(); } @@ -1918,6 +1923,7 @@ void smp_error_interrupt(struct pt_regs *regs) irq_enter(); exit_idle(); + trace_arch_irq_vector_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); @@ -1938,6 +1944,7 @@ void smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); + trace_arch_irq_vector_exit(ERROR_APIC_VECTOR); irq_exit(); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 47a1870..63c2cc8 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <trace/events/irq_vectors.h> #include <asm/processor.h> #include <asm/apic.h> @@ -382,8 +383,10 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); exit_idle(); + trace_arch_irq_vector_entry(THERMAL_APIC_VECTOR); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); + trace_arch_irq_vector_exit(THERMAL_APIC_VECTOR); irq_exit(); /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index aa578ca..de74768 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -3,6 +3,7 @@ */ #include <linux/interrupt.h> #include <linux/kernel.h> +#include <trace/events/irq_vectors.h> #include <asm/irq_vectors.h> #include <asm/apic.h> @@ -21,8 +22,10 @@ asmlinkage void smp_threshold_interrupt(void) { irq_enter(); exit_idle(); + trace_arch_irq_vector_entry(THRESHOLD_APIC_VECTOR); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); + trace_arch_irq_vector_exit(THRESHOLD_APIC_VECTOR); irq_exit(); /* Ack only at the end to avoid potential reentry */ ack_APIC_irq(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 1f5f1d5..f4d7344 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -18,6 +18,9 @@ #include <asm/mce.h> #include <asm/hw_irq.h> +#define CREATE_TRACE_POINTS +#include <trace/events/irq_vectors.h> + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ @@ -218,11 +221,13 @@ void smp_x86_platform_ipi(struct pt_regs *regs) exit_idle(); + trace_arch_irq_vector_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) x86_platform_ipi_callback(); + trace_arch_irq_vector_exit(X86_PLATFORM_IPI_VECTOR); irq_exit(); set_irq_regs(old_regs); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index ca8f703..2cf7505 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -8,13 +8,16 @@ #include <linux/irq_work.h> #include <linux/hardirq.h> #include <asm/apic.h> +#include <trace/events/irq_vectors.h> void smp_irq_work_interrupt(struct pt_regs *regs) { irq_enter(); ack_APIC_irq(); + trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); + trace_irq_work_exit(IRQ_WORK_VECTOR); irq_exit(); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f84f5c5..cc57aba 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -28,6 +28,7 @@ #include <asm/mach_traps.h> #include <asm/nmi.h> #include <asm/x86_init.h> +#include <trace/events/irq_vectors.h> struct nmi_desc { spinlock_t lock; @@ -482,12 +483,14 @@ do_nmi(struct pt_regs *regs, long error_code) nmi_nesting_preprocess(regs); nmi_enter(); + trace_nmi_entry(NMI_VECTOR); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); + trace_nmi_exit(NMI_VECTOR); nmi_exit(); /* On i386, may loop back to preprocess */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 48d2b7d..5b2d6de 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -23,6 +23,7 @@ #include <linux/interrupt.h> #include <linux/cpu.h> #include <linux/gfp.h> +#include <trace/events/irq_vectors.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> @@ -252,8 +253,10 @@ finish: void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); /* * KVM uses this interrupt to force a cpu out of guest mode */ @@ -263,8 +266,10 @@ void smp_call_function_interrupt(struct pt_regs *regs) { ack_APIC_irq(); irq_enter(); + trace_call_function_entry(CALL_FUNCTION_VECTOR); generic_smp_call_function_interrupt(); inc_irq_stat(irq_call_count); + trace_call_function_exit(CALL_FUNCTION_VECTOR); irq_exit(); } @@ -272,8 +277,10 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) { ack_APIC_irq(); irq_enter(); + trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); + trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); irq_exit(); } diff --git a/include/trace/events/irq_vectors.h b/include/trace/events/irq_vectors.h new file mode 100644 index 0000000..fffe0c0 --- /dev/null +++ b/include/trace/events/irq_vectors.h @@ -0,0 +1,209 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq_vectors + +#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IRQ_VECTORS_H + +#include <linux/tracepoint.h> +#include <asm/irq.h> + +#ifndef irq_vector_name_table +#define irq_vector_name_table { -1, NULL } #endif + + +/* + * This class is used by generic ,cross-architecture tracepoints. + */ +DECLARE_EVENT_CLASS(irq_vector, + + TP_PROTO(int vector), + + TP_ARGS(vector), + + TP_STRUCT__entry( + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + ), + + TP_printk("vector=%d", __entry->vector) ); + +/* + * nmi_entry - called before enterring a nmi vector handler */ +DEFINE_EVENT(irq_vector, nmi_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * nmi_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, nmi_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * local_timer_entry - called before enterring a local timer interrupt + * vector handler + */ +DEFINE_EVENT(irq_vector, local_timer_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * local_timer_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, local_timer_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * reschedule_entry - called before enterring a reschedule vector +handler */ DEFINE_EVENT(irq_vector, reschedule_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * reschedule_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, reschedule_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * call_function_entry - called before enterring a call function + * vector handler + */ +DEFINE_EVENT(irq_vector, call_function_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * call_function_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, call_function_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * call_function_single_entry - called before enterring a call function + * single vector handler + */ +DEFINE_EVENT(irq_vector, call_function_single_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * call_function_single_exit - called immediately after the interrupt +vector + * handler returns + */ +DEFINE_EVENT(irq_vector, call_function_single_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * irq_work_entry - called before enterring an irq work vector handler +*/ DEFINE_EVENT(irq_vector, irq_work_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * irq_work_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(irq_vector, irq_work_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * This class is used by arch-specific tracepoints. + */ +DECLARE_EVENT_CLASS(arch_irq_vector, + + TP_PROTO(int vector), + + TP_ARGS(vector), + + TP_STRUCT__entry( + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + ), + + TP_printk("vector=%d name=%s", __entry->vector, + __print_symbolic(__entry->vector, irq_vector_name_table)) ); + +/* + * arch_irq_vector_entry - called before enterring a interrupt vector +handler */ DEFINE_EVENT(arch_irq_vector, arch_irq_vector_entry, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +/* + * arch_irq_vector_exit - called immediately after the interrupt vector + * handler returns + */ +DEFINE_EVENT(arch_irq_vector, arch_irq_vector_exit, + + TP_PROTO(int vector), + + TP_ARGS(vector) +); + +#endif /* _TRACE_IRQ_VECTORS_H */ + +/* This part must be outside protection */ #include +<trace/define_trace.h> -- 1.7.1 |
|
From: Seiji A. <sei...@hd...> - 2012-08-23 18:56:51
|
Tony, Cloud you please review this patchset? Seiji > -----Original Message----- > From: Seiji Aguchi > Sent: Tuesday, August 21, 2012 4:35 PM > To: 'Mike Waychison'; 'lin...@vg...'; 'Luck, Tony (ton...@in...)'; 'Matthew Garrett (mj...@re...)'; > 'dz...@re...' > Cc: 'dle...@li...'; Satoru Moriya > Subject: [RFC][PATCH v4 0/3] make efivars/efi_pstore interrupt-safe > > Changelog > v3 -> v4 > - Patch 2/3 > Move cancel_work_sync() above an efi_enabled test in efivars_exit(). > > v2 -> v3 > - Patch 1/3 > Replace spin_lock_irqsave/spin_unlock_irqrestore with spin_lock_irq/spin_unlock_irq in efivars_unregister(), > efivar_create(), efivar_store_raw() and efivar_delete() which are called in a process context. > > - Patch 2/3 > Change a name of delete_sysfs_entry() to delete_all_stale_sysfs_entries(). > Also, don't release an efivar->lock while searching efivar->list in delete_all_stale_sysfs_entries(). > > - Patch 3/3 > Remove a logic in efi_pstore_erase() which freshly created in patch v2. > > v1 -> v2 > - Patch 1/3 > Add spin_lock_irq/spin_unlock_irq to open/close callbacks of efi_pstore > instead of moving spin_locks to a read callback. > > - Patch 2/3 > Replace a periodical timer with schedule_work(). > > - Patch 3/3 > freshly create to kick a workqueue in oops case only. > > [Problem] > There are following problems related to an interrupt context in efivar/efi_pstore. > > Currently, efivars enables interrupt while taking efivars->lock. > So, there is a risk to be deadlocking in a write callback of efi_pstore if kernel panics in interrupt context while taking efi_lock. > > Also, efi_pstore creates sysfs entries ,which enable users to access to NVRAM, in a write callback. > If a kernel panic happens in interrupt contexts, pstore may fail because it could sleep due to dynamic memory allocations during > creating sysfs entries. > > To resolve the problems above, a goal of this patchset is making efivars/efi_pstore interrupt-safe. > > [Patch Description] > Patch 1/3 efivars: Disable external interrupt while holding efivars->lock > This patch replaces spin_lock/spin_unlock with spin_lock_irqsave/spin_lock_irqrestore to make efivars interrupt safe > > Patch 2/3 efi_pstore: Introducing workqueue updating sysfs entries > This patch removes sysfs operations from write callback by introducing a workqueue updating sysfs entries > > Patch 3/3 efi_pstore: Skiping scheduling a workqueue in cases other than oops > This patch restricts a schedule of a workqueue in case where users erase entries or oops happen which is truly needed for users. > > drivers/firmware/efivars.c | 167 +++++++++++++++++++++++++++++++++++-------- > include/linux/efi.h | 3 +- > 2 files changed, 138 insertions(+), 32 deletions(-) > > > |
|
From: Seiji A. <sei...@hd...> - 2012-08-21 20:36:39
|
[Problem] efi_pstore creates sysfs files when logging kernel messages to NVRAM. Currently, the sysfs files are updated in a workqueue which is registered in a write callback. On the other hand, situations which users needs the sysfs files are when they erase entries or oops happen because a system will be down and users can't access to sysfs files in other cases like panic, reboot or emergency_restart. Also, if kernel panics due to a bug of workqueue operations and a write callback of efi_pstore is called in panic case, efi_pstore may fail due to a failure of schedule_work(). And panic_notifier_chain()/emergency_restart() is not kicked if efi_pstore fails. This may cause user's unwanted result. [Patch Description] This patch registers a workqueue updating sysfs entries in cases where users erase entries or oops happen only, and skips it in other cases like panic, reboot or emergency_start. Signed-off-by: Seiji Aguchi <sei...@hd...> --- drivers/firmware/efivars.c | 7 ++++++- 1 files changed, 6 insertions(+), 1 deletions(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index cd16ea6..d5911fd 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -746,7 +746,12 @@ static int efi_pstore_write(enum pstore_type_id type, spin_unlock_irqrestore(&efivars->lock, flags); - schedule_work(&efivar_work); + /* + * The user may want to update sysfs for this write + * when they erase an entry via /dev/pstore or oops happen. + */ + if (!size || reason == KMSG_DUMP_OOPS) + schedule_work(&efivar_work); *id = part; return ret; -- 1.7.1 |
|
From: Seiji A. <sei...@hd...> - 2012-08-21 20:36:12
|
[Problem]
efi_pstore creates sysfs entries ,which enable users to access to NVRAM,
in a write callback. If a kernel panic happens in interrupt contexts, pstore may
fail because it could sleep due to dynamic memory allocations during creating sysfs entries.
[Patch Description]
This patch removes sysfs operations from a write callback by introducing a workqueue
updating sysfs entries which is scheduled after the write callback is called.
efi_pstore will be robust against a kernel panic in an interrupt context with it.
Signed-off-by: Seiji Aguchi <sei...@hd...>
---
drivers/firmware/efivars.c | 119 +++++++++++++++++++++++++++++++++++++++----
include/linux/efi.h | 3 +-
2 files changed, 110 insertions(+), 12 deletions(-)
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index bd1df01..cd16ea6 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -146,6 +146,13 @@ efivar_create_sysfs_entry(struct efivars *efivars,
efi_char16_t *variable_name,
efi_guid_t *vendor_guid);
+/*
+ * Prototype for workqueue functions updating sysfs entry
+ */
+
+static void efivar_update_sysfs_entry(struct work_struct *);
+static DECLARE_WORK(efivar_work, efivar_update_sysfs_entry);
+
/* Return the number of unicode characters in data */
static unsigned long
utf16_strnlen(efi_char16_t *s, size_t maxlength)
@@ -731,9 +738,6 @@ static int efi_pstore_write(enum pstore_type_id type,
0, NULL);
}
- if (found)
- list_del(&found->list);
-
for (i = 0; i < DUMP_NAME_LEN; i++)
efi_name[i] = name[i];
@@ -742,14 +746,7 @@ static int efi_pstore_write(enum pstore_type_id type,
spin_unlock_irqrestore(&efivars->lock, flags);
- if (found)
- efivar_unregister(found);
-
- if (size)
- ret = efivar_create_sysfs_entry(efivars,
- utf16_strsize(efi_name,
- DUMP_NAME_LEN * 2),
- efi_name, &vendor);
+ schedule_work(&efivar_work);
*id = part;
return ret;
@@ -1200,6 +1197,104 @@ EXPORT_SYMBOL_GPL(register_efivars);
static struct efivars __efivars;
static struct efivar_operations ops;
+static void delete_all_stale_sysfs_entries(void)
+{
+ struct efivars *efivars = &__efivars;
+ struct efivar_entry *entry, *n, *found;
+ efi_status_t status;
+ unsigned long flags;
+
+ while (1) {
+ found = NULL;
+ spin_lock_irqsave(&efivars->lock, flags);
+ list_for_each_entry_safe(entry, n, &efivars->list, list) {
+ status = get_var_data_locked(efivars, &entry->var);
+ if (status != EFI_SUCCESS) {
+ found = entry;
+ list_del(&entry->list);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&efivars->lock, flags);
+ if (found)
+ efivar_unregister(entry);
+ else
+ break;
+ }
+}
+
+static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor)
+{
+ struct efivar_entry *entry, *n;
+ struct efivars *efivars = &__efivars;
+ unsigned long strsize1, strsize2;
+ bool found = false;
+
+ strsize1 = utf16_strsize(variable_name, 1024);
+ list_for_each_entry_safe(entry, n, &efivars->list, list) {
+ strsize2 = utf16_strsize(entry->var.VariableName, 1024);
+ if (strsize1 == strsize2 &&
+ !memcmp(variable_name, &(entry->var.VariableName),
+ strsize2) &&
+ !efi_guidcmp(entry->var.VendorGuid,
+ *vendor)) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
+static void efivar_update_sysfs_entry(struct work_struct *work)
+{
+ struct efivars *efivars = &__efivars;
+ efi_guid_t vendor;
+ efi_char16_t *variable_name;
+ unsigned long variable_name_size = 1024, flags;
+ efi_status_t status = EFI_NOT_FOUND;
+ bool found;
+
+ /* Delete stale sysfs entries */
+ delete_all_stale_sysfs_entries();
+
+ /* Add new sysfs entries */
+ while (1) {
+ variable_name = kzalloc(variable_name_size, GFP_KERNEL);
+ if (!variable_name) {
+ pr_err("efivars: Memory allocation failed.\n");
+ return;
+ }
+
+ spin_lock_irqsave(&efivars->lock, flags);
+ found = false;
+ while (1) {
+ variable_name_size = 1024;
+ status = efivars->ops->get_next_variable(
+ &variable_name_size,
+ variable_name,
+ &vendor);
+ if (status != EFI_SUCCESS) {
+ break;
+ } else {
+ if (!variable_is_present(variable_name,
+ &vendor)) {
+ found = true;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&efivars->lock, flags);
+
+ if (!found) {
+ kfree(variable_name);
+ break;
+ } else
+ efivar_create_sysfs_entry(efivars,
+ variable_name_size,
+ variable_name, &vendor);
+ }
+}
+
/*
* For now we register the efi subsystem with the firmware subsystem
* and the vars subsystem with the efi subsystem. In the future, it
@@ -1254,6 +1349,8 @@ err_put:
static void __exit
efivars_exit(void)
{
+ cancel_work_sync(&efivar_work);
+
if (efi_enabled) {
unregister_efivars(&__efivars);
kobject_put(efi_kobj);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 103adc6..cecdf58 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -641,7 +641,8 @@ struct efivars {
* 1) ->list - adds, removals, reads, writes
* 2) ops.[gs]et_variable() calls.
* It must not be held when creating sysfs entries or calling kmalloc.
- * ops.get_next_variable() is only called from register_efivars(),
+ * ops.get_next_variable() is only called from register_efivars()
+ * or efivar_update_sysfs_entry(),
* which is protected by the BKL, so that path is safe.
*/
spinlock_t lock;
-- 1.7.1
|
|
From: Seiji A. <sei...@hd...> - 2012-08-21 20:35:59
|
[Problem]
Currently, efivars doesn't disable interrupt while taking efivars->lock.
So, there is a risk to be deadlocking in a write callback of efi_pstore
if kernel panics in interrupt context while taking efivars->lock.
[Patch Description]
This patch disables an external interruption while holding efivars->lock as follows
- In efi_pstore_open()/efi_pstore_close(), replace spin_lock/spin_unlock with spin_lock_irq/spin_unlock_irq
because they are called in a process context when users access to /dev/pstore.
- In unregister_efivars(), replace them with spin_lock_irq/spin_unlock_irq
because they are called in a process context when unloading this module.
- Also, in efivar_create()/efivar_store_raw()/efivar_delete(), replace them with spin_lock_irq/spin_unlock_irq
because they are called in a process context when users access to /sys/firmware/efi/vars/{new_var|del_var}.
- In other function calls, replace spin_lock/spin_unlock with spin_lock_irqsave/spin_unlock_irqrestore.
Signed-off-by: Seiji Aguchi <sei...@hd...>
---
drivers/firmware/efivars.c | 43 +++++++++++++++++++++++--------------------
1 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 47408e8..bd1df01 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -393,10 +393,11 @@ static efi_status_t
get_var_data(struct efivars *efivars, struct efi_variable *var)
{
efi_status_t status;
+ unsigned long flags;
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
status = get_var_data_locked(efivars, var);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: get_variable() failed 0x%lx!\n",
@@ -514,14 +515,14 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count)
return -EINVAL;
}
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
status = efivars->ops->set_variable(new_var->VariableName,
&new_var->VendorGuid,
new_var->Attributes,
new_var->DataSize,
new_var->Data);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
@@ -632,7 +633,7 @@ static int efi_pstore_open(struct pstore_info *psi)
{
struct efivars *efivars = psi->data;
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
efivars->walk_entry = list_first_entry(&efivars->list,
struct efivar_entry, list);
return 0;
@@ -642,7 +643,7 @@ static int efi_pstore_close(struct pstore_info *psi)
{
struct efivars *efivars = psi->data;
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return 0;
}
@@ -696,11 +697,12 @@ static int efi_pstore_write(enum pstore_type_id type,
struct efivars *efivars = psi->data;
struct efivar_entry *entry, *found = NULL;
int i, ret = 0;
+ unsigned long flags;
sprintf(stub_name, "dump-type%u-%u-", type, part);
sprintf(name, "%s%lu", stub_name, get_seconds());
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
for (i = 0; i < DUMP_NAME_LEN; i++)
efi_name[i] = stub_name[i];
@@ -738,7 +740,7 @@ static int efi_pstore_write(enum pstore_type_id type,
efivars->ops->set_variable(efi_name, &vendor, PSTORE_EFI_ATTRIBUTES,
size, psi->buf);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
if (found)
efivar_unregister(found);
@@ -822,7 +824,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
return -EINVAL;
}
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
/*
* Does this variable already exist?
@@ -840,7 +842,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
}
}
if (found) {
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EINVAL;
}
@@ -854,10 +856,10 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
status);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EIO;
}
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
/* Create the entry in sysfs. Locking is not required here */
status = efivar_create_sysfs_entry(efivars,
@@ -885,7 +887,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
/*
* Does this variable already exist?
@@ -903,7 +905,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
}
}
if (!found) {
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EINVAL;
}
/* force the Attributes/DataSize to 0 to ensure deletion */
@@ -919,12 +921,12 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
status);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EIO;
}
list_del(&search_efivar->list);
/* We need to release this lock before unregistering. */
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
efivar_unregister(search_efivar);
/* It's dead Jim.... */
@@ -993,6 +995,7 @@ efivar_create_sysfs_entry(struct efivars *efivars,
int i, short_name_size = variable_name_size / sizeof(efi_char16_t) + 38;
char *short_name;
struct efivar_entry *new_efivar;
+ unsigned long flags;
short_name = kzalloc(short_name_size + 1, GFP_KERNEL);
new_efivar = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
@@ -1032,9 +1035,9 @@ efivar_create_sysfs_entry(struct efivars *efivars,
kfree(short_name);
short_name = NULL;
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
list_add(&new_efivar->list, &efivars->list);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
return 0;
}
@@ -1103,9 +1106,9 @@ void unregister_efivars(struct efivars *efivars)
struct efivar_entry *entry, *n;
list_for_each_entry_safe(entry, n, &efivars->list, list) {
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
list_del(&entry->list);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
efivar_unregister(entry);
}
if (efivars->new_var)
-- 1.7.1
|
|
From: Seiji A. <sei...@hd...> - 2012-08-21 20:35:17
|
Changelog
v3 -> v4
- Patch 2/3
Move cancel_work_sync() above an efi_enabled test in efivars_exit().
v2 -> v3
- Patch 1/3
Replace spin_lock_irqsave/spin_unlock_irqrestore with spin_lock_irq/spin_unlock_irq in efivars_unregister(),
efivar_create(), efivar_store_raw() and efivar_delete() which are called in a process context.
- Patch 2/3
Change a name of delete_sysfs_entry() to delete_all_stale_sysfs_entries().
Also, don't release an efivar->lock while searching efivar->list in delete_all_stale_sysfs_entries().
- Patch 3/3
Remove a logic in efi_pstore_erase() which freshly created in patch v2.
v1 -> v2
- Patch 1/3
Add spin_lock_irq/spin_unlock_irq to open/close callbacks of efi_pstore
instead of moving spin_locks to a read callback.
- Patch 2/3
Replace a periodical timer with schedule_work().
- Patch 3/3
freshly create to kick a workqueue in oops case only.
[Problem]
There are following problems related to an interrupt context in efivar/efi_pstore.
Currently, efivars enables interrupt while taking efivars->lock.
So, there is a risk to be deadlocking in a write callback of efi_pstore if kernel panics
in interrupt context while taking efi_lock.
Also, efi_pstore creates sysfs entries ,which enable users to access to NVRAM, in a write callback.
If a kernel panic happens in interrupt contexts, pstore may fail because it could sleep due to dynamic
memory allocations during creating sysfs entries.
To resolve the problems above, a goal of this patchset is making efivars/efi_pstore interrupt-safe.
[Patch Description]
Patch 1/3 efivars: Disable external interrupt while holding efivars->lock
This patch replaces spin_lock/spin_unlock with spin_lock_irqsave/spin_lock_irqrestore to make efivars interrupt safe
Patch 2/3 efi_pstore: Introducing workqueue updating sysfs entries
This patch removes sysfs operations from write callback by introducing a workqueue updating sysfs entries
Patch 3/3 efi_pstore: Skiping scheduling a workqueue in cases other than oops
This patch restricts a schedule of a workqueue in case where users erase entries or oops happen which is truly needed for users.
drivers/firmware/efivars.c | 167 +++++++++++++++++++++++++++++++++++--------
include/linux/efi.h | 3 +-
2 files changed, 138 insertions(+), 32 deletions(-)
|
|
From: Seiji A. <sei...@hd...> - 2012-08-21 19:24:22
|
> > efivars_exit(void)
> > {
> > if (efi_enabled) {
> > + cancel_work_sync(&efivar_work);
>
> Please move this cancel_work_sync() to be before the efi_enabled test.
> efi_enabled here means that we registered __efivars. There may be another driver (gsmi) using the efivars code, so we should
> always be cancelling this work.
>
OK. I will fix it.
I confirmed that gsmi accually called register_efivars().
Thank you for letting me know about this.
Seiji
|
|
From: Mike W. <mi...@go...> - 2012-08-21 18:01:23
|
On Tue, Aug 21, 2012 at 9:54 AM, Seiji Aguchi <sei...@hd...> wrote:
> [Problem]
> efi_pstore creates sysfs entries ,which enable users to access to NVRAM,
> in a write callback. If a kernel panic happens in interrupt contexts, pstore may
> fail because it could sleep due to dynamic memory allocations during creating sysfs entries.
>
> [Patch Description]
> This patch removes sysfs operations from a write callback by introducing a workqueue
> updating sysfs entries which is scheduled after the write callback is called.
> efi_pstore will be robust against a kernel panic in an interrupt context with it.
>
> Signed-off-by: Seiji Aguchi <sei...@hd...>
> ---
> drivers/firmware/efivars.c | 118 +++++++++++++++++++++++++++++++++++++++----
> include/linux/efi.h | 3 +-
> 2 files changed, 109 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
> index bd1df01..4698346 100644
> --- a/drivers/firmware/efivars.c
> +++ b/drivers/firmware/efivars.c
> @@ -146,6 +146,13 @@ efivar_create_sysfs_entry(struct efivars *efivars,
> efi_char16_t *variable_name,
> efi_guid_t *vendor_guid);
>
> +/*
> + * Prototype for workqueue functions updating sysfs entry
> + */
> +
> +static void efivar_update_sysfs_entry(struct work_struct *);
> +static DECLARE_WORK(efivar_work, efivar_update_sysfs_entry);
> +
> /* Return the number of unicode characters in data */
> static unsigned long
> utf16_strnlen(efi_char16_t *s, size_t maxlength)
> @@ -731,9 +738,6 @@ static int efi_pstore_write(enum pstore_type_id type,
> 0, NULL);
> }
>
> - if (found)
> - list_del(&found->list);
> -
> for (i = 0; i < DUMP_NAME_LEN; i++)
> efi_name[i] = name[i];
>
> @@ -742,14 +746,7 @@ static int efi_pstore_write(enum pstore_type_id type,
>
> spin_unlock_irqrestore(&efivars->lock, flags);
>
> - if (found)
> - efivar_unregister(found);
> -
> - if (size)
> - ret = efivar_create_sysfs_entry(efivars,
> - utf16_strsize(efi_name,
> - DUMP_NAME_LEN * 2),
> - efi_name, &vendor);
> + schedule_work(&efivar_work);
>
> *id = part;
> return ret;
> @@ -1200,6 +1197,104 @@ EXPORT_SYMBOL_GPL(register_efivars);
> static struct efivars __efivars;
> static struct efivar_operations ops;
>
> +static void delete_all_stale_sysfs_entries(void)
> +{
> + struct efivars *efivars = &__efivars;
> + struct efivar_entry *entry, *n, *found;
> + efi_status_t status;
> + unsigned long flags;
> +
> + while (1) {
> + found = NULL;
> + spin_lock_irqsave(&efivars->lock, flags);
> + list_for_each_entry_safe(entry, n, &efivars->list, list) {
> + status = get_var_data_locked(efivars, &entry->var);
> + if (status != EFI_SUCCESS) {
> + found = entry;
> + list_del(&entry->list);
> + break;
> + }
> + }
> + spin_unlock_irqrestore(&efivars->lock, flags);
> + if (found)
> + efivar_unregister(entry);
> + else
> + break;
> + }
> +}
> +
> +static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor)
> +{
> + struct efivar_entry *entry, *n;
> + struct efivars *efivars = &__efivars;
> + unsigned long strsize1, strsize2;
> + bool found = false;
> +
> + strsize1 = utf16_strsize(variable_name, 1024);
> + list_for_each_entry_safe(entry, n, &efivars->list, list) {
> + strsize2 = utf16_strsize(entry->var.VariableName, 1024);
> + if (strsize1 == strsize2 &&
> + !memcmp(variable_name, &(entry->var.VariableName),
> + strsize2) &&
> + !efi_guidcmp(entry->var.VendorGuid,
> + *vendor)) {
> + found = true;
> + break;
> + }
> + }
> + return found;
> +}
> +
> +static void efivar_update_sysfs_entry(struct work_struct *work)
> +{
> + struct efivars *efivars = &__efivars;
> + efi_guid_t vendor;
> + efi_char16_t *variable_name;
> + unsigned long variable_name_size = 1024, flags;
> + efi_status_t status = EFI_NOT_FOUND;
> + bool found;
> +
> + /* Delete stale sysfs entries */
> + delete_all_stale_sysfs_entries();
> +
> + /* Add new sysfs entries */
> + while (1) {
> + variable_name = kzalloc(variable_name_size, GFP_KERNEL);
> + if (!variable_name) {
> + pr_err("efivars: Memory allocation failed.\n");
> + return;
> + }
> +
> + spin_lock_irqsave(&efivars->lock, flags);
> + found = false;
> + while (1) {
> + variable_name_size = 1024;
> + status = efivars->ops->get_next_variable(
> + &variable_name_size,
> + variable_name,
> + &vendor);
> + if (status != EFI_SUCCESS) {
> + break;
> + } else {
> + if (!variable_is_present(variable_name,
> + &vendor)) {
> + found = true;
> + break;
> + }
> + }
> + }
> + spin_unlock_irqrestore(&efivars->lock, flags);
> +
> + if (!found) {
> + kfree(variable_name);
> + break;
> + } else
> + efivar_create_sysfs_entry(efivars,
> + variable_name_size,
> + variable_name, &vendor);
> + }
> +}
> +
> /*
> * For now we register the efi subsystem with the firmware subsystem
> * and the vars subsystem with the efi subsystem. In the future, it
> @@ -1255,6 +1350,7 @@ static void __exit
> efivars_exit(void)
> {
> if (efi_enabled) {
> + cancel_work_sync(&efivar_work);
Please move this cancel_work_sync() to be before the efi_enabled test.
efi_enabled here means that we registered __efivars. There may be
another driver (gsmi) using the efivars code, so we should always be
cancelling this work.
> unregister_efivars(&__efivars);
> kobject_put(efi_kobj);
> }
> diff --git a/include/linux/efi.h b/include/linux/efi.h
> index 103adc6..cecdf58 100644
> --- a/include/linux/efi.h
> +++ b/include/linux/efi.h
> @@ -641,7 +641,8 @@ struct efivars {
> * 1) ->list - adds, removals, reads, writes
> * 2) ops.[gs]et_variable() calls.
> * It must not be held when creating sysfs entries or calling kmalloc.
> - * ops.get_next_variable() is only called from register_efivars(),
> + * ops.get_next_variable() is only called from register_efivars()
> + * or efivar_update_sysfs_entry(),
> * which is protected by the BKL, so that path is safe.
> */
> spinlock_t lock;
> -- 1.7.1
>
|
|
From: Seiji A. <sei...@hd...> - 2012-08-21 16:54:50
|
[Problem] efi_pstore creates sysfs files when logging kernel messages to NVRAM. Currently, the sysfs files are updated in a workqueue which is registered in a write callback. On the other hand, situations which users needs the sysfs files are when they erase entries or oops happen because a system will be down and users can't access to sysfs files in other cases like panic, reboot or emergency_restart. Also, if kernel panics due to a bug of workqueue operations and a write callback of efi_pstore is called in panic case, efi_pstore may fail due to a failure of schedule_work(). And panic_notifier_chain()/emergency_restart() is not kicked if efi_pstore fails. This may cause user's unwanted result. [Patch Description] This patch registers a workqueue updating sysfs entries in cases where users erase entries or oops happen only, and skips it in other cases like panic, reboot or emergency_start. Signed-off-by: Seiji Aguchi <sei...@hd...> --- drivers/firmware/efivars.c | 7 ++++++- 1 files changed, 6 insertions(+), 1 deletions(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 3dde7d6..a4919ad 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -746,7 +746,12 @@ static int efi_pstore_write(enum pstore_type_id type, spin_unlock_irqrestore(&efivars->lock, flags); - schedule_work(&efivar_work); + /* + * The user may want to update sysfs for this write + * when they erase an entry via /dev/pstore or oops happen. + */ + if (!size || reason == KMSG_DUMP_OOPS) + schedule_work(&efivar_work); *id = part; return ret; -- 1.7.1 |
|
From: Seiji A. <sei...@hd...> - 2012-08-21 16:54:24
|
[Problem]
efi_pstore creates sysfs entries ,which enable users to access to NVRAM,
in a write callback. If a kernel panic happens in interrupt contexts, pstore may
fail because it could sleep due to dynamic memory allocations during creating sysfs entries.
[Patch Description]
This patch removes sysfs operations from a write callback by introducing a workqueue
updating sysfs entries which is scheduled after the write callback is called.
efi_pstore will be robust against a kernel panic in an interrupt context with it.
Signed-off-by: Seiji Aguchi <sei...@hd...>
---
drivers/firmware/efivars.c | 118 +++++++++++++++++++++++++++++++++++++++----
include/linux/efi.h | 3 +-
2 files changed, 109 insertions(+), 12 deletions(-)
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index bd1df01..4698346 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -146,6 +146,13 @@ efivar_create_sysfs_entry(struct efivars *efivars,
efi_char16_t *variable_name,
efi_guid_t *vendor_guid);
+/*
+ * Prototype for workqueue functions updating sysfs entry
+ */
+
+static void efivar_update_sysfs_entry(struct work_struct *);
+static DECLARE_WORK(efivar_work, efivar_update_sysfs_entry);
+
/* Return the number of unicode characters in data */
static unsigned long
utf16_strnlen(efi_char16_t *s, size_t maxlength)
@@ -731,9 +738,6 @@ static int efi_pstore_write(enum pstore_type_id type,
0, NULL);
}
- if (found)
- list_del(&found->list);
-
for (i = 0; i < DUMP_NAME_LEN; i++)
efi_name[i] = name[i];
@@ -742,14 +746,7 @@ static int efi_pstore_write(enum pstore_type_id type,
spin_unlock_irqrestore(&efivars->lock, flags);
- if (found)
- efivar_unregister(found);
-
- if (size)
- ret = efivar_create_sysfs_entry(efivars,
- utf16_strsize(efi_name,
- DUMP_NAME_LEN * 2),
- efi_name, &vendor);
+ schedule_work(&efivar_work);
*id = part;
return ret;
@@ -1200,6 +1197,104 @@ EXPORT_SYMBOL_GPL(register_efivars);
static struct efivars __efivars;
static struct efivar_operations ops;
+static void delete_all_stale_sysfs_entries(void)
+{
+ struct efivars *efivars = &__efivars;
+ struct efivar_entry *entry, *n, *found;
+ efi_status_t status;
+ unsigned long flags;
+
+ while (1) {
+ found = NULL;
+ spin_lock_irqsave(&efivars->lock, flags);
+ list_for_each_entry_safe(entry, n, &efivars->list, list) {
+ status = get_var_data_locked(efivars, &entry->var);
+ if (status != EFI_SUCCESS) {
+ found = entry;
+ list_del(&entry->list);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&efivars->lock, flags);
+ if (found)
+ efivar_unregister(entry);
+ else
+ break;
+ }
+}
+
+static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor)
+{
+ struct efivar_entry *entry, *n;
+ struct efivars *efivars = &__efivars;
+ unsigned long strsize1, strsize2;
+ bool found = false;
+
+ strsize1 = utf16_strsize(variable_name, 1024);
+ list_for_each_entry_safe(entry, n, &efivars->list, list) {
+ strsize2 = utf16_strsize(entry->var.VariableName, 1024);
+ if (strsize1 == strsize2 &&
+ !memcmp(variable_name, &(entry->var.VariableName),
+ strsize2) &&
+ !efi_guidcmp(entry->var.VendorGuid,
+ *vendor)) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
+static void efivar_update_sysfs_entry(struct work_struct *work)
+{
+ struct efivars *efivars = &__efivars;
+ efi_guid_t vendor;
+ efi_char16_t *variable_name;
+ unsigned long variable_name_size = 1024, flags;
+ efi_status_t status = EFI_NOT_FOUND;
+ bool found;
+
+ /* Delete stale sysfs entries */
+ delete_all_stale_sysfs_entries();
+
+ /* Add new sysfs entries */
+ while (1) {
+ variable_name = kzalloc(variable_name_size, GFP_KERNEL);
+ if (!variable_name) {
+ pr_err("efivars: Memory allocation failed.\n");
+ return;
+ }
+
+ spin_lock_irqsave(&efivars->lock, flags);
+ found = false;
+ while (1) {
+ variable_name_size = 1024;
+ status = efivars->ops->get_next_variable(
+ &variable_name_size,
+ variable_name,
+ &vendor);
+ if (status != EFI_SUCCESS) {
+ break;
+ } else {
+ if (!variable_is_present(variable_name,
+ &vendor)) {
+ found = true;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&efivars->lock, flags);
+
+ if (!found) {
+ kfree(variable_name);
+ break;
+ } else
+ efivar_create_sysfs_entry(efivars,
+ variable_name_size,
+ variable_name, &vendor);
+ }
+}
+
/*
* For now we register the efi subsystem with the firmware subsystem
* and the vars subsystem with the efi subsystem. In the future, it
@@ -1255,6 +1350,7 @@ static void __exit
efivars_exit(void)
{
if (efi_enabled) {
+ cancel_work_sync(&efivar_work);
unregister_efivars(&__efivars);
kobject_put(efi_kobj);
}
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 103adc6..cecdf58 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -641,7 +641,8 @@ struct efivars {
* 1) ->list - adds, removals, reads, writes
* 2) ops.[gs]et_variable() calls.
* It must not be held when creating sysfs entries or calling kmalloc.
- * ops.get_next_variable() is only called from register_efivars(),
+ * ops.get_next_variable() is only called from register_efivars()
+ * or efivar_update_sysfs_entry(),
* which is protected by the BKL, so that path is safe.
*/
spinlock_t lock;
-- 1.7.1
|
|
From: Seiji A. <sei...@hd...> - 2012-08-21 16:53:48
|
[Problem]
Currently, efivars doesn't disable interrupt while taking efivars->lock.
So, there is a risk to be deadlocking in a write callback of efi_pstore
if kernel panics in interrupt context while taking efivars->lock.
[Patch Description]
This patch disables an external interruption while holding efivars->lock as follows
- In efi_pstore_open()/efi_pstore_close(), replace spin_lock/spin_unlock with spin_lock_irq/spin_unlock_irq
because they are called in a process context when users access to /dev/pstore.
- In unregister_efivars(), replace them with spin_lock_irq/spin_unlock_irq
because they are called in a process context when unloading this module.
- Also, in efivar_create()/efivar_store_raw()/efivar_delete(), replace them with spin_lock_irq/spin_unlock_irq
because they are called in a process context when users access to /sys/firmware/efi/vars/{new_var|del_var}.
- In other function calls, replace spin_lock/spin_unlock them with spin_lock_irqsave/spin_unlock_irqrestore.
Signed-off-by: Seiji Aguchi <sei...@hd...>
---
drivers/firmware/efivars.c | 43 +++++++++++++++++++++++--------------------
1 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 47408e8..bd1df01 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -393,10 +393,11 @@ static efi_status_t
get_var_data(struct efivars *efivars, struct efi_variable *var)
{
efi_status_t status;
+ unsigned long flags;
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
status = get_var_data_locked(efivars, var);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: get_variable() failed 0x%lx!\n",
@@ -514,14 +515,14 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count)
return -EINVAL;
}
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
status = efivars->ops->set_variable(new_var->VariableName,
&new_var->VendorGuid,
new_var->Attributes,
new_var->DataSize,
new_var->Data);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
@@ -632,7 +633,7 @@ static int efi_pstore_open(struct pstore_info *psi)
{
struct efivars *efivars = psi->data;
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
efivars->walk_entry = list_first_entry(&efivars->list,
struct efivar_entry, list);
return 0;
@@ -642,7 +643,7 @@ static int efi_pstore_close(struct pstore_info *psi)
{
struct efivars *efivars = psi->data;
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return 0;
}
@@ -696,11 +697,12 @@ static int efi_pstore_write(enum pstore_type_id type,
struct efivars *efivars = psi->data;
struct efivar_entry *entry, *found = NULL;
int i, ret = 0;
+ unsigned long flags;
sprintf(stub_name, "dump-type%u-%u-", type, part);
sprintf(name, "%s%lu", stub_name, get_seconds());
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
for (i = 0; i < DUMP_NAME_LEN; i++)
efi_name[i] = stub_name[i];
@@ -738,7 +740,7 @@ static int efi_pstore_write(enum pstore_type_id type,
efivars->ops->set_variable(efi_name, &vendor, PSTORE_EFI_ATTRIBUTES,
size, psi->buf);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
if (found)
efivar_unregister(found);
@@ -822,7 +824,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
return -EINVAL;
}
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
/*
* Does this variable already exist?
@@ -840,7 +842,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
}
}
if (found) {
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EINVAL;
}
@@ -854,10 +856,10 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
status);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EIO;
}
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
/* Create the entry in sysfs. Locking is not required here */
status = efivar_create_sysfs_entry(efivars,
@@ -885,7 +887,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
/*
* Does this variable already exist?
@@ -903,7 +905,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
}
}
if (!found) {
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EINVAL;
}
/* force the Attributes/DataSize to 0 to ensure deletion */
@@ -919,12 +921,12 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
if (status != EFI_SUCCESS) {
printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
status);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
return -EIO;
}
list_del(&search_efivar->list);
/* We need to release this lock before unregistering. */
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
efivar_unregister(search_efivar);
/* It's dead Jim.... */
@@ -993,6 +995,7 @@ efivar_create_sysfs_entry(struct efivars *efivars,
int i, short_name_size = variable_name_size / sizeof(efi_char16_t) + 38;
char *short_name;
struct efivar_entry *new_efivar;
+ unsigned long flags;
short_name = kzalloc(short_name_size + 1, GFP_KERNEL);
new_efivar = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
@@ -1032,9 +1035,9 @@ efivar_create_sysfs_entry(struct efivars *efivars,
kfree(short_name);
short_name = NULL;
- spin_lock(&efivars->lock);
+ spin_lock_irqsave(&efivars->lock, flags);
list_add(&new_efivar->list, &efivars->list);
- spin_unlock(&efivars->lock);
+ spin_unlock_irqrestore(&efivars->lock, flags);
return 0;
}
@@ -1103,9 +1106,9 @@ void unregister_efivars(struct efivars *efivars)
struct efivar_entry *entry, *n;
list_for_each_entry_safe(entry, n, &efivars->list, list) {
- spin_lock(&efivars->lock);
+ spin_lock_irq(&efivars->lock);
list_del(&entry->list);
- spin_unlock(&efivars->lock);
+ spin_unlock_irq(&efivars->lock);
efivar_unregister(entry);
}
if (efivars->new_var)
-- 1.7.1
|