You can subscribe to this list here.
| 2009 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(32) |
Jun
(66) |
Jul
(102) |
Aug
(78) |
Sep
(106) |
Oct
(137) |
Nov
(147) |
Dec
(147) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2010 |
Jan
(71) |
Feb
(139) |
Mar
(86) |
Apr
(76) |
May
(57) |
Jun
(10) |
Jul
(12) |
Aug
(6) |
Sep
(8) |
Oct
(12) |
Nov
(12) |
Dec
(18) |
| 2011 |
Jan
(16) |
Feb
(19) |
Mar
(3) |
Apr
(1) |
May
(16) |
Jun
(17) |
Jul
(74) |
Aug
(22) |
Sep
(18) |
Oct
(24) |
Nov
(21) |
Dec
(30) |
| 2012 |
Jan
(31) |
Feb
(16) |
Mar
(22) |
Apr
(25) |
May
(18) |
Jun
(13) |
Jul
(83) |
Aug
(49) |
Sep
(20) |
Oct
(60) |
Nov
(35) |
Dec
(28) |
| 2013 |
Jan
(39) |
Feb
(61) |
Mar
(35) |
Apr
(21) |
May
(45) |
Jun
(56) |
Jul
(20) |
Aug
(9) |
Sep
(10) |
Oct
(31) |
Nov
(8) |
Dec
(4) |
| 2014 |
Jan
(6) |
Feb
(7) |
Mar
(7) |
Apr
(6) |
May
(4) |
Jun
(8) |
Jul
(5) |
Aug
(2) |
Sep
(4) |
Oct
(4) |
Nov
(11) |
Dec
(5) |
| 2015 |
Jan
(4) |
Feb
(4) |
Mar
(3) |
Apr
(4) |
May
(9) |
Jun
(4) |
Jul
(15) |
Aug
(8) |
Sep
(16) |
Oct
(18) |
Nov
(15) |
Dec
(7) |
| 2016 |
Jan
(20) |
Feb
(9) |
Mar
(15) |
Apr
(24) |
May
(16) |
Jun
(28) |
Jul
(22) |
Aug
(23) |
Sep
(18) |
Oct
(30) |
Nov
(40) |
Dec
(9) |
| 2017 |
Jan
(1) |
Feb
(8) |
Mar
(37) |
Apr
(26) |
May
(25) |
Jun
(46) |
Jul
(24) |
Aug
(9) |
Sep
|
Oct
|
Nov
|
Dec
|
|
From: Masami H. <mhi...@re...> - 2009-11-24 21:54:16
|
Move signal sending event to events/signal.h. This patch also renames
sched_signal_send event to signal_generate.
Changes in v4:
- Fix a typo of task_struct pointer.
Changes in v3:
- Add docbook style comments
Changes in v2:
- Add siginfo argument
- Add siginfo storing macro
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Reviewed-by: Jason Baron <jb...@re...>
Cc: Oleg Nesterov <ol...@re...>
Cc: Roland McGrath <ro...@re...>
Cc: Ingo Molnar <mi...@el...>
---
Documentation/DocBook/tracepoint.tmpl | 5 +++
include/trace/events/sched.h | 25 -------------
include/trace/events/signal.h | 66 +++++++++++++++++++++++++++++++++
kernel/signal.c | 5 ++-
4 files changed, 74 insertions(+), 27 deletions(-)
create mode 100644 include/trace/events/signal.h
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
index b0756d0..8bca1d5 100644
--- a/Documentation/DocBook/tracepoint.tmpl
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -86,4 +86,9 @@
!Iinclude/trace/events/irq.h
</chapter>
+ <chapter id="signal">
+ <title>SIGNAL</title>
+!Iinclude/trace/events/signal.h
+ </chapter>
+
</book>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index b50b985..b221bb3 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -320,31 +320,6 @@ TRACE_EVENT(sched_process_fork,
);
/*
- * Tracepoint for sending a signal:
- */
-TRACE_EVENT(sched_signal_send,
-
- TP_PROTO(int sig, struct task_struct *p),
-
- TP_ARGS(sig, p),
-
- TP_STRUCT__entry(
- __field( int, sig )
- __array( char, comm, TASK_COMM_LEN )
- __field( pid_t, pid )
- ),
-
- TP_fast_assign(
- memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
- __entry->pid = p->pid;
- __entry->sig = sig;
- ),
-
- TP_printk("sig=%d comm=%s pid=%d",
- __entry->sig, __entry->comm, __entry->pid)
-);
-
-/*
* XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
* adding sched_stat support to SCHED_FIFO/RR would be welcome.
*/
diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
new file mode 100644
index 0000000..ef51756
--- /dev/null
+++ b/include/trace/events/signal.h
@@ -0,0 +1,66 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM signal
+
+#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SIGNAL_H
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#define TP_STORE_SIGINFO(__entry, info) \
+ do { \
+ if (info == SEND_SIG_NOINFO) { \
+ __entry->errno = 0; \
+ __entry->code = SI_USER; \
+ } else if (info == SEND_SIG_PRIV) { \
+ __entry->errno = 0; \
+ __entry->code = SI_KERNEL; \
+ } else { \
+ __entry->errno = info->si_errno; \
+ __entry->code = info->si_code; \
+ } \
+ } while (0)
+
+/**
+ * signal_generate - called when a signal is generated
+ * @sig: signal number
+ * @info: pointer to struct siginfo
+ * @task: pointer to struct task_struct
+ *
+ * Current process sends a 'sig' signal to 'task' process with
+ * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV,
+ * 'info' is not a pointer and you can't access its field. Instead,
+ * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV
+ * means that si_code is SI_KERNEL.
+ */
+TRACE_EVENT(signal_generate,
+
+ TP_PROTO(int sig, struct siginfo *info, struct task_struct *task),
+
+ TP_ARGS(sig, info, task),
+
+ TP_STRUCT__entry(
+ __field( int, sig )
+ __field( int, errno )
+ __field( int, code )
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ ),
+
+ TP_fast_assign(
+ __entry->sig = sig;
+ TP_STORE_SIGINFO(__entry, info);
+ memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+ __entry->pid = task->pid;
+ ),
+
+ TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d",
+ __entry->sig, __entry->errno, __entry->code,
+ __entry->comm, __entry->pid)
+);
+
+#endif /* _TRACE_SIGNAL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index fe08008..54ac4c5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,7 +28,8 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
-#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/signal.h>
#include <asm/param.h>
#include <asm/uaccess.h>
@@ -856,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
struct sigqueue *q;
int override_rlimit;
- trace_sched_signal_send(sig, t);
+ trace_signal_generate(sig, info, t);
assert_spin_locked(&t->sighand->siglock);
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-11-24 21:54:13
|
Hi,
These patches add signal related tracepoints including
signal generation, delivery, and loss. First patch also
moves signal-sending tracepoint from events/sched.h to
events/signal.h.
Changes in v4
- Fix typo of task_struct pointer
Changes in v3
- Add Docbook style comments
Changes in v2
- Add siginfo arguments
Thank you,
---
Masami Hiramatsu (3):
tracepoint: Add signal loss events
tracepoint: Add signal deliver event
tracepoint: Move signal sending tracepoint to events/signal.h
Documentation/DocBook/tracepoint.tmpl | 5 +
include/trace/events/sched.h | 25 -----
include/trace/events/signal.h | 173 +++++++++++++++++++++++++++++++++
kernel/signal.c | 27 ++++-
4 files changed, 198 insertions(+), 32 deletions(-)
create mode 100644 include/trace/events/signal.h
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-11-24 21:45:31
|
Oleg Nesterov wrote: > On 11/23, Ingo Molnar wrote: >> >> * Masami Hiramatsu<mhi...@re...> wrote: >> >>> Hi, >>> >>> These patches add signal related tracepoints including >>> signal generation, delivery, and loss. First patch also >>> moves signal-sending tracepoint from events/sched.h to >>> events/signal.h. >>> >>> Changes in v3 >>> - Add Docbook style comments >>> >>> Changes in v2 >>> - Add siginfo arguments >>> >>> Thank you, >>> >>> --- >>> >>> Masami Hiramatsu (3): >>> tracepoint: Add signal loss events >>> tracepoint: Add signal deliver event >>> tracepoint: Move signal sending tracepoint to events/signal.h >>> >>> >>> Documentation/DocBook/tracepoint.tmpl | 5 + >>> include/trace/events/sched.h | 25 ----- >>> include/trace/events/signal.h | 173 +++++++++++++++++++++++++++++++++ >>> kernel/signal.c | 27 ++++- >>> 4 files changed, 198 insertions(+), 32 deletions(-) >>> create mode 100644 include/trace/events/signal.h >> >> Would be nice to have Roland's and Oleg's Acked-by tags in the patches - >> to show that this is a representative and useful looking set of signal >> events. > > Sorry, I can't really comment these patches. > > I mean, I do not know which info is useful and which is not. > For example, I am a bit surprized we report trace_signal_lose_info() > but please do not consider this as if I think we shouldn't. Just I > do not know. > > OTOH, we do not report if __send_signal() fails just because the > legacy signal is already queued. We do not report who sends the signal, > we do not report if it was private or shared. zap_process, complete_signal > can "send" SIGKILL via sigaddset, this won't be noticed. But again, it is > not that I think this should be reported. > > In short: I think any info may be useful, and these patches can help. > But I do not understand what exactly should be reported to userspace. Yeah, any comments are welcome:-) IMHO, these tracepoints are just for providing options for users who care about who sent the signal, etc. Thank you, -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: Oleg N. <ol...@re...> - 2009-11-24 21:28:51
|
On 11/23, Ingo Molnar wrote: > > * Masami Hiramatsu <mhi...@re...> wrote: > > > Hi, > > > > These patches add signal related tracepoints including > > signal generation, delivery, and loss. First patch also > > moves signal-sending tracepoint from events/sched.h to > > events/signal.h. > > > > Changes in v3 > > - Add Docbook style comments > > > > Changes in v2 > > - Add siginfo arguments > > > > Thank you, > > > > --- > > > > Masami Hiramatsu (3): > > tracepoint: Add signal loss events > > tracepoint: Add signal deliver event > > tracepoint: Move signal sending tracepoint to events/signal.h > > > > > > Documentation/DocBook/tracepoint.tmpl | 5 + > > include/trace/events/sched.h | 25 ----- > > include/trace/events/signal.h | 173 +++++++++++++++++++++++++++++++++ > > kernel/signal.c | 27 ++++- > > 4 files changed, 198 insertions(+), 32 deletions(-) > > create mode 100644 include/trace/events/signal.h > > Would be nice to have Roland's and Oleg's Acked-by tags in the patches - > to show that this is a representative and useful looking set of signal > events. Sorry, I can't really comment these patches. I mean, I do not know which info is useful and which is not. For example, I am a bit surprized we report trace_signal_lose_info() but please do not consider this as if I think we shouldn't. Just I do not know. OTOH, we do not report if __send_signal() fails just because the legacy signal is already queued. We do not report who sends the signal, we do not report if it was private or shared. zap_process, complete_signal can "send" SIGKILL via sigaddset, this won't be noticed. But again, it is not that I think this should be reported. In short: I think any info may be useful, and these patches can help. But I do not understand what exactly should be reported to userspace. Oleg. |
|
From: Masami H. <mhi...@re...> - 2009-11-24 21:17:46
|
Frederic Weisbecker wrote: > On Tue, Nov 24, 2009 at 10:34:08AM -0500, Masami Hiramatsu wrote: >>> And this comment doesn't tell us much what this brings us. >>> The changelog tells it stands to avoid a text_mutex deadlock. >>> I'm not sure why we would deadlock without it. >> >> As Mathieu and I discussed on LKML (http://lkml.org/lkml/2009/11/21/187) >> text_mutex will be locked on the way of cpu-hotplug. >> Since kprobes locks text_mutex too and stop_machine() refers online_cpus, >> it will cause a dead-lock. So, I decided to use get_online_cpus() to >> locking hotplug while optimizing/unoptimizng. > > > Ah ok :) > Could you add a comment in the code that explains it? Sure, of course :-) -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: H. P. A. <hp...@zy...> - 2009-11-24 21:12:15
|
On 11/24/2009 12:14 PM, Frederic Weisbecker wrote: > > PS: hmm btw I remember about a patch that > tagged refrigerator() as __cold but it looks like it hasn't been > applied.... > Groan! That hurt! -hpa |
|
From: Masami H. <mhi...@re...> - 2009-11-24 21:00:29
|
Oleg Nesterov wrote: > On 11/20, Masami Hiramatsu wrote: >> >> +TRACE_EVENT(signal_generate, >> + >> + TP_PROTO(int sig, struct siginfo *info, struct task_struct *task), >> + >> + TP_ARGS(sig, info, p), > ^^^ > s/p/task/ ? Oops, right. Thank you for reviewing! > > Oleg. > -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: Oleg N. <ol...@re...> - 2009-11-24 21:00:19
|
On 11/20, Masami Hiramatsu wrote:
>
> +TRACE_EVENT(signal_generate,
> +
> + TP_PROTO(int sig, struct siginfo *info, struct task_struct *task),
> +
> + TP_ARGS(sig, info, p),
^^^
s/p/task/ ?
Oleg.
|
|
From: Masami H. <mhi...@re...> - 2009-11-24 21:00:04
|
Frederic Weisbecker wrote: > On Tue, Nov 24, 2009 at 10:34:16AM -0500, Masami Hiramatsu wrote: >> Frederic Weisbecker wrote: >>> I _might_ have understood. >>> You have set up the optimized flags, then you wait for >>> any old-style int 3 kprobes to complete and route >>> to detour buffer so that you can patch the jump >>> safely in the dead code? (and finish with first byte >>> by patching the int 3 itself) >>> >> >> Yeah, you might get almost correct answer. >> The reason why we have to wait scheduling on all processors >> is that this code may modify N instructions (not a single >> instruction). This means, there is a chance that 2nd to nth >> instructions are interrupted on other cpus when we start >> code modifying. > > > Aaah ok! > > In this case, you probably just need the synchronize_sched() > thing. The delayed work looks unnecessary. Yeah, the delayed work is for speeding up batch registration which kprobes are already supported. Sometimes ~100 probes can be set via batch registration I/F. >> Please imagine that 2nd instruction is interrupted and >> stop_machine() replaces the 2nd instruction with jump >> *address* while running interrupt handler. When the interrupt >> returns to original address, there is no valid instructions >> and it causes unexpected result. > > > Yeah. > > >> >> To avoid this situation, we have to wait a scheduler quiescent >> state on all cpus, because it also ensure that all current >> interruption are done. > > > Ok. > > >> This also excuses why we don't need to wait when unoptimizing >> and why it has not supported preemptive kernel yet. > > > I see...so the non-preemptible kernel requirement looks > hard to workaround :-s It's the next challenge I think :-) Even though, kprobes itself still work on preemptive kernel, so we don't lose any functionality. >> In unoptimizing case, since there is just a single instruction >> (jump), there is no nth instruction which can be interrupted. >> Thus we can just use a stop_machine(). :-) > > > Ok. > > >> >> On the preemptive kernel, waiting scheduling is not work as we >> see on non-preemptive kernel. Since processes can be preempted >> in interruption, we can't ensure that the current running >> interruption is done. (I assume that a pair of freeze_processes >> and thaw_processes may possibly ensure that, or maybe we can >> share some stack rewinding code with ksplice.) >> So it depends on !PREEMPT. > > > > Right. > However using freeze_processes() and thaw_processes() would be > probably too costly and it's not a guarantee that every processes > go to the refrigerator() :-), because some tasks are not freezable, > like the kernel threads by default if I remember well, unless they > call set_freezable(). That's a pity, we would just have needed > to set __kprobe in refrigerator(). Ah, right. Even though, we still have an option of ksplice code. Thank you, > PS: hmm btw I remember about a patch that > tagged refrigerator() as __cold but it looks like it hasn't been > applied.... > > Thanks. > -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: Frederic W. <fwe...@gm...> - 2009-11-24 20:20:17
|
On Tue, Nov 24, 2009 at 10:40:24AM -0500, Frank Ch. Eigler wrote: > Frederic Weisbecker <fwe...@gm...> writes: > > > [...] > >> +#define SAVE_REGS_STRING \ > >> +#define RESTORE_REGS_STRING \ > > > > BTW, do you really need to push/pop every registers > > before/after calling a probe handler? > > It's part of the definition of a kprobe, that a populated > pt_regs* value is passed. Clients can rely on that in order > to access registers etc. > > - FChE Yeah I made a confusion. Sorry. |
|
From: Frederic W. <fwe...@gm...> - 2009-11-24 20:19:36
|
On Tue, Nov 24, 2009 at 10:39:13AM -0500, Masami Hiramatsu wrote: > Frederic Weisbecker wrote: > > On Mon, Nov 23, 2009 at 06:22:04PM -0500, Masami Hiramatsu wrote: > >> +#ifdef CONFIG_X86_64 > >> +#define SAVE_REGS_STRING \ > >> + /* Skip cs, ip, orig_ax. */ \ > >> + " subq $24, %rsp\n" \ > >> + " pushq %rdi\n" \ > >> + " pushq %rsi\n" \ > >> + " pushq %rdx\n" \ > >> + " pushq %rcx\n" \ > >> + " pushq %rax\n" \ > >> + " pushq %r8\n" \ > >> + " pushq %r9\n" \ > >> + " pushq %r10\n" \ > >> + " pushq %r11\n" \ > >> + " pushq %rbx\n" \ > >> + " pushq %rbp\n" \ > >> + " pushq %r12\n" \ > >> + " pushq %r13\n" \ > >> + " pushq %r14\n" \ > >> + " pushq %r15\n" > >> +#define RESTORE_REGS_STRING \ > >> + " popq %r15\n" \ > >> + " popq %r14\n" \ > >> + " popq %r13\n" \ > >> + " popq %r12\n" \ > >> + " popq %rbp\n" \ > >> + " popq %rbx\n" \ > >> + " popq %r11\n" \ > >> + " popq %r10\n" \ > >> + " popq %r9\n" \ > >> + " popq %r8\n" \ > >> + " popq %rax\n" \ > >> + " popq %rcx\n" \ > >> + " popq %rdx\n" \ > >> + " popq %rsi\n" \ > >> + " popq %rdi\n" \ > > > > > > BTW, do you really need to push/pop every registers > > before/after calling a probe handler? > > Yes, in both cases (kretprobe/optprpbe) it needs to > emulate kprobes behavior. kprobes can be used as > fault injection, it should pop pt_regs. > > > Is it possible to only save/restore the scratch ones? > > Hmm, what code did you mean? Ah this chain of push/pop is there to dump a struct pt_regs for the handler? Sorry, I just thought it was to save the registers from the probed function. |
|
From: Frederic W. <fwe...@gm...> - 2009-11-24 20:14:13
|
On Tue, Nov 24, 2009 at 10:34:16AM -0500, Masami Hiramatsu wrote: > Frederic Weisbecker wrote: > > I _might_ have understood. > > You have set up the optimized flags, then you wait for > > any old-style int 3 kprobes to complete and route > > to detour buffer so that you can patch the jump > > safely in the dead code? (and finish with first byte > > by patching the int 3 itself) > > > > Yeah, you might get almost correct answer. > The reason why we have to wait scheduling on all processors > is that this code may modify N instructions (not a single > instruction). This means, there is a chance that 2nd to nth > instructions are interrupted on other cpus when we start > code modifying. Aaah ok! In this case, you probably just need the synchronize_sched() thing. The delayed work looks unnecessary. > Please imagine that 2nd instruction is interrupted and > stop_machine() replaces the 2nd instruction with jump > *address* while running interrupt handler. When the interrupt > returns to original address, there is no valid instructions > and it causes unexpected result. Yeah. > > To avoid this situation, we have to wait a scheduler quiescent > state on all cpus, because it also ensure that all current > interruption are done. Ok. > This also excuses why we don't need to wait when unoptimizing > and why it has not supported preemptive kernel yet. I see...so the non-preemptible kernel requirement looks hard to workaround :-s > In unoptimizing case, since there is just a single instruction > (jump), there is no nth instruction which can be interrupted. > Thus we can just use a stop_machine(). :-) Ok. > > On the preemptive kernel, waiting scheduling is not work as we > see on non-preemptive kernel. Since processes can be preempted > in interruption, we can't ensure that the current running > interruption is done. (I assume that a pair of freeze_processes > and thaw_processes may possibly ensure that, or maybe we can > share some stack rewinding code with ksplice.) > So it depends on !PREEMPT. Right. However using freeze_processes() and thaw_processes() would be probably too costly and it's not a guarantee that every processes go to the refrigerator() :-), because some tasks are not freezable, like the kernel threads by default if I remember well, unless they call set_freezable(). That's a pity, we would just have needed to set __kprobe in refrigerator(). PS: hmm btw I remember about a patch that tagged refrigerator() as __cold but it looks like it hasn't been applied.... Thanks. |
|
From: Frederic W. <fwe...@gm...> - 2009-11-24 19:45:45
|
On Tue, Nov 24, 2009 at 10:34:08AM -0500, Masami Hiramatsu wrote: > > And this comment doesn't tell us much what this brings us. > > The changelog tells it stands to avoid a text_mutex deadlock. > > I'm not sure why we would deadlock without it. > > As Mathieu and I discussed on LKML (http://lkml.org/lkml/2009/11/21/187) > text_mutex will be locked on the way of cpu-hotplug. > Since kprobes locks text_mutex too and stop_machine() refers online_cpus, > it will cause a dead-lock. So, I decided to use get_online_cpus() to > locking hotplug while optimizing/unoptimizng. Ah ok :) Could you add a comment in the code that explains it? Thanks. |
|
From: Masami H. <mhi...@re...> - 2009-11-24 17:49:21
|
Jason Baron wrote:
[...]
>> +/*
>> + * Cross-modifying kernel text with stop_machine().
>> + * This code originally comes from immediate value.
>> + * This does _not_ protect against NMI and MCE. However,
>> + * since kprobes can't probe NMI/MCE handler, it is OK for kprobes.
>> + */
>> +static atomic_t stop_machine_first;
>> +static int wrote_text;
>> +
>> +struct text_poke_param {
>> + void *addr;
>> + const void *opcode;
>> + size_t len;
>> +};
>> +
>> +static int __kprobes stop_machine_multibyte_poke(void *data)
>> +{
>> + struct text_poke_param *tpp = data;
>> +
>> + if (atomic_dec_and_test(&stop_machine_first)) {
>> + text_poke(tpp->addr, tpp->opcode, tpp->len);
>> + smp_wmb(); /* Make sure other cpus see that this has run */
>> + wrote_text = 1;
>> + } else {
>> + while (!wrote_text)
>> + smp_rmb();
>> + sync_core();
>> + }
>> +
>> + flush_icache_range((unsigned long)tpp->addr,
>> + (unsigned long)tpp->addr + tpp->len);
>> + return 0;
>> +}
>> +
>> +static void *__kprobes __multibyte_poke(void *addr, const void *opcode,
>> + size_t len)
>> +{
>> + struct text_poke_param tpp;
>> +
>> + tpp.addr = addr;
>> + tpp.opcode = opcode;
>> + tpp.len = len;
>> + atomic_set(&stop_machine_first, 1);
>> + wrote_text = 0;
>> + stop_machine(stop_machine_multibyte_poke, (void *)&tpp, NULL);
>> + return addr;
>> +}
>
> As you know, I'd like to have the jump label optimization for
> tracepoints, make use of this '__multibyte_poke()' interface. So perhaps
> it can be moved to arch/x86/kernel/alternative.c. This is where 'text_poke()'
> and friends currently live.
Hmm, maybe current text_poke needs to have singlebyte_poke() wrapper
for avoiding confusion.
> Also, with multiple users we don't want to trample over each others code
> patching. Thus, if each sub-system could register some type of
> 'is_reserved()' callback, and then we can call all these call backs from
> the '__multibyte_poke()' routine before we do any patching to make sure
> that we aren't trampling on each others code. After a successful
> patching, each sub-system can update its reserved set of code as
> appropriate. I can code a prototype here, if this makes sense.
Hmm, we have to implement it carefully, because here kprobes already
inserted int3 and optprobe rewrites the int3 again. If is_reserved()
returns 1 and multibyte_poke returns error, we can't optimize it anymore.
Thank you,
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-11-24 17:01:47
|
Hi Peter, H. Peter Anvin wrote: > On 11/23/2009 03:22 PM, Masami Hiramatsu wrote: >> >> This uses stop_machine() for corss modifying code from int3 to jump. >> It doesn't allow us to modify code on NMI/SMI path. However, since >> kprobes itself doesn't support NMI/SMI code probing, it's not a >> problem. >> > > I'm a bit confused by the above statement... does that mean you're > poking int3 and *then* do stop_machine()? Yes, as I said in http://lkml.org/lkml/2009/11/24/310, there are two separated issues. ---- We have to separate below issues: - int3-based multi-bytes code replacement - multi-instruction replacement with int3-detour code The former is implemented on patch 9/10 and 10/10. As you can see, these patches are RFC status, because I'd like to wait for official reply of safeness from processor architects. And it may be able to use a dummy IPI for 2nd IPI because it just for waiting int3 interrupts. But again, it is just estimated that replacing with/recovering from int3 is automatically synchronized... However, at least stop_machine() method is officially described at "7.1.3 Handling Self- and Cross-Modifying Code" on the intel's software developer's manual 3A . So currently we can use it. For the latter issue, as I explained on previous reply, we need to wait all running interrupts including hardware interrupts. Thus I used synchronize_sched(). ---- So that the previous "x86 generic jump patching" patch is basically for single-instruction replacement. For multi-instructions replacement, we need to make detour code and wait for all running interruption. (of course, there are other static code limitations, as I described at "Safety check" section in patch 0/10.) Thank you, -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: H. P. A. <hp...@zy...> - 2009-11-24 16:42:06
|
On 11/23/2009 03:22 PM, Masami Hiramatsu wrote: > > This uses stop_machine() for corss modifying code from int3 to jump. > It doesn't allow us to modify code on NMI/SMI path. However, since > kprobes itself doesn't support NMI/SMI code probing, it's not a > problem. > I'm a bit confused by the above statement... does that mean you're poking int3 and *then* do stop_machine()? -hpa -- H. Peter Anvin, Intel Open Source Technology Center I work for Intel. I don't speak on their behalf. |
|
From: Jason B. <jb...@re...> - 2009-11-24 16:28:46
|
On Mon, Nov 23, 2009 at 06:22:11PM -0500, Masami Hiramatsu wrote:
> Introduce x86 arch-specific optimization code, which supports both of
> x86-32 and x86-64.
>
> This code also supports safety checking, which decodes whole of a function
> in which probe is inserted, and checks following conditions before
> optimization:
> - The optimized instructions which will be replaced by a jump instruction
> don't straddle the function boundary.
> - There is no indirect jump instruction, because it will jumps into
> the address range which is replaced by jump operand.
> - There is no jump/loop instruction which jumps into the address range
> which is replaced by jump operand.
> - Don't optimize kprobes if it is in functions into which fixup code will
> jumps.
>
> This uses stop_machine() for corss modifying code from int3 to jump.
> It doesn't allow us to modify code on NMI/SMI path. However, since
> kprobes itself doesn't support NMI/SMI code probing, it's not a
> problem.
>
> Changes in v5:
> - Introduce stop_machine-based jump replacing.
>
> Signed-off-by: Masami Hiramatsu <mhi...@re...>
> Cc: Ananth N Mavinakayanahalli <an...@in...>
> Cc: Ingo Molnar <mi...@el...>
> Cc: Jim Keniston <jke...@us...>
> Cc: Srikar Dronamraju <sr...@li...>
> Cc: Christoph Hellwig <hc...@in...>
> Cc: Steven Rostedt <ro...@go...>
> Cc: Frederic Weisbecker <fwe...@gm...>
> Cc: H. Peter Anvin <hp...@zy...>
> Cc: Anders Kaseorg <an...@ks...>
> Cc: Tim Abbott <ta...@ks...>
> Cc: Andi Kleen <an...@fi...>
> Cc: Jason Baron <jb...@re...>
> Cc: Mathieu Desnoyers <mat...@po...>
> ---
>
> arch/x86/Kconfig | 1
> arch/x86/include/asm/kprobes.h | 29 +++
> arch/x86/kernel/kprobes.c | 457 ++++++++++++++++++++++++++++++++++++++--
> 3 files changed, 465 insertions(+), 22 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 17abcfa..af0313e 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -31,6 +31,7 @@ config X86
> select ARCH_WANT_FRAME_POINTERS
> select HAVE_DMA_ATTRS
> select HAVE_KRETPROBES
> + select HAVE_OPTPROBES
> select HAVE_FTRACE_MCOUNT_RECORD
> select HAVE_DYNAMIC_FTRACE
> select HAVE_FUNCTION_TRACER
> diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
> index eaec8ea..4ffa345 100644
> --- a/arch/x86/include/asm/kprobes.h
> +++ b/arch/x86/include/asm/kprobes.h
> @@ -33,6 +33,9 @@ struct kprobe;
> typedef u8 kprobe_opcode_t;
> #define BREAKPOINT_INSTRUCTION 0xcc
> #define RELATIVEJUMP_OPCODE 0xe9
> +#define RELATIVEJUMP_SIZE 5
> +#define RELATIVECALL_OPCODE 0xe8
> +#define RELATIVE_ADDR_SIZE 4
> #define MAX_INSN_SIZE 16
> #define MAX_STACK_SIZE 64
> #define MIN_STACK_SIZE(ADDR) \
> @@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
>
> #define flush_insn_slot(p) do { } while (0)
>
> +/* optinsn template addresses */
> +extern kprobe_opcode_t optprobe_template_entry;
> +extern kprobe_opcode_t optprobe_template_val;
> +extern kprobe_opcode_t optprobe_template_call;
> +extern kprobe_opcode_t optprobe_template_end;
> +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
> +#define MAX_OPTINSN_SIZE \
> + (((unsigned long)&optprobe_template_end - \
> + (unsigned long)&optprobe_template_entry) + \
> + MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
> +
> extern const int kretprobe_blacklist_size;
>
> void arch_remove_kprobe(struct kprobe *p);
> @@ -64,6 +78,21 @@ struct arch_specific_insn {
> int boostable;
> };
>
> +struct arch_optimized_insn {
> + /* copy of the original instructions */
> + kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
> + /* detour code buffer */
> + kprobe_opcode_t *insn;
> + /* the size of instructions copied to detour code buffer */
> + size_t size;
> +};
> +
> +/* Return true (!0) if optinsn is prepared for optimization. */
> +static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
> +{
> + return optinsn->size;
> +}
> +
> struct prev_kprobe {
> struct kprobe *kp;
> unsigned long status;
> diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
> index 73ac21e..6d81c11 100644
> --- a/arch/x86/kernel/kprobes.c
> +++ b/arch/x86/kernel/kprobes.c
> @@ -49,6 +49,7 @@
> #include <linux/module.h>
> #include <linux/kdebug.h>
> #include <linux/kallsyms.h>
> +#include <linux/stop_machine.h>
>
> #include <asm/cacheflush.h>
> #include <asm/desc.h>
> @@ -106,16 +107,21 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
> };
> const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
>
> -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
> -static void __kprobes set_jmp_op(void *from, void *to)
> +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
> {
> - struct __arch_jmp_op {
> - char op;
> + struct __arch_relative_insn {
> + u8 op;
> s32 raddr;
> - } __attribute__((packed)) * jop;
> - jop = (struct __arch_jmp_op *)from;
> - jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
> - jop->op = RELATIVEJUMP_OPCODE;
> + } __attribute__((packed)) *insn;
> + insn = (struct __arch_relative_insn *)from;
> + insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
> + insn->op = op;
> +}
> +
> +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
> +static void __kprobes synthesize_reljump(void *from, void *to)
> +{
> + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
> }
>
> /*
> @@ -202,7 +208,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
> /*
> * Basically, kp->ainsn.insn has an original instruction.
> * However, RIP-relative instruction can not do single-stepping
> - * at different place, fix_riprel() tweaks the displacement of
> + * at different place, __copy_instruction() tweaks the displacement of
> * that instruction. In that case, we can't recover the instruction
> * from the kp->ainsn.insn.
> *
> @@ -284,21 +290,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
> }
>
> /*
> - * Adjust the displacement if the instruction uses the %rip-relative
> - * addressing mode.
> + * Copy an instruction and adjust the displacement if the instruction
> + * uses the %rip-relative addressing mode.
> * If it does, Return the address of the 32-bit displacement word.
> * If not, return null.
> * Only applicable to 64-bit x86.
> */
> -static void __kprobes fix_riprel(struct kprobe *p)
> +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
> {
> -#ifdef CONFIG_X86_64
> struct insn insn;
> - kernel_insn_init(&insn, p->ainsn.insn);
> + int ret;
> + kprobe_opcode_t buf[MAX_INSN_SIZE];
> +
> + kernel_insn_init(&insn, src);
> + if (recover) {
> + insn_get_opcode(&insn);
> + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
> + ret = recover_probed_instruction(buf,
> + (unsigned long)src);
> + if (ret)
> + return 0;
> + kernel_insn_init(&insn, buf);
> + }
> + }
> + insn_get_length(&insn);
> + memcpy(dest, insn.kaddr, insn.length);
>
> +#ifdef CONFIG_X86_64
> if (insn_rip_relative(&insn)) {
> s64 newdisp;
> u8 *disp;
> + kernel_insn_init(&insn, dest);
> insn_get_displacement(&insn);
> /*
> * The copied instruction uses the %rip-relative addressing
> @@ -312,20 +334,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
> * extension of the original signed 32-bit displacement would
> * have given.
> */
> - newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
> - (u8 *) p->ainsn.insn;
> + newdisp = (u8 *) src + (s64) insn.displacement.value -
> + (u8 *) dest;
> BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
> - disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
> + disp = (u8 *) dest + insn_offset_displacement(&insn);
> *(s32 *) disp = (s32) newdisp;
> }
> #endif
> + return insn.length;
> }
>
> static void __kprobes arch_copy_kprobe(struct kprobe *p)
> {
> - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
> -
> - fix_riprel(p);
> + /*
> + * Copy an instruction without recovering int3, because it will be
> + * put by another subsystem.
> + */
> + __copy_instruction(p->ainsn.insn, p->addr, 0);
>
> if (can_boost(p->addr))
> p->ainsn.boostable = 0;
> @@ -414,9 +439,20 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
> *sara = (unsigned long) &kretprobe_trampoline;
> }
>
> +#ifdef CONFIG_OPTPROBES
> +static int __kprobes setup_detour_execution(struct kprobe *p,
> + struct pt_regs *regs,
> + int reenter);
> +#else
> +#define setup_detour_execution(p, regs, reenter) (0)
> +#endif
> +
> static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
> struct kprobe_ctlblk *kcb, int reenter)
> {
> + if (setup_detour_execution(p, regs, reenter))
> + return;
> +
> #if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
> if (p->ainsn.boostable == 1 && !p->post_handler) {
> /* Boost up -- we can execute copied instructions directly */
> @@ -812,8 +848,8 @@ static void __kprobes resume_execution(struct kprobe *p,
> * These instructions can be executed directly if it
> * jumps back to correct address.
> */
> - set_jmp_op((void *)regs->ip,
> - (void *)orig_ip + (regs->ip - copy_ip));
> + synthesize_reljump((void *)regs->ip,
> + (void *)orig_ip + (regs->ip - copy_ip));
> p->ainsn.boostable = 1;
> } else {
> p->ainsn.boostable = -1;
> @@ -1040,6 +1076,383 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
> return 0;
> }
>
> +
> +#ifdef CONFIG_OPTPROBES
> +
> +/* Insert a call instruction at address 'from', which calls address 'to'.*/
> +static void __kprobes synthesize_relcall(void *from, void *to)
> +{
> + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
> +}
> +
> +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
> +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
> + unsigned long val)
> +{
> +#ifdef CONFIG_X86_64
> + *addr++ = 0x48;
> + *addr++ = 0xbf;
> +#else
> + *addr++ = 0xb8;
> +#endif
> + *(unsigned long *)addr = val;
> +}
> +
> +void __kprobes kprobes_optinsn_template_holder(void)
> +{
> + asm volatile (
> + ".global optprobe_template_entry\n"
> + "optprobe_template_entry: \n"
> +#ifdef CONFIG_X86_64
> + /* We don't bother saving the ss register */
> + " pushq %rsp\n"
> + " pushfq\n"
> + SAVE_REGS_STRING
> + " movq %rsp, %rsi\n"
> + ".global optprobe_template_val\n"
> + "optprobe_template_val: \n"
> + ASM_NOP5
> + ASM_NOP5
> + ".global optprobe_template_call\n"
> + "optprobe_template_call: \n"
> + ASM_NOP5
> + /* Move flags to rsp */
> + " movq 144(%rsp), %rdx\n"
> + " movq %rdx, 152(%rsp)\n"
> + RESTORE_REGS_STRING
> + /* Skip flags entry */
> + " addq $8, %rsp\n"
> + " popfq\n"
> +#else /* CONFIG_X86_32 */
> + " pushf\n"
> + SAVE_REGS_STRING
> + " movl %esp, %edx\n"
> + ".global optprobe_template_val\n"
> + "optprobe_template_val: \n"
> + ASM_NOP5
> + ".global optprobe_template_call\n"
> + "optprobe_template_call: \n"
> + ASM_NOP5
> + RESTORE_REGS_STRING
> + " addl $4, %esp\n" /* skip cs */
> + " popf\n"
> +#endif
> + ".global optprobe_template_end\n"
> + "optprobe_template_end: \n");
> +}
> +
> +#define TMPL_MOVE_IDX \
> + ((long)&optprobe_template_val - (long)&optprobe_template_entry)
> +#define TMPL_CALL_IDX \
> + ((long)&optprobe_template_call - (long)&optprobe_template_entry)
> +#define TMPL_END_IDX \
> + ((long)&optprobe_template_end - (long)&optprobe_template_entry)
> +
> +#define INT3_SIZE sizeof(kprobe_opcode_t)
> +
> +/* Optimized kprobe call back function: called from optinsn */
> +static void __kprobes optimized_callback(struct optimized_kprobe *op,
> + struct pt_regs *regs)
> +{
> + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
> +
> + preempt_disable();
> + if (kprobe_running()) {
> + kprobes_inc_nmissed_count(&op->kp);
> + } else {
> + /* Save skipped registers */
> +#ifdef CONFIG_X86_64
> + regs->cs = __KERNEL_CS;
> +#else
> + regs->cs = __KERNEL_CS | get_kernel_rpl();
> + regs->gs = 0;
> +#endif
> + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
> + regs->orig_ax = ~0UL;
> +
> + __get_cpu_var(current_kprobe) = &op->kp;
> + kcb->kprobe_status = KPROBE_HIT_ACTIVE;
> + opt_pre_handler(&op->kp, regs);
> + __get_cpu_var(current_kprobe) = NULL;
> + }
> + preempt_enable_no_resched();
> +}
> +
> +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
> +{
> + int len = 0, ret;
> + while (len < RELATIVEJUMP_SIZE) {
> + ret = __copy_instruction(dest + len, src + len, 1);
> + if (!ret || !can_boost(dest + len))
> + return -EINVAL;
> + len += ret;
> + }
> + return len;
> +}
> +
> +/* Check whether insn is indirect jump */
> +static int __kprobes insn_is_indirect_jump(struct insn *insn)
> +{
> + return (insn->opcode.bytes[0] == 0xff ||
> + insn->opcode.bytes[0] == 0xea);
> +}
> +
> +/* Check whether insn jumps into specified address range */
> +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
> +{
> + unsigned long target = 0;
> + switch (insn->opcode.bytes[0]) {
> + case 0xe0: /* loopne */
> + case 0xe1: /* loope */
> + case 0xe2: /* loop */
> + case 0xe3: /* jcxz */
> + case 0xe9: /* near relative jump */
> + case 0xeb: /* short relative jump */
> + break;
> + case 0x0f:
> + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
> + break;
> + return 0;
> + default:
> + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
> + break;
> + return 0;
> + }
> + target = (unsigned long)insn->next_byte + insn->immediate.value;
> + return (start <= target && target <= start + len);
> +}
> +
> +/* Decode whole function to ensure any instructions don't jump into target */
> +static int __kprobes can_optimize(unsigned long paddr)
> +{
> + int ret;
> + unsigned long addr, size = 0, offset = 0;
> + struct insn insn;
> + kprobe_opcode_t buf[MAX_INSN_SIZE];
> + /* Dummy buffers for lookup_symbol_attrs */
> + static char __dummy_buf[KSYM_NAME_LEN];
> +
> + /* Lookup symbol including addr */
> + if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
> + return 0;
> +
> + /* Check there is enough space for a relative jump. */
> + if (size - offset < RELATIVEJUMP_SIZE)
> + return 0;
> +
> + /* Decode instructions */
> + addr = paddr - offset;
> + while (addr < paddr - offset + size) { /* Decode until function end */
> + if (search_exception_tables(addr))
> + /*
> + * Since some fixup code will jumps into this function,
> + * we can't optimize kprobe in this function.
> + */
> + return 0;
> + kernel_insn_init(&insn, (void *)addr);
> + insn_get_opcode(&insn);
> + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
> + ret = recover_probed_instruction(buf, addr);
> + if (ret)
> + return 0;
> + kernel_insn_init(&insn, buf);
> + }
> + insn_get_length(&insn);
> + /* Recover address */
> + insn.kaddr = (void *)addr;
> + insn.next_byte = (void *)(addr + insn.length);
> + /* Check any instructions don't jump into target */
> + if (insn_is_indirect_jump(&insn) ||
> + insn_jump_into_range(&insn, paddr + INT3_SIZE,
> + RELATIVE_ADDR_SIZE))
> + return 0;
> + addr += insn.length;
> + }
> +
> + return 1;
> +}
> +
> +/* Check optimized_kprobe can actually be optimized. */
> +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
> +{
> + int i;
> + for (i = 1; i < op->optinsn.size; i++)
> + if (get_kprobe(op->kp.addr + i))
> + return -EEXIST;
> + return 0;
> +}
> +
> +/* Check the addr is within the optimized instructions. */
> +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
> + unsigned long addr)
> +{
> + return ((unsigned long)op->kp.addr <= addr &&
> + (unsigned long)op->kp.addr + op->optinsn.size > addr);
> +}
> +
> +/* Free optimized instruction slot */
> +static __kprobes
> +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
> +{
> + if (op->optinsn.insn) {
> + free_optinsn_slot(op->optinsn.insn, dirty);
> + op->optinsn.insn = NULL;
> + op->optinsn.size = 0;
> + }
> +}
> +
> +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
> +{
> + __arch_remove_optimized_kprobe(op, 1);
> +}
> +
> +/*
> + * Copy replacing target instructions
> + * Target instructions MUST be relocatable (checked inside)
> + */
> +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
> +{
> + u8 *buf;
> + int ret;
> +
> + if (!can_optimize((unsigned long)op->kp.addr))
> + return -EILSEQ;
> +
> + op->optinsn.insn = get_optinsn_slot();
> + if (!op->optinsn.insn)
> + return -ENOMEM;
> +
> + buf = (u8 *)op->optinsn.insn;
> +
> + /* Copy instructions into the out-of-line buffer */
> + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
> + if (ret < 0) {
> + __arch_remove_optimized_kprobe(op, 0);
> + return ret;
> + }
> + op->optinsn.size = ret;
> +
> + /* Backup instructions which will be replaced by jump address */
> + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
> + RELATIVE_ADDR_SIZE);
> +
> + /* Copy arch-dep-instance from template */
> + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
> +
> + /* Set probe information */
> + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
> +
> + /* Set probe function call */
> + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
> +
> + /* Set returning jmp instruction at the tail of out-of-line buffer */
> + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
> + (u8 *)op->kp.addr + op->optinsn.size);
> +
> + flush_icache_range((unsigned long) buf,
> + (unsigned long) buf + TMPL_END_IDX +
> + op->optinsn.size + RELATIVEJUMP_SIZE);
> + return 0;
> +}
> +
> +/*
> + * Cross-modifying kernel text with stop_machine().
> + * This code originally comes from immediate value.
> + * This does _not_ protect against NMI and MCE. However,
> + * since kprobes can't probe NMI/MCE handler, it is OK for kprobes.
> + */
> +static atomic_t stop_machine_first;
> +static int wrote_text;
> +
> +struct text_poke_param {
> + void *addr;
> + const void *opcode;
> + size_t len;
> +};
> +
> +static int __kprobes stop_machine_multibyte_poke(void *data)
> +{
> + struct text_poke_param *tpp = data;
> +
> + if (atomic_dec_and_test(&stop_machine_first)) {
> + text_poke(tpp->addr, tpp->opcode, tpp->len);
> + smp_wmb(); /* Make sure other cpus see that this has run */
> + wrote_text = 1;
> + } else {
> + while (!wrote_text)
> + smp_rmb();
> + sync_core();
> + }
> +
> + flush_icache_range((unsigned long)tpp->addr,
> + (unsigned long)tpp->addr + tpp->len);
> + return 0;
> +}
> +
> +static void *__kprobes __multibyte_poke(void *addr, const void *opcode,
> + size_t len)
> +{
> + struct text_poke_param tpp;
> +
> + tpp.addr = addr;
> + tpp.opcode = opcode;
> + tpp.len = len;
> + atomic_set(&stop_machine_first, 1);
> + wrote_text = 0;
> + stop_machine(stop_machine_multibyte_poke, (void *)&tpp, NULL);
> + return addr;
> +}
As you know, I'd like to have the jump label optimization for
tracepoints, make use of this '__multibyte_poke()' interface. So perhaps
it can be moved to arch/x86/kernel/alternative.c. This is where 'text_poke()'
and friends currently live.
Also, with multiple users we don't want to trample over each others code
patching. Thus, if each sub-system could register some type of
'is_reserved()' callback, and then we can call all these call backs from
the '__multibyte_poke()' routine before we do any patching to make sure
that we aren't trampling on each others code. After a successful
patching, each sub-system can update its reserved set of code as
appropriate. I can code a prototype here, if this makes sense.
thanks,
-Jason
|
|
From: Masami H. <mhi...@re...> - 2009-11-24 16:04:43
|
Ingo Molnar wrote: > > * Frederic Weisbecker <fwe...@gm...> wrote: > >> On Tue, Nov 24, 2009 at 03:03:19AM +0100, Frederic Weisbecker wrote: >>> On Mon, Nov 23, 2009 at 06:21:16PM -0500, Masami Hiramatsu wrote: >>>> When the optimized-kprobe is hit before optimization, its handler >>>> changes IP(instruction pointer) to copied code and exits. So, the >>>> instructions which were copied to detour buffer are executed on the detour >>>> buffer. >>> >>> >>> >>> Hm, why is it playing such hybrid game there? >>> If I understand well, we have executed int 3, executed the >>> handler and we jump back to the detour buffer? >>> >> >> I got it, I think. We have instructions to patch. And the above turn >> this area into dead code, safe to patch. >> >> But still, stop_machine() seem to make it not necessary anymore. > > i think 'sending an IPI to all online CPUs' might be an adequate > sequence to make sure patching effects have propagated. I.e. an > smp_call_function() with a dummy function? Hmm, I assume that you mean waiting for all int3 handler. We have to separate below issues: - int3-based multi-bytes code replacement - multi-instruction replacement with int3-detour code The former is implemented on patch 9/10 and 10/10. As you can see, these patches are RFC status, because I'd like to wait for official reply of safeness from processor architects. And it may be able to use a dummy IPI for 2nd IPI because it just for waiting int3 interrupts. But again, it is just estimated that replacing with/recovering from int3 is automatically synchronized... However, at least stop_machine() method is officially described at "7.1.3 Handling Self- and Cross-Modifying Code" on the intel's software developer's manual 3A . So currently we can use it. For the latter issue, as I explained on previous reply, we need to wait all running interrupts including hardware interrupts. Thus I used synchronize_sched(). Thank you, -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: <fc...@re...> - 2009-11-24 15:41:06
|
Frederic Weisbecker <fwe...@gm...> writes: > [...] >> +#define SAVE_REGS_STRING \ >> +#define RESTORE_REGS_STRING \ > > BTW, do you really need to push/pop every registers > before/after calling a probe handler? It's part of the definition of a kprobe, that a populated pt_regs* value is passed. Clients can rely on that in order to access registers etc. - FChE |
|
From: Masami H. <mhi...@re...> - 2009-11-24 15:37:38
|
Frederic Weisbecker wrote: > On Mon, Nov 23, 2009 at 06:22:04PM -0500, Masami Hiramatsu wrote: >> +#ifdef CONFIG_X86_64 >> +#define SAVE_REGS_STRING \ >> + /* Skip cs, ip, orig_ax. */ \ >> + " subq $24, %rsp\n" \ >> + " pushq %rdi\n" \ >> + " pushq %rsi\n" \ >> + " pushq %rdx\n" \ >> + " pushq %rcx\n" \ >> + " pushq %rax\n" \ >> + " pushq %r8\n" \ >> + " pushq %r9\n" \ >> + " pushq %r10\n" \ >> + " pushq %r11\n" \ >> + " pushq %rbx\n" \ >> + " pushq %rbp\n" \ >> + " pushq %r12\n" \ >> + " pushq %r13\n" \ >> + " pushq %r14\n" \ >> + " pushq %r15\n" >> +#define RESTORE_REGS_STRING \ >> + " popq %r15\n" \ >> + " popq %r14\n" \ >> + " popq %r13\n" \ >> + " popq %r12\n" \ >> + " popq %rbp\n" \ >> + " popq %rbx\n" \ >> + " popq %r11\n" \ >> + " popq %r10\n" \ >> + " popq %r9\n" \ >> + " popq %r8\n" \ >> + " popq %rax\n" \ >> + " popq %rcx\n" \ >> + " popq %rdx\n" \ >> + " popq %rsi\n" \ >> + " popq %rdi\n" \ > > > BTW, do you really need to push/pop every registers > before/after calling a probe handler? Yes, in both cases (kretprobe/optprpbe) it needs to emulate kprobes behavior. kprobes can be used as fault injection, it should pop pt_regs. > Is it possible to only save/restore the scratch ones? Hmm, what code did you mean? Thank you, -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: Masami H. <mhi...@re...> - 2009-11-24 15:32:35
|
Frederic Weisbecker wrote:
> On Tue, Nov 24, 2009 at 03:44:19AM +0100, Frederic Weisbecker wrote:
>> On Mon, Nov 23, 2009 at 06:21:41PM -0500, Masami Hiramatsu wrote:
>>> +static void kprobe_optimizer(struct work_struct *work);
>>> +static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
>>> +#define OPTIMIZE_DELAY 5
>>> +
>>> +/* Kprobe jump optimizer */
>>> +static __kprobes void kprobe_optimizer(struct work_struct *work)
>>> +{
>>> + struct optimized_kprobe *op, *tmp;
>>> +
>>> + /* Lock modules while optimizing kprobes */
>>> + mutex_lock(&module_mutex);
>>> + mutex_lock(&kprobe_mutex);
>>> + if (kprobes_all_disarmed)
>>> + goto end;
>>> +
>>> + /* Wait quiesence period for ensuring all interrupts are done */
>>> + synchronize_sched();
>>
>>
>>
>> It's not clear to me why you are doing that.
>> Is this waiting for pending int 3 kprobes handlers
>> to complete? If so, why, and what does that prevent?
>
>
> I _might_ have understood.
> You have set up the optimized flags, then you wait for
> any old-style int 3 kprobes to complete and route
> to detour buffer so that you can patch the jump
> safely in the dead code? (and finish with first byte
> by patching the int 3 itself)
>
Yeah, you might get almost correct answer.
The reason why we have to wait scheduling on all processors
is that this code may modify N instructions (not a single
instruction). This means, there is a chance that 2nd to nth
instructions are interrupted on other cpus when we start
code modifying.
Please imagine that 2nd instruction is interrupted and
stop_machine() replaces the 2nd instruction with jump
*address* while running interrupt handler. When the interrupt
returns to original address, there is no valid instructions
and it causes unexpected result.
To avoid this situation, we have to wait a scheduler quiescent
state on all cpus, because it also ensure that all current
interruption are done.
This also excuses why we don't need to wait when unoptimizing
and why it has not supported preemptive kernel yet.
In unoptimizing case, since there is just a single instruction
(jump), there is no nth instruction which can be interrupted.
Thus we can just use a stop_machine(). :-)
On the preemptive kernel, waiting scheduling is not work as we
see on non-preemptive kernel. Since processes can be preempted
in interruption, we can't ensure that the current running
interruption is done. (I assume that a pair of freeze_processes
and thaw_processes may possibly ensure that, or maybe we can
share some stack rewinding code with ksplice.)
So it depends on !PREEMPT.
Thank you,
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-11-24 15:32:24
|
Hi Frederic,
Frederic Weisbecker wrote:
> On Mon, Nov 23, 2009 at 06:21:41PM -0500, Masami Hiramatsu wrote:
>> +config OPTPROBES
>> + bool "Kprobes jump optimization support (EXPERIMENTAL)"
>> + default y
>> + depends on KPROBES
>> + depends on !PREEMPT
>
>
> Why does it depends on !PREEMPT?
Oh, because it has not supported preemptive kernel yet.
(I'd like to tell you why in another mail)
>> @@ -301,6 +302,31 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
>> __free_insn_slot(&kprobe_insn_slots, slot, dirty);
>> mutex_unlock(&kprobe_insn_mutex);
>> }
>> +#ifdef CONFIG_OPTPROBES
>> +/* For optimized_kprobe buffer */
>> +static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
>> +static struct kprobe_insn_cache kprobe_optinsn_slots = {
>> + .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
>> + /* .insn_size is initialized later */
>> + .nr_garbage = 0,
>> +};
>> +/* Get a slot for optimized_kprobe buffer */
>> +kprobe_opcode_t __kprobes *get_optinsn_slot(void)
>> +{
>> + kprobe_opcode_t *ret = NULL;
>> + mutex_lock(&kprobe_optinsn_mutex);
>> + ret = __get_insn_slot(&kprobe_optinsn_slots);
>> + mutex_unlock(&kprobe_optinsn_mutex);
>> + return ret;
>> +}
>
>
>
> Just a small nano-neat: could you add a line between variable
> declarations and the rest? And also just before the return?
> It makes the code a bit easier to review.
Sure :-)
>> +static void kprobe_optimizer(struct work_struct *work);
>> +static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
>> +#define OPTIMIZE_DELAY 5
>> +
>> +/* Kprobe jump optimizer */
>> +static __kprobes void kprobe_optimizer(struct work_struct *work)
>> +{
>> + struct optimized_kprobe *op, *tmp;
>> +
>> + /* Lock modules while optimizing kprobes */
>> + mutex_lock(&module_mutex);
>> + mutex_lock(&kprobe_mutex);
>> + if (kprobes_all_disarmed)
>> + goto end;
>> +
>> + /* Wait quiesence period for ensuring all interrupts are done */
>> + synchronize_sched();
>
>
>
> It's not clear to me why you are doing that.
> Is this waiting for pending int 3 kprobes handlers
> to complete? If so, why, and what does that prevent?
>
> Also, why is it a delayed work? I'm not sure what we are
> waiting for here.
[...]
> Again, I think this dance with live patching protected
> by int 3 only, which patching is in turn a necessary
> stage before, is a complicated sequence that could be
> simplified by choosing only one patching in stop_machine()
> time.
There is a reason why we have to wait here and it's excuse
why it hasn't supported preemption yet too, I'll tell you
in next mail :-)
>> +
>> + get_online_cpus(); /* Use online_cpus while optimizing */
>
>
>
> And this comment doesn't tell us much what this brings us.
> The changelog tells it stands to avoid a text_mutex deadlock.
> I'm not sure why we would deadlock without it.
As Mathieu and I discussed on LKML (http://lkml.org/lkml/2009/11/21/187)
text_mutex will be locked on the way of cpu-hotplug.
Since kprobes locks text_mutex too and stop_machine() refers online_cpus,
it will cause a dead-lock. So, I decided to use get_online_cpus() to
locking hotplug while optimizing/unoptimizng.
Thank you,
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Ingo M. <mi...@el...> - 2009-11-24 07:53:27
|
* Frederic Weisbecker <fwe...@gm...> wrote: > On Tue, Nov 24, 2009 at 03:03:19AM +0100, Frederic Weisbecker wrote: > > On Mon, Nov 23, 2009 at 06:21:16PM -0500, Masami Hiramatsu wrote: > > > When the optimized-kprobe is hit before optimization, its handler > > > changes IP(instruction pointer) to copied code and exits. So, the > > > instructions which were copied to detour buffer are executed on the detour > > > buffer. > > > > > > > > Hm, why is it playing such hybrid game there? > > If I understand well, we have executed int 3, executed the > > handler and we jump back to the detour buffer? > > > > I got it, I think. We have instructions to patch. And the above turn > this area into dead code, safe to patch. > > But still, stop_machine() seem to make it not necessary anymore. i think 'sending an IPI to all online CPUs' might be an adequate sequence to make sure patching effects have propagated. I.e. an smp_call_function() with a dummy function? Ingo |
|
From: Frederic W. <fwe...@gm...> - 2009-11-24 03:31:52
|
On Tue, Nov 24, 2009 at 03:44:19AM +0100, Frederic Weisbecker wrote:
> On Mon, Nov 23, 2009 at 06:21:41PM -0500, Masami Hiramatsu wrote:
> > +static void kprobe_optimizer(struct work_struct *work);
> > +static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
> > +#define OPTIMIZE_DELAY 5
> > +
> > +/* Kprobe jump optimizer */
> > +static __kprobes void kprobe_optimizer(struct work_struct *work)
> > +{
> > + struct optimized_kprobe *op, *tmp;
> > +
> > + /* Lock modules while optimizing kprobes */
> > + mutex_lock(&module_mutex);
> > + mutex_lock(&kprobe_mutex);
> > + if (kprobes_all_disarmed)
> > + goto end;
> > +
> > + /* Wait quiesence period for ensuring all interrupts are done */
> > + synchronize_sched();
>
>
>
> It's not clear to me why you are doing that.
> Is this waiting for pending int 3 kprobes handlers
> to complete? If so, why, and what does that prevent?
I _might_ have understood.
You have set up the optimized flags, then you wait for
any old-style int 3 kprobes to complete and route
to detour buffer so that you can patch the jump
safely in the dead code? (and finish with first byte
by patching the int 3 itself)
|
|
From: Frederic W. <fwe...@gm...> - 2009-11-24 03:20:28
|
On Tue, Nov 24, 2009 at 03:03:19AM +0100, Frederic Weisbecker wrote: > On Mon, Nov 23, 2009 at 06:21:16PM -0500, Masami Hiramatsu wrote: > > When the optimized-kprobe is hit before optimization, its handler > > changes IP(instruction pointer) to copied code and exits. So, the > > instructions which were copied to detour buffer are executed on the detour > > buffer. > > > > Hm, why is it playing such hybrid game there? > If I understand well, we have executed int 3, executed the > handler and we jump back to the detour buffer? > I got it, I think. We have instructions to patch. And the above turn this area into dead code, safe to patch. But still, stop_machine() seem to make it not necessary anymore. |