You can subscribe to this list here.
| 2009 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(32) |
Jun
(66) |
Jul
(102) |
Aug
(78) |
Sep
(106) |
Oct
(137) |
Nov
(147) |
Dec
(147) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2010 |
Jan
(71) |
Feb
(139) |
Mar
(86) |
Apr
(76) |
May
(57) |
Jun
(10) |
Jul
(12) |
Aug
(6) |
Sep
(8) |
Oct
(12) |
Nov
(12) |
Dec
(18) |
| 2011 |
Jan
(16) |
Feb
(19) |
Mar
(3) |
Apr
(1) |
May
(16) |
Jun
(17) |
Jul
(74) |
Aug
(22) |
Sep
(18) |
Oct
(24) |
Nov
(21) |
Dec
(30) |
| 2012 |
Jan
(31) |
Feb
(16) |
Mar
(22) |
Apr
(25) |
May
(18) |
Jun
(13) |
Jul
(83) |
Aug
(49) |
Sep
(20) |
Oct
(60) |
Nov
(35) |
Dec
(28) |
| 2013 |
Jan
(39) |
Feb
(61) |
Mar
(35) |
Apr
(21) |
May
(45) |
Jun
(56) |
Jul
(20) |
Aug
(9) |
Sep
(10) |
Oct
(31) |
Nov
(8) |
Dec
(4) |
| 2014 |
Jan
(6) |
Feb
(7) |
Mar
(7) |
Apr
(6) |
May
(4) |
Jun
(8) |
Jul
(5) |
Aug
(2) |
Sep
(4) |
Oct
(4) |
Nov
(11) |
Dec
(5) |
| 2015 |
Jan
(4) |
Feb
(4) |
Mar
(3) |
Apr
(4) |
May
(9) |
Jun
(4) |
Jul
(15) |
Aug
(8) |
Sep
(16) |
Oct
(18) |
Nov
(15) |
Dec
(7) |
| 2016 |
Jan
(20) |
Feb
(9) |
Mar
(15) |
Apr
(24) |
May
(16) |
Jun
(28) |
Jul
(22) |
Aug
(23) |
Sep
(18) |
Oct
(30) |
Nov
(40) |
Dec
(9) |
| 2017 |
Jan
(1) |
Feb
(8) |
Mar
(37) |
Apr
(26) |
May
(25) |
Jun
(46) |
Jul
(24) |
Aug
(9) |
Sep
|
Oct
|
Nov
|
Dec
|
|
From: Frederic W. <fwe...@gm...> - 2009-07-07 11:46:26
|
On Tue, Jun 30, 2009 at 09:09:23PM -0400, Masami Hiramatsu wrote:
> Add kprobes-based event tracer on ftrace.
>
> This tracer is similar to the events tracer which is based on Tracepoint
> infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe
> and kretprobe). It probes anywhere where kprobes can probe(this means, all
> functions body except for __kprobes functions).
>
> Similar to the events tracer, this tracer doesn't need to be activated via
> current_tracer, instead of that, just set probe points via
> /sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
> probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter.
>
> This tracer supports following probe arguments for each probe.
>
> %REG : Fetch register REG
> sN : Fetch Nth entry of stack (N >= 0)
> @ADDR : Fetch memory at ADDR (ADDR should be in kernel)
> @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
> aN : Fetch function argument. (N >= 0)
> rv : Fetch return value.
> ra : Fetch return address.
> +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.
>
> See Documentation/trace/kprobes.txt for details.
>
> Changes from v9:
> - Select CONFIG_GENERIC_TRACER when CONFIG_KPROBE_TRACER=y.
>
> Signed-off-by: Masami Hiramatsu <mhi...@re...>
> Acked-by: Ananth N Mavinakayanahalli <an...@in...>
> Cc: Christoph Hellwig <hc...@in...>
> Cc: Steven Rostedt <ro...@go...>
> Cc: Ingo Molnar <mi...@el...>
> Cc: Frederic Weisbecker <fwe...@gm...>
> Cc: Tom Zanussi <tza...@gm...>
> ---
>
> Documentation/trace/kprobes.txt | 138 ++++
> kernel/trace/Kconfig | 12
> kernel/trace/Makefile | 1
> kernel/trace/trace.h | 22 +
> kernel/trace/trace_event_types.h | 20 +
> kernel/trace/trace_kprobe.c | 1183 ++++++++++++++++++++++++++++++++++++++
> 6 files changed, 1376 insertions(+), 0 deletions(-)
> create mode 100644 Documentation/trace/kprobes.txt
> create mode 100644 kernel/trace/trace_kprobe.c
>
> diff --git a/Documentation/trace/kprobes.txt b/Documentation/trace/kprobes.txt
> new file mode 100644
> index 0000000..3a90ebb
> --- /dev/null
> +++ b/Documentation/trace/kprobes.txt
> @@ -0,0 +1,138 @@
> + Kprobe-based Event Tracer
> + =========================
> +
> + Documentation is written by Masami Hiramatsu
> +
> +
> +Overview
> +--------
> +This tracer is similar to the events tracer which is based on Tracepoint
> +infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe
> +and kretprobe). It probes anywhere where kprobes can probe(this means, all
> +functions body except for __kprobes functions).
> +
> +Unlike the function tracer, this tracer can probe instructions inside of
> +kernel functions. It allows you to check which instruction has been executed.
> +
> +Unlike the Tracepoint based events tracer, this tracer can add and remove
> +probe points on the fly.
> +
> +Similar to the events tracer, this tracer doesn't need to be activated via
> +current_tracer, instead of that, just set probe points via
> +/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
> +probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter.
> +
> +
> +Synopsis of kprobe_events
> +-------------------------
> + p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : set a probe
> + r[:EVENT] SYMBOL[+0] [FETCHARGS] : set a return probe
> +
> + EVENT : Event name
> + SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted
> + MEMADDR : Address where the probe is inserted
> +
> + FETCHARGS : Arguments
> + %REG : Fetch register REG
> + sN : Fetch Nth entry of stack (N >= 0)
> + @ADDR : Fetch memory at ADDR (ADDR should be in kernel)
> + @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
> + aN : Fetch function argument. (N >= 0)(*)
> + rv : Fetch return value.(**)
> + ra : Fetch return address.(**)
> + +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***)
> +
> + (*) aN may not correct on asmlinkaged functions and at the middle of
> + function body.
> + (**) only for return probe.
> + (***) this is useful for fetching a field of data structures.
> +
> +
> +Per-Probe Event Filtering
> +-------------------------
> + Per-probe event filtering feature allows you to set different filter on each
> +probe and gives you what arguments will be shown in trace buffer. If an event
> +name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds
> +an event under tracing/events/kprobes/<EVENT>, at the directory you can see
> +'id', 'enabled', 'format' and 'filter'.
> +
> +enabled:
> + You can enable/disable the probe by writing 1 or 0 on it.
> +
> +format:
> + It shows the format of this probe event. It also shows aliases of arguments
> + which you specified to kprobe_events.
> +
> +filter:
> + You can write filtering rules of this event. And you can use both of aliase
> + names and field names for describing filters.
> +
> +
> +Usage examples
> +--------------
> +To add a probe as a new event, write a new definition to kprobe_events
> +as below.
> +
> + echo p:myprobe do_sys_open a0 a1 a2 a3 > /sys/kernel/debug/tracing/kprobe_events
> +
> + This sets a kprobe on the top of do_sys_open() function with recording
> +1st to 4th arguments as "myprobe" event.
> +
> + echo r:myretprobe do_sys_open rv ra >> /sys/kernel/debug/tracing/kprobe_events
> +
> + This sets a kretprobe on the return point of do_sys_open() function with
> +recording return value and return address as "myretprobe" event.
> + You can see the format of these events via
> +/sys/kernel/debug/tracing/events/kprobes/<EVENT>/format.
> +
> + cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
> +name: myprobe
> +ID: 23
> +format:
> + field:unsigned short common_type; offset:0; size:2;
> + field:unsigned char common_flags; offset:2; size:1;
> + field:unsigned char common_preempt_count; offset:3; size:1;
> + field:int common_pid; offset:4; size:4;
> + field:int common_tgid; offset:8; size:4;
> +
> + field: unsigned long ip; offset:16;tsize:8;
> + field: int nargs; offset:24;tsize:4;
> + field: unsigned long arg0; offset:32;tsize:8;
> + field: unsigned long arg1; offset:40;tsize:8;
> + field: unsigned long arg2; offset:48;tsize:8;
> + field: unsigned long arg3; offset:56;tsize:8;
> +
> + alias: a0; original: arg0;
> + alias: a1; original: arg1;
> + alias: a2; original: arg2;
> + alias: a3; original: arg3;
> +
> +print fmt: "%lx: 0x%lx 0x%lx 0x%lx 0x%lx", ip, arg0, arg1, arg2, arg3
> +
> +
> + You can see that the event has 4 arguments and alias expressions
> +corresponding to it.
> +
> + echo > /sys/kernel/debug/tracing/kprobe_events
> +
> + This clears all probe points. and you can see the traced information via
> +/sys/kernel/debug/tracing/trace.
> +
> + cat /sys/kernel/debug/tracing/trace
> +# tracer: nop
> +#
> +# TASK-PID CPU# TIMESTAMP FUNCTION
> +# | | | | |
> + <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0
> + <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a
> + <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6
> + <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a
> + <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10
> + <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a
> +
> +
> + Each line shows when the kernel hits a probe, and <- SYMBOL means kernel
> +returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel
> +returns from do_sys_open to sys_open+0x1b).
> +
I'm looking forward to using it for debugging :)
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index 860c712..60f3401 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -445,6 +445,18 @@ config BLK_DEV_IO_TRACE
>
> If unsure, say N.
>
> +config KPROBE_TRACER
> + depends on KPROBES
> + depends on X86
> + bool "Trace kprobes"
> + select TRACING
> + select GENERIC_TRACER
> + help
> + This tracer probes everywhere where kprobes can probe it, and
> + records various registers and memories specified by user.
> + This also allows you to trace kprobe probe points as a dynamic
> + defined events. It provides per-probe event filtering interface.
> +
> config DYNAMIC_FTRACE
> bool "enable/disable ftrace tracepoints dynamically"
> depends on FUNCTION_TRACER
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index ce3b1cd..8e6884d 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -55,5 +55,6 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
> obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
> obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
> obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
> +obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o
>
> libftrace-y := ftrace.o
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 206cb7d..65945eb 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -45,6 +45,8 @@ enum trace_type {
> TRACE_POWER,
> TRACE_BLK,
> TRACE_KSYM,
> + TRACE_KPROBE,
> + TRACE_KRETPROBE,
>
> __TRACE_LAST_TYPE,
> };
> @@ -227,6 +229,22 @@ struct trace_ksym {
> char ksym_name[KSYM_NAME_LEN];
> char p_name[TASK_COMM_LEN];
> };
> +#define TRACE_KPROBE_ARGS 6
> +
> +struct kprobe_trace_entry {
> + struct trace_entry ent;
> + unsigned long ip;
> + int nargs;
> + unsigned long args[TRACE_KPROBE_ARGS];
I see that you actually use args as a dynamically sized
array.
For clarity, args[TRACE_KPROBE_ARGS] could be args[0].
It's just a nit and wouldn't affect the code or the data,
but it would be clearer for readers of that code.
> +};
> +
> +struct kretprobe_trace_entry {
> + struct trace_entry ent;
> + unsigned long func;
> + unsigned long ret_ip;
> + int nargs;
> + unsigned long args[TRACE_KPROBE_ARGS];
> +};
ditto
> /*
> * trace_flag_type is an enumeration that holds different
> @@ -344,6 +362,10 @@ extern void __ftrace_bad_type(void);
> IF_ASSIGN(var, ent, struct syscall_trace_exit, \
> TRACE_SYSCALL_EXIT); \
> IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM); \
> + IF_ASSIGN(var, ent, struct kprobe_trace_entry, \
> + TRACE_KPROBE); \
> + IF_ASSIGN(var, ent, struct kretprobe_trace_entry, \
> + TRACE_KRETPROBE); \
> __ftrace_bad_type(); \
> } while (0)
>
> diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
> index 6db005e..ec2e6f3 100644
> --- a/kernel/trace/trace_event_types.h
> +++ b/kernel/trace/trace_event_types.h
> @@ -175,4 +175,24 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
> TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
> );
>
> +TRACE_EVENT_FORMAT(kprobe, TRACE_KPROBE, kprobe_trace_entry, ignore,
> + TRACE_STRUCT(
> + TRACE_FIELD(unsigned long, ip, ip)
> + TRACE_FIELD(int, nargs, nargs)
> + TRACE_FIELD_SPECIAL(unsigned long args[TRACE_KPROBE_ARGS],
> + args, TRACE_KPROBE_ARGS, args)
> + ),
> + TP_RAW_FMT("%08lx: args:0x%lx ...")
> +);
> +
> +TRACE_EVENT_FORMAT(kretprobe, TRACE_KRETPROBE, kretprobe_trace_entry, ignore,
> + TRACE_STRUCT(
> + TRACE_FIELD(unsigned long, func, func)
> + TRACE_FIELD(unsigned long, ret_ip, ret_ip)
> + TRACE_FIELD(int, nargs, nargs)
> + TRACE_FIELD_SPECIAL(unsigned long args[TRACE_KPROBE_ARGS],
> + args, TRACE_KPROBE_ARGS, args)
> + ),
> + TP_RAW_FMT("%08lx <- %08lx: args:0x%lx ...")
> +);
> #undef TRACE_SYSTEM
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> new file mode 100644
> index 0000000..0951512
> --- /dev/null
> +++ b/kernel/trace/trace_kprobe.c
> @@ -0,0 +1,1183 @@
> +/*
> + * kprobe based kernel tracer
> + *
> + * Created by Masami Hiramatsu <mhi...@re...>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> + */
> +
> +#include <linux/module.h>
> +#include <linux/uaccess.h>
> +#include <linux/kprobes.h>
> +#include <linux/seq_file.h>
> +#include <linux/slab.h>
> +#include <linux/smp.h>
> +#include <linux/debugfs.h>
> +#include <linux/types.h>
> +#include <linux/string.h>
> +#include <linux/ctype.h>
> +#include <linux/ptrace.h>
> +
> +#include "trace.h"
> +#include "trace_output.h"
> +
> +#define MAX_ARGSTR_LEN 63
> +
> +/* currently, trace_kprobe only supports X86. */
> +
> +struct fetch_func {
> + unsigned long (*func)(struct pt_regs *, void *);
> + void *data;
> +};
> +
> +static __kprobes unsigned long call_fetch(struct fetch_func *f,
> + struct pt_regs *regs)
> +{
> + return f->func(regs, f->data);
> +}
> +
> +/* fetch handlers */
> +static __kprobes unsigned long fetch_register(struct pt_regs *regs,
> + void *offset)
> +{
> + return regs_get_register(regs, (unsigned)((unsigned long)offset));
> +}
> +
> +static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
> + void *num)
> +{
> + return regs_get_kernel_stack_nth(regs, (unsigned)((unsigned long)num));
You often seem to use bare "unsigned" as a type.
It would be better to write "unsigned int" explicitly.
Anyway, I guess we can merge it in -tip and give it a try.
Thanks,
Frederic.
> +}
> +
> +static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
> +{
> + unsigned long retval;
> + if (probe_kernel_address(addr, retval))
> + return 0;
> + return retval;
> +}
> +
> +static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
> +{
> + return regs_get_argument_nth(regs, (unsigned)((unsigned long)num));
> +}
> +
> +static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
> + void *dummy)
> +{
> + return regs_return_value(regs);
> +}
> +
> +static __kprobes unsigned long fetch_ip(struct pt_regs *regs, void *dummy)
> +{
> + return instruction_pointer(regs);
> +}
> +
> +/* Memory fetching by symbol */
> +struct symbol_cache {
> + char *symbol;
> + long offset;
> + unsigned long addr;
> +};
> +
> +static unsigned long update_symbol_cache(struct symbol_cache *sc)
> +{
> + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
> + if (sc->addr)
> + sc->addr += sc->offset;
> + return sc->addr;
> +}
> +
> +static void free_symbol_cache(struct symbol_cache *sc)
> +{
> + kfree(sc->symbol);
> + kfree(sc);
> +}
> +
> +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
> +{
> + struct symbol_cache *sc;
> + if (!sym || strlen(sym) == 0)
> + return NULL;
> + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
> + if (!sc)
> + return NULL;
> +
> + sc->symbol = kstrdup(sym, GFP_KERNEL);
> + if (!sc->symbol) {
> + kfree(sc);
> + return NULL;
> + }
> + sc->offset = offset;
> +
> + update_symbol_cache(sc);
> + return sc;
> +}
> +
> +static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
> +{
> + struct symbol_cache *sc = data;
> + if (sc->addr)
> + return fetch_memory(regs, (void *)sc->addr);
> + else
> + return 0;
> +}
> +
> +/* Special indirect memory access interface */
> +struct indirect_fetch_data {
> + struct fetch_func orig;
> + long offset;
> +};
> +
> +static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
> +{
> + struct indirect_fetch_data *ind = data;
> + unsigned long addr;
> + addr = call_fetch(&ind->orig, regs);
> + if (addr) {
> + addr += ind->offset;
> + return fetch_memory(regs, (void *)addr);
> + } else
> + return 0;
> +}
> +
> +static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
> +{
> + if (data->orig.func == fetch_indirect)
> + free_indirect_fetch_data(data->orig.data);
> + else if (data->orig.func == fetch_symbol)
> + free_symbol_cache(data->orig.data);
> + kfree(data);
> +}
> +
> +/**
> + * kprobe_trace_core
> + */
> +
> +struct trace_probe {
> + struct list_head list;
> + union {
> + struct kprobe kp;
> + struct kretprobe rp;
> + };
> + const char *symbol; /* symbol name */
> + unsigned int nr_args;
> + struct fetch_func args[TRACE_KPROBE_ARGS];
> + struct ftrace_event_call call;
> +};
> +
> +static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs);
> +static int kretprobe_trace_func(struct kretprobe_instance *ri,
> + struct pt_regs *regs);
> +
> +static __kprobes int probe_is_return(struct trace_probe *tp)
> +{
> + return (tp->rp.handler == kretprobe_trace_func);
> +}
> +
> +static __kprobes const char *probe_symbol(struct trace_probe *tp)
> +{
> + return tp->symbol ? tp->symbol : "unknown";
> +}
> +
> +static __kprobes long probe_offset(struct trace_probe *tp)
> +{
> + return (probe_is_return(tp)) ? tp->rp.kp.offset : tp->kp.offset;
> +}
> +
> +static __kprobes void *probe_address(struct trace_probe *tp)
> +{
> + return (probe_is_return(tp)) ? tp->rp.kp.addr : tp->kp.addr;
> +}
> +
> +static int trace_arg_string(char *buf, size_t n, struct fetch_func *ff)
> +{
> + int ret = -EINVAL;
> + if (ff->func == fetch_argument)
> + ret = snprintf(buf, n, "a%lu", (unsigned long)ff->data);
> + else if (ff->func == fetch_register) {
> + const char *name;
> + name = regs_query_register_name((unsigned)((long)ff->data));
> + ret = snprintf(buf, n, "%%%s", name);
> + } else if (ff->func == fetch_stack)
> + ret = snprintf(buf, n, "s%lu", (unsigned long)ff->data);
> + else if (ff->func == fetch_memory)
> + ret = snprintf(buf, n, "@0x%p", ff->data);
> + else if (ff->func == fetch_symbol) {
> + struct symbol_cache *sc = ff->data;
> + ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
> + } else if (ff->func == fetch_retvalue)
> + ret = snprintf(buf, n, "rv");
> + else if (ff->func == fetch_ip)
> + ret = snprintf(buf, n, "ra");
> + else if (ff->func == fetch_indirect) {
> + struct indirect_fetch_data *id = ff->data;
> + ret = snprintf(buf, n, "%+ld(", id->offset);
> + if (ret > n)
> + goto end;
> + n -= ret;
> + ret = trace_arg_string(buf, n, &id->orig);
> + if (ret > n)
> + goto end;
> + n -= ret;
> + ret = snprintf(buf, n, ")");
> + }
> +end:
> + if (ret > n)
> + return -ENOSPC;
> + return 0;
> +}
> +
> +static int register_probe_event(struct trace_probe *tp);
> +static void unregister_probe_event(struct trace_probe *tp);
> +
> +static DEFINE_MUTEX(probe_lock);
> +static LIST_HEAD(probe_list);
> +
> +static struct trace_probe *alloc_trace_probe(const char *symbol,
> + const char *event)
> +{
> + struct trace_probe *tp;
> +
> + tp = kzalloc(sizeof(struct trace_probe), GFP_KERNEL);
> + if (!tp)
> + return ERR_PTR(-ENOMEM);
> +
> + if (symbol) {
> + tp->symbol = kstrdup(symbol, GFP_KERNEL);
> + if (!tp->symbol)
> + goto error;
> + }
> + if (event) {
> + tp->call.name = kstrdup(event, GFP_KERNEL);
> + if (!tp->call.name)
> + goto error;
> + }
> +
> + INIT_LIST_HEAD(&tp->list);
> + return tp;
> +error:
> + kfree(tp->symbol);
> + kfree(tp);
> + return ERR_PTR(-ENOMEM);
> +}
> +
> +static void free_trace_probe(struct trace_probe *tp)
> +{
> + int i;
> + for (i = 0; i < tp->nr_args; i++)
> + if (tp->args[i].func == fetch_symbol)
> + free_symbol_cache(tp->args[i].data);
> + else if (tp->args[i].func == fetch_indirect)
> + free_indirect_fetch_data(tp->args[i].data);
> +
> + kfree(tp->call.name);
> + kfree(tp->symbol);
> + kfree(tp);
> +}
> +
> +static struct trace_probe *find_probe_event(const char *event)
> +{
> + struct trace_probe *tp;
> + list_for_each_entry(tp, &probe_list, list)
> + if (tp->call.name && !strcmp(tp->call.name, event))
> + return tp;
> + return NULL;
> +}
> +
> +static void __unregister_trace_probe(struct trace_probe *tp)
> +{
> + if (probe_is_return(tp))
> + unregister_kretprobe(&tp->rp);
> + else
> + unregister_kprobe(&tp->kp);
> +}
> +
> +/* Unregister a trace_probe and probe_event: call with locking probe_lock */
> +static void unregister_trace_probe(struct trace_probe *tp)
> +{
> + if (tp->call.name)
> + unregister_probe_event(tp);
> + __unregister_trace_probe(tp);
> + list_del(&tp->list);
> +}
> +
> +/* Register a trace_probe and probe_event */
> +static int register_trace_probe(struct trace_probe *tp)
> +{
> + struct trace_probe *old_tp;
> + int ret;
> +
> + mutex_lock(&probe_lock);
> +
> + if (probe_is_return(tp))
> + ret = register_kretprobe(&tp->rp);
> + else
> + ret = register_kprobe(&tp->kp);
> +
> + if (ret) {
> + pr_warning("Could not insert probe(%d)\n", ret);
> + if (ret == -EILSEQ) {
> + pr_warning("Probing address(0x%p) is not an "
> + "instruction boundary.\n",
> + probe_address(tp));
> + ret = -EINVAL;
> + }
> + goto end;
> + }
> + /* register as an event */
> + if (tp->call.name) {
> + old_tp = find_probe_event(tp->call.name);
> + if (old_tp) {
> + /* delete old event */
> + unregister_trace_probe(old_tp);
> + free_trace_probe(old_tp);
> + }
> + ret = register_probe_event(tp);
> + if (ret) {
> + pr_warning("Faild to register probe event(%d)\n", ret);
> + __unregister_trace_probe(tp);
> + }
> + }
> + list_add_tail(&tp->list, &probe_list);
> +end:
> + mutex_unlock(&probe_lock);
> + return ret;
> +}
> +
> +/* Split symbol and offset. */
> +static int split_symbol_offset(char *symbol, long *offset)
> +{
> + char *tmp;
> + int ret;
> +
> + if (!offset)
> + return -EINVAL;
> +
> + tmp = strchr(symbol, '+');
> + if (!tmp)
> + tmp = strchr(symbol, '-');
> +
> + if (tmp) {
> + /* skip sign because strict_strtol doesn't accept '+' */
> + ret = strict_strtol(tmp + 1, 0, offset);
> + if (ret)
> + return ret;
> + if (*tmp == '-')
> + *offset = -(*offset);
> + *tmp = '\0';
> + } else
> + *offset = 0;
> + return 0;
> +}
> +
> +#define PARAM_MAX_ARGS 16
> +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
> +
> +static int parse_trace_arg(char *arg, struct fetch_func *ff, int is_return)
> +{
> + int ret = 0;
> + unsigned long param;
> + long offset;
> + char *tmp;
> +
> + switch (arg[0]) {
> + case 'a': /* argument */
> + ret = strict_strtoul(arg + 1, 10, ¶m);
> + if (ret || param > PARAM_MAX_ARGS)
> + ret = -EINVAL;
> + else {
> + ff->func = fetch_argument;
> + ff->data = (void *)param;
> + }
> + break;
> + case 'r': /* retval or retaddr */
> + if (is_return && arg[1] == 'v') {
> + ff->func = fetch_retvalue;
> + ff->data = NULL;
> + } else if (is_return && arg[1] == 'a') {
> + ff->func = fetch_ip;
> + ff->data = NULL;
> + } else
> + ret = -EINVAL;
> + break;
> + case '%': /* named register */
> + ret = regs_query_register_offset(arg + 1);
> + if (ret >= 0) {
> + ff->func = fetch_register;
> + ff->data = (void *)(unsigned long)ret;
> + ret = 0;
> + }
> + break;
> + case 's': /* stack */
> + ret = strict_strtoul(arg + 1, 10, ¶m);
> + if (ret || param > PARAM_MAX_STACK)
> + ret = -EINVAL;
> + else {
> + ff->func = fetch_stack;
> + ff->data = (void *)param;
> + }
> + break;
> + case '@': /* memory or symbol */
> + if (isdigit(arg[1])) {
> + ret = strict_strtoul(arg + 1, 0, ¶m);
> + if (ret)
> + break;
> + ff->func = fetch_memory;
> + ff->data = (void *)param;
> + } else {
> + ret = split_symbol_offset(arg + 1, &offset);
> + if (ret)
> + break;
> + ff->data = alloc_symbol_cache(arg + 1,
> + offset);
> + if (ff->data)
> + ff->func = fetch_symbol;
> + else
> + ret = -EINVAL;
> + }
> + break;
> + case '+': /* indirect memory */
> + case '-':
> + tmp = strchr(arg, '(');
> + if (!tmp) {
> + ret = -EINVAL;
> + break;
> + }
> + *tmp = '\0';
> + ret = strict_strtol(arg + 1, 0, &offset);
> + if (ret)
> + break;
> + if (arg[0] == '-')
> + offset = -offset;
> + arg = tmp + 1;
> + tmp = strrchr(arg, ')');
> + if (tmp) {
> + struct indirect_fetch_data *id;
> + *tmp = '\0';
> + id = kzalloc(sizeof(struct indirect_fetch_data),
> + GFP_KERNEL);
> + if (!id)
> + return -ENOMEM;
> + id->offset = offset;
> + ret = parse_trace_arg(arg, &id->orig, is_return);
> + if (ret)
> + kfree(id);
> + else {
> + ff->func = fetch_indirect;
> + ff->data = (void *)id;
> + }
> + } else
> + ret = -EINVAL;
> + break;
> + default:
> + /* TODO: support custom handler */
> + ret = -EINVAL;
> + }
> + return ret;
> +}
> +
> +static int create_trace_probe(int argc, char **argv)
> +{
> + /*
> + * Argument syntax:
> + * - Add kprobe: p[:EVENT] SYMBOL[+OFFS|-OFFS]|ADDRESS [FETCHARGS]
> + * - Add kretprobe: r[:EVENT] SYMBOL[+0] [FETCHARGS]
> + * Fetch args:
> + * aN : fetch Nth of function argument. (N:0-)
> + * rv : fetch return value
> + * ra : fetch return address
> + * sN : fetch Nth of stack (N:0-)
> + * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
> + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
> + * %REG : fetch register REG
> + * Indirect memory fetch:
> + * +|-offs(ARG) : fetch memory at ARG +|- offs address.
> + */
> + struct trace_probe *tp;
> + struct kprobe *kp;
> + int i, ret = 0;
> + int is_return = 0;
> + char *symbol = NULL, *event = NULL;
> + long offset = 0;
> + void *addr = NULL;
> +
> + if (argc < 2)
> + return -EINVAL;
> +
> + if (argv[0][0] == 'p')
> + is_return = 0;
> + else if (argv[0][0] == 'r')
> + is_return = 1;
> + else
> + return -EINVAL;
> +
> + if (argv[0][1] == ':') {
> + event = &argv[0][2];
> + if (strlen(event) == 0) {
> + pr_info("Event name is not specifiled\n");
> + return -EINVAL;
> + }
> + }
> +
> + if (isdigit(argv[1][0])) {
> + if (is_return)
> + return -EINVAL;
> + /* an address specified */
> + ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
> + if (ret)
> + return ret;
> + } else {
> + /* a symbol specified */
> + symbol = argv[1];
> + /* TODO: support .init module functions */
> + ret = split_symbol_offset(symbol, &offset);
> + if (ret)
> + return ret;
> + if (offset && is_return)
> + return -EINVAL;
> + }
> +
> + /* setup a probe */
> + tp = alloc_trace_probe(symbol, event);
> + if (IS_ERR(tp))
> + return PTR_ERR(tp);
> +
> + if (is_return) {
> + kp = &tp->rp.kp;
> + tp->rp.handler = kretprobe_trace_func;
> + } else {
> + kp = &tp->kp;
> + tp->kp.pre_handler = kprobe_trace_func;
> + }
> +
> + if (tp->symbol) {
> + kp->symbol_name = tp->symbol;
> + kp->offset = offset;
> + } else
> + kp->addr = addr;
> +
> + /* parse arguments */
> + argc -= 2; argv += 2; ret = 0;
> + for (i = 0; i < argc && i < TRACE_KPROBE_ARGS; i++) {
> + if (strlen(argv[i]) > MAX_ARGSTR_LEN) {
> + pr_info("Argument%d(%s) is too long.\n", i, argv[i]);
> + ret = -ENOSPC;
> + goto error;
> + }
> + ret = parse_trace_arg(argv[i], &tp->args[i], is_return);
> + if (ret)
> + goto error;
> + }
> + tp->nr_args = i;
> +
> + ret = register_trace_probe(tp);
> + if (ret)
> + goto error;
> + return 0;
> +
> +error:
> + free_trace_probe(tp);
> + return ret;
> +}
> +
> +static void cleanup_all_probes(void)
> +{
> + struct trace_probe *tp;
> + mutex_lock(&probe_lock);
> + /* TODO: Use batch unregistration */
> + while (!list_empty(&probe_list)) {
> + tp = list_entry(probe_list.next, struct trace_probe, list);
> + unregister_trace_probe(tp);
> + free_trace_probe(tp);
> + }
> + mutex_unlock(&probe_lock);
> +}
> +
> +
> +/* Probes listing interfaces */
> +static void *probes_seq_start(struct seq_file *m, loff_t *pos)
> +{
> + mutex_lock(&probe_lock);
> + return seq_list_start(&probe_list, *pos);
> +}
> +
> +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> + return seq_list_next(v, &probe_list, pos);
> +}
> +
> +static void probes_seq_stop(struct seq_file *m, void *v)
> +{
> + mutex_unlock(&probe_lock);
> +}
> +
> +static int probes_seq_show(struct seq_file *m, void *v)
> +{
> + struct trace_probe *tp = v;
> + int i, ret;
> + char buf[MAX_ARGSTR_LEN + 1];
> +
> + if (tp == NULL)
> + return 0;
> +
> + seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
> + if (tp->call.name)
> + seq_printf(m, ":%s", tp->call.name);
> +
> + if (tp->symbol)
> + seq_printf(m, " %s%+ld", probe_symbol(tp), probe_offset(tp));
> + else
> + seq_printf(m, " 0x%p", probe_address(tp));
> +
> + for (i = 0; i < tp->nr_args; i++) {
> + ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
> + if (ret) {
> + pr_warning("Argument%d is too long.\n", i);
> + break;
> + }
> + seq_printf(m, " %s", buf);
> + }
> + seq_printf(m, "\n");
> + return 0;
> +}
> +
> +static const struct seq_operations probes_seq_op = {
> + .start = probes_seq_start,
> + .next = probes_seq_next,
> + .stop = probes_seq_stop,
> + .show = probes_seq_show
> +};
> +
> +static int probes_open(struct inode *inode, struct file *file)
> +{
> + if ((file->f_mode & FMODE_WRITE) &&
> + !(file->f_flags & O_APPEND))
> + cleanup_all_probes();
> +
> + return seq_open(file, &probes_seq_op);
> +}
> +
> +static int command_trace_probe(const char *buf)
> +{
> + char **argv;
> + int argc = 0, ret = 0;
> +
> + argv = argv_split(GFP_KERNEL, buf, &argc);
> + if (!argv)
> + return -ENOMEM;
> +
> + if (argc)
> + ret = create_trace_probe(argc, argv);
> +
> + argv_free(argv);
> + return ret;
> +}
> +
> +#define WRITE_BUFSIZE 128
> +
> +static ssize_t probes_write(struct file *file, const char __user *buffer,
> + size_t count, loff_t *ppos)
> +{
> + char *kbuf, *tmp;
> + int ret;
> + size_t done;
> + size_t size;
> +
> + if (!count || count < 0)
> + return 0;
> +
> + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
> + if (!kbuf)
> + return -ENOMEM;
> +
> + ret = done = 0;
> + do {
> + size = count - done;
> + if (size > WRITE_BUFSIZE)
> + size = WRITE_BUFSIZE;
> + if (copy_from_user(kbuf, buffer + done, size)) {
> + ret = -EFAULT;
> + goto out;
> + }
> + kbuf[size] = '\0';
> + tmp = strchr(kbuf, '\n');
> + if (!tmp) {
> + pr_warning("Line length is too long: "
> + "Should be less than %d.", WRITE_BUFSIZE);
> + ret = -EINVAL;
> + goto out;
> + }
> + *tmp = '\0';
> + size = tmp - kbuf + 1;
> + done += size;
> + /* Remove comments */
> + tmp = strchr(kbuf, '#');
> + if (tmp)
> + *tmp = '\0';
> +
> + ret = command_trace_probe(kbuf);
> + if (ret)
> + goto out;
> +
> + } while (done < count);
> + ret = done;
> +out:
> + kfree(kbuf);
> + return ret;
> +}
> +
> +static const struct file_operations kprobe_events_ops = {
> + .owner = THIS_MODULE,
> + .open = probes_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = seq_release,
> + .write = probes_write,
> +};
> +
> +/* Kprobe handler */
> +static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
> +{
> + struct trace_probe *tp = container_of(kp, struct trace_probe, kp);
> + struct kprobe_trace_entry *entry;
> + struct ring_buffer_event *event;
> + int size, i, pc;
> + unsigned long irq_flags;
> + struct ftrace_event_call *call = &event_kprobe;
> + if (&tp->call.name)
> + call = &tp->call;
> +
> + local_save_flags(irq_flags);
> + pc = preempt_count();
> +
> + size = sizeof(struct kprobe_trace_entry) -
> + (sizeof(unsigned long) * (TRACE_KPROBE_ARGS - tp->nr_args));
> +
> + event = trace_current_buffer_lock_reserve(TRACE_KPROBE, size,
> + irq_flags, pc);
> + if (!event)
> + return 0;
> +
> + entry = ring_buffer_event_data(event);
> + entry->nargs = tp->nr_args;
> + entry->ip = (unsigned long)kp->addr;
> + for (i = 0; i < tp->nr_args; i++)
> + entry->args[i] = call_fetch(&tp->args[i], regs);
> +
> + if (!filter_current_check_discard(call, entry, event))
> + trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
> + return 0;
> +}
> +
> +/* Kretprobe handler */
> +static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
> + struct pt_regs *regs)
> +{
> + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
> + struct kretprobe_trace_entry *entry;
> + struct ring_buffer_event *event;
> + int size, i, pc;
> + unsigned long irq_flags;
> + struct ftrace_event_call *call = &event_kretprobe;
> + if (&tp->call.name)
> + call = &tp->call;
> +
> + local_save_flags(irq_flags);
> + pc = preempt_count();
> +
> + size = sizeof(struct kretprobe_trace_entry) -
> + (sizeof(unsigned long) * (TRACE_KPROBE_ARGS - tp->nr_args));
> +
> + event = trace_current_buffer_lock_reserve(TRACE_KRETPROBE, size,
> + irq_flags, pc);
> + if (!event)
> + return 0;
> +
> + entry = ring_buffer_event_data(event);
> + entry->nargs = tp->nr_args;
> + entry->func = (unsigned long)probe_address(tp);
> + entry->ret_ip = (unsigned long)ri->ret_addr;
> + for (i = 0; i < tp->nr_args; i++)
> + entry->args[i] = call_fetch(&tp->args[i], regs);
> +
> + if (!filter_current_check_discard(call, entry, event))
> + trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
> +
> + return 0;
> +}
> +
> +/* Event entry printers */
> +enum print_line_t
> +print_kprobe_event(struct trace_iterator *iter, int flags)
> +{
> + struct kprobe_trace_entry *field;
> + struct trace_seq *s = &iter->seq;
> + int i;
> +
> + trace_assign_type(field, iter->ent);
> +
> + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
> + goto partial;
> +
> + if (!trace_seq_puts(s, ":"))
> + goto partial;
> +
> + for (i = 0; i < field->nargs; i++)
> + if (!trace_seq_printf(s, " 0x%lx", field->args[i]))
> + goto partial;
> +
> + if (!trace_seq_puts(s, "\n"))
> + goto partial;
> +
> + return TRACE_TYPE_HANDLED;
> +partial:
> + return TRACE_TYPE_PARTIAL_LINE;
> +}
> +
> +enum print_line_t
> +print_kretprobe_event(struct trace_iterator *iter, int flags)
> +{
> + struct kretprobe_trace_entry *field;
> + struct trace_seq *s = &iter->seq;
> + int i;
> +
> + trace_assign_type(field, iter->ent);
> +
> + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
> + goto partial;
> +
> + if (!trace_seq_puts(s, " <- "))
> + goto partial;
> +
> + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
> + goto partial;
> +
> + if (!trace_seq_puts(s, ":"))
> + goto partial;
> +
> + for (i = 0; i < field->nargs; i++)
> + if (!trace_seq_printf(s, " 0x%lx", field->args[i]))
> + goto partial;
> +
> + if (!trace_seq_puts(s, "\n"))
> + goto partial;
> +
> + return TRACE_TYPE_HANDLED;
> +partial:
> + return TRACE_TYPE_PARTIAL_LINE;
> +}
> +
> +static struct trace_event kprobe_trace_event = {
> + .type = TRACE_KPROBE,
> + .trace = print_kprobe_event,
> +};
> +
> +static struct trace_event kretprobe_trace_event = {
> + .type = TRACE_KRETPROBE,
> + .trace = print_kretprobe_event,
> +};
> +
> +static int probe_event_enable(struct ftrace_event_call *call)
> +{
> + struct trace_probe *tp = container_of(call, struct trace_probe, call);
> + if (probe_is_return(tp))
> + return enable_kretprobe(&tp->rp);
> + else
> + return enable_kprobe(&tp->kp);
> +}
> +
> +static void probe_event_disable(struct ftrace_event_call *call)
> +{
> + struct trace_probe *tp = container_of(call, struct trace_probe, call);
> + if (probe_is_return(tp))
> + disable_kretprobe(&tp->rp);
> + else
> + disable_kprobe(&tp->kp);
> +}
> +
> +static int probe_event_raw_init(struct ftrace_event_call *event_call)
> +{
> + INIT_LIST_HEAD(&event_call->fields);
> + init_preds(event_call);
> + return 0;
> +}
> +
> +#undef DEFINE_FIELD
> +#define DEFINE_FIELD(type, item, name, is_signed) \
> + do { \
> + ret = trace_define_field(event_call, #type, name, \
> + offsetof(typeof(field), item), \
> + sizeof(field.item), is_signed);\
> + if (ret) \
> + return ret; \
> + } while (0)
> +
> +static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
> +{
> + int ret, i;
> + struct kprobe_trace_entry field;
> + char buf[MAX_ARGSTR_LEN + 1];
> + struct trace_probe *tp = container_of(event_call,
> + struct trace_probe, call);
> +
> + __common_field(int, type, 1);
> + __common_field(unsigned char, flags, 0);
> + __common_field(unsigned char, preempt_count, 0);
> + __common_field(int, pid, 1);
> + __common_field(int, tgid, 1);
> +
> + DEFINE_FIELD(unsigned long, ip, "ip", 0);
> + DEFINE_FIELD(int, nargs, "nargs", 1);
> + for (i = 0; i < tp->nr_args; i++) {
> + /* Set argN as a field */
> + sprintf(buf, "arg%d", i);
> + DEFINE_FIELD(unsigned long, args[i], buf, 0);
> + /* Set argument string as an alias field */
> + ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
> + if (ret)
> + return ret;
> + DEFINE_FIELD(unsigned long, args[i], buf, 0);
> + }
> + return 0;
> +}
> +
> +static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
> +{
> + int ret, i;
> + struct kretprobe_trace_entry field;
> + char buf[MAX_ARGSTR_LEN + 1];
> + struct trace_probe *tp = container_of(event_call,
> + struct trace_probe, call);
> +
> + __common_field(int, type, 1);
> + __common_field(unsigned char, flags, 0);
> + __common_field(unsigned char, preempt_count, 0);
> + __common_field(int, pid, 1);
> + __common_field(int, tgid, 1);
> +
> + DEFINE_FIELD(unsigned long, func, "func", 0);
> + DEFINE_FIELD(unsigned long, ret_ip, "ret_ip", 0);
> + DEFINE_FIELD(int, nargs, "nargs", 1);
> + for (i = 0; i < tp->nr_args; i++) {
> + /* Set argN as a field */
> + sprintf(buf, "arg%d", i);
> + DEFINE_FIELD(unsigned long, args[i], buf, 0);
> + /* Set argument string as an alias field */
> + ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
> + if (ret)
> + return ret;
> + DEFINE_FIELD(unsigned long, args[i], buf, 0);
> + }
> + return 0;
> +}
> +
> +static int __probe_event_show_format(struct ftrace_event_call *call,
> + struct trace_seq *s, const char *fmt,
> + const char *arg)
> +{
> + int i;
> + char buf[MAX_ARGSTR_LEN + 1];
> + struct trace_probe *tp = container_of(call, struct trace_probe, call);
> +
> + /* Show aliases */
> + for (i = 0; i < tp->nr_args; i++) {
> + if (trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]))
> + return 0;
> + if (!trace_seq_printf(s, "\talias: %s;\toriginal: arg%d;\n",
> + buf, i))
> + return 0;
> + }
> + /* Show format */
> + if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
> + return 0;
> +
> + for (i = 0; i < tp->nr_args; i++)
> + if (!trace_seq_puts(s, " 0x%lx"))
> + return 0;
> +
> + if (!trace_seq_printf(s, "\", %s", arg))
> + return 0;
> +
> + for (i = 0; i < tp->nr_args; i++)
> + if (!trace_seq_printf(s, ", arg%d", i))
> + return 0;
> +
> + return trace_seq_puts(s, "\n");
> +}
> +
> +#undef SHOW_FIELD
> +#define SHOW_FIELD(type, item, name) \
> + do { \
> + ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
> + "offset:%u;tsize:%u;\n", name, \
> + (unsigned)offsetof(typeof(field), item),\
> + (unsigned)sizeof(type)); \
> + if (!ret) \
> + return 0; \
> + } while (0)
> +
> +static int kprobe_event_show_format(struct ftrace_event_call *call,
> + struct trace_seq *s)
> +{
> + struct kprobe_trace_entry field __attribute__((unused));
> + int ret, i;
> + char buf[8];
> + struct trace_probe *tp = container_of(call, struct trace_probe, call);
> +
> + SHOW_FIELD(unsigned long, ip, "ip");
> + SHOW_FIELD(int, nargs, "nargs");
> +
> + /* Show fields */
> + for (i = 0; i < tp->nr_args; i++) {
> + sprintf(buf, "arg%d", i);
> + SHOW_FIELD(unsigned long, args[i], buf);
> + }
> + trace_seq_puts(s, "\n");
> +
> + return __probe_event_show_format(call, s, "%lx:", "ip");
> +}
> +
> +static int kretprobe_event_show_format(struct ftrace_event_call *call,
> + struct trace_seq *s)
> +{
> + struct kretprobe_trace_entry field __attribute__((unused));
> + int ret, i;
> + char buf[8];
> + struct trace_probe *tp = container_of(call, struct trace_probe, call);
> +
> + SHOW_FIELD(unsigned long, func, "func");
> + SHOW_FIELD(unsigned long, ret_ip, "ret_ip");
> + SHOW_FIELD(int, nargs, "nargs");
> +
> + /* Show fields */
> + for (i = 0; i < tp->nr_args; i++) {
> + sprintf(buf, "arg%d", i);
> + SHOW_FIELD(unsigned long, args[i], buf);
> + }
> + trace_seq_puts(s, "\n");
> +
> + return __probe_event_show_format(call, s, "%lx <- %lx:",
> + "func, ret_ip");
> +}
> +
> +static int register_probe_event(struct trace_probe *tp)
> +{
> + struct ftrace_event_call *call = &tp->call;
> + int ret;
> +
> + /* Initialize ftrace_event_call */
> + call->system = "kprobes";
> + if (probe_is_return(tp)) {
> + call->event = &kretprobe_trace_event;
> + call->id = TRACE_KRETPROBE;
> + call->raw_init = probe_event_raw_init;
> + call->show_format = kretprobe_event_show_format;
> + call->define_fields = kretprobe_event_define_fields;
> + } else {
> + call->event = &kprobe_trace_event;
> + call->id = TRACE_KPROBE;
> + call->raw_init = probe_event_raw_init;
> + call->show_format = kprobe_event_show_format;
> + call->define_fields = kprobe_event_define_fields;
> + }
> + call->enabled = 1;
> + call->regfunc = probe_event_enable;
> + call->unregfunc = probe_event_disable;
> + ret = trace_add_event_call(call);
> + if (ret)
> + pr_info("Failed to register kprobe event: %s\n", call->name);
> + return ret;
> +}
> +
> +static void unregister_probe_event(struct trace_probe *tp)
> +{
> + /*
> + * Prevent unregistering the event itself because the event is shared
> + * among other probes.
> + */
> + tp->call.event = NULL;
> + trace_remove_event_call(&tp->call);
> +}
> +
> +/* Make a debugfs interface for controlling probe points */
> +static __init int init_kprobe_trace(void)
> +{
> + struct dentry *d_tracer;
> + struct dentry *entry;
> + int ret;
> +
> + ret = register_ftrace_event(&kprobe_trace_event);
> + if (!ret) {
> + pr_warning("Could not register kprobe_trace_event type.\n");
> + return 0;
> + }
> + ret = register_ftrace_event(&kretprobe_trace_event);
> + if (!ret) {
> + pr_warning("Could not register kretprobe_trace_event type.\n");
> + return 0;
> + }
> +
> + d_tracer = tracing_init_dentry();
> + if (!d_tracer)
> + return 0;
> +
> + entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
> + NULL, &kprobe_events_ops);
> +
> + if (!entry)
> + pr_warning("Could not create debugfs "
> + "'kprobe_events' entry\n");
> + return 0;
> +}
> +fs_initcall(init_kprobe_trace);
> +
> +
> +#ifdef CONFIG_FTRACE_STARTUP_TEST
> +
> +static int kprobe_trace_selftest_target(int a1, int a2, int a3,
> + int a4, int a5, int a6)
> +{
> + return a1 + a2 + a3 + a4 + a5 + a6;
> +}
> +
> +static __init int kprobe_trace_self_tests_init(void)
> +{
> + int ret;
> + int (*target)(int, int, int, int, int, int);
> + target = kprobe_trace_selftest_target;
> +
> + pr_info("Testing kprobe tracing: ");
> +
> + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
> + "a1 a2 a3 a4 a5 a6");
> + if (WARN_ON_ONCE(ret))
> + pr_warning("error enabling function entry\n");
> +
> + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
> + "ra rv");
> + if (WARN_ON_ONCE(ret))
> + pr_warning("error enabling function return\n");
> +
> + ret = target(1, 2, 3, 4, 5, 6);
> +
> + cleanup_all_probes();
> +
> + pr_cont("OK\n");
> + return 0;
> +}
> +
> +late_initcall(kprobe_trace_self_tests_init);
> +
> +#endif
>
>
> --
> Masami Hiramatsu
>
> Software Engineer
> Hitachi Computer Products (America), Inc.
> Software Solutions Division
>
> e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:58:25
|
Introduce the arch-independent parts of kprobes jump optimization.
Kprobes uses a breakpoint instruction to interrupt execution flow; on
some architectures, it can be replaced by a jump instruction and
interruption emulation code. This improves kprobes' performance drastically.
Changes from v2:
- Set CONFIG_OPTPROBES=y by default.
- Remove CONFIG_DISABLE_CROSSJUMP
- Change detour logic to support reentered probes
(remove detour_optimized_kprobe())
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Ingo Molnar <mi...@el...>
Cc: Jim Keniston <jke...@us...>
Cc: Srikar Dronamraju <sr...@li...>
Cc: Christoph Hellwig <hc...@in...>
Cc: Steven Rostedt <ro...@go...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: H. Peter Anvin <hp...@zy...>
Cc: Anders Kaseorg <an...@ks...>
Cc: Tim Abbott <ta...@ks...>
Cc: Andi Kleen <an...@fi...>
---
arch/Kconfig | 13 ++
include/linux/kprobes.h | 36 ++++
kernel/kprobes.c | 390 +++++++++++++++++++++++++++++++++++++++++------
3 files changed, 393 insertions(+), 46 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index c72f18f..e830933 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -44,6 +44,17 @@ config KPROBES
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+config OPTPROBES
+ bool "Kprobes jump optimization support (EXPERIMENTAL)"
+ default y
+ depends on KPROBES
+ depends on !PREEMPT
+ depends on HAVE_OPTPROBES
+ select KALLSYMS_ALL
+ help
+ This option allows kprobes to optimize a breakpoint into
+ a jump to reduce its overhead.
+
config HAVE_EFFICIENT_UNALIGNED_ACCESS
bool
help
@@ -79,6 +90,8 @@ config HAVE_KPROBES
config HAVE_KRETPROBES
bool
+config HAVE_OPTPROBES
+ bool
#
# An arch should select this if it provides all these things:
#
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bcd9c07..6f75014 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -122,6 +122,11 @@ struct kprobe {
/* Kprobe status flags */
#define KPROBE_FLAG_GONE 1 /* breakpoint has already gone */
#define KPROBE_FLAG_DISABLED 2 /* probe is temporarily disabled */
+#define KPROBE_FLAG_OPTIMIZED 4 /*
+ * probe is really optimized.
+ * NOTE:
+ * this flag is only for optimized_kprobe.
+ */
/* Has this kprobe gone ? */
static inline int kprobe_gone(struct kprobe *p)
@@ -134,6 +139,12 @@ static inline int kprobe_disabled(struct kprobe *p)
{
return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE);
}
+
+/* Is this kprobe really running the optimized path? */
+static inline int kprobe_optimized(struct kprobe *p)
+{
+ return p->flags & KPROBE_FLAG_OPTIMIZED;
+}
/*
* Special probe type that uses setjmp-longjmp type tricks to resume
* execution at a specified entry with a matching prototype corresponding
@@ -249,6 +260,31 @@ extern kprobe_opcode_t *get_insn_slot(void);
extern void free_insn_slot(kprobe_opcode_t *slot, int dirty);
extern void kprobes_inc_nmissed_count(struct kprobe *p);
+#ifdef CONFIG_OPTPROBES
+/*
+ * Internal structure for direct jump optimized probe
+ */
+struct optimized_kprobe {
+ struct kprobe kp;
+ struct list_head list; /* list for optimizing queue */
+ struct arch_optimized_insn optinsn;
+};
+
+/* Architecture dependent functions for direct jump optimization */
+extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
+extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
+extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op);
+extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
+extern int arch_optimize_kprobe(struct optimized_kprobe *op);
+extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
+extern kprobe_opcode_t *get_optinsn_slot(void);
+extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty);
+extern int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr);
+
+extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);
+#endif /* CONFIG_OPTPROBES */
+
/* Get the kprobe at this addr (if any) - called with preemption disabled */
struct kprobe *get_kprobe(void *addr);
void kretprobe_hash_lock(struct task_struct *tsk,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0162c3c..0cbb607 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -298,6 +298,31 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
__free_insn_slot(&kprobe_insn_slots, slot, dirty);
mutex_unlock(&kprobe_insn_mutex);
}
+#ifdef CONFIG_OPTPROBES
+/* For optimized_kprobe buffer */
+static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
+static struct kprobe_insn_cache kprobe_optinsn_slots = {
+ .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
+ /* .insn_size is initialized later */
+ .nr_garbage = 0,
+};
+/* Get a slot for optimized_kprobe buffer */
+kprobe_opcode_t __kprobes *get_optinsn_slot(void)
+{
+ kprobe_opcode_t *ret = NULL;
+ mutex_lock(&kprobe_optinsn_mutex);
+ ret = __get_insn_slot(&kprobe_optinsn_slots);
+ mutex_unlock(&kprobe_optinsn_mutex);
+ return ret;
+}
+
+void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
+{
+ mutex_lock(&kprobe_optinsn_mutex);
+ __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
+ mutex_unlock(&kprobe_optinsn_mutex);
+}
+#endif
#endif
/* We have preemption disabled.. so it is safe to use __ versions */
@@ -331,11 +356,256 @@ struct kprobe __kprobes *get_kprobe(void *addr)
return NULL;
}
+static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
+
+/* Return true if the kprobe is an aggregator */
+static inline int kprobe_aggrprobe(struct kprobe *p)
+{
+ return p->pre_handler == aggr_pre_handler;
+}
+
+/*
+ * Keep all fields in the kprobe consistent
+ */
+static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+ memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
+ memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+}
+
+#ifdef CONFIG_OPTPROBES
+/*
+ * Call every pre_handler on the list, ignoring the return values.
+ * This must be called from the arch-dependent optimized caller.
+ */
+void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &p->list, list) {
+ if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
+ set_kprobe_instance(kp);
+ kp->pre_handler(kp, regs);
+ }
+ reset_kprobe_instance();
+ }
+}
+
+/* Return true(!0) if the kprobe is ready for optimization. */
+static inline int kprobe_optready(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+ if (kprobe_aggrprobe(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ return arch_prepared_optinsn(&op->optinsn);
+ }
+ return 0;
+}
+
+/* Return an optimized kprobe which replaces instructions including addr. */
+struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+{
+ int i;
+ struct kprobe *p = NULL;
+ struct optimized_kprobe *op;
+ for (i = 0; !p && i < MAX_OPTIMIZED_LENGTH; i++)
+ p = get_kprobe((void *)(addr - i));
+
+ if (p && kprobe_optready(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (arch_within_optimized_kprobe(op, addr))
+ return p;
+ }
+ return NULL;
+}
+
+/* Optimization staging list, protected by kprobe_mutex */
+static LIST_HEAD(optimizing_list);
+
+static void kprobe_optimizer(struct work_struct *work);
+static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
+#define OPTIMIZE_DELAY 5
+
+/* Kprobe jump optimizer */
+static __kprobes void kprobe_optimizer(struct work_struct *work)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ /* Lock modules while optimizing kprobes */
+ mutex_lock(&module_mutex);
+ mutex_lock(&kprobe_mutex);
+ if (kprobes_all_disarmed)
+ goto end;
+
+ /* Wait for a quiescence period to ensure all interrupts are done */
+ synchronize_sched();
+
+ mutex_lock(&text_mutex);
+ list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
+ WARN_ON(kprobe_disabled(&op->kp));
+ if (arch_optimize_kprobe(op) < 0)
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ list_del_init(&op->list);
+ }
+ mutex_unlock(&text_mutex);
+end:
+ mutex_unlock(&kprobe_mutex);
+ mutex_unlock(&module_mutex);
+}
+
+/* Optimize kprobe if p is ready to be optimized */
+static __kprobes void optimize_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+ /* Check if the kprobe is disabled or not ready for optimization. */
+ if (!kprobe_optready(p) ||
+ (kprobe_disabled(p) || kprobes_all_disarmed))
+ return;
+
+ /* Neither break_handler nor post_handler is supported. */
+ if (p->break_handler || p->post_handler)
+ return;
+
+ op = container_of(p, struct optimized_kprobe, kp);
+
+ /* Check that there are no other kprobes at the optimized instructions */
+ if (arch_check_optimized_kprobe(op) < 0)
+ return;
+
+ /* Check if it is already optimized. */
+ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+ return;
+
+ op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
+ list_add(&op->list, &optimizing_list);
+ if (!delayed_work_pending(&optimizing_work))
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+}
+
+/* Unoptimize a kprobe if p is optimized */
+static __kprobes void unoptimize_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+ if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!list_empty(&op->list))
+ /* Dequeue from the optimization queue */
+ list_del_init(&op->list);
+ else
+ /* Replace jump with break */
+ arch_unoptimize_kprobe(op);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ }
+}
+
+/* Remove optimized instructions */
+static void __kprobes kill_optimized_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!list_empty(&op->list)) {
+ /* Dequeue from the optimization queue */
+ list_del_init(&op->list);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ }
+ /* Don't unoptimize, because the target code will be freed. */
+ arch_remove_optimized_kprobe(op);
+}
+
+/* Try to prepare optimized instructions */
+static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+ op = container_of(p, struct optimized_kprobe, kp);
+ arch_prepare_optimized_kprobe(op);
+}
+
+/* Free optimized instructions and optimized_kprobe */
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+ op = container_of(p, struct optimized_kprobe, kp);
+ arch_remove_optimized_kprobe(op);
+ kfree(op);
+}
+
+/* Allocate new optimized_kprobe and try to prepare optimized instructions */
+static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
+ if (!op)
+ return NULL;
+
+ INIT_LIST_HEAD(&op->list);
+ op->kp.addr = p->addr;
+ arch_prepare_optimized_kprobe(op);
+ return &op->kp;
+}
+
+static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
+
+/*
+ * Prepare an optimized_kprobe and optimize it
+ * NOTE: p must be a normal registered kprobe
+ */
+static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
+{
+ struct kprobe *ap;
+ struct optimized_kprobe *op;
+
+ ap = alloc_aggr_kprobe(p);
+ if (!ap)
+ return;
+
+ op = container_of(ap, struct optimized_kprobe, kp);
+ if (!arch_prepared_optinsn(&op->optinsn)) {
+ /* If setting up optimization failed, fall back to a normal kprobe */
+ free_aggr_kprobe(ap);
+ return;
+ }
+
+ init_aggr_kprobe(ap, p);
+ optimize_kprobe(ap);
+ return;
+}
+#else /* !CONFIG_OPTPROBES */
+#define get_optimized_kprobe(addr) (NULL)
+#define optimize_kprobe(p) do {} while (0)
+#define unoptimize_kprobe(p) do {} while (0)
+#define kill_optimized_kprobe(p) do {} while (0)
+#define prepare_optimized_kprobe(p) do {} while (0)
+#define try_to_optimize_kprobe(p) do {} while (0)
+
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+ kfree(p);
+}
+
+static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+{
+ return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+}
+#endif /* CONFIG_OPTPROBES */
+
+static void __kprobes __arm_kprobe(struct kprobe *kp)
+{
+ arch_arm_kprobe(kp);
+ optimize_kprobe(kp); /* Try to re-optimize */
+}
+
+static void __kprobes __disarm_kprobe(struct kprobe *kp)
+{
+ unoptimize_kprobe(kp); /* Try to unoptimize */
+ arch_disarm_kprobe(kp);
+}
+
/* Arm a kprobe with text_mutex */
static void __kprobes arm_kprobe(struct kprobe *kp)
{
mutex_lock(&text_mutex);
- arch_arm_kprobe(kp);
+ __arm_kprobe(kp);
mutex_unlock(&text_mutex);
}
@@ -343,7 +613,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
static void __kprobes disarm_kprobe(struct kprobe *kp)
{
mutex_lock(&text_mutex);
- arch_disarm_kprobe(kp);
+ __disarm_kprobe(kp);
mutex_unlock(&text_mutex);
}
@@ -413,7 +683,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
- if (p->pre_handler != aggr_pre_handler) {
+ if (!kprobe_aggrprobe(p)) {
p->nmissed++;
} else {
list_for_each_entry_rcu(kp, &p->list, list)
@@ -537,21 +807,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
}
/*
- * Keep all fields in the kprobe consistent
- */
-static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
-{
- memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
- memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
-}
-
-/*
* Add the new probe to ap->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
+
+ if (p->break_handler || p->post_handler)
+ unoptimize_kprobe(ap); /* Fall back to normal kprobe */
+
if (p->break_handler) {
if (ap->break_handler)
return -EEXIST;
@@ -566,7 +831,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
ap->flags &= ~KPROBE_FLAG_DISABLED;
if (!kprobes_all_disarmed)
/* Arm the breakpoint again. */
- arm_kprobe(ap);
+ __arm_kprobe(ap);
}
return 0;
}
@@ -575,12 +840,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
* Fill in the required fields of the "manager kprobe". Replace the
* earlier kprobe in the hlist with the manager kprobe
*/
-static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
+ /* Copy p's insn slot to ap */
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
- ap->flags = p->flags;
+ ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
ap->pre_handler = aggr_pre_handler;
ap->fault_handler = aggr_fault_handler;
/* We don't care the kprobe which has gone. */
@@ -590,8 +856,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
ap->break_handler = aggr_break_handler;
INIT_LIST_HEAD(&ap->list);
- list_add_rcu(&p->list, &ap->list);
+ INIT_HLIST_NODE(&ap->hlist);
+ list_add_rcu(&p->list, &ap->list);
hlist_replace_rcu(&p->hlist, &ap->hlist);
}
@@ -605,12 +872,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
int ret = 0;
struct kprobe *ap = old_p;
- if (old_p->pre_handler != aggr_pre_handler) {
- /* If old_p is not an aggr_probe, create new aggr_kprobe. */
- ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+ if (!kprobe_aggrprobe(old_p)) {
+ /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
+ ap = alloc_aggr_kprobe(old_p);
if (!ap)
return -ENOMEM;
- add_aggr_kprobe(ap, old_p);
+ init_aggr_kprobe(ap, old_p);
}
if (kprobe_gone(ap)) {
@@ -629,6 +896,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
*/
return ret;
+ /* Prepare optimized instructions if possible. */
+ prepare_optimized_kprobe(ap);
+
/*
* Clear gone flag to prevent allocating new slot again, and
* set disabled flag because it is not armed yet.
@@ -637,6 +907,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
| KPROBE_FLAG_DISABLED;
}
+ /* Copy ap's insn slot to p */
copy_kprobe(ap, p);
return add_new_kprobe(ap, p);
}
@@ -748,16 +1019,23 @@ int __kprobes register_kprobe(struct kprobe *p)
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
mutex_lock(&kprobe_mutex);
+ mutex_lock(&text_mutex);
+
old_p = get_kprobe(p->addr);
if (old_p) {
+ /* text_mutex is held because this may unoptimize old_p. */
ret = register_aggr_kprobe(old_p, p);
goto out;
}
- mutex_lock(&text_mutex);
+ /* Check collision with other optimized kprobes */
+ old_p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(old_p))
+ unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
+
ret = arch_prepare_kprobe(p);
if (ret)
- goto out_unlock_text;
+ goto out;
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
@@ -766,9 +1044,11 @@ int __kprobes register_kprobe(struct kprobe *p)
if (!kprobes_all_disarmed && !kprobe_disabled(p))
arch_arm_kprobe(p);
-out_unlock_text:
- mutex_unlock(&text_mutex);
+ /* Try to optimize kprobe */
+ try_to_optimize_kprobe(p);
+
out:
+ mutex_unlock(&text_mutex);
mutex_unlock(&kprobe_mutex);
if (probed_mod)
@@ -810,7 +1090,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
return -EINVAL;
if (old_p == p ||
- (old_p->pre_handler == aggr_pre_handler &&
+ (kprobe_aggrprobe(old_p) &&
list_is_singular(&old_p->list))) {
/*
* Only probe on the hash list. Disarm only if kprobes are
@@ -818,8 +1098,13 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
* already have been removed. We save on flushing icache.
*/
if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
- disarm_kprobe(p);
+ disarm_kprobe(old_p);
hlist_del_rcu(&old_p->hlist);
+
+ /* If another kprobe was blocked, optimize it. */
+ old_p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(old_p))
+ optimize_kprobe(old_p);
} else {
if (p->break_handler && !kprobe_gone(p))
old_p->break_handler = NULL;
@@ -834,8 +1119,13 @@ noclean:
list_del_rcu(&p->list);
if (!kprobe_disabled(old_p)) {
try_to_disable_aggr_kprobe(old_p);
- if (!kprobes_all_disarmed && kprobe_disabled(old_p))
- disarm_kprobe(old_p);
+ if (!kprobes_all_disarmed) {
+ if (kprobe_disabled(old_p))
+ disarm_kprobe(old_p);
+ else
+ /* Try to optimize this probe again */
+ optimize_kprobe(old_p);
+ }
}
}
return 0;
@@ -852,7 +1142,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
old_p = list_entry(p->list.next, struct kprobe, list);
list_del(&p->list);
arch_remove_kprobe(old_p);
- kfree(old_p);
+ free_aggr_kprobe(old_p);
}
}
@@ -1148,7 +1438,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
struct kprobe *kp;
p->flags |= KPROBE_FLAG_GONE;
- if (p->pre_handler == aggr_pre_handler) {
+ if (kprobe_aggrprobe(p)) {
/*
* If this is an aggr_kprobe, we have to list all the
* chained probes and mark them GONE.
@@ -1157,6 +1447,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
kp->flags |= KPROBE_FLAG_GONE;
p->post_handler = NULL;
p->break_handler = NULL;
+ kill_optimized_kprobe(p);
}
/*
* Here, we can remove insn_slot safely, because no thread calls
@@ -1259,6 +1550,11 @@ static int __init init_kprobes(void)
}
}
+#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+ /* Init kprobe_optinsn_slots */
+ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+
/* By default, kprobes are armed */
kprobes_all_disarmed = false;
@@ -1277,7 +1573,7 @@ static int __init init_kprobes(void)
#ifdef CONFIG_DEBUG_FS
static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
- const char *sym, int offset,char *modname)
+ const char *sym, int offset, char *modname, struct kprobe *pp)
{
char *kprobe_type;
@@ -1287,19 +1583,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
kprobe_type = "j";
else
kprobe_type = "k";
+
if (sym)
- seq_printf(pi, "%p %s %s+0x%x %s %s%s\n",
+ seq_printf(pi, "%p %s %s+0x%x %s ",
p->addr, kprobe_type, sym, offset,
- (modname ? modname : " "),
- (kprobe_gone(p) ? "[GONE]" : ""),
- ((kprobe_disabled(p) && !kprobe_gone(p)) ?
- "[DISABLED]" : ""));
+ (modname ? modname : " "));
else
- seq_printf(pi, "%p %s %p %s%s\n",
- p->addr, kprobe_type, p->addr,
- (kprobe_gone(p) ? "[GONE]" : ""),
- ((kprobe_disabled(p) && !kprobe_gone(p)) ?
- "[DISABLED]" : ""));
+ seq_printf(pi, "%p %s %p ",
+ p->addr, kprobe_type, p->addr);
+
+ if (!pp)
+ pp = p;
+ seq_printf(pi, "%s%s%s\n",
+ (kprobe_gone(p) ? "[GONE]" : ""),
+ ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
+ (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
}
static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1335,11 +1633,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
hlist_for_each_entry_rcu(p, node, head, hlist) {
sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
- if (p->pre_handler == aggr_pre_handler) {
+ if (kprobe_aggrprobe(p)) {
list_for_each_entry_rcu(kp, &p->list, list)
- report_probe(pi, kp, sym, offset, modname);
+ report_probe(pi, kp, sym, offset, modname, p);
} else
- report_probe(pi, p, sym, offset, modname);
+ report_probe(pi, p, sym, offset, modname, NULL);
}
preempt_enable();
return 0;
@@ -1447,7 +1745,7 @@ static void __kprobes arm_all_kprobes(void)
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist)
if (!kprobe_disabled(p))
- arch_arm_kprobe(p);
+ __arm_kprobe(p);
}
mutex_unlock(&text_mutex);
@@ -1479,7 +1777,7 @@ static void __kprobes disarm_all_kprobes(void)
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
- arch_disarm_kprobe(p);
+ __disarm_kprobe(p);
}
}
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:58:24
|
Introduce x86 arch-specific optimization code, which supports both
x86-32 and x86-64.
This code also supports safety checking, which decodes the whole function
in which the probe is inserted, and checks the following conditions before
optimization:
- The optimized instructions which will be replaced by a jump instruction
don't straddle the function boundary.
- There is no indirect jump instruction, because it might jump into
the address range which is replaced by the jump operand.
- There is no jump/loop instruction which jumps into the address range
which is replaced by jump operand.
- The probe is not in a function into which fixup code will
jump.
Changes from v2:
- Fix a bug to support reentered probes (add setup_detour_execution(),
and modify setup_singlestep().)
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Ingo Molnar <mi...@el...>
Cc: Jim Keniston <jke...@us...>
Cc: Srikar Dronamraju <sr...@li...>
Cc: Christoph Hellwig <hc...@in...>
Cc: Steven Rostedt <ro...@go...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: H. Peter Anvin <hp...@zy...>
Cc: Anders Kaseorg <an...@ks...>
Cc: Tim Abbott <ta...@ks...>
Cc: Andi Kleen <an...@fi...>
---
arch/x86/Kconfig | 1
arch/x86/include/asm/kprobes.h | 31 +++
arch/x86/kernel/kprobes.c | 421 ++++++++++++++++++++++++++++++++++++++--
3 files changed, 430 insertions(+), 23 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a70cdac..367a111 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -30,6 +30,7 @@ config X86
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
select HAVE_KRETPROBES
+ select HAVE_OPTPROBES
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681d..cacc5ea 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -32,7 +32,10 @@ struct kprobe;
typedef u8 kprobe_opcode_t;
#define BREAKPOINT_INSTRUCTION 0xcc
-#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define RELATIVEJUMP_OPCODE 0xe9
+#define RELATIVECALL_OPCODE 0xe8
+#define RELATIVE_ADDR_SIZE 4
+#define RELATIVE_JUMP_SIZE (sizeof(kprobe_opcode_t) + RELATIVE_ADDR_SIZE)
#define MAX_INSN_SIZE 16
#define MAX_STACK_SIZE 64
#define MIN_STACK_SIZE(ADDR) \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
#define flush_insn_slot(p) do { } while (0)
+/* optinsn template addresses */
+extern kprobe_opcode_t optprobe_template_entry;
+extern kprobe_opcode_t optprobe_template_val;
+extern kprobe_opcode_t optprobe_template_call;
+extern kprobe_opcode_t optprobe_template_end;
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
+#define MAX_OPTINSN_SIZE \
+ (((unsigned long)&optprobe_template_end - \
+ (unsigned long)&optprobe_template_entry) + \
+ MAX_OPTIMIZED_LENGTH + RELATIVE_JUMP_SIZE)
+
extern const int kretprobe_blacklist_size;
void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
int boostable;
};
+struct arch_optimized_insn {
+ /* copy of the original instructions */
+ kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
+ /* detour code buffer */
+ kprobe_opcode_t *insn;
+ /* the size of instructions copied to detour code buffer */
+ size_t size;
+};
+
+/* Return true (!0) if optinsn is prepared for optimization. */
+static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+ return optinsn->size;
+}
+
struct prev_kprobe {
struct kprobe *kp;
unsigned long status;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 40f204b..1cd4445 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -118,16 +118,36 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
};
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes set_jmp_op(void *from, void *to)
+/*
+ * On pentium series, Unsynchronized cross-modifying code
+ * operations can cause unexpected instruction execution results.
+ * So after code is modified, we should synchronize it on each processor.
+ */
+static void __local_serialize_cpu(void *info)
+{
+ sync_core();
+}
+
+void arch_serialize_cpus(void)
{
- struct __arch_jmp_op {
- char op;
+ on_each_cpu(__local_serialize_cpu, NULL, 1);
+}
+
+static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
+{
+ struct __arch_relative_insn {
+ u8 op;
s32 raddr;
- } __attribute__((packed)) * jop;
- jop = (struct __arch_jmp_op *)from;
- jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
- jop->op = RELATIVEJUMP_INSTRUCTION;
+ } __attribute__((packed)) *insn;
+ insn = (struct __arch_relative_insn *)from;
+ insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+ insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+static void __kprobes synthesize_reljump(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
/*
@@ -214,7 +234,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
/*
* Basically, kp->ainsn.insn has an original instruction.
* However, RIP-relative instruction can not do single-stepping
- * at different place, fix_riprel() tweaks the displacement of
+ * at different place, __copy_instruction() tweaks the displacement of
* that instruction. In that case, we can't recover the instruction
* from the kp->ainsn.insn.
*
@@ -292,21 +312,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
}
/*
- * Adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
* If it does, Return the address of the 32-bit displacement word.
* If not, return null.
* Only applicable to 64-bit x86.
*/
-static void __kprobes fix_riprel(struct kprobe *p)
+static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
{
-#ifdef CONFIG_X86_64
struct insn insn;
- kernel_insn_init(&insn, p->ainsn.insn);
+ int ret;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ kernel_insn_init(&insn, src);
+ if (recover) {
+ insn_get_opcode(&insn);
+ if (OPCODE1(&insn) == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf,
+ (unsigned long)src);
+ if (ret)
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ }
+ insn_get_length(&insn);
+ memcpy(dest, insn.kaddr, insn.length);
+#ifdef CONFIG_X86_64
if (insn_rip_relative(&insn)) {
s64 newdisp;
u8 *disp;
+ kernel_insn_init(&insn, dest);
insn_get_displacement(&insn);
/*
* The copied instruction uses the %rip-relative addressing
@@ -320,20 +356,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
* extension of the original signed 32-bit displacement would
* have given.
*/
- newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
- (u8 *) p->ainsn.insn;
+ newdisp = (u8 *) src + (s64) insn.displacement.value -
+ (u8 *) dest;
BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
- disp = (u8 *) p->ainsn.insn + INSN_DISPLACEMENT_OFFS(&insn);
+ disp = (u8 *) dest + INSN_DISPLACEMENT_OFFS(&insn);
*(s32 *) disp = (s32) newdisp;
}
#endif
+ return insn.length;
}
static void __kprobes arch_copy_kprobe(struct kprobe *p)
{
- memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-
- fix_riprel(p);
+ /*
+ * Copy an instruction without recovering int3, because it will be
+ * put by another subsystem.
+ */
+ __copy_instruction(p->ainsn.insn, p->addr, 0);
if (can_boost(p->addr))
p->ainsn.boostable = 0;
@@ -422,9 +461,18 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
*sara = (unsigned long) &kretprobe_trampoline;
}
+#ifdef CONFIG_OPTPROBES
+static int __kprobes setup_detour_execution(struct kprobe *p,
+ struct pt_regs *regs);
+#else
+#define setup_detour_execution(p, regs) (0)
+#endif
+
static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb, int reenter)
{
+ if (setup_detour_execution(p, regs))
+ return;
#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
if (p->ainsn.boostable == 1 && !p->post_handler) {
/* Boost up -- we can execute copied instructions directly */
@@ -827,8 +875,8 @@ static void __kprobes resume_execution(struct kprobe *p,
* These instructions can be executed directly if it
* jumps back to correct address.
*/
- set_jmp_op((void *)regs->ip,
- (void *)orig_ip + (regs->ip - copy_ip));
+ synthesize_reljump((void *)regs->ip,
+ (void *)orig_ip + (regs->ip - copy_ip));
p->ainsn.boostable = 1;
} else {
p->ainsn.boostable = -1;
@@ -1055,6 +1103,335 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
+
+#ifdef CONFIG_OPTPROBES
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+static void __kprobes synthesize_relcall(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
+ unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+void __kprobes kprobes_optinsn_template_holder(void)
+{
+ asm volatile (
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry: \n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val: \n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call: \n"
+ ASM_NOP5
+ /* Move flags to rsp */
+ " movq 144(%rsp), %rdx\n"
+ " movq %rdx, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip flags entry */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val: \n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call: \n"
+ ASM_NOP5
+ RESTORE_REGS_STRING
+ " addl $4, %esp\n" /* skip cs */
+ " popf\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end: \n");
+}
+
+#define TMPL_MOVE_IDX \
+ ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op,
+ struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ preempt_disable();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ /* Save skipped registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __get_cpu_var(current_kprobe) = &op->kp;
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __get_cpu_var(current_kprobe) = NULL;
+ }
+ preempt_enable_no_resched();
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+ int len = 0, ret;
+ while (len < RELATIVE_JUMP_SIZE) {
+ ret = __copy_instruction(dest + len, src + len, 1);
+ if (!ret || !can_boost(dest + len))
+ return -EINVAL;
+ len += ret;
+ }
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+ return (OPCODE1(insn) == 0xff || OPCODE1(insn) == 0xea);
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+ switch (OPCODE1(insn)) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((OPCODE2(insn) & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((OPCODE1(insn) & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+ return (start <= target && target <= start + len);
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+ int ret;
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ /* Dummy buffers for lookup_symbol_attrs */
+ static char __dummy_buf[KSYM_NAME_LEN];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < RELATIVE_JUMP_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jump into this function,
+ * we can't optimize kprobe in this function.
+ */
+ return 0;
+ kernel_insn_init(&insn, (void *)addr);
+ insn_get_opcode(&insn);
+ if (OPCODE1(&insn) == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf, addr);
+ if (ret)
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ insn_get_length(&insn);
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /* Check any instructions don't jump into target */
+ if (insn_is_indirect_jump(&insn) ||
+ insn_jump_into_range(&insn, paddr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ for (i = 1; i < op->optinsn.size; i++)
+ if (get_kprobe(op->kp.addr + i))
+ return -EEXIST;
+ return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
+{
+ return ((unsigned long)op->kp.addr <= addr &&
+ (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ if (op->optinsn.insn) {
+ free_optinsn_slot(op->optinsn.insn, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy post processing instructions
+ * Target instructions MUST be relocatable.
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+ u8 *buf;
+ int ret;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ op->optinsn.insn = get_optinsn_slot();
+ if (!op->optinsn.insn)
+ return -ENOMEM;
+
+ buf = (u8 *)op->optinsn.insn;
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+ if (ret < 0) {
+ __arch_remove_optimized_kprobe(op, 0);
+ return ret;
+ }
+ op->optinsn.size = ret;
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+ (u8 *)op->kp.addr + op->optinsn.size);
+
+ flush_icache_range((unsigned long) buf,
+ (unsigned long) buf + TMPL_END_IDX +
+ op->optinsn.size + RELATIVE_JUMP_SIZE);
+ return 0;
+}
+
+/* Replace a breakpoint (int3) with a relative jump. */
+int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+{
+ kprobe_opcode_t opcode = RELATIVEJUMP_OPCODE;
+ long rel = (long)(op->optinsn.insn) -
+ ((long)(op->kp.addr) + RELATIVE_JUMP_SIZE);
+
+ /* Insert the destination address only */
+ text_poke((void *)((char *)op->kp.addr + INT3_SIZE), &rel,
+ RELATIVE_ADDR_SIZE);
+ arch_serialize_cpus();
+
+ /* Overwrite breakpoint to reljump */
+ text_poke(op->kp.addr, &opcode, sizeof(kprobe_opcode_t));
+ arch_serialize_cpus();
+ return 0;
+}
+
+/* Replace a relative jump with a breakpoint (int3). */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ /* Change (the 1st byte of) jump to int3. */
+ arch_arm_kprobe(&op->kp);
+ arch_serialize_cpus();
+ /*
+ * Recover the instructions covered by the destination address.
+ * The int3 will be removed by arch_disarm_kprobe()
+ */
+ text_poke((void *)((long)op->kp.addr + INT3_SIZE),
+ (void *)op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ arch_serialize_cpus();
+}
+
+static int __kprobes setup_detour_execution(struct kprobe *p,
+ struct pt_regs *regs)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+ return 0;
+}
+#endif
+
int __init arch_init_kprobes(void)
{
return 0;
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:58:13
|
Add documentations about kprobe jump optimization to Documentation/kprobes.txt. Changes from v2: - Add a description about sysctl (Appendix B). Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Jim Keniston <jke...@us...> Cc: Srikar Dronamraju <sr...@li...> Cc: Christoph Hellwig <hc...@in...> Cc: Steven Rostedt <ro...@go...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: H. Peter Anvin <hp...@zy...> Cc: Anders Kaseorg <an...@ks...> Cc: Tim Abbott <ta...@ks...> Cc: Andi Kleen <an...@fi...> --- Documentation/kprobes.txt | 192 ++++++++++++++++++++++++++++++++++++++++++--- 1 files changed, 179 insertions(+), 13 deletions(-) diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 053037a..e4b0504 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -1,6 +1,7 @@ Title : Kernel Probes (Kprobes) Authors : Jim Keniston <jke...@us...> : Prasanna S Panchamukhi <pra...@in...> + : Masami Hiramatsu <mhi...@re...> CONTENTS @@ -14,6 +15,7 @@ CONTENTS 8. Kprobes Example 9. Jprobes Example 10. Kretprobes Example +11. Optimization Example Appendix A: The kprobes debugfs interface 1. Concepts: Kprobes, Jprobes, Return Probes @@ -42,13 +44,13 @@ registration/unregistration of a group of *probes. These functions can speed up unregistration process when you have to unregister a lot of probes at once. -The next three subsections explain how the different types of -probes work. They explain certain things that you'll need to -know in order to make the best use of Kprobes -- e.g., the -difference between a pre_handler and a post_handler, and how -to use the maxactive and nmissed fields of a kretprobe. But -if you're in a hurry to start using Kprobes, you can skip ahead -to section 2. +The next four subsections explain how the different types of +probes work and how the optimization works. 
They explain certain +things that you'll need to know in order to make the best use of +Kprobes -- e.g., the difference between a pre_handler and +a post_handler, and how to use the maxactive and nmissed fields of +a kretprobe. But if you're in a hurry to start using Kprobes, you +can skip ahead to section 2. 1.1 How Does a Kprobe Work? @@ -161,13 +163,110 @@ In case probed function is entered but there is no kretprobe_instance object available, then in addition to incrementing the nmissed count, the user entry_handler invocation is also skipped. +1.4 How Does the Optimization Work? + + If you configured kernel with CONFIG_OPTPROBES=y (currently this option is +supported on x86/x86-64, non-preemptive kernel) and +"debug.kprobes_optimization" sysctl sets 1, kprobes tries to use a +jump instruction instead of breakpoint instruction automatically. + +1.4.1 Init a Kprobe + + Before preparing optimization, Kprobes inserts original(user-defined) +kprobe on the specified address. So, even if the kprobe is not +possible to be optimized, it just uses a normal kprobe. + +1.4.2 Safety check + + First, Kprobes gets the address of probed function and checks whether the +optimized region, which will be replaced by a jump instruction, does NOT +straddle the function boundary, because if the optimized region reaches the +next function, its caller causes unexpected results. + Next, Kprobes decodes whole body of probed function and checks there is +NO indirect jump, NO instruction which will cause exception by checking +exception_tables (this will jump to fixup code and fixup code jumps into +same function body) and NO near jump which jumps into the optimized region +(except the 1st byte of jump), because if some jump instruction jumps +into the middle of another instruction, it causes unexpected results too. 
+ Kprobes also measures the length of instructions which will be replaced +by a jump instruction, because a jump instruction is longer than 1 byte, +it may replaces multiple instructions, and it checks whether those +instructions can be executed out-of-line. + +1.4.3 Preparing detour buffer + + Then, Kprobes prepares "detour" buffer, which contains exception emulating +code (push/pop registers, call handler), copied instructions(Kprobes copies +instructions which will be replaced by a jump, to the detour buffer), and +a jump which jumps back to the original execution path. + +1.4.4 Pre-optimization + + After preparing detour buffer, Kprobes checks that the probe is *NOT* in +the below cases; + - The probe has either break_handler or post_handler. + - Other probes are probing the instructions which will be replaced by + a jump instruction. + - The probe is disabled. +In above cases, Kprobes just doesn't start optimizating the probe. + + If the kprobe can be optimized, Kprobes enqueues the kprobe to optimizing +list and kicks kprobe-optimizer workqueue to optimize it. To wait other +optimized probes, kprobe-optimizer will delay to work. + When the optimized-kprobe is hit before optimization, its handler changes +IP(instruction pointer) to copied code and exits. So, the instructions which +were copied to detour buffer are executed on the detour buffer. + +1.4.5 Optimization + + Kprobe-optimizer doesn't start instruction-replacing soon, it waits +synchronize_sched for safety, because some processors are possible to be +interrupted on the instructions which will be replaced by a jump instruction. +As you know, synchronize_sched() can ensure that all interruptions which were +executed when synchronize_sched() was called are done, only if +CONFIG_PREEMPT=n. 
So, this version supports only the kernel with +CONFIG_PREEMPT=n.(*) + After that, kprobe-optimizer replaces the 4 bytes right after int3 +breakpoint with relative-jump destination, and synchronize caches on all +processors. And then, it replaces int3 with relative-jump opcode, and +synchronize caches again. + + After optimizing the probe, a CPU hits the jump instruction and jumps to +the out-of-line buffer directly. Thus the breakpoint exception is skipped. + +1.4.6 Unoptimization + + When unregistering, disabling kprobe or being blocked by other kprobe, +an optimized-kprobe will be unoptimized. Before kprobe-optimizer runs, +the kprobe just be dequeued from the optimized list. When the optimization +has been done, it replaces a jump with int3 breakpoint and original code. + First it puts int3 at the first byte of the jump, synchronize caches +on all processors, replaces the 4 bytes right after int3 with the original +code and synchronize caches again. + +(*)This optimization-safety checking may be replaced with stop-machine method + which ksplice is done for supporting CONFIG_PREEMPT=y kernel. + +NOTE for geeks: +The jump optimization changes the kprobe's pre_handler behavior. +Without optimization, pre_handler can change kernel execution path by +changing regs->ip and return 1. However, after optimizing the probe, +that modification is ignored. Thus, if you'd like to tweak kernel +execution path, you need to avoid optimization. In that case, you can +choose either, + - Set empty function to post_handler or break_handler. + or + - Config CONFIG_OPTPROBES=n. + or + - Execute 'sysctl -w debug.kprobes_optimization=n' + 2. Architectures Supported Kprobes, jprobes, and return probes are implemented on the following architectures: -- i386 -- x86_64 (AMD-64, EM64T) +- i386 (Supports jump optimization) +- x86_64 (AMD-64, EM64T) (Supports jump optimization) - ppc64 - ia64 (Does not support probes on instruction slot1.) - sparc64 (Return probes not yet implemented.) 
@@ -193,6 +292,10 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO), so you can use "objdump -d -l vmlinux" to see the source-to-object code mapping. +If you want to reduce probing overhead, set "Kprobes jump optimization +support" (CONFIG_OPTPROBES) to "y". You can find this option under +"Kprobes" line. + 4. API Reference The Kprobes API includes a "register" function and an "unregister" @@ -387,9 +490,12 @@ the probe which has been registered. 5. Kprobes Features and Limitations -Kprobes allows multiple probes at the same address. Currently, -however, there cannot be multiple jprobes on the same function at -the same time. +Kprobes allows multiple probes at the same address even if it is optimized. +Currently, however, there cannot be multiple jprobes on the same function +at the same time. And also, optimized kprobes cannot invoke the +post_handler and the break_handler. So if you attempt to install the probe +which has the post_handler or the break_handler at the same address of +an optimized kprobe, the probe will be unoptimized automatically. In general, you can install a probe anywhere in the kernel. In particular, you can probe interrupt handlers. Known exceptions @@ -453,6 +559,37 @@ reason, Kprobes doesn't support return probes (or kprobes or jprobes) on the x86_64 version of __switch_to(); the registration functions return -EINVAL. +On x86/x86-64, since the Jump Optimization of Kprobes modifies instructions +widely, there are some limitations for optimization. To explain it, +we introduce some terminology. Imagine a certain binary line which is +constructed by 2 byte instruction, 2byte instruction and 3byte instruction.
+ + IA + | +[-2][-1][0][1][2][3][4][5][6][7] + [ins1][ins2][ ins3 ] + [<- DCR ->] + [<- JTPR ->] + +ins1: 1st Instruction +ins2: 2nd Instruction +ins3: 3rd Instruction +IA: Insertion Address +JTPR: Jump Target Prohibition Region +DCR: Detoured Code Region + +The instructions in DCR are copied to the out-of-line buffer +of the djprobe instance, because the bytes in JTPR are replaced by +a jump instruction. So, there are several limitations. + +a) The instructions in DCR must be relocatable. +b) The instructions in DCR must not include call instruction. +c) JTPR must not be targeted by any jump or call instruction. +d) DCR must not straddle the border between functions. + +Anyway, these limitations are checked by in-kernel instruction decoder, +so you don't need to care about that. + 6. Probe Overhead On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0 @@ -476,6 +613,19 @@ k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07 ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU) k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99 +6.1 Optimized Probe Overhead + +Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to +process. Here are sample overhead figures (in usec) for x86 architectures. +k = unoptimized kprobe, b = boosted(single-step skipped), o = optimized kprobe, +r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe. + +i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips +k = 0.68 usec; b = 0.27; o = 0.06; r = 0.95; rb = 0.53; ro = 0.30 + +x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips +k = 0.91 usec; b = 0.40; o = 0.06; r = 1.21; rb = 0.71; ro = 0.35 + 7. TODO a. SystemTap (http://sourceware.org/systemtap): Provides a simplified @@ -523,7 +673,8 @@ is also specified. Following columns show probe status.
If the probe is on a virtual address that is no longer valid (module init sections, module virtual addresses that correspond to modules that've been unloaded), such probes are marked with [GONE]. If the probe is temporarily disabled, -such probes are marked with [DISABLED]. +such probes are marked with [DISABLED]. If the probe is optimized, it is +marked with [OPTIMIZED]. /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. @@ -533,3 +684,18 @@ registered probes will be disarmed, till such time a "1" is echoed to this file. Note that this knob just disarms and arms all kprobes and doesn't change each probe's disabling state. This means that disabled kprobes (marked [DISABLED]) will be not enabled if you turn ON all kprobes by this knob. + + +Appendix B: The kprobes sysctl interface + +/proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF. + +When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides a knob +to globally and forcibly turn the jump optimization ON or OFF. By default, +jump optimization is allowed(ON). By echoing "0" to this file or By setting +0 to "debug.kprobes_optimization" via sysctl, all optimized probes will be +unoptimized. And new probes registered after that will not be optimized. +Note that this knob *Changes* the optimized state. This means that optimized +probes (marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be +removed). And after the knob is turned on, it will be optimized again. + -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:58:12
|
Make the insn_slot framework support slots of various sizes.
Currently, insn_slot supports only a single size of instruction buffer slot. However,
kprobes jump optimization needs larger buffers.
Changes from v2:
- Use an empty array for a variable array.
- Use WARN_ON(1) when finding a trivial bug, instead of BUG().
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Ingo Molnar <mi...@el...>
Cc: Jim Keniston <jke...@us...>
Cc: Srikar Dronamraju <sr...@li...>
Cc: Christoph Hellwig <hc...@in...>
Cc: Steven Rostedt <ro...@go...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: H. Peter Anvin <hp...@zy...>
Cc: Anders Kaseorg <an...@ks...>
Cc: Tim Abbott <ta...@ks...>
Cc: Andi Kleen <an...@fi...>
---
kernel/kprobes.c | 102 +++++++++++++++++++++++++++++++++---------------------
1 files changed, 63 insertions(+), 39 deletions(-)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6fe9dc6..0162c3c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -100,26 +100,42 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
* stepping on the instruction on a vmalloced/kmalloced/data page
* is a recipe for disaster
*/
-#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
-
struct kprobe_insn_page {
struct list_head list;
kprobe_opcode_t *insns; /* Page of instruction slots */
- char slot_used[INSNS_PER_PAGE];
int nused;
int ngarbage;
+ char slot_used[];
};
+#define KPROBE_INSN_PAGE_SIZE(slots) \
+ (offsetof(struct kprobe_insn_page, slot_used) + \
+ (sizeof(char) * (slots)))
+
+struct kprobe_insn_cache {
+ struct list_head pages; /* list of kprobe_insn_page */
+ size_t insn_size; /* size of instruction slot */
+ int nr_garbage;
+};
+
+static int slots_per_page(struct kprobe_insn_cache *c)
+{
+ return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
+}
+
enum kprobe_slot_state {
SLOT_CLEAN = 0,
SLOT_DIRTY = 1,
SLOT_USED = 2,
};
-static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
-static LIST_HEAD(kprobe_insn_pages);
-static int kprobe_garbage_slots;
-static int collect_garbage_slots(void);
+static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
+static struct kprobe_insn_cache kprobe_insn_slots = {
+ .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
+ .insn_size = MAX_INSN_SIZE,
+ .nr_garbage = 0,
+};
+static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
static int __kprobes check_safety(void)
{
@@ -149,32 +165,33 @@ loop_end:
* __get_insn_slot() - Find a slot on an executable page for an instruction.
* We allocate an executable page if there's no room on existing ones.
*/
-static kprobe_opcode_t __kprobes *__get_insn_slot(void)
+static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
retry:
- list_for_each_entry(kip, &kprobe_insn_pages, list) {
- if (kip->nused < INSNS_PER_PAGE) {
+ list_for_each_entry(kip, &c->pages, list) {
+ if (kip->nused < slots_per_page(c)) {
int i;
- for (i = 0; i < INSNS_PER_PAGE; i++) {
+ for (i = 0; i < slots_per_page(c); i++) {
if (kip->slot_used[i] == SLOT_CLEAN) {
kip->slot_used[i] = SLOT_USED;
kip->nused++;
- return kip->insns + (i * MAX_INSN_SIZE);
+ return kip->insns + (i * c->insn_size);
}
}
- /* Surprise! No unused slots. Fix kip->nused. */
- kip->nused = INSNS_PER_PAGE;
+ /* kip->nused is broken. Fix it. */
+ kip->nused = slots_per_page(c);
+ WARN_ON(1);
}
}
/* If there are any garbage slots, collect it and try again. */
- if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+ if (c->nr_garbage && collect_garbage_slots(c) == 0)
goto retry;
- }
- /* All out of space. Need to allocate a new page. Use slot 0. */
- kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
+
+ /* All out of space. Need to allocate a new page. */
+ kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
if (!kip)
return NULL;
@@ -189,19 +206,20 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
return NULL;
}
INIT_LIST_HEAD(&kip->list);
- list_add(&kip->list, &kprobe_insn_pages);
- memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
+ memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
kip->slot_used[0] = SLOT_USED;
kip->nused = 1;
kip->ngarbage = 0;
+ list_add(&kip->list, &c->pages);
return kip->insns;
}
+
kprobe_opcode_t __kprobes *get_insn_slot(void)
{
- kprobe_opcode_t *ret;
+ kprobe_opcode_t *ret = NULL;
mutex_lock(&kprobe_insn_mutex);
- ret = __get_insn_slot();
+ ret = __get_insn_slot(&kprobe_insn_slots);
mutex_unlock(&kprobe_insn_mutex);
return ret;
}
@@ -218,7 +236,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
* so as not to have to set it up again the
* next time somebody inserts a probe.
*/
- if (!list_is_singular(&kprobe_insn_pages)) {
+ if (!list_is_singular(&kip->list)) {
list_del(&kip->list);
module_free(NULL, kip->insns);
kfree(kip);
@@ -228,7 +246,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
return 0;
}
-static int __kprobes collect_garbage_slots(void)
+static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip, *next;
@@ -236,42 +254,48 @@ static int __kprobes collect_garbage_slots(void)
if (check_safety())
return -EAGAIN;
- list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
+ list_for_each_entry_safe(kip, next, &c->pages, list) {
int i;
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
- for (i = 0; i < INSNS_PER_PAGE; i++) {
+ for (i = 0; i < slots_per_page(c); i++) {
if (kip->slot_used[i] == SLOT_DIRTY &&
collect_one_slot(kip, i))
break;
}
}
- kprobe_garbage_slots = 0;
+ c->nr_garbage = 0;
return 0;
}
-void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
+static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, int dirty)
{
struct kprobe_insn_page *kip;
- mutex_lock(&kprobe_insn_mutex);
- list_for_each_entry(kip, &kprobe_insn_pages, list) {
- if (kip->insns <= slot &&
- slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
- int i = (slot - kip->insns) / MAX_INSN_SIZE;
+ list_for_each_entry(kip, &c->pages, list) {
+ long idx = ((long)slot - (long)kip->insns) / c->insn_size;
+ if (idx >= 0 && idx < slots_per_page(c)) {
+ WARN_ON(kip->slot_used[idx] != SLOT_USED);
if (dirty) {
- kip->slot_used[i] = SLOT_DIRTY;
+ kip->slot_used[idx] = SLOT_DIRTY;
kip->ngarbage++;
+ if (++c->nr_garbage > slots_per_page(c))
+ collect_garbage_slots(c);
} else
- collect_one_slot(kip, i);
- break;
+ collect_one_slot(kip, idx);
+ return;
}
}
+ /* Could not free this slot. */
+ WARN_ON(1);
+}
- if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
- collect_garbage_slots();
-
+void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
+{
+ mutex_lock(&kprobe_insn_mutex);
+ __free_insn_slot(&kprobe_insn_slots, slot, dirty);
mutex_unlock(&kprobe_insn_mutex);
}
#endif
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:58:05
|
Add /proc/sys/debug/kprobes-optimization sysctl which enables and disables
kprobes jump optimization on the fly for debugging.
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Ingo Molnar <mi...@el...>
Cc: Jim Keniston <jke...@us...>
Cc: Srikar Dronamraju <sr...@li...>
Cc: Christoph Hellwig <hc...@in...>
Cc: Steven Rostedt <ro...@go...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: H. Peter Anvin <hp...@zy...>
Cc: Anders Kaseorg <an...@ks...>
Cc: Tim Abbott <ta...@ks...>
Cc: Andi Kleen <an...@fi...>
---
include/linux/kprobes.h | 9 +++++
kernel/kprobes.c | 85 +++++++++++++++++++++++++++++++++++++++++++++--
kernel/sysctl.c | 13 +++++++
3 files changed, 104 insertions(+), 3 deletions(-)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 6f75014..de592ea 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -283,6 +283,15 @@ extern int arch_within_optimized_kprobe(struct optimized_kprobe *op,
unsigned long addr);
extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);
+
+#ifdef CONFIG_SYSCTL
+extern int sysctl_kprobes_optimization;
+extern int proc_kprobes_optimization_handler(struct ctl_table *table,
+ int write, struct file *file,
+ void __user *buffer,
+ size_t *length, loff_t *ppos);
+#endif
+
#endif /* CONFIG_OPTPROBES */
/* Get the kprobe at this addr (if any) - called with preemption disabled */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0cbb607..1350e3d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
#include <linux/debugfs.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
+#include <linux/sysctl.h>
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
@@ -374,6 +375,9 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
}
#ifdef CONFIG_OPTPROBES
+/* NOTE: change this value only with kprobe_mutex held */
+static bool kprobes_allow_optimization;
+
/*
* Call all pre_handler on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
@@ -434,7 +438,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
mutex_lock(&kprobe_mutex);
- if (kprobes_all_disarmed)
+ if (kprobes_all_disarmed || !kprobes_allow_optimization)
goto end;
/* Wait quiesence period for ensuring all interrupts are done */
@@ -458,7 +462,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
/* Check if the kprobe is disabled or not ready for optimization. */
- if (!kprobe_optready(p) ||
+ if (!kprobe_optready(p) || !kprobes_allow_optimization ||
(kprobe_disabled(p) || kprobes_all_disarmed))
return;
@@ -570,6 +574,77 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
optimize_kprobe(ap);
return;
}
+
+#ifdef CONFIG_SYSCTL
+static void __kprobes optimize_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ /* If optimization is already allowed, just return */
+ if (kprobes_allow_optimization)
+ return;
+
+ kprobes_allow_optimization = true;
+ mutex_lock(&text_mutex);
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist)
+ if (!kprobe_disabled(p))
+ optimize_kprobe(p);
+ }
+ mutex_unlock(&text_mutex);
+ printk(KERN_INFO "Kprobes globally optimized\n");
+}
+
+static void __kprobes unoptimize_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ /* If optimization is already prohibited, just return */
+ if (!kprobes_allow_optimization)
+ return;
+
+ kprobes_allow_optimization = false;
+ printk(KERN_INFO "Kprobes globally unoptimized\n");
+ mutex_lock(&text_mutex);
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist) {
+ if (!kprobe_disabled(p))
+ unoptimize_kprobe(p);
+ }
+ }
+
+ mutex_unlock(&text_mutex);
+ /* Allow all currently running kprobes to complete */
+ synchronize_sched();
+}
+
+int sysctl_kprobes_optimization;
+int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
+ struct file *file, void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ int ret;
+ mutex_lock(&kprobe_mutex);
+ sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
+ ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+
+ if (sysctl_kprobes_optimization)
+ optimize_all_kprobes();
+ else
+ unoptimize_all_kprobes();
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
#else /* !CONFIG_OPTPROBES */
#define get_optimized_kprobe(addr) (NULL)
#define optimize_kprobe(p) do {} while (0)
@@ -1550,10 +1625,14 @@ static int __init init_kprobes(void)
}
}
-#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+#if defined(CONFIG_OPTPROBES)
+#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
/* Init kprobe_optinsn_slots */
kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
#endif
+ /* By default, kprobes can be optimized */
+ kprobes_allow_optimization = true;
+#endif
/* By default, kprobes are armed */
kprobes_all_disarmed = false;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 32460ea..4359d61 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -51,6 +51,7 @@
#include <linux/ftrace.h>
#include <linux/slow-work.h>
#include <linux/perf_counter.h>
+#include <linux/kprobes.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -1551,6 +1552,18 @@ static struct ctl_table debug_table[] = {
.proc_handler = proc_dointvec
},
#endif
+#if defined(CONFIG_OPTPROBES)
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "kprobes-optimization",
+ .data = &sysctl_kprobes_optimization,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_kprobes_optimization_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ .ctl_name = 0 }
};
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:57:52
|
Hi,
Here is the patchset for kprobes jump optimization v3
(a.k.a. Djprobe). This version includes some bugfixes and a
sysctl interface for disabling optimization at runtime.
I fixed an important bug in the handling of reentered probes, which
caused an unexpected kernel panic if a probe was hit while another
probe handler was running.
As far as I can see, gcc's crossjumping only unifies equivalent
code inside a function and doesn't generate jumps which jump into
the middle of other functions. So I decided to drop the patch that
disables crossjumping from this series.
These patches can be applied on -tip tree + x86 instruction
decoder which I sent before. This is another example of x86
instruction decoder.
Jump Optimized Kprobes
======================
o Concept
Kprobes uses the int3 breakpoint instruction on x86 for instrumenting
probes into running kernel. Jump optimization allows kprobes to replace
breakpoint with a jump instruction for reducing probing overhead drastically.
o Performance
An optimized kprobe is 5 times faster than a normal kprobe.
Optimizing probes improves their performance. Usually, a kprobe hit takes
0.5 to 1.0 microseconds to process. On the other hand, a jump optimized
probe hit takes less than 0.1 microseconds (actual number depends on the
processor). Here are some sample overheads.
Intel(R) Xeon(R) CPU E5410 @ 2.33GHz (without debugging options)
x86-32 x86-64
kprobe: 0.68us 0.91us
kprobe+booster: 0.27us 0.40us
kprobe+optimized: 0.06us 0.06us
kretprobe : 0.95us 1.21us
kretprobe+booster: 0.53us 0.71us
kretprobe+optimized: 0.30us 0.35us
(booster skips single-stepping)
Note that jump optimization also consumes more memory, but not much.
It uses only ~200 bytes per probe, so even if you use ~10,000 probes, it
consumes just a few MB.
o Usage
Set CONFIG_OPTPROBES=y when building a kernel, then all *probes will be
optimized if possible.
Kprobes decodes probed function and checks whether the target instructions
can be optimized(replaced with a jump) safely. If it can't be, Kprobes just
doesn't optimize it.
o Optimization
Before preparing optimization, Kprobes inserts the original (user-defined)
kprobe at the specified address. So, even if the kprobe cannot be
optimized, it just works as a normal kprobe.
- Safety check
First, Kprobes gets the address of probed function and checks whether the
optimized region, which will be replaced by a jump instruction, does NOT
straddle the function boundary, because if the optimized region reaches the
next function, its caller causes unexpected results.
Next, Kprobes decodes whole body of probed function and checks there is
NO indirect jump, NO instruction which will cause exception by checking
exception_tables (this will jump to fixup code and fixup code jumps into
same function body) and NO near jump which jumps into the optimized region
(except the 1st byte of jump), because if some jump instruction jumps
into the middle of another instruction, it causes unexpected results too.
Kprobes also measures the length of the instructions which will be replaced
by a jump instruction: because a jump instruction is longer than 1 byte,
it may replace multiple instructions. Kprobes then checks whether those
instructions can be executed out-of-line.
- Preparing detour code
Then, Kprobes prepares "detour" buffer, which contains exception emulating
code (push/pop registers, call handler), copied instructions(Kprobes copies
instructions which will be replaced by a jump, to the detour buffer), and
a jump which jumps back to the original execution path.
- Pre-optimization
After preparing the detour code, Kprobes enqueues the kprobe on the optimizing list
and kicks the kprobe-optimizer workqueue to optimize it. To wait for other probes
to be optimized, the kprobe-optimizer delays its work.
When the optimized-kprobe is hit before optimization, its handler
changes IP(instruction pointer) to copied code and exits. So, the
instructions which were copied to detour buffer are executed on the detour
buffer.
- Optimization
The kprobe-optimizer doesn't start replacing instructions immediately; it first
waits for synchronize_sched() for safety, because some processors may have been
interrupted on the instructions which will be replaced by a jump instruction.
As you know, synchronize_sched() can ensure that all interrupts which were
being handled when synchronize_sched() was called have completed, but only if
CONFIG_PREEMPT=n. So, this version supports only kernels built with
CONFIG_PREEMPT=n.(*)
After that, the kprobe-optimizer replaces the 4 bytes right after the int3 breakpoint
with the relative-jump destination, and synchronizes caches on all processors. Next,
it replaces the int3 with the relative-jump opcode, and synchronizes caches again.
- Unoptimization
When an optimized kprobe is unregistered, disabled, or blocked by another kprobe,
it will be unoptimized. If this happens before the kprobe-optimizer runs,
the kprobe is just dequeued from the optimizing list. If the optimization
has already been done, the jump is replaced with an int3 breakpoint and the original code.
First it puts int3 at the first byte of the jump, synchronizes caches
on all processors, and then replaces the 4 bytes right after int3 with the
original code.
(*) This optimization-safety check may be replaced with the stop-machine method
that ksplice uses, in order to support CONFIG_PREEMPT=y kernels.
Thank you,
---
Masami Hiramatsu (7):
kprobes: Add documents of jump optimization
kprobes: x86: Support kprobes jump optimization on x86
kprobes: x86: Cleanup save/restore registers
kprobes: x86: Boost probes when reentering
kprobes: Jump optimization sysctl interface
kprobes: Introduce kprobes jump optimization
kprobes: Introduce generic insn_slot framework
Documentation/kprobes.txt | 192 ++++++++++++-
arch/Kconfig | 13 +
arch/x86/Kconfig | 1
arch/x86/include/asm/kprobes.h | 31 ++
arch/x86/kernel/kprobes.c | 589 +++++++++++++++++++++++++++++++++-------
include/linux/kprobes.h | 45 +++
kernel/kprobes.c | 571 +++++++++++++++++++++++++++++++++------
kernel/sysctl.c | 13 +
8 files changed, 1252 insertions(+), 203 deletions(-)
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:57:46
|
Introduce SAVE/RESTORE_REGS_STRING for cleanup kretprobe-trampoline asm code. These macros will be used for emulating interruption. Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Jim Keniston <jke...@us...> Cc: Srikar Dronamraju <sr...@li...> Cc: Christoph Hellwig <hc...@in...> Cc: Steven Rostedt <ro...@go...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: H. Peter Anvin <hp...@zy...> Cc: Anders Kaseorg <an...@ks...> Cc: Tim Abbott <ta...@ks...> Cc: Andi Kleen <an...@fi...> --- arch/x86/kernel/kprobes.c | 128 ++++++++++++++++++++++++--------------------- 1 files changed, 67 insertions(+), 61 deletions(-) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 4e9cf46..40f204b 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -566,6 +566,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. 
*/ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %ds\n" \ + " pushl %es\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -579,65 +642,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subq $24, %rsp\n" - " pushq %rdi\n" - " pushq %rsi\n" - " pushq %rdx\n" - " pushq %rcx\n" - " pushq %rax\n" - " pushq %r8\n" - " pushq %r9\n" - " pushq %r10\n" - " pushq %r11\n" - " pushq %rbx\n" - " pushq %rbp\n" - " pushq %r12\n" - " pushq %r13\n" - " pushq %r14\n" - " pushq %r15\n" + SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ " movq %rax, 152(%rsp)\n" - " popq %r15\n" - " popq %r14\n" - " popq %r13\n" - " popq %r12\n" - " popq %rbp\n" - " popq %rbx\n" - " popq %r11\n" - " popq %r10\n" - " popq %r9\n" - " popq %r8\n" - " popq %rax\n" - " popq %rcx\n" - " popq %rdx\n" - " popq %rsi\n" - " popq %rdi\n" - /* Skip orig_ax, ip, cs */ - " addq $24, %rsp\n" + RESTORE_REGS_STRING " popfq\n" #else " pushf\n" - /* - * Skip cs, ip, orig_ax and gs. 
- * trampoline_handler() will plug in these values - */ - " subl $16, %esp\n" - " pushl %fs\n" - " pushl %es\n" - " pushl %ds\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" + SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ @@ -645,15 +659,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ " movl %eax, 56(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* Skip ds, es, fs, gs, orig_ax and ip */ - " addl $24, %esp\n" + RESTORE_REGS_STRING " popf\n" #endif " ret\n"); -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:57:40
|
Clean up prepare_singlestep() and setup_singlestep() so that reentered
probes can be boosted.
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Ingo Molnar <mi...@el...>
Cc: Jim Keniston <jke...@us...>
Cc: Srikar Dronamraju <sr...@li...>
Cc: Christoph Hellwig <hc...@in...>
Cc: Steven Rostedt <ro...@go...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: H. Peter Anvin <hp...@zy...>
Cc: Anders Kaseorg <an...@ks...>
Cc: Tim Abbott <ta...@ks...>
Cc: Andi Kleen <an...@fi...>
---
arch/x86/kernel/kprobes.c | 40 +++++++++++++++++++---------------------
1 files changed, 19 insertions(+), 21 deletions(-)
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b77e050..4e9cf46 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -411,18 +411,6 @@ static void __kprobes restore_btf(void)
update_debugctlmsr(current->thread.debugctlmsr);
}
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
- clear_btf();
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
- /* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
- regs->ip = (unsigned long)p->addr;
- else
- regs->ip = (unsigned long)p->ainsn.insn;
-}
-
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -435,7 +423,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
}
static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+ struct kprobe_ctlblk *kcb, int reenter)
{
#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
if (p->ainsn.boostable == 1 && !p->post_handler) {
@@ -446,8 +434,21 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
return;
}
#endif
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_HIT_SS;
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ /* Prepare real single stepping */
+ clear_btf();
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+ /* single step inline if the instruction is an int3 */
+ if (p->opcode == BREAKPOINT_INSTRUCTION)
+ regs->ip = (unsigned long)p->addr;
+ else
+ regs->ip = (unsigned long)p->ainsn.insn;
}
/*
@@ -472,11 +473,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
break;
#endif
case KPROBE_HIT_ACTIVE:
- save_previous_kprobe(kcb);
- set_current_kprobe(p, regs, kcb);
kprobes_inc_nmissed_count(p);
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_REENTER;
+ setup_singlestep(p, regs, kcb, 1);
break;
case KPROBE_HIT_SS:
if (p == kprobe_running()) {
@@ -553,13 +551,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
* more here.
*/
if (!p->pre_handler || !p->pre_handler(p, regs))
- setup_singlestep(p, regs, kcb);
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} else if (kprobe_running()) {
p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
- setup_singlestep(p, regs, kcb);
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} /* else: not a kprobe fault; let the kernel handle it */
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-07 00:05:24
|
Andi Kleen wrote:
> On Mon, Jul 06, 2009 at 03:28:02PM -0400, Masami Hiramatsu wrote:
>> I'm not so sure about your idea.
>> Would you mean below code?
>>
>> int offs_table[NR_REGPARMS] = {
>
> not REGPARMS of course
>
>> [0] = offsetof(struct pt_regs, di),
>> ...
>> };
>> if (n < NR_REGPARMS)
>> return *((unsigned long *)regs + offs_table[n]);
>
> Yes.
OK, here, I updated my patch.
Thank you,
x86: add pt_regs register and stack access APIs
From: Masami Hiramatsu <mhi...@re...>
Add following APIs for accessing registers and stack entries from pt_regs.
These APIs are required by kprobes-based event tracer on ftrace.
Some other debugging tools might be able to use it too.
- regs_query_register_offset(const char *name)
Query the offset of "name" register.
- regs_query_register_name(unsigned offset)
Query the name of register by its offset.
- regs_get_register(struct pt_regs *regs, unsigned offset)
Get the value of a register by its offset.
- regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
Check the address is in the kernel stack.
- regs_get_kernel_stack_nth(struct pt_regs *reg, unsigned nth)
Get Nth entry of the kernel stack. (N >= 0)
- regs_get_argument_nth(struct pt_regs *reg, unsigned nth)
Get Nth argument at function call. (N >= 0)
Changes from v10:
- Use an offsetof table in regs_get_argument_nth().
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Andi Kleen <an...@fi...>
Cc: Christoph Hellwig <hc...@in...>
Cc: Steven Rostedt <ro...@go...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Ingo Molnar <mi...@el...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: Roland McGrath <ro...@re...>
Cc: Srikar Dronamraju <sr...@li...>
Cc: lin...@vg...
---
arch/x86/include/asm/ptrace.h | 61 ++++++++++++++++++++++
arch/x86/kernel/ptrace.c | 112 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 173 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908..a9b7e2d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
#ifdef __KERNEL__
#include <asm/segment.h>
+#include <asm/page_types.h>
#endif
#ifndef __ASSEMBLY__
@@ -216,6 +217,66 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
return regs->sp;
}
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs: pt_regs from which register value is gotten.
+ * @offset: offset number of the register.
+ *
+ * regs_get_register returns the value of a register whose offset from @regs
+ * is @offset. The @offset is the offset of the register in struct pt_regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+ unsigned offset)
+{
+ if (unlikely(offset > MAX_REG_OFFSET))
+ return 0;
+ return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @addr: address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+ unsigned long addr)
+{
+ return ((addr & ~(THREAD_SIZE - 1)) ==
+ (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+ unsigned n)
+{
+ unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return *addr;
+ else
+ return 0;
+}
+
+/* Get Nth argument at function call */
+extern unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned n);
+
/*
* These are defined as per linux/ptrace.h, which see.
*/
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index cabdabc..4f9b513 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -49,6 +49,118 @@ enum x86_regset {
REGSET_IOPERM32,
};
+struct pt_regs_offset {
+ const char *name;
+ int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+ REG_OFFSET_NAME(r15),
+ REG_OFFSET_NAME(r14),
+ REG_OFFSET_NAME(r13),
+ REG_OFFSET_NAME(r12),
+ REG_OFFSET_NAME(r11),
+ REG_OFFSET_NAME(r10),
+ REG_OFFSET_NAME(r9),
+ REG_OFFSET_NAME(r8),
+#endif
+ REG_OFFSET_NAME(bx),
+ REG_OFFSET_NAME(cx),
+ REG_OFFSET_NAME(dx),
+ REG_OFFSET_NAME(si),
+ REG_OFFSET_NAME(di),
+ REG_OFFSET_NAME(bp),
+ REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+ REG_OFFSET_NAME(ds),
+ REG_OFFSET_NAME(es),
+ REG_OFFSET_NAME(fs),
+ REG_OFFSET_NAME(gs),
+#endif
+ REG_OFFSET_NAME(orig_ax),
+ REG_OFFSET_NAME(ip),
+ REG_OFFSET_NAME(cs),
+ REG_OFFSET_NAME(flags),
+ REG_OFFSET_NAME(sp),
+ REG_OFFSET_NAME(ss),
+ REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name: the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ */
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->offset;
+ return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset: the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
+ */
+const char *regs_query_register_name(unsigned offset)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (roff->offset == offset)
+ return roff->name;
+ return NULL;
+}
+
+static const int arg_offs_table[] = {
+#ifdef CONFIG_X86_32
+ [0] = offsetof(struct pt_regs, ax),
+ [1] = offsetof(struct pt_regs, dx),
+ [2] = offsetof(struct pt_regs, cx)
+#else /* CONFIG_X86_64 */
+ [0] = offsetof(struct pt_regs, di),
+ [1] = offsetof(struct pt_regs, si),
+ [2] = offsetof(struct pt_regs, dx),
+ [3] = offsetof(struct pt_regs, cx),
+ [4] = offsetof(struct pt_regs, r8),
+ [5] = offsetof(struct pt_regs, r9)
+#endif
+};
+
+/**
+ * regs_get_argument_nth() - get Nth argument at function call
+ * @regs: pt_regs which contains registers at function entry.
+ * @n: argument number (0-based).
+ *
+ * Returns the @n th argument of a function call: read from pt_regs through
+ * the byte offsets in arg_offs_table[] while @n indexes that table, otherwise
+ * read from the kernel stack. The stack changes right after function entry,
+ * so use this at function entry. Returns 0 if the entry is NOT in the stack.
+ */
+unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned n)
+{
+ if (n < ARRAY_SIZE(arg_offs_table))
+ return *(unsigned long *)((char *)regs + arg_offs_table[n]);
+ else {
+ /*
+ * The typical case: arg n is on the stack.
+ * (Note: stack[0] = return address, so skip it)
+ */
+ n -= ARRAY_SIZE(arg_offs_table);
+ return regs_get_kernel_stack_nth(regs, 1 + n);
+ }
+}
+
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Andi K. <an...@fi...> - 2009-07-06 21:53:06
|
On Mon, Jul 06, 2009 at 03:28:02PM -0400, Masami Hiramatsu wrote:
> I'm not so sure about your idea.
> Would you mean below code?
>
> int offs_table[NR_REGPARMS] = {
not REGPARMS of course
> [0] = offsetof(struct pt_regs, di),
> ...
> };
> if (n < NR_REGPARMS)
> return *((unsigned long *)regs + offs_table[n]);
Yes.
-Andi
--
ak...@li... -- Speaking for myself only.
|
|
From: Masami H. <mhi...@re...> - 2009-07-06 19:54:55
|
Andi Kleen wrote:
> Masami Hiramatsu <mhi...@re...> writes:
>
>> Add following APIs for accessing registers and stack entries from pt_regs.
>
> You forgot to state who calls these functions/why are they added?
> Who only has strings for registers?
Oh, yes. This patch is needed for kprobes based event tracer on ftrace.
Some other debugging tools might be able to use it.
> I can see the point of having a function for nth argument though,
> that's useful.
>
>> +static inline unsigned long regs_get_argument_nth(struct pt_regs *regs,
>> + unsigned n)
>> +{
>> + if (n < NR_REGPARMS) {
>> + switch (n) {
>> + case 0:
>> + return regs->ax;
>> + case 1:
>> + return regs->dx;
>> + case 2:
>> + return regs->cx;
>
>
> [....]
>
> That could be done shorter with a offsetof table.
>
>> + if (n < NR_REGPARMS) {
>> + switch (n) {
>> + case 0:
>> + return regs->di;
>> + case 1:
>> + return regs->si;
>> + case 2:
>> + return regs->dx;
>> + case 3:
>> + return regs->cx;
>> + case 4:
>> + return regs->r8;
>> + case 5:
>> + return regs->r9;
>
> and that too.
I'm not so sure about your idea.
Would you mean below code?
int offs_table[NR_REGPARMS] = {
[0] = offsetof(struct pt_regs, di),
...
};
if (n < NR_REGPARMS)
return *((unsigned long *)regs + offs_table[n]);
Thank you,
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Andi K. <an...@fi...> - 2009-07-06 14:35:09
|
Masami Hiramatsu <mhi...@re...> writes:
> Add following APIs for accessing registers and stack entries from pt_regs.
You forgot to state who calls these functions/why are they added?
Who only has strings for registers?
I can see the point of having a function for nth argument though,
that's useful.
> +static inline unsigned long regs_get_argument_nth(struct pt_regs *regs,
> + unsigned n)
> +{
> + if (n < NR_REGPARMS) {
> + switch (n) {
> + case 0:
> + return regs->ax;
> + case 1:
> + return regs->dx;
> + case 2:
> + return regs->cx;
[....]
That could be done shorter with a offsetof table.
> + if (n < NR_REGPARMS) {
> + switch (n) {
> + case 0:
> + return regs->di;
> + case 1:
> + return regs->si;
> + case 2:
> + return regs->dx;
> + case 3:
> + return regs->cx;
> + case 4:
> + return regs->r8;
> + case 5:
> + return regs->r9;
and that too.
-Andi
--
ak...@li... -- Speaking for myself only.
|
|
From: Frederic W. <fwe...@gm...> - 2009-07-06 02:51:21
|
On Tue, Jun 30, 2009 at 09:09:11PM -0400, Masami Hiramatsu wrote:
> Add following APIs for accessing registers and stack entries from pt_regs.
>
> - regs_query_register_offset(const char *name)
> Query the offset of "name" register.
>
> - regs_query_register_name(unsigned offset)
> Query the name of register by its offset.
>
> - regs_get_register(struct pt_regs *regs, unsigned offset)
> Get the value of a register by its offset.
>
> - regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
> Check the address is in the kernel stack.
>
> - regs_get_kernel_stack_nth(struct pt_regs *reg, unsigned nth)
> Get Nth entry of the kernel stack. (N >= 0)
>
> - regs_get_argument_nth(struct pt_regs *reg, unsigned nth)
> Get Nth argument at function call. (N >= 0)
>
> Changes from v9:
> -Fix a typo in a comment.
>
> Signed-off-by: Masami Hiramatsu <mhi...@re...>
> Cc: Christoph Hellwig <hc...@in...>
> Cc: Steven Rostedt <ro...@go...>
> Cc: Ananth N Mavinakayanahalli <an...@in...>
> Cc: Ingo Molnar <mi...@el...>
> Cc: Frederic Weisbecker <fwe...@gm...>
> Cc: Roland McGrath <ro...@re...>
> Cc: Srikar Dronamraju <sr...@li...>
> Cc: lin...@vg...
Looks good!
Reviewed-by: Frederic Weisbecker <fwe...@gm...>
Frederic.
> ---
>
> arch/x86/include/asm/ptrace.h | 122 +++++++++++++++++++++++++++++++++++++++++
> arch/x86/kernel/ptrace.c | 73 +++++++++++++++++++++++++
> 2 files changed, 195 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
> index 0f0d908..d5e3b3b 100644
> --- a/arch/x86/include/asm/ptrace.h
> +++ b/arch/x86/include/asm/ptrace.h
> @@ -7,6 +7,7 @@
>
> #ifdef __KERNEL__
> #include <asm/segment.h>
> +#include <asm/page_types.h>
> #endif
>
> #ifndef __ASSEMBLY__
> @@ -216,6 +217,127 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
> return regs->sp;
> }
>
> +/* Query offset/name of register from its name/offset */
> +extern int regs_query_register_offset(const char *name);
> +extern const char *regs_query_register_name(unsigned offset);
> +#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
> +
> +/**
> + * regs_get_register() - get register value from its offset
> + * @regs: pt_regs from which register value is gotten.
> + * @offset: offset number of the register.
> + *
> + * regs_get_register returns the value of a register whose offset from @regs
> + * is @offset. The @offset is the offset of the register in struct pt_regs.
> + * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
> + */
> +static inline unsigned long regs_get_register(struct pt_regs *regs,
> + unsigned offset)
> +{
> + if (unlikely(offset > MAX_REG_OFFSET))
> + return 0;
> + return *(unsigned long *)((unsigned long)regs + offset);
> +}
> +
> +/**
> + * regs_within_kernel_stack() - check the address in the stack
> + * @regs: pt_regs which contains kernel stack pointer.
> + * @addr: address which is checked.
> + *
> + * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
> + * If @addr is within the kernel stack, it returns true. If not, returns false.
> + */
> +static inline int regs_within_kernel_stack(struct pt_regs *regs,
> + unsigned long addr)
> +{
> + return ((addr & ~(THREAD_SIZE - 1)) ==
> + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
> +}
> +
> +/**
> + * regs_get_kernel_stack_nth() - get Nth entry of the stack
> + * @regs: pt_regs which contains kernel stack pointer.
> + * @n: stack entry number.
> + *
> + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
> + * is specified by @regs. If the @n th entry is NOT in the kernel stack,
> + * this returns 0.
> + */
> +static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
> + unsigned n)
> +{
> + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
> + addr += n;
> + if (regs_within_kernel_stack(regs, (unsigned long)addr))
> + return *addr;
> + else
> + return 0;
> +}
> +
> +/**
> + * regs_get_argument_nth() - get Nth argument at function call
> + * @regs: pt_regs which contains registers at function entry.
> + * @n: argument number.
> + *
> + * regs_get_argument_nth() returns @n th argument of a function call.
> + * Since usually the kernel stack will be changed right after function entry,
> + * you must use this at function entry. If the @n th entry is NOT in the
> + * kernel stack or pt_regs, this returns 0.
> + */
> +#ifdef CONFIG_X86_32
> +#define NR_REGPARMS 3
> +static inline unsigned long regs_get_argument_nth(struct pt_regs *regs,
> + unsigned n)
> +{
> + if (n < NR_REGPARMS) {
> + switch (n) {
> + case 0:
> + return regs->ax;
> + case 1:
> + return regs->dx;
> + case 2:
> + return regs->cx;
> + }
> + return 0;
> + } else {
> + /*
> + * The typical case: arg n is on the stack.
> + * (Note: stack[0] = return address, so skip it)
> + */
> + return regs_get_kernel_stack_nth(regs, 1 + n - NR_REGPARMS);
> + }
> +}
> +#else /* CONFIG_X86_64 */
> +#define NR_REGPARMS 6
> +static inline unsigned long regs_get_argument_nth(struct pt_regs *regs,
> + unsigned n)
> +{
> + if (n < NR_REGPARMS) {
> + switch (n) {
> + case 0:
> + return regs->di;
> + case 1:
> + return regs->si;
> + case 2:
> + return regs->dx;
> + case 3:
> + return regs->cx;
> + case 4:
> + return regs->r8;
> + case 5:
> + return regs->r9;
> + }
> + return 0;
> + } else {
> + /*
> + * The typical case: arg n is on the stack.
> + * (Note: stack[0] = return address, so skip it)
> + */
> + return regs_get_kernel_stack_nth(regs, 1 + n - NR_REGPARMS);
> + }
> +}
> +#endif
> +
> /*
> * These are defined as per linux/ptrace.h, which see.
> */
> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
> index b457f78..2944d3a 100644
> --- a/arch/x86/kernel/ptrace.c
> +++ b/arch/x86/kernel/ptrace.c
> @@ -49,6 +49,79 @@ enum x86_regset {
> REGSET_IOPERM32,
> };
>
> +struct pt_regs_offset {
> + const char *name;
> + int offset;
> +};
> +
> +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
> +#define REG_OFFSET_END {.name = NULL, .offset = 0}
> +
> +static const struct pt_regs_offset regoffset_table[] = {
> +#ifdef CONFIG_X86_64
> + REG_OFFSET_NAME(r15),
> + REG_OFFSET_NAME(r14),
> + REG_OFFSET_NAME(r13),
> + REG_OFFSET_NAME(r12),
> + REG_OFFSET_NAME(r11),
> + REG_OFFSET_NAME(r10),
> + REG_OFFSET_NAME(r9),
> + REG_OFFSET_NAME(r8),
> +#endif
> + REG_OFFSET_NAME(bx),
> + REG_OFFSET_NAME(cx),
> + REG_OFFSET_NAME(dx),
> + REG_OFFSET_NAME(si),
> + REG_OFFSET_NAME(di),
> + REG_OFFSET_NAME(bp),
> + REG_OFFSET_NAME(ax),
> +#ifdef CONFIG_X86_32
> + REG_OFFSET_NAME(ds),
> + REG_OFFSET_NAME(es),
> + REG_OFFSET_NAME(fs),
> + REG_OFFSET_NAME(gs),
> +#endif
> + REG_OFFSET_NAME(orig_ax),
> + REG_OFFSET_NAME(ip),
> + REG_OFFSET_NAME(cs),
> + REG_OFFSET_NAME(flags),
> + REG_OFFSET_NAME(sp),
> + REG_OFFSET_NAME(ss),
> + REG_OFFSET_END,
> +};
> +
> +/**
> + * regs_query_register_offset() - query register offset from its name
> + * @name: the name of a register
> + *
> + * regs_query_register_offset() returns the offset of a register in struct
> + * pt_regs from its name. If the name is invalid, this returns -EINVAL;
> + */
> +int regs_query_register_offset(const char *name)
> +{
> + const struct pt_regs_offset *roff;
> + for (roff = regoffset_table; roff->name != NULL; roff++)
> + if (!strcmp(roff->name, name))
> + return roff->offset;
> + return -EINVAL;
> +}
> +
> +/**
> + * regs_query_register_name() - query register name from its offset
> + * @offset: the offset of a register in struct pt_regs.
> + *
> + * regs_query_register_name() returns the name of a register from its
> + * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
> + */
> +const char *regs_query_register_name(unsigned offset)
> +{
> + const struct pt_regs_offset *roff;
> + for (roff = regoffset_table; roff->name != NULL; roff++)
> + if (roff->offset == offset)
> + return roff->name;
> + return NULL;
> +}
> +
> /*
> * does not yet catch signals sent when the child dies.
> * in exit.c or in signal.c.
>
>
> --
> Masami Hiramatsu
>
> Software Engineer
> Hitachi Computer Products (America), Inc.
> Software Solutions Division
>
> e-mail: mhi...@re...
|
|
From: Frederic W. <fwe...@gm...> - 2009-07-06 02:28:25
|
On Tue, Jun 30, 2009 at 09:09:17PM -0400, Masami Hiramatsu wrote:
> Add dynamic ftrace_event_call support to ftrace. Trace engines can add a new
> ftrace_event_call to ftrace on the fly. Each operator function of the call
> takes a ftrace_event_call data structure as an argument, because these
> functions may be shared among several ftrace_event_calls.
>
> Signed-off-by: Masami Hiramatsu <mhi...@re...>
> Cc: Steven Rostedt <ro...@go...>
> Cc: Ingo Molnar <mi...@el...>
> Cc: Tom Zanussi <tza...@gm...>
> Cc: Frederic Weisbecker <fwe...@gm...>
Looks good too.
Acked-by: Frederic Weisbecker <fwe...@gm...>
> ---
>
> include/linux/ftrace_event.h | 13 +++++---
> include/trace/ftrace.h | 22 +++++++------
> kernel/trace/trace_events.c | 70 ++++++++++++++++++++++++++++++++----------
> kernel/trace/trace_export.c | 27 ++++++++--------
> 4 files changed, 85 insertions(+), 47 deletions(-)
>
> diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
> index 5c093ff..f7733b6 100644
> --- a/include/linux/ftrace_event.h
> +++ b/include/linux/ftrace_event.h
> @@ -108,12 +108,13 @@ struct ftrace_event_call {
> struct dentry *dir;
> struct trace_event *event;
> int enabled;
> - int (*regfunc)(void);
> - void (*unregfunc)(void);
> + int (*regfunc)(struct ftrace_event_call *);
> + void (*unregfunc)(struct ftrace_event_call *);
> int id;
> - int (*raw_init)(void);
> - int (*show_format)(struct trace_seq *s);
> - int (*define_fields)(void);
> + int (*raw_init)(struct ftrace_event_call *);
> + int (*show_format)(struct ftrace_event_call *,
> + struct trace_seq *);
> + int (*define_fields)(struct ftrace_event_call *);
> struct list_head fields;
> int filter_active;
> void *filter;
> @@ -138,6 +139,8 @@ extern int filter_current_check_discard(struct ftrace_event_call *call,
>
> extern int trace_define_field(struct ftrace_event_call *call, char *type,
> char *name, int offset, int size, int is_signed);
> +extern int trace_add_event_call(struct ftrace_event_call *call);
> +extern void trace_remove_event_call(struct ftrace_event_call *call);
>
> #define is_signed_type(type) (((type)(-1)) < 0)
>
> diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
> index 1867553..d696580 100644
> --- a/include/trace/ftrace.h
> +++ b/include/trace/ftrace.h
> @@ -147,7 +147,8 @@
> #undef TRACE_EVENT
> #define TRACE_EVENT(call, proto, args, tstruct, func, print) \
> static int \
> -ftrace_format_##call(struct trace_seq *s) \
> +ftrace_format_##call(struct ftrace_event_call *event_call, \
> + struct trace_seq *s) \
> { \
> struct ftrace_raw_##call field __attribute__((unused)); \
> int ret = 0; \
> @@ -289,10 +290,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
> #undef TRACE_EVENT
> #define TRACE_EVENT(call, proto, args, tstruct, func, print) \
> int \
> -ftrace_define_fields_##call(void) \
> +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
> { \
> struct ftrace_raw_##call field; \
> - struct ftrace_event_call *event_call = &event_##call; \
> int ret; \
> \
> __common_field(int, type, 1); \
> @@ -355,7 +355,7 @@ static inline int ftrace_get_offsets_##call( \
> * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
> * }
> *
> - * static int ftrace_reg_event_<call>(void)
> + * static int ftrace_reg_event_<call>(struct ftrace_event_call *unused)
> * {
> * int ret;
> *
> @@ -366,7 +366,7 @@ static inline int ftrace_get_offsets_##call( \
> * return ret;
> * }
> *
> - * static void ftrace_unreg_event_<call>(void)
> + * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
> * {
> * unregister_trace_<call>(ftrace_event_<call>);
> * }
> @@ -399,7 +399,7 @@ static inline int ftrace_get_offsets_##call( \
> * trace_current_buffer_unlock_commit(event, irq_flags, pc);
> * }
> *
> - * static int ftrace_raw_reg_event_<call>(void)
> + * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
> * {
> * int ret;
> *
> @@ -410,7 +410,7 @@ static inline int ftrace_get_offsets_##call( \
> * return ret;
> * }
> *
> - * static void ftrace_unreg_event_<call>(void)
> + * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
> * {
> * unregister_trace_<call>(ftrace_raw_event_<call>);
> * }
> @@ -419,7 +419,7 @@ static inline int ftrace_get_offsets_##call( \
> * .trace = ftrace_raw_output_<call>, <-- stage 2
> * };
> *
> - * static int ftrace_raw_init_event_<call>(void)
> + * static int ftrace_raw_init_event_<call>(struct ftrace_event_call *unused)
> * {
> * int id;
> *
> @@ -537,7 +537,7 @@ static void ftrace_raw_event_##call(proto) \
> trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
> } \
> \
> -static int ftrace_raw_reg_event_##call(void) \
> +static int ftrace_raw_reg_event_##call(struct ftrace_event_call *unused)\
> { \
> int ret; \
> \
> @@ -548,7 +548,7 @@ static int ftrace_raw_reg_event_##call(void) \
> return ret; \
> } \
> \
> -static void ftrace_raw_unreg_event_##call(void) \
> +static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused)\
> { \
> unregister_trace_##call(ftrace_raw_event_##call); \
> } \
> @@ -557,7 +557,7 @@ static struct trace_event ftrace_event_type_##call = { \
> .trace = ftrace_raw_output_##call, \
> }; \
> \
> -static int ftrace_raw_init_event_##call(void) \
> +static int ftrace_raw_init_event_##call(struct ftrace_event_call *unused)\
> { \
> int id; \
> \
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 53c8fd3..94ff41e 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -60,9 +60,7 @@ err:
> }
> EXPORT_SYMBOL_GPL(trace_define_field);
>
> -#ifdef CONFIG_MODULES
> -
> -static void trace_destroy_fields(struct ftrace_event_call *call)
> +void trace_destroy_fields(struct ftrace_event_call *call)
> {
> struct ftrace_event_field *field, *next;
>
> @@ -74,8 +72,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
> }
> }
>
> -#endif /* CONFIG_MODULES */
> -
> static void ftrace_event_enable_disable(struct ftrace_event_call *call,
> int enable)
> {
> @@ -84,14 +80,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
> if (call->enabled) {
> call->enabled = 0;
> tracing_stop_cmdline_record();
> - call->unregfunc();
> + call->unregfunc(call);
> }
> break;
> case 1:
> if (!call->enabled) {
> call->enabled = 1;
> tracing_start_cmdline_record();
> - call->regfunc();
> + call->regfunc(call);
> }
> break;
> }
> @@ -574,7 +570,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
> trace_seq_printf(s, "format:\n");
> trace_write_header(s);
>
> - r = call->show_format(s);
> + r = call->show_format(call, s);
> if (!r) {
> /*
> * ug! The format output is bigger than a PAGE!!
> @@ -921,7 +917,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
> d_events = event_subsystem_dir(call->system, d_events);
>
> if (call->raw_init) {
> - ret = call->raw_init();
> + ret = call->raw_init(call);
> if (ret < 0) {
> pr_warning("Could not initialize trace point"
> " events/%s\n", call->name);
> @@ -945,7 +941,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
> id);
>
> if (call->define_fields) {
> - ret = call->define_fields();
> + ret = call->define_fields(call);
> if (ret < 0) {
> pr_warning("Could not initialize trace point"
> " events/%s\n", call->name);
> @@ -965,6 +961,52 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
> return 0;
> }
>
> +static int __trace_add_event_call(struct ftrace_event_call *call)
> +{
> + struct dentry *d_events;
> +
> + if (!call->name)
> + return -EINVAL;
> +
> + d_events = event_trace_events_dir();
> + if (!d_events)
> + return -ENOENT;
> +
> + list_add(&call->list, &ftrace_events);
> + return event_create_dir(call, d_events, &ftrace_event_id_fops,
> + &ftrace_enable_fops, &ftrace_event_filter_fops,
> + &ftrace_event_format_fops);
> +}
> +
> +/* Add an additional event_call dynamically */
> +int trace_add_event_call(struct ftrace_event_call *call)
> +{
> + int ret;
> + mutex_lock(&event_mutex);
> + ret = __trace_add_event_call(call);
> + mutex_unlock(&event_mutex);
> + return ret;
> +}
> +
> +static void __trace_remove_event_call(struct ftrace_event_call *call)
> +{
> + ftrace_event_enable_disable(call, 0);
> + if (call->event)
> + __unregister_ftrace_event(call->event);
> + debugfs_remove_recursive(call->dir);
> + list_del(&call->list);
> + trace_destroy_fields(call);
> + destroy_preds(call);
> +}
> +
> +/* Remove an event_call */
> +void trace_remove_event_call(struct ftrace_event_call *call)
> +{
> + mutex_lock(&event_mutex);
> + __trace_remove_event_call(call);
> + mutex_unlock(&event_mutex);
> +}
> +
> #define for_each_event(event, start, end) \
> for (event = start; \
> (unsigned long)event < (unsigned long)end; \
> @@ -1070,13 +1112,7 @@ static void trace_module_remove_events(struct module *mod)
> list_for_each_entry_safe(call, p, &ftrace_events, list) {
> if (call->mod == mod) {
> found = true;
> - ftrace_event_enable_disable(call, 0);
> - if (call->event)
> - __unregister_ftrace_event(call->event);
> - debugfs_remove_recursive(call->dir);
> - list_del(&call->list);
> - trace_destroy_fields(call);
> - destroy_preds(call);
> + __trace_remove_event_call(call);
> }
> }
>
> diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
> index d06cf89..7cee79d 100644
> --- a/kernel/trace/trace_export.c
> +++ b/kernel/trace/trace_export.c
> @@ -60,7 +60,7 @@ extern void __bad_type_size(void);
> #undef TRACE_EVENT_FORMAT
> #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
> static int \
> -ftrace_format_##call(struct trace_seq *s) \
> +ftrace_format_##call(struct ftrace_event_call *dummy, struct trace_seq *s)\
> { \
> struct args field; \
> int ret; \
> @@ -76,7 +76,7 @@ ftrace_format_##call(struct trace_seq *s) \
> #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
> tpfmt) \
> static int \
> -ftrace_format_##call(struct trace_seq *s) \
> +ftrace_format_##call(struct ftrace_event_call *dummy, struct trace_seq *s)\
> { \
> struct args field; \
> int ret; \
> @@ -115,10 +115,16 @@ ftrace_format_##call(struct trace_seq *s) \
> #define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
> cmd;
>
> +static int ftrace_raw_init_event(struct ftrace_event_call *event_call)
> +{
> + INIT_LIST_HEAD(&event_call->fields);
> + init_preds(event_call);
> + return 0;
> +}
> +
> #undef TRACE_EVENT_FORMAT
> #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
> -int ftrace_define_fields_##call(void); \
> -static int ftrace_raw_init_event_##call(void); \
> +int ftrace_define_fields_##call(struct ftrace_event_call *c); \
> \
> struct ftrace_event_call __used \
> __attribute__((__aligned__(4))) \
> @@ -126,16 +132,10 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
> .name = #call, \
> .id = proto, \
> .system = __stringify(TRACE_SYSTEM), \
> - .raw_init = ftrace_raw_init_event_##call, \
> + .raw_init = ftrace_raw_init_event, \
> .show_format = ftrace_format_##call, \
> .define_fields = ftrace_define_fields_##call, \
> -}; \
> -static int ftrace_raw_init_event_##call(void) \
> -{ \
> - INIT_LIST_HEAD(&event_##call.fields); \
> - init_preds(&event_##call); \
> - return 0; \
> -} \
> +};
>
> #undef TRACE_EVENT_FORMAT_NOFILTER
> #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
> @@ -182,9 +182,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
> #undef TRACE_EVENT_FORMAT
> #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
> int \
> -ftrace_define_fields_##call(void) \
> +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
> { \
> - struct ftrace_event_call *event_call = &event_##call; \
> struct args field; \
> int ret; \
> \
>
>
> --
> Masami Hiramatsu
>
> Software Engineer
> Hitachi Computer Products (America), Inc.
> Software Solutions Division
>
> e-mail: mhi...@re...
|
|
From: Mathieu D. <mat...@po...> - 2009-07-01 22:58:15
|
* Masami Hiramatsu (mhi...@re...) wrote: > Since the fixmap pages are assigned higher address to lower, text_poke() > has to use it with inverted order (FIX_TEXT_POKE1 to FIX_TEXT_POKE0). > Hrm, is that only true for x86_32 or also for x86_64 ? Reading arch/x86/include/asm/fixmap.h : * for x86_32: We allocate these special addresses * from the end of virtual memory (0xfffff000) backwards. * Also this lets us do fail-safe vmalloc(), we * can guarantee that these special addresses and * vmalloc()-ed addresses never overlap. Mathieu > Signed-off-by: Masami Hiramatsu <mhi...@re...> > Cc: Mathieu Desnoyers <mat...@po...> > Cc: Ingo Molnar <mi...@el...> > --- > > arch/x86/kernel/alternative.c | 14 +++++++++----- > 1 files changed, 9 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c > index f576587..4d8b40b 100644 > --- a/arch/x86/kernel/alternative.c > +++ b/arch/x86/kernel/alternative.c > @@ -527,14 +527,18 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) > } > BUG_ON(!pages[0]); > local_irq_save(flags); > - set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); > + /* > + * Since the fixmaps are assinged from higher address to lower, > + * we use FIX_TEXT_POKE1 first, and FIX_TEXT_POKE0 second. > + */ > + set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[0])); > if (pages[1]) > - set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); > - vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); > + set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[1])); > + vaddr = (char *)fix_to_virt(FIX_TEXT_POKE1); > memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); > - clear_fixmap(FIX_TEXT_POKE0); > + clear_fixmap(FIX_TEXT_POKE1); > if (pages[1]) > - clear_fixmap(FIX_TEXT_POKE1); > + clear_fixmap(FIX_TEXT_POKE0); > local_flush_tlb(); > sync_core(); > /* Could also do a CLFLUSH here to speed up CPU recovery; but > > > -- > Masami Hiramatsu > > Software Engineer > Hitachi Computer Products (America), Inc. 
> Software Solutions Division > > e-mail: mhi...@re... -- Mathieu Desnoyers OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 |
|
From: Masami H. <mhi...@re...> - 2009-07-01 22:09:00
|
Mathieu Desnoyers wrote:
> Masami wrote :
>
>> Since the fixmap pages are assigned higher address to lower, text_poke()
>> has to use it with inverted order (FIX_TEXT_POKE1 to FIX_TEXT_POKE0).
>
> I prefer to just invert the order of the fixmap declaration. It's simpler and
> more straightforward.
It's ok for me too.
> Backward fixmaps seems to be used by both x86 32 and 64.
>
> It's a really nasty bug, because it only hurts when instructions to patch are
> crossing a page boundary. If this happens, the fixmap write accesses
> will spill on the following fixmap, which may very well crash the
> system. And this does not crash the system, it could leave illegal
> instructions in place. Thanks Masami for finding this.
>
> It seems to have crept into the 2.6.30-rc series, so this calls for a
> -stable inclusion.
Right, thanks.
>
> Signed-off-by: Mathieu Desnoyers <mat...@po...>
Acked-by: Masami Hiramatsu <mhi...@re...>
> Cc: Ingo Molnar <mi...@el...>
> CC: st...@ke...
> ---
> arch/x86/include/asm/fixmap.h | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> Index: linux-2.6-lttng/arch/x86/include/asm/fixmap.h
> ===================================================================
> --- linux-2.6-lttng.orig/arch/x86/include/asm/fixmap.h 2009-07-01 16:52:57.000000000 -0400
> +++ linux-2.6-lttng/arch/x86/include/asm/fixmap.h 2009-07-01 16:54:52.000000000 -0400
> @@ -111,8 +111,8 @@ enum fixed_addresses {
> #ifdef CONFIG_PARAVIRT
> FIX_PARAVIRT_BOOTMAP,
> #endif
> - FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */
> - FIX_TEXT_POKE1,
> + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
> + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
> __end_of_permanent_fixed_addresses,
> #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
> FIX_OHCI1394_BASE,
>
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Mathieu D. <mat...@po...> - 2009-07-01 21:37:29
|
Masami wrote :
> Since the fixmap pages are assigned higher address to lower, text_poke()
> has to use it with inverted order (FIX_TEXT_POKE1 to FIX_TEXT_POKE0).
I prefer to just invert the order of the fixmap declaration. It's simpler and
more straightforward.
Backward fixmaps seem to be used by both x86 32 and 64.
It's a really nasty bug, because it only hurts when instructions to patch are
crossing a page boundary. If this happens, the fixmap write accesses
will spill on the following fixmap, which may very well crash the
system. And if this does not crash the system, it could leave illegal
instructions in place. Thanks Masami for finding this.
It seems to have crept into the 2.6.30-rc series, so this calls for a
-stable inclusion.
Signed-off-by: Mathieu Desnoyers <mat...@po...>
CC: Masami Hiramatsu <mhi...@re...>
Cc: Ingo Molnar <mi...@el...>
CC: st...@ke...
---
arch/x86/include/asm/fixmap.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
Index: linux-2.6-lttng/arch/x86/include/asm/fixmap.h
===================================================================
--- linux-2.6-lttng.orig/arch/x86/include/asm/fixmap.h 2009-07-01 16:52:57.000000000 -0400
+++ linux-2.6-lttng/arch/x86/include/asm/fixmap.h 2009-07-01 16:54:52.000000000 -0400
@@ -111,8 +111,8 @@ enum fixed_addresses {
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#endif
- FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */
- FIX_TEXT_POKE1,
+ FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
+ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
__end_of_permanent_fixed_addresses,
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
FIX_OHCI1394_BASE,
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
|
|
From: Masami H. <mhi...@re...> - 2009-07-01 21:00:03
|
Since the fixmap pages are assigned higher address to lower, text_poke() has to use it with inverted order (FIX_TEXT_POKE1 to FIX_TEXT_POKE0). Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Mathieu Desnoyers <mat...@po...> Cc: Ingo Molnar <mi...@el...> --- arch/x86/kernel/alternative.c | 14 +++++++++----- 1 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f576587..4d8b40b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -527,14 +527,18 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) } BUG_ON(!pages[0]); local_irq_save(flags); - set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); + /* + * Since the fixmaps are assinged from higher address to lower, + * we use FIX_TEXT_POKE1 first, and FIX_TEXT_POKE0 second. + */ + set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[0])); if (pages[1]) - set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); - vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); + set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[1])); + vaddr = (char *)fix_to_virt(FIX_TEXT_POKE1); memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - clear_fixmap(FIX_TEXT_POKE0); + clear_fixmap(FIX_TEXT_POKE1); if (pages[1]) - clear_fixmap(FIX_TEXT_POKE1); + clear_fixmap(FIX_TEXT_POKE0); local_flush_tlb(); sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: Masami H. <mhi...@re...> - 2009-07-01 01:21:11
|
Add a user-space selftest of x86 instruction decoder at kernel build time.
When CONFIG_X86_DECODER_SELFTEST=y, Kbuild builds a test harness of x86
instruction decoder and performs it after building vmlinux.
The test compares the results of objdump and the x86 instruction decoder
code and checks that there are no differences.
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Signed-off-by: Jim Keniston <jke...@us...>
Cc: H. Peter Anvin <hp...@zy...>
Cc: Steven Rostedt <ro...@go...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Srikar Dronamraju <sr...@li...>
Cc: Ingo Molnar <mi...@el...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: Andi Kleen <ak...@li...>
Cc: Vegard Nossum <veg...@gm...>
Cc: Avi Kivity <av...@re...>
Cc: Przemysław Pawełczyk <prz...@pa...>
Cc: Sam Ravnborg <sa...@ra...>
---
arch/x86/Kconfig.debug | 9 ++++
arch/x86/Makefile | 3 +
arch/x86/include/asm/inat.h | 2 +
arch/x86/include/asm/insn.h | 2 +
arch/x86/lib/inat.c | 2 +
arch/x86/lib/insn.c | 2 +
arch/x86/scripts/Makefile | 19 +++++++
arch/x86/scripts/distill.awk | 42 +++++++++++++++++
arch/x86/scripts/test_get_len.c | 99 +++++++++++++++++++++++++++++++++++++++
arch/x86/scripts/user_include.h | 49 +++++++++++++++++++
10 files changed, 229 insertions(+), 0 deletions(-)
create mode 100644 arch/x86/scripts/Makefile
create mode 100644 arch/x86/scripts/distill.awk
create mode 100644 arch/x86/scripts/test_get_len.c
create mode 100644 arch/x86/scripts/user_include.h
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29..7d0b681 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
config HAVE_MMIOTRACE_SUPPORT
def_bool y
+config X86_DECODER_SELFTEST
+ bool "x86 instruction decoder selftest"
+ depends on DEBUG_KERNEL
+ ---help---
+ Perform x86 instruction decoder selftests at build time.
+ This option is useful for checking the sanity of x86 instruction
+ decoder code.
+ If unsure, say "N".
+
#
# IO delay types:
#
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1b68659..7046556 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -154,6 +154,9 @@ all: bzImage
KBUILD_IMAGE := $(boot)/bzImage
bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+ $(Q)$(MAKE) $(build)=arch/x86/scripts posttest
+endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index 01e079a..9090665 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -20,7 +20,9 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
+#ifdef __KERNEL__
#include <linux/types.h>
+#endif
/* Instruction attributes */
typedef u32 insn_attr_t;
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 5b50fa3..5736404 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -20,7 +20,9 @@
* Copyright (C) IBM Corporation, 2009
*/
+#ifdef __KERNEL__
#include <linux/types.h>
+#endif
/* insn_attr_t is defined in inat.h */
#include <asm/inat.h>
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index d6a34be..564ecbd 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -18,7 +18,9 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
+#ifdef __KERNEL__
#include <linux/module.h>
+#endif
#include <asm/insn.h>
/* Attribute tables are generated from opcode map */
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 254c848..3b9451a 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -18,8 +18,10 @@
* Copyright (C) IBM Corporation, 2002, 2004, 2009
*/
+#ifdef __KERNEL__
#include <linux/string.h>
#include <linux/module.h>
+#endif
#include <asm/inat.h>
#include <asm/insn.h>
diff --git a/arch/x86/scripts/Makefile b/arch/x86/scripts/Makefile
new file mode 100644
index 0000000..f08859e
--- /dev/null
+++ b/arch/x86/scripts/Makefile
@@ -0,0 +1,19 @@
+PHONY += posttest
+quiet_cmd_posttest = TEST $@
+ cmd_posttest = objdump -d $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/scripts/distill.awk | $(obj)/test_get_len
+
+posttest: $(obj)/test_get_len vmlinux
+ $(call cmd,posttest)
+
+test_get_len_SRC = $(srctree)/arch/x86/scripts/test_get_len.c $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c
+test_get_len_INC = $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
+quiet_cmd_test_get_len = CC $@
+ cmd_test_get_len = $(CC) -Wall $(test_get_len_SRC) -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include -include $(srctree)/arch/x86/scripts/user_include.h -o $@
+
+
+$(obj)/test_get_len: $(test_get_len_SRC) $(test_get_len_INC)
+ $(call cmd,test_get_len)
+
+clean-files := test_get_len
+
diff --git a/arch/x86/scripts/distill.awk b/arch/x86/scripts/distill.awk
new file mode 100644
index 0000000..d433619
--- /dev/null
+++ b/arch/x86/scripts/distill.awk
@@ -0,0 +1,42 @@
+#!/bin/awk -f
+# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+# Distills the disassembly as follows:
+# - Removes all lines except the disassembled instructions.
+# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
+# into a single line.
+# - Remove bad(or prefix only) instructions
+
+BEGIN {
+ prev_addr = ""
+ prev_hex = ""
+ prev_mnemonic = ""
+ bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+ fwait_expr = "^9b "
+ fwait_str="9b\tfwait"
+}
+
+/^ *[0-9a-f]+:/ {
+ if (split($0, field, "\t") < 3) {
+ # This is a continuation of the same insn.
+ prev_hex = prev_hex field[2]
+ } else {
+ # Skip bad instructions
+ if (match(prev_mnemonic, bad_expr))
+ prev_addr = ""
+ # Split fwait from other f* instructions
+ if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
+ printf "%s\t%s\n", prev_addr, fwait_str
+ sub(fwait_expr, "", prev_hex)
+ }
+ if (prev_addr != "")
+ printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+ prev_addr = field[1]
+ prev_hex = field[2]
+ prev_mnemonic = field[3]
+ }
+}
+
+END {
+ if (prev_addr != "")
+ printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+}
diff --git a/arch/x86/scripts/test_get_len.c b/arch/x86/scripts/test_get_len.c
new file mode 100644
index 0000000..0f702e8
--- /dev/null
+++ b/arch/x86/scripts/test_get_len.c
@@ -0,0 +1,99 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include <asm/insn.h>
+
+/*
+ * Test of instruction analysis in general and insn_get_length() in
+ * particular. See if insn_get_length() and the disassembler agree
+ * on the length of each instruction in an elf disassembly.
+ *
+ * usage: test_get_len < distilled_disassembly
+ */
+
+const char *prog;
+
+static void usage()
+{
+ fprintf(stderr, "usage: %s < distilled_disassembly\n", prog);
+ exit(1);
+}
+
+static void malformed_line(const char *line, int line_nr)
+{
+ fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
+ exit(3);
+}
+
+#define BUFSIZE 256
+
+int main(int argc, char **argv)
+{
+ char line[BUFSIZE];
+ unsigned char insn_buf[16];
+ struct insn insn;
+ int insns = 0;
+
+ prog = argv[0];
+ if (argc > 1)
+ usage();
+
+ while (fgets(line, BUFSIZE, stdin)) {
+ char copy[BUFSIZE], *s, *tab1, *tab2;
+ int nb = 0;
+ unsigned b;
+
+ insns++;
+ memset(insn_buf, 0, 16);
+ strcpy(copy, line);
+ tab1 = strchr(copy, '\t');
+ if (!tab1)
+ malformed_line(line, insns);
+ s = tab1 + 1;
+ s += strspn(s, " ");
+ tab2 = strchr(s, '\t');
+ if (!tab2)
+ malformed_line(line, insns);
+ *tab2 = '\0'; /* Characters beyond tab2 aren't examined */
+ while (s < tab2) {
+ if (sscanf(s, "%x", &b) == 1) {
+ insn_buf[nb++] = (unsigned char) b;
+ s += 3;
+ } else
+ break;
+ }
+ /* Decode an instruction */
+ kernel_insn_init(&insn, insn_buf);
+ insn_get_length(&insn);
+ if (insn.length != nb) {
+ fprintf(stderr, "Error: %s", line);
+ fprintf(stderr, "Error: objdump says %d bytes, but "
+ "insn_get_length() says %d (attr:%x)\n", nb,
+ insn.length, insn.attr);
+ exit(2);
+ }
+ }
+ fprintf(stderr, "Succeed: decoded and checked %d instructions\n",
+ insns);
+ return 0;
+}
diff --git a/arch/x86/scripts/user_include.h b/arch/x86/scripts/user_include.h
new file mode 100644
index 0000000..3bdcc55
--- /dev/null
+++ b/arch/x86/scripts/user_include.h
@@ -0,0 +1,49 @@
+#ifndef __USER_TYPES_H
+#define __USER_TYPES_H
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <string.h>
+
+#ifdef __x86_64__
+#define CONFIG_X86_64
+#else
+#define CONFIG_X86_32
+#endif
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+typedef signed char s8;
+typedef short s16;
+typedef int s32;
+typedef long long s64;
+
+typedef enum bool { false = 0, true } bool;
+
+/* any harmless file-scope decl */
+#define NOP_DECL struct __nop
+#define EXPORT_SYMBOL_GPL(symbol) NOP_DECL
+#define MODULE_LICENSE(gpl) NOP_DECL
+
+#define WARN_ON(cond) do { } while (0)
+#define unlikely(cond) (cond)
+
+#endif /* __USER_TYPES_H */
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-01 01:07:36
|
Add kprobes-based event tracer on ftrace.
This tracer is similar to the events tracer which is based on Tracepoint
infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe
and kretprobe). It probes anywhere where kprobes can probe(this means, all
functions body except for __kprobes functions).
Similar to the events tracer, this tracer doesn't need to be activated via
current_tracer, instead of that, just set probe points via
/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter.
This tracer supports following probe arguments for each probe.
%REG : Fetch register REG
sN : Fetch Nth entry of stack (N >= 0)
@ADDR : Fetch memory at ADDR (ADDR should be in kernel)
@SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
aN : Fetch function argument. (N >= 0)
rv : Fetch return value.
ra : Fetch return address.
+|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.
See Documentation/trace/kprobes.txt for details.
Changes from v9:
- Select CONFIG_GENERIC_TRACER when CONFIG_KPROBE_TRACER=y.
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Acked-by: Ananth N Mavinakayanahalli <an...@in...>
Cc: Christoph Hellwig <hc...@in...>
Cc: Steven Rostedt <ro...@go...>
Cc: Ingo Molnar <mi...@el...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: Tom Zanussi <tza...@gm...>
---
Documentation/trace/kprobes.txt | 138 ++++
kernel/trace/Kconfig | 12
kernel/trace/Makefile | 1
kernel/trace/trace.h | 22 +
kernel/trace/trace_event_types.h | 20 +
kernel/trace/trace_kprobe.c | 1183 ++++++++++++++++++++++++++++++++++++++
6 files changed, 1376 insertions(+), 0 deletions(-)
create mode 100644 Documentation/trace/kprobes.txt
create mode 100644 kernel/trace/trace_kprobe.c
diff --git a/Documentation/trace/kprobes.txt b/Documentation/trace/kprobes.txt
new file mode 100644
index 0000000..3a90ebb
--- /dev/null
+++ b/Documentation/trace/kprobes.txt
@@ -0,0 +1,138 @@
+ Kprobe-based Event Tracer
+ =========================
+
+ Documentation is written by Masami Hiramatsu
+
+
+Overview
+--------
+This tracer is similar to the events tracer which is based on Tracepoint
+infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe
+and kretprobe). It probes anywhere where kprobes can probe(this means, all
+functions body except for __kprobes functions).
+
+Unlike the function tracer, this tracer can probe instructions inside of
+kernel functions. It allows you to check which instruction has been executed.
+
+Unlike the Tracepoint based events tracer, this tracer can add and remove
+probe points on the fly.
+
+Similar to the events tracer, this tracer doesn't need to be activated via
+current_tracer, instead of that, just set probe points via
+/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
+probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter.
+
+
+Synopsis of kprobe_events
+-------------------------
+ p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : set a probe
+ r[:EVENT] SYMBOL[+0] [FETCHARGS] : set a return probe
+
+ EVENT : Event name
+ SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted
+ MEMADDR : Address where the probe is inserted
+
+ FETCHARGS : Arguments
+ %REG : Fetch register REG
+ sN : Fetch Nth entry of stack (N >= 0)
+ @ADDR : Fetch memory at ADDR (ADDR should be in kernel)
+ @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
+ aN : Fetch function argument. (N >= 0)(*)
+ rv : Fetch return value.(**)
+ ra : Fetch return address.(**)
+ +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***)
+
+ (*) aN may not be correct on asmlinkage functions or in the middle of
+ a function body.
+ (**) only for return probe.
+ (***) this is useful for fetching a field of data structures.
+
+
+Per-Probe Event Filtering
+-------------------------
+ Per-probe event filtering feature allows you to set different filter on each
+probe and gives you what arguments will be shown in trace buffer. If an event
+name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds
+an event under tracing/events/kprobes/<EVENT>, at the directory you can see
+'id', 'enabled', 'format' and 'filter'.
+
+enabled:
+ You can enable/disable the probe by writing 1 or 0 on it.
+
+format:
+ It shows the format of this probe event. It also shows aliases of arguments
+ which you specified to kprobe_events.
+
+filter:
+ You can write filtering rules for this event. You can use both alias
+ names and field names when describing filters.
+
+
+Usage examples
+--------------
+To add a probe as a new event, write a new definition to kprobe_events
+as below.
+
+ echo p:myprobe do_sys_open a0 a1 a2 a3 > /sys/kernel/debug/tracing/kprobe_events
+
+ This sets a kprobe on the top of do_sys_open() function with recording
+1st to 4th arguments as "myprobe" event.
+
+ echo r:myretprobe do_sys_open rv ra >> /sys/kernel/debug/tracing/kprobe_events
+
+ This sets a kretprobe on the return point of do_sys_open() function with
+recording return value and return address as "myretprobe" event.
+ You can see the format of these events via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/format.
+
+ cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
+name: myprobe
+ID: 23
+format:
+ field:unsigned short common_type; offset:0; size:2;
+ field:unsigned char common_flags; offset:2; size:1;
+ field:unsigned char common_preempt_count; offset:3; size:1;
+ field:int common_pid; offset:4; size:4;
+ field:int common_tgid; offset:8; size:4;
+
+ field: unsigned long ip; offset:16;tsize:8;
+ field: int nargs; offset:24;tsize:4;
+ field: unsigned long arg0; offset:32;tsize:8;
+ field: unsigned long arg1; offset:40;tsize:8;
+ field: unsigned long arg2; offset:48;tsize:8;
+ field: unsigned long arg3; offset:56;tsize:8;
+
+ alias: a0; original: arg0;
+ alias: a1; original: arg1;
+ alias: a2; original: arg2;
+ alias: a3; original: arg3;
+
+print fmt: "%lx: 0x%lx 0x%lx 0x%lx 0x%lx", ip, arg0, arg1, arg2, arg3
+
+
+ You can see that the event has 4 arguments and alias expressions
+corresponding to it.
+
+ echo > /sys/kernel/debug/tracing/kprobe_events
+
+ This clears all probe points. You can see the traced information via
+/sys/kernel/debug/tracing/trace.
+
+ cat /sys/kernel/debug/tracing/trace
+# tracer: nop
+#
+# TASK-PID CPU# TIMESTAMP FUNCTION
+# | | | | |
+ <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0
+ <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a
+ <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6
+ <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a
+ <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10
+ <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a
+
+
+ Each line shows when the kernel hits a probe, and <- SYMBOL means kernel
+returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel
+returns from do_sys_open to sys_open+0x1b).
+
+
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 860c712..60f3401 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -445,6 +445,18 @@ config BLK_DEV_IO_TRACE
If unsure, say N.
+config KPROBE_TRACER
+ depends on KPROBES
+ depends on X86
+ bool "Trace kprobes"
+ select TRACING
+ select GENERIC_TRACER
+ help
+ This tracer probes anywhere that kprobes can probe, and
+ records various registers and memory specified by the user.
+ This also allows you to trace kprobe probe points as
+ dynamically defined events. It provides a per-probe event filtering interface.
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ce3b1cd..8e6884d 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -55,5 +55,6 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
+obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 206cb7d..65945eb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -45,6 +45,8 @@ enum trace_type {
TRACE_POWER,
TRACE_BLK,
TRACE_KSYM,
+ TRACE_KPROBE,
+ TRACE_KRETPROBE,
__TRACE_LAST_TYPE,
};
@@ -227,6 +229,22 @@ struct trace_ksym {
char ksym_name[KSYM_NAME_LEN];
char p_name[TASK_COMM_LEN];
};
+#define TRACE_KPROBE_ARGS 6
+
+struct kprobe_trace_entry {
+ struct trace_entry ent;
+ unsigned long ip;
+ int nargs;
+ unsigned long args[TRACE_KPROBE_ARGS];
+};
+
+struct kretprobe_trace_entry {
+ struct trace_entry ent;
+ unsigned long func;
+ unsigned long ret_ip;
+ int nargs;
+ unsigned long args[TRACE_KPROBE_ARGS];
+};
/*
* trace_flag_type is an enumeration that holds different
@@ -344,6 +362,10 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct syscall_trace_exit, \
TRACE_SYSCALL_EXIT); \
IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM); \
+ IF_ASSIGN(var, ent, struct kprobe_trace_entry, \
+ TRACE_KPROBE); \
+ IF_ASSIGN(var, ent, struct kretprobe_trace_entry, \
+ TRACE_KRETPROBE); \
__ftrace_bad_type(); \
} while (0)
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 6db005e..ec2e6f3 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -175,4 +175,24 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
);
+TRACE_EVENT_FORMAT(kprobe, TRACE_KPROBE, kprobe_trace_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, ip, ip)
+ TRACE_FIELD(int, nargs, nargs)
+ TRACE_FIELD_SPECIAL(unsigned long args[TRACE_KPROBE_ARGS],
+ args, TRACE_KPROBE_ARGS, args)
+ ),
+ TP_RAW_FMT("%08lx: args:0x%lx ...")
+);
+
+TRACE_EVENT_FORMAT(kretprobe, TRACE_KRETPROBE, kretprobe_trace_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, func, func)
+ TRACE_FIELD(unsigned long, ret_ip, ret_ip)
+ TRACE_FIELD(int, nargs, nargs)
+ TRACE_FIELD_SPECIAL(unsigned long args[TRACE_KPROBE_ARGS],
+ args, TRACE_KPROBE_ARGS, args)
+ ),
+ TP_RAW_FMT("%08lx <- %08lx: args:0x%lx ...")
+);
#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 0000000..0951512
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1183 @@
+/*
+ * kprobe based kernel tracer
+ *
+ * Created by Masami Hiramatsu <mhi...@re...>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_ARGSTR_LEN 63
+
+/* currently, trace_kprobe only supports X86. */
+
+struct fetch_func {
+ unsigned long (*func)(struct pt_regs *, void *);
+ void *data;
+};
+
+static __kprobes unsigned long call_fetch(struct fetch_func *f,
+ struct pt_regs *regs)
+{
+ return f->func(regs, f->data);
+}
+
+/* fetch handlers */
+static __kprobes unsigned long fetch_register(struct pt_regs *regs,
+ void *offset)
+{
+ return regs_get_register(regs, (unsigned)((unsigned long)offset));
+}
+
+static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
+ void *num)
+{
+ return regs_get_kernel_stack_nth(regs, (unsigned)((unsigned long)num));
+}
+
+/*
+ * Fetch handler: read one unsigned long from the address @addr.
+ * @regs is unused. Returns 0 when probe_kernel_address() reports a failure,
+ * so a failed read cannot be distinguished from a stored zero.
+ */
+static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
+{
+ unsigned long retval;
+ if (probe_kernel_address(addr, retval))
+ return 0;
+ return retval;
+}
+
+static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
+{
+ return regs_get_argument_nth(regs, (unsigned)((unsigned long)num));
+}
+
+static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
+ void *dummy)
+{
+ return regs_return_value(regs);
+}
+
+static __kprobes unsigned long fetch_ip(struct pt_regs *regs, void *dummy)
+{
+ return instruction_pointer(regs);
+}
+
+/* Memory fetching by symbol */
+struct symbol_cache {
+ char *symbol;
+ long offset;
+ unsigned long addr;
+};
+
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+ sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+ if (sc->addr)
+ sc->addr += sc->offset;
+ return sc->addr;
+}
+
+static void free_symbol_cache(struct symbol_cache *sc)
+{
+ kfree(sc->symbol);
+ kfree(sc);
+}
+
+/*
+ * Build a symbol_cache for @sym with byte offset @offset and resolve its
+ * address once via update_symbol_cache().
+ * Returns NULL when @sym is NULL or empty, or when an allocation fails.
+ * The result is released with free_symbol_cache().
+ */
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+	struct symbol_cache *cache;
+
+	if (!sym || sym[0] == '\0')
+		return NULL;
+
+	cache = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+	if (!cache)
+		return NULL;
+
+	cache->symbol = kstrdup(sym, GFP_KERNEL);
+	if (cache->symbol) {
+		cache->offset = offset;
+		/* An unresolved symbol leaves addr == 0; fetch_symbol() copes. */
+		update_symbol_cache(cache);
+		return cache;
+	}
+
+	kfree(cache);
+	return NULL;
+}
+
+/*
+ * Fetch handler: read the word stored at a cached symbol address.
+ * @data is a struct symbol_cache; yields 0 while the address is unresolved.
+ */
+static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
+{
+	struct symbol_cache *cache = data;
+
+	if (!cache->addr)
+		return 0;
+
+	return fetch_memory(regs, (void *)cache->addr);
+}
+
+/* Special indirect memory access interface */
+struct indirect_fetch_data {
+ struct fetch_func orig;
+ long offset;
+};
+
+/*
+ * Fetch handler: evaluate the nested fetch, add the stored offset to its
+ * result and read the word at that address. A nested result of 0 is
+ * treated as "no base address" and yields 0 without touching memory.
+ */
+static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
+{
+	struct indirect_fetch_data *ind = data;
+	unsigned long base = call_fetch(&ind->orig, regs);
+
+	if (!base)
+		return 0;
+
+	return fetch_memory(regs, (void *)(base + ind->offset));
+}
+
+/*
+ * Release an indirect_fetch_data together with whatever its nested fetch
+ * owns: a nested indirect fetch is freed recursively and a nested symbol
+ * fetch releases its symbol_cache. Other nested fetch kinds carry no
+ * allocated data in this file, so only @data itself is freed for them.
+ */
+static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
+{
+ if (data->orig.func == fetch_indirect)
+ free_indirect_fetch_data(data->orig.data);
+ else if (data->orig.func == fetch_symbol)
+ free_symbol_cache(data->orig.data);
+ kfree(data);
+}
+
+/**
+ * kprobe_trace_core
+ */
+
+/*
+ * One user-defined probe: the kprobe/kretprobe itself, the fetch methods
+ * that produce its arguments, and the per-probe ftrace event.
+ */
+struct trace_probe {
+ /* link in probe_list */
+ struct list_head list;
+ /*
+ * kp is used for entry probes, rp for return probes; probe_is_return()
+ * tells them apart by checking rp.handler.
+ */
+ union {
+ struct kprobe kp;
+ struct kretprobe rp;
+ };
+ const char *symbol; /* symbol name */
+ /* number of valid entries in args[] */
+ unsigned int nr_args;
+ struct fetch_func args[TRACE_KPROBE_ARGS];
+ /* event descriptor; call.name is NULL when no event name was given */
+ struct ftrace_event_call call;
+};
+
+static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_trace_func(struct kretprobe_instance *ri,
+ struct pt_regs *regs);
+
+static __kprobes int probe_is_return(struct trace_probe *tp)
+{
+ return (tp->rp.handler == kretprobe_trace_func);
+}
+
+static __kprobes const char *probe_symbol(struct trace_probe *tp)
+{
+ return tp->symbol ? tp->symbol : "unknown";
+}
+
+static __kprobes long probe_offset(struct trace_probe *tp)
+{
+ return (probe_is_return(tp)) ? tp->rp.kp.offset : tp->kp.offset;
+}
+
+static __kprobes void *probe_address(struct trace_probe *tp)
+{
+ return (probe_is_return(tp)) ? tp->rp.kp.addr : tp->kp.addr;
+}
+
+/*
+ * Write the textual form of fetch method @ff into @buf (at most @n bytes,
+ * including the terminating NUL).
+ *
+ * Returns the number of characters written (excluding the NUL) on success,
+ * -ENOSPC if the text does not fit in @n bytes, or -EINVAL for an unknown
+ * fetch function.
+ */
+static int __trace_arg_string(char *buf, size_t n, struct fetch_func *ff)
+{
+	int ret = -EINVAL;
+
+	if (ff->func == fetch_argument)
+		ret = snprintf(buf, n, "a%lu", (unsigned long)ff->data);
+	else if (ff->func == fetch_register) {
+		const char *name;
+
+		name = regs_query_register_name((unsigned)((long)ff->data));
+		ret = snprintf(buf, n, "%%%s", name);
+	} else if (ff->func == fetch_stack)
+		ret = snprintf(buf, n, "s%lu", (unsigned long)ff->data);
+	else if (ff->func == fetch_memory)
+		ret = snprintf(buf, n, "@0x%p", ff->data);
+	else if (ff->func == fetch_symbol) {
+		struct symbol_cache *sc = ff->data;
+
+		ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
+	} else if (ff->func == fetch_retvalue)
+		ret = snprintf(buf, n, "rv");
+	else if (ff->func == fetch_ip)
+		ret = snprintf(buf, n, "ra");
+	else if (ff->func == fetch_indirect) {
+		struct indirect_fetch_data *id = ff->data;
+		size_t len;
+
+		/* "+offs(" prefix */
+		ret = snprintf(buf, n, "%+ld(", id->offset);
+		if (ret < 0 || (size_t)ret >= n)
+			goto end;
+		len = ret;
+		/* nested argument is appended after the prefix */
+		ret = __trace_arg_string(buf + len, n - len, &id->orig);
+		if (ret < 0)
+			return ret;
+		len += ret;
+		/* closing parenthesis */
+		ret = snprintf(buf + len, n - len, ")");
+		if (ret >= 0)
+			ret += len;
+	}
+end:
+	if (ret < 0)
+		return ret;
+	/* snprintf() returning >= n means the output was truncated */
+	if ((size_t)ret >= n)
+		return -ENOSPC;
+	return ret;
+}
+
+/*
+ * Format fetch method @ff as the argument string the user would write in
+ * kprobe_events (e.g. "a0", "%ax", "@sym+8", "+4(a1)").
+ *
+ * @buf: destination buffer
+ * @n:   size limit for @buf, including the terminating NUL
+ * @ff:  fetch method to describe
+ *
+ * Returns 0 on success and a negative errno otherwise (-ENOSPC when the
+ * string does not fit, -EINVAL for an unknown fetch function). Callers in
+ * this file treat any non-zero value as failure.
+ */
+static int trace_arg_string(char *buf, size_t n, struct fetch_func *ff)
+{
+	int ret = __trace_arg_string(buf, n, ff);
+
+	return ret < 0 ? ret : 0;
+}
+
+static int register_probe_event(struct trace_probe *tp);
+static void unregister_probe_event(struct trace_probe *tp);
+
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+static struct trace_probe *alloc_trace_probe(const char *symbol,
+ const char *event)
+{
+ struct trace_probe *tp;
+
+ tp = kzalloc(sizeof(struct trace_probe), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(-ENOMEM);
+
+ if (symbol) {
+ tp->symbol = kstrdup(symbol, GFP_KERNEL);
+ if (!tp->symbol)
+ goto error;
+ }
+ if (event) {
+ tp->call.name = kstrdup(event, GFP_KERNEL);
+ if (!tp->call.name)
+ goto error;
+ }
+
+ INIT_LIST_HEAD(&tp->list);
+ return tp;
+error:
+ kfree(tp->symbol);
+ kfree(tp);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Free a trace_probe and everything it owns: per-argument fetch data for
+ * the first nr_args arguments, the event name, the symbol name and the
+ * structure itself. The probe must already be unregistered.
+ */
+static void free_trace_probe(struct trace_probe *tp)
+{
+	struct fetch_func *ff;
+	int idx;
+
+	for (idx = 0; idx < tp->nr_args; idx++) {
+		ff = &tp->args[idx];
+		if (ff->func == fetch_symbol)
+			free_symbol_cache(ff->data);
+		else if (ff->func == fetch_indirect)
+			free_indirect_fetch_data(ff->data);
+	}
+
+	kfree(tp->call.name);
+	kfree(tp->symbol);
+	kfree(tp);
+}
+
+/*
+ * Look up a probe in probe_list by event name. Probes without an event
+ * name never match. Returns the probe, or NULL if none is found.
+ */
+static struct trace_probe *find_probe_event(const char *event)
+{
+	struct trace_probe *cur;
+
+	list_for_each_entry(cur, &probe_list, list) {
+		if (!cur->call.name)
+			continue;
+		if (strcmp(cur->call.name, event) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+static void __unregister_trace_probe(struct trace_probe *tp)
+{
+ if (probe_is_return(tp))
+ unregister_kretprobe(&tp->rp);
+ else
+ unregister_kprobe(&tp->kp);
+}
+
+/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+static void unregister_trace_probe(struct trace_probe *tp)
+{
+ if (tp->call.name)
+ unregister_probe_event(tp);
+ __unregister_trace_probe(tp);
+ list_del(&tp->list);
+}
+
+/*
+ * Register a trace_probe and, if it has an event name, its probe event.
+ *
+ * Takes probe_lock. On success the probe is linked into probe_list and 0 is
+ * returned. On failure a negative errno is returned, the probe is left
+ * unregistered and is NOT on probe_list, so the caller may free it.
+ * An existing probe with the same event name is unregistered and freed.
+ */
+static int register_trace_probe(struct trace_probe *tp)
+{
+	struct trace_probe *old_tp;
+	int ret;
+
+	mutex_lock(&probe_lock);
+
+	if (probe_is_return(tp))
+		ret = register_kretprobe(&tp->rp);
+	else
+		ret = register_kprobe(&tp->kp);
+
+	if (ret) {
+		pr_warning("Could not insert probe(%d)\n", ret);
+		if (ret == -EILSEQ) {
+			pr_warning("Probing address(0x%p) is not an "
+				   "instruction boundary.\n",
+				   probe_address(tp));
+			ret = -EINVAL;
+		}
+		goto end;
+	}
+	/* register as an event */
+	if (tp->call.name) {
+		old_tp = find_probe_event(tp->call.name);
+		if (old_tp) {
+			/* delete old event */
+			unregister_trace_probe(old_tp);
+			free_trace_probe(old_tp);
+		}
+		ret = register_probe_event(tp);
+		if (ret) {
+			pr_warning("Faild to register probe event(%d)\n", ret);
+			__unregister_trace_probe(tp);
+			/*
+			 * Do not link a failed probe into probe_list: the
+			 * caller frees it on error, which would leave a
+			 * dangling list entry.
+			 */
+			goto end;
+		}
+	}
+	list_add_tail(&tp->list, &probe_list);
+end:
+	mutex_unlock(&probe_lock);
+	return ret;
+}
+
+/*
+ * Split "SYMBOL[+offs|-offs]" in place.
+ *
+ * The first '+' in @symbol (or, if there is none, the first '-') starts the
+ * offset. The digits after the sign are parsed with strict_strtol() using
+ * base 0, negated for '-', and stored in @offset; the sign character is then
+ * overwritten with NUL so @symbol holds just the symbol name. Without a
+ * sign, *@offset is set to 0 and @symbol is left untouched.
+ *
+ * Returns 0 on success, -EINVAL if @offset is NULL, or the error returned
+ * by strict_strtol() (in which case @symbol is not modified).
+ */
+static int split_symbol_offset(char *symbol, long *offset)
+{
+ char *tmp;
+ int ret;
+
+ if (!offset)
+ return -EINVAL;
+
+ tmp = strchr(symbol, '+');
+ if (!tmp)
+ tmp = strchr(symbol, '-');
+
+ if (tmp) {
+ /* skip sign because strict_strtol doesn't accept '+' */
+ ret = strict_strtol(tmp + 1, 0, offset);
+ if (ret)
+ return ret;
+ if (*tmp == '-')
+ *offset = -(*offset);
+ *tmp = '\0';
+ } else
+ *offset = 0;
+ return 0;
+}
+
+#define PARAM_MAX_ARGS 16
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+/*
+ * Parse one FETCHARG string and fill in the fetch method @ff.
+ *
+ * @arg:       argument text; it is modified in place while parsing
+ * @ff:        fetch_func to initialise
+ * @is_return: non-zero for a return probe ("rv" and "ra" are only accepted
+ *             there)
+ *
+ * Accepted forms: aN, rv, ra, %REG, sN, @ADDR, @SYM[+|-offs] and
+ * +|-offs(FETCHARG). Symbol and indirect forms allocate data that is
+ * released by free_trace_probe().
+ *
+ * Returns 0 on success or a negative errno; @ff is only valid on success.
+ */
+static int parse_trace_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+	int ret = 0;
+	unsigned long param;
+	long offset;
+	char *tmp;
+
+	switch (arg[0]) {
+	case 'a':	/* argument */
+		ret = strict_strtoul(arg + 1, 10, &param);
+		if (ret || param > PARAM_MAX_ARGS)
+			ret = -EINVAL;
+		else {
+			ff->func = fetch_argument;
+			ff->data = (void *)param;
+		}
+		break;
+	case 'r':	/* retval or retaddr */
+		if (is_return && arg[1] == 'v') {
+			ff->func = fetch_retvalue;
+			ff->data = NULL;
+		} else if (is_return && arg[1] == 'a') {
+			ff->func = fetch_ip;
+			ff->data = NULL;
+		} else
+			ret = -EINVAL;
+		break;
+	case '%':	/* named register */
+		ret = regs_query_register_offset(arg + 1);
+		if (ret >= 0) {
+			ff->func = fetch_register;
+			ff->data = (void *)(unsigned long)ret;
+			ret = 0;
+		}
+		break;
+	case 's':	/* stack */
+		ret = strict_strtoul(arg + 1, 10, &param);
+		if (ret || param > PARAM_MAX_STACK)
+			ret = -EINVAL;
+		else {
+			ff->func = fetch_stack;
+			ff->data = (void *)param;
+		}
+		break;
+	case '@':	/* memory or symbol */
+		if (isdigit(arg[1])) {
+			ret = strict_strtoul(arg + 1, 0, &param);
+			if (ret)
+				break;
+			ff->func = fetch_memory;
+			ff->data = (void *)param;
+		} else {
+			ret = split_symbol_offset(arg + 1, &offset);
+			if (ret)
+				break;
+			ff->data = alloc_symbol_cache(arg + 1,
+						      offset);
+			if (ff->data)
+				ff->func = fetch_symbol;
+			else
+				ret = -EINVAL;
+		}
+		break;
+	case '+':	/* indirect memory */
+	case '-':
+		tmp = strchr(arg, '(');
+		if (!tmp) {
+			ret = -EINVAL;
+			break;
+		}
+		/* terminate the offset text so it can be parsed alone */
+		*tmp = '\0';
+		ret = strict_strtol(arg + 1, 0, &offset);
+		if (ret)
+			break;
+		if (arg[0] == '-')
+			offset = -offset;
+		arg = tmp + 1;
+		/* the last ')' closes this level; inner ones belong to nested args */
+		tmp = strrchr(arg, ')');
+		if (tmp) {
+			struct indirect_fetch_data *id;
+
+			*tmp = '\0';
+			id = kzalloc(sizeof(struct indirect_fetch_data),
+				     GFP_KERNEL);
+			if (!id)
+				return -ENOMEM;
+			id->offset = offset;
+			ret = parse_trace_arg(arg, &id->orig, is_return);
+			if (ret)
+				kfree(id);
+			else {
+				ff->func = fetch_indirect;
+				ff->data = (void *)id;
+			}
+		} else
+			ret = -EINVAL;
+		break;
+	default:
+		/* TODO: support custom handler */
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Create and register a probe from one tokenised kprobe_events command.
+ *
+ * @argc: number of tokens in @argv (at least 2: command and probe point)
+ * @argv: tokens; they are modified in place while parsing
+ *
+ * At most TRACE_KPROBE_ARGS fetch arguments are used; further ones are
+ * ignored. Returns 0 on success or a negative errno.
+ */
+static int create_trace_probe(int argc, char **argv)
+{
+	/*
+	 * Argument syntax:
+	 *  - Add kprobe: p[:EVENT] SYMBOL[+OFFS|-OFFS]|ADDRESS [FETCHARGS]
+	 *  - Add kretprobe: r[:EVENT] SYMBOL[+0] [FETCHARGS]
+	 * Fetch args:
+	 *  aN	: fetch Nth of function argument. (N:0-)
+	 *  rv	: fetch return value
+	 *  ra	: fetch return address
+	 *  sN	: fetch Nth of stack (N:0-)
+	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
+	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+	 *  %REG	: fetch register REG
+	 * Indirect memory fetch:
+	 *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
+	 */
+	struct trace_probe *tp;
+	struct kprobe *kp;
+	int i, ret = 0;
+	int is_return = 0;
+	char *symbol = NULL, *event = NULL;
+	long offset = 0;
+	void *addr = NULL;
+
+	if (argc < 2)
+		return -EINVAL;
+
+	if (argv[0][0] == 'p')
+		is_return = 0;
+	else if (argv[0][0] == 'r')
+		is_return = 1;
+	else
+		return -EINVAL;
+
+	if (argv[0][1] == ':') {
+		event = &argv[0][2];
+		if (strlen(event) == 0) {
+			pr_info("Event name is not specifiled\n");
+			return -EINVAL;
+		}
+	}
+
+	if (isdigit(argv[1][0])) {
+		if (is_return)
+			return -EINVAL;
+		/*
+		 * An address was specified. It is the second token (argv[1]),
+		 * not the tail of the command token.
+		 */
+		ret = strict_strtoul(argv[1], 0, (unsigned long *)&addr);
+		if (ret)
+			return ret;
+	} else {
+		/* a symbol specified */
+		symbol = argv[1];
+		/* TODO: support .init module functions */
+		ret = split_symbol_offset(symbol, &offset);
+		if (ret)
+			return ret;
+		if (offset && is_return)
+			return -EINVAL;
+	}
+
+	/* setup a probe */
+	tp = alloc_trace_probe(symbol, event);
+	if (IS_ERR(tp))
+		return PTR_ERR(tp);
+
+	if (is_return) {
+		kp = &tp->rp.kp;
+		tp->rp.handler = kretprobe_trace_func;
+	} else {
+		kp = &tp->kp;
+		tp->kp.pre_handler = kprobe_trace_func;
+	}
+
+	if (tp->symbol) {
+		kp->symbol_name = tp->symbol;
+		kp->offset = offset;
+	} else
+		kp->addr = addr;
+
+	/* parse arguments */
+	argc -= 2; argv += 2; ret = 0;
+	for (i = 0; i < argc && i < TRACE_KPROBE_ARGS; i++) {
+		if (strlen(argv[i]) > MAX_ARGSTR_LEN) {
+			pr_info("Argument%d(%s) is too long.\n", i, argv[i]);
+			ret = -ENOSPC;
+			goto error;
+		}
+		ret = parse_trace_arg(argv[i], &tp->args[i], is_return);
+		if (ret)
+			goto error;
+		/*
+		 * Count each argument as soon as it is parsed so that
+		 * free_trace_probe() releases its data if a later one fails.
+		 */
+		tp->nr_args++;
+	}
+
+	ret = register_trace_probe(tp);
+	if (ret)
+		goto error;
+	return 0;
+
+error:
+	free_trace_probe(tp);
+	return ret;
+}
+
+/*
+ * Unregister and free every probe on probe_list, one at a time, while
+ * holding probe_lock. Called when kprobe_events is opened for writing
+ * without O_APPEND and at the end of the startup self-test.
+ */
+static void cleanup_all_probes(void)
+{
+ struct trace_probe *tp;
+ mutex_lock(&probe_lock);
+ /* TODO: Use batch unregistration */
+ while (!list_empty(&probe_list)) {
+ /* unregister_trace_probe() unlinks tp, so always take the head */
+ tp = list_entry(probe_list.next, struct trace_probe, list);
+ unregister_trace_probe(tp);
+ free_trace_probe(tp);
+ }
+ mutex_unlock(&probe_lock);
+}
+
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&probe_lock);
+ return seq_list_start(&probe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &probe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&probe_lock);
+}
+
+/*
+ * seq_file ->show: print one probe in the same syntax that kprobe_events
+ * accepts ("p|r[:EVENT] SYMBOL+offs|0xADDR [FETCHARGS]").
+ *
+ * @v is the struct list_head cursor handed out by seq_list_start() /
+ * seq_list_next(), so it is converted with list_entry() instead of relying
+ * on 'list' being the first member of struct trace_probe.
+ * Always returns 0; arguments that do not fit the buffer end the line early.
+ */
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+	struct trace_probe *tp;
+	int i, ret;
+	char buf[MAX_ARGSTR_LEN + 1];
+
+	if (v == NULL)
+		return 0;
+
+	tp = list_entry((struct list_head *)v, struct trace_probe, list);
+
+	seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
+	if (tp->call.name)
+		seq_printf(m, ":%s", tp->call.name);
+
+	if (tp->symbol)
+		seq_printf(m, " %s%+ld", probe_symbol(tp), probe_offset(tp));
+	else
+		seq_printf(m, " 0x%p", probe_address(tp));
+
+	for (i = 0; i < tp->nr_args; i++) {
+		ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+		if (ret) {
+			pr_warning("Argument%d is too long.\n", i);
+			break;
+		}
+		seq_printf(m, " %s", buf);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_seq_show
+};
+
+/*
+ * open() handler for kprobe_events. Opening the file for writing without
+ * O_APPEND (e.g. "echo ... > kprobe_events") first removes every existing
+ * probe; opening with O_APPEND (">>") keeps them. The file is then set up
+ * as a seq_file listing the current probes.
+ */
+static int probes_open(struct inode *inode, struct file *file)
+{
+ if ((file->f_mode & FMODE_WRITE) &&
+ !(file->f_flags & O_APPEND))
+ cleanup_all_probes();
+
+ return seq_open(file, &probes_seq_op);
+}
+
+/*
+ * Tokenise one command line and hand it to create_trace_probe().
+ * A line with no tokens is accepted and does nothing.
+ * Returns 0 on success, -ENOMEM if the line cannot be split, or the error
+ * from create_trace_probe().
+ */
+static int command_trace_probe(const char *buf)
+{
+	int nr_tokens = 0;
+	int err;
+	char **tokens = argv_split(GFP_KERNEL, buf, &nr_tokens);
+
+	if (!tokens)
+		return -ENOMEM;
+
+	err = nr_tokens ? create_trace_probe(nr_tokens, tokens) : 0;
+
+	argv_free(tokens);
+	return err;
+}
+
+#define WRITE_BUFSIZE 128
+
+/*
+ * write() handler for kprobe_events: split the user buffer into lines,
+ * strip '#' comments and execute each line as a probe command.
+ *
+ * A line (including its newline) must be shorter than WRITE_BUFSIZE. The
+ * last line may omit the trailing newline.
+ *
+ * Returns the number of bytes consumed, 0 for an empty write, -ENOMEM,
+ * -EFAULT, -EINVAL for an over-long line, or the error of the first
+ * failing command (earlier commands stay in effect).
+ */
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	char *kbuf, *tmp;
+	int ret;
+	size_t done;
+	size_t size;
+
+	if (!count)
+		return 0;
+
+	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	ret = done = 0;
+	while (done < count) {
+		size = count - done;
+		/* leave room for the terminating NUL written below */
+		if (size >= WRITE_BUFSIZE)
+			size = WRITE_BUFSIZE - 1;
+		if (copy_from_user(kbuf, buffer + done, size)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		kbuf[size] = '\0';
+		tmp = strchr(kbuf, '\n');
+		if (tmp) {
+			*tmp = '\0';
+			/* consume the line and its newline only */
+			size = tmp - kbuf + 1;
+		} else if (done + size < count) {
+			/* no newline although more input follows: too long */
+			pr_warning("Line length is too long: "
+				   "Should be less than %d.", WRITE_BUFSIZE);
+			ret = -EINVAL;
+			goto out;
+		}
+		done += size;
+		/* Remove comments */
+		tmp = strchr(kbuf, '#');
+		if (tmp)
+			*tmp = '\0';
+
+		ret = command_trace_probe(kbuf);
+		if (ret)
+			goto out;
+	}
+	ret = done;
+out:
+	kfree(kbuf);
+	return ret;
+}
+
+static const struct file_operations kprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+};
+
+/*
+ * Kprobe pre-handler: record one kprobe_trace_entry in the ring buffer.
+ *
+ * Only as many args[] slots as the probe defines are reserved. The event's
+ * filter is taken from the probe's own event when it has a name and from
+ * the generic kprobe event otherwise. Always returns 0.
+ */
+static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(kp, struct trace_probe, kp);
+	struct kprobe_trace_entry *entry;
+	struct ring_buffer_event *event;
+	int size, i, pc;
+	unsigned long irq_flags;
+	struct ftrace_event_call *call = &event_kprobe;
+
+	/* test the name pointer itself; its address is never NULL */
+	if (tp->call.name)
+		call = &tp->call;
+
+	local_save_flags(irq_flags);
+	pc = preempt_count();
+
+	/* trim the unused tail of args[] from the record */
+	size = sizeof(struct kprobe_trace_entry) -
+		(sizeof(unsigned long) * (TRACE_KPROBE_ARGS - tp->nr_args));
+
+	event = trace_current_buffer_lock_reserve(TRACE_KPROBE, size,
+						  irq_flags, pc);
+	if (!event)
+		return 0;
+
+	entry = ring_buffer_event_data(event);
+	entry->nargs = tp->nr_args;
+	entry->ip = (unsigned long)kp->addr;
+	for (i = 0; i < tp->nr_args; i++)
+		entry->args[i] = call_fetch(&tp->args[i], regs);
+
+	if (!filter_current_check_discard(call, entry, event))
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
+	return 0;
+}
+
+/*
+ * Kretprobe handler: record one kretprobe_trace_entry in the ring buffer.
+ *
+ * Only as many args[] slots as the probe defines are reserved. The event's
+ * filter is taken from the probe's own event when it has a name and from
+ * the generic kretprobe event otherwise. Always returns 0.
+ */
+static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+					  struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+	struct kretprobe_trace_entry *entry;
+	struct ring_buffer_event *event;
+	int size, i, pc;
+	unsigned long irq_flags;
+	struct ftrace_event_call *call = &event_kretprobe;
+
+	/* test the name pointer itself; its address is never NULL */
+	if (tp->call.name)
+		call = &tp->call;
+
+	local_save_flags(irq_flags);
+	pc = preempt_count();
+
+	/* trim the unused tail of args[] from the record */
+	size = sizeof(struct kretprobe_trace_entry) -
+		(sizeof(unsigned long) * (TRACE_KPROBE_ARGS - tp->nr_args));
+
+	event = trace_current_buffer_lock_reserve(TRACE_KRETPROBE, size,
+						  irq_flags, pc);
+	if (!event)
+		return 0;
+
+	entry = ring_buffer_event_data(event);
+	entry->nargs = tp->nr_args;
+	entry->func = (unsigned long)probe_address(tp);
+	entry->ret_ip = (unsigned long)ri->ret_addr;
+	for (i = 0; i < tp->nr_args; i++)
+		entry->args[i] = call_fetch(&tp->args[i], regs);
+
+	if (!filter_current_check_discard(call, entry, event))
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
+
+	return 0;
+}
+
+/* Event entry printers */
+enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags)
+{
+ struct kprobe_trace_entry *field;
+ struct trace_seq *s = &iter->seq;
+ int i;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, ":"))
+ goto partial;
+
+ for (i = 0; i < field->nargs; i++)
+ if (!trace_seq_printf(s, " 0x%lx", field->args[i]))
+ goto partial;
+
+ if (!trace_seq_puts(s, "\n"))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags)
+{
+ struct kretprobe_trace_entry *field;
+ struct trace_seq *s = &iter->seq;
+ int i;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, " <- "))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, ":"))
+ goto partial;
+
+ for (i = 0; i < field->nargs; i++)
+ if (!trace_seq_printf(s, " 0x%lx", field->args[i]))
+ goto partial;
+
+ if (!trace_seq_puts(s, "\n"))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event kprobe_trace_event = {
+ .type = TRACE_KPROBE,
+ .trace = print_kprobe_event,
+};
+
+static struct trace_event kretprobe_trace_event = {
+ .type = TRACE_KRETPROBE,
+ .trace = print_kretprobe_event,
+};
+
+/*
+ * Event "enable" hook: arm the kprobe or kretprobe that owns @call and
+ * return the result of the enable call.
+ */
+static int probe_event_enable(struct ftrace_event_call *call)
+{
+	struct trace_probe *probe;
+
+	probe = container_of(call, struct trace_probe, call);
+	if (!probe_is_return(probe))
+		return enable_kprobe(&probe->kp);
+
+	return enable_kretprobe(&probe->rp);
+}
+
+/*
+ * Event "disable" hook: disarm the kprobe or kretprobe that owns @call.
+ */
+static void probe_event_disable(struct ftrace_event_call *call)
+{
+	struct trace_probe *probe;
+
+	probe = container_of(call, struct trace_probe, call);
+	if (!probe_is_return(probe)) {
+		disable_kprobe(&probe->kp);
+		return;
+	}
+
+	disable_kretprobe(&probe->rp);
+}
+
+static int probe_event_raw_init(struct ftrace_event_call *event_call)
+{
+ INIT_LIST_HEAD(&event_call->fields);
+ init_preds(event_call);
+ return 0;
+}
+
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed) \
+ do { \
+ ret = trace_define_field(event_call, #type, name, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed);\
+ if (ret) \
+ return ret; \
+ } while (0)
+
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct kprobe_trace_entry field;
+ char buf[MAX_ARGSTR_LEN + 1];
+ struct trace_probe *tp = container_of(event_call,
+ struct trace_probe, call);
+
+ __common_field(int, type, 1);
+ __common_field(unsigned char, flags, 0);
+ __common_field(unsigned char, preempt_count, 0);
+ __common_field(int, pid, 1);
+ __common_field(int, tgid, 1);
+
+ DEFINE_FIELD(unsigned long, ip, "ip", 0);
+ DEFINE_FIELD(int, nargs, "nargs", 1);
+ for (i = 0; i < tp->nr_args; i++) {
+ /* Set argN as a field */
+ sprintf(buf, "arg%d", i);
+ DEFINE_FIELD(unsigned long, args[i], buf, 0);
+ /* Set argument string as an alias field */
+ ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+ if (ret)
+ return ret;
+ DEFINE_FIELD(unsigned long, args[i], buf, 0);
+ }
+ return 0;
+}
+
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct kretprobe_trace_entry field;
+ char buf[MAX_ARGSTR_LEN + 1];
+ struct trace_probe *tp = container_of(event_call,
+ struct trace_probe, call);
+
+ __common_field(int, type, 1);
+ __common_field(unsigned char, flags, 0);
+ __common_field(unsigned char, preempt_count, 0);
+ __common_field(int, pid, 1);
+ __common_field(int, tgid, 1);
+
+ DEFINE_FIELD(unsigned long, func, "func", 0);
+ DEFINE_FIELD(unsigned long, ret_ip, "ret_ip", 0);
+ DEFINE_FIELD(int, nargs, "nargs", 1);
+ for (i = 0; i < tp->nr_args; i++) {
+ /* Set argN as a field */
+ sprintf(buf, "arg%d", i);
+ DEFINE_FIELD(unsigned long, args[i], buf, 0);
+ /* Set argument string as an alias field */
+ ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]);
+ if (ret)
+ return ret;
+ DEFINE_FIELD(unsigned long, args[i], buf, 0);
+ }
+ return 0;
+}
+
+static int __probe_event_show_format(struct ftrace_event_call *call,
+ struct trace_seq *s, const char *fmt,
+ const char *arg)
+{
+ int i;
+ char buf[MAX_ARGSTR_LEN + 1];
+ struct trace_probe *tp = container_of(call, struct trace_probe, call);
+
+ /* Show aliases */
+ for (i = 0; i < tp->nr_args; i++) {
+ if (trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]))
+ return 0;
+ if (!trace_seq_printf(s, "\talias: %s;\toriginal: arg%d;\n",
+ buf, i))
+ return 0;
+ }
+ /* Show format */
+ if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
+ return 0;
+
+ for (i = 0; i < tp->nr_args; i++)
+ if (!trace_seq_puts(s, " 0x%lx"))
+ return 0;
+
+ if (!trace_seq_printf(s, "\", %s", arg))
+ return 0;
+
+ for (i = 0; i < tp->nr_args; i++)
+ if (!trace_seq_printf(s, ", arg%d", i))
+ return 0;
+
+ return trace_seq_puts(s, "\n");
+}
+
+#undef SHOW_FIELD
+#define SHOW_FIELD(type, item, name) \
+ do { \
+ ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
+ "offset:%u;tsize:%u;\n", name, \
+ (unsigned)offsetof(typeof(field), item),\
+ (unsigned)sizeof(type)); \
+ if (!ret) \
+ return 0; \
+ } while (0)
+
+static int kprobe_event_show_format(struct ftrace_event_call *call,
+ struct trace_seq *s)
+{
+ struct kprobe_trace_entry field __attribute__((unused));
+ int ret, i;
+ char buf[8];
+ struct trace_probe *tp = container_of(call, struct trace_probe, call);
+
+ SHOW_FIELD(unsigned long, ip, "ip");
+ SHOW_FIELD(int, nargs, "nargs");
+
+ /* Show fields */
+ for (i = 0; i < tp->nr_args; i++) {
+ sprintf(buf, "arg%d", i);
+ SHOW_FIELD(unsigned long, args[i], buf);
+ }
+ trace_seq_puts(s, "\n");
+
+ return __probe_event_show_format(call, s, "%lx:", "ip");
+}
+
+static int kretprobe_event_show_format(struct ftrace_event_call *call,
+ struct trace_seq *s)
+{
+ struct kretprobe_trace_entry field __attribute__((unused));
+ int ret, i;
+ char buf[8];
+ struct trace_probe *tp = container_of(call, struct trace_probe, call);
+
+ SHOW_FIELD(unsigned long, func, "func");
+ SHOW_FIELD(unsigned long, ret_ip, "ret_ip");
+ SHOW_FIELD(int, nargs, "nargs");
+
+ /* Show fields */
+ for (i = 0; i < tp->nr_args; i++) {
+ sprintf(buf, "arg%d", i);
+ SHOW_FIELD(unsigned long, args[i], buf);
+ }
+ trace_seq_puts(s, "\n");
+
+ return __probe_event_show_format(call, s, "%lx <- %lx:",
+ "func, ret_ip");
+}
+
+static int register_probe_event(struct trace_probe *tp)
+{
+ struct ftrace_event_call *call = &tp->call;
+ int ret;
+
+ /* Initialize ftrace_event_call */
+ call->system = "kprobes";
+ if (probe_is_return(tp)) {
+ call->event = &kretprobe_trace_event;
+ call->id = TRACE_KRETPROBE;
+ call->raw_init = probe_event_raw_init;
+ call->show_format = kretprobe_event_show_format;
+ call->define_fields = kretprobe_event_define_fields;
+ } else {
+ call->event = &kprobe_trace_event;
+ call->id = TRACE_KPROBE;
+ call->raw_init = probe_event_raw_init;
+ call->show_format = kprobe_event_show_format;
+ call->define_fields = kprobe_event_define_fields;
+ }
+ call->enabled = 1;
+ call->regfunc = probe_event_enable;
+ call->unregfunc = probe_event_disable;
+ ret = trace_add_event_call(call);
+ if (ret)
+ pr_info("Failed to register kprobe event: %s\n", call->name);
+ return ret;
+}
+
+static void unregister_probe_event(struct trace_probe *tp)
+{
+ /*
+ * Prevent to unregister event itself because the event is shared
+ * among other probes.
+ */
+ tp->call.event = NULL;
+ trace_remove_event_call(&tp->call);
+}
+
+/*
+ * Make a debugfs interface for controlling probe points.
+ *
+ * Registers the kprobe and kretprobe output event types, then creates the
+ * "kprobe_events" file (mode 0644) in the tracing debugfs directory.
+ * Every failure is only reported with a warning; the function returns 0 on
+ * all paths.
+ */
+static __init int init_kprobe_trace(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+ int ret;
+
+ /*
+ * NOTE(review): a zero return from register_ftrace_event() is treated
+ * as failure here - confirm against its definition.
+ */
+ ret = register_ftrace_event(&kprobe_trace_event);
+ if (!ret) {
+ pr_warning("Could not register kprobe_trace_event type.\n");
+ return 0;
+ }
+ ret = register_ftrace_event(&kretprobe_trace_event);
+ if (!ret) {
+ pr_warning("Could not register kretprobe_trace_event type.\n");
+ return 0;
+ }
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+ NULL, &kprobe_events_ops);
+
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'kprobe_events' entry\n");
+ return 0;
+}
+
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static int kprobe_trace_selftest_target(int a1, int a2, int a3,
+ int a4, int a5, int a6)
+{
+ return a1 + a2 + a3 + a4 + a5 + a6;
+}
+
+/*
+ * Startup self-test: register an entry probe and a return probe on
+ * kprobe_trace_selftest_target(), call the target once, then remove all
+ * probes. Reports "OK" only if both registrations succeeded.
+ * Always returns 0 so that a failed self-test does not abort boot.
+ */
+static __init int kprobe_trace_self_tests_init(void)
+{
+	int ret, warn = 0;
+	int (*target)(int, int, int, int, int, int);
+
+	/* call through a pointer so the call to the target is not inlined */
+	target = kprobe_trace_selftest_target;
+
+	pr_info("Testing kprobe tracing: ");
+
+	/* aN is 0-based: the six parameters are a0..a5 */
+	ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
+				  "a0 a1 a2 a3 a4 a5");
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error enabling function entry\n");
+		warn++;
+	}
+
+	ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
+				  "ra rv");
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error enabling function return\n");
+		warn++;
+	}
+
+	ret = target(1, 2, 3, 4, 5, 6);
+
+	cleanup_all_probes();
+
+	if (warn)
+		pr_cont("NG: Some tests are failed. Please check them.\n");
+	else
+		pr_cont("OK\n");
+	return 0;
+}
+
+late_initcall(kprobe_trace_self_tests_init);
+
+#endif
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-01 01:07:15
|
Hi, Here are the v10 patches. I just updated for the latest -tip and fixed typos and Kconfig dependency. Here are the patches of kprobe-based event tracer for x86, version 10, which allows you to probe various kernel events through ftrace interface. The tracer supports per-probe filtering which allows you to set filters on each probe and shows formats of each probe. I think this is more generic integration with ftrace, especially event-tracer. This patchset also includes x86(-64) instruction decoder which supports non-SSE/FP opcodes and includes x86 opcode map. The decoder is used for finding the instruction boundaries when inserting new kprobes. I think it will be possible to share this opcode map with KVM's decoder. The decoder is tested when building kernel, the test compares the results of objdump and the decoder right after building vmlinux. You can enable that test by CONFIG_X86_DECODER_SELFTEST=y. This series can be applied on the latest linux-2.6-tip tree. This supports only x86(-32/-64) (but porting it on other arch just needs kprobes/kretprobes and register and stack access APIs). This patchset includes following changes: - Add x86 instruction decoder [1/7] - Add x86 instruction decoder selftest [2/7] - Check insertion point safety in kprobe [3/7] - Cleanup fix_riprel() with insn decoder [4/7] - Add arch-dep register and stack fetching functions [5/7] - Add dynamic event_call support to ftrace [6/7] - Add kprobe-based event tracer [7/7] Enhancement ideas will be added after merging: - Add profiling interface for each event. - Make a stress test of kprobes on this tracer. (see http://sources.redhat.com/ml/systemtap/2009-q2/msg01055.html) - .init function tracing support. - Support primitive types(long, ulong, int, uint, etc) for args. Kprobe-based Event Tracer ========================= Overview -------- This tracer is similar to the events tracer which is based on Tracepoint infrastructure. 
Instead of Tracepoint, this tracer is based on kprobes(kprobe and kretprobe). It probes anywhere where kprobes can probe(this means, all functions body except for __kprobes functions). Unlike the function tracer, this tracer can probe instructions inside of kernel functions. It allows you to check which instruction has been executed. Unlike the Tracepoint based events tracer, this tracer can add new probe points on the fly. Similar to the events tracer, this tracer doesn't need to be activated via current_tracer, instead of that, just set probe points via /sys/kernel/debug/tracing/kprobe_events. And you can set filters on each probe events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/filter. Synopsis of kprobe_events ------------------------- p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : set a probe r[:EVENT] SYMBOL[+0] [FETCHARGS] : set a return probe EVENT : Event name SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted MEMADDR : Address where the probe is inserted FETCHARGS : Arguments %REG : Fetch register REG sN : Fetch Nth entry of stack (N >= 0) @ADDR : Fetch memory at ADDR (ADDR should be in kernel) @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) aN : Fetch function argument. (N >= 0)(*) rv : Fetch return value.(**) ra : Fetch return address.(**) +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***) (*) aN may not correct on asmlinkaged functions and at the middle of function body. (**) only for return probe. (***) this is useful for fetching a field of data structures. Per-Probe Event Filtering ------------------------- Per-probe event filtering feature allows you to set different filter on each probe and gives you what arguments will be shown in trace buffer. If an event name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds an event under tracing/events/kprobes/<EVENT>, at the directory you can see 'id', 'enabled', 'format' and 'filter'. 
enabled: You can enable/disable the probe by writing 1 or 0 on it. format: It shows the format of this probe event. It also shows aliases of arguments which you specified to kprobe_events. filter: You can write filtering rules of this event. And you can use both of aliase names and field names for describing filters. Usage examples -------------- To add a probe as a new event, write a new definition to kprobe_events as below. echo p:myprobe do_sys_open a0 a1 a2 a3 > /sys/kernel/debug/tracing/kprobe_events This sets a kprobe on the top of do_sys_open() function with recording 1st to 4th arguments as "myprobe" event. echo r:myretprobe do_sys_open rv ra >> /sys/kernel/debug/tracing/kprobe_events This sets a kretprobe on the return point of do_sys_open() function with recording return value and return address as "myretprobe" event. You can see the format of these events via /sys/kernel/debug/tracing/events/kprobes/<EVENT>/format. cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format name: myprobe ID: 23 format: field:unsigned short common_type; offset:0; size:2; field:unsigned char common_flags; offset:2; size:1; field:unsigned char common_preempt_count; offset:3; size:1; field:int common_pid; offset:4; size:4; field:int common_tgid; offset:8; size:4; field: unsigned long ip; offset:16;tsize:8; field: int nargs; offset:24;tsize:4; field: unsigned long arg0; offset:32;tsize:8; field: unsigned long arg1; offset:40;tsize:8; field: unsigned long arg2; offset:48;tsize:8; field: unsigned long arg3; offset:56;tsize:8; alias: a0; original: arg0; alias: a1; original: arg1; alias: a2; original: arg2; alias: a3; original: arg3; print fmt: "%lx: 0x%lx 0x%lx 0x%lx 0x%lx", ip, arg0, arg1, arg2, arg3 You can see that the event has 4 arguments and alias expressions corresponding to it. echo > /sys/kernel/debug/tracing/kprobe_events This clears all probe points. and you can see the traced information via /sys/kernel/debug/tracing/trace. 
cat /sys/kernel/debug/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0 <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6 <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10 <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a Each line shows when the kernel hits a probe, and <- SYMBOL means kernel returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel returns from do_sys_open to sys_open+0x1b). Thank you, --- Masami Hiramatsu (7): tracing: add kprobe-based event tracer tracing: ftrace dynamic ftrace_event_call support x86: add pt_regs register and stack access APIs kprobes: cleanup fix_riprel() using insn decoder on x86 kprobes: checks probe address is instruction boudary on x86 x86: x86 instruction decoder build-time selftest x86: instruction decoder API Documentation/trace/kprobes.txt | 138 ++++ arch/x86/Kconfig.debug | 9 arch/x86/Makefile | 3 arch/x86/include/asm/inat.h | 127 +++ arch/x86/include/asm/insn.h | 136 ++++ arch/x86/include/asm/ptrace.h | 122 +++ arch/x86/kernel/kprobes.c | 197 ++--- arch/x86/kernel/ptrace.c | 73 ++ arch/x86/lib/Makefile | 13 arch/x86/lib/inat.c | 82 ++ arch/x86/lib/insn.c | 473 +++++++++++++ arch/x86/lib/x86-opcode-map.txt | 711 +++++++++++++++++++ arch/x86/scripts/Makefile | 19 + arch/x86/scripts/distill.awk | 42 + arch/x86/scripts/gen-insn-attr-x86.awk | 314 ++++++++ arch/x86/scripts/test_get_len.c | 99 +++ arch/x86/scripts/user_include.h | 49 + include/linux/ftrace_event.h | 13 include/trace/ftrace.h | 22 - kernel/trace/Kconfig | 12 kernel/trace/Makefile | 1 kernel/trace/trace.h | 22 + 
kernel/trace/trace_event_types.h | 20 + kernel/trace/trace_events.c | 70 +- kernel/trace/trace_export.c | 27 - kernel/trace/trace_kprobe.c | 1183 ++++++++++++++++++++++++++++++++ 26 files changed, 3825 insertions(+), 152 deletions(-) create mode 100644 Documentation/trace/kprobes.txt create mode 100644 arch/x86/include/asm/inat.h create mode 100644 arch/x86/include/asm/insn.h create mode 100644 arch/x86/lib/inat.c create mode 100644 arch/x86/lib/insn.c create mode 100644 arch/x86/lib/x86-opcode-map.txt create mode 100644 arch/x86/scripts/Makefile create mode 100644 arch/x86/scripts/distill.awk create mode 100644 arch/x86/scripts/gen-insn-attr-x86.awk create mode 100644 arch/x86/scripts/test_get_len.c create mode 100644 arch/x86/scripts/user_include.h create mode 100644 kernel/trace/trace_kprobe.c -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
|
From: Masami H. <mhi...@re...> - 2009-07-01 01:07:15
|
Clean up fix_riprel() in arch/x86/kernel/kprobes.c by using the x86 instruction
decoder.
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Jim Keniston <jke...@us...>
Cc: Ingo Molnar <mi...@el...>
---
arch/x86/kernel/kprobes.c | 128 ++++++++-------------------------------------
1 files changed, 23 insertions(+), 105 deletions(-)
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5341842..b77e050 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -109,50 +109,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
/* ----------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
-static const u32 onebyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
- W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
- W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
- W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
- W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
- W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
- W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
- W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
- W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
- W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
- W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
- W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
- W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
- W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
- W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
- W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
- W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
- W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
- W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
- W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
- W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
- W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
#undef W
struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -345,68 +301,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
static void __kprobes fix_riprel(struct kprobe *p)
{
#ifdef CONFIG_X86_64
- u8 *insn = p->ainsn.insn;
- s64 disp;
- int need_modrm;
-
- /* Skip legacy instruction prefixes. */
- while (1) {
- switch (*insn) {
- case 0x66:
- case 0x67:
- case 0x2e:
- case 0x3e:
- case 0x26:
- case 0x64:
- case 0x65:
- case 0x36:
- case 0xf0:
- case 0xf3:
- case 0xf2:
- ++insn;
- continue;
- }
- break;
- }
+ struct insn insn;
+ kernel_insn_init(&insn, p->ainsn.insn);
- /* Skip REX instruction prefix. */
- if (is_REX_prefix(insn))
- ++insn;
-
- if (*insn == 0x0f) {
- /* Two-byte opcode. */
- ++insn;
- need_modrm = test_bit(*insn,
- (unsigned long *)twobyte_has_modrm);
- } else
- /* One-byte opcode. */
- need_modrm = test_bit(*insn,
- (unsigned long *)onebyte_has_modrm);
-
- if (need_modrm) {
- u8 modrm = *++insn;
- if ((modrm & 0xc7) == 0x05) {
- /* %rip+disp32 addressing mode */
- /* Displacement follows ModRM byte. */
- ++insn;
- /*
- * The copied instruction uses the %rip-relative
- * addressing mode. Adjust the displacement for the
- * difference between the original location of this
- * instruction and the location of the copy that will
- * actually be run. The tricky bit here is making sure
- * that the sign extension happens correctly in this
- * calculation, since we need a signed 32-bit result to
- * be sign-extended to 64 bits when it's added to the
- * %rip value and yield the same 64-bit result that the
- * sign-extension of the original signed 32-bit
- * displacement would have given.
- */
- disp = (u8 *) p->addr + *((s32 *) insn) -
- (u8 *) p->ainsn.insn;
- BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
- *(s32 *)insn = (s32) disp;
- }
+ if (insn_rip_relative(&insn)) {
+ s64 newdisp;
+ u8 *disp;
+ insn_get_displacement(&insn);
+ /*
+ * The copied instruction uses the %rip-relative addressing
+ * mode. Adjust the displacement for the difference between
+ * the original location of this instruction and the location
+ * of the copy that will actually be run. The tricky bit here
+ * is making sure that the sign extension happens correctly in
+ * this calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the %rip
+ * value and yield the same 64-bit result that the sign-
+ * extension of the original signed 32-bit displacement would
+ * have given.
+ */
+ newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
+ (u8 *) p->ainsn.insn;
+ BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
+ disp = (u8 *) p->ainsn.insn + INSN_DISPLACEMENT_OFFS(&insn);
+ *(s32 *) disp = (s32) newdisp;
}
#endif
}
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-01 01:07:10
|
Add following APIs for accessing registers and stack entries from pt_regs.
- regs_query_register_offset(const char *name)
Query the offset of "name" register.
- regs_query_register_name(unsigned offset)
Query the name of register by its offset.
- regs_get_register(struct pt_regs *regs, unsigned offset)
Get the value of a register by its offset.
- regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
Check whether the address is within the kernel stack.
- regs_get_kernel_stack_nth(struct pt_regs *reg, unsigned nth)
Get Nth entry of the kernel stack. (N >= 0)
- regs_get_argument_nth(struct pt_regs *reg, unsigned nth)
Get Nth argument at function call. (N >= 0)
Changes from v9:
-Fix a typo in a comment.
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Christoph Hellwig <hc...@in...>
Cc: Steven Rostedt <ro...@go...>
Cc: Ananth N Mavinakayanahalli <an...@in...>
Cc: Ingo Molnar <mi...@el...>
Cc: Frederic Weisbecker <fwe...@gm...>
Cc: Roland McGrath <ro...@re...>
Cc: Srikar Dronamraju <sr...@li...>
Cc: lin...@vg...
---
arch/x86/include/asm/ptrace.h | 122 +++++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/ptrace.c | 73 +++++++++++++++++++++++++
2 files changed, 195 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908..d5e3b3b 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
#ifdef __KERNEL__
#include <asm/segment.h>
+#include <asm/page_types.h>
#endif
#ifndef __ASSEMBLY__
@@ -216,6 +217,127 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
return regs->sp;
}
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs: pt_regs from which register value is gotten.
+ * @offset: offset number of the register.
+ *
+ * regs_get_register returns the value of a register whose offset from @regs
+ * is @offset. The @offset is the offset of the register in struct pt_regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+ unsigned offset)
+{
+ if (unlikely(offset > MAX_REG_OFFSET))
+ return 0;
+ return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @addr: address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+ unsigned long addr)
+{
+ return ((addr & ~(THREAD_SIZE - 1)) ==
+ (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+ unsigned n)
+{
+ unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return *addr;
+ else
+ return 0;
+}
+
+/**
+ * regs_get_argument_nth() - get Nth argument at function call
+ * @regs: pt_regs which contains registers at function entry.
+ * @n: argument number.
+ *
+ * regs_get_argument_nth() returns @n th argument of a function call.
+ * Since usually the kernel stack will be changed right after function entry,
+ * you must use this at function entry. If the @n th entry is NOT in the
+ * kernel stack or pt_regs, this returns 0.
+ */
+#ifdef CONFIG_X86_32
+#define NR_REGPARMS 3
+static inline unsigned long regs_get_argument_nth(struct pt_regs *regs,
+ unsigned n)
+{
+ if (n < NR_REGPARMS) {
+ switch (n) {
+ case 0:
+ return regs->ax;
+ case 1:
+ return regs->dx;
+ case 2:
+ return regs->cx;
+ }
+ return 0;
+ } else {
+ /*
+ * The typical case: arg n is on the stack.
+ * (Note: stack[0] = return address, so skip it)
+ */
+ return regs_get_kernel_stack_nth(regs, 1 + n - NR_REGPARMS);
+ }
+}
+#else /* CONFIG_X86_64 */
+#define NR_REGPARMS 6
+static inline unsigned long regs_get_argument_nth(struct pt_regs *regs,
+ unsigned n)
+{
+ if (n < NR_REGPARMS) {
+ switch (n) {
+ case 0:
+ return regs->di;
+ case 1:
+ return regs->si;
+ case 2:
+ return regs->dx;
+ case 3:
+ return regs->cx;
+ case 4:
+ return regs->r8;
+ case 5:
+ return regs->r9;
+ }
+ return 0;
+ } else {
+ /*
+ * The typical case: arg n is on the stack.
+ * (Note: stack[0] = return address, so skip it)
+ */
+ return regs_get_kernel_stack_nth(regs, 1 + n - NR_REGPARMS);
+ }
+}
+#endif
+
/*
* These are defined as per linux/ptrace.h, which see.
*/
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b457f78..2944d3a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -49,6 +49,79 @@ enum x86_regset {
REGSET_IOPERM32,
};
+struct pt_regs_offset {
+ const char *name;
+ int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+ REG_OFFSET_NAME(r15),
+ REG_OFFSET_NAME(r14),
+ REG_OFFSET_NAME(r13),
+ REG_OFFSET_NAME(r12),
+ REG_OFFSET_NAME(r11),
+ REG_OFFSET_NAME(r10),
+ REG_OFFSET_NAME(r9),
+ REG_OFFSET_NAME(r8),
+#endif
+ REG_OFFSET_NAME(bx),
+ REG_OFFSET_NAME(cx),
+ REG_OFFSET_NAME(dx),
+ REG_OFFSET_NAME(si),
+ REG_OFFSET_NAME(di),
+ REG_OFFSET_NAME(bp),
+ REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+ REG_OFFSET_NAME(ds),
+ REG_OFFSET_NAME(es),
+ REG_OFFSET_NAME(fs),
+ REG_OFFSET_NAME(gs),
+#endif
+ REG_OFFSET_NAME(orig_ax),
+ REG_OFFSET_NAME(ip),
+ REG_OFFSET_NAME(cs),
+ REG_OFFSET_NAME(flags),
+ REG_OFFSET_NAME(sp),
+ REG_OFFSET_NAME(ss),
+ REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name: the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ */
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->offset;
+ return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset: the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
+ */
+const char *regs_query_register_name(unsigned offset)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (roff->offset == offset)
+ return roff->name;
+ return NULL;
+}
+
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|
|
From: Masami H. <mhi...@re...> - 2009-07-01 01:07:10
|
Add dynamic ftrace_event_call support to ftrace. Trace engines can add new
ftrace_event_calls to ftrace on the fly. Each operator function of a call
takes an ftrace_event_call data structure as an argument, because these
functions may be shared among several ftrace_event_calls.
Signed-off-by: Masami Hiramatsu <mhi...@re...>
Cc: Steven Rostedt <ro...@go...>
Cc: Ingo Molnar <mi...@el...>
Cc: Tom Zanussi <tza...@gm...>
Cc: Frederic Weisbecker <fwe...@gm...>
---
include/linux/ftrace_event.h | 13 +++++---
include/trace/ftrace.h | 22 +++++++------
kernel/trace/trace_events.c | 70 ++++++++++++++++++++++++++++++++----------
kernel/trace/trace_export.c | 27 ++++++++--------
4 files changed, 85 insertions(+), 47 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 5c093ff..f7733b6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -108,12 +108,13 @@ struct ftrace_event_call {
struct dentry *dir;
struct trace_event *event;
int enabled;
- int (*regfunc)(void);
- void (*unregfunc)(void);
+ int (*regfunc)(struct ftrace_event_call *);
+ void (*unregfunc)(struct ftrace_event_call *);
int id;
- int (*raw_init)(void);
- int (*show_format)(struct trace_seq *s);
- int (*define_fields)(void);
+ int (*raw_init)(struct ftrace_event_call *);
+ int (*show_format)(struct ftrace_event_call *,
+ struct trace_seq *);
+ int (*define_fields)(struct ftrace_event_call *);
struct list_head fields;
int filter_active;
void *filter;
@@ -138,6 +139,8 @@ extern int filter_current_check_discard(struct ftrace_event_call *call,
extern int trace_define_field(struct ftrace_event_call *call, char *type,
char *name, int offset, int size, int is_signed);
+extern int trace_add_event_call(struct ftrace_event_call *call);
+extern void trace_remove_event_call(struct ftrace_event_call *call);
#define is_signed_type(type) (((type)(-1)) < 0)
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 1867553..d696580 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -147,7 +147,8 @@
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
static int \
-ftrace_format_##call(struct trace_seq *s) \
+ftrace_format_##call(struct ftrace_event_call *event_call, \
+ struct trace_seq *s) \
{ \
struct ftrace_raw_##call field __attribute__((unused)); \
int ret = 0; \
@@ -289,10 +290,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
int \
-ftrace_define_fields_##call(void) \
+ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
{ \
struct ftrace_raw_##call field; \
- struct ftrace_event_call *event_call = &event_##call; \
int ret; \
\
__common_field(int, type, 1); \
@@ -355,7 +355,7 @@ static inline int ftrace_get_offsets_##call( \
* event_trace_printk(_RET_IP_, "<call>: " <fmt>);
* }
*
- * static int ftrace_reg_event_<call>(void)
+ * static int ftrace_reg_event_<call>(struct ftrace_event_call *unused)
* {
* int ret;
*
@@ -366,7 +366,7 @@ static inline int ftrace_get_offsets_##call( \
* return ret;
* }
*
- * static void ftrace_unreg_event_<call>(void)
+ * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
* {
* unregister_trace_<call>(ftrace_event_<call>);
* }
@@ -399,7 +399,7 @@ static inline int ftrace_get_offsets_##call( \
* trace_current_buffer_unlock_commit(event, irq_flags, pc);
* }
*
- * static int ftrace_raw_reg_event_<call>(void)
+ * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
* {
* int ret;
*
@@ -410,7 +410,7 @@ static inline int ftrace_get_offsets_##call( \
* return ret;
* }
*
- * static void ftrace_unreg_event_<call>(void)
+ * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
* {
* unregister_trace_<call>(ftrace_raw_event_<call>);
* }
@@ -419,7 +419,7 @@ static inline int ftrace_get_offsets_##call( \
* .trace = ftrace_raw_output_<call>, <-- stage 2
* };
*
- * static int ftrace_raw_init_event_<call>(void)
+ * static int ftrace_raw_init_event_<call>(struct ftrace_event_call *unused)
* {
* int id;
*
@@ -537,7 +537,7 @@ static void ftrace_raw_event_##call(proto) \
trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
} \
\
-static int ftrace_raw_reg_event_##call(void) \
+static int ftrace_raw_reg_event_##call(struct ftrace_event_call *unused)\
{ \
int ret; \
\
@@ -548,7 +548,7 @@ static int ftrace_raw_reg_event_##call(void) \
return ret; \
} \
\
-static void ftrace_raw_unreg_event_##call(void) \
+static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused)\
{ \
unregister_trace_##call(ftrace_raw_event_##call); \
} \
@@ -557,7 +557,7 @@ static struct trace_event ftrace_event_type_##call = { \
.trace = ftrace_raw_output_##call, \
}; \
\
-static int ftrace_raw_init_event_##call(void) \
+static int ftrace_raw_init_event_##call(struct ftrace_event_call *unused)\
{ \
int id; \
\
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53c8fd3..94ff41e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,9 +60,7 @@ err:
}
EXPORT_SYMBOL_GPL(trace_define_field);
-#ifdef CONFIG_MODULES
-
-static void trace_destroy_fields(struct ftrace_event_call *call)
+void trace_destroy_fields(struct ftrace_event_call *call)
{
struct ftrace_event_field *field, *next;
@@ -74,8 +72,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
}
}
-#endif /* CONFIG_MODULES */
-
static void ftrace_event_enable_disable(struct ftrace_event_call *call,
int enable)
{
@@ -84,14 +80,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
if (call->enabled) {
call->enabled = 0;
tracing_stop_cmdline_record();
- call->unregfunc();
+ call->unregfunc(call);
}
break;
case 1:
if (!call->enabled) {
call->enabled = 1;
tracing_start_cmdline_record();
- call->regfunc();
+ call->regfunc(call);
}
break;
}
@@ -574,7 +570,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_printf(s, "format:\n");
trace_write_header(s);
- r = call->show_format(s);
+ r = call->show_format(call, s);
if (!r) {
/*
* ug! The format output is bigger than a PAGE!!
@@ -921,7 +917,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
d_events = event_subsystem_dir(call->system, d_events);
if (call->raw_init) {
- ret = call->raw_init();
+ ret = call->raw_init(call);
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
@@ -945,7 +941,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
id);
if (call->define_fields) {
- ret = call->define_fields();
+ ret = call->define_fields(call);
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
@@ -965,6 +961,52 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return 0;
}
+static int __trace_add_event_call(struct ftrace_event_call *call)
+{
+ struct dentry *d_events;
+
+ if (!call->name)
+ return -EINVAL;
+
+ d_events = event_trace_events_dir();
+ if (!d_events)
+ return -ENOENT;
+
+ list_add(&call->list, &ftrace_events);
+ return event_create_dir(call, d_events, &ftrace_event_id_fops,
+ &ftrace_enable_fops, &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+}
+
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct ftrace_event_call *call)
+{
+ int ret;
+ mutex_lock(&event_mutex);
+ ret = __trace_add_event_call(call);
+ mutex_unlock(&event_mutex);
+ return ret;
+}
+
+static void __trace_remove_event_call(struct ftrace_event_call *call)
+{
+ ftrace_event_enable_disable(call, 0);
+ if (call->event)
+ __unregister_ftrace_event(call->event);
+ debugfs_remove_recursive(call->dir);
+ list_del(&call->list);
+ trace_destroy_fields(call);
+ destroy_preds(call);
+}
+
+/* Remove an event_call */
+void trace_remove_event_call(struct ftrace_event_call *call)
+{
+ mutex_lock(&event_mutex);
+ __trace_remove_event_call(call);
+ mutex_unlock(&event_mutex);
+}
+
#define for_each_event(event, start, end) \
for (event = start; \
(unsigned long)event < (unsigned long)end; \
@@ -1070,13 +1112,7 @@ static void trace_module_remove_events(struct module *mod)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
if (call->mod == mod) {
found = true;
- ftrace_event_enable_disable(call, 0);
- if (call->event)
- __unregister_ftrace_event(call->event);
- debugfs_remove_recursive(call->dir);
- list_del(&call->list);
- trace_destroy_fields(call);
- destroy_preds(call);
+ __trace_remove_event_call(call);
}
}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf89..7cee79d 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -60,7 +60,7 @@ extern void __bad_type_size(void);
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
static int \
-ftrace_format_##call(struct trace_seq *s) \
+ftrace_format_##call(struct ftrace_event_call *dummy, struct trace_seq *s)\
{ \
struct args field; \
int ret; \
@@ -76,7 +76,7 @@ ftrace_format_##call(struct trace_seq *s) \
#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
tpfmt) \
static int \
-ftrace_format_##call(struct trace_seq *s) \
+ftrace_format_##call(struct ftrace_event_call *dummy, struct trace_seq *s)\
{ \
struct args field; \
int ret; \
@@ -115,10 +115,16 @@ ftrace_format_##call(struct trace_seq *s) \
#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
cmd;
+static int ftrace_raw_init_event(struct ftrace_event_call *event_call)
+{
+ INIT_LIST_HEAD(&event_call->fields);
+ init_preds(event_call);
+ return 0;
+}
+
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
-int ftrace_define_fields_##call(void); \
-static int ftrace_raw_init_event_##call(void); \
+int ftrace_define_fields_##call(struct ftrace_event_call *c); \
\
struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
@@ -126,16 +132,10 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.id = proto, \
.system = __stringify(TRACE_SYSTEM), \
- .raw_init = ftrace_raw_init_event_##call, \
+ .raw_init = ftrace_raw_init_event, \
.show_format = ftrace_format_##call, \
.define_fields = ftrace_define_fields_##call, \
-}; \
-static int ftrace_raw_init_event_##call(void) \
-{ \
- INIT_LIST_HEAD(&event_##call.fields); \
- init_preds(&event_##call); \
- return 0; \
-} \
+};
#undef TRACE_EVENT_FORMAT_NOFILTER
#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
@@ -182,9 +182,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
int \
-ftrace_define_fields_##call(void) \
+ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
{ \
- struct ftrace_event_call *event_call = &event_##call; \
struct args field; \
int ret; \
\
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhi...@re...
|