You can subscribe to this list here.
2009 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(32) |
Jun
(66) |
Jul
(102) |
Aug
(78) |
Sep
(106) |
Oct
(137) |
Nov
(147) |
Dec
(147) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2010 |
Jan
(71) |
Feb
(139) |
Mar
(86) |
Apr
(76) |
May
(57) |
Jun
(10) |
Jul
(12) |
Aug
(6) |
Sep
(8) |
Oct
(12) |
Nov
(12) |
Dec
(18) |
2011 |
Jan
(16) |
Feb
(19) |
Mar
(3) |
Apr
(1) |
May
(16) |
Jun
(17) |
Jul
(74) |
Aug
(22) |
Sep
(18) |
Oct
(24) |
Nov
(21) |
Dec
(30) |
2012 |
Jan
(31) |
Feb
(16) |
Mar
(22) |
Apr
(25) |
May
(18) |
Jun
(13) |
Jul
(83) |
Aug
(49) |
Sep
(20) |
Oct
(60) |
Nov
(35) |
Dec
(28) |
2013 |
Jan
(39) |
Feb
(61) |
Mar
(35) |
Apr
(21) |
May
(45) |
Jun
(56) |
Jul
(20) |
Aug
(9) |
Sep
(10) |
Oct
(31) |
Nov
(8) |
Dec
(4) |
2014 |
Jan
(6) |
Feb
(7) |
Mar
(7) |
Apr
(6) |
May
(4) |
Jun
(8) |
Jul
(5) |
Aug
(2) |
Sep
(4) |
Oct
(4) |
Nov
(11) |
Dec
(5) |
2015 |
Jan
(4) |
Feb
(4) |
Mar
(3) |
Apr
(4) |
May
(9) |
Jun
(4) |
Jul
(15) |
Aug
(8) |
Sep
(16) |
Oct
(18) |
Nov
(15) |
Dec
(7) |
2016 |
Jan
(20) |
Feb
(9) |
Mar
(15) |
Apr
(24) |
May
(16) |
Jun
(28) |
Jul
(22) |
Aug
(23) |
Sep
(18) |
Oct
(30) |
Nov
(40) |
Dec
(9) |
2017 |
Jan
(1) |
Feb
(8) |
Mar
(37) |
Apr
(26) |
May
(25) |
Jun
(46) |
Jul
(24) |
Aug
(9) |
Sep
|
Oct
|
Nov
|
Dec
|
From: Steven R. <ro...@go...> - 2009-05-30 04:40:45
|
On Thu, 28 May 2009, Masami Hiramatsu wrote: > Add dynamic ftrace_event_call support to ftrace. Trace engines can adds new > ftrace_event_call to ftrace on the fly. Each operator functions of the call > takes a ftrace_event_call data structure as an argument, because these > functions may be shared among several ftrace_event_calls. > > Signed-off-by: Masami Hiramatsu <mhi...@re...> > Cc: Steven Rostedt <ro...@go...> > Cc: Ingo Molnar <mi...@el...> > Cc: Tom Zanussi <tza...@gm...> > Cc: Frederic Weisbecker <fwe...@gm...> > --- > > include/linux/ftrace_event.h | 13 ++++++---- > include/trace/ftrace.h | 22 +++++++++-------- > kernel/trace/trace_events.c | 54 +++++++++++++++++++++++++++++------------- > kernel/trace/trace_export.c | 27 ++++++++++----------- > 4 files changed, 69 insertions(+), 47 deletions(-) > > diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h > index bbf40f6..e25f3a4 100644 > --- a/include/linux/ftrace_event.h > +++ b/include/linux/ftrace_event.h > @@ -108,12 +108,13 @@ struct ftrace_event_call { > struct dentry *dir; > struct trace_event *event; > int enabled; > - int (*regfunc)(void); > - void (*unregfunc)(void); > + int (*regfunc)(struct ftrace_event_call *); > + void (*unregfunc)(struct ftrace_event_call *); > int id; > - int (*raw_init)(void); > - int (*show_format)(struct trace_seq *s); > - int (*define_fields)(void); > + int (*raw_init)(struct ftrace_event_call *); > + int (*show_format)(struct ftrace_event_call *, > + struct trace_seq *); > + int (*define_fields)(struct ftrace_event_call *); > struct list_head fields; > int filter_active; > void *filter; > @@ -138,6 +139,8 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, > > extern int trace_define_field(struct ftrace_event_call *call, char *type, > char *name, int offset, int size, int is_signed); > +extern int trace_add_event_call(struct ftrace_event_call *call); > +extern void trace_remove_event_call(struct ftrace_event_call *call); > > 
#define is_signed_type(type) (((type)(-1)) < 0) > > diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h > index b4ec83a..de3ee7c 100644 > --- a/include/trace/ftrace.h > +++ b/include/trace/ftrace.h > @@ -229,7 +229,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ > #undef TRACE_EVENT > #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ > static int \ > -ftrace_format_##call(struct trace_seq *s) \ > +ftrace_format_##call(struct ftrace_event_call *event_call, \ > + struct trace_seq *s) \ > { \ > struct ftrace_raw_##call field __attribute__((unused)); \ > int ret = 0; \ > @@ -269,10 +270,9 @@ ftrace_format_##call(struct trace_seq *s) \ > #undef TRACE_EVENT > #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ > int \ > -ftrace_define_fields_##call(void) \ > +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ > { \ > struct ftrace_raw_##call field; \ > - struct ftrace_event_call *event_call = &event_##call; \ > int ret; \ > \ > __common_field(int, type, 1); \ > @@ -298,7 +298,7 @@ ftrace_define_fields_##call(void) \ > * event_trace_printk(_RET_IP_, "<call>: " <fmt>); > * } > * > - * static int ftrace_reg_event_<call>(void) > + * static int ftrace_reg_event_<call>(struct ftrace_event_call *dummy) I would prefer "unused" or similar as a variable name over "dummy". 
> * { > * int ret; > * > @@ -309,7 +309,7 @@ ftrace_define_fields_##call(void) \ > * return ret; > * } > * > - * static void ftrace_unreg_event_<call>(void) > + * static void ftrace_unreg_event_<call>(struct ftrace_event_call *dummy) > * { > * unregister_trace_<call>(ftrace_event_<call>); > * } > @@ -342,7 +342,7 @@ ftrace_define_fields_##call(void) \ > * trace_current_buffer_unlock_commit(event, irq_flags, pc); > * } > * > - * static int ftrace_raw_reg_event_<call>(void) > + * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *dummy) > * { > * int ret; > * > @@ -353,7 +353,7 @@ ftrace_define_fields_##call(void) \ > * return ret; > * } > * > - * static void ftrace_unreg_event_<call>(void) > + * static void ftrace_unreg_event_<call>(struct ftrace_event_call *dummy) > * { > * unregister_trace_<call>(ftrace_raw_event_<call>); > * } > @@ -362,7 +362,7 @@ ftrace_define_fields_##call(void) \ > * .trace = ftrace_raw_output_<call>, <-- stage 2 > * }; > * > - * static int ftrace_raw_init_event_<call>(void) > + * static int ftrace_raw_init_event_<call>(struct ftrace_event_call *dummy) > * { > * int id; > * > @@ -477,7 +477,7 @@ static void ftrace_raw_event_##call(proto) \ > trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ > } \ > \ > -static int ftrace_raw_reg_event_##call(void) \ > +static int ftrace_raw_reg_event_##call(struct ftrace_event_call *dummy) \ > { \ > int ret; \ > \ > @@ -488,7 +488,7 @@ static int ftrace_raw_reg_event_##call(void) \ > return ret; \ > } \ > \ > -static void ftrace_raw_unreg_event_##call(void) \ > +static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *dummy)\ > { \ > unregister_trace_##call(ftrace_raw_event_##call); \ > } \ > @@ -497,7 +497,7 @@ static struct trace_event ftrace_event_type_##call = { \ > .trace = ftrace_raw_output_##call, \ > }; \ > \ > -static int ftrace_raw_init_event_##call(void) \ > +static int ftrace_raw_init_event_##call(struct ftrace_event_call *dummy)\ > { \ > int id; \ > \ > diff 
--git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c > index 6c81f9c..5d0a324 100644 > --- a/kernel/trace/trace_events.c > +++ b/kernel/trace/trace_events.c > @@ -60,9 +60,7 @@ err: > } > EXPORT_SYMBOL_GPL(trace_define_field); > > -#ifdef CONFIG_MODULES > - > -static void trace_destroy_fields(struct ftrace_event_call *call) > +void trace_destroy_fields(struct ftrace_event_call *call) > { > struct ftrace_event_field *field, *next; > > @@ -74,8 +72,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call) > } > } > > -#endif /* CONFIG_MODULES */ > - > static void ftrace_event_enable_disable(struct ftrace_event_call *call, > int enable) > { > @@ -84,14 +80,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, > if (call->enabled) { > call->enabled = 0; > tracing_stop_cmdline_record(); > - call->unregfunc(); > + call->unregfunc(call); > } > break; > case 1: > if (!call->enabled) { > call->enabled = 1; > tracing_start_cmdline_record(); > - call->regfunc(); > + call->regfunc(call); Cute. > } > break; > } > @@ -558,7 +554,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, > trace_seq_printf(s, "format:\n"); > trace_write_header(s); > > - r = call->show_format(s); > + r = call->show_format(call, s); > if (!r) { > /* > * ug! The format output is bigger than a PAGE!! 
> @@ -905,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, > d_events = event_subsystem_dir(call->system, d_events); > > if (call->raw_init) { > - ret = call->raw_init(); > + ret = call->raw_init(call); > if (ret < 0) { > pr_warning("Could not initialize trace point" > " events/%s\n", call->name); > @@ -929,7 +925,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, > id); > > if (call->define_fields) { > - ret = call->define_fields(); > + ret = call->define_fields(call); > if (ret < 0) { > pr_warning("Could not initialize trace point" > " events/%s\n", call->name); > @@ -949,6 +945,36 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, > return 0; > } > > +/* Add an additional event_call dynamically */ > +int trace_add_event_call(struct ftrace_event_call *call) > +{ > + struct dentry *d_events; > + > + if (!call->name) > + return -EINVAL; > + > + d_events = event_trace_events_dir(); > + if (!d_events) > + return -ENOENT; > + > + list_add(&call->list, &ftrace_events); ftrace_events needs to be protected by the event_mutex. > + return event_create_dir(call, d_events, &ftrace_event_id_fops, > + &ftrace_enable_fops, &ftrace_event_filter_fops, > + &ftrace_event_format_fops); > +} > + > +/* Remove an event_call */ > +void trace_remove_event_call(struct ftrace_event_call *event_call) > +{ > + ftrace_event_enable_disable(event_call, 0); > + if (event_call->event) > + unregister_ftrace_event(event_call->event); > + debugfs_remove_recursive(event_call->dir); > + list_del(&event_call->list); Same here. 
> + trace_destroy_fields(event_call); > + destroy_preds(event_call); > +} > + > #define for_each_event(event, start, end) \ > for (event = start; \ > (unsigned long)event < (unsigned long)end; \ > @@ -1053,13 +1079,7 @@ static void trace_module_remove_events(struct module *mod) > list_for_each_entry_safe(call, p, &ftrace_events, list) { > if (call->mod == mod) { > found = true; > - ftrace_event_enable_disable(call, 0); > - if (call->event) > - unregister_ftrace_event(call->event); > - debugfs_remove_recursive(call->dir); > - list_del(&call->list); > - trace_destroy_fields(call); > - destroy_preds(call); > + trace_remove_event_call(call); We'll need a version that does not take the event_mutex, because it is held at this point. -- Steve > } > } > > diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c > index d06cf89..7cee79d 100644 > --- a/kernel/trace/trace_export.c > +++ b/kernel/trace/trace_export.c > @@ -60,7 +60,7 @@ extern void __bad_type_size(void); > #undef TRACE_EVENT_FORMAT > #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ > static int \ > -ftrace_format_##call(struct trace_seq *s) \ > +ftrace_format_##call(struct ftrace_event_call *dummy, struct trace_seq *s)\ > { \ > struct args field; \ > int ret; \ > @@ -76,7 +76,7 @@ ftrace_format_##call(struct trace_seq *s) \ > #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ > tpfmt) \ > static int \ > -ftrace_format_##call(struct trace_seq *s) \ > +ftrace_format_##call(struct ftrace_event_call *dummy, struct trace_seq *s)\ > { \ > struct args field; \ > int ret; \ > @@ -115,10 +115,16 @@ ftrace_format_##call(struct trace_seq *s) \ > #define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ > cmd; > > +static int ftrace_raw_init_event(struct ftrace_event_call *event_call) > +{ > + INIT_LIST_HEAD(&event_call->fields); > + init_preds(event_call); > + return 0; > +} > + > #undef TRACE_EVENT_FORMAT > #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, 
tpfmt) \ > -int ftrace_define_fields_##call(void); \ > -static int ftrace_raw_init_event_##call(void); \ > +int ftrace_define_fields_##call(struct ftrace_event_call *c); \ > \ > struct ftrace_event_call __used \ > __attribute__((__aligned__(4))) \ > @@ -126,16 +132,10 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ > .name = #call, \ > .id = proto, \ > .system = __stringify(TRACE_SYSTEM), \ > - .raw_init = ftrace_raw_init_event_##call, \ > + .raw_init = ftrace_raw_init_event, \ > .show_format = ftrace_format_##call, \ > .define_fields = ftrace_define_fields_##call, \ > -}; \ > -static int ftrace_raw_init_event_##call(void) \ > -{ \ > - INIT_LIST_HEAD(&event_##call.fields); \ > - init_preds(&event_##call); \ > - return 0; \ > -} \ > +}; > > #undef TRACE_EVENT_FORMAT_NOFILTER > #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ > @@ -182,9 +182,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ > #undef TRACE_EVENT_FORMAT > #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ > int \ > -ftrace_define_fields_##call(void) \ > +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ > { \ > - struct ftrace_event_call *event_call = &event_##call; \ > struct args field; \ > int ret; \ > \ > > > -- > Masami Hiramatsu > > Software Engineer > Hitachi Computer Products (America), Inc. > Software Solutions Division > > e-mail: mhi...@re... > |
From: Steven R. <ro...@go...> - 2009-05-30 04:11:32
|
On Thu, 28 May 2009, Masami Hiramatsu wrote: > +#undef SHOW_FIELD > +#define SHOW_FIELD(type, item, name) \ > + do { \ > + ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ > + "offset:%u;tsize:%u;\n", name, \ > + (unsigned)offsetof(typeof(field), item),\ > + (unsigned)sizeof(type)); \ > + if (!ret) \ > + return 0; \ > + } while (0) > + > +static int __probe_event_show_format(struct ftrace_event_call *event_call, > + struct trace_seq *s, const char *fmt, > + const char *arg) > +{ > + struct kprobe_trace_entry field __attribute__((unused)); You use kprobe_trace_entry for both kprobe and kretprobe. > + int ret, i; > + char buf[MAX_ARGSTR_LEN + 1]; > + struct trace_probe *tp = container_of(event_call, > + struct trace_probe, call); > + > + /* Show fields */ > + for (i = 0; i < tp->nr_args; i++) { > + sprintf(buf, "arg%d", i); > + SHOW_FIELD(unsigned long, args[i], buf); > + } > + trace_seq_puts(s, "\n"); > + > + /* Show aliases */ > + for (i = 0; i < tp->nr_args; i++) { > + if (trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i])) > + return 0; > + if (!trace_seq_printf(s, "\talias: %s;\toriginal: arg%d;\n", > + buf, i)) > + return 0; > + } > + /* Show format */ > + if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) > + return 0; > + > + for (i = 0; i < tp->nr_args; i++) > + if (!trace_seq_puts(s, " 0x%lx")) > + return 0; > + > + if (!trace_seq_printf(s, "\", %s", arg)) > + return 0; > + > + for (i = 0; i < tp->nr_args; i++) > + if (!trace_seq_printf(s, ", arg%d", i)) > + return 0; > + > + return trace_seq_puts(s, "\n"); > +} > + > +static int kprobe_event_show_format(struct ftrace_event_call *call, > + struct trace_seq *s) > +{ > + struct kprobe_trace_entry field __attribute__((unused)); > + int ret; > + > + SHOW_FIELD(unsigned long, ip, "ip"); > + SHOW_FIELD(int, nargs, "nargs"); > + > + return __probe_event_show_format(call, s, "%lx:", "ip"); > +} > + > +static int kretprobe_event_show_format(struct ftrace_event_call *call, > + struct trace_seq *s) > +{ > + 
struct kretprobe_trace_entry field __attribute__((unused)); > + int ret; > + > + SHOW_FIELD(unsigned long, func, "func"); > + SHOW_FIELD(unsigned long, ret_ip, "ret_ip"); > + SHOW_FIELD(int, nargs, "nargs"); > + > + return __probe_event_show_format(call, s, "%lx <- %lx:", > + "func, ret_ip"); > +} Thus we end up with: format: field:unsigned short common_type; offset:0; size:2; field:unsigned char common_flags; offset:2; size:1; field:unsigned char common_preempt_count; offset:3; size:1; field:int common_pid; offset:4; size:4; field:int common_tgid; offset:8; size:4; field: unsigned long func; offset:16;tsize:8; field: unsigned long ret_ip; offset:24;tsize:8; field: int nargs; offset:32;tsize:4; field: unsigned long arg0; offset:32;tsize:8; field: unsigned long arg1; offset:40;tsize:8; field: unsigned long arg2; offset:48;tsize:8; field: unsigned long arg3; offset:56;tsize:8; Notice that nargs and arg0 are both at offset 32. -- Steve |
From: Masami H. <mhi...@re...> - 2009-05-29 00:01:40
|
Hi, Here are the patches for the kprobe-based event tracer for x86, version 8, which allows you to probe various kernel events through the ftrace interface. In this version, I added per-probe filtering support, which allows you to set filters on each probe and shows the format of each probe. I think this is a more generic integration with ftrace, especially with the event tracer. This patchset also includes an x86(-64) instruction decoder which supports non-SSE/FP opcodes and includes an x86 opcode map. The decoder is used for finding the instruction boundaries when inserting new kprobes. I think it will be possible to share this opcode map with KVM's decoder. The decoder is tested when building the kernel; the test compares the results of objdump and the decoder right after building vmlinux. You can enable that test with CONFIG_X86_DECODER_SELFTEST=y. This series can be applied on the latest linux-2.6-tip tree. This supports only x86(-32/-64) (but porting it to other architectures just needs kprobes/kretprobes and register and stack access APIs). This patchset includes the following changes: - Add x86 instruction decoder [1/7] (FIXED) - Add x86 instruction decoder selftest [2/7] (FIXED) - Check insertion point safety in kprobe [3/7] - Cleanup fix_riprel() with insn decoder [4/7] - Add arch-dep register and stack fetching functions [5/7] - Add dynamic event_call support to ftrace [6/7] (NEW) - Add kprobe-based event tracer [7/7] (UPDATED) Enhancement ideas will be added after merging: - .init function tracing support. - Support primitive types (long, ulong, int, uint, etc.) for args. Kprobe-based Event Tracer ========================= Overview -------- This tracer is similar to the events tracer, which is based on the Tracepoint infrastructure. Instead of Tracepoints, this tracer is based on kprobes (kprobe and kretprobe). It can probe anywhere kprobes can probe (that is, all function bodies except __kprobes functions). Unlike the function tracer, this tracer can probe instructions inside of kernel functions. 
It allows you to check which instruction has been executed. Unlike the Tracepoint-based events tracer, this tracer can add new probe points on the fly. Like the events tracer, this tracer doesn't need to be activated via current_tracer; instead, just set probe points via /debug/tracing/kprobe_events. Synopsis of kprobe_events ------------------------- p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : set a probe r[:EVENT] SYMBOL[+0] [FETCHARGS] : set a return probe EVENT : Event name SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted MEMADDR : Address where the probe is inserted FETCHARGS : Arguments %REG : Fetch register REG sN : Fetch Nth entry of stack (N >= 0) @ADDR : Fetch memory at ADDR (ADDR should be in kernel) @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) aN : Fetch function argument. (N >= 0)(*) rv : Fetch return value.(**) ra : Fetch return address.(**) +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***) (*) aN may not be correct for asmlinkage functions or in the middle of a function body. (**) only for return probe. (***) this is useful for fetching a field of data structures. Per-Probe Event Filtering ------------------------- The per-probe event filtering feature allows you to set a different filter on each probe and shows you which arguments will be recorded in the trace buffer. If an event name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds an event under tracing/events/kprobes/<EVENT>; in that directory you can see 'id', 'enabled', 'format' and 'filter'. enabled: You can enable/disable the probe by writing 1 or 0 to it. format: It shows the format of this probe event. It also shows the aliases of the arguments which you specified to kprobe_events. filter: You can write filtering rules for this event. You can use both alias names and field names when describing filters. 
Usage examples -------------- To add a probe as a new event, write a new definition to kprobe_events as below. echo p:myprobe do_sys_open a0 a1 a2 a3 > /debug/tracing/kprobe_events This sets a kprobe on the top of do_sys_open() function with recording 1st to 4th arguments as "myprobe" event. echo r:myretprobe do_sys_open rv ra >> /debug/tracing/kprobe_events This sets a kretprobe on the return point of do_sys_open() function with recording return value and return address as "myretprobe" event. You can see the format of these events via tracing/events/kprobes/<EVENT>/format. cat /debug/tracing/events/kprobes/myprobe/format name: myprobe ID: 23 format: field:unsigned short common_type; offset:0; size:2; field:unsigned char common_flags; offset:2; size:1; field:unsigned char common_preempt_count; offset:3; size:1; field:int common_pid; offset:4; size:4; field:int common_tgid; offset:8; size:4; field: unsigned long ip; offset:16;tsize:8; field: int nargs; offset:24;tsize:4; field: unsigned long arg0; offset:32;tsize:8; field: unsigned long arg1; offset:40;tsize:8; field: unsigned long arg2; offset:48;tsize:8; field: unsigned long arg3; offset:56;tsize:8; alias: a0; original: arg0; alias: a1; original: arg1; alias: a2; original: arg2; alias: a3; original: arg3; print fmt: "%lx: 0x%lx 0x%lx 0x%lx 0x%lx", ip, arg0, arg1, arg2, arg3 You can see that the event has 4 arguments and alias expressions corresponding to it. echo > /debug/tracing/kprobe_events This clears all probe points. and you can see the traced information via /debug/tracing/trace. 
cat /debug/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0 <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6 <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10 <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a Each line shows when the kernel hits a probe, and <- SYMBOL means kernel returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel returns from do_sys_open to sys_open+0x1b). Thank you, --- Masami Hiramatsu (7): tracing: add kprobe-based event tracer tracing: ftrace dynamic ftrace_event_call support x86: add pt_regs register and stack access APIs kprobes: cleanup fix_riprel() using insn decoder on x86 kprobes: checks probe address is instruction boudary on x86 x86: x86 instruction decoder build-time selftest x86: instruction decoder API Documentation/trace/kprobes.txt | 138 ++++ arch/x86/Kconfig.debug | 9 arch/x86/Makefile | 3 arch/x86/include/asm/inat.h | 127 +++ arch/x86/include/asm/insn.h | 136 ++++ arch/x86/include/asm/ptrace.h | 67 ++ arch/x86/kernel/kprobes.c | 197 +++-- arch/x86/kernel/ptrace.c | 60 ++ arch/x86/lib/Makefile | 13 arch/x86/lib/inat.c | 82 ++ arch/x86/lib/insn.c | 473 +++++++++++++ arch/x86/lib/x86-opcode-map.txt | 711 +++++++++++++++++++ arch/x86/scripts/Makefile | 19 + arch/x86/scripts/distill.awk | 42 + arch/x86/scripts/gen-insn-attr-x86.awk | 314 +++++++++ arch/x86/scripts/test_get_len.c | 99 +++ arch/x86/scripts/user_include.h | 49 + include/linux/ftrace_event.h | 13 include/trace/ftrace.h | 22 - kernel/trace/Kconfig | 12 kernel/trace/Makefile | 1 kernel/trace/trace.h | 22 + 
kernel/trace/trace_event_types.h | 20 + kernel/trace/trace_events.c | 54 + kernel/trace/trace_export.c | 27 - kernel/trace/trace_kprobe.c | 1174 ++++++++++++++++++++++++++++++++ 26 files changed, 3732 insertions(+), 152 deletions(-) create mode 100644 Documentation/trace/kprobes.txt create mode 100644 arch/x86/include/asm/inat.h create mode 100644 arch/x86/include/asm/insn.h create mode 100644 arch/x86/lib/inat.c create mode 100644 arch/x86/lib/insn.c create mode 100644 arch/x86/lib/x86-opcode-map.txt create mode 100644 arch/x86/scripts/Makefile create mode 100644 arch/x86/scripts/distill.awk create mode 100644 arch/x86/scripts/gen-insn-attr-x86.awk create mode 100644 arch/x86/scripts/test_get_len.c create mode 100644 arch/x86/scripts/user_include.h create mode 100644 kernel/trace/trace_kprobe.c -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-29 00:01:23
|
Add a kprobes-based event tracer to ftrace. This tracer is similar to the events tracer, which is based on the Tracepoint infrastructure. Instead of Tracepoints, this tracer is based on kprobes (kprobe and kretprobe). It can probe anywhere kprobes can probe (that is, all function bodies except __kprobes functions). Like the events tracer, this tracer doesn't need to be activated via current_tracer; instead, just set probe points via /debug/tracing/kprobe_events. You can also set filters on each probe event via /debug/tracing/events/kprobes/<EVENT>/filter. This tracer supports the following probe arguments for each probe. %REG : Fetch register REG sN : Fetch Nth entry of stack (N >= 0) @ADDR : Fetch memory at ADDR (ADDR should be in kernel) @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) aN : Fetch function argument. (N >= 0) rv : Fetch return value. ra : Fetch return address. +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address. See Documentation/trace/kprobes.txt for details. Changes from v7: - Fix document example. - Remove solved TODO. - Support per-probe event filtering. 
Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Christoph Hellwig <hc...@in...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: Tom Zanussi <tza...@gm...> --- Documentation/trace/kprobes.txt | 138 ++++ kernel/trace/Kconfig | 12 kernel/trace/Makefile | 1 kernel/trace/trace.h | 22 + kernel/trace/trace_event_types.h | 20 + kernel/trace/trace_kprobe.c | 1174 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 1367 insertions(+), 0 deletions(-) create mode 100644 Documentation/trace/kprobes.txt create mode 100644 kernel/trace/trace_kprobe.c diff --git a/Documentation/trace/kprobes.txt b/Documentation/trace/kprobes.txt new file mode 100644 index 0000000..f6b4587 --- /dev/null +++ b/Documentation/trace/kprobes.txt @@ -0,0 +1,138 @@ + Kprobe-based Event Tracer + ========================= + + Documentation is written by Masami Hiramatsu + + +Overview +-------- +This tracer is similar to the events tracer which is based on Tracepoint +infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe +and kretprobe). It probes anywhere where kprobes can probe(this means, all +functions body except for __kprobes functions). + +Unlike the function tracer, this tracer can probe instructions inside of +kernel functions. It allows you to check which instruction has been executed. + +Unlike the Tracepoint based events tracer, this tracer can add and remove +probe points on the fly. + +Similar to the events tracer, this tracer doesn't need to be activated via +current_tracer, instead of that, just set probe points via +/debug/tracing/kprobe_events. And you can set filters on each probe events +via /debug/tracing/events/kprobes/<EVENT>/filter. 
+ + +Synopsis of kprobe_events +------------------------- + p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : set a probe + r[:EVENT] SYMBOL[+0] [FETCHARGS] : set a return probe + + EVENT : Event name + SYMBOL[+offs|-offs] : Symbol+offset where the probe is inserted + MEMADDR : Address where the probe is inserted + + FETCHARGS : Arguments + %REG : Fetch register REG + sN : Fetch Nth entry of stack (N >= 0) + @ADDR : Fetch memory at ADDR (ADDR should be in kernel) + @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) + aN : Fetch function argument. (N >= 0)(*) + rv : Fetch return value.(**) + ra : Fetch return address.(**) + +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***) + + (*) aN may not correct on asmlinkaged functions and at the middle of + function body. + (**) only for return probe. + (***) this is useful for fetching a field of data structures. + + +Per-Probe Event Filtering +------------------------- + Per-probe event filtering feature allows you to set different filter on each +probe and gives you what arguments will be shown in trace buffer. If an event +name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds +an event under tracing/events/kprobes/<EVENT>, at the directory you can see +'id', 'enabled', 'format' and 'filter'. + +enabled: + You can enable/disable the probe by writing 1 or 0 on it. + +format: + It shows the format of this probe event. It also shows aliases of arguments + which you specified to kprobe_events. + +filter: + You can write filtering rules of this event. And you can use both of aliase + names and field names for describing filters. + + +Usage examples +-------------- +To add a probe as a new event, write a new definition to kprobe_events +as below. + + echo p:myprobe do_sys_open a0 a1 a2 a3 > /debug/tracing/kprobe_events + + This sets a kprobe on the top of do_sys_open() function with recording +1st to 4th arguments as "myprobe" event. 
+ + echo r:myretprobe do_sys_open rv ra >> /debug/tracing/kprobe_events + + This sets a kretprobe on the return point of do_sys_open() function with +recording return value and return address as "myretprobe" event. + You can see the format of these events via +tracing/events/kprobes/<EVENT>/format. + + cat /debug/tracing/events/kprobes/myprobe/format +name: myprobe +ID: 23 +format: + field:unsigned short common_type; offset:0; size:2; + field:unsigned char common_flags; offset:2; size:1; + field:unsigned char common_preempt_count; offset:3; size:1; + field:int common_pid; offset:4; size:4; + field:int common_tgid; offset:8; size:4; + + field: unsigned long ip; offset:16;tsize:8; + field: int nargs; offset:24;tsize:4; + field: unsigned long arg0; offset:32;tsize:8; + field: unsigned long arg1; offset:40;tsize:8; + field: unsigned long arg2; offset:48;tsize:8; + field: unsigned long arg3; offset:56;tsize:8; + + alias: a0; original: arg0; + alias: a1; original: arg1; + alias: a2; original: arg2; + alias: a3; original: arg3; + +print fmt: "%lx: 0x%lx 0x%lx 0x%lx 0x%lx", ip, arg0, arg1, arg2, arg3 + + + You can see that the event has 4 arguments and alias expressions +corresponding to it. + + echo > /debug/tracing/kprobe_events + + This clears all probe points. and you can see the traced information via +/debug/tracing/trace. 
+ + cat /debug/tracing/trace +# tracer: nop +# +# TASK-PID CPU# TIMESTAMP FUNCTION +# | | | | | + <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0 + <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <- do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a + <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6 + <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a + <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10 + <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <- do_sys_open: 0x3 0xffffffff81367a3a + + + Each line shows when the kernel hits a probe, and <- SYMBOL means kernel +returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel +returns from do_sys_open to sys_open+0x1b). + + diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a508b9d..3a25730 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -403,6 +403,18 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config KPROBE_TRACER + depends on KPROBES + depends on X86 + bool "Trace kprobes" + select TRACING + select EVENT_TRACING + help + This tracer probes everywhere where kprobes can probe it, and + records various registers and memories specified by user. + This also allows you to trace kprobe probe points as a dynamic + defined events. It provides per-probe event filtering interface. 
+ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 848e5ce..01ac95b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,5 +52,6 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_mm.o +obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6e735d4..5d7849b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -40,6 +40,8 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, + TRACE_KPROBE, + TRACE_KRETPROBE, __TRACE_LAST_TYPE, }; @@ -207,6 +209,22 @@ struct syscall_trace_exit { unsigned long ret; }; +#define TRACE_KPROBE_ARGS 6 + +struct kprobe_trace_entry { + struct trace_entry ent; + unsigned long ip; + int nargs; + unsigned long args[TRACE_KPROBE_ARGS]; +}; + +struct kretprobe_trace_entry { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; + int nargs; + unsigned long args[TRACE_KPROBE_ARGS]; +}; /* * trace_flag_type is an enumeration that holds different @@ -323,6 +341,10 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ + IF_ASSIGN(var, ent, struct kprobe_trace_entry, \ + TRACE_KPROBE); \ + IF_ASSIGN(var, ent, struct kretprobe_trace_entry, \ + TRACE_KRETPROBE); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5e32e37..3be3e32 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -172,4 +172,24 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ); +TRACE_EVENT_FORMAT(kprobe, TRACE_KPROBE, kprobe_trace_entry, 
ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(int, nargs, nargs) + TRACE_FIELD_SPECIAL(unsigned long args[TRACE_KPROBE_ARGS], + args, TRACE_KPROBE_ARGS, args) + ), + TP_RAW_FMT("%08lx: args:0x%lx ...") +); + +TRACE_EVENT_FORMAT(kretprobe, TRACE_KRETPROBE, kretprobe_trace_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, func, func) + TRACE_FIELD(unsigned long, ret_ip, ret_ip) + TRACE_FIELD(int, nargs, nargs) + TRACE_FIELD_SPECIAL(unsigned long args[TRACE_KPROBE_ARGS], + args, TRACE_KPROBE_ARGS, args) + ), + TP_RAW_FMT("%08lx <- %08lx: args:0x%lx ...") +); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 0000000..c46cf69 --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1174 @@ +/* + * kprobe based kernel tracer + * + * Created by Masami Hiramatsu <mhi...@re...> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_ARGSTR_LEN 63 + +/* currently, trace_kprobe only supports X86. 
*/ + +struct fetch_func { + unsigned long (*func)(struct pt_regs *, void *); + void *data; +}; + +static __kprobes unsigned long call_fetch(struct fetch_func *f, + struct pt_regs *regs) +{ + return f->func(regs, f->data); +} + +/* fetch handlers */ +static __kprobes unsigned long fetch_register(struct pt_regs *regs, + void *offset) +{ + return get_register(regs, (unsigned)((unsigned long)offset)); +} + +static __kprobes unsigned long fetch_stack(struct pt_regs *regs, + void *num) +{ + return get_kernel_stack_nth(regs, (unsigned)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) +{ + unsigned long retval; + if (probe_kernel_address(addr, retval)) + return 0; + return retval; +} + +static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) +{ + return get_argument_nth(regs, (unsigned)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, + void *dummy) +{ + return regs_return_value(regs); +} + +static __kprobes unsigned long fetch_ip(struct pt_regs *regs, void *dummy) +{ + return instruction_pointer(regs); +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + if (sc->addr) + sc->addr += sc->offset; + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + if (!sym || strlen(sym) == 0) + return NULL; + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + + update_symbol_cache(sc); + return sc; +} + +static __kprobes unsigned long 
fetch_symbol(struct pt_regs *regs, void *data) +{ + struct symbol_cache *sc = data; + if (sc->addr) + return fetch_memory(regs, (void *)sc->addr); + else + return 0; +} + +/* Special indirect memory access interface */ +struct indirect_fetch_data { + struct fetch_func orig; + long offset; +}; + +static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) +{ + struct indirect_fetch_data *ind = data; + unsigned long addr; + addr = call_fetch(&ind->orig, regs); + if (addr) { + addr += ind->offset; + return fetch_memory(regs, (void *)addr); + } else + return 0; +} + +static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +{ + if (data->orig.func == fetch_indirect) + free_indirect_fetch_data(data->orig.data); + else if (data->orig.func == fetch_symbol) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/** + * kprobe_trace_core + */ + +struct trace_probe { + struct list_head list; + union { + struct kprobe kp; + struct kretprobe rp; + }; + const char *symbol; /* symbol name */ + unsigned int nr_args; + struct fetch_func args[TRACE_KPROBE_ARGS]; + struct ftrace_event_call call; +}; + +static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs); + +static __kprobes int probe_is_return(struct trace_probe *tp) +{ + return (tp->rp.handler == kretprobe_trace_func); +} + +static __kprobes const char *probe_symbol(struct trace_probe *tp) +{ + return tp->symbol ? tp->symbol : "unknown"; +} + +static __kprobes long probe_offset(struct trace_probe *tp) +{ + return (probe_is_return(tp)) ? tp->rp.kp.offset : tp->kp.offset; +} + +static __kprobes void *probe_address(struct trace_probe *tp) +{ + return (probe_is_return(tp)) ? 
tp->rp.kp.addr : tp->kp.addr; +} + +static int trace_arg_string(char *buf, size_t n, struct fetch_func *ff) +{ + int ret = -EINVAL; + if (ff->func == fetch_argument) + ret = snprintf(buf, n, "a%lu", (unsigned long)ff->data); + else if (ff->func == fetch_register) { + const char *name; + name = query_register_name((unsigned)((long)ff->data)); + ret = snprintf(buf, n, "%%%s", name); + } else if (ff->func == fetch_stack) + ret = snprintf(buf, n, "s%lu", (unsigned long)ff->data); + else if (ff->func == fetch_memory) + ret = snprintf(buf, n, "@0x%p", ff->data); + else if (ff->func == fetch_symbol) { + struct symbol_cache *sc = ff->data; + ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset); + } else if (ff->func == fetch_retvalue) + ret = snprintf(buf, n, "rv"); + else if (ff->func == fetch_ip) + ret = snprintf(buf, n, "ra"); + else if (ff->func == fetch_indirect) { + struct indirect_fetch_data *id = ff->data; + ret = snprintf(buf, n, "%+ld(", id->offset); + if (ret > n) + goto end; + n -= ret; + ret = trace_arg_string(buf, n, &id->orig); + if (ret > n) + goto end; + n -= ret; + ret = snprintf(buf, n, ")"); + } +end: + if (ret > n) + return -ENOSPC; + return 0; +} + +static int register_probe_event(struct trace_probe *tp); +static void unregister_probe_event(struct trace_probe *tp); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static struct trace_probe *alloc_trace_probe(const char *symbol, + const char *event) +{ + struct trace_probe *tp; + + tp = kzalloc(sizeof(struct trace_probe), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + if (symbol) { + tp->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tp->symbol) + goto error; + } + if (event) { + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + goto error; + } + + INIT_LIST_HEAD(&tp->list); + return tp; +error: + kfree(tp->symbol); + kfree(tp); + return ERR_PTR(-ENOMEM); +} + +static void free_trace_probe(struct trace_probe *tp) +{ + int i; + for (i = 0; i < 
tp->nr_args; i++) + if (tp->args[i].func == fetch_symbol) + free_symbol_cache(tp->args[i].data); + else if (tp->args[i].func == fetch_indirect) + free_indirect_fetch_data(tp->args[i].data); + + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); +} + +static struct trace_probe *find_probe_event(const char *event) +{ + struct trace_probe *tp; + list_for_each_entry(tp, &probe_list, list) + if (tp->call.name && !strcmp(tp->call.name, event)) + return tp; + return NULL; +} + +static void __unregister_trace_probe(struct trace_probe *tp) +{ + if (probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->kp); +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + if (tp->call.name) + unregister_probe_event(tp); + __unregister_trace_probe(tp); + list_del(&tp->list); +} + +/* Register a trace_probe and probe_event */ +static int register_trace_probe(struct trace_probe *tp) +{ + struct trace_probe *old_tp; + int ret; + + mutex_lock(&probe_lock); + + if (probe_is_return(tp)) + ret = register_kretprobe(&tp->rp); + else + ret = register_kprobe(&tp->kp); + + if (ret) { + pr_warning("Could not insert probe(%d)\n", ret); + if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + probe_address(tp)); + ret = -EINVAL; + } + goto end; + } + /* register as an event */ + if (tp->call.name) { + old_tp = find_probe_event(tp->call.name); + if (old_tp) { + /* delete old event */ + unregister_trace_probe(old_tp); + free_trace_probe(old_tp); + } + ret = register_probe_event(tp); + if (ret) { + pr_warning("Faild to register probe event(%d)\n", ret); + __unregister_trace_probe(tp); + } + } + list_add_tail(&tp->list, &probe_list); +end: + mutex_unlock(&probe_lock); + return ret; +} + +/* Split symbol and offset. 
*/ +static int split_symbol_offset(char *symbol, long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (!tmp) + tmp = strchr(symbol, '-'); + + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtol(tmp + 1, 0, offset); + if (ret) + return ret; + if (*tmp == '-') + *offset = -(*offset); + *tmp = '\0'; + } else + *offset = 0; + return 0; +} + +#define PARAM_MAX_ARGS 16 +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_trace_arg(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case 'a': /* argument */ + ret = strict_strtoul(arg + 1, 10, &param); + if (ret || param > PARAM_MAX_ARGS) + ret = -EINVAL; + else { + ff->func = fetch_argument; + ff->data = (void *)param; + } + break; + case 'r': /* retval or retaddr */ + if (is_return && arg[1] == 'v') { + ff->func = fetch_retvalue; + ff->data = NULL; + } else if (is_return && arg[1] == 'a') { + ff->func = fetch_ip; + ff->data = NULL; + } else + ret = -EINVAL; + break; + case '%': /* named register */ + ret = query_register_offset(arg + 1); + if (ret >= 0) { + ff->func = fetch_register; + ff->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + case 's': /* stack */ + ret = strict_strtoul(arg + 1, 10, &param); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + ff->func = fetch_stack; + ff->data = (void *)param; + } + break; + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, &param); + if (ret) + break; + ff->func = fetch_memory; + ff->data = (void *)param; + } else { + ret = split_symbol_offset(arg + 1, &offset); + if (ret) + break; + ff->data = alloc_symbol_cache(arg + 1, + offset); + if (ff->data) + ff->func = fetch_symbol; + else + ret = -EINVAL; + } + break; + case '+': /* indirect memory */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) { + ret = 
-EINVAL; + break; + } + *tmp = '\0'; + ret = strict_strtol(arg + 1, 0, &offset); + if (ret) + break; + if (arg[0] == '-') + offset = -offset; + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (tmp) { + struct indirect_fetch_data *id; + *tmp = '\0'; + id = kzalloc(sizeof(struct indirect_fetch_data), + GFP_KERNEL); + if (!id) + return -ENOMEM; + id->offset = offset; + ret = parse_trace_arg(arg, &id->orig, is_return); + if (ret) + kfree(id); + else { + ff->func = fetch_indirect; + ff->data = (void *)id; + } + } else + ret = -EINVAL; + break; + default: + /* TODO: support custom handler */ + ret = -EINVAL; + } + return ret; +} + +static int create_trace_probe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p[:EVENT] SYMBOL[+OFFS|-OFFS]|ADDRESS [FETCHARGS] + * - Add kretprobe: r[:EVENT] SYMBOL[+0] [FETCHARGS] + * Fetch args: + * aN : fetch Nth of function argument. (N:0-) + * rv : fetch return value + * ra : fetch return address + * sN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Indirect memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. 
+ */ + struct trace_probe *tp; + struct kprobe *kp; + int i, ret = 0; + int is_return = 0; + char *symbol = NULL, *event = NULL; + long offset = 0; + void *addr = NULL; + + if (argc < 2) + return -EINVAL; + + if (argv[0][0] == 'p') + is_return = 0; + else if (argv[0][0] == 'r') + is_return = 1; + else + return -EINVAL; + + if (argv[0][1] == ':') { + event = &argv[0][2]; + if (strlen(event) == 0) { + pr_info("Event name is not specifiled\n"); + return -EINVAL; + } + } + + if (isdigit(argv[1][0])) { + if (is_return) + return -EINVAL; + /* an address specified */ + ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + if (ret) + return ret; + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = split_symbol_offset(symbol, &offset); + if (ret) + return ret; + if (offset && is_return) + return -EINVAL; + } + + /* setup a probe */ + tp = alloc_trace_probe(symbol, event); + if (IS_ERR(tp)) + return PTR_ERR(tp); + + if (is_return) { + kp = &tp->rp.kp; + tp->rp.handler = kretprobe_trace_func; + } else { + kp = &tp->kp; + tp->kp.pre_handler = kprobe_trace_func; + } + + if (tp->symbol) { + kp->symbol_name = tp->symbol; + kp->offset = offset; + } else + kp->addr = addr; + + /* parse arguments */ + argc -= 2; argv += 2; ret = 0; + for (i = 0; i < argc && i < TRACE_KPROBE_ARGS; i++) { + if (strlen(argv[i]) > MAX_ARGSTR_LEN) { + pr_info("Argument%d(%s) is too long.\n", i, argv[i]); + ret = -ENOSPC; + goto error; + } + ret = parse_trace_arg(argv[i], &tp->args[i], is_return); + if (ret) + goto error; + } + tp->nr_args = i; + + ret = register_trace_probe(tp); + if (ret) + goto error; + return 0; + +error: + free_trace_probe(tp); + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_probe *tp; + mutex_lock(&probe_lock); + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tp = list_entry(probe_list.next, struct trace_probe, list); + unregister_trace_probe(tp); + 
free_trace_probe(tp); + } + mutex_unlock(&probe_lock); +} + + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + int i, ret; + char buf[MAX_ARGSTR_LEN + 1]; + + if (tp == NULL) + return 0; + + seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + if (tp->call.name) + seq_printf(m, ":%s", tp->call.name); + + if (tp->symbol) + seq_printf(m, " %s%+ld", probe_symbol(tp), probe_offset(tp)); + else + seq_printf(m, " 0x%p", probe_address(tp)); + + for (i = 0; i < tp->nr_args; i++) { + ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]); + if (ret) { + pr_warning("Argument%d is too long.\n", i); + break; + } + seq_printf(m, " %s", buf); + } + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 128 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + if (!count || count < 0) + 
return 0; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + do { + size = count - done; + if (size > WRITE_BUFSIZE) + size = WRITE_BUFSIZE; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (!tmp) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + *tmp = '\0'; + size = tmp - kbuf + 1; + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + + } while (done < count); + ret = done; +out: + kfree(kbuf); + return ret; +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Kprobe handler */ +static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, kp); + struct kprobe_trace_entry *entry; + struct ring_buffer_event *event; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &event_kprobe; + if (&tp->call.name) + call = &tp->call; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = sizeof(struct kprobe_trace_entry) - + (sizeof(unsigned long) * (TRACE_KPROBE_ARGS - tp->nr_args)); + + event = trace_current_buffer_lock_reserve(TRACE_KPROBE, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i], regs); + + if (!filter_current_check_discard(call, entry, event)) + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); + return 0; +} + +/* Kretprobe handler */ +static __kprobes int 
kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct kretprobe_trace_entry *entry; + struct ring_buffer_event *event; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &event_kretprobe; + if (&tp->call.name) + call = &tp->call; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = sizeof(struct kretprobe_trace_entry) - + (sizeof(unsigned long) * (TRACE_KPROBE_ARGS - tp->nr_args)); + + event = trace_current_buffer_lock_reserve(TRACE_KRETPROBE, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->func = (unsigned long)probe_address(tp); + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i], regs); + + if (!filter_current_check_discard(call, entry, event)) + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); + + return 0; +} + +/* Event entry printers */ +enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags) +{ + struct kprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + int i; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ":")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " 0x%lx", field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags) +{ + struct kretprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + int i; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, " <- ")) + goto 
partial; + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ":")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " 0x%lx", field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event kprobe_trace_event = { + .type = TRACE_KPROBE, + .trace = print_kprobe_event, +}; + +static struct trace_event kretprobe_trace_event = { + .type = TRACE_KRETPROBE, + .trace = print_kretprobe_event, +}; + +static int probe_event_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = container_of(call, struct trace_probe, call); + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->kp); +} + +static void probe_event_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = container_of(call, struct trace_probe, call); + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->kp); +} + +static int probe_event_raw_init(struct ftrace_event_call *event_call) +{ + INIT_LIST_HEAD(&event_call->fields); + init_preds(event_call); + return 0; +} + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed);\ + if (ret) \ + return ret; \ + } while (0) + +static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kprobe_trace_entry field; + char buf[MAX_ARGSTR_LEN + 1]; + struct trace_probe *tp = container_of(event_call, + struct trace_probe, call); + + __common_field(int, type, 1); + __common_field(unsigned char, flags, 0); + __common_field(unsigned char, preempt_count, 0); + __common_field(int, pid, 1); + __common_field(int, tgid, 1); + + DEFINE_FIELD(unsigned long, ip, "ip", 0); + 
DEFINE_FIELD(int, nargs, "nargs", 1); + for (i = 0; i < tp->nr_args; i++) { + /* Set argN as a field */ + sprintf(buf, "arg%d", i); + DEFINE_FIELD(unsigned long, args[i], buf, 0); + /* Set argument string as an alias field */ + ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]); + if (ret) + return ret; + DEFINE_FIELD(unsigned long, args[i], buf, 0); + } + return 0; +} + +static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kretprobe_trace_entry field; + char buf[MAX_ARGSTR_LEN + 1]; + struct trace_probe *tp = container_of(event_call, + struct trace_probe, call); + + __common_field(int, type, 1); + __common_field(unsigned char, flags, 0); + __common_field(unsigned char, preempt_count, 0); + __common_field(int, pid, 1); + __common_field(int, tgid, 1); + + DEFINE_FIELD(unsigned long, func, "func", 0); + DEFINE_FIELD(unsigned long, ret_ip, "ret_ip", 0); + DEFINE_FIELD(int, nargs, "nargs", 1); + for (i = 0; i < tp->nr_args; i++) { + /* Set argN as a field */ + sprintf(buf, "arg%d", i); + DEFINE_FIELD(unsigned long, args[i], buf, 0); + /* Set argument string as an alias field */ + ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]); + if (ret) + return ret; + DEFINE_FIELD(unsigned long, args[i], buf, 0); + } + return 0; +} + +#undef SHOW_FIELD +#define SHOW_FIELD(type, item, name) \ + do { \ + ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ + "offset:%u;tsize:%u;\n", name, \ + (unsigned)offsetof(typeof(field), item),\ + (unsigned)sizeof(type)); \ + if (!ret) \ + return 0; \ + } while (0) + +static int __probe_event_show_format(struct ftrace_event_call *event_call, + struct trace_seq *s, const char *fmt, + const char *arg) +{ + struct kprobe_trace_entry field __attribute__((unused)); + int ret, i; + char buf[MAX_ARGSTR_LEN + 1]; + struct trace_probe *tp = container_of(event_call, + struct trace_probe, call); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) { + sprintf(buf, "arg%d", i); + 
SHOW_FIELD(unsigned long, args[i], buf); + } + trace_seq_puts(s, "\n"); + + /* Show aliases */ + for (i = 0; i < tp->nr_args; i++) { + if (trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i])) + return 0; + if (!trace_seq_printf(s, "\talias: %s;\toriginal: arg%d;\n", + buf, i)) + return 0; + } + /* Show format */ + if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_puts(s, " 0x%lx")) + return 0; + + if (!trace_seq_printf(s, "\", %s", arg)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, ", arg%d", i)) + return 0; + + return trace_seq_puts(s, "\n"); +} + +static int kprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kprobe_trace_entry field __attribute__((unused)); + int ret; + + SHOW_FIELD(unsigned long, ip, "ip"); + SHOW_FIELD(int, nargs, "nargs"); + + return __probe_event_show_format(call, s, "%lx:", "ip"); +} + +static int kretprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kretprobe_trace_entry field __attribute__((unused)); + int ret; + + SHOW_FIELD(unsigned long, func, "func"); + SHOW_FIELD(unsigned long, ret_ip, "ret_ip"); + SHOW_FIELD(int, nargs, "nargs"); + + return __probe_event_show_format(call, s, "%lx <- %lx:", + "func, ret_ip"); +} + +static int register_probe_event(struct trace_probe *tp) +{ + struct ftrace_event_call *call = &tp->call; + int ret; + + /* Initialize ftrace_event_call */ + call->system = "kprobes"; + if (probe_is_return(tp)) { + call->event = &kretprobe_trace_event; + call->id = TRACE_KRETPROBE; + call->raw_init = probe_event_raw_init; + call->show_format = kretprobe_event_show_format; + call->define_fields = kretprobe_event_define_fields; + } else { + call->event = &kprobe_trace_event; + call->id = TRACE_KPROBE; + call->raw_init = probe_event_raw_init; + call->show_format = kprobe_event_show_format; + call->define_fields = kprobe_event_define_fields; + } + 
call->enabled = 1; + call->regfunc = probe_event_enable; + call->unregfunc = probe_event_disable; + ret = trace_add_event_call(call); + if (ret) + pr_info("Failed to register kprobe event: %s\n", call->name); + return ret; +} + +static void unregister_probe_event(struct trace_probe *tp) +{ + /* + * Prevent to unregister event itself because the event is shared + * among other probes. + */ + tp->call.event = NULL; + trace_remove_event_call(&tp->call); +} + +/* Make a debugfs interface for controling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + int ret; + + ret = register_ftrace_event(&kprobe_trace_event); + if (!ret) { + pr_warning("Could not register kprobe_trace_event type.\n"); + return 0; + } + ret = register_ftrace_event(&kretprobe_trace_event); + if (!ret) { + pr_warning("Could not register kretprobe_trace_event type.\n"); + return 0; + } + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret; + int (*target)(int, int, int, int, int, int); + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " + "a1 a2 a3 a4 a5 a6"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function entry\n"); + + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " + "ra rv"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function return\n"); + + ret = target(1, 2, 3, 4, 5, 6); + + cleanup_all_probes(); + + 
pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-29 00:01:20
|
Add an x86 instruction decoder to the arch-specific libraries. This decoder can decode x86 instructions used in the kernel into prefix, opcode, modrm, sib, displacement and immediates. It can also show the length of instructions. This version introduces instruction attributes for decoding instructions. The instruction attribute tables are generated from the opcode map file (x86-opcode-map.txt) by the generator script (gen-insn-attr-x86.awk). Currently, the opcode maps are based on the opcode maps in the Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2: Appendix A, and consist of the following two types of opcode tables. 1-byte/2-byte/3-byte opcode tables, each of which has 256 elements, are written as below; Table: table-name Referrer: escaped-name opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] (or) opcode: escape # escaped-name EndTable Group opcode tables, each of which has 8 elements, are written as below; GrpTable: GrpXXX reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] EndTable These opcode maps do NOT include most of the SSE and FP opcodes, because those opcodes are not used in the kernel. Changes from v6.1: - fix patch title. Signed-off-by: Masami Hiramatsu <mhi...@re...> Signed-off-by: Jim Keniston <jke...@us...> Cc: H. 
Peter Anvin <hp...@zy...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: Andi Kleen <ak...@li...> Cc: Vegard Nossum <veg...@gm...> Cc: Avi Kivity <av...@re...> Cc: Przemysław Pawełczyk <prz...@pa...> --- arch/x86/include/asm/inat.h | 125 ++++++ arch/x86/include/asm/insn.h | 134 ++++++ arch/x86/lib/Makefile | 13 + arch/x86/lib/inat.c | 80 ++++ arch/x86/lib/insn.c | 471 +++++++++++++++++++++ arch/x86/lib/x86-opcode-map.txt | 711 ++++++++++++++++++++++++++++++++ arch/x86/scripts/gen-insn-attr-x86.awk | 314 ++++++++++++++ 7 files changed, 1848 insertions(+), 0 deletions(-) create mode 100644 arch/x86/include/asm/inat.h create mode 100644 arch/x86/include/asm/insn.h create mode 100644 arch/x86/lib/inat.c create mode 100644 arch/x86/lib/insn.c create mode 100644 arch/x86/lib/x86-opcode-map.txt create mode 100644 arch/x86/scripts/gen-insn-attr-x86.awk diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h new file mode 100644 index 0000000..01e079a --- /dev/null +++ b/arch/x86/include/asm/inat.h @@ -0,0 +1,125 @@ +#ifndef _ASM_INAT_INAT_H +#define _ASM_INAT_INAT_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu <mhi...@re...> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include <linux/types.h> + +/* Instruction attributes */ +typedef u32 insn_attr_t; + +/* + * Internal bits. Don't use bitmasks directly, because these bits are + * unstable. You should add checking macros and use that macro in + * your code. + */ + +#define INAT_OPCODE_TABLE_SIZE 256 +#define INAT_GROUP_TABLE_SIZE 8 + +/* Legacy instruction prefixes */ +#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ +#define INAT_PFX_REPNE 2 /* 0xF2 */ /* LPFX2 */ +#define INAT_PFX_REPE 3 /* 0xF3 */ /* LPFX3 */ +#define INAT_PFX_LOCK 4 /* 0xF0 */ +#define INAT_PFX_CS 5 /* 0x2E */ +#define INAT_PFX_DS 6 /* 0x3E */ +#define INAT_PFX_ES 7 /* 0x26 */ +#define INAT_PFX_FS 8 /* 0x64 */ +#define INAT_PFX_GS 9 /* 0x65 */ +#define INAT_PFX_SS 10 /* 0x36 */ +#define INAT_PFX_ADDRSZ 11 /* 0x67 */ + +#define INAT_LPREFIX_MAX 3 + +/* Immediate size */ +#define INAT_IMM_BYTE 1 +#define INAT_IMM_WORD 2 +#define INAT_IMM_DWORD 3 +#define INAT_IMM_QWORD 4 +#define INAT_IMM_PTR 5 +#define INAT_IMM_VWORD32 6 +#define INAT_IMM_VWORD 7 + +/* Legacy prefix */ +#define INAT_PFX_OFFS 0 +#define INAT_PFX_BITS 4 +#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) +#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) +/* Escape opcodes */ +#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS) +#define INAT_ESC_BITS 2 +#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1) +#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS) +/* Group opcodes (1-16) */ +#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS) +#define INAT_GRP_BITS 5 +#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1) +#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS) +/* Immediates */ +#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS) +#define INAT_IMM_BITS 3 +#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << 
INAT_IMM_OFFS) +/* Flags */ +#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) +#define INAT_REXPFX (1 << INAT_FLAG_OFFS) +#define INAT_MODRM (1 << (INAT_FLAG_OFFS + 1)) +#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 2)) +#define INAT_ADDIMM (1 << (INAT_FLAG_OFFS + 3)) +#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 4)) +#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 5)) + +/* Attribute search APIs */ +extern insn_attr_t inat_get_opcode_attribute(u8 opcode); +extern insn_attr_t inat_get_escape_attribute(u8 opcode, u8 last_pfx, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_group_attribute(u8 modrm, u8 last_pfx, + insn_attr_t grp_attr); + +/* Attribute checking macros. Use these in your code; arguments are parenthesized, INAT_LPREFIX_NUM evaluates attr twice */ +#define INAT_IS_PREFIX(attr) ((attr) & INAT_PFX_MASK) +#define INAT_IS_ADDRSZ(attr) (((attr) & INAT_PFX_MASK) == INAT_PFX_ADDRSZ) +#define INAT_IS_OPNDSZ(attr) (((attr) & INAT_PFX_MASK) == INAT_PFX_OPNDSZ) +#define INAT_LPREFIX_NUM(attr) \ + ((((attr) & INAT_PFX_MASK) > INAT_LPREFIX_MAX) ? 0 :\ + ((attr) & INAT_PFX_MASK)) +#define INAT_MAKE_PREFIX(pfx) ((pfx) << INAT_PFX_OFFS) + +#define INAT_IS_ESCAPE(attr) ((attr) & INAT_ESC_MASK) +#define INAT_ESCAPE_NUM(attr) (((attr) & INAT_ESC_MASK) >> INAT_ESC_OFFS) +#define INAT_MAKE_ESCAPE(esc) ((esc) << INAT_ESC_OFFS) + +#define INAT_IS_GROUP(attr) ((attr) & INAT_GRP_MASK) +#define INAT_GROUP_NUM(attr) (((attr) & INAT_GRP_MASK) >> INAT_GRP_OFFS) +#define INAT_GROUP_COMMON(attr) ((attr) & ~INAT_GRP_MASK) +#define INAT_MAKE_GROUP(grp) (((grp) << INAT_GRP_OFFS) | INAT_MODRM) + +#define INAT_HAS_IMM(attr) ((attr) & INAT_IMM_MASK) +#define INAT_IMM_SIZE(attr) (((attr) & INAT_IMM_MASK) >> INAT_IMM_OFFS) +#define INAT_MAKE_IMM(imm) ((imm) << INAT_IMM_OFFS) + +#define INAT_IS_REX_PREFIX(attr) ((attr) & INAT_REXPFX) +#define INAT_HAS_MODRM(attr) ((attr) & INAT_MODRM) +#define INAT_IS_FORCE64(attr) ((attr) & INAT_FORCE64) +#define INAT_HAS_ADDIMM(attr) ((attr) & INAT_ADDIMM) +#define INAT_HAS_MOFFSET(attr) ((attr) & INAT_MOFFSET) +#define INAT_HAS_VARIANT(attr) ((attr) & 
INAT_VARIANT) + +#endif diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h new file mode 100644 index 0000000..5b50fa3 --- /dev/null +++ b/arch/x86/include/asm/insn.h @@ -0,0 +1,134 @@ +#ifndef _ASM_X86_INSN_H +#define _ASM_X86_INSN_H +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <linux/types.h> +/* insn_attr_t is defined in inat.h */ +#include <asm/inat.h> + +struct insn_field { + union { + s32 value; + u8 bytes[4]; + }; + bool got; /* true if we've run insn_get_xxx() for this field */ + u8 nbytes; +}; + +struct insn { + struct insn_field prefixes; /* + * Prefixes + * prefixes.bytes[3]: last prefix + */ + struct insn_field rex_prefix; /* REX prefix */ + struct insn_field opcode; /* + * opcode.bytes[0]: opcode1 + * opcode.bytes[1]: opcode2 + * opcode.bytes[2]: opcode3 + */ + struct insn_field modrm; + struct insn_field sib; + struct insn_field displacement; + union { + struct insn_field immediate; + struct insn_field moffset1; /* for 64bit MOV */ + struct insn_field immediate1; /* for 64bit imm or off16/32 */ + }; + union { + struct insn_field moffset2; /* for 64bit MOV */ + struct insn_field immediate2; /* for 64bit imm or seg16 */ + }; + + insn_attr_t attr; + u8 opnd_bytes; + u8 addr_bytes; + u8 length; + bool x86_64; + + const u8 *kaddr; /* kernel address of insn (copy) to analyze */ + const u8 *next_byte; +}; + +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) + +#define MODRM_MOD(insn) (((insn)->modrm.value & 0xc0) >> 6) +#define MODRM_REG(insn) (((insn)->modrm.value & 0x38) >> 3) +#define MODRM_RM(insn) ((insn)->modrm.value & 0x07) + +#define SIB_SCALE(insn) (((insn)->sib.value & 0xc0) >> 6) +#define SIB_INDEX(insn) (((insn)->sib.value & 0x38) >> 3) +#define SIB_BASE(insn) ((insn)->sib.value & 0x07) + +#define REX_W(insn) ((insn)->rex_prefix.value & 8) +#define REX_R(insn) ((insn)->rex_prefix.value & 4) +#define REX_X(insn) ((insn)->rex_prefix.value & 2) +#define REX_B(insn) ((insn)->rex_prefix.value & 1) + +/* The last prefix is needed for two-byte and three-byte opcodes */ +#define LAST_PREFIX(insn) ((insn)->prefixes.bytes[3]) + +#define MOFFSET64(insn) 
(((u64)((insn)->moffset2.value) << 32) | \ + (u32)((insn)->moffset1.value)) + +#define IMMEDIATE64(insn) (((u64)((insn)->immediate2.value) << 32) | \ + (u32)((insn)->immediate1.value)) + +extern void insn_init(struct insn *insn, const u8 *kaddr, bool x86_64); +extern void insn_get_prefixes(struct insn *insn); +extern void insn_get_opcode(struct insn *insn); +extern void insn_get_modrm(struct insn *insn); +extern void insn_get_sib(struct insn *insn); +extern void insn_get_displacement(struct insn *insn); +extern void insn_get_immediate(struct insn *insn); +extern void insn_get_length(struct insn *insn); + +/* Attribute will be determined after getting ModRM (for opcode groups) */ +static inline void insn_get_attr(struct insn *insn) +{ + insn_get_modrm(insn); +} + +/* Instruction uses RIP-relative addressing */ +extern bool insn_rip_relative(struct insn *insn); + +#ifdef CONFIG_X86_64 +/* Init insn for kernel text */ +#define kernel_insn_init(insn, kaddr) insn_init(insn, kaddr, 1) +#else /* CONFIG_X86_32 */ +#define kernel_insn_init(insn, kaddr) insn_init(insn, kaddr, 0) +#endif + +#define INSN_PREFIXES_OFFS(insn) (0) +#define INSN_REXPREFIX_OFFS(insn) ((insn)->prefixes.nbytes) +#define INSN_OPCODE_OFFS(insn) (INSN_REXPREFIX_OFFS(insn) + \ + ((insn)->rex_prefix.nbytes)) +#define INSN_MODRM_OFFS(insn) (INSN_OPCODE_OFFS(insn) + \ + ((insn)->opcode.nbytes)) +#define INSN_SIB_OFFS(insn) (INSN_MODRM_OFFS(insn) + \ + ((insn)->modrm.nbytes)) +#define INSN_DISPLACEMENT_OFFS(insn) (INSN_SIB_OFFS(insn) + \ + ((insn)->sib.nbytes)) +#define INSN_IMMEDIATE_OFFS(insn) (INSN_DISPLACEMENT_OFFS(insn) + \ + ((insn)->displacement.nbytes)) + +#endif /* _ASM_X86_INSN_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 55e11aa..db0e3be 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,12 +2,25 @@ # Makefile for x86 specific library files. 
# +inat_tables_script = $(srctree)/arch/x86/scripts/gen-insn-attr-x86.awk +inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt +quiet_cmd_inat_tables = GEN $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + +$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call cmd,inat_tables) + +$(obj)/inat.o: $(obj)/inat-tables.c + +clean-files := inat-tables.c + obj-$(CONFIG_SMP) := msr-on-cpu.o lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += insn.o inat.o ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c new file mode 100644 index 0000000..d6a34be --- /dev/null +++ b/arch/x86/lib/inat.c @@ -0,0 +1,80 @@ +/* + * x86 instruction attribute tables + * + * Written by Masami Hiramatsu <mhi...@re...> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + */ +#include <linux/module.h> +#include <asm/insn.h> + +/* Attribute tables are generated from opcode map */ +#include "inat-tables.c" + +/* Attribute search APIs */ +insn_attr_t inat_get_opcode_attribute(u8 opcode) +{ + return inat_primary_table[opcode]; +} + +insn_attr_t inat_get_escape_attribute(u8 opcode, u8 last_pfx, + insn_attr_t esc_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = INAT_ESCAPE_NUM(esc_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = INAT_LPREFIX_NUM(lpfx_attr); /* 0 unless last_pfx is 0x66/0xF2/0xF3 */ + } + table = inat_escape_tables[n][0]; /* table used when no variant applies */ + if (!table) + return 0; + if (INAT_HAS_VARIANT(table[opcode]) && m) { + table = inat_escape_tables[n][m]; /* variant selected by the last prefix */ + if (!table) + return 0; + } + return table[opcode]; +} + +#define REGBITS(modrm) (((modrm) >> 3) & 0x7) /* bits 5:3 of the ModRM byte */ + +insn_attr_t inat_get_group_attribute(u8 modrm, u8 last_pfx, + insn_attr_t grp_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = INAT_GROUP_NUM(grp_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = INAT_LPREFIX_NUM(lpfx_attr); /* 0 unless last_pfx is 0x66/0xF2/0xF3 */ + } + table = inat_group_tables[n][0]; /* table used when no variant applies */ + if (!table) + return INAT_GROUP_COMMON(grp_attr); + if (INAT_HAS_VARIANT(table[REGBITS(modrm)]) && m) { + table = inat_group_tables[n][m]; /* n is a group number: index the group tables, not the escape tables */ + if (!table) + return INAT_GROUP_COMMON(grp_attr); + } + return table[REGBITS(modrm)] | INAT_GROUP_COMMON(grp_attr); +} + diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c new file mode 100644 index 0000000..254c848 --- /dev/null +++ b/arch/x86/lib/insn.c @@ -0,0 +1,471 @@ +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004, 2009 + */ + +#include <linux/string.h> +#include <linux/module.h> +#include <asm/inat.h> +#include <asm/insn.h> + +#define get_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define peek_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; r; }) + +/** + * insn_init() - initialize struct insn + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @x86_64: true for 64-bit kernel or 64-bit app + */ +void insn_init(struct insn *insn, const u8 *kaddr, bool x86_64) +{ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->next_byte = kaddr; + insn->x86_64 = x86_64; + insn->opnd_bytes = 4; + if (x86_64) + insn->addr_bytes = 8; + else + insn->addr_bytes = 4; +} +EXPORT_SYMBOL_GPL(insn_init); + +/** + * insn_get_prefixes - scan x86 instruction prefix bytes + * @insn: &struct insn containing instruction + * + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte + * to point to the (first) opcode. No effect if @insn->prefixes.got + * is already true. 
+ */ +void insn_get_prefixes(struct insn *insn) +{ + struct insn_field *prefixes = &insn->prefixes; + insn_attr_t attr; + u8 b, lb, i, nb; + + if (prefixes->got) + return; + + nb = 0; + lb = 0; + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + while (INAT_IS_PREFIX(attr)) { + /* Do not store a prefix byte that is already recorded */ + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == b) + goto found; + if (nb == 4) + /* Invalid instruction: no room for another prefix */ + break; + prefixes->bytes[nb++] = b; + if (INAT_IS_ADDRSZ(attr)) { + /* toggle address size: 8 <-> 4 (64-bit) or 4 <-> 2 */ + if (insn->x86_64) + insn->addr_bytes ^= 12; + else + insn->addr_bytes ^= 6; + } else if (INAT_IS_OPNDSZ(attr)) { + /* toggle operand size: 4 <-> 2 */ + insn->opnd_bytes ^= 6; + } +found: + prefixes->nbytes++; + insn->next_byte++; + lb = b; + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + } + /* Store the last prefix seen in the LAST_PREFIX() slot */ + if (lb && lb != LAST_PREFIX(insn)) { + if (unlikely(LAST_PREFIX(insn))) { + /* Swap the last prefix with the byte occupying that slot */ + b = LAST_PREFIX(insn); + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == lb) + prefixes->bytes[i] = b; + } + LAST_PREFIX(insn) = lb; + } + + if (insn->x86_64) { + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + if (INAT_IS_REX_PREFIX(attr)) { + insn->rex_prefix.value = b; + insn->rex_prefix.nbytes = 1; + insn->next_byte++; + if (REX_W(insn)) + /* REX.W overrides the operand size */ + insn->opnd_bytes = 8; + } + } + insn->rex_prefix.got = true; + prefixes->got = true; + return; +} +EXPORT_SYMBOL_GPL(insn_get_prefixes); + +/** + * insn_get_opcode - collect opcode(s) + * @insn: &struct insn containing instruction + * + * Populates @insn->opcode, updates @insn->next_byte to point past the + * opcode byte(s), and sets @insn->attr (except for groups). + * If necessary, first collects any preceding (prefix) bytes. + * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got + * is already true. 
+ * + */ +void insn_get_opcode(struct insn *insn) +{ + struct insn_field *opcode = &insn->opcode; + u8 op, pfx; + if (opcode->got) + return; + if (!insn->prefixes.got) + insn_get_prefixes(insn); + + /* Get first opcode */ + op = get_next(u8, insn); + OPCODE1(insn) = op; + opcode->nbytes = 1; + insn->attr = inat_get_opcode_attribute(op); + while (INAT_IS_ESCAPE(insn->attr)) { + /* Get escaped opcode */ + op = get_next(u8, insn); + opcode->bytes[opcode->nbytes++] = op; + pfx = LAST_PREFIX(insn); + insn->attr = inat_get_escape_attribute(op, pfx, insn->attr); + } + opcode->got = true; +} +EXPORT_SYMBOL_GPL(insn_get_opcode); + +/** + * insn_get_modrm - collect ModRM byte, if any + * @insn: &struct insn containing instruction + * + * Populates @insn->modrm and updates @insn->next_byte to point past the + * ModRM byte, if any. If necessary, first collects the preceding bytes + * (prefixes and opcode(s)). No effect if @insn->modrm.got is already true. + */ +void insn_get_modrm(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + u8 pfx, mod; + if (modrm->got) + return; + if (!insn->opcode.got) + insn_get_opcode(insn); + + if (INAT_HAS_MODRM(insn->attr)) { + mod = get_next(u8, insn); + modrm->value = mod; + modrm->nbytes = 1; + if (INAT_IS_GROUP(insn->attr)) { + pfx = LAST_PREFIX(insn); + insn->attr = inat_get_group_attribute(mod, pfx, + insn->attr); + } + } + + if (insn->x86_64 && INAT_IS_FORCE64(insn->attr)) + insn->opnd_bytes = 8; + modrm->got = true; +} +EXPORT_SYMBOL_GPL(insn_get_modrm); + + +/** + * insn_rip_relative() - Does instruction use RIP-relative addressing mode? + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->x86_64 is false. 
+ */ +bool insn_rip_relative(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + + if (!insn->x86_64) + return false; + if (!modrm->got) + insn_get_modrm(insn); + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); +} +EXPORT_SYMBOL_GPL(insn_rip_relative); + +/** + * + * insn_get_sib() - Get the SIB byte of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->sib.got is already true. + */ +void insn_get_sib(struct insn *insn) +{ + if (insn->sib.got) + return; + if (!insn->modrm.got) + insn_get_modrm(insn); + if (insn->modrm.nbytes) + if (insn->addr_bytes != 2 && + MODRM_MOD(insn) != 3 && MODRM_RM(insn) == 4) { + insn->sib.value = get_next(u8, insn); + insn->sib.nbytes = 1; + } + insn->sib.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_sib); + + +/** + * + * insn_get_displacement() - Get the displacement of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * SIB byte. No effect if @insn->displacement.got is already true. + * Displacement value is sign-extended. + */ +void insn_get_displacement(struct insn *insn) +{ + u8 mod; + if (insn->displacement.got) + return; + if (!insn->sib.got) + insn_get_sib(insn); + if (insn->modrm.nbytes) { + /* + * Interpreting the modrm byte: + * mod = 00 - no displacement fields (exceptions below) + * mod = 01 - 1-byte displacement field + * mod = 10 - displacement field is 4 bytes, or 2 bytes if + * address size = 2 (0x67 prefix in 32-bit mode) + * mod = 11 - no memory operand + * + * If address size = 2... + * mod = 00, r/m = 110 - displacement field is 2 bytes + * + * If address size != 2... 
+ * mod != 11, r/m = 100 - SIB byte exists + * mod = 00, SIB base = 101 - displacement field is 4 bytes + * mod = 00, r/m = 101 - rip-relative addressing, displacement + * field is 4 bytes + */ + mod = MODRM_MOD(insn); + if (mod == 3) + goto out; + if (mod == 1) { + insn->displacement.value = get_next(s8, insn); + insn->displacement.nbytes = 1; + } else if (insn->addr_bytes == 2) { + if ((mod == 0 && MODRM_RM(insn) == 6) || mod == 2) { + insn->displacement.value = get_next(s16, insn); + insn->displacement.nbytes = 2; + } + } else { + if ((mod == 0 && MODRM_RM(insn) == 5) || mod == 2 || + (mod == 0 && SIB_BASE(insn) == 5)) { + insn->displacement.value = get_next(s32, insn); + insn->displacement.nbytes = 4; + } + } + } +out: + insn->displacement.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_displacement); + +/* Decode moffset16/32/64 */ +static void __get_moffset(struct insn *insn) +{ + switch (insn->addr_bytes) { + case 2: + insn->moffset1.value = get_next(s16, insn); + insn->moffset1.nbytes = 2; + break; + case 4: + insn->moffset1.value = get_next(s32, insn); + insn->moffset1.nbytes = 4; + break; + case 8: + insn->moffset1.value = get_next(s32, insn); + insn->moffset1.nbytes = 4; + insn->moffset2.value = get_next(s32, insn); + insn->moffset2.nbytes = 4; + break; + } + insn->moffset1.got = insn->moffset2.got = true; +} + +/* Decode imm v32(Iz) */ +static void __get_immv32(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate.value = get_next(s16, insn); + insn->immediate.nbytes = 2; + break; + case 4: + case 8: + insn->immediate.value = get_next(s32, insn); + insn->immediate.nbytes = 4; + break; + } +} + +/* Decode imm v64(Iv/Ov) */ +static void __get_immv(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(s16, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + insn->immediate1.value = 
get_next(s32, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(s32, insn); + insn->immediate2.nbytes = 4; + break; + } + insn->immediate1.got = insn->immediate2.got = true; +} + +/* Decode ptr16:16/32 (Ap): 16/32-bit part into immediate1, trailing 16 bits into immediate2 */ +static void __get_immptr(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(s16, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + /* ptr16:64 is not supported (no segment) */ + WARN_ON(1); + return; + } + insn->immediate2.value = get_next(u16, insn); + insn->immediate2.nbytes = 2; + insn->immediate1.got = insn->immediate2.got = true; +} + +/** + * + * insn_get_immediate() - Get the immediates of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * displacement bytes. + * Most immediates are sign-extended. An unsigned value can be + * obtained by bit masking with ((1ULL << (nbytes * 8)) - 1) + */ +void insn_get_immediate(struct insn *insn) +{ + if (insn->immediate.got) + return; + if (!insn->displacement.got) + insn_get_displacement(insn); + + if (INAT_HAS_MOFFSET(insn->attr)) { + __get_moffset(insn); + goto done; + } + + if (!INAT_HAS_IMM(insn->attr)) + /* no immediates */ + goto done; + + switch (INAT_IMM_SIZE(insn->attr)) { + case INAT_IMM_BYTE: + insn->immediate.value = get_next(s8, insn); + insn->immediate.nbytes = 1; + break; + case INAT_IMM_WORD: + insn->immediate.value = get_next(s16, insn); + insn->immediate.nbytes = 2; + break; + case INAT_IMM_DWORD: + insn->immediate.value = get_next(s32, insn); + insn->immediate.nbytes = 4; + break; + case INAT_IMM_QWORD: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(s32, insn); + insn->immediate2.nbytes = 4; + break; + case INAT_IMM_PTR: + __get_immptr(insn); + break; + case 
INAT_IMM_VWORD32: + __get_immv32(insn); + break; + case INAT_IMM_VWORD: + __get_immv(insn); + break; + default: + break; + } + if (INAT_HAS_ADDIMM(insn->attr)) { + insn->immediate2.value = get_next(s8, insn); + insn->immediate2.nbytes = 1; + } +done: + insn->immediate.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_immediate); + +/** + * + * insn_get_length() - Get the length of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * immediates bytes. + */ +void insn_get_length(struct insn *insn) +{ + if (insn->length) + return; + if (!insn->immediate.got) + insn_get_immediate(insn); + insn->length = (u8)((unsigned long)insn->next_byte + - (unsigned long)insn->kaddr); +} +EXPORT_SYMBOL_GPL(insn_get_length); diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt new file mode 100644 index 0000000..ab2a58d --- /dev/null +++ b/arch/x86/lib/x86-opcode-map.txt @@ -0,0 +1,711 @@ +# x86 Opcode Maps +# +#<Opcode maps> +# Table: table-name +# Referrer: escaped-name +# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# (or) +# opcode: escape # escaped-name +# EndTable +# +#<group maps> +# GrpTable: GrpXXX +# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] 
+# EndTable +# + +Table: one byte opcode +Referrer: +# 0x00 - 0x0f +00: ADD Eb,Gb +01: ADD Ev,Gv +02: ADD Gb,Eb +03: ADD Gv,Ev +04: ADD AL,Ib +05: ADD rAX,Iz +06: PUSH ES (i64) +07: POP ES (i64) +08: OR Eb,Gb +09: OR Ev,Gv +0a: OR Gb,Eb +0b: OR Gv,Ev +0c: OR AL,Ib +0d: OR rAX,Iz +0e: PUSH CS (i64) +0f: escape # 2-byte escape +# 0x10 - 0x1f +10: ADC Eb,Gb +11: ADC Ev,Gv +12: ADC Gb,Eb +13: ADC Gv,Ev +14: ADC AL,Ib +15: ADC rAX,Iz +16: PUSH SS (i64) +17: POP SS (i64) +18: SBB Eb,Gb +19: SBB Ev,Gv +1a: SBB Gb,Eb +1b: SBB Gv,Ev +1c: SBB AL,Ib +1d: SBB rAX,Iz +1e: PUSH DS (i64) +1f: POP DS (i64) +# 0x20 - 0x2f +20: AND Eb,Gb +21: AND Ev,Gv +22: AND Gb,Eb +23: AND Gv,Ev +24: AND AL,Ib +25: AND rAx,Iz +26: SEG=ES (Prefix) +27: DAA (i64) +28: SUB Eb,Gb +29: SUB Ev,Gv +2a: SUB Gb,Eb +2b: SUB Gv,Ev +2c: SUB AL,Ib +2d: SUB rAX,Iz +2e: SEG=CS (Prefix) +2f: DAS (i64) +# 0x30 - 0x3f +30: XOR Eb,Gb +31: XOR Ev,Gv +32: XOR Gb,Eb +33: XOR Gv,Ev +34: XOR AL,Ib +35: XOR rAX,Iz +36: SEG=SS (Prefix) +37: AAA (i64) +38: CMP Eb,Gb +39: CMP Ev,Gv +3a: CMP Gb,Eb +3b: CMP Gv,Ev +3c: CMP AL,Ib +3d: CMP rAX,Iz +3e: SEG=DS (Prefix) +3f: AAS (i64) +# 0x40 - 0x4f +40: INC eAX (i64) | REX (o64) +41: INC eCX (i64) | REX.B (o64) +42: INC eDX (i64) | REX.X (o64) +43: INC eBX (i64) | REX.XB (o64) +44: INC eSP (i64) | REX.R (o64) +45: INC eBP (i64) | REX.RB (o64) +46: INC eSI (i64) | REX.RX (o64) +47: INC eDI (i64) | REX.RXB (o64) +48: DEC eAX (i64) | REX.W (o64) +49: DEC eCX (i64) | REX.WB (o64) +4a: DEC eDX (i64) | REX.WX (o64) +4b: DEC eBX (i64) | REX.WXB (o64) +4c: DEC eSP (i64) | REX.WR (o64) +4d: DEC eBP (i64) | REX.WRB (o64) +4e: DEC eSI (i64) | REX.WRX (o64) +4f: DEC eDI (i64) | REX.WRXB (o64) +# 0x50 - 0x5f +50: PUSH rAX/r8 (d64) +51: PUSH rCX/r9 (d64) +52: PUSH rDX/r10 (d64) +53: PUSH rBX/r11 (d64) +54: PUSH rSP/r12 (d64) +55: PUSH rBP/r13 (d64) +56: PUSH rSI/r14 (d64) +57: PUSH rDI/r15 (d64) +58: POP rAX/r8 (d64) +59: POP rCX/r9 (d64) +5a: POP rDX/r10 (d64) +5b: POP rBX/r11 (d64) +5c: POP 
rSP/r12 (d64) +5d: POP rBP/r13 (d64) +5e: POP rSI/r14 (d64) +5f: POP rDI/r15 (d64) +# 0x60 - 0x6f +60: PUSHA/PUSHAD (i64) +61: POPA/POPAD (i64) +62: BOUND Gv,Ma (i64) +63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) +64: SEG=FS (Prefix) +65: SEG=GS (Prefix) +66: Operand-Size (Prefix) +67: Address-Size (Prefix) +68: PUSH Iz (d64) +69: IMUL Gv,Ev,Iz +6a: PUSH Ib (d64) +6b: IMUL Gv,Ev,Ib +6c: INS/INSB Yb,DX +6d: INS/INSW/INSD Yz,DX +6e: OUTS/OUTSB DX,Xb +6f: OUTS/OUTSW/OUTSD DX,Xz +# 0x70 - 0x7f +70: JO Jb +71: JNO Jb +72: JB/JNAE/JC Jb +73: JNB/JAE/JNC Jb +74: JZ/JE Jb +75: JNZ/JNE Jb +76: JBE/JNA Jb +77: JNBE/JA Jb +78: JS Jb +79: JNS Jb +7a: JP/JPE Jb +7b: JNP/JPO Jb +7c: JL/JNGE Jb +7d: JNL/JGE Jb +7e: JLE/JNG Jb +7f: JNLE/JG Jb +# 0x80 - 0x8f +80: Grp1 Eb,Ib (1A) +81: Grp1 Ev,Iz (1A) +82: Grp1 Eb,Ib (1A),(i64) +83: Grp1 Ev,Ib (1A) +84: TEST Eb,Gb +85: TEST Ev,Gv +86: XCHG Eb,Gb +87: XCHG Ev,Gv +88: MOV Eb,Gb +89: MOV Ev,Gv +8a: MOV Gb,Eb +8b: MOV Gv,Ev +8c: MOV Ev,Sw +8d: LEA Gv,M +8e: MOV Sw,Ew +8f: Grp1A (1A) | POP Ev (d64) +# 0x90 - 0x9f +90: NOP | PAUSE (F3) | XCHG r8,rAX +91: XCHG rCX/r9,rAX +92: XCHG rDX/r10,rAX +93: XCHG rBX/r11,rAX +94: XCHG rSP/r12,rAX +95: XCHG rBP/r13,rAX +96: XCHG rSI/r14,rAX +97: XCHG rDI/r15,rAX +98: CBW/CWDE/CDQE +99: CWD/CDQ/CQO +9a: CALLF Ap (i64) +9b: FWAIT/WAIT +9c: PUSHF/D/Q Fv (d64) +9d: POPF/D/Q Fv (d64) +9e: SAHF +9f: LAHF +# 0xa0 - 0xaf +a0: MOV AL,Ob +a1: MOV rAX,Ov +a2: MOV Ob,AL +a3: MOV Ov,rAX +a4: MOVS/B Xb,Yb +a5: MOVS/W/D/Q Xv,Yv +a6: CMPS/B Xb,Yb +a7: CMPS/W/D Xv,Yv +a8: TEST AL,Ib +a9: TEST rAX,Iz +aa: STOS/B Yb,AL +ab: STOS/W/D/Q Yv,rAX +ac: LODS/B AL,Xb +ad: LODS/W/D/Q rAX,Xv +ae: SCAS/B AL,Yb +af: SCAS/W/D/Q rAX,Xv +# 0xb0 - 0xbf +b0: MOV AL/R8L,Ib +b1: MOV CL/R9L,Ib +b2: MOV DL/R10L,Ib +b3: MOV BL/R11L,Ib +b4: MOV AH/R12L,Ib +b5: MOV CH/R13L,Ib +b6: MOV DH/R14L,Ib +b7: MOV BH/R15L,Ib +b8: MOV rAX/r8,Iv +b9: MOV rCX/r9,Iv +ba: MOV rDX/r10,Iv +bb: MOV rBX/r11,Iv +bc: MOV rSP/r12,Iv +bd: MOV rBP/r13,Iv +be: MOV 
rSI/r14,Iv +bf: MOV rDI/r15,Iv +# 0xc0 - 0xcf +c0: Grp2 Eb,Ib (1A) +c1: Grp2 Ev,Ib (1A) +c2: RETN Iw (f64) +c3: RETN +c4: LES Gz,Mp (i64) +c5: LDS Gz,Mp (i64) +c6: Grp11 Eb,Ib (1A) +c7: Grp11 Ev,Iz (1A) +c8: ENTER Iw,Ib +c9: LEAVE (d64) +ca: RETF Iw +cb: RETF +cc: INT3 +cd: INT Ib +ce: INTO (i64) +cf: IRET/D/Q +# 0xd0 - 0xdf +d0: Grp2 Eb,1 (1A) +d1: Grp2 Ev,1 (1A) +d2: Grp2 Eb,CL (1A) +d3: Grp2 Ev,CL (1A) +d4: AAM Ib (i64) +d5: AAD Ib (i64) +d6: +d7: XLAT/XLATB +d8: ESC +d9: ESC +da: ESC +db: ESC +dc: ESC +dd: ESC +de: ESC +df: ESC +# 0xe0 - 0xef +e0: LOOPNE/LOOPNZ Jb (f64) +e1: LOOPE/LOOPZ Jb (f64) +e2: LOOP Jb (f64) +e3: JrCXZ Jb (f64) +e4: IN AL,Ib +e5: IN eAX,Ib +e6: OUT Ib,AL +e7: OUT Ib,eAX +e8: CALL Jz (f64) +e9: JMP-near Jz (f64) +ea: JMP-far Ap (i64) +eb: JMP-short Jb (f64) +ec: IN AL,DX +ed: IN eAX,DX +ee: OUT DX,AL +ef: OUT DX,eAX +# 0xf0 - 0xff +f0: LOCK (Prefix) +f1: +f2: REPNE (Prefix) +f3: REP/REPE (Prefix) +f4: HLT +f5: CMC +f6: Grp3_1 Eb (1A) +f7: Grp3_2 Ev (1A) +f8: CLC +f9: STC +fa: CLI +fb: STI +fc: CLD +fd: STD +fe: Grp4 (1A) +ff: Grp5 (1A) +EndTable + +Table: 2-byte opcode # First Byte is 0x0f +Referrer: 2-byte escape +# 0x0f 0x00-0x0f +00: Grp6 (1A) +01: Grp7 (1A) +02: LAR Gv,Ew +03: LSL Gv,Ew +04: +05: SYSCALL (o64) +06: CLTS +07: SYSRET (o64) +08: INVD +09: WBINVD +0a: +0b: UD2 (1B) +0c: +0d: NOP Ev +0e: +0f: +# 0x0f 0x10-0x1f +10: +11: +12: +13: +14: +15: +16: +17: +18: Grp16 (1A) +19: +1a: +1b: +1c: +1d: +1e: +1f: NOP Ev +# 0x0f 0x20-0x2f +20: MOV Rd,Cd +21: MOV Rd,Dd +22: MOV Cd,Rd +23: MOV Dd,Rd +24: +25: +26: +27: +28: movaps Vps,Wps | movapd Vpd,Wpd (66) +29: movaps Wps,Vps | movapd Wpd,Vpd (66) +2a: +2b: +2c: +2d: +2e: +2f: +# 0x0f 0x30-0x3f +30: WRMSR +31: RDTSC +32: RDMSR +33: RDPMC +34: SYSENTER +35: SYSEXIT +36: +37: GETSEC +38: escape # 3-byte escape 1 +39: +3a: escape # 3-byte escape 2 +3b: +3c: +3d: +3e: +3f: +# 0x0f 0x40-0x4f +40: CMOVO Gv,Ev +41: CMOVNO Gv,Ev +42: CMOVB/C/NAE Gv,Ev +43: CMOVAE/NB/NC Gv,Ev +44: CMOVE/Z Gv,Ev 
+45: CMOVNE/NZ Gv,Ev +46: CMOVBE/NA Gv,Ev +47: CMOVA/NBE Gv,Ev +48: CMOVS Gv,Ev +49: CMOVNS Gv,Ev +4a: CMOVP/PE Gv,Ev +4b: CMOVNP/PO Gv,Ev +4c: CMOVL/NGE Gv,Ev +4d: CMOVNL/GE Gv,Ev +4e: CMOVLE/NG Gv,Ev +4f: CMOVNLE/G Gv,Ev +# 0x0f 0x50-0x5f +50: +51: +52: +53: +54: +55: +56: +57: +58: +59: +5a: +5b: +5c: +5d: +5e: +5f: +# 0x0f 0x60-0x6f +60: +61: +62: +63: +64: +65: +66: +67: +68: +69: +6a: +6b: +6c: +6d: +6e: +6f: +# 0x0f 0x70-0x7f +70: +71: Grp12 (1A) +72: Grp13 (1A) +73: Grp14 (1A) +74: +75: +76: +77: +78: VMREAD Ed/q,Gd/q +79: VMWRITE Gd/q,Ed/q +7a: +7b: +7c: +7d: +7e: +7f: +# 0x0f 0x80-0x8f +80: JO Jz (f64) +81: JNO Jz (f64) +82: JB/JNAE/JC Jz (f64) +83: JNB/JAE/JNC Jz (f64) +84: JZ/JE Jz (f64) +85: JNZ/JNE Jz (f64) +86: JBE/JNA Jz (f64) +87: JNBE/JA Jz (f64) +88: JS Jz (f64) +89: JNS Jz (f64) +8a: JP/JPE Jz (f64) +8b: JNP/JPO Jz (f64) +8c: JL/JNGE Jz (f64) +8d: JNL/JGE Jz (f64) +8e: JLE/JNG Jz (f64) +8f: JNLE/JG Jz (f64) +# 0x0f 0x90-0x9f +90: SETO Eb +91: SETNO Eb +92: SETB/C/NAE Eb +93: SETAE/NB/NC Eb +94: SETE/Z Eb +95: SETNE/NZ Eb +96: SETBE/NA Eb +97: SETA/NBE Eb +98: SETS Eb +99: SETNS Eb +9a: SETP/PE Eb +9b: SETNP/PO Eb +9c: SETL/NGE Eb +9d: SETNL/GE Eb +9e: SETLE/NG Eb +9f: SETNLE/G Eb +# 0x0f 0xa0-0xaf +a0: PUSH FS (d64) +a1: POP FS (d64) +a2: CPUID +a3: BT Ev,Gv +a4: SHLD Ev,Gv,Ib +a5: SHLD Ev,Gv,CL +a6: +a7: +a8: PUSH GS (d64) +a9: POP GS (d64) +aa: RSM +ab: BTS Ev,Gv +ac: SHRD Ev,Gv,Ib +ad: SHRD Ev,Gv,CL +ae: Grp15 (1A),(1C) +af: IMUL Gv,Ev +# 0x0f 0xb0-0xbf +b0: CMPXCHG Eb,Gb +b1: CMPXCHG Ev,Gv +b2: LSS Gv,Mp +b3: BTR Ev,Gv +b4: LFS Gv,Mp +b5: LGS Gv,Mp +b6: MOVZX Gv,Eb +b7: MOVZX Gv,Ew +b8: JMPE | POPCNT Gv,Ev (F3) +b9: Grp10 (1A) +ba: Grp8 Ev,Ib (1A) +bb: BTC Ev,Gv +bc: BSF Gv,Ev +bd: BSR Gv,Ev +be: MOVSX Gv,Eb +bf: MOVSX Gv,Ew +# 0x0f 0xc0-0xcf +c0: XADD Eb,Gb +c1: XADD Ev,Gv +c2: +c3: movnti Md/q,Gd/q +c4: +c5: +c6: +c7: Grp9 (1A) +c8: BSWAP RAX/EAX/R8/R8D +c9: BSWAP RCX/ECX/R9/R9D +ca: BSWAP RDX/EDX/R10/R10D +cb: BSWAP RBX/EBX/R11/R11D +cc: 
BSWAP RSP/ESP/R12/R12D +cd: BSWAP RBP/EBP/R13/R13D +ce: BSWAP RSI/ESI/R14/R14D +cf: BSWAP RDI/EDI/R15/R15D +# 0x0f 0xd0-0xdf +d0: +d1: +d2: +d3: +d4: +d5: +d6: +d7: +d8: +d9: +da: +db: +dc: +dd: +de: +df: +# 0x0f 0xe0-0xef +e0: +e1: +e2: +e3: +e4: +e5: +e6: +e7: +e8: +e9: +ea: +eb: +ec: +ed: +ee: +ef: +# 0x0f 0xf0-0xff +f0: +f1: +f2: +f3: +f4: +f5: +f6: +f7: +f8: +f9: +fa: +fb: +fc: +fd: +fe: +ff: +EndTable + +Table: 3-byte opcode 1 +Referrer: 3-byte escape 1 +80: INVEPT Gd/q,Mdq (66) +81: INVPID Gd/q,Mdq (66) +f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) +f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) +EndTable + +Table: 3-byte opcode 2 +Referrer: 3-byte escape 2 +# all opcode is for SSE +EndTable + +GrpTable: Grp1 +0: ADD +1: OR +2: ADC +3: SBB +4: AND +5: SUB +6: XOR +7: CMP +EndTable + +GrpTable: Grp1A +0: POP +EndTable + +GrpTable: Grp2 +0: ROL +1: ROR +2: RCL +3: RCR +4: SHL/SAL +5: SHR +6: +7: SAR +EndTable + +GrpTable: Grp3_1 +0: TEST Eb,Ib +1: +2: NOT Eb +3: NEG Eb +4: MUL AL,Eb +5: IMUL AL,Eb +6: DIV AL,Eb +7: IDIV AL,Eb +EndTable + +GrpTable: Grp3_2 +0: TEST Ev,Iz +1: +2: NOT Ev +3: NEG Ev +4: MUL rAX,Ev +5: IMUL rAX,Ev +6: DIV rAX,Ev +7: IDIV rAX,Ev +EndTable + +GrpTable: Grp4 +0: INC Eb +1: DEC Eb +EndTable + +GrpTable: Grp5 +0: INC Ev +1: DEC Ev +2: CALLN Ev (f64) +3: CALLF Ep +4: JMPN Ev (f64) +5: JMPF Ep +6: PUSH Ev (d64) +7: +EndTable + +GrpTable: Grp6 +0: SLDT Rv/Mw +1: STR Rv/Mw +2: LLDT Ew +3: LTR Ew +4: VERR Ew +5: VERW Ew +EndTable + +GrpTable: Grp7 +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) +3: LIDT Ms +4: SMSW Mw/Rv +5: +6: LMSW Ew +7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) +EndTable + +GrpTable: Grp8 +4: BT +5: BTS +6: BTR +7: BTC +EndTable + +GrpTable: Grp9 +1: CMPXCHG8B/16B Mq/Mdq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) +7: VMPTRST Mq +EndTable + +GrpTable: Grp10 +EndTable + 
+GrpTable: Grp11 +0: MOV +EndTable + +GrpTable: Grp12 +EndTable + +GrpTable: Grp13 +EndTable + +GrpTable: Grp14 +EndTable + +GrpTable: Grp15 +0: fxsave +1: fxstor +2: ldmxcsr +3: stmxcsr +4: XSAVE +5: XRSTOR | lfence (11B) +6: mfence (11B) +7: clflush | sfence (11B) +EndTable + +GrpTable: Grp16 +0: prefetch NTA +1: prefetch T0 +2: prefetch T1 +3: prefetch T2 +EndTable diff --git a/arch/x86/scripts/gen-insn-attr-x86.awk b/arch/x86/scripts/gen-insn-attr-x86.awk new file mode 100644 index 0000000..6fa88cd --- /dev/null +++ b/arch/x86/scripts/gen-insn-attr-x86.awk @@ -0,0 +1,314 @@ +#!/bin/awk -f +# gen-insn-attr-x86.awk: Instruction attribute table generator +# Written by Masami Hiramatsu <mhi...@re...> +# +# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c + +BEGIN { + print "/* x86 opcode map generated from x86-opcode-map.txt */" + print "/* Do not change this code. */" + ggid = 1 + geid = 1 + + opnd_expr = "^[[:alpha:]]" + ext_expr = "^\\(" + sep_expr = "^\\|$" + group_expr = "^Grp[[:digit:]]+A*" + + imm_expr = "^[IJAO][[:lower:]]" + imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" + imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)" + imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)" + imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)" + imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" + imm_flag["Ob"] = "INAT_MOFFSET" + imm_flag["Ov"] = "INAT_MOFFSET" + + modrm_expr = "^([CDEGMNPQRSUVW][[:lower:]]+|NTA|T[012])" + force64_expr = "\\([df]64\\)" + rex_expr = "^REX(\\.[XRWB]+)*" + fpu_expr = "^ESC" # TODO + + lprefix1_expr = "\\(66\\)" + delete lptable1 + lprefix2_expr = "\\(F2\\)" + delete lptable2 + lprefix3_expr = "\\(F3\\)" + delete lptable3 + max_lprefix = 4 + + prefix_expr = "\\(Prefix\\)" + prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" + 
prefix_num["REPNE"] = "INAT_PFX_REPNE" + prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["LOCK"] = "INAT_PFX_LOCK" + prefix_num["SEG=CS"] = "INAT_PFX_CS" + prefix_num["SEG=DS"] = "INAT_PFX_DS" + prefix_num["SEG=ES"] = "INAT_PFX_ES" + prefix_num["SEG=FS"] = "INAT_PFX_FS" + prefix_num["SEG=GS"] = "INAT_PFX_GS" + prefix_num["SEG=SS"] = "INAT_PFX_SS" + prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" + + delete table + delete etable + delete gtable + eid = -1 + gid = -1 +} + +function semantic_error(msg) { + print "Semantic error at " NR ": " msg > "/dev/stderr" + exit 1 +} + +function debug(msg) { + print "DEBUG: " msg +} + +function array_size(arr, i,c) { + c = 0 + for (i in arr) + c++ + return c +} + +/^Table:/ { + print "/* " $0 " */" +} + +/^Referrer:/ { + if (NF == 1) { + # primary opcode table + tname = "inat_primary_table" + eid = -1 + } else { + # escape opcode table + ref = "" + for (i = 2; i <= NF; i++) + ref = ref $i + eid = escape[ref] + tname = sprintf("inat_escape_table_%d", eid) + } +} + +/^GrpTable:/ { + print "/* " $0 " */" + if (!($2 in group)) + semantic_error("No group: " $2 ) + gid = group[$2] + tname = "inat_group_table_" gid +} + +function print_table(tbl,name,fmt,n) +{ + print "const insn_attr_t " name " = {" + for (i = 0; i < n; i++) { + id = sprintf(fmt, i) + if (tbl[id]) + print " [" id "] = " tbl[id] "," + } + print "};" +} + +/^EndTable/ { + if (gid != -1) { + # print group tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,3] = tname "_3" + } + } else { + # 
print primary/escaped tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,3] = tname "_3" + } + } + print "" + delete table + delete lptable1 + delete lptable2 + delete lptable3 + gid = -1 + eid = -1 +} + +function add_flags(old,new) { + if (old && new) + return old " | " new + else if (old) + return old + else + return new +} + +# convert operands to flags. +function convert_operands(opnd, i,imm,mod) +{ + imm = null + mod = null + for (i in opnd) { + i = opnd[i] + if (match(i, imm_expr) == 1) { + if (!imm_flag[i]) + semantic_error("Unknown imm opnd: " i) + if (imm) { + if (i != "Ib") + semantic_error("ADDIMM error") + imm = add_flags(imm, "INAT_ADDIMM") + } else + imm = imm_flag[i] + } else if (match(i, modrm_expr)) + mod = "INAT_MODRM" + } + return add_flags(imm, mod) +} + +/^[0-9a-f]+\:/ { + if (NR == 1) + next + # get index + idx = "0x" substr($1, 1, index($1,":") - 1) + if (idx in table) + semantic_error("Redefine " idx " in " tname) + + # check if escaped opcode + if ("escape" == $2) { + if ($3 != "#") + semantic_error("No escaped name") + ref = "" + for (i = 4; i <= NF; i++) + ref = ref $i + if (ref in escape) + semantic_error("Redefine escape (" ref ")") + escape[ref] = geid + geid++ + table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")" + next + } + + variant = null + # converts + i = 2 + while (i <= NF) { + opcode = $(i++) + delete opnds + ext = null + flags = null + opnd = null + # parse one opcode + if (match($i, opnd_expr)) { + opnd = $i + split($(i++), opnds, ",") + 
flags = convert_operands(opnds) + } + if (match($i, ext_expr)) + ext = $(i++) + if (match($i, sep_expr)) + i++ + else if (i < NF) + semantic_error($i " is not a separator") + + # check if group opcode + if (match(opcode, group_expr)) { + if (!(opcode in group)) { + group[opcode] = ggid + ggid++ + } + flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")") + } + # check force(or default) 64bit + if (match(ext, force64_expr)) + flags = add_flags(flags, "INAT_FORCE64") + + # check REX prefix + if (match(opcode, rex_expr)) + flags = add_flags(flags, "INAT_REXPFX") + + # check coprocessor escape : TODO + if (match(opcode, fpu_expr)) + flags = add_flags(flags, "INAT_MODRM") + + # check prefixes + if (match(ext, prefix_expr)) { + if (!prefix_num[opcode]) + semantic_error("Unknown prefix: " opcode) + flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")") + } + if (length(flags) == 0) + continue + # check if last prefix + if (match(ext, lprefix1_expr)) { + lptable1[idx] = add_flags(lptable1[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix2_expr)) { + lptable2[idx] = add_flags(lptable2[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix3_expr)) { + lptable3[idx] = add_flags(lptable3[idx],flags) + variant = "INAT_VARIANT" + } else { + table[idx] = add_flags(table[idx],flags) + } + } + if (variant) + table[idx] = add_flags(table[idx],variant) +} + +END { + # print escape opcode map's array + print "/* Escape opcode map array */" + print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LPREFIX_MAX + 1] = {" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print " ["i"]["j"] = "etable[i,j]"," + print "};\n" + # print group opcode map's array + print "/* Group opcode map array */" + print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LPREFIX_MAX + 1] = {" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if 
(gtable[i,j]) + print " ["i"]["j"] = "gtable[i,j]"," + print "};" +} -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-29 00:01:18
|
Cleanup fix_riprel() in arch/x86/kernel/kprobes.c by using x86 instruction decoder. Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Jim Keniston <jke...@us...> Cc: Ingo Molnar <mi...@el...> --- arch/x86/kernel/kprobes.c | 128 ++++++++------------------------------------- 1 files changed, 23 insertions(+), 105 deletions(-) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 41d524f..ebac470 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -108,50 +108,6 @@ static const u32 twobyte_is_boostable[256 / 32] = { /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* 
----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -344,68 +300,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static void __kprobes fix_riprel(struct kprobe *p) { #ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. 
*/ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kernel_insn_init(&insn, p->ainsn.insn); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. 
The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) p->addr + (s64) insn.displacement.value - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) p->ainsn.insn + INSN_DISPLACEMENT_OFFS(&insn); + *(s32 *) disp = (s32) newdisp; } #endif } -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-29 00:01:02
|
Add following APIs for accessing registers and stack entries from pt_regs. - query_register_offset(const char *name) Query the offset of "name" register. - query_register_name(unsigned offset) Query the name of register by its offset. - get_register(struct pt_regs *regs, unsigned offset) Get the value of a register by its offset. - within_kernel_stack(struct pt_regs *regs, unsigned long addr) Check the address is in the kernel stack. - get_kernel_stack_nth(struct pt_regs *reg, unsigned nth) Get Nth entry of the kernel stack. (N >= 0) - get_argument_nth(struct pt_regs *reg, unsigned nth) Get Nth argument at function call. (N >= 0) Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: Roland McGrath <ro...@re...> --- arch/x86/include/asm/ptrace.h | 67 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 60 +++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 0f0d908..577d625 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -7,6 +7,7 @@ #ifdef __KERNEL__ #include <asm/segment.h> +#include <asm/page_types.h> #endif #ifndef __ASSEMBLY__ @@ -216,6 +217,72 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->sp; } +/* Query offset/name of register from its name/offset */ +extern int query_register_offset(const char *name); +extern const char *query_register_name(unsigned offset); +#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) + +/* Get register value from its offset */ +static inline unsigned long get_register(struct pt_regs *regs, unsigned offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + return *(unsigned long *)((unsigned long)regs + offset); +} + +/* Check the address in the stack */ +static inline int 
within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + return ((addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); +} + +/* Get Nth entry of the stack */ +static inline unsigned long get_kernel_stack_nth(struct pt_regs *regs, + unsigned n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + addr += n; + if (within_kernel_stack(regs, (unsigned long)addr)) + return *addr; + else + return 0; +} + +/* Get Nth argument at function call */ +static inline unsigned long get_argument_nth(struct pt_regs *regs, unsigned n) +{ +#ifdef CONFIG_X86_32 +#define NR_REGPARMS 3 + if (n < NR_REGPARMS) { + switch (n) { + case 0: return regs->ax; + case 1: return regs->dx; + case 2: return regs->cx; + } + return 0; +#else /* CONFIG_X86_64 */ +#define NR_REGPARMS 6 + if (n < NR_REGPARMS) { + switch (n) { + case 0: return regs->di; + case 1: return regs->si; + case 2: return regs->dx; + case 3: return regs->cx; + case 4: return regs->r8; + case 5: return regs->r9; + } + return 0; +#endif + } else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + return get_kernel_stack_nth(regs, 1 + n - NR_REGPARMS); + } +} + /* * These are defined as per linux/ptrace.h, which see. 
*/ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde..00eb9d7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -48,6 +48,66 @@ enum x86_regset { REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET(r) offsetof(struct pt_regs, r) +#define REG_OFFSET_NAME(r) {.name = #r, .offset = REG_OFFSET(r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +int query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +const char *query_register_name(unsigned offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-29 00:00:57
|
Add a user-space selftest of x86 instruction decoder at kernel build time. When CONFIG_X86_DECODER_SELFTEST=y, Kbuild builds a test harness of x86 instruction decoder and performs it after building vmlinux. The test compares the results of objdump and x86 instruction decoder code and check there are no differences. Changes from v7: - Add data, addr, rep, lock prefixes to skip instructions list. - Add license comments. Signed-off-by: Masami Hiramatsu <mhi...@re...> Signed-off-by: Jim Keniston <jke...@us...> Cc: H. Peter Anvin <hp...@zy...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: Andi Kleen <ak...@li...> Cc: Vegard Nossum <veg...@gm...> Cc: Avi Kivity <av...@re...> Cc: Przemysław Pawełczyk <prz...@pa...> Cc: Sam Ravnborg <sa...@ra...> --- arch/x86/Kconfig.debug | 9 ++++ arch/x86/Makefile | 3 + arch/x86/include/asm/inat.h | 2 + arch/x86/include/asm/insn.h | 2 + arch/x86/lib/inat.c | 2 + arch/x86/lib/insn.c | 2 + arch/x86/scripts/Makefile | 19 +++++++ arch/x86/scripts/distill.awk | 42 +++++++++++++++++ arch/x86/scripts/test_get_len.c | 99 +++++++++++++++++++++++++++++++++++++++ arch/x86/scripts/user_include.h | 49 +++++++++++++++++++ 10 files changed, 229 insertions(+), 0 deletions(-) create mode 100644 arch/x86/scripts/Makefile create mode 100644 arch/x86/scripts/distill.awk create mode 100644 arch/x86/scripts/test_get_len.c create mode 100644 arch/x86/scripts/user_include.h diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9a88937..430aab4 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -179,6 +179,15 @@ config X86_DS_SELFTEST config HAVE_MMIOTRACE_SUPPORT def_bool y +config X86_DECODER_SELFTEST + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL + ---help--- + Perform x86 instruction decoder selftests at build time. + This option is useful for checking the sanity of x86 instruction + decoder code. 
+ If unsure, say "N". + # # IO delay types: # diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659..7046556 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -154,6 +154,9 @@ all: bzImage KBUILD_IMAGE := $(boot)/bzImage bzImage: vmlinux +ifeq ($(CONFIG_X86_DECODER_SELFTEST),y) + $(Q)$(MAKE) $(build)=arch/x86/scripts posttest +endif $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 01e079a..9090665 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -20,7 +20,9 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ +#ifdef __KERNEL__ #include <linux/types.h> +#endif /* Instruction attributes */ typedef u32 insn_attr_t; diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 5b50fa3..5736404 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -20,7 +20,9 @@ * Copyright (C) IBM Corporation, 2009 */ +#ifdef __KERNEL__ #include <linux/types.h> +#endif /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index d6a34be..564ecbd 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -18,7 +18,9 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
* */ +#ifdef __KERNEL__ #include <linux/module.h> +#endif #include <asm/insn.h> /* Attribute tables are generated from opcode map */ diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 254c848..3b9451a 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -18,8 +18,10 @@ * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ +#ifdef __KERNEL__ #include <linux/string.h> #include <linux/module.h> +#endif #include <asm/inat.h> #include <asm/insn.h> diff --git a/arch/x86/scripts/Makefile b/arch/x86/scripts/Makefile new file mode 100644 index 0000000..f08859e --- /dev/null +++ b/arch/x86/scripts/Makefile @@ -0,0 +1,19 @@ +PHONY += posttest +quiet_cmd_posttest = TEST $@ + cmd_posttest = objdump -d $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/scripts/distill.awk | $(obj)/test_get_len + +posttest: $(obj)/test_get_len vmlinux + $(call cmd,posttest) + +test_get_len_SRC = $(srctree)/arch/x86/scripts/test_get_len.c $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c +test_get_len_INC = $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + +quiet_cmd_test_get_len = CC $@ + cmd_test_get_len = $(CC) -Wall $(test_get_len_SRC) -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include -include $(srctree)/arch/x86/scripts/user_include.h -o $@ + + +$(obj)/test_get_len: $(test_get_len_SRC) $(test_get_len_INC) + $(call cmd,test_get_len) + +clean-files := test_get_len + diff --git a/arch/x86/scripts/distill.awk b/arch/x86/scripts/distill.awk new file mode 100644 index 0000000..d433619 --- /dev/null +++ b/arch/x86/scripts/distill.awk @@ -0,0 +1,42 @@ +#!/bin/awk -f +# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len +# Distills the disassembly as follows: +# - Removes all lines except the disassembled instructions. +# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes +# into a single line. 
+# - Remove bad(or prefix only) instructions + +BEGIN { + prev_addr = "" + prev_hex = "" + prev_mnemonic = "" + bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" + fwait_expr = "^9b " + fwait_str="9b\tfwait" +} + +/^ *[0-9a-f]+:/ { + if (split($0, field, "\t") < 3) { + # This is a continuation of the same insn. + prev_hex = prev_hex field[2] + } else { + # Skip bad instructions + if (match(prev_mnemonic, bad_expr)) + prev_addr = "" + # Split fwait from other f* instructions + if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") { + printf "%s\t%s\n", prev_addr, fwait_str + sub(fwait_expr, "", prev_hex) + } + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic + prev_addr = field[1] + prev_hex = field[2] + prev_mnemonic = field[3] + } +} + +END { + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic +} diff --git a/arch/x86/scripts/test_get_len.c b/arch/x86/scripts/test_get_len.c new file mode 100644 index 0000000..0f702e8 --- /dev/null +++ b/arch/x86/scripts/test_get_len.c @@ -0,0 +1,99 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include <asm/insn.h> + +/* + * Test of instruction analysis in general and insn_get_length() in + * particular. See if insn_get_length() and the disassembler agree + * on the length of each instruction in an elf disassembly. + * + * usage: test_get_len < distilled_disassembly + */ + +const char *prog; + +static void usage() +{ + fprintf(stderr, "usage: %s < distilled_disassembly\n", prog); + exit(1); +} + +static void malformed_line(const char *line, int line_nr) +{ + fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line); + exit(3); +} + +#define BUFSIZE 256 + +int main(int argc, char **argv) +{ + char line[BUFSIZE]; + unsigned char insn_buf[16]; + struct insn insn; + int insns = 0; + + prog = argv[0]; + if (argc > 1) + usage(); + + while (fgets(line, BUFSIZE, stdin)) { + char copy[BUFSIZE], *s, *tab1, *tab2; + int nb = 0; + unsigned b; + + insns++; + memset(insn_buf, 0, 16); + strcpy(copy, line); + tab1 = strchr(copy, '\t'); + if (!tab1) + malformed_line(line, insns); + s = tab1 + 1; + s += strspn(s, " "); + tab2 = strchr(s, '\t'); + if (!tab2) + malformed_line(line, insns); + *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ + while (s < tab2) { + if (sscanf(s, "%x", &b) == 1) { + insn_buf[nb++] = (unsigned char) b; + s += 3; + } else + break; + } + /* Decode an instruction */ + kernel_insn_init(&insn, insn_buf); + insn_get_length(&insn); + if (insn.length != nb) { + fprintf(stderr, "Error: %s", line); + fprintf(stderr, "Error: objdump says %d bytes, but " + "insn_get_length() says %d (attr:%x)\n", nb, + insn.length, insn.attr); + exit(2); + } + } + fprintf(stderr, "Succeed: decoded and checked %d instructions\n", + insns); + return 0; +} diff --git a/arch/x86/scripts/user_include.h b/arch/x86/scripts/user_include.h new file mode 100644 index 0000000..3bdcc55 --- /dev/null +++ 
b/arch/x86/scripts/user_include.h @@ -0,0 +1,49 @@ +#ifndef __USER_TYPES_H +#define __USER_TYPES_H + +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <string.h> + +#ifdef __x86_64__ +#define CONFIG_X86_64 +#else +#define CONFIG_X86_32 +#endif +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long long u64; + +typedef signed char s8; +typedef short s16; +typedef int s32; +typedef long long s64; + +typedef enum bool { false = 0, true } bool; + +/* any harmless file-scope decl */ +#define NOP_DECL struct __nop +#define EXPORT_SYMBOL_GPL(symbol) NOP_DECL +#define MODULE_LICENSE(gpl) NOP_DECL + +#define WARN_ON(cond) do { } while (0) +#define unlikely(cond) (cond) + +#endif /* __USER_TYPES_H */ -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-29 00:00:52
|
Ensure the safety of inserting kprobes by checking whether the specified address is at the first byte of an instruction on x86. This is done by decoding the probed function from its head to the probe point. Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Jim Keniston <jke...@us...> Cc: Ingo Molnar <mi...@el...> --- arch/x86/kernel/kprobes.c | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 69 insertions(+), 0 deletions(-) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d..41d524f 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -48,12 +48,14 @@ #include <linux/preempt.h> #include <linux/module.h> #include <linux/kdebug.h> +#include <linux/kallsyms.h> #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/uaccess.h> #include <asm/alternative.h> +#include <asm/insn.h> void jprobe_return_end(void); @@ -244,6 +246,71 @@ retry: } } +/* Recover the probed instruction at addr for further analysis. */ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + kp = get_kprobe((void *)addr); + if (!kp) + return -EINVAL; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, fix_riprel() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. 
+ */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* Check if the instruction has been modified. */ + if (OPCODE1(&insn) == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -359,6 +426,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-29 00:00:47
|
Add dynamic ftrace_event_call support to ftrace. Trace engines can add new ftrace_event_calls to ftrace on the fly. Each operator function of the call takes an ftrace_event_call data structure as an argument, because these functions may be shared among several ftrace_event_calls. Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Steven Rostedt <ro...@go...> Cc: Ingo Molnar <mi...@el...> Cc: Tom Zanussi <tza...@gm...> Cc: Frederic Weisbecker <fwe...@gm...> --- include/linux/ftrace_event.h | 13 ++++++---- include/trace/ftrace.h | 22 +++++++++-------- kernel/trace/trace_events.c | 54 +++++++++++++++++++++++++++++------------- kernel/trace/trace_export.c | 27 ++++++++++----------- 4 files changed, 69 insertions(+), 47 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bbf40f6..e25f3a4 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -108,12 +108,13 @@ struct ftrace_event_call { struct dentry *dir; struct trace_event *event; int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); + int (*regfunc)(struct ftrace_event_call *); + void (*unregfunc)(struct ftrace_event_call *); int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); - int (*define_fields)(void); + int (*raw_init)(struct ftrace_event_call *); + int (*show_format)(struct ftrace_event_call *, + struct trace_seq *); + int (*define_fields)(struct ftrace_event_call *); struct list_head fields; int filter_active; void *filter; @@ -138,6 +139,8 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, extern int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size, int is_signed); +extern int trace_add_event_call(struct ftrace_event_call *call); +extern void trace_remove_event_call(struct ftrace_event_call *call); #define is_signed_type(type) (((type)(-1)) < 0) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b4ec83a..de3ee7c 100644 --- 
a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -229,7 +229,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *event_call, \ + struct trace_seq *s) \ { \ struct ftrace_raw_##call field __attribute__((unused)); \ int ret = 0; \ @@ -269,10 +270,9 @@ ftrace_format_##call(struct trace_seq *s) \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ -ftrace_define_fields_##call(void) \ +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ __common_field(int, type, 1); \ @@ -298,7 +298,7 @@ ftrace_define_fields_##call(void) \ * event_trace_printk(_RET_IP_, "<call>: " <fmt>); * } * - * static int ftrace_reg_event_<call>(void) + * static int ftrace_reg_event_<call>(struct ftrace_event_call *dummy) * { * int ret; * @@ -309,7 +309,7 @@ ftrace_define_fields_##call(void) \ * return ret; * } * - * static void ftrace_unreg_event_<call>(void) + * static void ftrace_unreg_event_<call>(struct ftrace_event_call *dummy) * { * unregister_trace_<call>(ftrace_event_<call>); * } @@ -342,7 +342,7 @@ ftrace_define_fields_##call(void) \ * trace_current_buffer_unlock_commit(event, irq_flags, pc); * } * - * static int ftrace_raw_reg_event_<call>(void) + * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *dummy) * { * int ret; * @@ -353,7 +353,7 @@ ftrace_define_fields_##call(void) \ * return ret; * } * - * static void ftrace_unreg_event_<call>(void) + * static void ftrace_unreg_event_<call>(struct ftrace_event_call *dummy) * { * unregister_trace_<call>(ftrace_raw_event_<call>); * } @@ -362,7 +362,7 @@ ftrace_define_fields_##call(void) \ * .trace = ftrace_raw_output_<call>, <-- stage 2 * }; * - * static int 
ftrace_raw_init_event_<call>(void) + * static int ftrace_raw_init_event_<call>(struct ftrace_event_call *dummy) * { * int id; * @@ -477,7 +477,7 @@ static void ftrace_raw_event_##call(proto) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ } \ \ -static int ftrace_raw_reg_event_##call(void) \ +static int ftrace_raw_reg_event_##call(struct ftrace_event_call *dummy) \ { \ int ret; \ \ @@ -488,7 +488,7 @@ static int ftrace_raw_reg_event_##call(void) \ return ret; \ } \ \ -static void ftrace_raw_unreg_event_##call(void) \ +static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *dummy)\ { \ unregister_trace_##call(ftrace_raw_event_##call); \ } \ @@ -497,7 +497,7 @@ static struct trace_event ftrace_event_type_##call = { \ .trace = ftrace_raw_output_##call, \ }; \ \ -static int ftrace_raw_init_event_##call(void) \ +static int ftrace_raw_init_event_##call(struct ftrace_event_call *dummy)\ { \ int id; \ \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6c81f9c..5d0a324 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -60,9 +60,7 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); -#ifdef CONFIG_MODULES - -static void trace_destroy_fields(struct ftrace_event_call *call) +void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; @@ -74,8 +72,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -#endif /* CONFIG_MODULES */ - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -84,14 +80,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(); + call->unregfunc(call); } break; case 1: if (!call->enabled) { call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(); + call->regfunc(call); } break; } @@ -558,7 +554,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t 
cnt, trace_seq_printf(s, "format:\n"); trace_write_header(s); - r = call->show_format(s); + r = call->show_format(call, s); if (!r) { /* * ug! The format output is bigger than a PAGE!! @@ -905,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, d_events = event_subsystem_dir(call->system, d_events); if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); @@ -929,7 +925,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, id); if (call->define_fields) { - ret = call->define_fields(); + ret = call->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); @@ -949,6 +945,36 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + struct dentry *d_events; + + if (!call->name) + return -EINVAL; + + d_events = event_trace_events_dir(); + if (!d_events) + return -ENOENT; + + list_add(&call->list, &ftrace_events); + return event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); +} + +/* Remove an event_call */ +void trace_remove_event_call(struct ftrace_event_call *event_call) +{ + ftrace_event_enable_disable(event_call, 0); + if (event_call->event) + unregister_ftrace_event(event_call->event); + debugfs_remove_recursive(event_call->dir); + list_del(&event_call->list); + trace_destroy_fields(event_call); + destroy_preds(event_call); +} + #define for_each_event(event, start, end) \ for (event = start; \ (unsigned long)event < (unsigned long)end; \ @@ -1053,13 +1079,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - 
ftrace_event_enable_disable(call, 0); - if (call->event) - unregister_ftrace_event(call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); + trace_remove_event_call(call); } } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d06cf89..7cee79d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -60,7 +60,7 @@ extern void __bad_type_size(void); #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *dummy, struct trace_seq *s)\ { \ struct args field; \ int ret; \ @@ -76,7 +76,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ tpfmt) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *dummy, struct trace_seq *s)\ { \ struct args field; \ int ret; \ @@ -115,10 +115,16 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ cmd; +static int ftrace_raw_init_event(struct ftrace_event_call *event_call) +{ + INIT_LIST_HEAD(&event_call->fields); + init_preds(event_call); + return 0; +} + #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(void); \ -static int ftrace_raw_init_event_##call(void); \ +int ftrace_define_fields_##call(struct ftrace_event_call *c); \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ @@ -126,16 +132,10 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ + .raw_init = ftrace_raw_init_event, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ -}; \ -static 
int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ - return 0; \ -} \ +}; #undef TRACE_EVENT_FORMAT_NOFILTER #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ @@ -182,9 +182,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ int \ -ftrace_define_fields_##call(void) \ +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ - struct ftrace_event_call *event_call = &event_##call; \ struct args field; \ int ret; \ \ -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America), Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-23 01:20:08
|
Hi, Here are the patches for the kprobe-based event tracer for x86, version 7, which allows you to probe various kernel events through the ftrace interface. I added a selftest of the x86 instruction decoder, which builds a user-space test tool for the decoder and compares the results of objdump and the test tool right after building vmlinux. You can enable that test with CONFIG_X86_DECODER_SELFTEST=y. This version supports only x86(-32/-64) (but porting it to other architectures just needs kprobes/kretprobes and register and stack access APIs). This patchset also includes an x86(-64) instruction decoder which supports non-SSE/FP opcodes and includes an x86 opcode map. The decoder is used for finding the instruction boundaries when inserting new kprobes. I think it will be possible to share this opcode map with KVM's decoder. This series can be applied on the latest linux-2.6-tip tree. This patchset includes the following changes: - Add x86 instruction decoder [1/6] - Add x86 instruction decoder selftest [2/6] - Check insertion point safety in kprobe [3/6] - Cleanup fix_riprel() with insn decoder [4/6] - Add arch-dep register and stack fetching functions [5/6] - Add kprobe-based event tracer [6/6] Future items: - Support per-probe event-filtering interface. - .init function tracing support. - Support primitive types (long, ulong, int, uint, etc) for args. Kprobe-based Event Tracer ========================= Overview -------- This tracer is similar to the events tracer which is based on the Tracepoint infrastructure. Instead of Tracepoint, this tracer is based on kprobes (kprobe and kretprobe). It can probe anywhere kprobes can probe (that is, all function bodies except those of __kprobes functions). Unlike the function tracer, this tracer can probe instructions inside kernel functions. It allows you to check which instruction has been executed. Unlike the Tracepoint-based events tracer, this tracer can add new probe points on the fly. 
Similar to the events tracer, this tracer doesn't need to be activated via current_tracer; instead, just set probe points via /debug/tracing/kprobe_events. Synopsis of kprobe_events ------------------------- p SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : set a probe r SYMBOL[+0] [FETCHARGS] : set a return probe FETCHARGS: %REG : Fetch register REG sN : Fetch Nth entry of stack (N >= 0) @ADDR : Fetch memory at ADDR (ADDR should be in kernel) @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) aN : Fetch function argument. (N >= 0)(*) rv : Fetch return value.(**) ra : Fetch return address.(**) +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***) (*) aN may not be correct for asmlinkage functions or in the middle of a function body. (**) only for return probe. (***) this is useful for fetching a field of data structures. Usage examples -------------- echo p do_sys_open a0 a1 a2 a3 > /debug/tracing/kprobe_events This sets a kprobe at the top of the do_sys_open() function, recording its 1st to 4th arguments. echo r do_sys_open rv ra >> /debug/tracing/kprobe_events This sets a kretprobe on the return point of the do_sys_open() function, recording the return value and return address. echo > /debug/tracing/kprobe_events This clears all probe points. You can see the traced information via /debug/tracing/trace. 
cat /debug/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0 <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <-do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6 <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <-do_sys_open: 0x3 0xffffffff81367a3a <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10 <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <-do_sys_open: 0x3 0xffffffff81367a3a Each line shows when the kernel hits a probe, and <- SYMBOL means kernel returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel returns from do_sys_open to sys_open+0x1b). Thank you, --- Masami Hiramatsu (6): tracing: add kprobe-based event tracer x86: add pt_regs register and stack access APIs kprobes: cleanup fix_riprel() using insn decoder on x86 kprobes: checks probe address is instruction boudary on x86 x86: x86 instruction decoder build-time selftest x86: instruction decoder API Documentation/trace/kprobes.txt | 81 +++ arch/x86/Kconfig.debug | 9 arch/x86/Makefile | 3 arch/x86/include/asm/inat.h | 127 +++++ arch/x86/include/asm/insn.h | 136 +++++ arch/x86/include/asm/ptrace.h | 67 ++ arch/x86/kernel/kprobes.c | 197 +++---- arch/x86/kernel/ptrace.c | 60 ++ arch/x86/lib/Makefile | 13 arch/x86/lib/inat.c | 82 +++ arch/x86/lib/insn.c | 473 +++++++++++++++++ arch/x86/lib/x86-opcode-map.txt | 711 +++++++++++++++++++++++++ arch/x86/scripts/Makefile | 19 + arch/x86/scripts/distill.awk | 41 + arch/x86/scripts/gen-insn-attr-x86.awk | 314 +++++++++++ arch/x86/scripts/test_get_len.c | 81 +++ arch/x86/scripts/user_include.h | 49 ++ kernel/trace/Kconfig | 9 kernel/trace/Makefile | 1 kernel/trace/trace.h | 22 + kernel/trace/trace_event_types.h | 20 + kernel/trace/trace_kprobe.c | 903 ++++++++++++++++++++++++++++++++ 22 files changed, 
3313 insertions(+), 105 deletions(-) create mode 100644 Documentation/trace/kprobes.txt create mode 100644 arch/x86/include/asm/inat.h create mode 100644 arch/x86/include/asm/insn.h create mode 100644 arch/x86/lib/inat.c create mode 100644 arch/x86/lib/insn.c create mode 100644 arch/x86/lib/x86-opcode-map.txt create mode 100644 arch/x86/scripts/Makefile create mode 100644 arch/x86/scripts/distill.awk create mode 100644 arch/x86/scripts/gen-insn-attr-x86.awk create mode 100644 arch/x86/scripts/test_get_len.c create mode 100644 arch/x86/scripts/user_include.h create mode 100644 kernel/trace/trace_kprobe.c -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-23 01:18:03
|
Add an x86 instruction decoder to the arch-specific libraries. This decoder can decode x86 instructions used in the kernel into prefix, opcode, modrm, sib, displacement and immediates. It can also report the length of instructions. This version introduces instruction attributes for decoding instructions. The instruction attribute tables are generated from the opcode map file (x86-opcode-map.txt) by the generator script (gen-insn-attr-x86.awk). Currently, the opcode maps are based on the opcode maps in Intel(R) 64 and IA-32 Architectures Software Developers Manual Vol.2: Appendix.A, and consist of the following two types of opcode tables. Tables for 1-byte/2-byte/3-byte opcodes, which have 256 elements each, are written as follows: Table: table-name Referrer: escaped-name opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] (or) opcode: escape # escaped-name EndTable Group opcode tables, which have 8 elements each, are written as follows: GrpTable: GrpXXX reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] EndTable These opcode maps do NOT include most of the SSE and FP opcodes, because those opcodes are not used in the kernel. Changes from v6.1: - fix patch title. Signed-off-by: Masami Hiramatsu <mhi...@re...> Signed-off-by: Jim Keniston <jke...@us...> Cc: H. 
Peter Anvin <hp...@zy...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: Andi Kleen <ak...@li...> Cc: Vegard Nossum <veg...@gm...> Cc: Avi Kivity <av...@re...> Cc: Przemysław Pawełczyk <prz...@pa...> --- arch/x86/include/asm/inat.h | 125 ++++++ arch/x86/include/asm/insn.h | 134 ++++++ arch/x86/lib/Makefile | 13 + arch/x86/lib/inat.c | 80 ++++ arch/x86/lib/insn.c | 471 +++++++++++++++++++++ arch/x86/lib/x86-opcode-map.txt | 711 ++++++++++++++++++++++++++++++++ arch/x86/scripts/gen-insn-attr-x86.awk | 314 ++++++++++++++ 7 files changed, 1848 insertions(+), 0 deletions(-) create mode 100644 arch/x86/include/asm/inat.h create mode 100644 arch/x86/include/asm/insn.h create mode 100644 arch/x86/lib/inat.c create mode 100644 arch/x86/lib/insn.c create mode 100644 arch/x86/lib/x86-opcode-map.txt create mode 100644 arch/x86/scripts/gen-insn-attr-x86.awk diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h new file mode 100644 index 0000000..01e079a --- /dev/null +++ b/arch/x86/include/asm/inat.h @@ -0,0 +1,125 @@ +#ifndef _ASM_INAT_INAT_H +#define _ASM_INAT_INAT_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu <mhi...@re...> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include <linux/types.h> + +/* Instruction attributes */ +typedef u32 insn_attr_t; + +/* + * Internal bits. Don't use bitmasks directly, because these bits are + * unstable. You should add checking macros and use that macro in + * your code. + */ + +#define INAT_OPCODE_TABLE_SIZE 256 +#define INAT_GROUP_TABLE_SIZE 8 + +/* Legacy instruction prefixes */ +#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ +#define INAT_PFX_REPNE 2 /* 0xF2 */ /* LPFX2 */ +#define INAT_PFX_REPE 3 /* 0xF3 */ /* LPFX3 */ +#define INAT_PFX_LOCK 4 /* 0xF0 */ +#define INAT_PFX_CS 5 /* 0x2E */ +#define INAT_PFX_DS 6 /* 0x3E */ +#define INAT_PFX_ES 7 /* 0x26 */ +#define INAT_PFX_FS 8 /* 0x64 */ +#define INAT_PFX_GS 9 /* 0x65 */ +#define INAT_PFX_SS 10 /* 0x36 */ +#define INAT_PFX_ADDRSZ 11 /* 0x67 */ + +#define INAT_LPREFIX_MAX 3 + +/* Immediate size */ +#define INAT_IMM_BYTE 1 +#define INAT_IMM_WORD 2 +#define INAT_IMM_DWORD 3 +#define INAT_IMM_QWORD 4 +#define INAT_IMM_PTR 5 +#define INAT_IMM_VWORD32 6 +#define INAT_IMM_VWORD 7 + +/* Legacy prefix */ +#define INAT_PFX_OFFS 0 +#define INAT_PFX_BITS 4 +#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) +#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) +/* Escape opcodes */ +#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS) +#define INAT_ESC_BITS 2 +#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1) +#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS) +/* Group opcodes (1-16) */ +#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS) +#define INAT_GRP_BITS 5 +#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1) +#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS) +/* Immediates */ +#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS) +#define INAT_IMM_BITS 3 +#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << 
INAT_IMM_OFFS) +/* Flags */ +#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) +#define INAT_REXPFX (1 << INAT_FLAG_OFFS) +#define INAT_MODRM (1 << (INAT_FLAG_OFFS + 1)) +#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 2)) +#define INAT_ADDIMM (1 << (INAT_FLAG_OFFS + 3)) +#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 4)) +#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 5)) + +/* Attribute search APIs */ +extern insn_attr_t inat_get_opcode_attribute(u8 opcode); +extern insn_attr_t inat_get_escape_attribute(u8 opcode, u8 last_pfx, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_group_attribute(u8 modrm, u8 last_pfx, + insn_attr_t esc_attr); + +/* Attribute checking macros. Use these macros in your code */ +#define INAT_IS_PREFIX(attr) (attr & INAT_PFX_MASK) +#define INAT_IS_ADDRSZ(attr) ((attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ) +#define INAT_IS_OPNDSZ(attr) ((attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ) +#define INAT_LPREFIX_NUM(attr) \ + (((attr & INAT_PFX_MASK) > INAT_LPREFIX_MAX) ? 0 :\ + (attr & INAT_PFX_MASK)) +#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) + +#define INAT_IS_ESCAPE(attr) (attr & INAT_ESC_MASK) +#define INAT_ESCAPE_NUM(attr) ((attr & INAT_ESC_MASK) >> INAT_ESC_OFFS) +#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) + +#define INAT_IS_GROUP(attr) (attr & INAT_GRP_MASK) +#define INAT_GROUP_NUM(attr) ((attr & INAT_GRP_MASK) >> INAT_GRP_OFFS) +#define INAT_GROUP_COMMON(attr) (attr & ~INAT_GRP_MASK) +#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) + +#define INAT_HAS_IMM(attr) (attr & INAT_IMM_MASK) +#define INAT_IMM_SIZE(attr) ((attr & INAT_IMM_MASK) >> INAT_IMM_OFFS) +#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + +#define INAT_IS_REX_PREFIX(attr) (attr & INAT_REXPFX) +#define INAT_HAS_MODRM(attr) (attr & INAT_MODRM) +#define INAT_IS_FORCE64(attr) (attr & INAT_FORCE64) +#define INAT_HAS_ADDIMM(attr) (attr & INAT_ADDIMM) +#define INAT_HAS_MOFFSET(attr) (attr & INAT_MOFFSET) +#define INAT_HAS_VARIANT(attr) (attr & 
INAT_VARIANT) + +#endif diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h new file mode 100644 index 0000000..5b50fa3 --- /dev/null +++ b/arch/x86/include/asm/insn.h @@ -0,0 +1,134 @@ +#ifndef _ASM_X86_INSN_H +#define _ASM_X86_INSN_H +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <linux/types.h> +/* insn_attr_t is defined in inat.h */ +#include <asm/inat.h> + +struct insn_field { + union { + s32 value; + u8 bytes[4]; + }; + bool got; /* true if we've run insn_get_xxx() for this field */ + u8 nbytes; +}; + +struct insn { + struct insn_field prefixes; /* + * Prefixes + * prefixes.bytes[3]: last prefix + */ + struct insn_field rex_prefix; /* REX prefix */ + struct insn_field opcode; /* + * opcode.bytes[0]: opcode1 + * opcode.bytes[1]: opcode2 + * opcode.bytes[2]: opcode3 + */ + struct insn_field modrm; + struct insn_field sib; + struct insn_field displacement; + union { + struct insn_field immediate; + struct insn_field moffset1; /* for 64bit MOV */ + struct insn_field immediate1; /* for 64bit imm or off16/32 */ + }; + union { + struct insn_field moffset2; /* for 64bit MOV */ + struct insn_field immediate2; /* for 64bit imm or seg16 */ + }; + + insn_attr_t attr; + u8 opnd_bytes; + u8 addr_bytes; + u8 length; + bool x86_64; + + const u8 *kaddr; /* kernel address of insn (copy) to analyze */ + const u8 *next_byte; +}; + +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) + +#define MODRM_MOD(insn) (((insn)->modrm.value & 0xc0) >> 6) +#define MODRM_REG(insn) (((insn)->modrm.value & 0x38) >> 3) +#define MODRM_RM(insn) ((insn)->modrm.value & 0x07) + +#define SIB_SCALE(insn) (((insn)->sib.value & 0xc0) >> 6) +#define SIB_INDEX(insn) (((insn)->sib.value & 0x38) >> 3) +#define SIB_BASE(insn) ((insn)->sib.value & 0x07) + +#define REX_W(insn) ((insn)->rex_prefix.value & 8) +#define REX_R(insn) ((insn)->rex_prefix.value & 4) +#define REX_X(insn) ((insn)->rex_prefix.value & 2) +#define REX_B(insn) ((insn)->rex_prefix.value & 1) + +/* The last prefix is needed for two-byte and three-byte opcodes */ +#define LAST_PREFIX(insn) ((insn)->prefixes.bytes[3]) + +#define MOFFSET64(insn) 
(((u64)((insn)->moffset2.value) << 32) | \ + (u32)((insn)->moffset1.value)) + +#define IMMEDIATE64(insn) (((u64)((insn)->immediate2.value) << 32) | \ + (u32)((insn)->immediate1.value)) + +extern void insn_init(struct insn *insn, const u8 *kaddr, bool x86_64); +extern void insn_get_prefixes(struct insn *insn); +extern void insn_get_opcode(struct insn *insn); +extern void insn_get_modrm(struct insn *insn); +extern void insn_get_sib(struct insn *insn); +extern void insn_get_displacement(struct insn *insn); +extern void insn_get_immediate(struct insn *insn); +extern void insn_get_length(struct insn *insn); + +/* Attribute will be determined after getting ModRM (for opcode groups) */ +static inline void insn_get_attr(struct insn *insn) +{ + insn_get_modrm(insn); +} + +/* Instruction uses RIP-relative addressing */ +extern bool insn_rip_relative(struct insn *insn); + +#ifdef CONFIG_X86_64 +/* Init insn for kernel text */ +#define kernel_insn_init(insn, kaddr) insn_init(insn, kaddr, 1) +#else /* CONFIG_X86_32 */ +#define kernel_insn_init(insn, kaddr) insn_init(insn, kaddr, 0) +#endif + +#define INSN_PREFIXES_OFFS(insn) (0) +#define INSN_REXPREFIX_OFFS(insn) ((insn)->prefixes.nbytes) +#define INSN_OPCODE_OFFS(insn) (INSN_REXPREFIX_OFFS(insn) + \ + ((insn)->rex_prefix.nbytes)) +#define INSN_MODRM_OFFS(insn) (INSN_OPCODE_OFFS(insn) + \ + ((insn)->opcode.nbytes)) +#define INSN_SIB_OFFS(insn) (INSN_MODRM_OFFS(insn) + \ + ((insn)->modrm.nbytes)) +#define INSN_DISPLACEMENT_OFFS(insn) (INSN_SIB_OFFS(insn) + \ + ((insn)->sib.nbytes)) +#define INSN_IMMEDIATE_OFFS(insn) (INSN_DISPLACEMENT_OFFS(insn) + \ + ((insn)->displacement.nbytes)) + +#endif /* _ASM_X86_INSN_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 55e11aa..db0e3be 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,12 +2,25 @@ # Makefile for x86 specific library files. 
# +inat_tables_script = $(srctree)/arch/x86/scripts/gen-insn-attr-x86.awk +inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt +quiet_cmd_inat_tables = GEN $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + +$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call cmd,inat_tables) + +$(obj)/inat.o: $(obj)/inat-tables.c + +clean-files := inat-tables.c + obj-$(CONFIG_SMP) := msr-on-cpu.o lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += insn.o inat.o ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c new file mode 100644 index 0000000..d6a34be --- /dev/null +++ b/arch/x86/lib/inat.c @@ -0,0 +1,80 @@ +/* + * x86 instruction attribute tables + * + * Written by Masami Hiramatsu <mhi...@re...> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + */ +#include <linux/module.h> +#include <asm/insn.h> + +/* Attribute tables are generated from opcode map */ +#include "inat-tables.c" + +/* Attribute search APIs */ +insn_attr_t inat_get_opcode_attribute(u8 opcode) +{ + return inat_primary_table[opcode]; +} + +insn_attr_t inat_get_escape_attribute(u8 opcode, u8 last_pfx, + insn_attr_t esc_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = INAT_ESCAPE_NUM(esc_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = INAT_LPREFIX_NUM(lpfx_attr); + } + table = inat_escape_tables[n][0]; + if (!table) + return 0; + if (INAT_HAS_VARIANT(table[opcode]) && m) { + table = inat_escape_tables[n][m]; + if (!table) + return 0; + } + return table[opcode]; +} + +#define REGBITS(modrm) (((modrm) >> 3) & 0x7) + +insn_attr_t inat_get_group_attribute(u8 modrm, u8 last_pfx, + insn_attr_t grp_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = INAT_GROUP_NUM(grp_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = INAT_LPREFIX_NUM(lpfx_attr); + } + table = inat_group_tables[n][0]; + if (!table) + return INAT_GROUP_COMMON(grp_attr); + if (INAT_HAS_VARIANT(table[REGBITS(modrm)]) && m) { + table = inat_group_tables[n][m]; /* per-prefix group variant */ + if (!table) + return INAT_GROUP_COMMON(grp_attr); + } + return table[REGBITS(modrm)] | INAT_GROUP_COMMON(grp_attr); +} + diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c new file mode 100644 index 0000000..254c848 --- /dev/null +++ b/arch/x86/lib/insn.c @@ -0,0 +1,471 @@ +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004, 2009 + */ + +#include <linux/string.h> +#include <linux/module.h> +#include <asm/inat.h> +#include <asm/insn.h> + +#define get_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define peek_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; r; }) + +/** + * insn_init() - initialize struct insn + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @x86_64: true for 64-bit kernel or 64-bit app + */ +void insn_init(struct insn *insn, const u8 *kaddr, bool x86_64) +{ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->next_byte = kaddr; + insn->x86_64 = x86_64; + insn->opnd_bytes = 4; + if (x86_64) + insn->addr_bytes = 8; + else + insn->addr_bytes = 4; +} +EXPORT_SYMBOL_GPL(insn_init); + +/** + * insn_get_prefixes - scan x86 instruction prefix bytes + * @insn: &struct insn containing instruction + * + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte + * to point to the (first) opcode. No effect if @insn->prefixes.got + * is already true. 
+ */ +void insn_get_prefixes(struct insn *insn) +{ + struct insn_field *prefixes = &insn->prefixes; + insn_attr_t attr; + u8 b, lb, i, nb; + + if (prefixes->got) + return; + + nb = 0; + lb = 0; + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + while (INAT_IS_PREFIX(attr)) { + /* Skip if same prefix */ + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == b) + goto found; + if (nb == 4) + /* Invalid instruction */ + break; + prefixes->bytes[nb++] = b; + if (INAT_IS_ADDRSZ(attr)) { + /* address size switches 2/4 or 4/8 */ + if (insn->x86_64) + insn->addr_bytes ^= 12; + else + insn->addr_bytes ^= 6; + } else if (INAT_IS_OPNDSZ(attr)) { + /* operand size switches 2/4 */ + insn->opnd_bytes ^= 6; + } +found: + prefixes->nbytes++; + insn->next_byte++; + lb = b; + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + } + /* Set the last prefix */ + if (lb && lb != LAST_PREFIX(insn)) { + if (unlikely(LAST_PREFIX(insn))) { + /* Swap the last prefix */ + b = LAST_PREFIX(insn); + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == lb) + prefixes->bytes[i] = b; + } + LAST_PREFIX(insn) = lb; + } + + if (insn->x86_64) { + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + if (INAT_IS_REX_PREFIX(attr)) { + insn->rex_prefix.value = b; + insn->rex_prefix.nbytes = 1; + insn->next_byte++; + if (REX_W(insn)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } + } + insn->rex_prefix.got = true; + prefixes->got = true; + return; +} +EXPORT_SYMBOL_GPL(insn_get_prefixes); + +/** + * insn_get_opcode - collect opcode(s) + * @insn: &struct insn containing instruction + * + * Populates @insn->opcode, updates @insn->next_byte to point past the + * opcode byte(s), and sets @insn->attr (except for groups). + * If necessary, first collects any preceding (prefix) bytes. + * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got + * is already true. 
+ * + */ +void insn_get_opcode(struct insn *insn) +{ + struct insn_field *opcode = &insn->opcode; + u8 op, pfx; + if (opcode->got) + return; + if (!insn->prefixes.got) + insn_get_prefixes(insn); + + /* Get first opcode */ + op = get_next(u8, insn); + OPCODE1(insn) = op; + opcode->nbytes = 1; + insn->attr = inat_get_opcode_attribute(op); + while (INAT_IS_ESCAPE(insn->attr)) { + /* Get escaped opcode */ + op = get_next(u8, insn); + opcode->bytes[opcode->nbytes++] = op; + pfx = LAST_PREFIX(insn); + insn->attr = inat_get_escape_attribute(op, pfx, insn->attr); + } + opcode->got = true; +} +EXPORT_SYMBOL_GPL(insn_get_opcode); + +/** + * insn_get_modrm - collect ModRM byte, if any + * @insn: &struct insn containing instruction + * + * Populates @insn->modrm and updates @insn->next_byte to point past the + * ModRM byte, if any. If necessary, first collects the preceding bytes + * (prefixes and opcode(s)). No effect if @insn->modrm.got is already true. + */ +void insn_get_modrm(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + u8 pfx, mod; + if (modrm->got) + return; + if (!insn->opcode.got) + insn_get_opcode(insn); + + if (INAT_HAS_MODRM(insn->attr)) { + mod = get_next(u8, insn); + modrm->value = mod; + modrm->nbytes = 1; + if (INAT_IS_GROUP(insn->attr)) { + pfx = LAST_PREFIX(insn); + insn->attr = inat_get_group_attribute(mod, pfx, + insn->attr); + } + } + + if (insn->x86_64 && INAT_IS_FORCE64(insn->attr)) + insn->opnd_bytes = 8; + modrm->got = true; +} +EXPORT_SYMBOL_GPL(insn_get_modrm); + + +/** + * insn_rip_relative() - Does instruction use RIP-relative addressing mode? + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->x86_64 is false. 
+ */ +bool insn_rip_relative(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + + if (!insn->x86_64) + return false; + if (!modrm->got) + insn_get_modrm(insn); + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); +} +EXPORT_SYMBOL_GPL(insn_rip_relative); + +/** + * + * insn_get_sib() - Get the SIB byte of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. + */ +void insn_get_sib(struct insn *insn) +{ + if (insn->sib.got) + return; + if (!insn->modrm.got) + insn_get_modrm(insn); + if (insn->modrm.nbytes) + if (insn->addr_bytes != 2 && + MODRM_MOD(insn) != 3 && MODRM_RM(insn) == 4) { + insn->sib.value = get_next(u8, insn); + insn->sib.nbytes = 1; + } + insn->sib.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_sib); + + +/** + * + * insn_get_displacement() - Get the displacement of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * SIB byte. + * Displacement value is sign-expanded. + */ +void insn_get_displacement(struct insn *insn) +{ + u8 mod; + if (insn->displacement.got) + return; + if (!insn->sib.got) + insn_get_sib(insn); + if (insn->modrm.nbytes) { + /* + * Interpreting the modrm byte: + * mod = 00 - no displacement fields (exceptions below) + * mod = 01 - 1-byte displacement field + * mod = 10 - displacement field is 4 bytes, or 2 bytes if + * address size = 2 (0x67 prefix in 32-bit mode) + * mod = 11 - no memory operand + * + * If address size = 2... + * mod = 00, r/m = 110 - displacement field is 2 bytes + * + * If address size != 2... 
+ * mod != 11, r/m = 100 - SIB byte exists + * mod = 00, SIB base = 101 - displacement field is 4 bytes + * mod = 00, r/m = 101 - rip-relative addressing, displacement + * field is 4 bytes + */ + mod = MODRM_MOD(insn); + if (mod == 3) + goto out; + if (mod == 1) { + insn->displacement.value = get_next(s8, insn); + insn->displacement.nbytes = 1; + } else if (insn->addr_bytes == 2) { + if ((mod == 0 && MODRM_RM(insn) == 6) || mod == 2) { + insn->displacement.value = get_next(s16, insn); + insn->displacement.nbytes = 2; + } + } else { + if ((mod == 0 && MODRM_RM(insn) == 5) || mod == 2 || + (mod == 0 && SIB_BASE(insn) == 5)) { + insn->displacement.value = get_next(s32, insn); + insn->displacement.nbytes = 4; + } + } + } +out: + insn->displacement.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_displacement); + +/* Decode moffset16/32/64 */ +static void __get_moffset(struct insn *insn) +{ + switch (insn->addr_bytes) { + case 2: + insn->moffset1.value = get_next(s16, insn); + insn->moffset1.nbytes = 2; + break; + case 4: + insn->moffset1.value = get_next(s32, insn); + insn->moffset1.nbytes = 4; + break; + case 8: + insn->moffset1.value = get_next(s32, insn); + insn->moffset1.nbytes = 4; + insn->moffset2.value = get_next(s32, insn); + insn->moffset2.nbytes = 4; + break; + } + insn->moffset1.got = insn->moffset2.got = true; +} + +/* Decode imm v32(Iz) */ +static void __get_immv32(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate.value = get_next(s16, insn); + insn->immediate.nbytes = 2; + break; + case 4: + case 8: + insn->immediate.value = get_next(s32, insn); + insn->immediate.nbytes = 4; + break; + } +} + +/* Decode imm v64(Iv/Ov) */ +static void __get_immv(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(s16, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + insn->immediate1.value = 
get_next(s32, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(s32, insn); + insn->immediate2.nbytes = 4; + break; + } + insn->immediate1.got = insn->immediate2.got = true; +} + +/* Decode ptr16:16/32(Ap) */ +static void __get_immptr(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(s16, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + /* ptr16:64 is not supported (no segment) */ + WARN_ON(1); + return; + } + insn->immediate2.value = get_next(u16, insn); + insn->immediate2.nbytes = 2; + insn->immediate1.got = insn->immediate2.got = true; +} + +/** + * + * insn_get_immediate() - Get the immediates of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * displacement bytes. + * Basically, most immediates are sign-extended. The unsigned value can be + * obtained by bit masking with ((1ULL << (nbytes * 8)) - 1) + */ +void insn_get_immediate(struct insn *insn) +{ + if (insn->immediate.got) + return; + if (!insn->displacement.got) + insn_get_displacement(insn); + + if (INAT_HAS_MOFFSET(insn->attr)) { + __get_moffset(insn); + goto done; + } + + if (!INAT_HAS_IMM(insn->attr)) + /* no immediates */ + goto done; + + switch (INAT_IMM_SIZE(insn->attr)) { + case INAT_IMM_BYTE: + insn->immediate.value = get_next(s8, insn); + insn->immediate.nbytes = 1; + break; + case INAT_IMM_WORD: + insn->immediate.value = get_next(s16, insn); + insn->immediate.nbytes = 2; + break; + case INAT_IMM_DWORD: + insn->immediate.value = get_next(s32, insn); + insn->immediate.nbytes = 4; + break; + case INAT_IMM_QWORD: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(s32, insn); + insn->immediate2.nbytes = 4; + break; + case INAT_IMM_PTR: + __get_immptr(insn); + break; + case 
INAT_IMM_VWORD32: + __get_immv32(insn); + break; + case INAT_IMM_VWORD: + __get_immv(insn); + break; + default: + break; + } + if (INAT_HAS_ADDIMM(insn->attr)) { + insn->immediate2.value = get_next(s8, insn); + insn->immediate2.nbytes = 1; + } +done: + insn->immediate.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_immediate); + +/** + * + * insn_get_length() - Get the length of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * immediates bytes. + */ +void insn_get_length(struct insn *insn) +{ + if (insn->length) + return; + if (!insn->immediate.got) + insn_get_immediate(insn); + insn->length = (u8)((unsigned long)insn->next_byte + - (unsigned long)insn->kaddr); +} +EXPORT_SYMBOL_GPL(insn_get_length); diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt new file mode 100644 index 0000000..ab2a58d --- /dev/null +++ b/arch/x86/lib/x86-opcode-map.txt @@ -0,0 +1,711 @@ +# x86 Opcode Maps +# +#<Opcode maps> +# Table: table-name +# Referrer: escaped-name +# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# (or) +# opcode: escape # escaped-name +# EndTable +# +#<group maps> +# GrpTable: GrpXXX +# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] 
+# EndTable +# + +Table: one byte opcode +Referrer: +# 0x00 - 0x0f +00: ADD Eb,Gb +01: ADD Ev,Gv +02: ADD Gb,Eb +03: ADD Gv,Ev +04: ADD AL,Ib +05: ADD rAX,Iz +06: PUSH ES (i64) +07: POP ES (i64) +08: OR Eb,Gb +09: OR Ev,Gv +0a: OR Gb,Eb +0b: OR Gv,Ev +0c: OR AL,Ib +0d: OR rAX,Iz +0e: PUSH CS (i64) +0f: escape # 2-byte escape +# 0x10 - 0x1f +10: ADC Eb,Gb +11: ADC Ev,Gv +12: ADC Gb,Eb +13: ADC Gv,Ev +14: ADC AL,Ib +15: ADC rAX,Iz +16: PUSH SS (i64) +17: POP SS (i64) +18: SBB Eb,Gb +19: SBB Ev,Gv +1a: SBB Gb,Eb +1b: SBB Gv,Ev +1c: SBB AL,Ib +1d: SBB rAX,Iz +1e: PUSH DS (i64) +1f: POP DS (i64) +# 0x20 - 0x2f +20: AND Eb,Gb +21: AND Ev,Gv +22: AND Gb,Eb +23: AND Gv,Ev +24: AND AL,Ib +25: AND rAx,Iz +26: SEG=ES (Prefix) +27: DAA (i64) +28: SUB Eb,Gb +29: SUB Ev,Gv +2a: SUB Gb,Eb +2b: SUB Gv,Ev +2c: SUB AL,Ib +2d: SUB rAX,Iz +2e: SEG=CS (Prefix) +2f: DAS (i64) +# 0x30 - 0x3f +30: XOR Eb,Gb +31: XOR Ev,Gv +32: XOR Gb,Eb +33: XOR Gv,Ev +34: XOR AL,Ib +35: XOR rAX,Iz +36: SEG=SS (Prefix) +37: AAA (i64) +38: CMP Eb,Gb +39: CMP Ev,Gv +3a: CMP Gb,Eb +3b: CMP Gv,Ev +3c: CMP AL,Ib +3d: CMP rAX,Iz +3e: SEG=DS (Prefix) +3f: AAS (i64) +# 0x40 - 0x4f +40: INC eAX (i64) | REX (o64) +41: INC eCX (i64) | REX.B (o64) +42: INC eDX (i64) | REX.X (o64) +43: INC eBX (i64) | REX.XB (o64) +44: INC eSP (i64) | REX.R (o64) +45: INC eBP (i64) | REX.RB (o64) +46: INC eSI (i64) | REX.RX (o64) +47: INC eDI (i64) | REX.RXB (o64) +48: DEC eAX (i64) | REX.W (o64) +49: DEC eCX (i64) | REX.WB (o64) +4a: DEC eDX (i64) | REX.WX (o64) +4b: DEC eBX (i64) | REX.WXB (o64) +4c: DEC eSP (i64) | REX.WR (o64) +4d: DEC eBP (i64) | REX.WRB (o64) +4e: DEC eSI (i64) | REX.WRX (o64) +4f: DEC eDI (i64) | REX.WRXB (o64) +# 0x50 - 0x5f +50: PUSH rAX/r8 (d64) +51: PUSH rCX/r9 (d64) +52: PUSH rDX/r10 (d64) +53: PUSH rBX/r11 (d64) +54: PUSH rSP/r12 (d64) +55: PUSH rBP/r13 (d64) +56: PUSH rSI/r14 (d64) +57: PUSH rDI/r15 (d64) +58: POP rAX/r8 (d64) +59: POP rCX/r9 (d64) +5a: POP rDX/r10 (d64) +5b: POP rBX/r11 (d64) +5c: POP 
rSP/r12 (d64) +5d: POP rBP/r13 (d64) +5e: POP rSI/r14 (d64) +5f: POP rDI/r15 (d64) +# 0x60 - 0x6f +60: PUSHA/PUSHAD (i64) +61: POPA/POPAD (i64) +62: BOUND Gv,Ma (i64) +63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) +64: SEG=FS (Prefix) +65: SEG=GS (Prefix) +66: Operand-Size (Prefix) +67: Address-Size (Prefix) +68: PUSH Iz (d64) +69: IMUL Gv,Ev,Iz +6a: PUSH Ib (d64) +6b: IMUL Gv,Ev,Ib +6c: INS/INSB Yb,DX +6d: INS/INSW/INSD Yz,DX +6e: OUTS/OUTSB DX,Xb +6f: OUTS/OUTSW/OUTSD DX,Xz +# 0x70 - 0x7f +70: JO Jb +71: JNO Jb +72: JB/JNAE/JC Jb +73: JNB/JAE/JNC Jb +74: JZ/JE Jb +75: JNZ/JNE Jb +76: JBE/JNA Jb +77: JNBE/JA Jb +78: JS Jb +79: JNS Jb +7a: JP/JPE Jb +7b: JNP/JPO Jb +7c: JL/JNGE Jb +7d: JNL/JGE Jb +7e: JLE/JNG Jb +7f: JNLE/JG Jb +# 0x80 - 0x8f +80: Grp1 Eb,Ib (1A) +81: Grp1 Ev,Iz (1A) +82: Grp1 Eb,Ib (1A),(i64) +83: Grp1 Ev,Ib (1A) +84: TEST Eb,Gb +85: TEST Ev,Gv +86: XCHG Eb,Gb +87: XCHG Ev,Gv +88: MOV Eb,Gb +89: MOV Ev,Gv +8a: MOV Gb,Eb +8b: MOV Gv,Ev +8c: MOV Ev,Sw +8d: LEA Gv,M +8e: MOV Sw,Ew +8f: Grp1A (1A) | POP Ev (d64) +# 0x90 - 0x9f +90: NOP | PAUSE (F3) | XCHG r8,rAX +91: XCHG rCX/r9,rAX +92: XCHG rDX/r10,rAX +93: XCHG rBX/r11,rAX +94: XCHG rSP/r12,rAX +95: XCHG rBP/r13,rAX +96: XCHG rSI/r14,rAX +97: XCHG rDI/r15,rAX +98: CBW/CWDE/CDQE +99: CWD/CDQ/CQO +9a: CALLF Ap (i64) +9b: FWAIT/WAIT +9c: PUSHF/D/Q Fv (d64) +9d: POPF/D/Q Fv (d64) +9e: SAHF +9f: LAHF +# 0xa0 - 0xaf +a0: MOV AL,Ob +a1: MOV rAX,Ov +a2: MOV Ob,AL +a3: MOV Ov,rAX +a4: MOVS/B Xb,Yb +a5: MOVS/W/D/Q Xv,Yv +a6: CMPS/B Xb,Yb +a7: CMPS/W/D Xv,Yv +a8: TEST AL,Ib +a9: TEST rAX,Iz +aa: STOS/B Yb,AL +ab: STOS/W/D/Q Yv,rAX +ac: LODS/B AL,Xb +ad: LODS/W/D/Q rAX,Xv +ae: SCAS/B AL,Yb +af: SCAS/W/D/Q rAX,Xv +# 0xb0 - 0xbf +b0: MOV AL/R8L,Ib +b1: MOV CL/R9L,Ib +b2: MOV DL/R10L,Ib +b3: MOV BL/R11L,Ib +b4: MOV AH/R12L,Ib +b5: MOV CH/R13L,Ib +b6: MOV DH/R14L,Ib +b7: MOV BH/R15L,Ib +b8: MOV rAX/r8,Iv +b9: MOV rCX/r9,Iv +ba: MOV rDX/r10,Iv +bb: MOV rBX/r11,Iv +bc: MOV rSP/r12,Iv +bd: MOV rBP/r13,Iv +be: MOV 
rSI/r14,Iv +bf: MOV rDI/r15,Iv +# 0xc0 - 0xcf +c0: Grp2 Eb,Ib (1A) +c1: Grp2 Ev,Ib (1A) +c2: RETN Iw (f64) +c3: RETN +c4: LES Gz,Mp (i64) +c5: LDS Gz,Mp (i64) +c6: Grp11 Eb,Ib (1A) +c7: Grp11 Ev,Iz (1A) +c8: ENTER Iw,Ib +c9: LEAVE (d64) +ca: RETF Iw +cb: RETF +cc: INT3 +cd: INT Ib +ce: INTO (i64) +cf: IRET/D/Q +# 0xd0 - 0xdf +d0: Grp2 Eb,1 (1A) +d1: Grp2 Ev,1 (1A) +d2: Grp2 Eb,CL (1A) +d3: Grp2 Ev,CL (1A) +d4: AAM Ib (i64) +d5: AAD Ib (i64) +d6: +d7: XLAT/XLATB +d8: ESC +d9: ESC +da: ESC +db: ESC +dc: ESC +dd: ESC +de: ESC +df: ESC +# 0xe0 - 0xef +e0: LOOPNE/LOOPNZ Jb (f64) +e1: LOOPE/LOOPZ Jb (f64) +e2: LOOP Jb (f64) +e3: JrCXZ Jb (f64) +e4: IN AL,Ib +e5: IN eAX,Ib +e6: OUT Ib,AL +e7: OUT Ib,eAX +e8: CALL Jz (f64) +e9: JMP-near Jz (f64) +ea: JMP-far Ap (i64) +eb: JMP-short Jb (f64) +ec: IN AL,DX +ed: IN eAX,DX +ee: OUT DX,AL +ef: OUT DX,eAX +# 0xf0 - 0xff +f0: LOCK (Prefix) +f1: +f2: REPNE (Prefix) +f3: REP/REPE (Prefix) +f4: HLT +f5: CMC +f6: Grp3_1 Eb (1A) +f7: Grp3_2 Ev (1A) +f8: CLC +f9: STC +fa: CLI +fb: STI +fc: CLD +fd: STD +fe: Grp4 (1A) +ff: Grp5 (1A) +EndTable + +Table: 2-byte opcode # First Byte is 0x0f +Referrer: 2-byte escape +# 0x0f 0x00-0x0f +00: Grp6 (1A) +01: Grp7 (1A) +02: LAR Gv,Ew +03: LSL Gv,Ew +04: +05: SYSCALL (o64) +06: CLTS +07: SYSRET (o64) +08: INVD +09: WBINVD +0a: +0b: UD2 (1B) +0c: +0d: NOP Ev +0e: +0f: +# 0x0f 0x10-0x1f +10: +11: +12: +13: +14: +15: +16: +17: +18: Grp16 (1A) +19: +1a: +1b: +1c: +1d: +1e: +1f: NOP Ev +# 0x0f 0x20-0x2f +20: MOV Rd,Cd +21: MOV Rd,Dd +22: MOV Cd,Rd +23: MOV Dd,Rd +24: +25: +26: +27: +28: movaps Vps,Wps | movapd Vpd,Wpd (66) +29: movaps Wps,Vps | movapd Wpd,Vpd (66) +2a: +2b: +2c: +2d: +2e: +2f: +# 0x0f 0x30-0x3f +30: WRMSR +31: RDTSC +32: RDMSR +33: RDPMC +34: SYSENTER +35: SYSEXIT +36: +37: GETSEC +38: escape # 3-byte escape 1 +39: +3a: escape # 3-byte escape 2 +3b: +3c: +3d: +3e: +3f: +# 0x0f 0x40-0x4f +40: CMOVO Gv,Ev +41: CMOVNO Gv,Ev +42: CMOVB/C/NAE Gv,Ev +43: CMOVAE/NB/NC Gv,Ev +44: CMOVE/Z Gv,Ev 
+45: CMOVNE/NZ Gv,Ev +46: CMOVBE/NA Gv,Ev +47: CMOVA/NBE Gv,Ev +48: CMOVS Gv,Ev +49: CMOVNS Gv,Ev +4a: CMOVP/PE Gv,Ev +4b: CMOVNP/PO Gv,Ev +4c: CMOVL/NGE Gv,Ev +4d: CMOVNL/GE Gv,Ev +4e: CMOVLE/NG Gv,Ev +4f: CMOVNLE/G Gv,Ev +# 0x0f 0x50-0x5f +50: +51: +52: +53: +54: +55: +56: +57: +58: +59: +5a: +5b: +5c: +5d: +5e: +5f: +# 0x0f 0x60-0x6f +60: +61: +62: +63: +64: +65: +66: +67: +68: +69: +6a: +6b: +6c: +6d: +6e: +6f: +# 0x0f 0x70-0x7f +70: +71: Grp12 (1A) +72: Grp13 (1A) +73: Grp14 (1A) +74: +75: +76: +77: +78: VMREAD Ed/q,Gd/q +79: VMWRITE Gd/q,Ed/q +7a: +7b: +7c: +7d: +7e: +7f: +# 0x0f 0x80-0x8f +80: JO Jz (f64) +81: JNO Jz (f64) +82: JB/JNAE/JC Jz (f64) +83: JNB/JAE/JNC Jz (f64) +84: JZ/JE Jz (f64) +85: JNZ/JNE Jz (f64) +86: JBE/JNA Jz (f64) +87: JNBE/JA Jz (f64) +88: JS Jz (f64) +89: JNS Jz (f64) +8a: JP/JPE Jz (f64) +8b: JNP/JPO Jz (f64) +8c: JL/JNGE Jz (f64) +8d: JNL/JGE Jz (f64) +8e: JLE/JNG Jz (f64) +8f: JNLE/JG Jz (f64) +# 0x0f 0x90-0x9f +90: SETO Eb +91: SETNO Eb +92: SETB/C/NAE Eb +93: SETAE/NB/NC Eb +94: SETE/Z Eb +95: SETNE/NZ Eb +96: SETBE/NA Eb +97: SETA/NBE Eb +98: SETS Eb +99: SETNS Eb +9a: SETP/PE Eb +9b: SETNP/PO Eb +9c: SETL/NGE Eb +9d: SETNL/GE Eb +9e: SETLE/NG Eb +9f: SETNLE/G Eb +# 0x0f 0xa0-0xaf +a0: PUSH FS (d64) +a1: POP FS (d64) +a2: CPUID +a3: BT Ev,Gv +a4: SHLD Ev,Gv,Ib +a5: SHLD Ev,Gv,CL +a6: +a7: +a8: PUSH GS (d64) +a9: POP GS (d64) +aa: RSM +ab: BTS Ev,Gv +ac: SHRD Ev,Gv,Ib +ad: SHRD Ev,Gv,CL +ae: Grp15 (1A),(1C) +af: IMUL Gv,Ev +# 0x0f 0xb0-0xbf +b0: CMPXCHG Eb,Gb +b1: CMPXCHG Ev,Gv +b2: LSS Gv,Mp +b3: BTR Ev,Gv +b4: LFS Gv,Mp +b5: LGS Gv,Mp +b6: MOVZX Gv,Eb +b7: MOVZX Gv,Ew +b8: JMPE | POPCNT Gv,Ev (F3) +b9: Grp10 (1A) +ba: Grp8 Ev,Ib (1A) +bb: BTC Ev,Gv +bc: BSF Gv,Ev +bd: BSR Gv,Ev +be: MOVSX Gv,Eb +bf: MOVSX Gv,Ew +# 0x0f 0xc0-0xcf +c0: XADD Eb,Gb +c1: XADD Ev,Gv +c2: +c3: movnti Md/q,Gd/q +c4: +c5: +c6: +c7: Grp9 (1A) +c8: BSWAP RAX/EAX/R8/R8D +c9: BSWAP RCX/ECX/R9/R9D +ca: BSWAP RDX/EDX/R10/R10D +cb: BSWAP RBX/EBX/R11/R11D +cc: 
BSWAP RSP/ESP/R12/R12D +cd: BSWAP RBP/EBP/R13/R13D +ce: BSWAP RSI/ESI/R14/R14D +cf: BSWAP RDI/EDI/R15/R15D +# 0x0f 0xd0-0xdf +d0: +d1: +d2: +d3: +d4: +d5: +d6: +d7: +d8: +d9: +da: +db: +dc: +dd: +de: +df: +# 0x0f 0xe0-0xef +e0: +e1: +e2: +e3: +e4: +e5: +e6: +e7: +e8: +e9: +ea: +eb: +ec: +ed: +ee: +ef: +# 0x0f 0xf0-0xff +f0: +f1: +f2: +f3: +f4: +f5: +f6: +f7: +f8: +f9: +fa: +fb: +fc: +fd: +fe: +ff: +EndTable + +Table: 3-byte opcode 1 +Referrer: 3-byte escape 1 +80: INVEPT Gd/q,Mdq (66) +81: INVPID Gd/q,Mdq (66) +f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) +f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) +EndTable + +Table: 3-byte opcode 2 +Referrer: 3-byte escape 2 +# all opcode is for SSE +EndTable + +GrpTable: Grp1 +0: ADD +1: OR +2: ADC +3: SBB +4: AND +5: SUB +6: XOR +7: CMP +EndTable + +GrpTable: Grp1A +0: POP +EndTable + +GrpTable: Grp2 +0: ROL +1: ROR +2: RCL +3: RCR +4: SHL/SAL +5: SHR +6: +7: SAR +EndTable + +GrpTable: Grp3_1 +0: TEST Eb,Ib +1: +2: NOT Eb +3: NEG Eb +4: MUL AL,Eb +5: IMUL AL,Eb +6: DIV AL,Eb +7: IDIV AL,Eb +EndTable + +GrpTable: Grp3_2 +0: TEST Ev,Iz +1: +2: NOT Ev +3: NEG Ev +4: MUL rAX,Ev +5: IMUL rAX,Ev +6: DIV rAX,Ev +7: IDIV rAX,Ev +EndTable + +GrpTable: Grp4 +0: INC Eb +1: DEC Eb +EndTable + +GrpTable: Grp5 +0: INC Ev +1: DEC Ev +2: CALLN Ev (f64) +3: CALLF Ep +4: JMPN Ev (f64) +5: JMPF Ep +6: PUSH Ev (d64) +7: +EndTable + +GrpTable: Grp6 +0: SLDT Rv/Mw +1: STR Rv/Mw +2: LLDT Ew +3: LTR Ew +4: VERR Ew +5: VERW Ew +EndTable + +GrpTable: Grp7 +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) +3: LIDT Ms +4: SMSW Mw/Rv +5: +6: LMSW Ew +7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) +EndTable + +GrpTable: Grp8 +4: BT +5: BTS +6: BTR +7: BTC +EndTable + +GrpTable: Grp9 +1: CMPXCHG8B/16B Mq/Mdq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) +7: VMPTRST Mq +EndTable + +GrpTable: Grp10 +EndTable + 
+GrpTable: Grp11 +0: MOV +EndTable + +GrpTable: Grp12 +EndTable + +GrpTable: Grp13 +EndTable + +GrpTable: Grp14 +EndTable + +GrpTable: Grp15 +0: fxsave +1: fxrstor +2: ldmxcsr +3: stmxcsr +4: XSAVE +5: XRSTOR | lfence (11B) +6: mfence (11B) +7: clflush | sfence (11B) +EndTable + +GrpTable: Grp16 +0: prefetch NTA +1: prefetch T0 +2: prefetch T1 +3: prefetch T2 +EndTable diff --git a/arch/x86/scripts/gen-insn-attr-x86.awk b/arch/x86/scripts/gen-insn-attr-x86.awk new file mode 100644 index 0000000..6fa88cd --- /dev/null +++ b/arch/x86/scripts/gen-insn-attr-x86.awk @@ -0,0 +1,314 @@ +#!/bin/awk -f +# gen-insn-attr-x86.awk: Instruction attribute table generator +# Written by Masami Hiramatsu <mhi...@re...> +# +# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c + +BEGIN { + print "/* x86 opcode map generated from x86-opcode-map.txt */" + print "/* Do not change this code. */" + ggid = 1 + geid = 1 + + opnd_expr = "^[[:alpha:]]" + ext_expr = "^\\(" + sep_expr = "^\\|$" + group_expr = "^Grp[[:digit:]]+A*" + + imm_expr = "^[IJAO][[:lower:]]" + imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" + imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)" + imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)" + imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)" + imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" + imm_flag["Ob"] = "INAT_MOFFSET" + imm_flag["Ov"] = "INAT_MOFFSET" + + modrm_expr = "^([CDEGMNPQRSUVW][[:lower:]]+|NTA|T[012])" + force64_expr = "\\([df]64\\)" + rex_expr = "^REX(\\.[XRWB]+)*" + fpu_expr = "^ESC" # TODO + + lprefix1_expr = "\\(66\\)" + delete lptable1 + lprefix2_expr = "\\(F2\\)" + delete lptable2 + lprefix3_expr = "\\(F3\\)" + delete lptable3 + max_lprefix = 4 + + prefix_expr = "\\(Prefix\\)" + prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" + 
prefix_num["REPNE"] = "INAT_PFX_REPNE" + prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["LOCK"] = "INAT_PFX_LOCK" + prefix_num["SEG=CS"] = "INAT_PFX_CS" + prefix_num["SEG=DS"] = "INAT_PFX_DS" + prefix_num["SEG=ES"] = "INAT_PFX_ES" + prefix_num["SEG=FS"] = "INAT_PFX_FS" + prefix_num["SEG=GS"] = "INAT_PFX_GS" + prefix_num["SEG=SS"] = "INAT_PFX_SS" + prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" + + delete table + delete etable + delete gtable + eid = -1 + gid = -1 +} + +function semantic_error(msg) { + print "Semantic error at " NR ": " msg > "/dev/stderr" + exit 1 +} + +function debug(msg) { + print "DEBUG: " msg +} + +function array_size(arr, i,c) { + c = 0 + for (i in arr) + c++ + return c +} + +/^Table:/ { + print "/* " $0 " */" +} + +/^Referrer:/ { + if (NF == 1) { + # primary opcode table + tname = "inat_primary_table" + eid = -1 + } else { + # escape opcode table + ref = "" + for (i = 2; i <= NF; i++) + ref = ref $i + eid = escape[ref] + tname = sprintf("inat_escape_table_%d", eid) + } +} + +/^GrpTable:/ { + print "/* " $0 " */" + if (!($2 in group)) + semantic_error("No group: " $2 ) + gid = group[$2] + tname = "inat_group_table_" gid +} + +function print_table(tbl,name,fmt,n) +{ + print "const insn_attr_t " name " = {" + for (i = 0; i < n; i++) { + id = sprintf(fmt, i) + if (tbl[id]) + print " [" id "] = " tbl[id] "," + } + print "};" +} + +/^EndTable/ { + if (gid != -1) { + # print group tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,3] = tname "_3" + } + } else { + # 
print primary/escaped tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,3] = tname "_3" + } + } + print "" + delete table + delete lptable1 + delete lptable2 + delete lptable3 + gid = -1 + eid = -1 +} + +function add_flags(old,new) { + if (old && new) + return old " | " new + else if (old) + return old + else + return new +} + +# convert operands to flags. +function convert_operands(opnd, i,imm,mod) +{ + imm = null + mod = null + for (i in opnd) { + i = opnd[i] + if (match(i, imm_expr) == 1) { + if (!imm_flag[i]) + semantic_error("Unknown imm opnd: " i) + if (imm) { + if (i != "Ib") + semantic_error("ADDIMM error") + imm = add_flags(imm, "INAT_ADDIMM") + } else + imm = imm_flag[i] + } else if (match(i, modrm_expr)) + mod = "INAT_MODRM" + } + return add_flags(imm, mod) +} + +/^[0-9a-f]+\:/ { + if (NR == 1) + next + # get index + idx = "0x" substr($1, 1, index($1,":") - 1) + if (idx in table) + semantic_error("Redefine " idx " in " tname) + + # check if escaped opcode + if ("escape" == $2) { + if ($3 != "#") + semantic_error("No escaped name") + ref = "" + for (i = 4; i <= NF; i++) + ref = ref $i + if (ref in escape) + semantic_error("Redefine escape (" ref ")") + escape[ref] = geid + geid++ + table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")" + next + } + + variant = null + # converts + i = 2 + while (i <= NF) { + opcode = $(i++) + delete opnds + ext = null + flags = null + opnd = null + # parse one opcode + if (match($i, opnd_expr)) { + opnd = $i + split($(i++), opnds, ",") + 
flags = convert_operands(opnds) + } + if (match($i, ext_expr)) + ext = $(i++) + if (match($i, sep_expr)) + i++ + else if (i < NF) + semantic_error($i " is not a separator") + + # check if group opcode + if (match(opcode, group_expr)) { + if (!(opcode in group)) { + group[opcode] = ggid + ggid++ + } + flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")") + } + # check force(or default) 64bit + if (match(ext, force64_expr)) + flags = add_flags(flags, "INAT_FORCE64") + + # check REX prefix + if (match(opcode, rex_expr)) + flags = add_flags(flags, "INAT_REXPFX") + + # check coprocessor escape : TODO + if (match(opcode, fpu_expr)) + flags = add_flags(flags, "INAT_MODRM") + + # check prefixes + if (match(ext, prefix_expr)) { + if (!prefix_num[opcode]) + semantic_error("Unknown prefix: " opcode) + flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")") + } + if (length(flags) == 0) + continue + # check if last prefix + if (match(ext, lprefix1_expr)) { + lptable1[idx] = add_flags(lptable1[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix2_expr)) { + lptable2[idx] = add_flags(lptable2[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix3_expr)) { + lptable3[idx] = add_flags(lptable3[idx],flags) + variant = "INAT_VARIANT" + } else { + table[idx] = add_flags(table[idx],flags) + } + } + if (variant) + table[idx] = add_flags(table[idx],variant) +} + +END { + # print escape opcode map's array + print "/* Escape opcode map array */" + print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LPREFIX_MAX + 1] = {" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print " ["i"]["j"] = "etable[i,j]"," + print "};\n" + # print group opcode map's array + print "/* Group opcode map array */" + print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LPREFIX_MAX + 1] = {" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if 
(gtable[i,j]) + print " ["i"]["j"] = "gtable[i,j]"," + print "};" +} -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-23 01:18:01
|
Add a user-space selftest of x86 instruction decoder at kernel build time. When CONFIG_X86_DECODER_SELFTEST=y, Kbuild builds a test harness of x86 instruction decoder and performs it after building vmlinux. The test compares the results of objdump and x86 instruction decoder code and check there are no differences. Signed-off-by: Masami Hiramatsu <mhi...@re...> Signed-off-by: Jim Keniston <jke...@us...> Cc: H. Peter Anvin <hp...@zy...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: Andi Kleen <ak...@li...> Cc: Vegard Nossum <veg...@gm...> Cc: Avi Kivity <av...@re...> Cc: Przemysław Pawełczyk <prz...@pa...> Cc: Sam Ravnborg <sa...@ra...> --- arch/x86/Kconfig.debug | 9 ++++ arch/x86/Makefile | 3 + arch/x86/include/asm/inat.h | 2 + arch/x86/include/asm/insn.h | 2 + arch/x86/lib/inat.c | 2 + arch/x86/lib/insn.c | 2 + arch/x86/scripts/Makefile | 19 +++++++++ arch/x86/scripts/distill.awk | 41 ++++++++++++++++++++ arch/x86/scripts/test_get_len.c | 81 +++++++++++++++++++++++++++++++++++++++ arch/x86/scripts/user_include.h | 49 ++++++++++++++++++++++++ 10 files changed, 210 insertions(+), 0 deletions(-) create mode 100644 arch/x86/scripts/Makefile create mode 100644 arch/x86/scripts/distill.awk create mode 100644 arch/x86/scripts/test_get_len.c create mode 100644 arch/x86/scripts/user_include.h diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9a88937..430aab4 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -179,6 +179,15 @@ config X86_DS_SELFTEST config HAVE_MMIOTRACE_SUPPORT def_bool y +config X86_DECODER_SELFTEST + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL + ---help--- + Perform x86 instruction decoder selftests at build time. + This option is useful for checking the sanity of x86 instruction + decoder code. + If unsure, say "N". 
+ # # IO delay types: # diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659..7046556 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -154,6 +154,9 @@ all: bzImage KBUILD_IMAGE := $(boot)/bzImage bzImage: vmlinux +ifeq ($(CONFIG_X86_DECODER_SELFTEST),y) + $(Q)$(MAKE) $(build)=arch/x86/scripts posttest +endif $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 01e079a..9090665 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -20,7 +20,9 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ +#ifdef __KERNEL__ #include <linux/types.h> +#endif /* Instruction attributes */ typedef u32 insn_attr_t; diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 5b50fa3..5736404 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -20,7 +20,9 @@ * Copyright (C) IBM Corporation, 2009 */ +#ifdef __KERNEL__ #include <linux/types.h> +#endif /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index d6a34be..564ecbd 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -18,7 +18,9 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
* */ +#ifdef __KERNEL__ #include <linux/module.h> +#endif #include <asm/insn.h> /* Attribute tables are generated from opcode map */ diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 254c848..3b9451a 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -18,8 +18,10 @@ * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ +#ifdef __KERNEL__ #include <linux/string.h> #include <linux/module.h> +#endif #include <asm/inat.h> #include <asm/insn.h> diff --git a/arch/x86/scripts/Makefile b/arch/x86/scripts/Makefile new file mode 100644 index 0000000..f08859e --- /dev/null +++ b/arch/x86/scripts/Makefile @@ -0,0 +1,19 @@ +PHONY += posttest +quiet_cmd_posttest = TEST $@ + cmd_posttest = objdump -d $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/scripts/distill.awk | $(obj)/test_get_len + +posttest: $(obj)/test_get_len vmlinux + $(call cmd,posttest) + +test_get_len_SRC = $(srctree)/arch/x86/scripts/test_get_len.c $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c +test_get_len_INC = $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + +quiet_cmd_test_get_len = CC $@ + cmd_test_get_len = $(CC) -Wall $(test_get_len_SRC) -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include -include $(srctree)/arch/x86/scripts/user_include.h -o $@ + + +$(obj)/test_get_len: $(test_get_len_SRC) $(test_get_len_INC) + $(call cmd,test_get_len) + +clean-files := test_get_len + diff --git a/arch/x86/scripts/distill.awk b/arch/x86/scripts/distill.awk new file mode 100644 index 0000000..db1c9d5 --- /dev/null +++ b/arch/x86/scripts/distill.awk @@ -0,0 +1,41 @@ +# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len +# Distills the disassembly as follows: +# - Removes all lines except the disassembled instructions. +# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes +# into a single line. 
+# - Remove bad(or prefix only) instructions + +BEGIN { + prev_addr = "" + prev_hex = "" + prev_mnemonic = "" + bad_expr = "(\\(bad\\)|^rex|^.byte|^es$|^cs$|^ss$|^ds$|^fs$|^gs$)" + fwait_expr = "^9b " + fwait_str="9b\tfwait" +} + +/^ *[0-9a-f]+:/ { + if (split($0, field, "\t") < 3) { + # This is a continuation of the same insn. + prev_hex = prev_hex field[2] + } else { + # Skip bad instructions + if (match(prev_mnemonic, bad_expr)) + prev_addr = "" + # Split fwait from other f* instructions + if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") { + printf "%s\t%s\n", prev_addr, fwait_str + sub(fwait_expr, "", prev_hex) + } + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic + prev_addr = field[1] + prev_hex = field[2] + prev_mnemonic = field[3] + } +} + +END { + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic +} diff --git a/arch/x86/scripts/test_get_len.c b/arch/x86/scripts/test_get_len.c new file mode 100644 index 0000000..99c6fd4 --- /dev/null +++ b/arch/x86/scripts/test_get_len.c @@ -0,0 +1,81 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include <asm/insn.h> + +/* + * Test of instruction analysis in general and insn_get_length() in + * particular. See if insn_get_length() and the disassembler agree + * on the length of each instruction in an elf disassembly. 
+ * + * usage: test_get_len < distilled_disassembly + */ + +const char *prog; + +static void usage() +{ + fprintf(stderr, "usage: %s < distilled_disassembly\n", prog); + exit(1); +} + +static void malformed_line(const char *line, int line_nr) +{ + fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line); + exit(3); +} + +#define BUFSIZE 256 + +int main(int argc, char **argv) +{ + char line[BUFSIZE]; + unsigned char insn_buf[16]; + struct insn insn; + int insns = 0; + + prog = argv[0]; + if (argc > 1) + usage(); + + while (fgets(line, BUFSIZE, stdin)) { + char copy[BUFSIZE], *s, *tab1, *tab2; + int nb = 0; + unsigned b; + + insns++; + memset(insn_buf, 0, 16); + strcpy(copy, line); + tab1 = strchr(copy, '\t'); + if (!tab1) + malformed_line(line, insns); + s = tab1 + 1; + s += strspn(s, " "); + tab2 = strchr(s, '\t'); + if (!tab2) + malformed_line(line, insns); + *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ + while (s < tab2) { + if (sscanf(s, "%x", &b) == 1) { + insn_buf[nb++] = (unsigned char) b; + s += 3; + } else + break; + } + /* Decode an instruction */ + kernel_insn_init(&insn, insn_buf); + insn_get_length(&insn); + if (insn.length != nb) { + fprintf(stderr, "Error: %s", line); + fprintf(stderr, "Error: objdump says %d bytes, but " + "insn_get_length() says %d (attr:%x)\n", nb, + insn.length, insn.attr); + exit(2); + } + } + fprintf(stderr, "Succeed: decoded and checked %d instructions\n", + insns); + return 0; +} diff --git a/arch/x86/scripts/user_include.h b/arch/x86/scripts/user_include.h new file mode 100644 index 0000000..3bdcc55 --- /dev/null +++ b/arch/x86/scripts/user_include.h @@ -0,0 +1,49 @@ +#ifndef __USER_TYPES_H +#define __USER_TYPES_H + +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <string.h> + +#ifdef __x86_64__ +#define CONFIG_X86_64 +#else +#define CONFIG_X86_32 +#endif +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long long u64; + +typedef signed char s8; +typedef short s16; +typedef int s32; +typedef long long s64; + +typedef enum bool { false = 0, true } bool; + +/* any harmless file-scope decl */ +#define NOP_DECL struct __nop +#define EXPORT_SYMBOL_GPL(symbol) NOP_DECL +#define MODULE_LICENSE(gpl) NOP_DECL + +#define WARN_ON(cond) do { } while (0) +#define unlikely(cond) (cond) + +#endif /* __USER_TYPES_H */ -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-23 01:18:00
|
Add a kprobes-based event tracer on ftrace. This tracer is similar to the events tracer, which is based on the Tracepoint infrastructure. Instead of Tracepoint, this tracer is based on kprobes (kprobe and kretprobe). It can probe anywhere kprobes can probe (that is, all function bodies except those of __kprobes functions). This tracer supports the following probe arguments for each probe: %REG : Fetch register REG sN : Fetch Nth entry of stack (N >= 0) @ADDR : Fetch memory at ADDR (ADDR should be in kernel) @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) aN : Fetch function argument. (N >= 0) rv : Fetch return value. ra : Fetch return address. +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address. Changes from v5: - Add __kprobes to functions which will be invoked by kprobes. - Merge tracer-core patch and trace-argument support patch. - Add kprobe_trace_entry and kretprobe_trace_entry for recording kprobe events. - Support common arguments filtering via tracing/events/ftrace/kprobe(or kretprobe)/filter. - Add a selftest. 
Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Christoph Hellwig <hc...@in...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> --- Documentation/trace/kprobes.txt | 81 +++ kernel/trace/Kconfig | 9 kernel/trace/Makefile | 1 kernel/trace/trace.h | 22 + kernel/trace/trace_event_types.h | 20 + kernel/trace/trace_kprobe.c | 903 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 1036 insertions(+), 0 deletions(-) create mode 100644 Documentation/trace/kprobes.txt create mode 100644 kernel/trace/trace_kprobe.c diff --git a/Documentation/trace/kprobes.txt b/Documentation/trace/kprobes.txt new file mode 100644 index 0000000..d7c3092 --- /dev/null +++ b/Documentation/trace/kprobes.txt @@ -0,0 +1,81 @@ + Kprobe-based Event Tracer + ========================= + + Documentation is written by Masami Hiramatsu + + +Overview +-------- +This tracer is similar to the events tracer which is based on Tracepoint +infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe +and kretprobe). It probes anywhere where kprobes can probe(this means, all +functions body except for __kprobes functions). + +Unlike the function tracer, this tracer can probe instructions inside of +kernel functions. It allows you to check which instruction has been executed. + +Unlike the Tracepoint based events tracer, this tracer can add new probe points +on the fly. + +Similar to the events tracer, this tracer doesn't need to be activated via +current_tracer, instead of that, just set probe points via +/debug/tracing/kprobe_events. 
+ + +Synopsis of kprobe_events +------------------------- + p SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : set a probe + r SYMBOL[+0] [FETCHARGS] : set a return probe + + FETCHARGS: + %REG : Fetch register REG + sN : Fetch Nth entry of stack (N >= 0) + @ADDR : Fetch memory at ADDR (ADDR should be in kernel) + @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) + aN : Fetch function argument. (N >= 0)(*) + rv : Fetch return value.(**) + ra : Fetch return address.(**) + +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***) + + (*) aN may not correct on asmlinkaged functions and at the middle of + function body. + (**) only for return probe. + (***) this is useful for fetching a field of data structures. + + +Usage examples +-------------- + + echo p do_sys_open a0 a1 a2 a3 > /debug/tracing/kprobe_events + + This sets a kprobe on the top of do_sys_open() function with recording +1st to 4th arguments. + + echo r do_sys_open rv ra >> /debug/tracing/kprobe_events + + This sets a kretprobe on the return point of do_sys_open() function with +recording return value and return address. + + echo > /debug/tracing/kprobe_events + + This clears all probe points. and you can see the traced information via +/debug/tracing/trace. 
+ + cat /debug/tracing/trace +# tracer: nop +# +# TASK-PID CPU# TIMESTAMP FUNCTION +# | | | | | + <...>-1447 [001] 1038282.286875: do_sys_open+0x0/0xd6: 0x3 0x7fffd1ec4440 0x8000 0x0 + <...>-1447 [001] 1038282.286878: sys_openat+0xc/0xe <-do_sys_open: 0xfffffffffffffffe 0xffffffff81367a3a + <...>-1447 [001] 1038282.286885: do_sys_open+0x0/0xd6: 0xffffff9c 0x40413c 0x8000 0x1b6 + <...>-1447 [001] 1038282.286915: sys_open+0x1b/0x1d <-do_sys_open: 0x3 0xffffffff81367a3a + <...>-1447 [001] 1038282.286969: do_sys_open+0x0/0xd6: 0xffffff9c 0x4041c6 0x98800 0x10 + <...>-1447 [001] 1038282.286976: sys_open+0x1b/0x1d <-do_sys_open: 0x3 0xffffffff81367a3a + + + Each line shows when the kernel hits a probe, and <- SYMBOL means kernel +returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel +returns from do_sys_open to sys_open+0x1b). + + diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f61be30..8927864 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -398,6 +398,15 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config KPROBE_TRACER + depends on KPROBES + depends on X86 + bool "Trace kprobes" + select TRACING + help + This tracer probes everywhere where kprobes can probe it, and + records various registers and memories specified by user. 
+ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 848e5ce..01ac95b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,5 +52,6 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_mm.o +obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6e735d4..5d7849b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -40,6 +40,8 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, + TRACE_KPROBE, + TRACE_KRETPROBE, __TRACE_LAST_TYPE, }; @@ -207,6 +209,22 @@ struct syscall_trace_exit { unsigned long ret; }; +#define TRACE_KPROBE_ARGS 6 + +struct kprobe_trace_entry { + struct trace_entry ent; + unsigned long ip; + int nargs; + unsigned long args[TRACE_KPROBE_ARGS]; +}; + +struct kretprobe_trace_entry { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; + int nargs; + unsigned long args[TRACE_KPROBE_ARGS]; +}; /* * trace_flag_type is an enumeration that holds different @@ -323,6 +341,10 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ + IF_ASSIGN(var, ent, struct kprobe_trace_entry, \ + TRACE_KPROBE); \ + IF_ASSIGN(var, ent, struct kretprobe_trace_entry, \ + TRACE_KRETPROBE); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5e32e37..3be3e32 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -172,4 +172,24 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ); +TRACE_EVENT_FORMAT(kprobe, TRACE_KPROBE, kprobe_trace_entry, 
ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(int, nargs, nargs) + TRACE_FIELD_SPECIAL(unsigned long args[TRACE_KPROBE_ARGS], + args, TRACE_KPROBE_ARGS, args) + ), + TP_RAW_FMT("%08lx: args:0x%lx ...") +); + +TRACE_EVENT_FORMAT(kretprobe, TRACE_KRETPROBE, kretprobe_trace_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, func, func) + TRACE_FIELD(unsigned long, ret_ip, ret_ip) + TRACE_FIELD(int, nargs, nargs) + TRACE_FIELD_SPECIAL(unsigned long args[TRACE_KPROBE_ARGS], + args, TRACE_KPROBE_ARGS, args) + ), + TP_RAW_FMT("%08lx <- %08lx: args:0x%lx ...") +); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 0000000..ecca548 --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,903 @@ +/* + * kprobe based kernel tracer + * + * Created by Masami Hiramatsu <mhi...@re...> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> + +#include "trace.h" +#include "trace_output.h" + +/* currently, trace_kprobe only supports X86. 
*/ + +struct fetch_func { + unsigned long (*func)(struct pt_regs *, void *); + void *data; +}; + +static __kprobes unsigned long call_fetch(struct fetch_func *f, + struct pt_regs *regs) +{ + return f->func(regs, f->data); +} + +/* fetch handlers */ +static __kprobes unsigned long fetch_register(struct pt_regs *regs, + void *offset) +{ + return get_register(regs, (unsigned)((unsigned long)offset)); +} + +static __kprobes unsigned long fetch_stack(struct pt_regs *regs, + void *num) +{ + return get_kernel_stack_nth(regs, (unsigned)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) +{ + unsigned long retval; + if (probe_kernel_address(addr, retval)) + return 0; + return retval; +} + +static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) +{ + return get_argument_nth(regs, (unsigned)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, + void *dummy) +{ + return regs_return_value(regs); +} + +static __kprobes unsigned long fetch_ip(struct pt_regs *regs, void *dummy) +{ + return instruction_pointer(regs); +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + if (sc->addr) + sc->addr += sc->offset; + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + if (!sym || strlen(sym) == 0) + return NULL; + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + + update_symbol_cache(sc); + return sc; +} + +static __kprobes unsigned long 
fetch_symbol(struct pt_regs *regs, void *data) +{ + struct symbol_cache *sc = data; + if (sc->addr) + return fetch_memory(regs, (void *)sc->addr); + else + return 0; +} + +/* Special indirect memory access interface */ +struct indirect_fetch_data { + struct fetch_func orig; + long offset; +}; + +static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) +{ + struct indirect_fetch_data *ind = data; + unsigned long addr; + addr = call_fetch(&ind->orig, regs); + if (addr) { + addr += ind->offset; + return fetch_memory(regs, (void *)addr); + } else + return 0; +} + +static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +{ + if (data->orig.func == fetch_indirect) + free_indirect_fetch_data(data->orig.data); + else if (data->orig.func == fetch_symbol) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/** + * kprobe_trace_core + */ + +struct trace_probe { + struct list_head list; + union { + struct kprobe kp; + struct kretprobe rp; + }; + const char *symbol; /* symbol name */ + unsigned int nr_args; + struct fetch_func args[TRACE_KPROBE_ARGS]; +}; + +static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs); + +static __kprobes int probe_is_return(struct trace_probe *tp) +{ + return (tp->rp.handler == kretprobe_trace_func); +} + +static __kprobes const char *probe_symbol(struct trace_probe *tp) +{ + return tp->symbol ? tp->symbol : "unknown"; +} + +static __kprobes long probe_offset(struct trace_probe *tp) +{ + return (probe_is_return(tp)) ? tp->rp.kp.offset : tp->kp.offset; +} + +static __kprobes void *probe_address(struct trace_probe *tp) +{ + return (probe_is_return(tp)) ? 
tp->rp.kp.addr : tp->kp.addr; +} + +static int trace_arg_string(char *buf, size_t n, struct fetch_func *ff) +{ + int ret = -EINVAL; + if (ff->func == fetch_argument) + ret = snprintf(buf, n, "a%lu", (unsigned long)ff->data); + else if (ff->func == fetch_register) { + const char *name; + name = query_register_name((unsigned)((long)ff->data)); + ret = snprintf(buf, n, "%%%s", name); + } else if (ff->func == fetch_stack) + ret = snprintf(buf, n, "s%lu", (unsigned long)ff->data); + else if (ff->func == fetch_memory) + ret = snprintf(buf, n, "@0x%p", ff->data); + else if (ff->func == fetch_symbol) { + struct symbol_cache *sc = ff->data; + ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset); + } else if (ff->func == fetch_retvalue) + ret = snprintf(buf, n, "rv"); + else if (ff->func == fetch_ip) + ret = snprintf(buf, n, "ra"); + else if (ff->func == fetch_indirect) { + struct indirect_fetch_data *id = ff->data; + ret = snprintf(buf, n, "%+ld(", id->offset); + if (ret > n) + goto end; + n -= ret; + ret = trace_arg_string(buf, n, &id->orig); + if (ret > n) + goto end; + n -= ret; + ret = snprintf(buf, n, ")"); + } +end: + if (ret > n) + return -ENOSPC; + return 0; +} + + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static struct trace_probe *alloc_trace_probe(const char *symbol) +{ + struct trace_probe *tp; + + tp = kzalloc(sizeof(struct trace_probe), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + if (symbol) { + tp->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tp->symbol) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + } + + INIT_LIST_HEAD(&tp->list); + return tp; +} + +static void free_trace_probe(struct trace_probe *tp) +{ + int i; + for (i = 0; i < tp->nr_args; i++) + if (tp->args[i].func == fetch_symbol) + free_symbol_cache(tp->args[i].data); + else if (tp->args[i].func == fetch_indirect) + free_indirect_fetch_data(tp->args[i].data); + + kfree(tp->symbol); + kfree(tp); +} + +static int register_trace_probe(struct trace_probe *tp) 
+{ + int ret; + + mutex_lock(&probe_lock); + list_add_tail(&tp->list, &probe_list); + + if (probe_is_return(tp)) + ret = register_kretprobe(&tp->rp); + else + ret = register_kprobe(&tp->kp); + + if (ret) { + pr_warning("Could not insert probe(%d)\n", ret); + if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + probe_address(tp)); + ret = -EINVAL; + } + list_del(&tp->list); + } + mutex_unlock(&probe_lock); + return ret; +} + +static void unregister_trace_probe(struct trace_probe *tp) +{ + if (probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->kp); + list_del(&tp->list); +} + +/* Split symbol and offset. */ +static int split_symbol_offset(char *symbol, long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (!tmp) + tmp = strchr(symbol, '-'); + + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtol(tmp + 1, 0, offset); + if (ret) + return ret; + if (*tmp == '-') + *offset = -(*offset); + *tmp = '\0'; + } else + *offset = 0; + return 0; +} + +#define PARAM_MAX_ARGS 16 +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) +#define MAX_ARGSTR_LEN 63 + +static int parse_trace_arg(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case 'a': /* argument */ + ret = strict_strtoul(arg + 1, 10, &param); + if (ret || param > PARAM_MAX_ARGS) + ret = -EINVAL; + else { + ff->func = fetch_argument; + ff->data = (void *)param; + } + break; + case 'r': /* retval or retaddr */ + if (is_return && arg[1] == 'v') { + ff->func = fetch_retvalue; + ff->data = NULL; + } else if (is_return && arg[1] == 'a') { + ff->func = fetch_ip; + ff->data = NULL; + } else + ret = -EINVAL; + break; + case '%': /* named register */ + ret = query_register_offset(arg + 1); + if (ret >= 0) { + ff->func = fetch_register; + ff->data = (void 
*)(unsigned long)ret; + ret = 0; + } + break; + case 's': /* stack */ + ret = strict_strtoul(arg + 1, 10, &param); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + ff->func = fetch_stack; + ff->data = (void *)param; + } + break; + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, &param); + if (ret) + break; + ff->func = fetch_memory; + ff->data = (void *)param; + } else { + ret = split_symbol_offset(arg + 1, &offset); + if (ret) + break; + ff->data = alloc_symbol_cache(arg + 1, + offset); + if (ff->data) + ff->func = fetch_symbol; + else + ret = -EINVAL; + } + break; + case '+': /* indirect memory */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) { + ret = -EINVAL; + break; + } + *tmp = '\0'; + ret = strict_strtol(arg + 1, 0, &offset); + if (ret) + break; + if (arg[0] == '-') + offset = -offset; + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (tmp) { + struct indirect_fetch_data *id; + *tmp = '\0'; + id = kzalloc(sizeof(struct indirect_fetch_data), + GFP_KERNEL); + if (!id) + return -ENOMEM; + id->offset = offset; + ret = parse_trace_arg(arg, &id->orig, is_return); + if (ret) + kfree(id); + else { + ff->func = fetch_indirect; + ff->data = (void *)id; + } + } else + ret = -EINVAL; + break; + default: + /* TODO: support custom handler */ + ret = -EINVAL; + } + return ret; +} + +static int create_trace_probe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p SYMBOL[+OFFS|-OFFS]|ADDRESS [FETCHARGS] + * - Add kretprobe: r SYMBOL[+0] [FETCHARGS] + * Fetch args: + * aN : fetch Nth of function argument. (N:0-) + * rv : fetch return value + * ra : fetch return address + * sN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Indirect memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. 
+ */ + struct trace_probe *tp; + struct kprobe *kp; + int i, ret = 0; + int is_return = 0; + char *symbol = NULL; + long offset = 0; + void *addr = NULL; + + if (argc < 2) + return -EINVAL; + + if (argv[0][0] == 'p') + is_return = 0; + else if (argv[0][0] == 'r') + is_return = 1; + else + return -EINVAL; + + if (isdigit(argv[1][0])) { + if (is_return) + return -EINVAL; + /* an address specified */ + ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + if (ret) + return ret; + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = split_symbol_offset(symbol, &offset); + if (ret) + return ret; + if (offset && is_return) + return -EINVAL; + } + + /* setup a probe */ + tp = alloc_trace_probe(symbol); + if (IS_ERR(tp)) + return PTR_ERR(tp); + + if (is_return) { + kp = &tp->rp.kp; + tp->rp.handler = kretprobe_trace_func; + } else { + kp = &tp->kp; + tp->kp.pre_handler = kprobe_trace_func; + } + + if (tp->symbol) { + /* TODO: check offset is collect by using insn_decoder */ + kp->symbol_name = tp->symbol; + kp->offset = offset; + } else + kp->addr = addr; + + /* parse arguments */ + argc -= 2; argv += 2; ret = 0; + for (i = 0; i < argc && i < TRACE_KPROBE_ARGS; i++) { + if (strlen(argv[i]) > MAX_ARGSTR_LEN) { + pr_info("Argument%d(%s) is too long.\n", i, argv[i]); + ret = -ENOSPC; + goto error; + } + ret = parse_trace_arg(argv[i], &tp->args[i], is_return); + if (ret) + goto error; + } + tp->nr_args = i; + + ret = register_trace_probe(tp); + if (ret) + goto error; + return 0; + +error: + free_trace_probe(tp); + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_probe *tp; + mutex_lock(&probe_lock); + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tp = list_entry(probe_list.next, struct trace_probe, list); + unregister_trace_probe(tp); + free_trace_probe(tp); + } + mutex_unlock(&probe_lock); +} + + +/* Probes listing interfaces */ +static void 
*probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + int i, ret; + char buf[MAX_ARGSTR_LEN + 1]; + + if (tp == NULL) + return 0; + + if (tp->symbol) + seq_printf(m, "%c %s%+ld", + probe_is_return(tp) ? 'r' : 'p', + probe_symbol(tp), probe_offset(tp)); + else + seq_printf(m, "%c 0x%p", + probe_is_return(tp) ? 'r' : 'p', + probe_address(tp)); + + for (i = 0; i < tp->nr_args; i++) { + ret = trace_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i]); + if (ret) { + pr_warning("Argument%d is too long.\n", i); + break; + } + seq_printf(m, " %s", buf); + } + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 128 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + if (!count || count < 0) + return 0; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + do { + size = count - done; + if (size > 
WRITE_BUFSIZE) + size = WRITE_BUFSIZE; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (!tmp) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + *tmp = '\0'; + size = tmp - kbuf + 1; + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + + } while (done < count); + ret = done; +out: + kfree(kbuf); + return ret; +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Kprobe handler */ +static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, kp); + struct kprobe_trace_entry *entry; + struct ring_buffer_event *event; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &event_kprobe; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = sizeof(struct kprobe_trace_entry) - + (sizeof(unsigned long) * (TRACE_KPROBE_ARGS - tp->nr_args)); + + event = trace_current_buffer_lock_reserve(TRACE_KPROBE, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i], regs); + + if (!filter_current_check_discard(call, entry, event)) + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); + return 0; +} + +/* Kretprobe handler */ +static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct kretprobe_trace_entry *entry; + struct 
ring_buffer_event *event; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &event_kretprobe; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = sizeof(struct kretprobe_trace_entry) - + (sizeof(unsigned long) * (TRACE_KPROBE_ARGS - tp->nr_args)); + + event = trace_current_buffer_lock_reserve(TRACE_KRETPROBE, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->func = (unsigned long)probe_address(tp); + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i], regs); + + if (!filter_current_check_discard(call, entry, event)) + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); + + return 0; +} + +/* Event entry printers */ +enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags) +{ + struct kprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + int i; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ":")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " 0x%lx", field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags) +{ + struct kretprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + int i; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, " <- ")) + goto partial; + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ":")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " 0x%lx", field->args[i])) + 
goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event kprobe_trace_event = { + .type = TRACE_KPROBE, + .trace = print_kprobe_event, +}; + +static struct trace_event kretprobe_trace_event = { + .type = TRACE_KRETPROBE, + .trace = print_kretprobe_event, +}; + +/* Make a debugfs interface for controling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + int ret; + + ret = register_ftrace_event(&kprobe_trace_event); + if (!ret) { + pr_warning("Could not register kprobe_trace_event type.\n"); + return 0; + } + ret = register_ftrace_event(&kretprobe_trace_event); + if (!ret) { + pr_warning("Could not register kretprobe_trace_event type.\n"); + return 0; + } + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret; + int (*target)(int, int, int, int, int, int); + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p kprobe_trace_selftest_target " + "a1 a2 a3 a4 a5 a6"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function entry\n"); + + ret = command_trace_probe("r kprobe_trace_selftest_target " + "ra rv"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function return\n"); + + ret = target(1, 2, 3, 4, 5, 6); + + cleanup_all_probes(); + + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif -- Masami 
Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-23 01:17:55
|
Cleanup fix_riprel() in arch/x86/kernel/kprobes.c by using x86 instruction decoder. Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Jim Keniston <jke...@us...> Cc: Ingo Molnar <mi...@el...> --- arch/x86/kernel/kprobes.c | 128 ++++++++------------------------------------- 1 files changed, 23 insertions(+), 105 deletions(-) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 41d524f..ebac470 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -108,50 +108,6 @@ static const u32 twobyte_is_boostable[256 / 32] = { /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* 
----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -344,68 +300,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static void __kprobes fix_riprel(struct kprobe *p) { #ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. 
*/ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kernel_insn_init(&insn, p->ainsn.insn); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. 
The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) p->addr + (s64) insn.displacement.value - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) p->ainsn.insn + INSN_DISPLACEMENT_OFFS(&insn); + *(s32 *) disp = (s32) newdisp; } #endif } -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-23 01:17:47
|
Add following APIs for accessing registers and stack entries from pt_regs. - query_register_offset(const char *name) Query the offset of "name" register. - query_register_name(unsigned offset) Query the name of register by its offset. - get_register(struct pt_regs *regs, unsigned offset) Get the value of a register by its offset. - within_kernel_stack(struct pt_regs *regs, unsigned long addr) Check the address is in the kernel stack. - get_kernel_stack_nth(struct pt_regs *reg, unsigned nth) Get Nth entry of the kernel stack. (N >= 0) - get_argument_nth(struct pt_regs *reg, unsigned nth) Get Nth argument at function call. (N >= 0) Changes from v5: - Rename valid_stack_address to within_kernel_stack. - Rename get_stack_nth to get_kernel_stack_nth. Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: Roland McGrath <ro...@re...> --- arch/x86/include/asm/ptrace.h | 67 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 60 +++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 0f0d908..577d625 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -7,6 +7,7 @@ #ifdef __KERNEL__ #include <asm/segment.h> +#include <asm/page_types.h> #endif #ifndef __ASSEMBLY__ @@ -216,6 +217,72 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->sp; } +/* Query offset/name of register from its name/offset */ +extern int query_register_offset(const char *name); +extern const char *query_register_name(unsigned offset); +#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) + +/* Get register value from its offset */ +static inline unsigned long get_register(struct pt_regs *regs, unsigned offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + return 
*(unsigned long *)((unsigned long)regs + offset); +} + +/* Check the address in the stack */ +static inline int within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + return ((addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); +} + +/* Get Nth entry of the stack */ +static inline unsigned long get_kernel_stack_nth(struct pt_regs *regs, + unsigned n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + addr += n; + if (within_kernel_stack(regs, (unsigned long)addr)) + return *addr; + else + return 0; +} + +/* Get Nth argument at function call */ +static inline unsigned long get_argument_nth(struct pt_regs *regs, unsigned n) +{ +#ifdef CONFIG_X86_32 +#define NR_REGPARMS 3 + if (n < NR_REGPARMS) { + switch (n) { + case 0: return regs->ax; + case 1: return regs->dx; + case 2: return regs->cx; + } + return 0; +#else /* CONFIG_X86_64 */ +#define NR_REGPARMS 6 + if (n < NR_REGPARMS) { + switch (n) { + case 0: return regs->di; + case 1: return regs->si; + case 2: return regs->dx; + case 3: return regs->cx; + case 4: return regs->r8; + case 5: return regs->r9; + } + return 0; +#endif + } else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + return get_kernel_stack_nth(regs, 1 + n - NR_REGPARMS); + } +} + /* * These are defined as per linux/ptrace.h, which see. 
*/ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde..00eb9d7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -48,6 +48,66 @@ enum x86_regset { REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET(r) offsetof(struct pt_regs, r) +#define REG_OFFSET_NAME(r) {.name = #r, .offset = REG_OFFSET(r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +int query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +const char *query_register_name(unsigned offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-23 01:16:31
|
Ensure safeness of inserting kprobes by checking whether the specified address is at the first byte of a instruction on x86. This is done by decoding probed function from its head to the probe point. Changes from v5: - Add comments to explain recover_probed_function() and decoding logic. Signed-off-by: Masami Hiramatsu <mhi...@re...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Jim Keniston <jke...@us...> Cc: Ingo Molnar <mi...@el...> --- arch/x86/kernel/kprobes.c | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 69 insertions(+), 0 deletions(-) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d..41d524f 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -48,12 +48,14 @@ #include <linux/preempt.h> #include <linux/module.h> #include <linux/kdebug.h> +#include <linux/kallsyms.h> #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/uaccess.h> #include <asm/alternative.h> +#include <asm/insn.h> void jprobe_return_end(void); @@ -244,6 +246,71 @@ retry: } } +/* Recover the probed instruction at addr for further analysis. */ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + kp = get_kprobe((void *)addr); + if (!kp) + return -EINVAL; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, fix_riprel() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. 
+ */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* Check if the instruction has been modified. */ + if (OPCODE1(&insn) == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -359,6 +426,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-21 19:08:26
|
Andi Kleen wrote: > This might be an unimportant detail, but I would suggest to fix the > "decorder" typo in the Subject before the next revision. It seems > to have persisted over 6 revisions now, but it still irritates me @) Oops, sorry... -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Andi K. <ak...@li...> - 2009-05-21 17:59:33
|
This might be an unimportant detail, but I would suggest to fix the "decorder" typo in the Subject before the next revision. It seems to have persisted over 6 revisions now, but it still irritates me @) -Andi |
From: Masami H. <mhi...@re...> - 2009-05-21 17:26:26
|
Add x86 instruction decoder to arch-specific libraries. This decoder can decode x86 instructions used in the kernel into prefix, opcode, modrm, sib, displacement and immediates. This can also show the length of instructions. This version introduces instruction attributes for decoding instructions. The instruction attribute tables are generated from the opcode map file (x86-opcode-map.txt) by the generator script (gen-insn-attr-x86.awk). Currently, the opcode maps are based on opcode maps in Intel(R) 64 and IA-32 Architectures Software Developers Manual Vol.2: Appendix.A, and consist of the following two types of opcode tables. 1-byte/2-bytes/3-bytes opcodes, which have 256 elements, are written as follows: Table: table-name Referrer: escaped-name opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] (or) opcode: escape # escaped-name EndTable Group opcodes, which have 8 elements, are written as follows: GrpTable: GrpXXX reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] EndTable These opcode maps do NOT include most of SSE and FP opcodes, because those opcodes are not used in the kernel. Changes from v6: - Use /bin/awk -f for shebang line of awk script (just for documentation's sake) - Set rex_prefix.got true after decoding prefixes. TODO: - Integrate user-space test harness as a build-time test. Signed-off-by: Masami Hiramatsu <mhi...@re...> Signed-off-by: Jim Keniston <jke...@us...> Cc: H. 
Peter Anvin <hp...@zy...> Cc: Steven Rostedt <ro...@go...> Cc: Ananth N Mavinakayanahalli <an...@in...> Cc: Ingo Molnar <mi...@el...> Cc: Frederic Weisbecker <fwe...@gm...> Cc: Andi Kleen <ak...@li...> Cc: Vegard Nossum <veg...@gm...> Cc: Avi Kivity <av...@re...> Cc: Przemysław Pawełczyk <prz...@pa...> --- arch/x86/include/asm/inat.h | 125 ++++++ arch/x86/include/asm/insn.h | 134 ++++++ arch/x86/lib/Makefile | 13 + arch/x86/lib/inat.c | 80 ++++ arch/x86/lib/insn.c | 471 +++++++++++++++++++++ arch/x86/lib/x86-opcode-map.txt | 711 ++++++++++++++++++++++++++++++++ arch/x86/scripts/gen-insn-attr-x86.awk | 314 ++++++++++++++ 7 files changed, 1848 insertions(+), 0 deletions(-) create mode 100644 arch/x86/include/asm/inat.h create mode 100644 arch/x86/include/asm/insn.h create mode 100644 arch/x86/lib/inat.c create mode 100644 arch/x86/lib/insn.c create mode 100644 arch/x86/lib/x86-opcode-map.txt create mode 100644 arch/x86/scripts/gen-insn-attr-x86.awk diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h new file mode 100644 index 0000000..01e079a --- /dev/null +++ b/arch/x86/include/asm/inat.h @@ -0,0 +1,125 @@ +#ifndef _ASM_INAT_INAT_H +#define _ASM_INAT_INAT_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu <mhi...@re...> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include <linux/types.h> + +/* Instruction attributes */ +typedef u32 insn_attr_t; + +/* + * Internal bits. Don't use bitmasks directly, because these bits are + * unstable. You should add checking macros and use that macro in + * your code. + */ + +#define INAT_OPCODE_TABLE_SIZE 256 +#define INAT_GROUP_TABLE_SIZE 8 + +/* Legacy instruction prefixes */ +#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ +#define INAT_PFX_REPNE 2 /* 0xF2 */ /* LPFX2 */ +#define INAT_PFX_REPE 3 /* 0xF3 */ /* LPFX3 */ +#define INAT_PFX_LOCK 4 /* 0xF0 */ +#define INAT_PFX_CS 5 /* 0x2E */ +#define INAT_PFX_DS 6 /* 0x3E */ +#define INAT_PFX_ES 7 /* 0x26 */ +#define INAT_PFX_FS 8 /* 0x64 */ +#define INAT_PFX_GS 9 /* 0x65 */ +#define INAT_PFX_SS 10 /* 0x36 */ +#define INAT_PFX_ADDRSZ 11 /* 0x67 */ + +#define INAT_LPREFIX_MAX 3 + +/* Immediate size */ +#define INAT_IMM_BYTE 1 +#define INAT_IMM_WORD 2 +#define INAT_IMM_DWORD 3 +#define INAT_IMM_QWORD 4 +#define INAT_IMM_PTR 5 +#define INAT_IMM_VWORD32 6 +#define INAT_IMM_VWORD 7 + +/* Legacy prefix */ +#define INAT_PFX_OFFS 0 +#define INAT_PFX_BITS 4 +#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) +#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) +/* Escape opcodes */ +#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS) +#define INAT_ESC_BITS 2 +#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1) +#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS) +/* Group opcodes (1-16) */ +#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS) +#define INAT_GRP_BITS 5 +#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1) +#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS) +/* Immediates */ +#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS) +#define INAT_IMM_BITS 3 +#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << 
INAT_IMM_OFFS) +/* Flags */ +#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) +#define INAT_REXPFX (1 << INAT_FLAG_OFFS) +#define INAT_MODRM (1 << (INAT_FLAG_OFFS + 1)) +#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 2)) +#define INAT_ADDIMM (1 << (INAT_FLAG_OFFS + 3)) +#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 4)) +#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 5)) + +/* Attribute search APIs */ +extern insn_attr_t inat_get_opcode_attribute(u8 opcode); +extern insn_attr_t inat_get_escape_attribute(u8 opcode, u8 last_pfx, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_group_attribute(u8 modrm, u8 last_pfx, + insn_attr_t esc_attr); + +/* Attribute checking macros. Use these macros in your code */ +#define INAT_IS_PREFIX(attr) (attr & INAT_PFX_MASK) +#define INAT_IS_ADDRSZ(attr) ((attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ) +#define INAT_IS_OPNDSZ(attr) ((attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ) +#define INAT_LPREFIX_NUM(attr) \ + (((attr & INAT_PFX_MASK) > INAT_LPREFIX_MAX) ? 0 :\ + (attr & INAT_PFX_MASK)) +#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) + +#define INAT_IS_ESCAPE(attr) (attr & INAT_ESC_MASK) +#define INAT_ESCAPE_NUM(attr) ((attr & INAT_ESC_MASK) >> INAT_ESC_OFFS) +#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) + +#define INAT_IS_GROUP(attr) (attr & INAT_GRP_MASK) +#define INAT_GROUP_NUM(attr) ((attr & INAT_GRP_MASK) >> INAT_GRP_OFFS) +#define INAT_GROUP_COMMON(attr) (attr & ~INAT_GRP_MASK) +#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) + +#define INAT_HAS_IMM(attr) (attr & INAT_IMM_MASK) +#define INAT_IMM_SIZE(attr) ((attr & INAT_IMM_MASK) >> INAT_IMM_OFFS) +#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + +#define INAT_IS_REX_PREFIX(attr) (attr & INAT_REXPFX) +#define INAT_HAS_MODRM(attr) (attr & INAT_MODRM) +#define INAT_IS_FORCE64(attr) (attr & INAT_FORCE64) +#define INAT_HAS_ADDIMM(attr) (attr & INAT_ADDIMM) +#define INAT_HAS_MOFFSET(attr) (attr & INAT_MOFFSET) +#define INAT_HAS_VARIANT(attr) (attr & 
INAT_VARIANT) + +#endif diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h new file mode 100644 index 0000000..5b50fa3 --- /dev/null +++ b/arch/x86/include/asm/insn.h @@ -0,0 +1,134 @@ +#ifndef _ASM_X86_INSN_H +#define _ASM_X86_INSN_H +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <linux/types.h> +/* insn_attr_t is defined in inat.h */ +#include <asm/inat.h> + +struct insn_field { + union { + s32 value; + u8 bytes[4]; + }; + bool got; /* true if we've run insn_get_xxx() for this field */ + u8 nbytes; +}; + +struct insn { + struct insn_field prefixes; /* + * Prefixes + * prefixes.bytes[3]: last prefix + */ + struct insn_field rex_prefix; /* REX prefix */ + struct insn_field opcode; /* + * opcode.bytes[0]: opcode1 + * opcode.bytes[1]: opcode2 + * opcode.bytes[2]: opcode3 + */ + struct insn_field modrm; + struct insn_field sib; + struct insn_field displacement; + union { + struct insn_field immediate; + struct insn_field moffset1; /* for 64bit MOV */ + struct insn_field immediate1; /* for 64bit imm or off16/32 */ + }; + union { + struct insn_field moffset2; /* for 64bit MOV */ + struct insn_field immediate2; /* for 64bit imm or seg16 */ + }; + + insn_attr_t attr; + u8 opnd_bytes; + u8 addr_bytes; + u8 length; + bool x86_64; + + const u8 *kaddr; /* kernel address of insn (copy) to analyze */ + const u8 *next_byte; +}; + +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) + +#define MODRM_MOD(insn) (((insn)->modrm.value & 0xc0) >> 6) +#define MODRM_REG(insn) (((insn)->modrm.value & 0x38) >> 3) +#define MODRM_RM(insn) ((insn)->modrm.value & 0x07) + +#define SIB_SCALE(insn) (((insn)->sib.value & 0xc0) >> 6) +#define SIB_INDEX(insn) (((insn)->sib.value & 0x38) >> 3) +#define SIB_BASE(insn) ((insn)->sib.value & 0x07) + +#define REX_W(insn) ((insn)->rex_prefix.value & 8) +#define REX_R(insn) ((insn)->rex_prefix.value & 4) +#define REX_X(insn) ((insn)->rex_prefix.value & 2) +#define REX_B(insn) ((insn)->rex_prefix.value & 1) + +/* The last prefix is needed for two-byte and three-byte opcodes */ +#define LAST_PREFIX(insn) ((insn)->prefixes.bytes[3]) + +#define MOFFSET64(insn) 
(((u64)((insn)->moffset2.value) << 32) | \ + (u32)((insn)->moffset1.value)) + +#define IMMEDIATE64(insn) (((u64)((insn)->immediate2.value) << 32) | \ + (u32)((insn)->immediate1.value)) + +extern void insn_init(struct insn *insn, const u8 *kaddr, bool x86_64); +extern void insn_get_prefixes(struct insn *insn); +extern void insn_get_opcode(struct insn *insn); +extern void insn_get_modrm(struct insn *insn); +extern void insn_get_sib(struct insn *insn); +extern void insn_get_displacement(struct insn *insn); +extern void insn_get_immediate(struct insn *insn); +extern void insn_get_length(struct insn *insn); + +/* Attribute will be determined after getting ModRM (for opcode groups) */ +static inline void insn_get_attr(struct insn *insn) +{ + insn_get_modrm(insn); +} + +/* Instruction uses RIP-relative addressing */ +extern bool insn_rip_relative(struct insn *insn); + +#ifdef CONFIG_X86_64 +/* Init insn for kernel text */ +#define kernel_insn_init(insn, kaddr) insn_init(insn, kaddr, 1) +#else /* CONFIG_X86_32 */ +#define kernel_insn_init(insn, kaddr) insn_init(insn, kaddr, 0) +#endif + +#define INSN_PREFIXES_OFFS(insn) (0) +#define INSN_REXPREFIX_OFFS(insn) ((insn)->prefixes.nbytes) +#define INSN_OPCODE_OFFS(insn) (INSN_REXPREFIX_OFFS(insn) + \ + ((insn)->rex_prefix.nbytes)) +#define INSN_MODRM_OFFS(insn) (INSN_OPCODE_OFFS(insn) + \ + ((insn)->opcode.nbytes)) +#define INSN_SIB_OFFS(insn) (INSN_MODRM_OFFS(insn) + \ + ((insn)->modrm.nbytes)) +#define INSN_DISPLACEMENT_OFFS(insn) (INSN_SIB_OFFS(insn) + \ + ((insn)->sib.nbytes)) +#define INSN_IMMEDIATE_OFFS(insn) (INSN_DISPLACEMENT_OFFS(insn) + \ + ((insn)->displacement.nbytes)) + +#endif /* _ASM_X86_INSN_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 55e11aa..db0e3be 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,12 +2,25 @@ # Makefile for x86 specific library files. 
# +inat_tables_script = $(srctree)/arch/x86/scripts/gen-insn-attr-x86.awk +inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt +quiet_cmd_inat_tables = GEN $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + +$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call cmd,inat_tables) + +$(obj)/inat.o: $(obj)/inat-tables.c + +clean-files := inat-tables.c + obj-$(CONFIG_SMP) := msr-on-cpu.o lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += insn.o inat.o ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c new file mode 100644 index 0000000..d6a34be --- /dev/null +++ b/arch/x86/lib/inat.c @@ -0,0 +1,80 @@ +/* + * x86 instruction attribute tables + * + * Written by Masami Hiramatsu <mhi...@re...> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + */ +#include <linux/module.h> +#include <asm/insn.h> + +/* Attribute tables are generated from opcode map */ +#include "inat-tables.c" + +/* Attribute search APIs */ +insn_attr_t inat_get_opcode_attribute(u8 opcode) +{ + return inat_primary_table[opcode]; +} + +insn_attr_t inat_get_escape_attribute(u8 opcode, u8 last_pfx, + insn_attr_t esc_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = INAT_ESCAPE_NUM(esc_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = INAT_LPREFIX_NUM(lpfx_attr); + } + table = inat_escape_tables[n][0]; + if (!table) + return 0; + if (INAT_HAS_VARIANT(table[opcode]) && m) { + table = inat_escape_tables[n][m]; + if (!table) + return 0; + } + return table[opcode]; +} + +#define REGBITS(modrm) (((modrm) >> 3) & 0x7) + +insn_attr_t inat_get_group_attribute(u8 modrm, u8 last_pfx, + insn_attr_t grp_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = INAT_GROUP_NUM(grp_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = INAT_LPREFIX_NUM(lpfx_attr); + } + table = inat_group_tables[n][0]; + if (!table) + return INAT_GROUP_COMMON(grp_attr); + if (INAT_HAS_VARIANT(table[REGBITS(modrm)]) && m) { + table = inat_group_tables[n][m]; /* n is a group number: use the group's last-prefix variant table */ + if (!table) + return INAT_GROUP_COMMON(grp_attr); + } + return table[REGBITS(modrm)] | INAT_GROUP_COMMON(grp_attr); +} + diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c new file mode 100644 index 0000000..254c848 --- /dev/null +++ b/arch/x86/lib/insn.c @@ -0,0 +1,471 @@ +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004, 2009 + */ + +#include <linux/string.h> +#include <linux/module.h> +#include <asm/inat.h> +#include <asm/insn.h> + +#define get_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define peek_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; r; }) + +/** + * insn_init() - initialize struct insn + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @x86_64: true for 64-bit kernel or 64-bit app + */ +void insn_init(struct insn *insn, const u8 *kaddr, bool x86_64) +{ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->next_byte = kaddr; + insn->x86_64 = x86_64; + insn->opnd_bytes = 4; + if (x86_64) + insn->addr_bytes = 8; + else + insn->addr_bytes = 4; +} +EXPORT_SYMBOL_GPL(insn_init); + +/** + * insn_get_prefixes - scan x86 instruction prefix bytes + * @insn: &struct insn containing instruction + * + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte + * to point to the (first) opcode. No effect if @insn->prefixes.got + * is already true. 
+ */ +void insn_get_prefixes(struct insn *insn) +{ + struct insn_field *prefixes = &insn->prefixes; + insn_attr_t attr; + u8 b, lb, i, nb; + + if (prefixes->got) + return; + + nb = 0; + lb = 0; + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + while (INAT_IS_PREFIX(attr)) { + /* Skip if same prefix */ + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == b) + goto found; + if (nb == 4) + /* Invalid instruction */ + break; + prefixes->bytes[nb++] = b; + if (INAT_IS_ADDRSZ(attr)) { + /* address size switches 2/4 or 4/8 */ + if (insn->x86_64) + insn->addr_bytes ^= 12; + else + insn->addr_bytes ^= 6; + } else if (INAT_IS_OPNDSZ(attr)) { + /* operand size switches 2/4 */ + insn->opnd_bytes ^= 6; + } +found: + prefixes->nbytes++; + insn->next_byte++; + lb = b; + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + } + /* Set the last prefix */ + if (lb && lb != LAST_PREFIX(insn)) { + if (unlikely(LAST_PREFIX(insn))) { + /* Swap the last prefix */ + b = LAST_PREFIX(insn); + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == lb) + prefixes->bytes[i] = b; + } + LAST_PREFIX(insn) = lb; + } + + if (insn->x86_64) { + b = peek_next(u8, insn); + attr = inat_get_opcode_attribute(b); + if (INAT_IS_REX_PREFIX(attr)) { + insn->rex_prefix.value = b; + insn->rex_prefix.nbytes = 1; + insn->next_byte++; + if (REX_W(insn)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } + } + insn->rex_prefix.got = true; + prefixes->got = true; + return; +} +EXPORT_SYMBOL_GPL(insn_get_prefixes); + +/** + * insn_get_opcode - collect opcode(s) + * @insn: &struct insn containing instruction + * + * Populates @insn->opcode, updates @insn->next_byte to point past the + * opcode byte(s), and sets @insn->attr (except for groups). + * If necessary, first collects any preceding (prefix) bytes. + * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got + * is already true. 
+ * + */ +void insn_get_opcode(struct insn *insn) +{ + struct insn_field *opcode = &insn->opcode; + u8 op, pfx; + if (opcode->got) + return; + if (!insn->prefixes.got) + insn_get_prefixes(insn); + + /* Get first opcode */ + op = get_next(u8, insn); + OPCODE1(insn) = op; + opcode->nbytes = 1; + insn->attr = inat_get_opcode_attribute(op); + while (INAT_IS_ESCAPE(insn->attr)) { + /* Get escaped opcode */ + op = get_next(u8, insn); + opcode->bytes[opcode->nbytes++] = op; + pfx = LAST_PREFIX(insn); + insn->attr = inat_get_escape_attribute(op, pfx, insn->attr); + } + opcode->got = true; +} +EXPORT_SYMBOL_GPL(insn_get_opcode); + +/** + * insn_get_modrm - collect ModRM byte, if any + * @insn: &struct insn containing instruction + * + * Populates @insn->modrm and updates @insn->next_byte to point past the + * ModRM byte, if any. If necessary, first collects the preceding bytes + * (prefixes and opcode(s)). No effect if @insn->modrm.got is already true. + */ +void insn_get_modrm(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + u8 pfx, mod; + if (modrm->got) + return; + if (!insn->opcode.got) + insn_get_opcode(insn); + + if (INAT_HAS_MODRM(insn->attr)) { + mod = get_next(u8, insn); + modrm->value = mod; + modrm->nbytes = 1; + if (INAT_IS_GROUP(insn->attr)) { + pfx = LAST_PREFIX(insn); + insn->attr = inat_get_group_attribute(mod, pfx, + insn->attr); + } + } + + if (insn->x86_64 && INAT_IS_FORCE64(insn->attr)) + insn->opnd_bytes = 8; + modrm->got = true; +} +EXPORT_SYMBOL_GPL(insn_get_modrm); + + +/** + * insn_rip_relative() - Does instruction use RIP-relative addressing mode? + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->x86_64 is false. 
+ */ +bool insn_rip_relative(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + + if (!insn->x86_64) + return false; + if (!modrm->got) + insn_get_modrm(insn); + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); +} +EXPORT_SYMBOL_GPL(insn_rip_relative); + +/** + * + * insn_get_sib() - Get the SIB byte of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. + */ +void insn_get_sib(struct insn *insn) +{ + if (insn->sib.got) + return; + if (!insn->modrm.got) + insn_get_modrm(insn); + if (insn->modrm.nbytes) + if (insn->addr_bytes != 2 && + MODRM_MOD(insn) != 3 && MODRM_RM(insn) == 4) { + insn->sib.value = get_next(u8, insn); + insn->sib.nbytes = 1; + } + insn->sib.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_sib); + + +/** + * + * insn_get_displacement() - Get the displacement of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * SIB byte. + * Displacement value is sign-extended. + */ +void insn_get_displacement(struct insn *insn) +{ + u8 mod; + if (insn->displacement.got) + return; + if (!insn->sib.got) + insn_get_sib(insn); + if (insn->modrm.nbytes) { + /* + * Interpreting the modrm byte: + * mod = 00 - no displacement fields (exceptions below) + * mod = 01 - 1-byte displacement field + * mod = 10 - displacement field is 4 bytes, or 2 bytes if + * address size = 2 (0x67 prefix in 32-bit mode) + * mod = 11 - no memory operand + * + * If address size = 2... + * mod = 00, r/m = 110 - displacement field is 2 bytes + * + * If address size != 2... 
+ * mod != 11, r/m = 100 - SIB byte exists + * mod = 00, SIB base = 101 - displacement field is 4 bytes + * mod = 00, r/m = 101 - rip-relative addressing, displacement + * field is 4 bytes + */ + mod = MODRM_MOD(insn); + if (mod == 3) + goto out; + if (mod == 1) { + insn->displacement.value = get_next(s8, insn); + insn->displacement.nbytes = 1; + } else if (insn->addr_bytes == 2) { + if ((mod == 0 && MODRM_RM(insn) == 6) || mod == 2) { + insn->displacement.value = get_next(s16, insn); + insn->displacement.nbytes = 2; + } + } else { + if ((mod == 0 && MODRM_RM(insn) == 5) || mod == 2 || + (mod == 0 && SIB_BASE(insn) == 5)) { + insn->displacement.value = get_next(s32, insn); + insn->displacement.nbytes = 4; + } + } + } +out: + insn->displacement.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_displacement); + +/* Decode moffset16/32/64 */ +static void __get_moffset(struct insn *insn) +{ + switch (insn->addr_bytes) { + case 2: + insn->moffset1.value = get_next(s16, insn); + insn->moffset1.nbytes = 2; + break; + case 4: + insn->moffset1.value = get_next(s32, insn); + insn->moffset1.nbytes = 4; + break; + case 8: + insn->moffset1.value = get_next(s32, insn); + insn->moffset1.nbytes = 4; + insn->moffset2.value = get_next(s32, insn); + insn->moffset2.nbytes = 4; + break; + } + insn->moffset1.got = insn->moffset2.got = true; +} + +/* Decode imm v32(Iz) */ +static void __get_immv32(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate.value = get_next(s16, insn); + insn->immediate.nbytes = 2; + break; + case 4: + case 8: + insn->immediate.value = get_next(s32, insn); + insn->immediate.nbytes = 4; + break; + } +} + +/* Decode imm v64(Iv/Ov) */ +static void __get_immv(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(s16, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + insn->immediate1.value = 
get_next(s32, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(s32, insn); + insn->immediate2.nbytes = 4; + break; + } + insn->immediate1.got = insn->immediate2.got = true; +} + +/* Decode ptr16:16/32(Ap) */ +static void __get_immptr(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(s16, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + /* ptr16:64 is not supported (no segment) */ + WARN_ON(1); + return; + } + insn->immediate2.value = get_next(u16, insn); + insn->immediate2.nbytes = 2; + insn->immediate1.got = insn->immediate2.got = true; +} + +/** + * + * insn_get_immediate() - Get the immediates of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * displacement bytes. + * Basically, most immediates are sign-extended. The unsigned value can be + * obtained by masking with ((1ULL << (nbytes * 8)) - 1) + */ +void insn_get_immediate(struct insn *insn) +{ + if (insn->immediate.got) + return; + if (!insn->displacement.got) + insn_get_displacement(insn); + + if (INAT_HAS_MOFFSET(insn->attr)) { + __get_moffset(insn); + goto done; + } + + if (!INAT_HAS_IMM(insn->attr)) + /* no immediates */ + goto done; + + switch (INAT_IMM_SIZE(insn->attr)) { + case INAT_IMM_BYTE: + insn->immediate.value = get_next(s8, insn); + insn->immediate.nbytes = 1; + break; + case INAT_IMM_WORD: + insn->immediate.value = get_next(s16, insn); + insn->immediate.nbytes = 2; + break; + case INAT_IMM_DWORD: + insn->immediate.value = get_next(s32, insn); + insn->immediate.nbytes = 4; + break; + case INAT_IMM_QWORD: + insn->immediate1.value = get_next(s32, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(s32, insn); + insn->immediate2.nbytes = 4; + break; + case INAT_IMM_PTR: + __get_immptr(insn); + break; + case 
INAT_IMM_VWORD32: + __get_immv32(insn); + break; + case INAT_IMM_VWORD: + __get_immv(insn); + break; + default: + break; + } + if (INAT_HAS_ADDIMM(insn->attr)) { + insn->immediate2.value = get_next(s8, insn); + insn->immediate2.nbytes = 1; + } +done: + insn->immediate.got = true; +} +EXPORT_SYMBOL_GPL(insn_get_immediate); + +/** + * + * insn_get_length() - Get the length of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * immediates bytes. + */ +void insn_get_length(struct insn *insn) +{ + if (insn->length) + return; + if (!insn->immediate.got) + insn_get_immediate(insn); + insn->length = (u8)((unsigned long)insn->next_byte + - (unsigned long)insn->kaddr); +} +EXPORT_SYMBOL_GPL(insn_get_length); diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt new file mode 100644 index 0000000..ab2a58d --- /dev/null +++ b/arch/x86/lib/x86-opcode-map.txt @@ -0,0 +1,711 @@ +# x86 Opcode Maps +# +#<Opcode maps> +# Table: table-name +# Referrer: escaped-name +# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# (or) +# opcode: escape # escaped-name +# EndTable +# +#<group maps> +# GrpTable: GrpXXX +# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] 
+# EndTable +# + +Table: one byte opcode +Referrer: +# 0x00 - 0x0f +00: ADD Eb,Gb +01: ADD Ev,Gv +02: ADD Gb,Eb +03: ADD Gv,Ev +04: ADD AL,Ib +05: ADD rAX,Iz +06: PUSH ES (i64) +07: POP ES (i64) +08: OR Eb,Gb +09: OR Ev,Gv +0a: OR Gb,Eb +0b: OR Gv,Ev +0c: OR AL,Ib +0d: OR rAX,Iz +0e: PUSH CS (i64) +0f: escape # 2-byte escape +# 0x10 - 0x1f +10: ADC Eb,Gb +11: ADC Ev,Gv +12: ADC Gb,Eb +13: ADC Gv,Ev +14: ADC AL,Ib +15: ADC rAX,Iz +16: PUSH SS (i64) +17: POP SS (i64) +18: SBB Eb,Gb +19: SBB Ev,Gv +1a: SBB Gb,Eb +1b: SBB Gv,Ev +1c: SBB AL,Ib +1d: SBB rAX,Iz +1e: PUSH DS (i64) +1f: POP DS (i64) +# 0x20 - 0x2f +20: AND Eb,Gb +21: AND Ev,Gv +22: AND Gb,Eb +23: AND Gv,Ev +24: AND AL,Ib +25: AND rAx,Iz +26: SEG=ES (Prefix) +27: DAA (i64) +28: SUB Eb,Gb +29: SUB Ev,Gv +2a: SUB Gb,Eb +2b: SUB Gv,Ev +2c: SUB AL,Ib +2d: SUB rAX,Iz +2e: SEG=CS (Prefix) +2f: DAS (i64) +# 0x30 - 0x3f +30: XOR Eb,Gb +31: XOR Ev,Gv +32: XOR Gb,Eb +33: XOR Gv,Ev +34: XOR AL,Ib +35: XOR rAX,Iz +36: SEG=SS (Prefix) +37: AAA (i64) +38: CMP Eb,Gb +39: CMP Ev,Gv +3a: CMP Gb,Eb +3b: CMP Gv,Ev +3c: CMP AL,Ib +3d: CMP rAX,Iz +3e: SEG=DS (Prefix) +3f: AAS (i64) +# 0x40 - 0x4f +40: INC eAX (i64) | REX (o64) +41: INC eCX (i64) | REX.B (o64) +42: INC eDX (i64) | REX.X (o64) +43: INC eBX (i64) | REX.XB (o64) +44: INC eSP (i64) | REX.R (o64) +45: INC eBP (i64) | REX.RB (o64) +46: INC eSI (i64) | REX.RX (o64) +47: INC eDI (i64) | REX.RXB (o64) +48: DEC eAX (i64) | REX.W (o64) +49: DEC eCX (i64) | REX.WB (o64) +4a: DEC eDX (i64) | REX.WX (o64) +4b: DEC eBX (i64) | REX.WXB (o64) +4c: DEC eSP (i64) | REX.WR (o64) +4d: DEC eBP (i64) | REX.WRB (o64) +4e: DEC eSI (i64) | REX.WRX (o64) +4f: DEC eDI (i64) | REX.WRXB (o64) +# 0x50 - 0x5f +50: PUSH rAX/r8 (d64) +51: PUSH rCX/r9 (d64) +52: PUSH rDX/r10 (d64) +53: PUSH rBX/r11 (d64) +54: PUSH rSP/r12 (d64) +55: PUSH rBP/r13 (d64) +56: PUSH rSI/r14 (d64) +57: PUSH rDI/r15 (d64) +58: POP rAX/r8 (d64) +59: POP rCX/r9 (d64) +5a: POP rDX/r10 (d64) +5b: POP rBX/r11 (d64) +5c: POP 
rSP/r12 (d64) +5d: POP rBP/r13 (d64) +5e: POP rSI/r14 (d64) +5f: POP rDI/r15 (d64) +# 0x60 - 0x6f +60: PUSHA/PUSHAD (i64) +61: POPA/POPAD (i64) +62: BOUND Gv,Ma (i64) +63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) +64: SEG=FS (Prefix) +65: SEG=GS (Prefix) +66: Operand-Size (Prefix) +67: Address-Size (Prefix) +68: PUSH Iz (d64) +69: IMUL Gv,Ev,Iz +6a: PUSH Ib (d64) +6b: IMUL Gv,Ev,Ib +6c: INS/INSB Yb,DX +6d: INS/INSW/INSD Yz,DX +6e: OUTS/OUTSB DX,Xb +6f: OUTS/OUTSW/OUTSD DX,Xz +# 0x70 - 0x7f +70: JO Jb +71: JNO Jb +72: JB/JNAE/JC Jb +73: JNB/JAE/JNC Jb +74: JZ/JE Jb +75: JNZ/JNE Jb +76: JBE/JNA Jb +77: JNBE/JA Jb +78: JS Jb +79: JNS Jb +7a: JP/JPE Jb +7b: JNP/JPO Jb +7c: JL/JNGE Jb +7d: JNL/JGE Jb +7e: JLE/JNG Jb +7f: JNLE/JG Jb +# 0x80 - 0x8f +80: Grp1 Eb,Ib (1A) +81: Grp1 Ev,Iz (1A) +82: Grp1 Eb,Ib (1A),(i64) +83: Grp1 Ev,Ib (1A) +84: TEST Eb,Gb +85: TEST Ev,Gv +86: XCHG Eb,Gb +87: XCHG Ev,Gv +88: MOV Eb,Gb +89: MOV Ev,Gv +8a: MOV Gb,Eb +8b: MOV Gv,Ev +8c: MOV Ev,Sw +8d: LEA Gv,M +8e: MOV Sw,Ew +8f: Grp1A (1A) | POP Ev (d64) +# 0x90 - 0x9f +90: NOP | PAUSE (F3) | XCHG r8,rAX +91: XCHG rCX/r9,rAX +92: XCHG rDX/r10,rAX +93: XCHG rBX/r11,rAX +94: XCHG rSP/r12,rAX +95: XCHG rBP/r13,rAX +96: XCHG rSI/r14,rAX +97: XCHG rDI/r15,rAX +98: CBW/CWDE/CDQE +99: CWD/CDQ/CQO +9a: CALLF Ap (i64) +9b: FWAIT/WAIT +9c: PUSHF/D/Q Fv (d64) +9d: POPF/D/Q Fv (d64) +9e: SAHF +9f: LAHF +# 0xa0 - 0xaf +a0: MOV AL,Ob +a1: MOV rAX,Ov +a2: MOV Ob,AL +a3: MOV Ov,rAX +a4: MOVS/B Xb,Yb +a5: MOVS/W/D/Q Xv,Yv +a6: CMPS/B Xb,Yb +a7: CMPS/W/D Xv,Yv +a8: TEST AL,Ib +a9: TEST rAX,Iz +aa: STOS/B Yb,AL +ab: STOS/W/D/Q Yv,rAX +ac: LODS/B AL,Xb +ad: LODS/W/D/Q rAX,Xv +ae: SCAS/B AL,Yb +af: SCAS/W/D/Q rAX,Xv +# 0xb0 - 0xbf +b0: MOV AL/R8L,Ib +b1: MOV CL/R9L,Ib +b2: MOV DL/R10L,Ib +b3: MOV BL/R11L,Ib +b4: MOV AH/R12L,Ib +b5: MOV CH/R13L,Ib +b6: MOV DH/R14L,Ib +b7: MOV BH/R15L,Ib +b8: MOV rAX/r8,Iv +b9: MOV rCX/r9,Iv +ba: MOV rDX/r10,Iv +bb: MOV rBX/r11,Iv +bc: MOV rSP/r12,Iv +bd: MOV rBP/r13,Iv +be: MOV 
rSI/r14,Iv +bf: MOV rDI/r15,Iv +# 0xc0 - 0xcf +c0: Grp2 Eb,Ib (1A) +c1: Grp2 Ev,Ib (1A) +c2: RETN Iw (f64) +c3: RETN +c4: LES Gz,Mp (i64) +c5: LDS Gz,Mp (i64) +c6: Grp11 Eb,Ib (1A) +c7: Grp11 Ev,Iz (1A) +c8: ENTER Iw,Ib +c9: LEAVE (d64) +ca: RETF Iw +cb: RETF +cc: INT3 +cd: INT Ib +ce: INTO (i64) +cf: IRET/D/Q +# 0xd0 - 0xdf +d0: Grp2 Eb,1 (1A) +d1: Grp2 Ev,1 (1A) +d2: Grp2 Eb,CL (1A) +d3: Grp2 Ev,CL (1A) +d4: AAM Ib (i64) +d5: AAD Ib (i64) +d6: +d7: XLAT/XLATB +d8: ESC +d9: ESC +da: ESC +db: ESC +dc: ESC +dd: ESC +de: ESC +df: ESC +# 0xe0 - 0xef +e0: LOOPNE/LOOPNZ Jb (f64) +e1: LOOPE/LOOPZ Jb (f64) +e2: LOOP Jb (f64) +e3: JrCXZ Jb (f64) +e4: IN AL,Ib +e5: IN eAX,Ib +e6: OUT Ib,AL +e7: OUT Ib,eAX +e8: CALL Jz (f64) +e9: JMP-near Jz (f64) +ea: JMP-far Ap (i64) +eb: JMP-short Jb (f64) +ec: IN AL,DX +ed: IN eAX,DX +ee: OUT DX,AL +ef: OUT DX,eAX +# 0xf0 - 0xff +f0: LOCK (Prefix) +f1: +f2: REPNE (Prefix) +f3: REP/REPE (Prefix) +f4: HLT +f5: CMC +f6: Grp3_1 Eb (1A) +f7: Grp3_2 Ev (1A) +f8: CLC +f9: STC +fa: CLI +fb: STI +fc: CLD +fd: STD +fe: Grp4 (1A) +ff: Grp5 (1A) +EndTable + +Table: 2-byte opcode # First Byte is 0x0f +Referrer: 2-byte escape +# 0x0f 0x00-0x0f +00: Grp6 (1A) +01: Grp7 (1A) +02: LAR Gv,Ew +03: LSL Gv,Ew +04: +05: SYSCALL (o64) +06: CLTS +07: SYSRET (o64) +08: INVD +09: WBINVD +0a: +0b: UD2 (1B) +0c: +0d: NOP Ev +0e: +0f: +# 0x0f 0x10-0x1f +10: +11: +12: +13: +14: +15: +16: +17: +18: Grp16 (1A) +19: +1a: +1b: +1c: +1d: +1e: +1f: NOP Ev +# 0x0f 0x20-0x2f +20: MOV Rd,Cd +21: MOV Rd,Dd +22: MOV Cd,Rd +23: MOV Dd,Rd +24: +25: +26: +27: +28: movaps Vps,Wps | movapd Vpd,Wpd (66) +29: movaps Wps,Vps | movapd Wpd,Vpd (66) +2a: +2b: +2c: +2d: +2e: +2f: +# 0x0f 0x30-0x3f +30: WRMSR +31: RDTSC +32: RDMSR +33: RDPMC +34: SYSENTER +35: SYSEXIT +36: +37: GETSEC +38: escape # 3-byte escape 1 +39: +3a: escape # 3-byte escape 2 +3b: +3c: +3d: +3e: +3f: +# 0x0f 0x40-0x4f +40: CMOVO Gv,Ev +41: CMOVNO Gv,Ev +42: CMOVB/C/NAE Gv,Ev +43: CMOVAE/NB/NC Gv,Ev +44: CMOVE/Z Gv,Ev 
+45: CMOVNE/NZ Gv,Ev +46: CMOVBE/NA Gv,Ev +47: CMOVA/NBE Gv,Ev +48: CMOVS Gv,Ev +49: CMOVNS Gv,Ev +4a: CMOVP/PE Gv,Ev +4b: CMOVNP/PO Gv,Ev +4c: CMOVL/NGE Gv,Ev +4d: CMOVNL/GE Gv,Ev +4e: CMOVLE/NG Gv,Ev +4f: CMOVNLE/G Gv,Ev +# 0x0f 0x50-0x5f +50: +51: +52: +53: +54: +55: +56: +57: +58: +59: +5a: +5b: +5c: +5d: +5e: +5f: +# 0x0f 0x60-0x6f +60: +61: +62: +63: +64: +65: +66: +67: +68: +69: +6a: +6b: +6c: +6d: +6e: +6f: +# 0x0f 0x70-0x7f +70: +71: Grp12 (1A) +72: Grp13 (1A) +73: Grp14 (1A) +74: +75: +76: +77: +78: VMREAD Ed/q,Gd/q +79: VMWRITE Gd/q,Ed/q +7a: +7b: +7c: +7d: +7e: +7f: +# 0x0f 0x80-0x8f +80: JO Jz (f64) +81: JNO Jz (f64) +82: JB/JNAE/JC Jz (f64) +83: JNB/JAE/JNC Jz (f64) +84: JZ/JE Jz (f64) +85: JNZ/JNE Jz (f64) +86: JBE/JNA Jz (f64) +87: JNBE/JA Jz (f64) +88: JS Jz (f64) +89: JNS Jz (f64) +8a: JP/JPE Jz (f64) +8b: JNP/JPO Jz (f64) +8c: JL/JNGE Jz (f64) +8d: JNL/JGE Jz (f64) +8e: JLE/JNG Jz (f64) +8f: JNLE/JG Jz (f64) +# 0x0f 0x90-0x9f +90: SETO Eb +91: SETNO Eb +92: SETB/C/NAE Eb +93: SETAE/NB/NC Eb +94: SETE/Z Eb +95: SETNE/NZ Eb +96: SETBE/NA Eb +97: SETA/NBE Eb +98: SETS Eb +99: SETNS Eb +9a: SETP/PE Eb +9b: SETNP/PO Eb +9c: SETL/NGE Eb +9d: SETNL/GE Eb +9e: SETLE/NG Eb +9f: SETNLE/G Eb +# 0x0f 0xa0-0xaf +a0: PUSH FS (d64) +a1: POP FS (d64) +a2: CPUID +a3: BT Ev,Gv +a4: SHLD Ev,Gv,Ib +a5: SHLD Ev,Gv,CL +a6: +a7: +a8: PUSH GS (d64) +a9: POP GS (d64) +aa: RSM +ab: BTS Ev,Gv +ac: SHRD Ev,Gv,Ib +ad: SHRD Ev,Gv,CL +ae: Grp15 (1A),(1C) +af: IMUL Gv,Ev +# 0x0f 0xb0-0xbf +b0: CMPXCHG Eb,Gb +b1: CMPXCHG Ev,Gv +b2: LSS Gv,Mp +b3: BTR Ev,Gv +b4: LFS Gv,Mp +b5: LGS Gv,Mp +b6: MOVZX Gv,Eb +b7: MOVZX Gv,Ew +b8: JMPE | POPCNT Gv,Ev (F3) +b9: Grp10 (1A) +ba: Grp8 Ev,Ib (1A) +bb: BTC Ev,Gv +bc: BSF Gv,Ev +bd: BSR Gv,Ev +be: MOVSX Gv,Eb +bf: MOVSX Gv,Ew +# 0x0f 0xc0-0xcf +c0: XADD Eb,Gb +c1: XADD Ev,Gv +c2: +c3: movnti Md/q,Gd/q +c4: +c5: +c6: +c7: Grp9 (1A) +c8: BSWAP RAX/EAX/R8/R8D +c9: BSWAP RCX/ECX/R9/R9D +ca: BSWAP RDX/EDX/R10/R10D +cb: BSWAP RBX/EBX/R11/R11D +cc: 
BSWAP RSP/ESP/R12/R12D +cd: BSWAP RBP/EBP/R13/R13D +ce: BSWAP RSI/ESI/R14/R14D +cf: BSWAP RDI/EDI/R15/R15D +# 0x0f 0xd0-0xdf +d0: +d1: +d2: +d3: +d4: +d5: +d6: +d7: +d8: +d9: +da: +db: +dc: +dd: +de: +df: +# 0x0f 0xe0-0xef +e0: +e1: +e2: +e3: +e4: +e5: +e6: +e7: +e8: +e9: +ea: +eb: +ec: +ed: +ee: +ef: +# 0x0f 0xf0-0xff +f0: +f1: +f2: +f3: +f4: +f5: +f6: +f7: +f8: +f9: +fa: +fb: +fc: +fd: +fe: +ff: +EndTable + +Table: 3-byte opcode 1 +Referrer: 3-byte escape 1 +80: INVEPT Gd/q,Mdq (66) +81: INVPID Gd/q,Mdq (66) +f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) +f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) +EndTable + +Table: 3-byte opcode 2 +Referrer: 3-byte escape 2 +# all opcode is for SSE +EndTable + +GrpTable: Grp1 +0: ADD +1: OR +2: ADC +3: SBB +4: AND +5: SUB +6: XOR +7: CMP +EndTable + +GrpTable: Grp1A +0: POP +EndTable + +GrpTable: Grp2 +0: ROL +1: ROR +2: RCL +3: RCR +4: SHL/SAL +5: SHR +6: +7: SAR +EndTable + +GrpTable: Grp3_1 +0: TEST Eb,Ib +1: +2: NOT Eb +3: NEG Eb +4: MUL AL,Eb +5: IMUL AL,Eb +6: DIV AL,Eb +7: IDIV AL,Eb +EndTable + +GrpTable: Grp3_2 +0: TEST Ev,Iz +1: +2: NOT Ev +3: NEG Ev +4: MUL rAX,Ev +5: IMUL rAX,Ev +6: DIV rAX,Ev +7: IDIV rAX,Ev +EndTable + +GrpTable: Grp4 +0: INC Eb +1: DEC Eb +EndTable + +GrpTable: Grp5 +0: INC Ev +1: DEC Ev +2: CALLN Ev (f64) +3: CALLF Ep +4: JMPN Ev (f64) +5: JMPF Ep +6: PUSH Ev (d64) +7: +EndTable + +GrpTable: Grp6 +0: SLDT Rv/Mw +1: STR Rv/Mw +2: LLDT Ew +3: LTR Ew +4: VERR Ew +5: VERW Ew +EndTable + +GrpTable: Grp7 +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) +3: LIDT Ms +4: SMSW Mw/Rv +5: +6: LMSW Ew +7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) +EndTable + +GrpTable: Grp8 +4: BT +5: BTS +6: BTR +7: BTC +EndTable + +GrpTable: Grp9 +1: CMPXCHG8B/16B Mq/Mdq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) +7: VMPTRST Mq +EndTable + +GrpTable: Grp10 +EndTable + 
+GrpTable: Grp11 +0: MOV +EndTable + +GrpTable: Grp12 +EndTable + +GrpTable: Grp13 +EndTable + +GrpTable: Grp14 +EndTable + +GrpTable: Grp15 +0: fxsave +1: fxrstor +2: ldmxcsr +3: stmxcsr +4: XSAVE +5: XRSTOR | lfence (11B) +6: mfence (11B) +7: clflush | sfence (11B) +EndTable + +GrpTable: Grp16 +0: prefetch NTA +1: prefetch T0 +2: prefetch T1 +3: prefetch T2 +EndTable diff --git a/arch/x86/scripts/gen-insn-attr-x86.awk b/arch/x86/scripts/gen-insn-attr-x86.awk new file mode 100644 index 0000000..6fa88cd --- /dev/null +++ b/arch/x86/scripts/gen-insn-attr-x86.awk @@ -0,0 +1,314 @@ +#!/bin/awk -f +# gen-insn-attr-x86.awk: Instruction attribute table generator +# Written by Masami Hiramatsu <mhi...@re...> +# +# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c + +BEGIN { + print "/* x86 opcode map generated from x86-opcode-map.txt */" + print "/* Do not change this code. */" + ggid = 1 + geid = 1 + + opnd_expr = "^[[:alpha:]]" + ext_expr = "^\\(" + sep_expr = "^\\|$" + group_expr = "^Grp[[:digit:]]+A*" + + imm_expr = "^[IJAO][[:lower:]]" + imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" + imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)" + imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)" + imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)" + imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" + imm_flag["Ob"] = "INAT_MOFFSET" + imm_flag["Ov"] = "INAT_MOFFSET" + + modrm_expr = "^([CDEGMNPQRSUVW][[:lower:]]+|NTA|T[012])" + force64_expr = "\\([df]64\\)" + rex_expr = "^REX(\\.[XRWB]+)*" + fpu_expr = "^ESC" # TODO + + lprefix1_expr = "\\(66\\)" + delete lptable1 + lprefix2_expr = "\\(F2\\)" + delete lptable2 + lprefix3_expr = "\\(F3\\)" + delete lptable3 + max_lprefix = 4 + + prefix_expr = "\\(Prefix\\)" + prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" + 
prefix_num["REPNE"] = "INAT_PFX_REPNE" + prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["LOCK"] = "INAT_PFX_LOCK" + prefix_num["SEG=CS"] = "INAT_PFX_CS" + prefix_num["SEG=DS"] = "INAT_PFX_DS" + prefix_num["SEG=ES"] = "INAT_PFX_ES" + prefix_num["SEG=FS"] = "INAT_PFX_FS" + prefix_num["SEG=GS"] = "INAT_PFX_GS" + prefix_num["SEG=SS"] = "INAT_PFX_SS" + prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" + + delete table + delete etable + delete gtable + eid = -1 + gid = -1 +} + +function semantic_error(msg) { + print "Semantic error at " NR ": " msg > "/dev/stderr" + exit 1 +} + +function debug(msg) { + print "DEBUG: " msg +} + +function array_size(arr, i,c) { + c = 0 + for (i in arr) + c++ + return c +} + +/^Table:/ { + print "/* " $0 " */" +} + +/^Referrer:/ { + if (NF == 1) { + # primary opcode table + tname = "inat_primary_table" + eid = -1 + } else { + # escape opcode table + ref = "" + for (i = 2; i <= NF; i++) + ref = ref $i + eid = escape[ref] + tname = sprintf("inat_escape_table_%d", eid) + } +} + +/^GrpTable:/ { + print "/* " $0 " */" + if (!($2 in group)) + semantic_error("No group: " $2 ) + gid = group[$2] + tname = "inat_group_table_" gid +} + +function print_table(tbl,name,fmt,n) +{ + print "const insn_attr_t " name " = {" + for (i = 0; i < n; i++) { + id = sprintf(fmt, i) + if (tbl[id]) + print " [" id "] = " tbl[id] "," + } + print "};" +} + +/^EndTable/ { + if (gid != -1) { + # print group tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,3] = tname "_3" + } + } else { + # 
print primary/escaped tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,3] = tname "_3" + } + } + print "" + delete table + delete lptable1 + delete lptable2 + delete lptable3 + gid = -1 + eid = -1 +} + +function add_flags(old,new) { + if (old && new) + return old " | " new + else if (old) + return old + else + return new +} + +# convert operands to flags. +function convert_operands(opnd, i,imm,mod) +{ + imm = null + mod = null + for (i in opnd) { + i = opnd[i] + if (match(i, imm_expr) == 1) { + if (!imm_flag[i]) + semantic_error("Unknown imm opnd: " i) + if (imm) { + if (i != "Ib") + semantic_error("ADDIMM error") + imm = add_flags(imm, "INAT_ADDIMM") + } else + imm = imm_flag[i] + } else if (match(i, modrm_expr)) + mod = "INAT_MODRM" + } + return add_flags(imm, mod) +} + +/^[0-9a-f]+\:/ { + if (NR == 1) + next + # get index + idx = "0x" substr($1, 1, index($1,":") - 1) + if (idx in table) + semantic_error("Redefine " idx " in " tname) + + # check if escaped opcode + if ("escape" == $2) { + if ($3 != "#") + semantic_error("No escaped name") + ref = "" + for (i = 4; i <= NF; i++) + ref = ref $i + if (ref in escape) + semantic_error("Redefine escape (" ref ")") + escape[ref] = geid + geid++ + table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")" + next + } + + variant = null + # converts + i = 2 + while (i <= NF) { + opcode = $(i++) + delete opnds + ext = null + flags = null + opnd = null + # parse one opcode + if (match($i, opnd_expr)) { + opnd = $i + split($(i++), opnds, ",") + 
flags = convert_operands(opnds) + } + if (match($i, ext_expr)) + ext = $(i++) + if (match($i, sep_expr)) + i++ + else if (i < NF) + semantic_error($i " is not a separator") + + # check if group opcode + if (match(opcode, group_expr)) { + if (!(opcode in group)) { + group[opcode] = ggid + ggid++ + } + flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")") + } + # check force(or default) 64bit + if (match(ext, force64_expr)) + flags = add_flags(flags, "INAT_FORCE64") + + # check REX prefix + if (match(opcode, rex_expr)) + flags = add_flags(flags, "INAT_REXPFX") + + # check coprocessor escape : TODO + if (match(opcode, fpu_expr)) + flags = add_flags(flags, "INAT_MODRM") + + # check prefixes + if (match(ext, prefix_expr)) { + if (!prefix_num[opcode]) + semantic_error("Unknown prefix: " opcode) + flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")") + } + if (length(flags) == 0) + continue + # check if last prefix + if (match(ext, lprefix1_expr)) { + lptable1[idx] = add_flags(lptable1[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix2_expr)) { + lptable2[idx] = add_flags(lptable2[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix3_expr)) { + lptable3[idx] = add_flags(lptable3[idx],flags) + variant = "INAT_VARIANT" + } else { + table[idx] = add_flags(table[idx],flags) + } + } + if (variant) + table[idx] = add_flags(table[idx],variant) +} + +END { + # print escape opcode map's array + print "/* Escape opcode map array */" + print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LPREFIX_MAX + 1] = {" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print " ["i"]["j"] = "etable[i,j]"," + print "};\n" + # print group opcode map's array + print "/* Group opcode map array */" + print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LPREFIX_MAX + 1] = {" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if 
(gtable[i,j]) + print " ["i"]["j"] = "gtable[i,j]"," + print "};" +} -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: Masami H. <mhi...@re...> - 2009-05-21 15:50:43
|
H. Peter Anvin wrote: > Masami Hiramatsu wrote: >>> /usr/bin/env 'awk -f' gen-insn-attr-x86.awk >>> >>> which is obviously wrong. >> Oops, right. Anyway, there is no reason that we make it >> executable because it's always called from Makefile. >> So, I think just removing that line is better way. >> > > I suggest leaving in a #!/bin/awk line just for documentation's sake, > though. OK, I'll leave that line in, and change the usage line to invoke the awk command instead of running the script directly. Thank you! > > -hpa > -- Masami Hiramatsu Software Engineer Hitachi Computer Products (America) Inc. Software Solutions Division e-mail: mhi...@re... |
From: H. P. A. <hp...@zy...> - 2009-05-21 05:47:13
|
Masami Hiramatsu wrote: >> >> /usr/bin/env 'awk -f' gen-insn-attr-x86.awk >> >> which is obviously wrong. > > Oops, right. Anyway, there is no reason that we make it > executable because it's always called from Makefile. > So, I think just removing that line is better way. > I suggest leaving in a #!/bin/awk line just for documentation's sake, though. -hpa -- H. Peter Anvin, Intel Open Source Technology Center I work for Intel. I don't speak on their behalf. |
From: Takahiro Y. <ty...@re...> - 2009-05-13 20:20:47
|
Hi,

This is a systemtap script to disable scsi error retry when scsi timeout
happens on devices using dm-mirror.

When scsi detects errors and scsi notices the error was recovered, the I/O
is retried if REQ_FAILFAST flag is *not* set to req->flags. However,
dm-mirror needs to wait for a long time if the error is timeout and it
happens repeatedly on the same device because of a storage failure.

One of the solution to solve this problem is to set REQ_FAILFAST flag to
a request passed to scsi layer in order to disable scsi error retry.
You can use this script to evaluate the effect of this solution.

I appreciate your comments.

Regards,
---
Takahiro Yasui
Hitachi Computer Products (America), Inc.

<failfast.stp>
#!/usr/bin/stap -gv
#
# Failfast test script for RHEL5.3 kernel (2.6.18-128.el5)
#
# This script works only for the simple mirror structure as follows.
#
# mirror (mirror target: /dev/mapper/<vg>-<lv>)
#   + mirror_leg#N (linear target: /dev/mapper/<vg>-<lv>_mimage_N)
#       + device (/dev/sdx)
#

/***************************************************
 * global variables
 *
 * Addresses of the kernel functions that the bios are
 * matched against; filled in from $1..$5 by "probe begin".
 ***************************************************/
global fn_clone_endio
global fn_endio
global fn_mirror_end_io
global fn_write_callback
global fn_read_callback

/***************************************************
 * functions
 ***************************************************/
%{
#include <linux/blkdev.h>
#include <linux/fs.h>

/* quoted from drivers/md/dm-raid1.c */
struct mirror {
	atomic_t error_count;
	unsigned long error_type;
	struct mirror_set *ms;
	struct dm_dev *dev;
	sector_t offset;
};
%}

/* OR REQ_FAILFAST into the flags of the request at address "req". */
function set_failfast_flag(req:long) %{
	struct request *req = (struct request *)((long)THIS->req);

	req->flags |= REQ_FAILFAST;
%}

/* Return 1 if the mirror at address "m" has an error count of zero, else 0. */
function check_valid_mirror:long(m:long) %{
	struct mirror *m = (struct mirror *)((long)THIS->m);

	THIS->__retvalue = atomic_read(&m->error_count) ? 0 : 1;
%}

/* Return the address of element "idx" of the struct mirror array at "m". */
function __get_mirror:long(m:long, idx:long) %{
	struct mirror *m = (struct mirror *)((long)THIS->m);
	int idx = (int)((long)THIS->idx);

	THIS->__retvalue = (long)&m[idx];
%}

/* Return non-zero if the bio at address "bio" is a WRITE. */
function check_write_io:long(bio:long) %{
	struct bio *bio = (struct bio *)((long)THIS->bio);

	THIS->__retvalue = bio_rw(bio) == WRITE;
%}

/*
 * Return bv_len of the bio_vec slot at index bi_max_vecs, or -1 if the
 * bio has no bi_io_vec.
 * NOTE(review): presumably dm-io keeps the index of the target region in
 * that extra slot -- confirm against drivers/md/dm-io.c of this kernel.
 */
function get_region:long(bio:long) %{
	struct bio *bio = (struct bio *)((long)THIS->bio);

	if (!bio->bi_io_vec) {
		THIS->__retvalue = -1;
		return;
	}

	THIS->__retvalue = bio->bi_io_vec[bio->bi_max_vecs].bv_len;
%}

/* Return the address of mirror leg "idx" of the mirror_set at "ms". */
function get_mirror:long(ms:long, idx:long)
{
	m = @cast(ms, "mirror_set", "dm_mirror")->mirror
	return __get_mirror(m, idx)
}

/* Count the legs of the mirror_set at "ms" that check_valid_mirror() accepts. */
function get_valid_mirror_count:long(ms:long)
{
	nr_mirrors = @cast(ms, "mirror_set", "dm_mirror")->nr_mirrors
	count = 0

	for (i = 0; i < nr_mirrors; i++)
		count += check_valid_mirror(get_mirror(ms, i))

	return count
}

/* Valid-leg count for pass-through I/O: the mirror_set is tio->ti->private. */
function pio_valid_mirror_count:long(tio:long)
{
	ms = @cast(tio, "target_io", "dm_mod")->ti->private
	if (!ms)
		return 0

	return get_valid_mirror_count(ms)
}

/*
 * Valid-leg count for queued I/O: io->context is treated as a bio whose
 * bi_next field holds a struct mirror pointer; the mirror_set is taken
 * from that mirror.
 */
function qio_valid_mirror_count:long(io:long)
{
	bio = @cast(io, "io", "dm_mod")->context
	if (!bio)
		return 0

	m = @cast(bio, "bio")->bi_next
	if (!m)
		return 0

	ms = @cast(m, "mirror", "dm_mirror")->ms
	if (!ms)
		return 0

	return get_valid_mirror_count(ms)
}

/*
 * Pass-through I/O: return 1 if the mirror leg this bio is aimed at is
 * valid. Writes are checked against the default mirror, reads against
 * the mirror recorded in the bio_map_info of the target_io.
 */
function pio_check_valid_mirror:long(tio:long, bio:long)
{
	if (check_write_io(bio)) {
		ms = @cast(tio, "target_io", "dm_mod")->ti->private
		if (!ms)
			return 0

		m = @cast(ms, "mirror_set", "dm_mirror")->default_mirror
		if (!m)
			return 0
	} else {
		ptr = @cast(tio, "target_io", "dm_mod")->info->ptr
		if (!ptr)
			return 0

		m = @cast(ptr, "bio_map_info", "dm_mirror")->bmi_m
		if (!m)
			return 0
	}

	return check_valid_mirror(m)
}

/*
 * Queued I/O: return 1 if the mirror leg this bio is aimed at is valid.
 * "io" is the dm-io descriptor, "bio" is the bio whose bi_private is "io".
 */
function qio_check_valid_mirror:long(io:long, bio:long)
{
	/*
	 * The direction must be read from the bio: check_write_io() casts
	 * its argument to struct bio, so handing it "io" would interpret an
	 * unrelated structure.
	 */
	if (check_write_io(bio)) {
		region = get_region(bio)
		if (region == -1)
			return 0

		bio = @cast(io, "io", "dm_mod")->context
		if (!bio)
			return 0

		/*
		 * get the default mirror
		 */
		m = @cast(bio, "bio")->bi_next
		if (!m)
			return 0

		/*
		 * get the first mirror leg
		 */
		m = @cast(m, "mirror", "dm_mirror")->ms->mirror
		if (!m)
			return 0

		/*
		 * get the target mirror leg
		 */
		m = __get_mirror(m, region)
		if (!m)
			return 0
	} else {
		bio = @cast(io, "io", "dm_mod")->context
		if (!bio)
			return 0

		/*
		 * "m" is a target mirror leg
		 */
		m = @cast(bio, "bio")->bi_next
		if (!m)
			return 0
	}

	return check_valid_mirror(m)
}

/* Return non-zero if bio->bi_end_io equals the address "fn". */
function check_bi_end_io:long(bio:long, fn:long)
{
	return @cast(bio, "bio")->bi_end_io == fn
}

/* Return non-zero if the end_io hook of the target_type equals "fn". */
function check_end_io:long(type:long, fn:long)
{
	return @cast(type, "target_type", "dm_mod")->end_io == fn
}

/* Return non-zero if io->callback equals the address "fn". */
function check_callback:long(io:long, fn:long)
{
	return @cast(io, "io", "dm_mod")->callback == fn
}

/*
 * check pass-through I/O
 *
 * Returns 1 if the failfast flag should be set for this bio, 0 otherwise.
 */
function check_pio:long(bio:long)
{
	/*
	 * check bio to the lowest level
	 * (mirror_leg -> device)
	 *
	 * If bio->bi_end_io is clone_endio(), the bio is issued
	 * through device mapper.
	 */
	if (!check_bi_end_io(bio, fn_clone_endio))
		return 0

	tio = @cast(bio, "bio")->bi_private
	if (!tio)
		return 0

	bio = @cast(tio, "target_io", "dm_mod")->io->bio
	if (!bio)
		return 0

	/*
	 * check bio to the second lower level
	 * (mirror -> mirror_leg)
	 */
	if (!check_bi_end_io(bio, fn_clone_endio))
		return 0

	tio = @cast(bio, "bio")->bi_private
	if (!tio)
		return 0

	type = @cast(tio, "target_io", "dm_mod")->ti->type
	if (!type)
		return 0

	if (!check_end_io(type, fn_mirror_end_io))
		return 0

	/*
	 * Now we know this bio came from a mirror device.
	 */
	if (pio_valid_mirror_count(tio) == 1 &&
	    pio_check_valid_mirror(tio, bio))
		return 0

	/*
	 * failfast flag needs to be set in this case.
	 */
	return 1
}

/*
 * check queued I/O
 *
 * Returns 1 if the failfast flag should be set for this bio, 0 otherwise.
 */
function check_qio:long(bio:long)
{
	/*
	 * check bio to the lowest level
	 * (mirror_leg -> device)
	 *
	 * If bio->bi_end_io is clone_endio(), the bio is issued
	 * through device mapper.
	 */
	if (!check_bi_end_io(bio, fn_clone_endio))
		return 0

	tio = @cast(bio, "bio")->bi_private
	if (!tio)
		return 0

	bio = @cast(tio, "target_io", "dm_mod")->io->bio
	if (!bio)
		return 0

	/*
	 * check bio to the second lower level
	 * (mirror -> mirror_leg)
	 */
	if (!check_bi_end_io(bio, fn_endio))
		return 0

	io = @cast(bio, "bio")->bi_private
	if (!io)
		return 0

	if (!check_callback(io, fn_write_callback) &&
	    !check_callback(io, fn_read_callback))
		return 0

	/*
	 * Now we know this bio came from a mirror device.
	 */
	if (qio_valid_mirror_count(io) == 1 &&
	    qio_check_valid_mirror(io, bio))
		return 0

	/*
	 * failfast flag needs to be set in this case.
	 */
	return 1
}

/* Print the expected command line arguments and ask stap to exit. */
function usage()
{
	printf("usage: stap -g failfast addr1 addr2 addr3 addr4 addr5\n")
	printf(" addr1 ... address of clone_endio() [dm_mod]\n")
	printf(" addr2 ... address of endio() [dm_mod]\n")
	printf(" addr3 ... address of mirror_end_io() [dm_mirror]\n")
	printf(" addr4 ... address of write_callback() [dm_mirror]\n")
	printf(" addr5 ... address of read_callback() [dm_mirror]\n")
	exit()
}

/***************************************************
 * probes
 ***************************************************/

/*
 * 1. I/O types processed by dm-mirror
 *
 * The mirror target has three types of I/O: pass-through read,
 * queued read/write and queued-single write.
 *
 * a) pass-through read (read to in-sync region)
 *
 *    This is a read I/O which is directly passed to the lower layer
 *    without being queued. This type of I/O is applied to read I/Os
 *    onto regions in sync state.
 *
 * b) queued read/write (read to no-sync/write to in-sync region)
 *
 *    This is an I/O which is put into a queue of device mapper
 *    and passed to lower regions later by workqueue. This type of
 *    I/O is applied to read and write I/Os onto regions out of sync
 *    state.
 *
 * c) queued-single write (write to no-sync region)
 *
 *    This is a write I/O which is put into a queue of device mapper
 *    and passed to lower regions later by workqueue as queued read/write,
 *    but write I/O is issued only onto the default mirror, because target
 *    regions are out of sync state.
 *
 * pass-through read and queued-single write have the same bio structure,
 * and are handled by the check_pio() function in this script. On the other
 * hand, queued read/write is handled by check_qio().
 *
 *
 * 2. Conditions to set failfast flag
 *
 * A failfast flag is set when scsi detects timeout unless bio satisfies
 * the following conditions.
 *
 *   - There is only one valid mirror leg, and
 *   - The target mirror leg of the bio is valid.
 *
 * Here is an example of three legs. "A" shows an active (valid) mirror
 * leg and "D" shows a mirror leg in the state of "dead (write error)"
 * or "read error".
 *
 *   AAA            -> set
 *   AAD (bio to D) -> set
 *   AAD (bio to A) -> set
 *   ADD (bio to D) -> set
 *   ADD (bio to A) -> *not* set
 *   DDD            -> set
 */
probe module("scsi_mod").function("scsi_times_out")
{
	if (!$scmd->request || !$scmd->request->bio)
		next

	if (check_pio($scmd->request->bio) ||
	    check_qio($scmd->request->bio))
		set_failfast_flag($scmd->request)
}

/* Take the five function addresses from the command line. */
probe begin
{
	if (!$1 || !$2 || !$3 || !$4 || !$5)
		usage()

	fn_clone_endio = $1
	fn_endio = $2
	fn_mirror_end_io = $3
	fn_write_callback = $4
	fn_read_callback = $5
}

<failfast.sh>
#!/bin/sh

# "flag" is OR-ed into every address below, so it must always hold a number:
# left empty, "$(($flag | 0x...))" is an arithmetic syntax error on every
# machine that is not i386.
# NOTE(review): the i386 value presumably sign-extends the 32-bit kernel
# addresses to the 64-bit integers stap works with -- confirm.
flag=0
if [ "`uname -i`" = "i386" ]; then
	flag=0xffffffff00000000
fi

# Look up the addresses of the five functions in /proc/kallsyms.
fn1=$(($flag | 0x`grep clone_endio /proc/kallsyms | cut -f1 -d' '`))
fn2=$(($flag | 0x`grep -e ' endio.*dm_mod' /proc/kallsyms | cut -f1 -d' '`))
fn3=$(($flag | 0x`grep mirror_end_io /proc/kallsyms | cut -f1 -d' '`))
fn4=$(($flag | 0x`grep write_callback /proc/kallsyms | cut -f1 -d' '`))
fn5=$(($flag | 0x`grep read_callback /proc/kallsyms | cut -f1 -d' '`))

./failfast.stp -- $fn1 $fn2 $fn3 $fn4 $fn5
|
From: Takahiro Y. <ty...@re...> - 2009-05-13 19:52:37
|
Hi,

This is a systemtap script to simulate a patch posted to scsi-list.
You can use this script to evaluate the effect of the patch.

limit state transitions in scsi_internal_device_unblock
http://marc.info/?l=linux-scsi&m=124102164303979&w=2

Regarding to the original issue, the following email is useful.

SCSI timeout error recovery issue
http://marc.info/?l=linux-scsi&m=124042136915970&w=2

I appreciate your comments.

Regards,
---
Takahiro Yasui
Hitachi Computer Products (America), Inc.

/*
 * Address of the scsi_device whose state was overwritten on entry to
 * scsi_internal_device_unblock(); 0 while nothing is pending.
 */
global sdev_addr = 0

%{
#include <scsi/scsi_device.h>
%}

/*
 * Overwrite sdev_state of the device at address "sdev" with 0.
 * NOTE(review): 0 is presumably not one of the SDEV_* states -- confirm
 * against scsi_device.h of the target kernel.
 */
function set_invalid_state:long(sdev:long) %{
	struct scsi_device *dev = (struct scsi_device *)((long)THIS->sdev);

	dev->sdev_state = 0;
%}

/* Return 1 if sdev_state of the device at "sdev" is 0, otherwise 0. */
function check_invalid_state:long(sdev:long) %{
	struct scsi_device *dev = (struct scsi_device *)((long)THIS->sdev);

	if (dev->sdev_state == 0)
		THIS->__retvalue = 1;
	else
		THIS->__retvalue = 0;
%}

/* Put the device at address "sdev" into SDEV_OFFLINE. */
function set_offline_state:long(sdev:long) %{
	struct scsi_device *dev = (struct scsi_device *)((long)THIS->sdev);

	dev->sdev_state = SDEV_OFFLINE;
%}

/* Return 1 if the device at "sdev" is in SDEV_OFFLINE, otherwise 0. */
function check_offline_state:long(sdev:long) %{
	struct scsi_device *dev = (struct scsi_device *)((long)THIS->sdev);

	if (dev->sdev_state == SDEV_OFFLINE)
		THIS->__retvalue = 1;
	else
		THIS->__retvalue = 0;
%}

/*
 * On entry: an offline device gets its state replaced by 0 and its
 * address remembered, so the return probe can undo the change.
 */
probe module("scsi_mod").function("scsi_internal_device_unblock")
{
	if (!$sdev || !check_offline_state($sdev))
		next

	set_invalid_state($sdev)
	sdev_addr = $sdev
}

/*
 * On return: if the remembered device still has state 0, put it back
 * to SDEV_OFFLINE and forget the address.
 */
probe module("scsi_mod").function("scsi_internal_device_unblock").return
{
	if (!sdev_addr || !check_invalid_state(sdev_addr))
		next

	set_offline_state(sdev_addr)
	sdev_addr = 0
}
|