From: Andi K. <an...@fi...> - 2011-05-15 04:34:26
|
[This is a resend because I don't see the original version from friday on the list. Apologies if you see it duplicated.] Another version of the Sandy Bridge patchkit with extra and named unit masks support. Passes maynard's test cases now. No other changes. Please apply now. -Andi |
From: Andi K. <an...@fi...> - 2011-05-15 04:34:26
|
From: Andi Kleen <ak...@li...> The reference files from which the Intel event lists are generated from have various events that use the CMASK, INV or EDGE flags for the performance counters. This often allows to have a "more natural" counter versus a raw counter. This patch adds the infrastructure to add extra flags for a unit mask event. There is a new extra:... field in the unit mask declaration that declares them. Then a patched kernel can set these extra fields using a new file in oprofilefs. I'm submitting the small kernel patch needed for that separately. This patch adds the infrastructure needed to declare these extra flags, and also adds some of them to the Sandy Bridge events files. v2: Fixed review feedback Signed-off-by: Andi Kleen <ak...@li...> --- libop/op_events.c | 38 +++++++++++++++++++++++++++++++++ libop/op_events.h | 6 +++++ libop/op_xml_events.c | 4 +++ libop/op_xml_out.c | 3 +- libop/op_xml_out.h | 3 +- utils/opcontrol | 14 ++++++++++++ utils/ophelp.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 121 insertions(+), 3 deletions(-) diff --git a/libop/op_events.c b/libop/op_events.c index 30c3207..8da023b 100644 --- a/libop/op_events.c +++ b/libop/op_events.c @@ -21,6 +21,7 @@ #include <string.h> #include <stdlib.h> #include <stdio.h> +#include <ctype.h> static LIST_HEAD(events_list); static LIST_HEAD(um_list); @@ -106,6 +107,34 @@ static void include_um(const char *start, const char *end) free(s); } +/* extra:cmask=12,inv,edge */ +unsigned parse_extra(const char *s) +{ + unsigned v, w; + int o; + + v = 0; + while (*s) { + if (isspace(*s)) + break; + if (strisprefix(s, "edge")) { + v |= EXTRA_EDGE; + s += 4; + } else if (strisprefix(s, "inv")) { + v |= EXTRA_INV; + s += 3; + } else if (sscanf(s, "cmask=%x%n", &w, &o) >= 1) { + v |= (w & EXTRA_CMASK_MASK) << EXTRA_CMASK_SHIFT; + s += o; + } else { + parse_error("Illegal extra field modifier"); + } + if (*s == ',') + ++s; + } + return v; +} + /* name:MESI type:bitmask 
default:0x0f */ static void parse_um(struct op_unit_mask * um, char const * line) { @@ -178,6 +207,7 @@ static void parse_um(struct op_unit_mask * um, char const * line) /* \t0x08 (M)odified cache state */ +/* \t0x08 extra:inv,cmask=... (M)odified cache state */ static void parse_um_entry(struct op_described_um * entry, char const * line) { char const * c = line; @@ -186,6 +216,14 @@ static void parse_um_entry(struct op_described_um * entry, char const * line) entry->value = parse_hex(c); c = skip_nonws(c); + c = skip_ws(c); + if (strisprefix(c, "extra:")) { + c += 6; + entry->extra = parse_extra(c); + c = skip_nonws(c); + } else + entry->extra = 0; + if (!*c) parse_error("invalid unit mask entry"); diff --git a/libop/op_events.h b/libop/op_events.h index 9ffdc49..3aaaba2 100644 --- a/libop/op_events.h +++ b/libop/op_events.h @@ -20,6 +20,11 @@ extern "C" { #include "op_types.h" #include "op_list.h" +#define EXTRA_EDGE (1U << 18) +#define EXTRA_INV (1U << 23) +#define EXTRA_CMASK_SHIFT 24 +#define EXTRA_CMASK_MASK 0xff + /** Describe an unit mask type. Events can optionally use a filter called * the unit mask. 
the mask type can be a bitmask or a discrete value */ enum unit_mask_type { @@ -39,6 +44,7 @@ struct op_unit_mask { enum unit_mask_type unit_type_mask; u32 default_mask; /**< only the gui use it */ struct op_described_um { + u32 extra; u32 value; char * desc; } um[MAX_UNIT_MASK]; diff --git a/libop/op_xml_events.c b/libop/op_xml_events.c index 1fcb01e..f573e02 100644 --- a/libop/op_xml_events.c +++ b/libop/op_xml_events.c @@ -103,6 +103,10 @@ void xml_help_for_event(struct op_event const * event) init_xml_str_attr(HELP_UNIT_MASK_DESC, event->unit->um[i].desc, buffer, MAX_BUFFER); + if (event->unit->um[i].extra) + init_xml_int_attr(HELP_UNIT_EXTRA_VALUE, + event->unit->um[i].extra, + buffer, MAX_BUFFER); close_xml_element(NONE, 0, buffer, MAX_BUFFER); } close_xml_element(HELP_UNIT_MASKS, 0, buffer, MAX_BUFFER); diff --git a/libop/op_xml_out.c b/libop/op_xml_out.c index f6d9042..0b3deea 100644 --- a/libop/op_xml_out.c +++ b/libop/op_xml_out.c @@ -83,7 +83,8 @@ char const * xml_tag_map[] = { "category", "unit_mask", "mask", - "desc" + "desc", + "extra" }; #define MAX_BUF_LEN 2048 diff --git a/libop/op_xml_out.h b/libop/op_xml_out.h index 4fb06df..544bd51 100644 --- a/libop/op_xml_out.h +++ b/libop/op_xml_out.h @@ -57,7 +57,8 @@ typedef enum { HELP_UNIT_MASKS_CATEGORY, HELP_UNIT_MASK, HELP_UNIT_MASK_VALUE, - HELP_UNIT_MASK_DESC + HELP_UNIT_MASK_DESC, + HELP_UNIT_EXTRA_VALUE, } tag_t; char const * xml_tag_name(tag_t tag); diff --git a/utils/opcontrol b/utils/opcontrol index 3a8a814..603172d 100644 --- a/utils/opcontrol +++ b/utils/opcontrol @@ -1353,6 +1353,10 @@ do_param_setup() set_ctr_param $f enabled 0 set_ctr_param $f event 0 set_ctr_param $f count 0 + + if test -d $MOUNT/$f/extra ; then + set_ctr_param $f extra 0 + fi done # Check if driver has IBS support @@ -1437,6 +1441,16 @@ do_param_setup() set_ctr_param $CTR kernel $KERNEL set_ctr_param $CTR user $USER set_ctr_param $CTR unit_mask $UNIT_MASK + + EXTRA=`$OPHELP --extra-mask $GOTEVENT` + if test "$EXTRA" -ne 0 
; then + if ! test -d $MOUNT/$CTR/extra ; then + echo >&2 "Warning: $GOTEVENT has extra mask, but kernel does not support extra field" + echo >&2 "Please update your kernel or use a different event. Will miscount." + else + set_ctr_param $CTR extra $EXTRA + fi + fi fi OPROFILED_EVENTS=${OPROFILED_EVENTS}$EVENT:$EVENT_VAL: OPROFILED_EVENTS=${OPROFILED_EVENTS}$CTR:$COUNT:$UNIT_MASK: diff --git a/utils/ophelp.c b/utils/ophelp.c index ce8bace..60b1662 100644 --- a/utils/ophelp.c +++ b/utils/ophelp.c @@ -143,6 +143,21 @@ static void help_for_event(struct op_event * event) event->unit->um[j].value); column = 14; word_wrap(14, &column, event->unit->um[j].desc); + if (event->unit->um[j].extra) { + u32 extra = event->unit->um[j].extra; + + word_wrap(14, &column, "(extra:"); + if (extra & EXTRA_EDGE) + word_wrap(14, &column, "edge"); + if (extra & EXTRA_INV) + word_wrap(14, &column, "inv"); + if ((extra >> EXTRA_CMASK_SHIFT) & EXTRA_CMASK_MASK) { + snprintf(buf, sizeof buf, "cmask=%x", + (extra >> EXTRA_CMASK_SHIFT) & EXTRA_CMASK_MASK); + word_wrap(14, &column, buf); + } + word_wrap(14, &column, ")"); + } putchar('\n'); } } @@ -274,6 +289,37 @@ static void show_unit_mask(void) printf("%d\n", event->unit->default_mask); } +static void show_extra_mask(void) +{ + unsigned i; + struct op_event * event; + size_t count; + unsigned extra; + + count = parse_events(parsed_events, num_chosen_events, chosen_events); + if (count > 1) { + fprintf(stderr, "More than one event specified.\n"); + exit(EXIT_FAILURE); + } + + event = find_event_by_name(parsed_events[0].name, + parsed_events[0].unit_mask, + 1); + if (!event) { + fprintf(stderr, "No such event found.\n"); + exit(EXIT_FAILURE); + } + + /* Not exact match is nothing */ + extra = 0; + for (i = 0; i < event->unit->num; i++) + if (event->unit->um[i].value == (unsigned)parsed_events[0].unit_mask) { + extra = event->unit->um[i].extra; + break; + } + + printf ("%d\n", extra); +} static void show_default_event(void) { @@ -293,6 +339,7 @@ 
static int get_cpu_type; static int check_events; static int unit_mask; static int get_default_event; +static int extra_mask; static struct poptOption options[] = { { "cpu-type", 'c', POPT_ARG_STRING, &cpu_string, 0, @@ -311,6 +358,8 @@ static struct poptOption options[] = { "show version", NULL, }, { "xml", 'X', POPT_ARG_NONE, &want_xml, 0, "list events as XML", NULL, }, + { "extra-mask", 'E', POPT_ARG_NONE, &extra_mask, 0, + "print extra mask for event", NULL, }, POPT_AUTOHELP { NULL, 0, 0, NULL, 0, NULL, NULL, }, }; @@ -412,7 +461,7 @@ int main(int argc, char const * argv[]) events = op_events(cpu_type); - if (!chosen_events && (unit_mask || check_events)) { + if (!chosen_events && (unit_mask || check_events || extra_mask)) { fprintf(stderr, "No events given.\n"); exit(EXIT_FAILURE); } @@ -422,6 +471,11 @@ int main(int argc, char const * argv[]) exit(EXIT_SUCCESS); } + if (extra_mask) { + show_extra_mask(); + exit(EXIT_SUCCESS); + } + if (check_events) { resolve_events(); exit(EXIT_SUCCESS); -- 1.7.4.4 |
From: Maynard J. <may...@us...> - 2011-05-16 19:36:14
|
Andi Kleen wrote: > From: Andi Kleen <ak...@li...> > > The reference files from which the Intel event lists are generated from > have various events that use the CMASK, INV or EDGE flags for the performance > counters. This often allows to have a "more natural" counter versus > a raw counter. > > This patch adds the infrastructure to add extra flags for a unit mask > event. There is a new extra:... field in the unit mask declaration > that declares them. Committed with fixups for numerous whitespace errors. -Maynard > > Then a patched kernel can set these extra fields using a new file > in oprofilefs. > > I'm submitting the small kernel patch needed for that separately. > > This patch adds the infrastructure needed to declare these > extra flags, and also adds some of them to the Sandy Bridge > events files. > > v2: Fixed review feedback > > Signed-off-by: Andi Kleen <ak...@li...> > --- > libop/op_events.c | 38 +++++++++++++++++++++++++++++++++ > libop/op_events.h | 6 +++++ > libop/op_xml_events.c | 4 +++ > libop/op_xml_out.c | 3 +- > libop/op_xml_out.h | 3 +- > utils/opcontrol | 14 ++++++++++++ > utils/ophelp.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++- > 7 files changed, 121 insertions(+), 3 deletions(-) > > diff --git a/libop/op_events.c b/libop/op_events.c > index 30c3207..8da023b 100644 > --- a/libop/op_events.c > +++ b/libop/op_events.c > @@ -21,6 +21,7 @@ > #include <string.h> > #include <stdlib.h> > #include <stdio.h> > +#include <ctype.h> > > static LIST_HEAD(events_list); > static LIST_HEAD(um_list); > @@ -106,6 +107,34 @@ static void include_um(const char *start, const char *end) > free(s); > } > > +/* extra:cmask=12,inv,edge */ > +unsigned parse_extra(const char *s) > +{ > + unsigned v, w; > + int o; > + > + v = 0; > + while (*s) { > + if (isspace(*s)) > + break; > + if (strisprefix(s, "edge")) { > + v |= EXTRA_EDGE; > + s += 4; > + } else if (strisprefix(s, "inv")) { > + v |= EXTRA_INV; > + s += 3; > + } else if (sscanf(s, "cmask=%x%n", &w, 
&o) >= 1) { > + v |= (w & EXTRA_CMASK_MASK) << EXTRA_CMASK_SHIFT; > + s += o; > + } else { > + parse_error("Illegal extra field modifier"); > + } > + if (*s == ',') > + ++s; > + } > + return v; > +} > + > /* name:MESI type:bitmask default:0x0f */ > static void parse_um(struct op_unit_mask * um, char const * line) > { > @@ -178,6 +207,7 @@ static void parse_um(struct op_unit_mask * um, char const * line) > > > /* \t0x08 (M)odified cache state */ > +/* \t0x08 extra:inv,cmask=... (M)odified cache state */ > static void parse_um_entry(struct op_described_um * entry, char const * line) > { > char const * c = line; > @@ -186,6 +216,14 @@ static void parse_um_entry(struct op_described_um * entry, char const * line) > entry->value = parse_hex(c); > c = skip_nonws(c); > > + c = skip_ws(c); > + if (strisprefix(c, "extra:")) { > + c += 6; > + entry->extra = parse_extra(c); > + c = skip_nonws(c); > + } else > + entry->extra = 0; > + > if (!*c) > parse_error("invalid unit mask entry"); > > diff --git a/libop/op_events.h b/libop/op_events.h > index 9ffdc49..3aaaba2 100644 > --- a/libop/op_events.h > +++ b/libop/op_events.h > @@ -20,6 +20,11 @@ extern "C" { > #include "op_types.h" > #include "op_list.h" > > +#define EXTRA_EDGE (1U << 18) > +#define EXTRA_INV (1U << 23) > +#define EXTRA_CMASK_SHIFT 24 > +#define EXTRA_CMASK_MASK 0xff > + > /** Describe an unit mask type. Events can optionally use a filter called > * the unit mask. 
the mask type can be a bitmask or a discrete value */ > enum unit_mask_type { > @@ -39,6 +44,7 @@ struct op_unit_mask { > enum unit_mask_type unit_type_mask; > u32 default_mask; /**< only the gui use it */ > struct op_described_um { > + u32 extra; > u32 value; > char * desc; > } um[MAX_UNIT_MASK]; > diff --git a/libop/op_xml_events.c b/libop/op_xml_events.c > index 1fcb01e..f573e02 100644 > --- a/libop/op_xml_events.c > +++ b/libop/op_xml_events.c > @@ -103,6 +103,10 @@ void xml_help_for_event(struct op_event const * event) > init_xml_str_attr(HELP_UNIT_MASK_DESC, > event->unit->um[i].desc, > buffer, MAX_BUFFER); > + if (event->unit->um[i].extra) > + init_xml_int_attr(HELP_UNIT_EXTRA_VALUE, > + event->unit->um[i].extra, > + buffer, MAX_BUFFER); > close_xml_element(NONE, 0, buffer, MAX_BUFFER); > } > close_xml_element(HELP_UNIT_MASKS, 0, buffer, MAX_BUFFER); > diff --git a/libop/op_xml_out.c b/libop/op_xml_out.c > index f6d9042..0b3deea 100644 > --- a/libop/op_xml_out.c > +++ b/libop/op_xml_out.c > @@ -83,7 +83,8 @@ char const * xml_tag_map[] = { > "category", > "unit_mask", > "mask", > - "desc" > + "desc", > + "extra" > }; > > #define MAX_BUF_LEN 2048 > diff --git a/libop/op_xml_out.h b/libop/op_xml_out.h > index 4fb06df..544bd51 100644 > --- a/libop/op_xml_out.h > +++ b/libop/op_xml_out.h > @@ -57,7 +57,8 @@ typedef enum { > HELP_UNIT_MASKS_CATEGORY, > HELP_UNIT_MASK, > HELP_UNIT_MASK_VALUE, > - HELP_UNIT_MASK_DESC > + HELP_UNIT_MASK_DESC, > + HELP_UNIT_EXTRA_VALUE, > } tag_t; > > char const * xml_tag_name(tag_t tag); > diff --git a/utils/opcontrol b/utils/opcontrol > index 3a8a814..603172d 100644 > --- a/utils/opcontrol > +++ b/utils/opcontrol > @@ -1353,6 +1353,10 @@ do_param_setup() > set_ctr_param $f enabled 0 > set_ctr_param $f event 0 > set_ctr_param $f count 0 > + > + if test -d $MOUNT/$f/extra ; then > + set_ctr_param $f extra 0 > + fi > done > > # Check if driver has IBS support > @@ -1437,6 +1441,16 @@ do_param_setup() > set_ctr_param $CTR kernel $KERNEL 
> set_ctr_param $CTR user $USER > set_ctr_param $CTR unit_mask $UNIT_MASK > + > + EXTRA=`$OPHELP --extra-mask $GOTEVENT` > + if test "$EXTRA" -ne 0 ; then > + if ! test -d $MOUNT/$CTR/extra ; then > + echo >&2 "Warning: $GOTEVENT has extra mask, but kernel does not support extra field" > + echo >&2 "Please update your kernel or use a different event. Will miscount." > + else > + set_ctr_param $CTR extra $EXTRA > + fi > + fi > fi > OPROFILED_EVENTS=${OPROFILED_EVENTS}$EVENT:$EVENT_VAL: > OPROFILED_EVENTS=${OPROFILED_EVENTS}$CTR:$COUNT:$UNIT_MASK: > diff --git a/utils/ophelp.c b/utils/ophelp.c > index ce8bace..60b1662 100644 > --- a/utils/ophelp.c > +++ b/utils/ophelp.c > @@ -143,6 +143,21 @@ static void help_for_event(struct op_event * event) > event->unit->um[j].value); > column = 14; > word_wrap(14, &column, event->unit->um[j].desc); > + if (event->unit->um[j].extra) { > + u32 extra = event->unit->um[j].extra; > + > + word_wrap(14, &column, "(extra:"); > + if (extra & EXTRA_EDGE) > + word_wrap(14, &column, "edge"); > + if (extra & EXTRA_INV) > + word_wrap(14, &column, "inv"); > + if ((extra >> EXTRA_CMASK_SHIFT) & EXTRA_CMASK_MASK) { > + snprintf(buf, sizeof buf, "cmask=%x", > + (extra >> EXTRA_CMASK_SHIFT) & EXTRA_CMASK_MASK); > + word_wrap(14, &column, buf); > + } > + word_wrap(14, &column, ")"); > + } > putchar('\n'); > } > } > @@ -274,6 +289,37 @@ static void show_unit_mask(void) > printf("%d\n", event->unit->default_mask); > } > > +static void show_extra_mask(void) > +{ > + unsigned i; > + struct op_event * event; > + size_t count; > + unsigned extra; > + > + count = parse_events(parsed_events, num_chosen_events, chosen_events); > + if (count > 1) { > + fprintf(stderr, "More than one event specified.\n"); > + exit(EXIT_FAILURE); > + } > + > + event = find_event_by_name(parsed_events[0].name, > + parsed_events[0].unit_mask, > + 1); > + if (!event) { > + fprintf(stderr, "No such event found.\n"); > + exit(EXIT_FAILURE); > + } > + > + /* Not exact match is 
nothing */ > + extra = 0; > + for (i = 0; i < event->unit->num; i++) > + if (event->unit->um[i].value == (unsigned)parsed_events[0].unit_mask) { > + extra = event->unit->um[i].extra; > + break; > + } > + > + printf ("%d\n", extra); > +} > > static void show_default_event(void) > { > @@ -293,6 +339,7 @@ static int get_cpu_type; > static int check_events; > static int unit_mask; > static int get_default_event; > +static int extra_mask; > > static struct poptOption options[] = { > { "cpu-type", 'c', POPT_ARG_STRING, &cpu_string, 0, > @@ -311,6 +358,8 @@ static struct poptOption options[] = { > "show version", NULL, }, > { "xml", 'X', POPT_ARG_NONE, &want_xml, 0, > "list events as XML", NULL, }, > + { "extra-mask", 'E', POPT_ARG_NONE, &extra_mask, 0, > + "print extra mask for event", NULL, }, > POPT_AUTOHELP > { NULL, 0, 0, NULL, 0, NULL, NULL, }, > }; > @@ -412,7 +461,7 @@ int main(int argc, char const * argv[]) > > events = op_events(cpu_type); > > - if (!chosen_events && (unit_mask || check_events)) { > + if (!chosen_events && (unit_mask || check_events || extra_mask)) { > fprintf(stderr, "No events given.\n"); > exit(EXIT_FAILURE); > } > @@ -422,6 +471,11 @@ int main(int argc, char const * argv[]) > exit(EXIT_SUCCESS); > } > > + if (extra_mask) { > + show_extra_mask(); > + exit(EXIT_SUCCESS); > + } > + > if (check_events) { > resolve_events(); > exit(EXIT_SUCCESS); |
From: Andi K. <an...@fi...> - 2011-05-15 04:34:26
|
From: Andi Kleen <ak...@li...> Avoid extra spaces printed in individual small wordwraps. This fixes a cosmetic issue in ophelp event printing with the new extra fields. --- utils/ophelp.c | 14 ++++++++------ 1 files changed, 8 insertions(+), 6 deletions(-) diff --git a/utils/ophelp.c b/utils/ophelp.c index f4e0653..b8da87e 100644 --- a/utils/ophelp.c +++ b/utils/ophelp.c @@ -71,10 +71,12 @@ static void word_wrap(int indent, int *column, char *msg) printf("\n%*s", indent, ""); *column = indent; } - printf("%.*s ", wlen, msg); + printf("%.*s", wlen, msg); *column += wlen + 1; msg += wlen; msg += strspn(msg, " "); + if (*msg) + putchar(' '); } } @@ -128,7 +130,7 @@ static void help_for_event(struct op_event * event) printf(")\n\t"); column = 8; word_wrap(8, &column, event->desc); - snprintf(buf, sizeof buf, "(min count: %d)", event->min_count); + snprintf(buf, sizeof buf, " (min count: %d)", event->min_count); word_wrap(8, &column, buf); putchar('\n'); @@ -146,13 +148,13 @@ static void help_for_event(struct op_event * event) if (event->unit->um[j].extra) { u32 extra = event->unit->um[j].extra; - word_wrap(14, &column, "(extra:"); + word_wrap(14, &column, " (extra:"); if (extra & EXTRA_EDGE) - word_wrap(14, &column, "edge"); + word_wrap(14, &column, " edge"); if (extra & EXTRA_INV) - word_wrap(14, &column, "inv"); + word_wrap(14, &column, " inv"); if ((extra >> EXTRA_CMASK_SHIFT) & EXTRA_CMASK_MASK) { - snprintf(buf, sizeof buf, "cmask=%x", + snprintf(buf, sizeof buf, " cmask=%x", (extra >> EXTRA_CMASK_SHIFT) & EXTRA_CMASK_MASK); word_wrap(14, &column, buf); } -- 1.7.4.4 |
From: Maynard J. <may...@us...> - 2011-05-16 19:40:02
|
Andi Kleen wrote: > From: Andi Kleen <ak...@li...> > > Avoid extra spaces printed in individual small wordwraps. This fixes > a cosmentic issue in ophelp event printing with the new extra fields. Committed with one whitespace fixup. -Maynard > --- > utils/ophelp.c | 14 ++++++++------ > 1 files changed, 8 insertions(+), 6 deletions(-) > > diff --git a/utils/ophelp.c b/utils/ophelp.c > index f4e0653..b8da87e 100644 > --- a/utils/ophelp.c > +++ b/utils/ophelp.c > @@ -71,10 +71,12 @@ static void word_wrap(int indent, int *column, char *msg) > printf("\n%*s", indent, ""); > *column = indent; > } > - printf("%.*s ", wlen, msg); > + printf("%.*s", wlen, msg); > *column += wlen + 1; > msg += wlen; > msg += strspn(msg, " "); > + if (*msg) > + putchar(' '); > } > } > > @@ -128,7 +130,7 @@ static void help_for_event(struct op_event * event) > printf(")\n\t"); > column = 8; > word_wrap(8, &column, event->desc); > - snprintf(buf, sizeof buf, "(min count: %d)", event->min_count); > + snprintf(buf, sizeof buf, " (min count: %d)", event->min_count); > word_wrap(8, &column, buf); > putchar('\n'); > > @@ -146,13 +148,13 @@ static void help_for_event(struct op_event * event) > if (event->unit->um[j].extra) { > u32 extra = event->unit->um[j].extra; > > - word_wrap(14, &column, "(extra:"); > + word_wrap(14, &column, " (extra:"); > if (extra & EXTRA_EDGE) > - word_wrap(14, &column, "edge"); > + word_wrap(14, &column, " edge"); > if (extra & EXTRA_INV) > - word_wrap(14, &column, "inv"); > + word_wrap(14, &column, " inv"); > if ((extra >> EXTRA_CMASK_SHIFT) & EXTRA_CMASK_MASK) { > - snprintf(buf, sizeof buf, "cmask=%x", > + snprintf(buf, sizeof buf, " cmask=%x", > (extra >> EXTRA_CMASK_SHIFT) & EXTRA_CMASK_MASK); > word_wrap(14, &column, buf); > } |
From: Andi K. <an...@fi...> - 2011-05-15 04:34:27
|
From: Andi Kleen <ak...@li...> With extra fields available just specifying the numerical unit_mask is often not enough to select the right event. This patch allows to specify unitmasks by name (first word in the description) in the event description. This also makes many unit masks easier to use because names are easier to remember than numbers. v2: Fixed review feedback, fix regression tester v3: Add unit mask name uniqueness check v4: Add another check Signed-off-by: Andi Kleen <ak...@li...> --- doc/opcontrol.1.in | 5 +- doc/oprofile.xml | 2 +- libop/op_events.c | 134 +++++++++++++++++++++++++++++++++++++++ libop/op_events.h | 11 +++ libop/op_parse_event.c | 9 ++- libop/op_parse_event.h | 1 + libop/tests/parse_event_tests.c | 8 +- utils/opcontrol | 13 ++-- utils/ophelp.c | 40 ++++-------- 9 files changed, 180 insertions(+), 43 deletions(-) diff --git a/doc/opcontrol.1.in b/doc/opcontrol.1.in index 5a78aa6..931d877 100644 --- a/doc/opcontrol.1.in +++ b/doc/opcontrol.1.in @@ -107,7 +107,10 @@ or "default" for the default event. The event is of the form count, unit mask, kernel-space counting, user-space counting, respectively. Note that this over-rides all previous events selected; if you want two or more counters used simultaneously, you must specify -them on the same opcontrol invocation. +them on the same opcontrol invocation. The numerical unit mask +can also be a string which matches the first word in the unit mask +description, but only for events with extra parameters shown. +Events with extra parameters must be specified by first word. .br .TP .BI "--separate="[none,lib,kernel,thread,cpu,all] diff --git a/doc/oprofile.xml b/doc/oprofile.xml index 0d6407c..843d96b 100644 --- a/doc/oprofile.xml +++ b/doc/oprofile.xml @@ -877,7 +877,7 @@ of the form <option><emphasis>name</emphasis>:<emphasis>count</emphasis>:<emphas <tbody> <row><entry><option>name</option></entry><entry>The symbolic event name, e.g. 
<constant>CPU_CLK_UNHALTED</constant></entry></row> <row><entry><option>count</option></entry><entry>The counter reset value, e.g. 100000</entry></row> -<row><entry><option>unitmask</option></entry><entry>The unit mask, as given in the events list, e.g. 0x0f</entry></row> +<row><entry><option>unitmask</option></entry><entry>The unit mask, as given in the events list, e.g. 0x0f or a symbolic name (first word of the description if unique)</entry></row> <row><entry><option>kernel</option></entry><entry>Whether to profile kernel code</entry></row> <row><entry><option>user</option></entry><entry>Whether to profile userspace code</entry></row> </tbody> diff --git a/libop/op_events.c b/libop/op_events.c index 8da023b..502ff01 100644 --- a/libop/op_events.c +++ b/libop/op_events.c @@ -17,6 +17,7 @@ #include "op_string.h" #include "op_cpufreq.h" #include "op_hw_specific.h" +#include "op_parse_event.h" #include <string.h> #include <stdlib.h> @@ -921,6 +922,18 @@ struct op_event * find_event_by_name(char const * name, unsigned um, int um_vali } +static struct op_event * find_next_event(struct op_event * e) +{ + struct list_head * n; + + for (n = e->event_next.next; n != &events_list; n = n->next) { + struct op_event * ne = list_entry(n, struct op_event, event_next); + if (!strcmp(e->name, ne->name)) + return ne; + } + return NULL; +} + struct op_event * op_find_event(op_cpu cpu_type, u32 nr, u32 um) { struct op_event * event; @@ -1120,3 +1133,124 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) break; } } + +static void extra_check(struct op_event *e, u32 unit_mask) +{ + unsigned i; + int found = 0; + + for (i = 0; i < e->unit->num; i++) + if (e->unit->um[i].value == unit_mask) + found++; + if (found > 1) { + fprintf(stderr, +"Unfortunately you are not allowed to use named unit masks on events\n" +"without extra values. 
Please specify the magic numbers for the unit mask\n" +"instead\n"); + exit(EXIT_FAILURE); + } +} + +static void another_extra_check(struct op_event *e, char *name, unsigned w) +{ + int found; + unsigned i; + + if (!e->unit->um[w].extra) { + fprintf(stderr, +"Named unit mask `%s' has no extra parameter. Please use a numerical\n" +"unit mask; names would be too easy.\n", name); + exit(EXIT_FAILURE); + } + + found = 0; + for (i = 0; i < e->unit->num; i++) { + int len = strcspn(e->unit->um[i].desc, " \t"); + if (!strncmp(name, e->unit->um[i].desc, len) && + name[len] == '\0') + found++; + } + if (found > 1) { + fprintf(stderr, + "Unit mask name `%s' not unique. Sorry please use a numerical unit mask\n", name); + exit(EXIT_FAILURE); + } +} + +static void do_resolve_unit_mask(struct op_event *e, struct parsed_event *pe, + u32 *extra) +{ + unsigned i; + int found; + + for (;;) { + if (pe->unit_mask_name == NULL) { + int fi = 0; + unsigned um = pe->unit_mask; + int had_unit_mask = pe->unit_mask_valid; + + found = 0; + for (i = 0; i < e->unit->num; i++) { + if (pe->unit_mask_valid && + e->unit->um[i].value == um) { + if (found++ == 0) + fi = i; + } + if (!pe->unit_mask_valid && + e->unit->um[i].value == e->unit->default_mask) { + pe->unit_mask_valid = 1; + pe->unit_mask = e->unit->default_mask; + break; + } + } + if (found > 1 && had_unit_mask) { + fprintf(stderr, + "Non unique numerical unit mask.\n" + "Please specify the unit mask using the first word of the description\n"); + exit(EXIT_FAILURE); + } + extra_check(e, pe->unit_mask); + if (i == e->unit->num) { + e = find_next_event(e); + if (e != NULL) + continue; + } else { + if (extra) + *extra = e->unit->um[i].extra; + } + return; + } + for (i = 0; i < e->unit->num; i++) { + int len = strcspn(e->unit->um[i].desc, " \t"); + if (!strncmp(pe->unit_mask_name, e->unit->um[i].desc, + len) && pe->unit_mask_name[len] == '\0') + break; + } + if (i == e->unit->num) { + e = find_next_event(e); + if (e != NULL) + continue; + 
fprintf(stderr, "Cannot find unit mask %s for %s\n", + pe->unit_mask_name, pe->name); + exit(EXIT_FAILURE); + } + another_extra_check(e, pe->unit_mask_name, i); + pe->unit_mask_valid = 1; + pe->unit_mask = e->unit->um[i].value; + if (extra) + *extra = e->unit->um[i].extra; + return; + } +} + +void op_resolve_unit_mask(struct parsed_event *pe, u32 *extra) +{ + struct op_event *e; + + e = find_event_by_name(pe->name, 0, 0); + if (!e) { + fprintf(stderr, "Cannot find event %s\n", pe->name); + exit(EXIT_FAILURE); + } + return do_resolve_unit_mask(e, pe, extra); +} diff --git a/libop/op_events.h b/libop/op_events.h index 3aaaba2..58d39da 100644 --- a/libop/op_events.h +++ b/libop/op_events.h @@ -128,6 +128,17 @@ struct op_default_event_descr { */ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr); +/** + * op_resolve_unit_mask - resolve a unit mask in a parsed event. + * @pe parsed event + * @extra pointer to extra mask or NULL. + * + * Fills in the extra mask for the unit mask. 
+ */ + +struct parsed_event; +void op_resolve_unit_mask(struct parsed_event *pe, u32 *extra); + #ifdef __cplusplus } #endif diff --git a/libop/op_parse_event.c b/libop/op_parse_event.c index eb99a20..949730b 100644 --- a/libop/op_parse_event.c +++ b/libop/op_parse_event.c @@ -13,6 +13,7 @@ #include <stdio.h> #include <stdlib.h> +#include <ctype.h> #include "op_parse_event.h" #include "op_string.h" @@ -94,8 +95,12 @@ size_t parse_events(struct parsed_event * parsed_events, size_t max_events, if (part) { parsed_events[i].unit_mask_valid = 1; - parsed_events[i].unit_mask = parse_ulong(part); - free(part); + if (!isdigit(*part)) + parsed_events[i].unit_mask_name = part; + else { + parsed_events[i].unit_mask = parse_ulong(part); + free(part); + } } parsed_events[i].kernel = 1; diff --git a/libop/op_parse_event.h b/libop/op_parse_event.h index c8d4144..2519b0d 100644 --- a/libop/op_parse_event.h +++ b/libop/op_parse_event.h @@ -20,6 +20,7 @@ struct parsed_event { char * name; int count; int unit_mask; + char * unit_mask_name; int kernel; int user; int unit_mask_valid; diff --git a/libop/tests/parse_event_tests.c b/libop/tests/parse_event_tests.c index b2fda3b..8e9dabe 100644 --- a/libop/tests/parse_event_tests.c +++ b/libop/tests/parse_event_tests.c @@ -22,10 +22,10 @@ struct events_test { static struct events_test const events[] = { - { { "FOO:3000:0:0:0", 0 }, { "FOO", 3000, 0, 0, 0, 0 } }, - { { "BAR:3000", 0 }, { "BAR", 3000, 0, 1, 1, 0 } }, - { { "FOOBAR:3000:1:1:1", 0 }, { "FOOBAR", 3000, 1, 1, 1, 0 } }, - { { NULL, NULL }, { 0, 0, 0, 0, 0, 0 } } + { { "FOO:3000:0:0:0", 0 }, { "FOO", 3000, 0, NULL, 0, 0, 0 } }, + { { "BAR:3000", 0 }, { "BAR", 3000, 0, NULL, 1, 1, 0 } }, + { { "FOOBAR:3000:1:1:1", 0 }, { "FOOBAR", 3000, 1, NULL, 1, 1, 0 } }, + { { NULL, NULL }, { 0, 0, 0, NULL, 0, 0, 0 } } }; static void do_test(struct events_test const * ev) diff --git a/utils/opcontrol b/utils/opcontrol index 603172d..dfe70f0 100644 --- a/utils/opcontrol +++ b/utils/opcontrol @@ 
-638,12 +638,11 @@ normalise_events() UNIT_MASK=`echo $GOTEVENT | awk -F: '{print $3}'` KERNEL=`echo $GOTEVENT | awk -F: '{print $4}'` USER=`echo $GOTEVENT | awk -F: '{print $5}'` - if test -z "$UNIT_MASK"; then - TMPEVENT="$EVENT:$COUNT" - UNIT_MASK=`$OPHELP --unit-mask $TMPEVENT` - if test "$?" != 0; then - exit 1 - fi + TMPEVENT="$EVENT:$COUNT:$UNIT_MASK" + UNIT_MASK_NAMED="$UNIT_MASK" + UNIT_MASK=`$OPHELP --unit-mask $TMPEVENT` + if test "$?" != 0; then + exit 1 fi if test -z "$KERNEL"; then KERNEL=1 @@ -1442,7 +1441,7 @@ do_param_setup() set_ctr_param $CTR user $USER set_ctr_param $CTR unit_mask $UNIT_MASK - EXTRA=`$OPHELP --extra-mask $GOTEVENT` + EXTRA=`$OPHELP --extra-mask $EVENT:$COUNT:$UNIT_MASK_NAMED` if test "$EXTRA" -ne 0 ; then if ! test -d $MOUNT/$CTR/extra ; then echo >&2 "Warning: $GOTEVENT has extra mask, but kernel does not support extra field" diff --git a/utils/ophelp.c b/utils/ophelp.c index 60b1662..b3aebde 100644 --- a/utils/ophelp.c +++ b/utils/ophelp.c @@ -182,6 +182,8 @@ static void check_event(struct parsed_event * pev, exit(EXIT_FAILURE); } + op_resolve_unit_mask(pev, NULL); + ret = op_check_events(0, event->val, pev->unit_mask, cpu_type); if (ret & OP_INVALID_UM) { @@ -212,6 +214,7 @@ static void resolve_events(void) count = parse_events(parsed_events, num_chosen_events, chosen_events); for (i = 0; i < count; ++i) { + op_resolve_unit_mask(&parsed_events[i], NULL); for (j = i + 1; j < count; ++j) { struct parsed_event * pev1 = &parsed_events[i]; struct parsed_event * pev2 = &parsed_events[j]; @@ -270,7 +273,6 @@ static void resolve_events(void) static void show_unit_mask(void) { - struct op_event * event; size_t count; count = parse_events(parsed_events, num_chosen_events, chosen_events); @@ -279,20 +281,15 @@ static void show_unit_mask(void) exit(EXIT_FAILURE); } - event = find_event_by_name(parsed_events[0].name, 0, 0); - - if (!event) { - fprintf(stderr, "No such event found.\n"); - exit(EXIT_FAILURE); - } - - printf("%d\n", 
event->unit->default_mask); + op_resolve_unit_mask(parsed_events, NULL); + if (parsed_events[0].unit_mask_name) + printf("%s\n", parsed_events[0].unit_mask_name); + else + printf("%d\n", parsed_events[0].unit_mask); } static void show_extra_mask(void) { - unsigned i; - struct op_event * event; size_t count; unsigned extra; @@ -302,22 +299,7 @@ static void show_extra_mask(void) exit(EXIT_FAILURE); } - event = find_event_by_name(parsed_events[0].name, - parsed_events[0].unit_mask, - 1); - if (!event) { - fprintf(stderr, "No such event found.\n"); - exit(EXIT_FAILURE); - } - - /* Not exact match is nothing */ - extra = 0; - for (i = 0; i < event->unit->num; i++) - if (event->unit->um[i].value == (unsigned)parsed_events[0].unit_mask) { - extra = event->unit->um[i].extra; - break; - } - + op_resolve_unit_mask(parsed_events, &extra); printf ("%d\n", extra); } @@ -761,8 +743,10 @@ int main(int argc, char const * argv[]) sprintf(title, "oprofile: available events for CPU type \"%s\"\n\n", pretty); if (want_xml) open_xml_events(title, event_doc, cpu_type); - else + else { printf("%s%s", title, event_doc); + printf("You can use named events by specifying the first word of the description if unique\n\n"); + } list_for_each(pos, events) { struct op_event * event = list_entry(pos, struct op_event, event_next); -- 1.7.4.4 |
From: Maynard J. <may...@us...> - 2011-05-16 19:36:49
|
Andi Kleen wrote: > From: Andi Kleen <ak...@li...> > > With extra fields available just specifying the numerical unit_mask is often > not enough to select the right event. This patch allows to specify unitmasks > by name (first word in the description) in the event description. > > This also makes many unit masks easier to use because names are easier > to remember than numbers. > > v2: Fixed review feedback, fix regression tester > v3: Add unit mask name uniqueness check > v4: Add another check Committed with minor edits to some messages, as well as one code fix, plus fixups for numerous whitespace errors. See below. -Maynard > > Signed-off-by: Andi Kleen <ak...@li...> > --- > doc/opcontrol.1.in | 5 +- > doc/oprofile.xml | 2 +- > libop/op_events.c | 134 +++++++++++++++++++++++++++++++++++++++ > libop/op_events.h | 11 +++ > libop/op_parse_event.c | 9 ++- > libop/op_parse_event.h | 1 + > libop/tests/parse_event_tests.c | 8 +- > utils/opcontrol | 13 ++-- > utils/ophelp.c | 40 ++++-------- > 9 files changed, 180 insertions(+), 43 deletions(-) > > diff --git a/doc/opcontrol.1.in b/doc/opcontrol.1.in > index 5a78aa6..931d877 100644 > --- a/doc/opcontrol.1.in > +++ b/doc/opcontrol.1.in > @@ -107,7 +107,10 @@ or "default" for the default event. The event is of the form > count, unit mask, kernel-space counting, user-space counting, > respectively. Note that this over-rides all previous events selected; > if you want two or more counters used simultaneously, you must specify > -them on the same opcontrol invocation. > +them on the same opcontrol invocation. The numerical unit mask > +can also be a string which matches the first word in the unit mask > +description, but only for events with extra parameters shown. Instead . . . "description, but only for events with "extra:" parameters shown." > +Events with extra parameters must be specified by first word. Instead . . . "Unit masks with "extra:" parameters .I must be specified by first word. 
> .br > .TP > .BI "--separate="[none,lib,kernel,thread,cpu,all] > diff --git a/doc/oprofile.xml b/doc/oprofile.xml > index 0d6407c..843d96b 100644 > --- a/doc/oprofile.xml > +++ b/doc/oprofile.xml > @@ -877,7 +877,7 @@ of the form <option><emphasis>name</emphasis>:<emphasis>count</emphasis>:<emphas > <tbody> > <row><entry><option>name</option></entry><entry>The symbolic event name, e.g. <constant>CPU_CLK_UNHALTED</constant></entry></row> > <row><entry><option>count</option></entry><entry>The counter reset value, e.g. 100000</entry></row> > -<row><entry><option>unitmask</option></entry><entry>The unit mask, as given in the events list, e.g. 0x0f</entry></row> > +<row><entry><option>unitmask</option></entry><entry>The unit mask, as given in the events list, e.g. 0x0f or a symbolic name (first word of the description if unique)</entry></row> > <row><entry><option>kernel</option></entry><entry>Whether to profile kernel code</entry></row> > <row><entry><option>user</option></entry><entry>Whether to profile userspace code</entry></row> > </tbody> > diff --git a/libop/op_events.c b/libop/op_events.c > index 8da023b..502ff01 100644 > --- a/libop/op_events.c > +++ b/libop/op_events.c > @@ -17,6 +17,7 @@ > #include "op_string.h" > #include "op_cpufreq.h" > #include "op_hw_specific.h" > +#include "op_parse_event.h" > > #include <string.h> > #include <stdlib.h> > @@ -921,6 +922,18 @@ struct op_event * find_event_by_name(char const * name, unsigned um, int um_vali > } > > > +static struct op_event * find_next_event(struct op_event * e) > +{ > + struct list_head * n; > + > + for (n = e->event_next.next; n != &events_list; n = n->next) { > + struct op_event * ne = list_entry(n, struct op_event, event_next); > + if (!strcmp(e->name, ne->name)) > + return ne; > + } > + return NULL; > +} > + > struct op_event * op_find_event(op_cpu cpu_type, u32 nr, u32 um) > { > struct op_event * event; > @@ -1120,3 +1133,124 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * 
descr) > break; > } > } > + > +static void extra_check(struct op_event *e, u32 unit_mask) > +{ > + unsigned i; > + int found = 0; > + > + for (i = 0; i < e->unit->num; i++) > + if (e->unit->um[i].value == unit_mask) > + found++; > + if (found > 1) { > + fprintf(stderr, > +"Unfortunately you are not allowed to use named unit masks on events\n" > +"without extra values. Please specify the magic numbers for the unit mask\n" > +"instead\n"); Instead . . . "Named unit masks not allowed for events without 'extra:' values.\n" "Please specify the numerical value for the unit mask. See 'opcontrol' man page for more info.\n"); > + exit(EXIT_FAILURE); > + } > +} > + > +static void another_extra_check(struct op_event *e, char *name, unsigned w) > +{ > + int found; > + unsigned i; > + > + if (!e->unit->um[w].extra) { > + fprintf(stderr, > +"Named unit mask `%s' has no extra parameter. Please use a numerical\n" > +"unit mask; names would be too easy.\n", name); Removed snide comment and changed wording to match the other similar error message. > + exit(EXIT_FAILURE); > + } > + > + found = 0; > + for (i = 0; i < e->unit->num; i++) { > + int len = strcspn(e->unit->um[i].desc, " \t"); > + if (!strncmp(name, e->unit->um[i].desc, len) && > + name[len] == '\0') > + found++; > + } > + if (found > 1) { > + fprintf(stderr, > + "Unit mask name `%s' not unique. 
Sorry please use a numerical unit mask\n", name); > + exit(EXIT_FAILURE); > + } > +} > + > +static void do_resolve_unit_mask(struct op_event *e, struct parsed_event *pe, > + u32 *extra) > +{ > + unsigned i; > + int found; > + > + for (;;) { > + if (pe->unit_mask_name == NULL) { > + int fi = 0; > + unsigned um = pe->unit_mask; > + int had_unit_mask = pe->unit_mask_valid; > + > + found = 0; > + for (i = 0; i < e->unit->num; i++) { > + if (pe->unit_mask_valid && > + e->unit->um[i].value == um) { > + if (found++ == 0) > + fi = i; > + } > + if (!pe->unit_mask_valid && > + e->unit->um[i].value == e->unit->default_mask) { > + pe->unit_mask_valid = 1; > + pe->unit_mask = e->unit->default_mask; > + break; > + } > + } > + if (found > 1 && had_unit_mask) { > + fprintf(stderr, > + "Non unique numerical unit mask.\n" > + "Please specify the unit mask using the first word of the description\n"); > + exit(EXIT_FAILURE); > + } > + extra_check(e, pe->unit_mask); > + if (i == e->unit->num) { > + e = find_next_event(e); > + if (e != NULL) > + continue; > + } else { > + if (extra) > + *extra = e->unit->um[i].extra; > + } > + return; > + } > + for (i = 0; i < e->unit->num; i++) { > + int len = strcspn(e->unit->um[i].desc, " \t"); > + if (!strncmp(pe->unit_mask_name, e->unit->um[i].desc, > + len) && pe->unit_mask_name[len] == '\0') > + break; > + } > + if (i == e->unit->num) { > + e = find_next_event(e); > + if (e != NULL) > + continue; > + fprintf(stderr, "Cannot find unit mask %s for %s\n", > + pe->unit_mask_name, pe->name); > + exit(EXIT_FAILURE); > + } > + another_extra_check(e, pe->unit_mask_name, i); > + pe->unit_mask_valid = 1; > + pe->unit_mask = e->unit->um[i].value; > + if (extra) > + *extra = e->unit->um[i].extra; > + return; > + } > +} > + > +void op_resolve_unit_mask(struct parsed_event *pe, u32 *extra) > +{ > + struct op_event *e; > + > + e = find_event_by_name(pe->name, 0, 0); > + if (!e) { > + fprintf(stderr, "Cannot find event %s\n", pe->name); > + exit(EXIT_FAILURE); > 
+ } > + return do_resolve_unit_mask(e, pe, extra); > +} > diff --git a/libop/op_events.h b/libop/op_events.h > index 3aaaba2..58d39da 100644 > --- a/libop/op_events.h > +++ b/libop/op_events.h > @@ -128,6 +128,17 @@ struct op_default_event_descr { > */ > void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr); > > +/** > + * op_resolve_unit_mask - resolve a unit mask in a parsed event. > + * @pe parsed event > + * @extra pointer to extra mask or NULL. > + * > + * Fills in the extra mask for the unit mask. > + */ > + > +struct parsed_event; > +void op_resolve_unit_mask(struct parsed_event *pe, u32 *extra); > + > #ifdef __cplusplus > } > #endif > diff --git a/libop/op_parse_event.c b/libop/op_parse_event.c > index eb99a20..949730b 100644 > --- a/libop/op_parse_event.c > +++ b/libop/op_parse_event.c > @@ -13,6 +13,7 @@ > > #include <stdio.h> > #include <stdlib.h> > +#include <ctype.h> > > #include "op_parse_event.h" > #include "op_string.h" > @@ -94,8 +95,12 @@ size_t parse_events(struct parsed_event * parsed_events, size_t max_events, > > if (part) { > parsed_events[i].unit_mask_valid = 1; > - parsed_events[i].unit_mask = parse_ulong(part); > - free(part); > + if (!isdigit(*part)) > + parsed_events[i].unit_mask_name = part; > + else { > + parsed_events[i].unit_mask = parse_ulong(part); > + free(part); > + } > } > > parsed_events[i].kernel = 1; > diff --git a/libop/op_parse_event.h b/libop/op_parse_event.h > index c8d4144..2519b0d 100644 > --- a/libop/op_parse_event.h > +++ b/libop/op_parse_event.h > @@ -20,6 +20,7 @@ struct parsed_event { > char * name; > int count; > int unit_mask; > + char * unit_mask_name; > int kernel; > int user; > int unit_mask_valid; > diff --git a/libop/tests/parse_event_tests.c b/libop/tests/parse_event_tests.c > index b2fda3b..8e9dabe 100644 > --- a/libop/tests/parse_event_tests.c > +++ b/libop/tests/parse_event_tests.c > @@ -22,10 +22,10 @@ struct events_test { > > static struct events_test const events[] = > { > - { 
{ "FOO:3000:0:0:0", 0 }, { "FOO", 3000, 0, 0, 0, 0 } }, > - { { "BAR:3000", 0 }, { "BAR", 3000, 0, 1, 1, 0 } }, > - { { "FOOBAR:3000:1:1:1", 0 }, { "FOOBAR", 3000, 1, 1, 1, 0 } }, > - { { NULL, NULL }, { 0, 0, 0, 0, 0, 0 } } > + { { "FOO:3000:0:0:0", 0 }, { "FOO", 3000, 0, NULL, 0, 0, 0 } }, > + { { "BAR:3000", 0 }, { "BAR", 3000, 0, NULL, 1, 1, 0 } }, > + { { "FOOBAR:3000:1:1:1", 0 }, { "FOOBAR", 3000, 1, NULL, 1, 1, 0 } }, > + { { NULL, NULL }, { 0, 0, 0, NULL, 0, 0, 0 } } > }; > > static void do_test(struct events_test const * ev) > diff --git a/utils/opcontrol b/utils/opcontrol > index 603172d..dfe70f0 100644 > --- a/utils/opcontrol > +++ b/utils/opcontrol > @@ -638,12 +638,11 @@ normalise_events() > UNIT_MASK=`echo $GOTEVENT | awk -F: '{print $3}'` > KERNEL=`echo $GOTEVENT | awk -F: '{print $4}'` > USER=`echo $GOTEVENT | awk -F: '{print $5}'` > - if test -z "$UNIT_MASK"; then > - TMPEVENT="$EVENT:$COUNT" > - UNIT_MASK=`$OPHELP --unit-mask $TMPEVENT` > - if test "$?" != 0; then > - exit 1 > - fi > + TMPEVENT="$EVENT:$COUNT:$UNIT_MASK" > + UNIT_MASK_NAMED="$UNIT_MASK" > + UNIT_MASK=`$OPHELP --unit-mask $TMPEVENT` > + if test "$?" != 0; then > + exit 1 > fi > if test -z "$KERNEL"; then > KERNEL=1 > @@ -1442,7 +1441,7 @@ do_param_setup() > set_ctr_param $CTR user $USER > set_ctr_param $CTR unit_mask $UNIT_MASK > > - EXTRA=`$OPHELP --extra-mask $GOTEVENT` > + EXTRA=`$OPHELP --extra-mask $EVENT:$COUNT:$UNIT_MASK_NAMED` > if test "$EXTRA" -ne 0 ; then > if ! 
test -d $MOUNT/$CTR/extra ; then > echo >&2 "Warning: $GOTEVENT has extra mask, but kernel does not support extra field" > diff --git a/utils/ophelp.c b/utils/ophelp.c > index 60b1662..b3aebde 100644 > --- a/utils/ophelp.c > +++ b/utils/ophelp.c > @@ -182,6 +182,8 @@ static void check_event(struct parsed_event * pev, > exit(EXIT_FAILURE); > } > > + op_resolve_unit_mask(pev, NULL); > + > ret = op_check_events(0, event->val, pev->unit_mask, cpu_type); > > if (ret & OP_INVALID_UM) { > @@ -212,6 +214,7 @@ static void resolve_events(void) > count = parse_events(parsed_events, num_chosen_events, chosen_events); > > for (i = 0; i < count; ++i) { > + op_resolve_unit_mask(&parsed_events[i], NULL); > for (j = i + 1; j < count; ++j) { > struct parsed_event * pev1 = &parsed_events[i]; > struct parsed_event * pev2 = &parsed_events[j]; > @@ -270,7 +273,6 @@ static void resolve_events(void) > > static void show_unit_mask(void) > { > - struct op_event * event; > size_t count; > > count = parse_events(parsed_events, num_chosen_events, chosen_events); > @@ -279,20 +281,15 @@ static void show_unit_mask(void) > exit(EXIT_FAILURE); > } > > - event = find_event_by_name(parsed_events[0].name, 0, 0); > - > - if (!event) { > - fprintf(stderr, "No such event found.\n"); > - exit(EXIT_FAILURE); > - } > - > - printf("%d\n", event->unit->default_mask); > + op_resolve_unit_mask(parsed_events, NULL); > + if (parsed_events[0].unit_mask_name) > + printf("%s\n", parsed_events[0].unit_mask_name); > + else > + printf("%d\n", parsed_events[0].unit_mask); > } > > static void show_extra_mask(void) > { > - unsigned i; > - struct op_event * event; > size_t count; > unsigned extra; > > @@ -302,22 +299,7 @@ static void show_extra_mask(void) > exit(EXIT_FAILURE); > } > > - event = find_event_by_name(parsed_events[0].name, > - parsed_events[0].unit_mask, > - 1); > - if (!event) { > - fprintf(stderr, "No such event found.\n"); > - exit(EXIT_FAILURE); > - } > - > - /* Not exact match is nothing */ > - extra = 
0; Need to keep the above initialization of 'extra'. > - for (i = 0; i < event->unit->num; i++) > - if (event->unit->um[i].value == (unsigned)parsed_events[0].unit_mask) { > - extra = event->unit->um[i].extra; > - break; > - } > - > + op_resolve_unit_mask(parsed_events, &extra); > printf ("%d\n", extra); > } > > @@ -761,8 +743,10 @@ int main(int argc, char const * argv[]) > sprintf(title, "oprofile: available events for CPU type \"%s\"\n\n", pretty); > if (want_xml) > open_xml_events(title, event_doc, cpu_type); > - else > + else { > printf("%s%s", title, event_doc); > + printf("You can use named events by specifying the first word of the description if unique\n\n"); Instead . . . printf("For architectures using unit masks, you may be able to specify\n" "unit masks by name. See 'opcontrol' man page.\n\n"); > > > + } > > list_for_each(pos, events) { > struct op_event * event = list_entry(pos, struct op_event, event_next); |
From: Andi Kleen <ak...@li...> Add an event list for Sandy Bridge. Modify oprofile to detect Sandy Bridges. Signed-off-by: Andi Kleen <ak...@li...> --- events/Makefile.am | 1 + events/i386/sandybridge/events | 67 +++++++++ events/i386/sandybridge/unit_masks | 275 ++++++++++++++++++++++++++++++++++++ libop/op_cpu_type.c | 2 + libop/op_cpu_type.h | 1 + libop/op_events.c | 1 + libop/op_hw_specific.h | 3 + utils/ophelp.c | 1 + 8 files changed, 351 insertions(+), 0 deletions(-) create mode 100644 events/i386/sandybridge/events create mode 100644 events/i386/sandybridge/unit_masks diff --git a/events/Makefile.am b/events/Makefile.am index 60c4164..c4101cc 100644 --- a/events/Makefile.am +++ b/events/Makefile.am @@ -18,6 +18,7 @@ event_files = \ i386/core_i7/events i386/core_i7/unit_masks \ i386/nehalem/events i386/nehalem/unit_masks \ i386/westmere/events i386/westmere/unit_masks \ + i386/sandybridge/events i386/sandybridge/unit_masks \ ia64/ia64/events ia64/ia64/unit_masks \ ia64/itanium2/events ia64/itanium2/unit_masks \ ia64/itanium/events ia64/itanium/unit_masks \ diff --git a/events/i386/sandybridge/events b/events/i386/sandybridge/events new file mode 100644 index 0000000..bf941c7 --- /dev/null +++ b/events/i386/sandybridge/events @@ -0,0 +1,67 @@ +# +# Intel "sandy-bridge" microarchitecture core events. +# +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs +# +# Note the minimum counts are not discovered experimentally and could be likely +# lowered in many cases without ill effect. 
+# +include:i386/arch_perfmon +event:0x03 counters:cpuid um:ld_blocks minimum:100000 name:ld_blocks : blocked loads +event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000000 name:misalign_mem_ref : Misaligned memory references +event:0x07 counters:cpuid um:ld_blocks_partial minimum:100000 name:ld_blocks_partial : Partial loads +event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000000 name:dtlb_load_misses : D-TLB misses +event:0x0d counters:cpuid um:int_misc minimum:2000000 name:int_misc : Instruction decoder events +event:0x0e counters:0,1,2,3 um:uops_issued minimum:2000000 name:uops_issued : Number of Uops issued +event:0x14 counters:cpuid um:arith minimum:2000000 name:arith : Misc ALU events +event:0x17 counters:cpuid um:one minimum:2000000 name:insts_written_to_iq : Number of instructions written to Instruction Queue (IQ) this cycle. +event:0x24 counters:cpuid um:l2_rqsts minimum:200000 name:l2_rqsts : Requests from L2 cache +event:0x27 counters:cpuid um:l2_store_lock_rqsts minimum:200000 name:l2_store_lock_rqsts : L2 cache store lock requests +event:0x28 counters:cpuid um:l2_l1d_wb_rqsts minimum:200000 name:l2_l1d_wb_rqsts : writebacks from L1D to the L2 cache +event:0x48 counters:2 um:l1d_pend_miss minimum:2000000 name:l1d_pend_miss : Cycles with L1D load Misses outstanding. 
+event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000000 name:dtlb_store_misses : D-TLB store misses +event:0x4c counters:cpuid um:load_hit_pre minimum:100000 name:load_hit_pre : Load dispatches that hit fill buffer +event:0x4e counters:cpuid um:x02 minimum:2000000 name:hw_pre_req : Hardware Prefetch requests +event:0x51 counters:cpuid um:l1d minimum:2000000 name:l1d : L1D cache events +event:0x59 counters:cpuid um:partial_rat_stalls minimum:2000000 name:partial_rat_stalls : Partial RAT stalls +event:0x5b counters:0,1,2,3 um:resource_stalls2 minimum:2000000 name:resource_stalls2 : Misc resource stalls +event:0x5c counters:cpuid um:cpl_cycles minimum:2000000 name:cpl_cycles : Unhalted core cycles in specific rings +event:0x5e counters:0,1,2,3 um:one minimum:2000000 name:rs_events : Events for the reservation station +event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000000 name:offcore_requests_outstanding : Offcore outstanding transactions +event:0x63 counters:cpuid um:lock_cycles minimum:2000000 name:lock_cycles : Cycles due to LOCK prefixes. +event:0x79 counters:0,1,2,3 um:idq minimum:2000000 name:idq : Instruction Decode Queue events +event:0x80 counters:cpuid um:x02 minimum:200000 name:icache : Instruction cache events +event:0x85 counters:cpuid um:itlb_misses minimum:2000000 name:itlb_misses : I-TLB misses +event:0x87 counters:cpuid um:ild_stall minimum:2000000 name:ild_stall : Instruction decoding stalls +event:0x88 counters:cpuid um:br_inst_exec minimum:200000 name:br_inst_exec : Branch instructions +event:0x89 counters:cpuid um:br_misp_exec minimum:200000 name:br_misp_exec : Mispredicted branch instructions +event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000000 name:idq_uops_not_delivered : uops not delivered to IDQ. +event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000000 name:uops_dispatched_port : Count on which ports uops are dispatched. 
+event:0xa2 counters:cpuid um:resource_stalls minimum:2000000 name:resource_stalls : Core resource stalls +event:0xab counters:cpuid um:dsb2mite_switches minimum:2000000 name:dsb2mite_switches : Number of Decode Stream Buffer (DSB) to MITE switches +event:0xac counters:cpuid um:dsb_fill minimum:2000000 name:dsb_fill : DSB fill events +event:0xae counters:cpuid um:one minimum:10000 name:itlb : ITLB events +event:0xb0 counters:cpuid um:offcore_requests minimum:100000 name:offcore_requests : Requests sent outside the core +event:0xb1 counters:0,1,2,3 um:uops_dispatched minimum:2000000 name:uops_dispatched : uops dispatched +event:0xb2 counters:cpuid um:one minimum:2000000 name:offcore_requests_buffer : Offcore requests buffer events +event:0xb6 counters:cpuid um:one minimum:100000 name:agu_bypass_cancel : AGU bypass cancel +event:0xbd counters:cpuid um:tlb_flush minimum:10000 name:tlb_flush : TLB flushes +event:0xbf counters:cpuid um:l1d_blocks minimum:100000 name:l1d_blocks : L1D cache blocking events +event:0xc0 counters:1 um:one minimum:2000000 name:inst_retired : Instructions retired +event:0xc1 counters:cpuid um:other_assists minimum:100000 name:other_assists : Instructions that needed an assist +event:0xc2 counters:0,1,2,3 um:uops_retired minimum:2000000 name:uops_retired : uops that actually retired. +event:0xc3 counters:cpuid um:machine_clears minimum:100000 name:machine_clears : Number of Machine Clears detected. +event:0xc4 counters:0,1,2,3 um:br_inst_retired minimum:400000 name:br_inst_retired : Counts branch instructions retired +event:0xc5 counters:0,1,2,3 um:br_misp_retired minimum:400000 name:br_misp_retired : Counts mispredicted branch instructions +event:0xca counters:0,1,2,3 um:fp_assist minimum:100000 name:fp_assist : Counts floating point assists +event:0xcb counters:cpuid um:one minimum:100000 name:hw_interrupts : Number of hardware interrupts received by the processor. 
+event:0xcc counters:cpuid um:x20 minimum:2000000 name:rob_misc_events : Count ROB (Register Reorder Buffer) events. +event:0xcd counters:3 um:x02 minimum:2000000 name:mem_trans_retired : Count memory transactions +event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000000 name:mem_uops_retired : Count uops with memory accessed retired +event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000000 name:mem_load_uops_retired : Memory load uops. +event:0xd2 counters:0,1,2,3 um:mem_load_uops_llc_hit_retired minimum:100000 name:mem_load_uops_llc_hit_retired : Memory load uops with LLC (Last level cache) hit +event:0xd4 counters:0,1,2,3 um:x02 minimum:10000 name:mem_load_uops_misc_retired : Memory load uops retired +event:0xf0 counters:cpuid um:l2_trans minimum:200000 name:l2_trans : L2 cache accesses +event:0xf1 counters:cpuid um:l2_lines_in minimum:100000 name:l2_lines_in : L2 cache lines in +event:0xf2 counters:cpuid um:l2_lines_out minimum:100000 name:l2_lines_out : L2 cache lines out +event:0xf4 counters:cpuid um:x10 minimum:100000 name:sq_misc : Store queue misc events diff --git a/events/i386/sandybridge/unit_masks b/events/i386/sandybridge/unit_masks new file mode 100644 index 0000000..cca6cb9 --- /dev/null +++ b/events/i386/sandybridge/unit_masks @@ -0,0 +1,275 @@ +# +# Unit masks for the Intel "sandy-bridge" micro architecture +# +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs +# +include:i386/arch_perfmon +name:x02 type:mandatory default:0x2 + 0x2 No unit mask +name:x10 type:mandatory default:0x10 + 0x10 No unit mask +name:x20 type:mandatory default:0x20 + 0x20 No unit mask +name:ld_blocks type:bitmask default:0x1 + 0x1 data_unknown blocked loads due to store buffer blocks with unknown data. 
+ 0x2 store_forward loads blocked by overlapping with store buffer that cannot be forwarded + 0x8 no_sr This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. + 0x10 all_block Number of cases where any load is blocked but has no DCU miss. +name:misalign_mem_ref type:bitmask default:0x1 + 0x1 loads Speculative cache-line split load uops dispatched to the L1D. + 0x2 stores Speculative cache-line split Store-address uops dispatched to L1D +name:ld_blocks_partial type:bitmask default:0x1 + 0x1 address_alias False dependencies in MOB due to partial compare on address + 0x8 all_sta_block This event counts the number of times that load operations are temporarily blocked because of older stores, with addresses that are not yet known. A load operation may incur more than one block of this type. +name:dtlb_load_misses type:bitmask default:0x1 + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) + 0x4 walk_duration Cycles PMH is busy with this walk + 0x10 stlb_hit First level miss but second level hit; no page walk. +name:int_misc type:bitmask default:0x40 + 0x40 rat_stall_cycles Cycles Resource Allocation Table (RAT) external stall is sent to Instruction Decode Queue (IDQ) for this thread. + 0x3 extra:cmask=1 recovery_cycles Number of cycles waiting to be recover after Nuke due to all other cases except JEClear. + 0x3 extra:cmask=1,edge recovery_stalls_count Edge applied to recovery_cycles, thus counts occurrences. +name:uops_issued type:bitmask default:0x1 + 0x1 any Number of Uops issued by the Resource Allocation Table (RAT) to the Reservation Station (RS) + 0x1 extra:cmask=1,inv stall_cycles cycles no uops issued by this thread. 
+name:arith type:bitmask default:0x1 + 0x1 fpu_div_active Cycles that the divider is busy with any divide or sqrt operation. + 0x1 extra:cmask=1,edge fpu_div Number of times that the divider is actived, includes INT, SIMD and FP. +name:l2_rqsts type:bitmask default:0x1 + 0x1 demand_data_rd_hit Demand Data Read hit L2, no rejects + 0x4 rfo_hit RFO requests that hit L2 cache + 0x8 rfo_miss RFO requests that miss L2 cache + 0x10 code_rd_hit L2 cache hits when fetching instructions, code reads. + 0x20 code_rd_miss L2 cache misses when fetching instructions + 0x40 pf_hit Requests from the L2 hardware prefetchers that hit L2 cache + 0x80 pf_miss Requests from the L2 hardware prefetchers that miss L2 cache + 0x3 all_demand_data_rd Any data read request to L2 cache + 0xc all_rfo Any data RFO request to L2 cache + 0x30 all_code_rd Any code read request to L2 cache + 0xc0 all_pf Any L2 HW prefetch request to L2 cache +name:l2_store_lock_rqsts type:bitmask default:0xf + 0xf all RFOs that access cache lines in any state + 0x1 miss RFO (as a result of regular RFO or Lock request) miss cache - I state + 0x4 hit_e RFO (as a result of regular RFO or Lock request) hits cache in E state + 0x8 hit_m RFO (as a result of regular RFO or Lock request) hits cache in M state +name:l2_l1d_wb_rqsts type:bitmask default:0x4 + 0x4 hit_e writebacks from L1D to L2 cache lines in E state + 0x8 hit_m writebacks from L1D to L2 cache lines in M state +name:l1d_pend_miss type:bitmask default:0x1 + 0x1 pending Cycles with L1D load Misses outstanding. + 0x1 extra:cmask=1,edge occurences This event counts the number of L1D misses outstanding occurences. 
+name:dtlb_store_misses type:bitmask default:0x1 + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) + 0x4 walk_duration Cycles PMH is busy with this walk + 0x10 stlb_hit First level miss but second level hit; no page walk. Only relevant if multiple levels. +name:load_hit_pre type:bitmask default:0x1 + 0x1 sw_pf Load dispatches that hit fill buffer allocated for S/W prefetch. + 0x2 hw_pf Load dispatches that hit fill buffer allocated for HW prefetch. +name:l1d type:bitmask default:0x1 + 0x1 replacement L1D Data line replacements. + 0x2 allocated_in_m L1D M-state Data Cache Lines Allocated + 0x4 eviction L1D M-state Data Cache Lines Evicted due to replacement (only) + 0x8 all_m_replacement All Modified lines evicted out of L1D +name:partial_rat_stalls type:bitmask default:0x20 + 0x20 flags_merge_uop Number of perf sensitive flags-merge uops added by Sandy Bridge u-arch. + 0x40 slow_lea_window Number of cycles with at least 1 slow Load Effective Address (LEA) uop being allocated. + 0x80 mul_single_uop Number of Multiply packed/scalar single precision uops allocated + 0x20 extra:cmask=1 flags_merge_uop_cycles Cycles with perf sensitive flags-merge uops added by SandyBridge u-arch. +name:resource_stalls2 type:bitmask default:0x40 + 0x40 bob_full Cycles Allocator is stalled due Branch Order Buffer (BOB). + 0xf all_prf_control Resource stalls2 control structures full for physical registers + 0xc all_fl_empty Cycles with either free list is empty + 0x4f ooo_rsrc Resource stalls2 control structures full Physical Register Reclaim Table (PRRT), Physical History Table (PHT), INT or SIMD Free List (FL), Branch Order Buffer (BOB) +name:cpl_cycles type:bitmask default:0x1 + 0x1 ring0 Unhalted core cycles the Thread was in Rings 0. + 0x1 extra:cmask=1,edge ring0_trans Transitions from ring123 to Ring0. 
+ 0x2 ring123 Unhalted core cycles the Thread was in Rings 1/2/3. +name:offcore_requests_outstanding type:bitmask default:0x1 + 0x1 demand_data_rd Offcore outstanding Demand Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. Includes L1D data hardware prefetches. + 0x1 extra:cmask=1 cycles_with_demand_data_rd cycles there are Offcore outstanding RD data transactions in the SuperQueue (SQ), queue to uncore. + 0x2 demand_code_rd Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x4 demand_rfo Offcore outstanding RFO (store) transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x8 all_data_rd Offcore outstanding all cacheable Core Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x8 extra:cmask=1 cycles_with_data_rd Cycles there are Offcore outstanding all Data read transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x2 extra:cmask=1 cycles_with_demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x4 extra:cmask=1 cycles_with_demand_rfo Cycles with offcore outstanding demand RFO Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. +name:lock_cycles type:bitmask default:0x1 + 0x1 split_lock_uc_lock_duration Cycles in which the L1D and L2 are locked, due to a UC lock or split lock + 0x2 cache_lock_duration cycles that theL1D is locked +name:idq type:bitmask default:0x2 + 0x2 empty Cycles the Instruction Decode Queue (IDQ) is empty. + 0x4 mite_uops Number of uops delivered to Instruction Decode Queue (IDQ) from MITE path. + 0x8 dsb_uops Number of uops delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path. + 0x10 ms_dsb_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB). 
+ 0x20 ms_mite_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by MITE. + 0x30 ms_uops Number of Uops were delivered into Instruction Decode Queue (IDQ) from MS, initiated by Decode Stream Buffer (DSB) or MITE. + 0x30 extra:cmask=1 ms_cycles Number of cycles that Uops were delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB) or MITE. + 0x4 extra:cmask=1 mite_cycles Cycles MITE is active + 0x8 extra:cmask=1 dsb_cycles Cycles Decode Stream Buffer (DSB) is active + 0x10 extra:cmask=1 ms_dsb_cycles Cycles Decode Stream Buffer (DSB) Microcode Sequenser (MS) is active + 0x10 extra:cmask=1,edge ms_dsb_occur Occurences of Decode Stream Buffer (DSB) Microcode Sequenser (MS) going active + 0x18 extra:cmask=1 all_dsb_cycles_any_uops Cycles Decode Stream Buffer (DSB) is delivering anything   + 0x18 extra:cmask=4 all_dsb_cycles_4_uops Cycles Decode Stream Buffer (DSB) is delivering 4 Uops   + 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering anything    + 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops    + 0x3c mite_all_uops Number of uops delivered to Instruction Decode Queue (IDQ) from any path. +name:itlb_misses type:bitmask default:0x1 + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M) + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M) + 0x4 walk_duration Cycles PMH is busy with this walk. + 0x10 stlb_hit First level miss but second level hit; no page walk. +name:ild_stall type:bitmask default:0x1 + 0x1 lcp Stall "occurrences" due to length changing prefixes (LCP). + 0x4 iq_full Stall cycles when instructions cannot be written because the Instruction Queue (IQ) is full. +name:br_inst_exec type:bitmask default:0xff + 0xff all_branches All branch instructions executed. 
+ 0x41 nontaken_conditional All macro conditional nontaken branch instructions. + 0x81 taken_conditional All macro conditional taken branch instructions. + 0x82 taken_direct_jump All macro unconditional taken branch instructions, excluding calls and indirects. + 0x84 taken_indirect_jump_non_call_ret All taken indirect branches that are not calls nor returns. + 0x88 taken_indirect_near_return All taken indirect branches that have a return mnemonic. + 0x90 taken_direct_near_call All taken non-indirect calls. + 0xa0 taken_indirect_near_call All taken indirect calls, including both register and memory indirect. + 0xc1 all_conditional All macro conditional branch instructions. + 0xc2 all_direct_jmp All macro unconditional branch instructions, excluding calls and indirects + 0xc4 all_indirect_jump_non_call_ret All indirect branches that are not calls nor returns. + 0xc8 all_indirect_near_return All indirect return branches. + 0xd0 all_direct_near_call All non-indirect calls executed. +name:br_misp_exec type:bitmask default:0xff + 0xff all_branches All mispredicted branch instructions executed. + 0x41 nontaken_conditional All nontaken mispredicted macro conditional branch instructions. + 0x81 taken_conditional All taken mispredicted macro conditional branch instructions. + 0x84 taken_indirect_jump_non_call_ret All taken mispredicted indirect branches that are not calls nor returns. + 0x88 taken_return_near All taken mispredicted indirect branches that have a return mnemonic. + 0x90 taken_direct_near_call All taken mispredicted non-indirect calls. + 0xa0 taken_indirect_near_call All taken mispredicted indirect calls, including both register and memory indirect. + 0xc1 all_conditional All mispredicted macro conditional branch instructions. + 0xc4 all_indirect_jump_non_call_ret All mispredicted indirect branches that are not calls nor returns. 
+ 0xd0 all_direct_near_call All mispredicted non-indirect calls +name:idq_uops_not_delivered type:bitmask default:0x1 + 0x1 core Count number of non-delivered uops to Resource Allocation Table (RAT). + 0x1 extra:cmask=4 cycles_0_uops_deliv.core Counts the cycles no uops were delivered + 0x1 extra:cmask=3 cycles_le_1_uop_deliv.core Counts the cycles less than 1 uops were delivered + 0x1 extra:cmask=2 cycles_le_2_uop_deliv.core Counts the cycles less than 2 uops were delivered + 0x1 extra:cmask=1 cycles_le_3_uop_deliv.core Counts the cycles less than 3 uops were delivered + 0x1 extra:cmask=4,inv cycles_ge_1_uop_deliv.core Cycles when 1 or more uops were delivered by the front end. + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. +name:uops_dispatched_port type:bitmask default:0x1 + 0x1 port_0 Cycles which a Uop is dispatched on port 0 + 0x2 port_1 Cycles which a Uop is dispatched on port 1 + 0x4 port_2_ld Cycles which a load Uop is dispatched on port 2 + 0x8 port_2_sta Cycles which a STA Uop is dispatched on port 2 + 0x10 port_3_ld Cycles which a load Uop is dispatched on port 3 + 0x20 port_3_sta Cycles which a STA Uop is dispatched on port 3 + 0x40 port_4 Cycles which a Uop is dispatched on port 4 + 0x80 port_5 Cycles which a Uop is dispatched on port 5 + 0xc port_2 Uops dispatched to port 2, loads and stores (speculative and retired) + 0x30 port_3 Uops dispatched to port 3, loads and stores (speculative and retired) + 0xc port_2_core Uops dispatched to port 2, loads and stores per core (speculative and retired) + 0x30 port_3_core Uops dispatched to port 3, loads and stores per core (speculative and retired) +name:resource_stalls type:bitmask default:0x1 + 0x1 any Cycles Allocation is stalled due to Resource Related reason. + 0x2 lb Cycles Allocator is stalled due to Load Buffer full + 0x4 rs Stall due to no eligible Reservation Station (RS) entry available. 
+ 0x8 sb Cycles Allocator is stalled due to Store Buffer full (not including draining from synch). + 0x10 rob ROB full cycles. + 0xe mem_rs Resource stalls due to LB, SB or Reservation Station (RS) being completely in use + 0xf0 ooo_rsrc Resource stalls due to Rob being full, FCSW, MXCSR and OTHER + 0xa lb_sb Resource stalls due to load or store buffers +name:dsb2mite_switches type:bitmask default:0x1 + 0x1 count Number of Decode Stream Buffer (DSB) to MITE switches + 0x2 penalty_cycles Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. +name:dsb_fill type:bitmask default:0x2 + 0x2 other_cancel Count number of times a valid DSB fill has been actually cancelled for any reason. + 0x8 exceed_dsb_lines Decode Stream Buffer (DSB) Fill encountered > 3 Decode Stream Buffer (DSB) lines. + 0xa all_cancel Count number of times a valid Decode Stream Buffer (DSB) fill has been actually cancelled for any reason. +name:offcore_requests type:bitmask default:0x1 + 0x1 demand_data_rd Demand Data Read requests sent to uncore + 0x2 demand_code_rd Offcore Code read requests. Includes Cacheable and Un-cacheables. + 0x4 demand_rfo Offcore Demand RFOs. Includes regular RFO, Locks, ItoM. + 0x8 all_data_rd Offcore Demand and prefetch data reads returned to the core. +name:uops_dispatched type:bitmask default:0x1 + 0x1 thread Counts total number of uops to be dispatched per-thread each cycle. + 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatched to be executed on this thread. + 0x2 core Counts total number of uops dispatched from any thread +name:tlb_flush type:bitmask default:0x1 + 0x1 dtlb_thread Count number of DTLB flushes of thread-specific entries. 
+ 0x20 stlb_any Count number of any STLB flushes +name:l1d_blocks type:bitmask default:0x1 + 0x1 ld_bank_conflict Any dispatched loads cancelled due to DCU bank conflict + 0x5 extra:cmask=1 bank_conflict_cycles Cycles with l1d blocks due to bank conflicts +name:other_assists type:bitmask default:0x2 + 0x2 itlb_miss_retired Instructions that experienced an ITLB miss. Non Pebs + 0x10 avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable Non Pebs + 0x20 sse_to_avx Number of transitions from legacy SSE to AVX-256 when penalty applicable Non Pebs +name:uops_retired type:bitmask default:0x1 + 0x1 all All uops that actually retired. + 0x2 retire_slots number of retirement slots used non PEBS + 0x1 extra:cmask=1,inv stall_cycles Cycles no executable uops retired + 0x1 extra:cmask=10,inv total_cycles Number of cycles using always true condition applied to non PEBS uops retired event. +name:machine_clears type:bitmask default:0x2 + 0x2 memory_ordering Number of Memory Ordering Machine Clears detected. + 0x4 smc Number of Self-modifying code (SMC) Machine Clears detected. + 0x20 maskmov Number of AVX masked mov Machine Clears detected. +name:br_inst_retired type:bitmask default:0x1 + 0x1 conditional Counts all taken and not taken macro conditional branch instructions. + 0x2 near_call Counts all macro direct and indirect near calls. non PEBS + 0x8 near_return This event counts the number of near ret instructions retired. + 0x10 not_taken Counts all not taken macro branch instructions retired. + 0x20 near_taken Counts the number of near branch taken instructions retired. + 0x40 far_branch Counts the number of far branch instructions retired. 
+ 0x4 all_branches_ps Counts all taken and not taken macro branches including far branches.(Precise Event) + 0x2 near_call_r3 Ring123 only near calls (non precise) + 0x2 near_call_r3_ps Ring123 only near calls (precise event) +name:br_misp_retired type:bitmask default:0x1 + 0x1 conditional All mispredicted macro conditional branch instructions. + 0x2 near_call All macro direct and indirect near calls + 0x10 not_taken number of branch instructions retired that were mispredicted and not-taken. + 0x20 taken number of branch instructions retired that were mispredicted and taken. + 0x4 all_branches_ps all macro branches (Precise Event) +name:fp_assist type:bitmask default:0x1e + 0x1e extra:cmask=1 any Counts any FP_ASSIST umask was incrementing. + 0x2 x87_output output - Numeric Overflow, Numeric Underflow, Inexact Result + 0x4 x87_input input - Invalid Operation, Denormal Operand, SNaN Operand + 0x8 simd_output Any output SSE* FP Assist - Numeric Overflow, Numeric Underflow. + 0x10 simd_input Any input SSE* FP Assist +name:mem_uops_retired type:bitmask default:0x11 + 0x11 stlb_miss_loads STLB misses due to retired loads + 0x12 stlb_miss_stores STLB misses due to retired stores + 0x21 lock_loads Locked retired loads + 0x41 split_loads Retired loads causing cacheline splits + 0x42 split_stores Retired stores causing cacheline splits + 0x81 all_loads Any retired loads + 0x82 all_stores Any retired stores +name:mem_load_uops_retired type:bitmask default:0x1 + 0x1 l1_hit Load hit in nearest-level (L1D) cache + 0x2 l2_hit Load hit in mid-level (L2) cache + 0x4 llc_hit Load hit in last-level (L3) cache with no snoop needed + 0x40 hit_lfb A load missed L1D but hit the Fill Buffer +name:mem_load_uops_llc_hit_retired type:bitmask default:0x1 + 0x1 xsnp_miss Load LLC Hit and a cross-core Snoop missed in on-pkg core cache + 0x2 xsnp_hit Load LLC Hit and a cross-core Snoop hits in on-pkg core cache + 0x4 xsnp_hitm Load had HitM Response from a core on same socket (shared LLC). 
+ 0x8 xsnp_none Load hit in last-level (L3) cache with no snoop needed. +name:l2_trans type:bitmask default:0x80 + 0x80 all_requests Transactions accessing L2 pipe + 0x1 demand_data_rd Demand Data Read requests that access L2 cache, includes L1D prefetches. + 0x2 rfo RFO requests that access L2 cache + 0x4 code_rd L2 cache accesses when fetching instructions including L1D code prefetches + 0x8 all_pf L2 or LLC HW prefetches that access L2 cache + 0x10 l1d_wb L1D writebacks that access L2 cache + 0x20 l2_fill L2 fill requests that access L2 cache + 0x40 l2_wb L2 writebacks that access L2 cache +name:l2_lines_in type:bitmask default:0x7 + 0x7 all L2 cache lines filling L2 + 0x1 i L2 cache lines in I state filling L2 + 0x2 s L2 cache lines in S state filling L2 + 0x4 e L2 cache lines in E state filling L2 +name:l2_lines_out type:bitmask default:0x1 + 0x1 demand_clean Clean line evicted by a demand + 0x2 demand_dirty Dirty line evicted by a demand + 0x4 pf_clean Clean line evicted by an L2 Prefetch + 0x8 pf_dirty Dirty line evicted by an L2 Prefetch + 0xa dirty_all Any Dirty line evicted diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c index b2ebf54..9d11b21 100644 --- a/libop/op_cpu_type.c +++ b/libop/op_cpu_type.c @@ -93,6 +93,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { { "Intel Westmere microarchitecture", "i386/westmere", CPU_WESTMERE, 4 }, { "ARMv7 Scorpion", "arm/armv7-scorpion", CPU_ARM_SCORPION, 5 }, { "ARMv7 ScorpionMP", "arm/armv7-scorpionmp", CPU_ARM_SCORPIONMP, 5 }, + { "Intel Sandy Bridge microarchitecture", "i386/sandybridge", CPU_SANDYBRIDGE, 8 }, }; static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); @@ -117,6 +118,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) case CPU_ATOM: case CPU_NEHALEM: case CPU_WESTMERE: + case CPU_SANDYBRIDGE: return CPU_ARCH_PERFMON; default: /* assume processor in a class by itself */ diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h index 9283ec7..d6cae3a 100644 
--- a/libop/op_cpu_type.h +++ b/libop/op_cpu_type.h @@ -90,6 +90,7 @@ typedef enum { CPU_WESTMERE, /* Intel Westmere microarchitecture */ CPU_ARM_SCORPION, /**< ARM SCORPION */ CPU_ARM_SCORPIONMP, /**< ARM SCORPIONMP */ + CPU_SANDYBRIDGE, /* Intel Sandy-Bridge microarchitecture */ MAX_CPU_TYPE } op_cpu; diff --git a/libop/op_events.c b/libop/op_events.c index 502ff01..0aa0ad3 100644 --- a/libop/op_events.c +++ b/libop/op_events.c @@ -1023,6 +1023,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) case CPU_CORE_I7: case CPU_NEHALEM: case CPU_WESTMERE: + case CPU_SANDYBRIDGE: case CPU_MIPS_LOONGSON2: case CPU_FAMILY12H: case CPU_FAMILY14H: diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h index f1d67a5..a529dd6 100644 --- a/libop/op_hw_specific.h +++ b/libop/op_hw_specific.h @@ -117,6 +117,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) case 0x2c: /* Westmere-EP (Intel Xeon 5600 series) */ case 0x2f: /* Westmere-EX */ return CPU_WESTMERE; + case 0x2a: + case 0x2d: + return CPU_SANDYBRIDGE; } } return cpu_type; diff --git a/utils/ophelp.c b/utils/ophelp.c index b3aebde..f4e0653 100644 --- a/utils/ophelp.c +++ b/utils/ophelp.c @@ -533,6 +533,7 @@ int main(int argc, char const * argv[]) case CPU_CORE_I7: case CPU_NEHALEM: case CPU_WESTMERE: + case CPU_SANDYBRIDGE: case CPU_ATOM: event_doc = "See Intel Architecture Developer's Manual Volume 3B, Appendix A and\n" -- 1.7.4.4 |
From: Maynard J. <may...@us...> - 2011-05-16 19:39:10
|
Andi Kleen wrote: > From: Andi Kleen <ak...@li...> > > Add an event list for Sandy Bridge. Modify oprofile to detect Sandy Bridges. Suravee, I'm pretty sure this patch is fine, but as the x86 oprofile userspace maintainer, I'd like your ack. If you're OK with it, you can commit it. -Maynard > > Signed-off-by: Andi Kleen <ak...@li...> > --- > events/Makefile.am | 1 + > events/i386/sandybridge/events | 67 +++++++++ > events/i386/sandybridge/unit_masks | 275 ++++++++++++++++++++++++++++++++++++ > libop/op_cpu_type.c | 2 + > libop/op_cpu_type.h | 1 + > libop/op_events.c | 1 + > libop/op_hw_specific.h | 3 + > utils/ophelp.c | 1 + > 8 files changed, 351 insertions(+), 0 deletions(-) > create mode 100644 events/i386/sandybridge/events > create mode 100644 events/i386/sandybridge/unit_masks > > diff --git a/events/Makefile.am b/events/Makefile.am > index 60c4164..c4101cc 100644 > --- a/events/Makefile.am > +++ b/events/Makefile.am > @@ -18,6 +18,7 @@ event_files = \ > i386/core_i7/events i386/core_i7/unit_masks \ > i386/nehalem/events i386/nehalem/unit_masks \ > i386/westmere/events i386/westmere/unit_masks \ > + i386/sandybridge/events i386/sandybridge/unit_masks \ > ia64/ia64/events ia64/ia64/unit_masks \ > ia64/itanium2/events ia64/itanium2/unit_masks \ > ia64/itanium/events ia64/itanium/unit_masks \ > diff --git a/events/i386/sandybridge/events b/events/i386/sandybridge/events > new file mode 100644 > index 0000000..bf941c7 > --- /dev/null > +++ b/events/i386/sandybridge/events > @@ -0,0 +1,67 @@ > +# > +# Intel "sandy-bridge" microarchitecture core events. > +# > +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs > +# > +# Note the minimum counts are not discovered experimentally and could be likely > +# lowered in many cases without ill effect. 
> +# > +include:i386/arch_perfmon > +event:0x03 counters:cpuid um:ld_blocks minimum:100000 name:ld_blocks : blocked loads > +event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000000 name:misalign_mem_ref : Misaligned memory references > +event:0x07 counters:cpuid um:ld_blocks_partial minimum:100000 name:ld_blocks_partial : Partial loads > +event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000000 name:dtlb_load_misses : D-TLB misses > +event:0x0d counters:cpuid um:int_misc minimum:2000000 name:int_misc : Instruction decoder events > +event:0x0e counters:0,1,2,3 um:uops_issued minimum:2000000 name:uops_issued : Number of Uops issued > +event:0x14 counters:cpuid um:arith minimum:2000000 name:arith : Misc ALU events > +event:0x17 counters:cpuid um:one minimum:2000000 name:insts_written_to_iq : Number of instructions written to Instruction Queue (IQ) this cycle. > +event:0x24 counters:cpuid um:l2_rqsts minimum:200000 name:l2_rqsts : Requests from L2 cache > +event:0x27 counters:cpuid um:l2_store_lock_rqsts minimum:200000 name:l2_store_lock_rqsts : L2 cache store lock requests > +event:0x28 counters:cpuid um:l2_l1d_wb_rqsts minimum:200000 name:l2_l1d_wb_rqsts : writebacks from L1D to the L2 cache > +event:0x48 counters:2 um:l1d_pend_miss minimum:2000000 name:l1d_pend_miss : Cycles with L1D load Misses outstanding. 
> +event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000000 name:dtlb_store_misses : D-TLB store misses > +event:0x4c counters:cpuid um:load_hit_pre minimum:100000 name:load_hit_pre : Load dispatches that hit fill buffer > +event:0x4e counters:cpuid um:x02 minimum:2000000 name:hw_pre_req : Hardware Prefetch requests > +event:0x51 counters:cpuid um:l1d minimum:2000000 name:l1d : L1D cache events > +event:0x59 counters:cpuid um:partial_rat_stalls minimum:2000000 name:partial_rat_stalls : Partial RAT stalls > +event:0x5b counters:0,1,2,3 um:resource_stalls2 minimum:2000000 name:resource_stalls2 : Misc resource stalls > +event:0x5c counters:cpuid um:cpl_cycles minimum:2000000 name:cpl_cycles : Unhalted core cycles in specific rings > +event:0x5e counters:0,1,2,3 um:one minimum:2000000 name:rs_events : Events for the reservation station > +event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000000 name:offcore_requests_outstanding : Offcore outstanding transactions > +event:0x63 counters:cpuid um:lock_cycles minimum:2000000 name:lock_cycles : Cycles due to LOCK prefixes. > +event:0x79 counters:0,1,2,3 um:idq minimum:2000000 name:idq : Instruction Decode Queue events > +event:0x80 counters:cpuid um:x02 minimum:200000 name:icache : Instruction cache events > +event:0x85 counters:cpuid um:itlb_misses minimum:2000000 name:itlb_misses : I-TLB misses > +event:0x87 counters:cpuid um:ild_stall minimum:2000000 name:ild_stall : Instruction decoding stalls > +event:0x88 counters:cpuid um:br_inst_exec minimum:200000 name:br_inst_exec : Branch instructions > +event:0x89 counters:cpuid um:br_misp_exec minimum:200000 name:br_misp_exec : Mispredicted branch instructions > +event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000000 name:idq_uops_not_delivered : uops not delivered to IDQ. > +event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000000 name:uops_dispatched_port : Count on which ports uops are dispatched. 
> +event:0xa2 counters:cpuid um:resource_stalls minimum:2000000 name:resource_stalls : Core resource stalls > +event:0xab counters:cpuid um:dsb2mite_switches minimum:2000000 name:dsb2mite_switches : Number of Decode Stream Buffer (DSB) to MITE switches > +event:0xac counters:cpuid um:dsb_fill minimum:2000000 name:dsb_fill : DSB fill events > +event:0xae counters:cpuid um:one minimum:10000 name:itlb : ITLB events > +event:0xb0 counters:cpuid um:offcore_requests minimum:100000 name:offcore_requests : Requests sent outside the core > +event:0xb1 counters:0,1,2,3 um:uops_dispatched minimum:2000000 name:uops_dispatched : uops dispatched > +event:0xb2 counters:cpuid um:one minimum:2000000 name:offcore_requests_buffer : Offcore requests buffer events > +event:0xb6 counters:cpuid um:one minimum:100000 name:agu_bypass_cancel : AGU bypass cancel > +event:0xbd counters:cpuid um:tlb_flush minimum:10000 name:tlb_flush : TLB flushes > +event:0xbf counters:cpuid um:l1d_blocks minimum:100000 name:l1d_blocks : L1D cache blocking events > +event:0xc0 counters:1 um:one minimum:2000000 name:inst_retired : Instructions retired > +event:0xc1 counters:cpuid um:other_assists minimum:100000 name:other_assists : Instructions that needed an assist > +event:0xc2 counters:0,1,2,3 um:uops_retired minimum:2000000 name:uops_retired : uops that actually retired. > +event:0xc3 counters:cpuid um:machine_clears minimum:100000 name:machine_clears : Number of Machine Clears detected. > +event:0xc4 counters:0,1,2,3 um:br_inst_retired minimum:400000 name:br_inst_retired : Counts branch instructions retired > +event:0xc5 counters:0,1,2,3 um:br_misp_retired minimum:400000 name:br_misp_retired : Counts mispredicted branch instructions > +event:0xca counters:0,1,2,3 um:fp_assist minimum:100000 name:fp_assist : Counts floating point assists > +event:0xcb counters:cpuid um:one minimum:100000 name:hw_interrupts : Number of hardware interrupts received by the processor. 
> +event:0xcc counters:cpuid um:x20 minimum:2000000 name:rob_misc_events : Count ROB (Register Reorder Buffer) events. > +event:0xcd counters:3 um:x02 minimum:2000000 name:mem_trans_retired : Count memory transactions > +event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000000 name:mem_uops_retired : Count uops with memory accessed retired > +event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000000 name:mem_load_uops_retired : Memory load uops. > +event:0xd2 counters:0,1,2,3 um:mem_load_uops_llc_hit_retired minimum:100000 name:mem_load_uops_llc_hit_retired : Memory load uops with LLC (Last level cache) hit > +event:0xd4 counters:0,1,2,3 um:x02 minimum:10000 name:mem_load_uops_misc_retired : Memory load uops retired > +event:0xf0 counters:cpuid um:l2_trans minimum:200000 name:l2_trans : L2 cache accesses > +event:0xf1 counters:cpuid um:l2_lines_in minimum:100000 name:l2_lines_in : L2 cache lines in > +event:0xf2 counters:cpuid um:l2_lines_out minimum:100000 name:l2_lines_out : L2 cache lines out > +event:0xf4 counters:cpuid um:x10 minimum:100000 name:sq_misc : Store queue misc events > diff --git a/events/i386/sandybridge/unit_masks b/events/i386/sandybridge/unit_masks > new file mode 100644 > index 0000000..cca6cb9 > --- /dev/null > +++ b/events/i386/sandybridge/unit_masks > @@ -0,0 +1,275 @@ > +# > +# Unit masks for the Intel "sandy-bridge" micro architecture > +# > +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs > +# > +include:i386/arch_perfmon > +name:x02 type:mandatory default:0x2 > + 0x2 No unit mask > +name:x10 type:mandatory default:0x10 > + 0x10 No unit mask > +name:x20 type:mandatory default:0x20 > + 0x20 No unit mask > +name:ld_blocks type:bitmask default:0x1 > + 0x1 data_unknown blocked loads due to store buffer blocks with unknown data. 
> + 0x2 store_forward loads blocked by overlapping with store buffer that cannot be forwarded > + 0x8 no_sr This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. > + 0x10 all_block Number of cases where any load is blocked but has no DCU miss. > +name:misalign_mem_ref type:bitmask default:0x1 > + 0x1 loads Speculative cache-line split load uops dispatched to the L1D. > + 0x2 stores Speculative cache-line split Store-address uops dispatched to L1D > +name:ld_blocks_partial type:bitmask default:0x1 > + 0x1 address_alias False dependencies in MOB due to partial compare on address > + 0x8 all_sta_block This event counts the number of times that load operations are temporarily blocked because of older stores, with addresses that are not yet known. A load operation may incur more than one block of this type. > +name:dtlb_load_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) > + 0x4 walk_duration Cycles PMH is busy with this walk > + 0x10 stlb_hit First level miss but second level hit; no page walk. > +name:int_misc type:bitmask default:0x40 > + 0x40 rat_stall_cycles Cycles Resource Allocation Table (RAT) external stall is sent to Instruction Decode Queue (IDQ) for this thread. > + 0x3 extra:cmask=1 recovery_cycles Number of cycles waiting to be recover after Nuke due to all other cases except JEClear. > + 0x3 extra:cmask=1,edge recovery_stalls_count Edge applied to recovery_cycles, thus counts occurrences. > +name:uops_issued type:bitmask default:0x1 > + 0x1 any Number of Uops issued by the Resource Allocation Table (RAT) to the Reservation Station (RS) > + 0x1 extra:cmask=1,inv stall_cycles cycles no uops issued by this thread. 
> +name:arith type:bitmask default:0x1 > + 0x1 fpu_div_active Cycles that the divider is busy with any divide or sqrt operation. > + 0x1 extra:cmask=1,edge fpu_div Number of times that the divider is actived, includes INT, SIMD and FP. > +name:l2_rqsts type:bitmask default:0x1 > + 0x1 demand_data_rd_hit Demand Data Read hit L2, no rejects > + 0x4 rfo_hit RFO requests that hit L2 cache > + 0x8 rfo_miss RFO requests that miss L2 cache > + 0x10 code_rd_hit L2 cache hits when fetching instructions, code reads. > + 0x20 code_rd_miss L2 cache misses when fetching instructions > + 0x40 pf_hit Requests from the L2 hardware prefetchers that hit L2 cache > + 0x80 pf_miss Requests from the L2 hardware prefetchers that miss L2 cache > + 0x3 all_demand_data_rd Any data read request to L2 cache > + 0xc all_rfo Any data RFO request to L2 cache > + 0x30 all_code_rd Any code read request to L2 cache > + 0xc0 all_pf Any L2 HW prefetch request to L2 cache > +name:l2_store_lock_rqsts type:bitmask default:0xf > + 0xf all RFOs that access cache lines in any state > + 0x1 miss RFO (as a result of regular RFO or Lock request) miss cache - I state > + 0x4 hit_e RFO (as a result of regular RFO or Lock request) hits cache in E state > + 0x8 hit_m RFO (as a result of regular RFO or Lock request) hits cache in M state > +name:l2_l1d_wb_rqsts type:bitmask default:0x4 > + 0x4 hit_e writebacks from L1D to L2 cache lines in E state > + 0x8 hit_m writebacks from L1D to L2 cache lines in M state > +name:l1d_pend_miss type:bitmask default:0x1 > + 0x1 pending Cycles with L1D load Misses outstanding. > + 0x1 extra:cmask=1,edge occurences This event counts the number of L1D misses outstanding occurences. 
> +name:dtlb_store_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) > + 0x4 walk_duration Cycles PMH is busy with this walk > + 0x10 stlb_hit First level miss but second level hit; no page walk. Only relevant if multiple levels. > +name:load_hit_pre type:bitmask default:0x1 > + 0x1 sw_pf Load dispatches that hit fill buffer allocated for S/W prefetch. > + 0x2 hw_pf Load dispatches that hit fill buffer allocated for HW prefetch. > +name:l1d type:bitmask default:0x1 > + 0x1 replacement L1D Data line replacements. > + 0x2 allocated_in_m L1D M-state Data Cache Lines Allocated > + 0x4 eviction L1D M-state Data Cache Lines Evicted due to replacement (only) > + 0x8 all_m_replacement All Modified lines evicted out of L1D > +name:partial_rat_stalls type:bitmask default:0x20 > + 0x20 flags_merge_uop Number of perf sensitive flags-merge uops added by Sandy Bridge u-arch. > + 0x40 slow_lea_window Number of cycles with at least 1 slow Load Effective Address (LEA) uop being allocated. > + 0x80 mul_single_uop Number of Multiply packed/scalar single precision uops allocated > + 0x20 extra:cmask=1 flags_merge_uop_cycles Cycles with perf sensitive flags-merge uops added by SandyBridge u-arch. > +name:resource_stalls2 type:bitmask default:0x40 > + 0x40 bob_full Cycles Allocator is stalled due Branch Order Buffer (BOB). > + 0xf all_prf_control Resource stalls2 control structures full for physical registers > + 0xc all_fl_empty Cycles with either free list is empty > + 0x4f ooo_rsrc Resource stalls2 control structures full Physical Register Reclaim Table (PRRT), Physical History Table (PHT), INT or SIMD Free List (FL), Branch Order Buffer (BOB) > +name:cpl_cycles type:bitmask default:0x1 > + 0x1 ring0 Unhalted core cycles the Thread was in Rings 0. 
> + 0x1 extra:cmask=1,edge ring0_trans Transitions from ring123 to Ring0. > + 0x2 ring123 Unhalted core cycles the Thread was in Rings 1/2/3. > +name:offcore_requests_outstanding type:bitmask default:0x1 > + 0x1 demand_data_rd Offcore outstanding Demand Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. Includes L1D data hardware prefetches. > + 0x1 extra:cmask=1 cycles_with_demand_data_rd cycles there are Offcore outstanding RD data transactions in the SuperQueue (SQ), queue to uncore. > + 0x2 demand_code_rd Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x4 demand_rfo Offcore outstanding RFO (store) transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x8 all_data_rd Offcore outstanding all cacheable Core Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x8 extra:cmask=1 cycles_with_data_rd Cycles there are Offcore outstanding all Data read transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x2 extra:cmask=1 cycles_with_demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x4 extra:cmask=1 cycles_with_demand_rfo Cycles with offcore outstanding demand RFO Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > +name:lock_cycles type:bitmask default:0x1 > + 0x1 split_lock_uc_lock_duration Cycles in which the L1D and L2 are locked, due to a UC lock or split lock > + 0x2 cache_lock_duration cycles that theL1D is locked > +name:idq type:bitmask default:0x2 > + 0x2 empty Cycles the Instruction Decode Queue (IDQ) is empty. > + 0x4 mite_uops Number of uops delivered to Instruction Decode Queue (IDQ) from MITE path. > + 0x8 dsb_uops Number of uops delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path. 
> + 0x10 ms_dsb_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB). > + 0x20 ms_mite_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by MITE. > + 0x30 ms_uops Number of Uops were delivered into Instruction Decode Queue (IDQ) from MS, initiated by Decode Stream Buffer (DSB) or MITE. > + 0x30 extra:cmask=1 ms_cycles Number of cycles that Uops were delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB) or MITE. > + 0x4 extra:cmask=1 mite_cycles Cycles MITE is active > + 0x8 extra:cmask=1 dsb_cycles Cycles Decode Stream Buffer (DSB) is active > + 0x10 extra:cmask=1 ms_dsb_cycles Cycles Decode Stream Buffer (DSB) Microcode Sequenser (MS) is active > + 0x10 extra:cmask=1,edge ms_dsb_occur Occurences of Decode Stream Buffer (DSB) Microcode Sequenser (MS) going active > + 0x18 extra:cmask=1 all_dsb_cycles_any_uops Cycles Decode Stream Buffer (DSB) is delivering anything > + 0x18 extra:cmask=4 all_dsb_cycles_4_uops Cycles Decode Stream Buffer (DSB) is delivering 4 Uops > + 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering anything > + 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops > + 0x3c mite_all_uops Number of uops delivered to Instruction Decode Queue (IDQ) from any path. > +name:itlb_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M) > + 0x4 walk_duration Cycles PMH is busy with this walk. > + 0x10 stlb_hit First level miss but second level hit; no page walk. > +name:ild_stall type:bitmask default:0x1 > + 0x1 lcp Stall "occurrences" due to length changing prefixes (LCP). > + 0x4 iq_full Stall cycles when instructions cannot be written because the Instruction Queue (IQ) is full. 
> +name:br_inst_exec type:bitmask default:0xff > + 0xff all_branches All branch instructions executed. > + 0x41 nontaken_conditional All macro conditional nontaken branch instructions. > + 0x81 taken_conditional All macro conditional taken branch instructions. > + 0x82 taken_direct_jump All macro unconditional taken branch instructions, excluding calls and indirects. > + 0x84 taken_indirect_jump_non_call_ret All taken indirect branches that are not calls nor returns. > + 0x88 taken_indirect_near_return All taken indirect branches that have a return mnemonic. > + 0x90 taken_direct_near_call All taken non-indirect calls. > + 0xa0 taken_indirect_near_call All taken indirect calls, including both register and memory indirect. > + 0xc1 all_conditional All macro conditional branch instructions. > + 0xc2 all_direct_jmp All macro unconditional branch instructions, excluding calls and indirects > + 0xc4 all_indirect_jump_non_call_ret All indirect branches that are not calls nor returns. > + 0xc8 all_indirect_near_return All indirect return branches. > + 0xd0 all_direct_near_call All non-indirect calls executed. > +name:br_misp_exec type:bitmask default:0xff > + 0xff all_branches All mispredicted branch instructions executed. > + 0x41 nontaken_conditional All nontaken mispredicted macro conditional branch instructions. > + 0x81 taken_conditional All taken mispredicted macro conditional branch instructions. > + 0x84 taken_indirect_jump_non_call_ret All taken mispredicted indirect branches that are not calls nor returns. > + 0x88 taken_return_near All taken mispredicted indirect branches that have a return mnemonic. > + 0x90 taken_direct_near_call All taken mispredicted non-indirect calls. > + 0xa0 taken_indirect_near_call All taken mispredicted indirect calls, including both register and memory indirect. > + 0xc1 all_conditional All mispredicted macro conditional branch instructions. 
> + 0xc4 all_indirect_jump_non_call_ret All mispredicted indirect branches that are not calls nor returns. > + 0xd0 all_direct_near_call All mispredicted non-indirect calls > +name:idq_uops_not_delivered type:bitmask default:0x1 > + 0x1 core Count number of non-delivered uops to Resource Allocation Table (RAT). > + 0x1 extra:cmask=4 cycles_0_uops_deliv.core Counts the cycles no uops were delivered > + 0x1 extra:cmask=3 cycles_le_1_uop_deliv.core Counts the cycles less than 1 uops were delivered > + 0x1 extra:cmask=2 cycles_le_2_uop_deliv.core Counts the cycles less than 2 uops were delivered > + 0x1 extra:cmask=1 cycles_le_3_uop_deliv.core Counts the cycles less than 3 uops were delivered > + 0x1 extra:cmask=4,inv cycles_ge_1_uop_deliv.core Cycles when 1 or more uops were delivered to the by the front end. > + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. > +name:uops_dispatched_port type:bitmask default:0x1 > + 0x1 port_0 Cycles which a Uop is dispatched on port 0 > + 0x2 port_1 Cycles which a Uop is dispatched on port 1 > + 0x4 port_2_ld Cycles which a load Uop is dispatched on port 2 > + 0x8 port_2_sta Cycles which a STA Uop is dispatched on port 2 > + 0x10 port_3_ld Cycles which a load Uop is dispatched on port 3 > + 0x20 port_3_sta Cycles which a STA Uop is dispatched on port 3 > + 0x40 port_4 Cycles which a Uop is dispatched on port 4 > + 0x80 port_5 Cycles which a Uop is dispatched on port 5 > + 0xc port_2 Uops disptached to port 2, loads and stores (speculative and retired) > + 0x30 port_3 Uops disptached to port 3, loads and stores (speculative and retired) > + 0xc port_2_core Uops disptached to port 2, loads and stores per core (speculative and retired) > + 0x30 port_3_core Uops disptached to port 3, loads and stores per core (speculative and retired) > +name:resource_stalls type:bitmask default:0x1 > + 0x1 any Cycles Allocation is stalled due to Resource Related reason. 
> + 0x2 lb Cycles Allocator is stalled due to Load Buffer full > + 0x4 rs Stall due to no eligible Reservation Station (RS) entry available. > + 0x8 sb Cycles Allocator is stalled due to Store Buffer full (not including draining from synch). > + 0x10 rob ROB full cycles. > + 0xe mem_rs Resource stalls due to LB, SB or Reservation Station (RS) being completely in use > + 0xf0 ooo_rsrc Resource stalls due to Rob being full, FCSW, MXCSR and OTHER > + 0xa lb_sb Resource stalls due to load or store buffers > +name:dsb2mite_switches type:bitmask default:0x1 > + 0x1 count Number of Decode Stream Buffer (DSB) to MITE switches > + 0x2 penalty_cycles Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. > +name:dsb_fill type:bitmask default:0x2 > + 0x2 other_cancel Count number of times a valid DSB fill has been actually cancelled for any reason. > + 0x8 exceed_dsb_lines Decode Stream Buffer (DSB) Fill encountered > 3 Decode Stream Buffer (DSB) lines. > + 0xa all_cancel Count number of times a valid Decode Stream Buffer (DSB) fill has been actually cancelled for any reason. > +name:offcore_requests type:bitmask default:0x1 > + 0x1 demand_data_rd Demand Data Read requests sent to uncore > + 0x2 demand_code_rd Offcore Code read requests. Includes Cacheable and Un-cacheables. > + 0x4 demand_rfo Offcore Demand RFOs. Includes regular RFO, Locks, ItoM. > + 0x8 all_data_rd Offcore Demand and prefetch data reads returned to the core. > +name:uops_dispatched type:bitmask default:0x1 > + 0x1 thread Counts total number of uops to be dispatched per-thread each cycle. > + 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatced to be executed on this thread. > + 0x2 core Counts total number of uops dispatched from any thread > +name:tlb_flush type:bitmask default:0x1 > + 0x1 dtlb_thread Count number of DTLB flushes of thread-specific entries. 
> + 0x20 stlb_any Count number of any STLB flushes > +name:l1d_blocks type:bitmask default:0x1 > + 0x1 ld_bank_conflict Any dispatched loads cancelled due to DCU bank conflict > + 0x5 extra:cmask=1 bank_conflict_cycles Cycles with l1d blocks due to bank conflicts > +name:other_assists type:bitmask default:0x2 > + 0x2 itlb_miss_retired Instructions that experienced an ITLB miss. Non Pebs > + 0x10 avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable Non Pebs > + 0x20 sse_to_avx Number of transitions from legacy SSE to AVX-256 when penalty applicable Non Pebs > +name:uops_retired type:bitmask default:0x1 > + 0x1 all All uops that actually retired. > + 0x2 retire_slots number of retirement slots used non PEBS > + 0x1 extra:cmask=1,inv stall_cycles Cycles no executable uops retired > + 0x1 extra:cmask=10,inv total_cycles Number of cycles using always true condition applied to non PEBS uops retired event. > +name:machine_clears type:bitmask default:0x2 > + 0x2 memory_ordering Number of Memory Ordering Machine Clears detected. > + 0x4 smc Number of Self-modifying code (SMC) Machine Clears detected. > + 0x20 maskmov Number of AVX masked mov Machine Clears detected. > +name:br_inst_retired type:bitmask default:0x1 > + 0x1 conditional Counts all taken and not taken macro conditional branch instructions. > + 0x2 near_call Counts all macro direct and indirect near calls. non PEBS > + 0x8 near_return This event counts the number of near ret instructions retired. > + 0x10 not_taken Counts all not taken macro branch instructions retired. > + 0x20 near_taken Counts the number of near branch taken instructions retired. > + 0x40 far_branch Counts the number of far branch instructions retired. 
> + 0x4 all_branches_ps Counts all taken and not taken macro branches including far branches.(Precise Event) > + 0x2 near_call_r3 Ring123 only near calls (non precise) > + 0x2 near_call_r3_ps Ring123 only near calls (precise event) > +name:br_misp_retired type:bitmask default:0x1 > + 0x1 conditional All mispredicted macro conditional branch instructions. > + 0x2 near_call All macro direct and indirect near calls > + 0x10 not_taken number of branch instructions retired that were mispredicted and not-taken. > + 0x20 taken number of branch instructions retired that were mispredicted and taken. > + 0x4 all_branches_ps all macro branches (Precise Event) > +name:fp_assist type:bitmask default:0x1e > + 0x1e extra:cmask=1 any Counts any FP_ASSIST umask was incrementing. > + 0x2 x87_output output - Numeric Overflow, Numeric Underflow, Inexact Result > + 0x4 x87_input input - Invalid Operation, Denormal Operand, SNaN Operand > + 0x8 simd_output Any output SSE* FP Assist - Numeric Overflow, Numeric Underflow. 
> + 0x10 simd_input Any input SSE* FP Assist > +name:mem_uops_retired type:bitmask default:0x11 > + 0x11 stlb_miss_loads STLB misses dues to retired loads > + 0x12 stlb_miss_stores STLB misses dues to retired stores > + 0x21 lock_loads Locked retired loads > + 0x41 split_loads Retired loads causing cacheline splits > + 0x42 split_stores Retired stores causing cacheline splits > + 0x81 all_loads Any retired loads > + 0x82 all_stores Any retired stores > +name:mem_load_uops_retired type:bitmask default:0x1 > + 0x1 l1_hit Load hit in nearest-level (L1D) cache > + 0x2 l2_hit Load hit in mid-level (L2) cache > + 0x4 llc_hit Load hit in last-level (L3) cache with no snoop needed > + 0x40 hit_lfb A load missed L1D but hit the Fill Buffer > +name:mem_load_uops_llc_hit_retired type:bitmask default:0x1 > + 0x1 xsnp_miss Load LLC Hit and a cross-core Snoop missed in on-pkg core cache > + 0x2 xsnp_hit Load LLC Hit and a cross-core Snoop hits in on-pkg core cache > + 0x4 xsnp_hitm Load had HitM Response from a core on same socket (shared LLC). > + 0x8 xsnp_none Load hit in last-level (L3) cache with no snoop needed. > +name:l2_trans type:bitmask default:0x80 > + 0x80 all_requests Transactions accessing L2 pipe > + 0x1 demand_data_rd Demand Data Read requests that access L2 cache, includes L1D prefetches. 
> + 0x2 rfo RFO requests that access L2 cache > + 0x4 code_rd L2 cache accesses when fetching instructions including L1D code prefetches > + 0x8 all_pf L2 or LLC HW prefetches that access L2 cache > + 0x10 l1d_wb L1D writebacks that access L2 cache > + 0x20 l2_fill L2 fill requests that access L2 cache > + 0x40 l2_wb L2 writebacks that access L2 cache > +name:l2_lines_in type:bitmask default:0x7 > + 0x7 all L2 cache lines filling L2 > + 0x1 i L2 cache lines in I state filling L2 > + 0x2 s L2 cache lines in S state filling L2 > + 0x4 e L2 cache lines in E state filling L2 > +name:l2_lines_out type:bitmask default:0x1 > + 0x1 demand_clean Clean line evicted by a demand > + 0x2 demand_dirty Dirty line evicted by a demand > + 0x4 pf_clean Clean line evicted by an L2 Prefetch > + 0x8 pf_dirty Dirty line evicted by an L2 Prefetch > + 0xa dirty_all Any Dirty line evicted > diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c > index b2ebf54..9d11b21 100644 > --- a/libop/op_cpu_type.c > +++ b/libop/op_cpu_type.c > @@ -93,6 +93,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { > { "Intel Westmere microarchitecture", "i386/westmere", CPU_WESTMERE, 4 }, > { "ARMv7 Scorpion", "arm/armv7-scorpion", CPU_ARM_SCORPION, 5 }, > { "ARMv7 ScorpionMP", "arm/armv7-scorpionmp", CPU_ARM_SCORPIONMP, 5 }, > + { "Intel Sandy Bridge microarchitecture", "i386/sandybridge", CPU_SANDYBRIDGE, 8 }, > }; > > static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); > @@ -117,6 +118,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) > case CPU_ATOM: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > return CPU_ARCH_PERFMON; > default: > /* assume processor in a class by itself */ > diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h > index 9283ec7..d6cae3a 100644 > --- a/libop/op_cpu_type.h > +++ b/libop/op_cpu_type.h > @@ -90,6 +90,7 @@ typedef enum { > CPU_WESTMERE, /* Intel Westmere microarchitecture */ > CPU_ARM_SCORPION, /**< ARM 
SCORPION */ > CPU_ARM_SCORPIONMP, /**< ARM SCORPIONMP */ > + CPU_SANDYBRIDGE, /* Intel Sandy-Bridge microarchitecture */ > MAX_CPU_TYPE > } op_cpu; > > diff --git a/libop/op_events.c b/libop/op_events.c > index 502ff01..0aa0ad3 100644 > --- a/libop/op_events.c > +++ b/libop/op_events.c > @@ -1023,6 +1023,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) > case CPU_CORE_I7: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > case CPU_MIPS_LOONGSON2: > case CPU_FAMILY12H: > case CPU_FAMILY14H: > diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h > index f1d67a5..a529dd6 100644 > --- a/libop/op_hw_specific.h > +++ b/libop/op_hw_specific.h > @@ -117,6 +117,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) > case 0x2c: /* Westmere-EP (Intel Xeon 5600 series) */ > case 0x2f: /* Westmere-EX */ > return CPU_WESTMERE; > + case 0x2a: > + case 0x2d: > + return CPU_SANDYBRIDGE; > } > } > return cpu_type; > diff --git a/utils/ophelp.c b/utils/ophelp.c > index b3aebde..f4e0653 100644 > --- a/utils/ophelp.c > +++ b/utils/ophelp.c > @@ -533,6 +533,7 @@ int main(int argc, char const * argv[]) > case CPU_CORE_I7: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > case CPU_ATOM: > event_doc = > "See Intel Architecture Developer's Manual Volume 3B, Appendix A and\n" |
From: Suthikulpanit, S. <Sur...@am...> - 2011-05-17 19:13:17
|
Besides a couple issues: - events file already exists and exactly the same (I could just move the existing events file so that the patch would apply cleanly). - white spaces (I could fix this during commit.) This patch applied clean. PS: Do we mention anywhere (i.e. in the opcontrol --help) regarding the "Named-unitmask" feature? -----Original Message----- From: Maynard Johnson [mailto:may...@us...] Sent: Monday, May 16, 2011 2:39 PM To: Andi Kleen; Suthikulpanit, Suravee Cc: opr...@li...; Andi Kleen Subject: Re: [PATCH 3/4] Add Sandy Bridge support Andi Kleen wrote: > From: Andi Kleen <ak...@li...> > > Add an event list for Sandy Bridge. Modify oprofile to detect Sandy Bridges. Suravee, I'm pretty sure this patch is fine, but as the x86 oprofile userspace maintainer, I'd like your ack. If you're OK with it, you can commit it. -Maynard > > Signed-off-by: Andi Kleen <ak...@li...> > --- > events/Makefile.am | 1 + > events/i386/sandybridge/events | 67 +++++++++ > events/i386/sandybridge/unit_masks | 275 ++++++++++++++++++++++++++++++++++++ > libop/op_cpu_type.c | 2 + > libop/op_cpu_type.h | 1 + > libop/op_events.c | 1 + > libop/op_hw_specific.h | 3 + > utils/ophelp.c | 1 + > 8 files changed, 351 insertions(+), 0 deletions(-) > create mode 100644 events/i386/sandybridge/events > create mode 100644 events/i386/sandybridge/unit_masks > > diff --git a/events/Makefile.am b/events/Makefile.am > index 60c4164..c4101cc 100644 > --- a/events/Makefile.am > +++ b/events/Makefile.am > @@ -18,6 +18,7 @@ event_files = \ > i386/core_i7/events i386/core_i7/unit_masks \ > i386/nehalem/events i386/nehalem/unit_masks \ > i386/westmere/events i386/westmere/unit_masks \ > + i386/sandybridge/events i386/sandybridge/unit_masks \ > ia64/ia64/events ia64/ia64/unit_masks \ > ia64/itanium2/events ia64/itanium2/unit_masks \ > ia64/itanium/events ia64/itanium/unit_masks \ > diff --git a/events/i386/sandybridge/events b/events/i386/sandybridge/events > new file mode 100644 > index 0000000..bf941c7 
> --- /dev/null > +++ b/events/i386/sandybridge/events > @@ -0,0 +1,67 @@ > +# > +# Intel "sandy-bridge" microarchitecture core events. > +# > +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs > +# > +# Note the minimum counts are not discovered experimentally and could be likely > +# lowered in many cases without ill effect. > +# > +include:i386/arch_perfmon > +event:0x03 counters:cpuid um:ld_blocks minimum:100000 name:ld_blocks : blocked loads > +event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000000 name:misalign_mem_ref : Misaligned memory references > +event:0x07 counters:cpuid um:ld_blocks_partial minimum:100000 name:ld_blocks_partial : Partial loads > +event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000000 name:dtlb_load_misses : D-TLB misses > +event:0x0d counters:cpuid um:int_misc minimum:2000000 name:int_misc : Instruction decoder events > +event:0x0e counters:0,1,2,3 um:uops_issued minimum:2000000 name:uops_issued : Number of Uops issued > +event:0x14 counters:cpuid um:arith minimum:2000000 name:arith : Misc ALU events > +event:0x17 counters:cpuid um:one minimum:2000000 name:insts_written_to_iq : Number of instructions written to Instruction Queue (IQ) this cycle. > +event:0x24 counters:cpuid um:l2_rqsts minimum:200000 name:l2_rqsts : Requests from L2 cache > +event:0x27 counters:cpuid um:l2_store_lock_rqsts minimum:200000 name:l2_store_lock_rqsts : L2 cache store lock requests > +event:0x28 counters:cpuid um:l2_l1d_wb_rqsts minimum:200000 name:l2_l1d_wb_rqsts : writebacks from L1D to the L2 cache > +event:0x48 counters:2 um:l1d_pend_miss minimum:2000000 name:l1d_pend_miss : Cycles with L1D load Misses outstanding. 
> +event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000000 name:dtlb_store_misses : D-TLB store misses > +event:0x4c counters:cpuid um:load_hit_pre minimum:100000 name:load_hit_pre : Load dispatches that hit fill buffer > +event:0x4e counters:cpuid um:x02 minimum:2000000 name:hw_pre_req : Hardware Prefetch requests > +event:0x51 counters:cpuid um:l1d minimum:2000000 name:l1d : L1D cache events > +event:0x59 counters:cpuid um:partial_rat_stalls minimum:2000000 name:partial_rat_stalls : Partial RAT stalls > +event:0x5b counters:0,1,2,3 um:resource_stalls2 minimum:2000000 name:resource_stalls2 : Misc resource stalls > +event:0x5c counters:cpuid um:cpl_cycles minimum:2000000 name:cpl_cycles : Unhalted core cycles in specific rings > +event:0x5e counters:0,1,2,3 um:one minimum:2000000 name:rs_events : Events for the reservation station > +event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000000 name:offcore_requests_outstanding : Offcore outstanding transactions > +event:0x63 counters:cpuid um:lock_cycles minimum:2000000 name:lock_cycles : Cycles due to LOCK prefixes. > +event:0x79 counters:0,1,2,3 um:idq minimum:2000000 name:idq : Instruction Decode Queue events > +event:0x80 counters:cpuid um:x02 minimum:200000 name:icache : Instruction cache events > +event:0x85 counters:cpuid um:itlb_misses minimum:2000000 name:itlb_misses : I-TLB misses > +event:0x87 counters:cpuid um:ild_stall minimum:2000000 name:ild_stall : Instruction decoding stalls > +event:0x88 counters:cpuid um:br_inst_exec minimum:200000 name:br_inst_exec : Branch instructions > +event:0x89 counters:cpuid um:br_misp_exec minimum:200000 name:br_misp_exec : Mispredicted branch instructions > +event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000000 name:idq_uops_not_delivered : uops not delivered to IDQ. > +event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000000 name:uops_dispatched_port : Count on which ports uops are dispatched. 
> +event:0xa2 counters:cpuid um:resource_stalls minimum:2000000 name:resource_stalls : Core resource stalls > +event:0xab counters:cpuid um:dsb2mite_switches minimum:2000000 name:dsb2mite_switches : Number of Decode Stream Buffer (DSB) to MITE switches > +event:0xac counters:cpuid um:dsb_fill minimum:2000000 name:dsb_fill : DSB fill events > +event:0xae counters:cpuid um:one minimum:10000 name:itlb : ITLB events > +event:0xb0 counters:cpuid um:offcore_requests minimum:100000 name:offcore_requests : Requests sent outside the core > +event:0xb1 counters:0,1,2,3 um:uops_dispatched minimum:2000000 name:uops_dispatched : uops dispatched > +event:0xb2 counters:cpuid um:one minimum:2000000 name:offcore_requests_buffer : Offcore requests buffer events > +event:0xb6 counters:cpuid um:one minimum:100000 name:agu_bypass_cancel : AGU bypass cancel > +event:0xbd counters:cpuid um:tlb_flush minimum:10000 name:tlb_flush : TLB flushes > +event:0xbf counters:cpuid um:l1d_blocks minimum:100000 name:l1d_blocks : L1D cache blocking events > +event:0xc0 counters:1 um:one minimum:2000000 name:inst_retired : Instructions retired > +event:0xc1 counters:cpuid um:other_assists minimum:100000 name:other_assists : Instructions that needed an assist > +event:0xc2 counters:0,1,2,3 um:uops_retired minimum:2000000 name:uops_retired : uops that actually retired. > +event:0xc3 counters:cpuid um:machine_clears minimum:100000 name:machine_clears : Number of Machine Clears detected. > +event:0xc4 counters:0,1,2,3 um:br_inst_retired minimum:400000 name:br_inst_retired : Counts branch instructions retired > +event:0xc5 counters:0,1,2,3 um:br_misp_retired minimum:400000 name:br_misp_retired : Counts mispredicted branch instructions > +event:0xca counters:0,1,2,3 um:fp_assist minimum:100000 name:fp_assist : Counts floating point assists > +event:0xcb counters:cpuid um:one minimum:100000 name:hw_interrupts : Number of hardware interrupts received by the processor. 
> +event:0xcc counters:cpuid um:x20 minimum:2000000 name:rob_misc_events : Count ROB (Register Reorder Buffer) events. > +event:0xcd counters:3 um:x02 minimum:2000000 name:mem_trans_retired : Count memory transactions > +event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000000 name:mem_uops_retired : Count uops with memory accessed retired > +event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000000 name:mem_load_uops_retired : Memory load uops. > +event:0xd2 counters:0,1,2,3 um:mem_load_uops_llc_hit_retired minimum:100000 name:mem_load_uops_llc_hit_retired : Memory load uops with LLC (Last level cache) hit > +event:0xd4 counters:0,1,2,3 um:x02 minimum:10000 name:mem_load_uops_misc_retired : Memory load uops retired > +event:0xf0 counters:cpuid um:l2_trans minimum:200000 name:l2_trans : L2 cache accesses > +event:0xf1 counters:cpuid um:l2_lines_in minimum:100000 name:l2_lines_in : L2 cache lines in > +event:0xf2 counters:cpuid um:l2_lines_out minimum:100000 name:l2_lines_out : L2 cache lines out > +event:0xf4 counters:cpuid um:x10 minimum:100000 name:sq_misc : Store queue misc events > diff --git a/events/i386/sandybridge/unit_masks b/events/i386/sandybridge/unit_masks > new file mode 100644 > index 0000000..cca6cb9 > --- /dev/null > +++ b/events/i386/sandybridge/unit_masks > @@ -0,0 +1,275 @@ > +# > +# Unit masks for the Intel "sandy-bridge" micro architecture > +# > +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs > +# > +include:i386/arch_perfmon > +name:x02 type:mandatory default:0x2 > + 0x2 No unit mask > +name:x10 type:mandatory default:0x10 > + 0x10 No unit mask > +name:x20 type:mandatory default:0x20 > + 0x20 No unit mask > +name:ld_blocks type:bitmask default:0x1 > + 0x1 data_unknown blocked loads due to store buffer blocks with unknown data. 
> + 0x2 store_forward loads blocked by overlapping with store buffer that cannot be forwarded > + 0x8 no_sr This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. > + 0x10 all_block Number of cases where any load is blocked but has no DCU miss. > +name:misalign_mem_ref type:bitmask default:0x1 > + 0x1 loads Speculative cache-line split load uops dispatched to the L1D. > + 0x2 stores Speculative cache-line split Store-address uops dispatched to L1D > +name:ld_blocks_partial type:bitmask default:0x1 > + 0x1 address_alias False dependencies in MOB due to partial compare on address > + 0x8 all_sta_block This event counts the number of times that load operations are temporarily blocked because of older stores, with addresses that are not yet known. A load operation may incur more than one block of this type. > +name:dtlb_load_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) > + 0x4 walk_duration Cycles PMH is busy with this walk > + 0x10 stlb_hit First level miss but second level hit; no page walk. > +name:int_misc type:bitmask default:0x40 > + 0x40 rat_stall_cycles Cycles Resource Allocation Table (RAT) external stall is sent to Instruction Decode Queue (IDQ) for this thread. > + 0x3 extra:cmask=1 recovery_cycles Number of cycles waiting to be recover after Nuke due to all other cases except JEClear. > + 0x3 extra:cmask=1,edge recovery_stalls_count Edge applied to recovery_cycles, thus counts occurrences. > +name:uops_issued type:bitmask default:0x1 > + 0x1 any Number of Uops issued by the Resource Allocation Table (RAT) to the Reservation Station (RS) > + 0x1 extra:cmask=1,inv stall_cycles cycles no uops issued by this thread. 
> +name:arith type:bitmask default:0x1 > + 0x1 fpu_div_active Cycles that the divider is busy with any divide or sqrt operation. > + 0x1 extra:cmask=1,edge fpu_div Number of times that the divider is actived, includes INT, SIMD and FP. > +name:l2_rqsts type:bitmask default:0x1 > + 0x1 demand_data_rd_hit Demand Data Read hit L2, no rejects > + 0x4 rfo_hit RFO requests that hit L2 cache > + 0x8 rfo_miss RFO requests that miss L2 cache > + 0x10 code_rd_hit L2 cache hits when fetching instructions, code reads. > + 0x20 code_rd_miss L2 cache misses when fetching instructions > + 0x40 pf_hit Requests from the L2 hardware prefetchers that hit L2 cache > + 0x80 pf_miss Requests from the L2 hardware prefetchers that miss L2 cache > + 0x3 all_demand_data_rd Any data read request to L2 cache > + 0xc all_rfo Any data RFO request to L2 cache > + 0x30 all_code_rd Any code read request to L2 cache > + 0xc0 all_pf Any L2 HW prefetch request to L2 cache > +name:l2_store_lock_rqsts type:bitmask default:0xf > + 0xf all RFOs that access cache lines in any state > + 0x1 miss RFO (as a result of regular RFO or Lock request) miss cache - I state > + 0x4 hit_e RFO (as a result of regular RFO or Lock request) hits cache in E state > + 0x8 hit_m RFO (as a result of regular RFO or Lock request) hits cache in M state > +name:l2_l1d_wb_rqsts type:bitmask default:0x4 > + 0x4 hit_e writebacks from L1D to L2 cache lines in E state > + 0x8 hit_m writebacks from L1D to L2 cache lines in M state > +name:l1d_pend_miss type:bitmask default:0x1 > + 0x1 pending Cycles with L1D load Misses outstanding. > + 0x1 extra:cmask=1,edge occurences This event counts the number of L1D misses outstanding occurences. 
> +name:dtlb_store_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) > + 0x4 walk_duration Cycles PMH is busy with this walk > + 0x10 stlb_hit First level miss but second level hit; no page walk. Only relevant if multiple levels. > +name:load_hit_pre type:bitmask default:0x1 > + 0x1 sw_pf Load dispatches that hit fill buffer allocated for S/W prefetch. > + 0x2 hw_pf Load dispatches that hit fill buffer allocated for HW prefetch. > +name:l1d type:bitmask default:0x1 > + 0x1 replacement L1D Data line replacements. > + 0x2 allocated_in_m L1D M-state Data Cache Lines Allocated > + 0x4 eviction L1D M-state Data Cache Lines Evicted due to replacement (only) > + 0x8 all_m_replacement All Modified lines evicted out of L1D > +name:partial_rat_stalls type:bitmask default:0x20 > + 0x20 flags_merge_uop Number of perf sensitive flags-merge uops added by Sandy Bridge u-arch. > + 0x40 slow_lea_window Number of cycles with at least 1 slow Load Effective Address (LEA) uop being allocated. > + 0x80 mul_single_uop Number of Multiply packed/scalar single precision uops allocated > + 0x20 extra:cmask=1 flags_merge_uop_cycles Cycles with perf sensitive flags-merge uops added by SandyBridge u-arch. > +name:resource_stalls2 type:bitmask default:0x40 > + 0x40 bob_full Cycles Allocator is stalled due Branch Order Buffer (BOB). > + 0xf all_prf_control Resource stalls2 control structures full for physical registers > + 0xc all_fl_empty Cycles with either free list is empty > + 0x4f ooo_rsrc Resource stalls2 control structures full Physical Register Reclaim Table (PRRT), Physical History Table (PHT), INT or SIMD Free List (FL), Branch Order Buffer (BOB) > +name:cpl_cycles type:bitmask default:0x1 > + 0x1 ring0 Unhalted core cycles the Thread was in Rings 0. 
> + 0x1 extra:cmask=1,edge ring0_trans Transitions from ring123 to Ring0. > + 0x2 ring123 Unhalted core cycles the Thread was in Rings 1/2/3. > +name:offcore_requests_outstanding type:bitmask default:0x1 > + 0x1 demand_data_rd Offcore outstanding Demand Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. Includes L1D data hardware prefetches. > + 0x1 extra:cmask=1 cycles_with_demand_data_rd cycles there are Offcore outstanding RD data transactions in the SuperQueue (SQ), queue to uncore. > + 0x2 demand_code_rd Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x4 demand_rfo Offcore outstanding RFO (store) transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x8 all_data_rd Offcore outstanding all cacheable Core Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x8 extra:cmask=1 cycles_with_data_rd Cycles there are Offcore outstanding all Data read transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x2 extra:cmask=1 cycles_with_demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x4 extra:cmask=1 cycles_with_demand_rfo Cycles with offcore outstanding demand RFO Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > +name:lock_cycles type:bitmask default:0x1 > + 0x1 split_lock_uc_lock_duration Cycles in which the L1D and L2 are locked, due to a UC lock or split lock > + 0x2 cache_lock_duration cycles that theL1D is locked > +name:idq type:bitmask default:0x2 > + 0x2 empty Cycles the Instruction Decode Queue (IDQ) is empty. > + 0x4 mite_uops Number of uops delivered to Instruction Decode Queue (IDQ) from MITE path. > + 0x8 dsb_uops Number of uops delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path. 
> + 0x10 ms_dsb_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB). > + 0x20 ms_mite_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by MITE. > + 0x30 ms_uops Number of Uops were delivered into Instruction Decode Queue (IDQ) from MS, initiated by Decode Stream Buffer (DSB) or MITE. > + 0x30 extra:cmask=1 ms_cycles Number of cycles that Uops were delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB) or MITE. > + 0x4 extra:cmask=1 mite_cycles Cycles MITE is active > + 0x8 extra:cmask=1 dsb_cycles Cycles Decode Stream Buffer (DSB) is active > + 0x10 extra:cmask=1 ms_dsb_cycles Cycles Decode Stream Buffer (DSB) Microcode Sequenser (MS) is active > + 0x10 extra:cmask=1,edge ms_dsb_occur Occurences of Decode Stream Buffer (DSB) Microcode Sequenser (MS) going active > + 0x18 extra:cmask=1 all_dsb_cycles_any_uops Cycles Decode Stream Buffer (DSB) is delivering anything > + 0x18 extra:cmask=4 all_dsb_cycles_4_uops Cycles Decode Stream Buffer (DSB) is delivering 4 Uops > + 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering anything > + 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops > + 0x3c mite_all_uops Number of uops delivered to Instruction Decode Queue (IDQ) from any path. > +name:itlb_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M) > + 0x4 walk_duration Cycles PMH is busy with this walk. > + 0x10 stlb_hit First level miss but second level hit; no page walk. > +name:ild_stall type:bitmask default:0x1 > + 0x1 lcp Stall "occurrences" due to length changing prefixes (LCP). > + 0x4 iq_full Stall cycles when instructions cannot be written because the Instruction Queue (IQ) is full. 
> +name:br_inst_exec type:bitmask default:0xff > + 0xff all_branches All branch instructions executed. > + 0x41 nontaken_conditional All macro conditional nontaken branch instructions. > + 0x81 taken_conditional All macro conditional taken branch instructions. > + 0x82 taken_direct_jump All macro unconditional taken branch instructions, excluding calls and indirects. > + 0x84 taken_indirect_jump_non_call_ret All taken indirect branches that are not calls nor returns. > + 0x88 taken_indirect_near_return All taken indirect branches that have a return mnemonic. > + 0x90 taken_direct_near_call All taken non-indirect calls. > + 0xa0 taken_indirect_near_call All taken indirect calls, including both register and memory indirect. > + 0xc1 all_conditional All macro conditional branch instructions. > + 0xc2 all_direct_jmp All macro unconditional branch instructions, excluding calls and indirects > + 0xc4 all_indirect_jump_non_call_ret All indirect branches that are not calls nor returns. > + 0xc8 all_indirect_near_return All indirect return branches. > + 0xd0 all_direct_near_call All non-indirect calls executed. > +name:br_misp_exec type:bitmask default:0xff > + 0xff all_branches All mispredicted branch instructions executed. > + 0x41 nontaken_conditional All nontaken mispredicted macro conditional branch instructions. > + 0x81 taken_conditional All taken mispredicted macro conditional branch instructions. > + 0x84 taken_indirect_jump_non_call_ret All taken mispredicted indirect branches that are not calls nor returns. > + 0x88 taken_return_near All taken mispredicted indirect branches that have a return mnemonic. > + 0x90 taken_direct_near_call All taken mispredicted non-indirect calls. > + 0xa0 taken_indirect_near_call All taken mispredicted indirect calls, including both register and memory indirect. > + 0xc1 all_conditional All mispredicted macro conditional branch instructions. 
> + 0xc4 all_indirect_jump_non_call_ret All mispredicted indirect branches that are not calls nor returns. > + 0xd0 all_direct_near_call All mispredicted non-indirect calls > +name:idq_uops_not_delivered type:bitmask default:0x1 > + 0x1 core Count number of non-delivered uops to Resource Allocation Table (RAT). > + 0x1 extra:cmask=4 cycles_0_uops_deliv.core Counts the cycles no uops were delivered > + 0x1 extra:cmask=3 cycles_le_1_uop_deliv.core Counts the cycles less than 1 uops were delivered > + 0x1 extra:cmask=2 cycles_le_2_uop_deliv.core Counts the cycles less than 2 uops were delivered > + 0x1 extra:cmask=1 cycles_le_3_uop_deliv.core Counts the cycles less than 3 uops were delivered > + 0x1 extra:cmask=4,inv cycles_ge_1_uop_deliv.core Cycles when 1 or more uops were delivered to the by the front end. > + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. > +name:uops_dispatched_port type:bitmask default:0x1 > + 0x1 port_0 Cycles which a Uop is dispatched on port 0 > + 0x2 port_1 Cycles which a Uop is dispatched on port 1 > + 0x4 port_2_ld Cycles which a load Uop is dispatched on port 2 > + 0x8 port_2_sta Cycles which a STA Uop is dispatched on port 2 > + 0x10 port_3_ld Cycles which a load Uop is dispatched on port 3 > + 0x20 port_3_sta Cycles which a STA Uop is dispatched on port 3 > + 0x40 port_4 Cycles which a Uop is dispatched on port 4 > + 0x80 port_5 Cycles which a Uop is dispatched on port 5 > + 0xc port_2 Uops disptached to port 2, loads and stores (speculative and retired) > + 0x30 port_3 Uops disptached to port 3, loads and stores (speculative and retired) > + 0xc port_2_core Uops disptached to port 2, loads and stores per core (speculative and retired) > + 0x30 port_3_core Uops disptached to port 3, loads and stores per core (speculative and retired) > +name:resource_stalls type:bitmask default:0x1 > + 0x1 any Cycles Allocation is stalled due to Resource Related reason. 
> + 0x2 lb Cycles Allocator is stalled due to Load Buffer full > + 0x4 rs Stall due to no eligible Reservation Station (RS) entry available. > + 0x8 sb Cycles Allocator is stalled due to Store Buffer full (not including draining from synch). > + 0x10 rob ROB full cycles. > + 0xe mem_rs Resource stalls due to LB, SB or Reservation Station (RS) being completely in use > + 0xf0 ooo_rsrc Resource stalls due to Rob being full, FCSW, MXCSR and OTHER > + 0xa lb_sb Resource stalls due to load or store buffers > +name:dsb2mite_switches type:bitmask default:0x1 > + 0x1 count Number of Decode Stream Buffer (DSB) to MITE switches > + 0x2 penalty_cycles Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. > +name:dsb_fill type:bitmask default:0x2 > + 0x2 other_cancel Count number of times a valid DSB fill has been actually cancelled for any reason. > + 0x8 exceed_dsb_lines Decode Stream Buffer (DSB) Fill encountered > 3 Decode Stream Buffer (DSB) lines. > + 0xa all_cancel Count number of times a valid Decode Stream Buffer (DSB) fill has been actually cancelled for any reason. > +name:offcore_requests type:bitmask default:0x1 > + 0x1 demand_data_rd Demand Data Read requests sent to uncore > + 0x2 demand_code_rd Offcore Code read requests. Includes Cacheable and Un-cacheables. > + 0x4 demand_rfo Offcore Demand RFOs. Includes regular RFO, Locks, ItoM. > + 0x8 all_data_rd Offcore Demand and prefetch data reads returned to the core. > +name:uops_dispatched type:bitmask default:0x1 > + 0x1 thread Counts total number of uops to be dispatched per-thread each cycle. > + 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatced to be executed on this thread. > + 0x2 core Counts total number of uops dispatched from any thread > +name:tlb_flush type:bitmask default:0x1 > + 0x1 dtlb_thread Count number of DTLB flushes of thread-specific entries. 
> + 0x20 stlb_any Count number of any STLB flushes > +name:l1d_blocks type:bitmask default:0x1 > + 0x1 ld_bank_conflict Any dispatched loads cancelled due to DCU bank conflict > + 0x5 extra:cmask=1 bank_conflict_cycles Cycles with l1d blocks due to bank conflicts > +name:other_assists type:bitmask default:0x2 > + 0x2 itlb_miss_retired Instructions that experienced an ITLB miss. Non Pebs > + 0x10 avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable Non Pebs > + 0x20 sse_to_avx Number of transitions from legacy SSE to AVX-256 when penalty applicable Non Pebs > +name:uops_retired type:bitmask default:0x1 > + 0x1 all All uops that actually retired. > + 0x2 retire_slots number of retirement slots used non PEBS > + 0x1 extra:cmask=1,inv stall_cycles Cycles no executable uops retired > + 0x1 extra:cmask=10,inv total_cycles Number of cycles using always true condition applied to non PEBS uops retired event. > +name:machine_clears type:bitmask default:0x2 > + 0x2 memory_ordering Number of Memory Ordering Machine Clears detected. > + 0x4 smc Number of Self-modifying code (SMC) Machine Clears detected. > + 0x20 maskmov Number of AVX masked mov Machine Clears detected. > +name:br_inst_retired type:bitmask default:0x1 > + 0x1 conditional Counts all taken and not taken macro conditional branch instructions. > + 0x2 near_call Counts all macro direct and indirect near calls. non PEBS > + 0x8 near_return This event counts the number of near ret instructions retired. > + 0x10 not_taken Counts all not taken macro branch instructions retired. > + 0x20 near_taken Counts the number of near branch taken instructions retired. > + 0x40 far_branch Counts the number of far branch instructions retired. 
> + 0x4 all_branches_ps Counts all taken and not taken macro branches including far branches. (Precise Event) > + 0x2 near_call_r3 Ring123 only near calls (non precise) > + 0x2 near_call_r3_ps Ring123 only near calls (precise event) > +name:br_misp_retired type:bitmask default:0x1 > + 0x1 conditional All mispredicted macro conditional branch instructions. > + 0x2 near_call All macro direct and indirect near calls > + 0x10 not_taken number of branch instructions retired that were mispredicted and not-taken. > + 0x20 taken number of branch instructions retired that were mispredicted and taken. > + 0x4 all_branches_ps all macro branches (Precise Event) > +name:fp_assist type:bitmask default:0x1e > + 0x1e extra:cmask=1 any Counts any FP_ASSIST umask was incrementing. > + 0x2 x87_output output - Numeric Overflow, Numeric Underflow, Inexact Result > + 0x4 x87_input input - Invalid Operation, Denormal Operand, SNaN Operand > + 0x8 simd_output Any output SSE* FP Assist - Numeric Overflow, Numeric Underflow. 
> + 0x10 simd_input Any input SSE* FP Assist > +name:mem_uops_retired type:bitmask default:0x11 > + 0x11 stlb_miss_loads STLB misses dues to retired loads > + 0x12 stlb_miss_stores STLB misses dues to retired stores > + 0x21 lock_loads Locked retired loads > + 0x41 split_loads Retired loads causing cacheline splits > + 0x42 split_stores Retired stores causing cacheline splits > + 0x81 all_loads Any retired loads > + 0x82 all_stores Any retired stores > +name:mem_load_uops_retired type:bitmask default:0x1 > + 0x1 l1_hit Load hit in nearest-level (L1D) cache > + 0x2 l2_hit Load hit in mid-level (L2) cache > + 0x4 llc_hit Load hit in last-level (L3) cache with no snoop needed > + 0x40 hit_lfb A load missed L1D but hit the Fill Buffer > +name:mem_load_uops_llc_hit_retired type:bitmask default:0x1 > + 0x1 xsnp_miss Load LLC Hit and a cross-core Snoop missed in on-pkg core cache > + 0x2 xsnp_hit Load LLC Hit and a cross-core Snoop hits in on-pkg core cache > + 0x4 xsnp_hitm Load had HitM Response from a core on same socket (shared LLC). > + 0x8 xsnp_none Load hit in last-level (L3) cache with no snoop needed. > +name:l2_trans type:bitmask default:0x80 > + 0x80 all_requests Transactions accessing L2 pipe > + 0x1 demand_data_rd Demand Data Read requests that access L2 cache, includes L1D prefetches. 
> + 0x2 rfo RFO requests that access L2 cache > + 0x4 code_rd L2 cache accesses when fetching instructions including L1D code prefetches > + 0x8 all_pf L2 or LLC HW prefetches that access L2 cache > + 0x10 l1d_wb L1D writebacks that access L2 cache > + 0x20 l2_fill L2 fill requests that access L2 cache > + 0x40 l2_wb L2 writebacks that access L2 cache > +name:l2_lines_in type:bitmask default:0x7 > + 0x7 all L2 cache lines filling L2 > + 0x1 i L2 cache lines in I state filling L2 > + 0x2 s L2 cache lines in S state filling L2 > + 0x4 e L2 cache lines in E state filling L2 > +name:l2_lines_out type:bitmask default:0x1 > + 0x1 demand_clean Clean line evicted by a demand > + 0x2 demand_dirty Dirty line evicted by a demand > + 0x4 pf_clean Clean line evicted by an L2 Prefetch > + 0x8 pf_dirty Dirty line evicted by an L2 Prefetch > + 0xa dirty_all Any Dirty line evicted > diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c > index b2ebf54..9d11b21 100644 > --- a/libop/op_cpu_type.c > +++ b/libop/op_cpu_type.c > @@ -93,6 +93,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { > { "Intel Westmere microarchitecture", "i386/westmere", CPU_WESTMERE, 4 }, > { "ARMv7 Scorpion", "arm/armv7-scorpion", CPU_ARM_SCORPION, 5 }, > { "ARMv7 ScorpionMP", "arm/armv7-scorpionmp", CPU_ARM_SCORPIONMP, 5 }, > + { "Intel Sandy Bridge microarchitecture", "i386/sandybridge", CPU_SANDYBRIDGE, 8 }, > }; > > static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); > @@ -117,6 +118,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) > case CPU_ATOM: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > return CPU_ARCH_PERFMON; > default: > /* assume processor in a class by itself */ > diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h > index 9283ec7..d6cae3a 100644 > --- a/libop/op_cpu_type.h > +++ b/libop/op_cpu_type.h > @@ -90,6 +90,7 @@ typedef enum { > CPU_WESTMERE, /* Intel Westmere microarchitecture */ > CPU_ARM_SCORPION, /**< ARM 
SCORPION */ > CPU_ARM_SCORPIONMP, /**< ARM SCORPIONMP */ > + CPU_SANDYBRIDGE, /* Intel Sandy-Bridge microarchitecture */ > MAX_CPU_TYPE > } op_cpu; > > diff --git a/libop/op_events.c b/libop/op_events.c > index 502ff01..0aa0ad3 100644 > --- a/libop/op_events.c > +++ b/libop/op_events.c > @@ -1023,6 +1023,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) > case CPU_CORE_I7: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > case CPU_MIPS_LOONGSON2: > case CPU_FAMILY12H: > case CPU_FAMILY14H: > diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h > index f1d67a5..a529dd6 100644 > --- a/libop/op_hw_specific.h > +++ b/libop/op_hw_specific.h > @@ -117,6 +117,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) > case 0x2c: /* Westmere-EP (Intel Xeon 5600 series) */ > case 0x2f: /* Westmere-EX */ > return CPU_WESTMERE; > + case 0x2a: > + case 0x2d: > + return CPU_SANDYBRIDGE; > } > } > return cpu_type; > diff --git a/utils/ophelp.c b/utils/ophelp.c > index b3aebde..f4e0653 100644 > --- a/utils/ophelp.c > +++ b/utils/ophelp.c > @@ -533,6 +533,7 @@ int main(int argc, char const * argv[]) > case CPU_CORE_I7: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > case CPU_ATOM: > event_doc = > "See Intel Architecture Developer's Manual Volume 3B, Appendix A and\n" |
From: Maynard J. <may...@us...> - 2011-05-17 20:06:04
|
On 05/17/2011 2:12 PM, Suthikulpanit, Suravee wrote: > Besides a couple of issues: > - events file already exists and exactly the same (I could just move the existing events file so that the patch would apply cleanly). > - white spaces (I could fix this during commit.) > > This patch applied cleanly. You can commit it then. > > PS: Do we mention anywhere (i.e. in the opcontrol --help) regarding the "Named-unitmask" feature? Yes, the opcontrol man page and the oprofile user manual were updated via patch 2. But when I just double-checked, the wording in the oprofile manual doesn't match the opcontrol man page, so I'll fix that and commit it. -Maynard > > -----Original Message----- > From: Maynard Johnson [mailto:may...@us...] > Sent: Monday, May 16, 2011 2:39 PM > To: Andi Kleen; Suthikulpanit, Suravee > Cc: opr...@li...; Andi Kleen > Subject: Re: [PATCH 3/4] Add Sandy Bridge support > > Andi Kleen wrote: >> From: Andi Kleen<ak...@li...> >> >> Add an event list for Sandy Bridge. Modify oprofile to detect Sandy Bridges. > > Suravee, > I'm pretty sure this patch is fine, but as the x86 oprofile userspace maintainer, I'd like your ack. If you're OK with it, you can commit it. 
> > -Maynard > >> >> Signed-off-by: Andi Kleen<ak...@li...> >> --- >> events/Makefile.am | 1 + >> events/i386/sandybridge/events | 67 +++++++++ >> events/i386/sandybridge/unit_masks | 275 ++++++++++++++++++++++++++++++++++++ >> libop/op_cpu_type.c | 2 + >> libop/op_cpu_type.h | 1 + >> libop/op_events.c | 1 + >> libop/op_hw_specific.h | 3 + >> utils/ophelp.c | 1 + >> 8 files changed, 351 insertions(+), 0 deletions(-) >> create mode 100644 events/i386/sandybridge/events >> create mode 100644 events/i386/sandybridge/unit_masks >> >> diff --git a/events/Makefile.am b/events/Makefile.am >> index 60c4164..c4101cc 100644 >> --- a/events/Makefile.am >> +++ b/events/Makefile.am >> @@ -18,6 +18,7 @@ event_files = \ >> i386/core_i7/events i386/core_i7/unit_masks \ >> i386/nehalem/events i386/nehalem/unit_masks \ >> i386/westmere/events i386/westmere/unit_masks \ >> + i386/sandybridge/events i386/sandybridge/unit_masks \ >> ia64/ia64/events ia64/ia64/unit_masks \ >> ia64/itanium2/events ia64/itanium2/unit_masks \ >> ia64/itanium/events ia64/itanium/unit_masks \ >> diff --git a/events/i386/sandybridge/events b/events/i386/sandybridge/events >> new file mode 100644 >> index 0000000..bf941c7 >> --- /dev/null >> +++ b/events/i386/sandybridge/events >> @@ -0,0 +1,67 @@ >> +# >> +# Intel "sandy-bridge" microarchitecture core events. >> +# >> +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs >> +# >> +# Note the minimum counts are not discovered experimentally and could be likely >> +# lowered in many cases without ill effect. 
>> +# >> +include:i386/arch_perfmon >> +event:0x03 counters:cpuid um:ld_blocks minimum:100000 name:ld_blocks : blocked loads >> +event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000000 name:misalign_mem_ref : Misaligned memory references >> +event:0x07 counters:cpuid um:ld_blocks_partial minimum:100000 name:ld_blocks_partial : Partial loads >> +event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000000 name:dtlb_load_misses : D-TLB misses >> +event:0x0d counters:cpuid um:int_misc minimum:2000000 name:int_misc : Instruction decoder events >> +event:0x0e counters:0,1,2,3 um:uops_issued minimum:2000000 name:uops_issued : Number of Uops issued >> +event:0x14 counters:cpuid um:arith minimum:2000000 name:arith : Misc ALU events >> +event:0x17 counters:cpuid um:one minimum:2000000 name:insts_written_to_iq : Number of instructions written to Instruction Queue (IQ) this cycle. >> +event:0x24 counters:cpuid um:l2_rqsts minimum:200000 name:l2_rqsts : Requests from L2 cache >> +event:0x27 counters:cpuid um:l2_store_lock_rqsts minimum:200000 name:l2_store_lock_rqsts : L2 cache store lock requests >> +event:0x28 counters:cpuid um:l2_l1d_wb_rqsts minimum:200000 name:l2_l1d_wb_rqsts : writebacks from L1D to the L2 cache >> +event:0x48 counters:2 um:l1d_pend_miss minimum:2000000 name:l1d_pend_miss : Cycles with L1D load Misses outstanding. 
>> +event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000000 name:dtlb_store_misses : D-TLB store misses >> +event:0x4c counters:cpuid um:load_hit_pre minimum:100000 name:load_hit_pre : Load dispatches that hit fill buffer >> +event:0x4e counters:cpuid um:x02 minimum:2000000 name:hw_pre_req : Hardware Prefetch requests >> +event:0x51 counters:cpuid um:l1d minimum:2000000 name:l1d : L1D cache events >> +event:0x59 counters:cpuid um:partial_rat_stalls minimum:2000000 name:partial_rat_stalls : Partial RAT stalls >> +event:0x5b counters:0,1,2,3 um:resource_stalls2 minimum:2000000 name:resource_stalls2 : Misc resource stalls >> +event:0x5c counters:cpuid um:cpl_cycles minimum:2000000 name:cpl_cycles : Unhalted core cycles in specific rings >> +event:0x5e counters:0,1,2,3 um:one minimum:2000000 name:rs_events : Events for the reservation station >> +event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000000 name:offcore_requests_outstanding : Offcore outstanding transactions >> +event:0x63 counters:cpuid um:lock_cycles minimum:2000000 name:lock_cycles : Cycles due to LOCK prefixes. >> +event:0x79 counters:0,1,2,3 um:idq minimum:2000000 name:idq : Instruction Decode Queue events >> +event:0x80 counters:cpuid um:x02 minimum:200000 name:icache : Instruction cache events >> +event:0x85 counters:cpuid um:itlb_misses minimum:2000000 name:itlb_misses : I-TLB misses >> +event:0x87 counters:cpuid um:ild_stall minimum:2000000 name:ild_stall : Instruction decoding stalls >> +event:0x88 counters:cpuid um:br_inst_exec minimum:200000 name:br_inst_exec : Branch instructions >> +event:0x89 counters:cpuid um:br_misp_exec minimum:200000 name:br_misp_exec : Mispredicted branch instructions >> +event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000000 name:idq_uops_not_delivered : uops not delivered to IDQ. >> +event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000000 name:uops_dispatched_port : Count on which ports uops are dispatched. 
>> +event:0xa2 counters:cpuid um:resource_stalls minimum:2000000 name:resource_stalls : Core resource stalls >> +event:0xab counters:cpuid um:dsb2mite_switches minimum:2000000 name:dsb2mite_switches : Number of Decode Stream Buffer (DSB) to MITE switches >> +event:0xac counters:cpuid um:dsb_fill minimum:2000000 name:dsb_fill : DSB fill events >> +event:0xae counters:cpuid um:one minimum:10000 name:itlb : ITLB events >> +event:0xb0 counters:cpuid um:offcore_requests minimum:100000 name:offcore_requests : Requests sent outside the core >> +event:0xb1 counters:0,1,2,3 um:uops_dispatched minimum:2000000 name:uops_dispatched : uops dispatched >> +event:0xb2 counters:cpuid um:one minimum:2000000 name:offcore_requests_buffer : Offcore requests buffer events >> +event:0xb6 counters:cpuid um:one minimum:100000 name:agu_bypass_cancel : AGU bypass cancel >> +event:0xbd counters:cpuid um:tlb_flush minimum:10000 name:tlb_flush : TLB flushes >> +event:0xbf counters:cpuid um:l1d_blocks minimum:100000 name:l1d_blocks : L1D cache blocking events >> +event:0xc0 counters:1 um:one minimum:2000000 name:inst_retired : Instructions retired >> +event:0xc1 counters:cpuid um:other_assists minimum:100000 name:other_assists : Instructions that needed an assist >> +event:0xc2 counters:0,1,2,3 um:uops_retired minimum:2000000 name:uops_retired : uops that actually retired. >> +event:0xc3 counters:cpuid um:machine_clears minimum:100000 name:machine_clears : Number of Machine Clears detected. >> +event:0xc4 counters:0,1,2,3 um:br_inst_retired minimum:400000 name:br_inst_retired : Counts branch instructions retired >> +event:0xc5 counters:0,1,2,3 um:br_misp_retired minimum:400000 name:br_misp_retired : Counts mispredicted branch instructions >> +event:0xca counters:0,1,2,3 um:fp_assist minimum:100000 name:fp_assist : Counts floating point assists >> +event:0xcb counters:cpuid um:one minimum:100000 name:hw_interrupts : Number of hardware interrupts received by the processor. 
>> +event:0xcc counters:cpuid um:x20 minimum:2000000 name:rob_misc_events : Count ROB (Register Reorder Buffer) events. >> +event:0xcd counters:3 um:x02 minimum:2000000 name:mem_trans_retired : Count memory transactions >> +event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000000 name:mem_uops_retired : Count uops with memory accessed retired >> +event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000000 name:mem_load_uops_retired : Memory load uops. >> +event:0xd2 counters:0,1,2,3 um:mem_load_uops_llc_hit_retired minimum:100000 name:mem_load_uops_llc_hit_retired : Memory load uops with LLC (Last level cache) hit >> +event:0xd4 counters:0,1,2,3 um:x02 minimum:10000 name:mem_load_uops_misc_retired : Memory load uops retired >> +event:0xf0 counters:cpuid um:l2_trans minimum:200000 name:l2_trans : L2 cache accesses >> +event:0xf1 counters:cpuid um:l2_lines_in minimum:100000 name:l2_lines_in : L2 cache lines in >> +event:0xf2 counters:cpuid um:l2_lines_out minimum:100000 name:l2_lines_out : L2 cache lines out >> +event:0xf4 counters:cpuid um:x10 minimum:100000 name:sq_misc : Store queue misc events >> diff --git a/events/i386/sandybridge/unit_masks b/events/i386/sandybridge/unit_masks >> new file mode 100644 >> index 0000000..cca6cb9 >> --- /dev/null >> +++ b/events/i386/sandybridge/unit_masks >> @@ -0,0 +1,275 @@ >> +# >> +# Unit masks for the Intel "sandy-bridge" micro architecture >> +# >> +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs >> +# >> +include:i386/arch_perfmon >> +name:x02 type:mandatory default:0x2 >> + 0x2 No unit mask >> +name:x10 type:mandatory default:0x10 >> + 0x10 No unit mask >> +name:x20 type:mandatory default:0x20 >> + 0x20 No unit mask >> +name:ld_blocks type:bitmask default:0x1 >> + 0x1 data_unknown blocked loads due to store buffer blocks with unknown data. 
>> + 0x2 store_forward loads blocked by overlapping with store buffer that cannot be forwarded >> + 0x8 no_sr This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. >> + 0x10 all_block Number of cases where any load is blocked but has no DCU miss. >> +name:misalign_mem_ref type:bitmask default:0x1 >> + 0x1 loads Speculative cache-line split load uops dispatched to the L1D. >> + 0x2 stores Speculative cache-line split Store-address uops dispatched to L1D >> +name:ld_blocks_partial type:bitmask default:0x1 >> + 0x1 address_alias False dependencies in MOB due to partial compare on address >> + 0x8 all_sta_block This event counts the number of times that load operations are temporarily blocked because of older stores, with addresses that are not yet known. A load operation may incur more than one block of this type. >> +name:dtlb_load_misses type:bitmask default:0x1 >> + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) >> + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) >> + 0x4 walk_duration Cycles PMH is busy with this walk >> + 0x10 stlb_hit First level miss but second level hit; no page walk. >> +name:int_misc type:bitmask default:0x40 >> + 0x40 rat_stall_cycles Cycles Resource Allocation Table (RAT) external stall is sent to Instruction Decode Queue (IDQ) for this thread. >> + 0x3 extra:cmask=1 recovery_cycles Number of cycles waiting to be recover after Nuke due to all other cases except JEClear. >> + 0x3 extra:cmask=1,edge recovery_stalls_count Edge applied to recovery_cycles, thus counts occurrences. >> +name:uops_issued type:bitmask default:0x1 >> + 0x1 any Number of Uops issued by the Resource Allocation Table (RAT) to the Reservation Station (RS) >> + 0x1 extra:cmask=1,inv stall_cycles cycles no uops issued by this thread. 
>> +name:arith type:bitmask default:0x1 >> + 0x1 fpu_div_active Cycles that the divider is busy with any divide or sqrt operation. >> + 0x1 extra:cmask=1,edge fpu_div Number of times that the divider is actived, includes INT, SIMD and FP. >> +name:l2_rqsts type:bitmask default:0x1 >> + 0x1 demand_data_rd_hit Demand Data Read hit L2, no rejects >> + 0x4 rfo_hit RFO requests that hit L2 cache >> + 0x8 rfo_miss RFO requests that miss L2 cache >> + 0x10 code_rd_hit L2 cache hits when fetching instructions, code reads. >> + 0x20 code_rd_miss L2 cache misses when fetching instructions >> + 0x40 pf_hit Requests from the L2 hardware prefetchers that hit L2 cache >> + 0x80 pf_miss Requests from the L2 hardware prefetchers that miss L2 cache >> + 0x3 all_demand_data_rd Any data read request to L2 cache >> + 0xc all_rfo Any data RFO request to L2 cache >> + 0x30 all_code_rd Any code read request to L2 cache >> + 0xc0 all_pf Any L2 HW prefetch request to L2 cache >> +name:l2_store_lock_rqsts type:bitmask default:0xf >> + 0xf all RFOs that access cache lines in any state >> + 0x1 miss RFO (as a result of regular RFO or Lock request) miss cache - I state >> + 0x4 hit_e RFO (as a result of regular RFO or Lock request) hits cache in E state >> + 0x8 hit_m RFO (as a result of regular RFO or Lock request) hits cache in M state >> +name:l2_l1d_wb_rqsts type:bitmask default:0x4 >> + 0x4 hit_e writebacks from L1D to L2 cache lines in E state >> + 0x8 hit_m writebacks from L1D to L2 cache lines in M state >> +name:l1d_pend_miss type:bitmask default:0x1 >> + 0x1 pending Cycles with L1D load Misses outstanding. >> + 0x1 extra:cmask=1,edge occurences This event counts the number of L1D misses outstanding occurences. 
>> +name:dtlb_store_misses type:bitmask default:0x1 >> + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) >> + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) >> + 0x4 walk_duration Cycles PMH is busy with this walk >> + 0x10 stlb_hit First level miss but second level hit; no page walk. Only relevant if multiple levels. >> +name:load_hit_pre type:bitmask default:0x1 >> + 0x1 sw_pf Load dispatches that hit fill buffer allocated for S/W prefetch. >> + 0x2 hw_pf Load dispatches that hit fill buffer allocated for HW prefetch. >> +name:l1d type:bitmask default:0x1 >> + 0x1 replacement L1D Data line replacements. >> + 0x2 allocated_in_m L1D M-state Data Cache Lines Allocated >> + 0x4 eviction L1D M-state Data Cache Lines Evicted due to replacement (only) >> + 0x8 all_m_replacement All Modified lines evicted out of L1D >> +name:partial_rat_stalls type:bitmask default:0x20 >> + 0x20 flags_merge_uop Number of perf sensitive flags-merge uops added by Sandy Bridge u-arch. >> + 0x40 slow_lea_window Number of cycles with at least 1 slow Load Effective Address (LEA) uop being allocated. >> + 0x80 mul_single_uop Number of Multiply packed/scalar single precision uops allocated >> + 0x20 extra:cmask=1 flags_merge_uop_cycles Cycles with perf sensitive flags-merge uops added by SandyBridge u-arch. >> +name:resource_stalls2 type:bitmask default:0x40 >> + 0x40 bob_full Cycles Allocator is stalled due Branch Order Buffer (BOB). >> + 0xf all_prf_control Resource stalls2 control structures full for physical registers >> + 0xc all_fl_empty Cycles with either free list is empty >> + 0x4f ooo_rsrc Resource stalls2 control structures full Physical Register Reclaim Table (PRRT), Physical History Table (PHT), INT or SIMD Free List (FL), Branch Order Buffer (BOB) >> +name:cpl_cycles type:bitmask default:0x1 >> + 0x1 ring0 Unhalted core cycles the Thread was in Rings 0. 
>> + 0x1 extra:cmask=1,edge ring0_trans Transitions from ring123 to Ring0. >> + 0x2 ring123 Unhalted core cycles the Thread was in Rings 1/2/3. >> +name:offcore_requests_outstanding type:bitmask default:0x1 >> + 0x1 demand_data_rd Offcore outstanding Demand Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. Includes L1D data hardware prefetches. >> + 0x1 extra:cmask=1 cycles_with_demand_data_rd cycles there are Offcore outstanding RD data transactions in the SuperQueue (SQ), queue to uncore. >> + 0x2 demand_code_rd Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. >> + 0x4 demand_rfo Offcore outstanding RFO (store) transactions in the SuperQueue (SQ), queue to uncore, every cycle. >> + 0x8 all_data_rd Offcore outstanding all cacheable Core Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. >> + 0x8 extra:cmask=1 cycles_with_data_rd Cycles there are Offcore outstanding all Data read transactions in the SuperQueue (SQ), queue to uncore, every cycle. >> + 0x2 extra:cmask=1 cycles_with_demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. >> + 0x4 extra:cmask=1 cycles_with_demand_rfo Cycles with offcore outstanding demand RFO Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. >> +name:lock_cycles type:bitmask default:0x1 >> + 0x1 split_lock_uc_lock_duration Cycles in which the L1D and L2 are locked, due to a UC lock or split lock >> + 0x2 cache_lock_duration cycles that theL1D is locked >> +name:idq type:bitmask default:0x2 >> + 0x2 empty Cycles the Instruction Decode Queue (IDQ) is empty. >> + 0x4 mite_uops Number of uops delivered to Instruction Decode Queue (IDQ) from MITE path. >> + 0x8 dsb_uops Number of uops delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path. 
>> + 0x10 ms_dsb_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB). >> + 0x20 ms_mite_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by MITE. >> + 0x30 ms_uops Number of Uops were delivered into Instruction Decode Queue (IDQ) from MS, initiated by Decode Stream Buffer (DSB) or MITE. >> + 0x30 extra:cmask=1 ms_cycles Number of cycles that Uops were delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB) or MITE. >> + 0x4 extra:cmask=1 mite_cycles Cycles MITE is active >> + 0x8 extra:cmask=1 dsb_cycles Cycles Decode Stream Buffer (DSB) is active >> + 0x10 extra:cmask=1 ms_dsb_cycles Cycles Decode Stream Buffer (DSB) Microcode Sequenser (MS) is active >> + 0x10 extra:cmask=1,edge ms_dsb_occur Occurences of Decode Stream Buffer (DSB) Microcode Sequenser (MS) going active >> + 0x18 extra:cmask=1 all_dsb_cycles_any_uops Cycles Decode Stream Buffer (DSB) is delivering anything >> + 0x18 extra:cmask=4 all_dsb_cycles_4_uops Cycles Decode Stream Buffer (DSB) is delivering 4 Uops >> + 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering anything >> + 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops >> + 0x3c mite_all_uops Number of uops delivered to Instruction Decode Queue (IDQ) from any path. >> +name:itlb_misses type:bitmask default:0x1 >> + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M) >> + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M) >> + 0x4 walk_duration Cycles PMH is busy with this walk. >> + 0x10 stlb_hit First level miss but second level hit; no page walk. >> +name:ild_stall type:bitmask default:0x1 >> + 0x1 lcp Stall "occurrences" due to length changing prefixes (LCP). 
>> + 0x4 iq_full Stall cycles when instructions cannot be written because the Instruction Queue (IQ) is full. >> +name:br_inst_exec type:bitmask default:0xff >> + 0xff all_branches All branch instructions executed. >> + 0x41 nontaken_conditional All macro conditional nontaken branch instructions. >> + 0x81 taken_conditional All macro conditional taken branch instructions. >> + 0x82 taken_direct_jump All macro unconditional taken branch instructions, excluding calls and indirects. >> + 0x84 taken_indirect_jump_non_call_ret All taken indirect branches that are not calls nor returns. >> + 0x88 taken_indirect_near_return All taken indirect branches that have a return mnemonic. >> + 0x90 taken_direct_near_call All taken non-indirect calls. >> + 0xa0 taken_indirect_near_call All taken indirect calls, including both register and memory indirect. >> + 0xc1 all_conditional All macro conditional branch instructions. >> + 0xc2 all_direct_jmp All macro unconditional branch instructions, excluding calls and indirects >> + 0xc4 all_indirect_jump_non_call_ret All indirect branches that are not calls nor returns. >> + 0xc8 all_indirect_near_return All indirect return branches. >> + 0xd0 all_direct_near_call All non-indirect calls executed. >> +name:br_misp_exec type:bitmask default:0xff >> + 0xff all_branches All mispredicted branch instructions executed. >> + 0x41 nontaken_conditional All nontaken mispredicted macro conditional branch instructions. >> + 0x81 taken_conditional All taken mispredicted macro conditional branch instructions. >> + 0x84 taken_indirect_jump_non_call_ret All taken mispredicted indirect branches that are not calls nor returns. >> + 0x88 taken_return_near All taken mispredicted indirect branches that have a return mnemonic. >> + 0x90 taken_direct_near_call All taken mispredicted non-indirect calls. >> + 0xa0 taken_indirect_near_call All taken mispredicted indirect calls, including both register and memory indirect. 
>> + 0xc1 all_conditional All mispredicted macro conditional branch instructions. >> + 0xc4 all_indirect_jump_non_call_ret All mispredicted indirect branches that are not calls nor returns. >> + 0xd0 all_direct_near_call All mispredicted non-indirect calls >> +name:idq_uops_not_delivered type:bitmask default:0x1 >> + 0x1 core Count number of non-delivered uops to Resource Allocation Table (RAT). >> + 0x1 extra:cmask=4 cycles_0_uops_deliv.core Counts the cycles no uops were delivered >> + 0x1 extra:cmask=3 cycles_le_1_uop_deliv.core Counts the cycles less than 1 uops were delivered >> + 0x1 extra:cmask=2 cycles_le_2_uop_deliv.core Counts the cycles less than 2 uops were delivered >> + 0x1 extra:cmask=1 cycles_le_3_uop_deliv.core Counts the cycles less than 3 uops were delivered >> + 0x1 extra:cmask=4,inv cycles_ge_1_uop_deliv.core Cycles when 1 or more uops were delivered to the by the front end. >> + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. 
>> +name:uops_dispatched_port type:bitmask default:0x1 >> + 0x1 port_0 Cycles which a Uop is dispatched on port 0 >> + 0x2 port_1 Cycles which a Uop is dispatched on port 1 >> + 0x4 port_2_ld Cycles which a load Uop is dispatched on port 2 >> + 0x8 port_2_sta Cycles which a STA Uop is dispatched on port 2 >> + 0x10 port_3_ld Cycles which a load Uop is dispatched on port 3 >> + 0x20 port_3_sta Cycles which a STA Uop is dispatched on port 3 >> + 0x40 port_4 Cycles which a Uop is dispatched on port 4 >> + 0x80 port_5 Cycles which a Uop is dispatched on port 5 >> + 0xc port_2 Uops disptached to port 2, loads and stores (speculative and retired) >> + 0x30 port_3 Uops disptached to port 3, loads and stores (speculative and retired) >> + 0xc port_2_core Uops disptached to port 2, loads and stores per core (speculative and retired) >> + 0x30 port_3_core Uops disptached to port 3, loads and stores per core (speculative and retired) >> +name:resource_stalls type:bitmask default:0x1 >> + 0x1 any Cycles Allocation is stalled due to Resource Related reason. >> + 0x2 lb Cycles Allocator is stalled due to Load Buffer full >> + 0x4 rs Stall due to no eligible Reservation Station (RS) entry available. >> + 0x8 sb Cycles Allocator is stalled due to Store Buffer full (not including draining from synch). >> + 0x10 rob ROB full cycles. >> + 0xe mem_rs Resource stalls due to LB, SB or Reservation Station (RS) being completely in use >> + 0xf0 ooo_rsrc Resource stalls due to Rob being full, FCSW, MXCSR and OTHER >> + 0xa lb_sb Resource stalls due to load or store buffers >> +name:dsb2mite_switches type:bitmask default:0x1 >> + 0x1 count Number of Decode Stream Buffer (DSB) to MITE switches >> + 0x2 penalty_cycles Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. >> +name:dsb_fill type:bitmask default:0x2 >> + 0x2 other_cancel Count number of times a valid DSB fill has been actually cancelled for any reason. 
>> + 0x8 exceed_dsb_lines Decode Stream Buffer (DSB) Fill encountered> 3 Decode Stream Buffer (DSB) lines. >> + 0xa all_cancel Count number of times a valid Decode Stream Buffer (DSB) fill has been actually cancelled for any reason. >> +name:offcore_requests type:bitmask default:0x1 >> + 0x1 demand_data_rd Demand Data Read requests sent to uncore >> + 0x2 demand_code_rd Offcore Code read requests. Includes Cacheable and Un-cacheables. >> + 0x4 demand_rfo Offcore Demand RFOs. Includes regular RFO, Locks, ItoM. >> + 0x8 all_data_rd Offcore Demand and prefetch data reads returned to the core. >> +name:uops_dispatched type:bitmask default:0x1 >> + 0x1 thread Counts total number of uops to be dispatched per-thread each cycle. >> + 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatced to be executed on this thread. >> + 0x2 core Counts total number of uops dispatched from any thread >> +name:tlb_flush type:bitmask default:0x1 >> + 0x1 dtlb_thread Count number of DTLB flushes of thread-specific entries. >> + 0x20 stlb_any Count number of any STLB flushes >> +name:l1d_blocks type:bitmask default:0x1 >> + 0x1 ld_bank_conflict Any dispatched loads cancelled due to DCU bank conflict >> + 0x5 extra:cmask=1 bank_conflict_cycles Cycles with l1d blocks due to bank conflicts >> +name:other_assists type:bitmask default:0x2 >> + 0x2 itlb_miss_retired Instructions that experienced an ITLB miss. Non Pebs >> + 0x10 avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable Non Pebs >> + 0x20 sse_to_avx Number of transitions from legacy SSE to AVX-256 when penalty applicable Non Pebs >> +name:uops_retired type:bitmask default:0x1 >> + 0x1 all All uops that actually retired. >> + 0x2 retire_slots number of retirement slots used non PEBS >> + 0x1 extra:cmask=1,inv stall_cycles Cycles no executable uops retired >> + 0x1 extra:cmask=10,inv total_cycles Number of cycles using always true condition applied to non PEBS uops retired event. 
>> +name:machine_clears type:bitmask default:0x2 >> + 0x2 memory_ordering Number of Memory Ordering Machine Clears detected. >> + 0x4 smc Number of Self-modifying code (SMC) Machine Clears detected. >> + 0x20 maskmov Number of AVX masked mov Machine Clears detected. >> +name:br_inst_retired type:bitmask default:0x1 >> + 0x1 conditional Counts all taken and not taken macro conditional branch instructions. >> + 0x2 near_call Counts all macro direct and indirect near calls. non PEBS >> + 0x8 near_return This event counts the number of near ret instructions retired. >> + 0x10 not_taken Counts all not taken macro branch instructions retired. >> + 0x20 near_taken Counts the number of near branch taken instructions retired. >> + 0x40 far_branch Counts the number of far branch instructions retired. >> + 0x4 all_branches_ps Counts all taken and not taken macro branches including far branches.(Precise Event) >> + 0x2 near_call_r3 Ring123 only near calls (non precise) >> + 0x2 near_call_r3_ps Ring123 only near calls (precise event) >> +name:br_misp_retired type:bitmask default:0x1 >> + 0x1 conditional All mispredicted macro conditional branch instructions. >> + 0x2 near_call All macro direct and indirect near calls >> + 0x10 not_taken number of branch instructions retired that were mispredicted and not-taken. >> + 0x20 taken number of branch instructions retired that were mispredicted and taken. >> + 0x4 all_branches_ps all macro branches (Precise Event) >> +name:fp_assist type:bitmask default:0x1e >> + 0x1e extra:cmask=1 any Counts any FP_ASSIST umask was incrementing. >> + 0x2 x87_output output - Numeric Overflow, Numeric Underflow, Inexact Result >> + 0x4 x87_input input - Invalid Operation, Denormal Operand, SNaN Operand >> + 0x8 simd_output Any output SSE* FP Assist - Numeric Overflow, Numeric Underflow. 
>> + 0x10 simd_input Any input SSE* FP Assist >> +name:mem_uops_retired type:bitmask default:0x11 >> + 0x11 stlb_miss_loads STLB misses dues to retired loads >> + 0x12 stlb_miss_stores STLB misses dues to retired stores >> + 0x21 lock_loads Locked retired loads >> + 0x41 split_loads Retired loads causing cacheline splits >> + 0x42 split_stores Retired stores causing cacheline splits >> + 0x81 all_loads Any retired loads >> + 0x82 all_stores Any retired stores >> +name:mem_load_uops_retired type:bitmask default:0x1 >> + 0x1 l1_hit Load hit in nearest-level (L1D) cache >> + 0x2 l2_hit Load hit in mid-level (L2) cache >> + 0x4 llc_hit Load hit in last-level (L3) cache with no snoop needed >> + 0x40 hit_lfb A load missed L1D but hit the Fill Buffer >> +name:mem_load_uops_llc_hit_retired type:bitmask default:0x1 >> + 0x1 xsnp_miss Load LLC Hit and a cross-core Snoop missed in on-pkg core cache >> + 0x2 xsnp_hit Load LLC Hit and a cross-core Snoop hits in on-pkg core cache >> + 0x4 xsnp_hitm Load had HitM Response from a core on same socket (shared LLC). >> + 0x8 xsnp_none Load hit in last-level (L3) cache with no snoop needed. >> +name:l2_trans type:bitmask default:0x80 >> + 0x80 all_requests Transactions accessing L2 pipe >> + 0x1 demand_data_rd Demand Data Read requests that access L2 cache, includes L1D prefetches. 
>> + 0x2 rfo RFO requests that access L2 cache >> + 0x4 code_rd L2 cache accesses when fetching instructions including L1D code prefetches >> + 0x8 all_pf L2 or LLC HW prefetches that access L2 cache >> + 0x10 l1d_wb L1D writebacks that access L2 cache >> + 0x20 l2_fill L2 fill requests that access L2 cache >> + 0x40 l2_wb L2 writebacks that access L2 cache >> +name:l2_lines_in type:bitmask default:0x7 >> + 0x7 all L2 cache lines filling L2 >> + 0x1 i L2 cache lines in I state filling L2 >> + 0x2 s L2 cache lines in S state filling L2 >> + 0x4 e L2 cache lines in E state filling L2 >> +name:l2_lines_out type:bitmask default:0x1 >> + 0x1 demand_clean Clean line evicted by a demand >> + 0x2 demand_dirty Dirty line evicted by a demand >> + 0x4 pf_clean Clean line evicted by an L2 Prefetch >> + 0x8 pf_dirty Dirty line evicted by an L2 Prefetch >> + 0xa dirty_all Any Dirty line evicted >> diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c >> index b2ebf54..9d11b21 100644 >> --- a/libop/op_cpu_type.c >> +++ b/libop/op_cpu_type.c >> @@ -93,6 +93,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { >> { "Intel Westmere microarchitecture", "i386/westmere", CPU_WESTMERE, 4 }, >> { "ARMv7 Scorpion", "arm/armv7-scorpion", CPU_ARM_SCORPION, 5 }, >> { "ARMv7 ScorpionMP", "arm/armv7-scorpionmp", CPU_ARM_SCORPIONMP, 5 }, >> + { "Intel Sandy Bridge microarchitecture", "i386/sandybridge", CPU_SANDYBRIDGE, 8 }, >> }; >> >> static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); >> @@ -117,6 +118,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) >> case CPU_ATOM: >> case CPU_NEHALEM: >> case CPU_WESTMERE: >> + case CPU_SANDYBRIDGE: >> return CPU_ARCH_PERFMON; >> default: >> /* assume processor in a class by itself */ >> diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h >> index 9283ec7..d6cae3a 100644 >> --- a/libop/op_cpu_type.h >> +++ b/libop/op_cpu_type.h >> @@ -90,6 +90,7 @@ typedef enum { >> CPU_WESTMERE, /* Intel Westmere 
microarchitecture */ >> CPU_ARM_SCORPION, /**< ARM SCORPION */ >> CPU_ARM_SCORPIONMP, /**< ARM SCORPIONMP */ >> + CPU_SANDYBRIDGE, /* Intel Sandy-Bridge microarchitecture */ >> MAX_CPU_TYPE >> } op_cpu; >> >> diff --git a/libop/op_events.c b/libop/op_events.c >> index 502ff01..0aa0ad3 100644 >> --- a/libop/op_events.c >> +++ b/libop/op_events.c >> @@ -1023,6 +1023,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) >> case CPU_CORE_I7: >> case CPU_NEHALEM: >> case CPU_WESTMERE: >> + case CPU_SANDYBRIDGE: >> case CPU_MIPS_LOONGSON2: >> case CPU_FAMILY12H: >> case CPU_FAMILY14H: >> diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h >> index f1d67a5..a529dd6 100644 >> --- a/libop/op_hw_specific.h >> +++ b/libop/op_hw_specific.h >> @@ -117,6 +117,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) >> case 0x2c: /* Westmere-EP (Intel Xeon 5600 series) */ >> case 0x2f: /* Westmere-EX */ >> return CPU_WESTMERE; >> + case 0x2a: >> + case 0x2d: >> + return CPU_SANDYBRIDGE; >> } >> } >> return cpu_type; >> diff --git a/utils/ophelp.c b/utils/ophelp.c >> index b3aebde..f4e0653 100644 >> --- a/utils/ophelp.c >> +++ b/utils/ophelp.c >> @@ -533,6 +533,7 @@ int main(int argc, char const * argv[]) >> case CPU_CORE_I7: >> case CPU_NEHALEM: >> case CPU_WESTMERE: >> + case CPU_SANDYBRIDGE: >> case CPU_ATOM: >> event_doc = >> "See Intel Architecture Developer's Manual Volume 3B, Appendix A and\n" > > > |
From: Suthikulpanit, S. <Sur...@am...> - 2011-05-17 23:09:24
|
committed -----Original Message----- From: Suthikulpanit, Suravee [mailto:Sur...@am...] Sent: Tuesday, May 17, 2011 2:13 PM To: Maynard Johnson; Andi Kleen Cc: Andi Kleen; opr...@li... Subject: RE: [PATCH 3/4] Add Sandy Bridge support Besides a couple of issues: - the events file already exists and is exactly the same (I could just move the existing events file so that the patch would apply cleanly). - white spaces (I could fix this during commit.) This patch applied cleanly. PS: Do we mention anywhere (e.g. in the opcontrol --help) the "Named-unitmask" feature? -----Original Message----- From: Maynard Johnson [mailto:may...@us...] Sent: Monday, May 16, 2011 2:39 PM To: Andi Kleen; Suthikulpanit, Suravee Cc: opr...@li...; Andi Kleen Subject: Re: [PATCH 3/4] Add Sandy Bridge support Andi Kleen wrote: > From: Andi Kleen <ak...@li...> > > Add an event list for Sandy Bridge. Modify oprofile to detect Sandy Bridges. Suravee, I'm pretty sure this patch is fine, but as the x86 oprofile userspace maintainer, I'd like your ack. If you're OK with it, you can commit it. 
-Maynard > > Signed-off-by: Andi Kleen <ak...@li...> > --- > events/Makefile.am | 1 + > events/i386/sandybridge/events | 67 +++++++++ > events/i386/sandybridge/unit_masks | 275 ++++++++++++++++++++++++++++++++++++ > libop/op_cpu_type.c | 2 + > libop/op_cpu_type.h | 1 + > libop/op_events.c | 1 + > libop/op_hw_specific.h | 3 + > utils/ophelp.c | 1 + > 8 files changed, 351 insertions(+), 0 deletions(-) > create mode 100644 events/i386/sandybridge/events > create mode 100644 events/i386/sandybridge/unit_masks > > diff --git a/events/Makefile.am b/events/Makefile.am > index 60c4164..c4101cc 100644 > --- a/events/Makefile.am > +++ b/events/Makefile.am > @@ -18,6 +18,7 @@ event_files = \ > i386/core_i7/events i386/core_i7/unit_masks \ > i386/nehalem/events i386/nehalem/unit_masks \ > i386/westmere/events i386/westmere/unit_masks \ > + i386/sandybridge/events i386/sandybridge/unit_masks \ > ia64/ia64/events ia64/ia64/unit_masks \ > ia64/itanium2/events ia64/itanium2/unit_masks \ > ia64/itanium/events ia64/itanium/unit_masks \ > diff --git a/events/i386/sandybridge/events b/events/i386/sandybridge/events > new file mode 100644 > index 0000000..bf941c7 > --- /dev/null > +++ b/events/i386/sandybridge/events > @@ -0,0 +1,67 @@ > +# > +# Intel "sandy-bridge" microarchitecture core events. > +# > +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs > +# > +# Note the minimum counts are not discovered experimentally and could be likely > +# lowered in many cases without ill effect. 
> +# > +include:i386/arch_perfmon > +event:0x03 counters:cpuid um:ld_blocks minimum:100000 name:ld_blocks : blocked loads > +event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000000 name:misalign_mem_ref : Misaligned memory references > +event:0x07 counters:cpuid um:ld_blocks_partial minimum:100000 name:ld_blocks_partial : Partial loads > +event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000000 name:dtlb_load_misses : D-TLB misses > +event:0x0d counters:cpuid um:int_misc minimum:2000000 name:int_misc : Instruction decoder events > +event:0x0e counters:0,1,2,3 um:uops_issued minimum:2000000 name:uops_issued : Number of Uops issued > +event:0x14 counters:cpuid um:arith minimum:2000000 name:arith : Misc ALU events > +event:0x17 counters:cpuid um:one minimum:2000000 name:insts_written_to_iq : Number of instructions written to Instruction Queue (IQ) this cycle. > +event:0x24 counters:cpuid um:l2_rqsts minimum:200000 name:l2_rqsts : Requests from L2 cache > +event:0x27 counters:cpuid um:l2_store_lock_rqsts minimum:200000 name:l2_store_lock_rqsts : L2 cache store lock requests > +event:0x28 counters:cpuid um:l2_l1d_wb_rqsts minimum:200000 name:l2_l1d_wb_rqsts : writebacks from L1D to the L2 cache > +event:0x48 counters:2 um:l1d_pend_miss minimum:2000000 name:l1d_pend_miss : Cycles with L1D load Misses outstanding. 
> +event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000000 name:dtlb_store_misses : D-TLB store misses > +event:0x4c counters:cpuid um:load_hit_pre minimum:100000 name:load_hit_pre : Load dispatches that hit fill buffer > +event:0x4e counters:cpuid um:x02 minimum:2000000 name:hw_pre_req : Hardware Prefetch requests > +event:0x51 counters:cpuid um:l1d minimum:2000000 name:l1d : L1D cache events > +event:0x59 counters:cpuid um:partial_rat_stalls minimum:2000000 name:partial_rat_stalls : Partial RAT stalls > +event:0x5b counters:0,1,2,3 um:resource_stalls2 minimum:2000000 name:resource_stalls2 : Misc resource stalls > +event:0x5c counters:cpuid um:cpl_cycles minimum:2000000 name:cpl_cycles : Unhalted core cycles in specific rings > +event:0x5e counters:0,1,2,3 um:one minimum:2000000 name:rs_events : Events for the reservation station > +event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000000 name:offcore_requests_outstanding : Offcore outstanding transactions > +event:0x63 counters:cpuid um:lock_cycles minimum:2000000 name:lock_cycles : Cycles due to LOCK prefixes. > +event:0x79 counters:0,1,2,3 um:idq minimum:2000000 name:idq : Instruction Decode Queue events > +event:0x80 counters:cpuid um:x02 minimum:200000 name:icache : Instruction cache events > +event:0x85 counters:cpuid um:itlb_misses minimum:2000000 name:itlb_misses : I-TLB misses > +event:0x87 counters:cpuid um:ild_stall minimum:2000000 name:ild_stall : Instruction decoding stalls > +event:0x88 counters:cpuid um:br_inst_exec minimum:200000 name:br_inst_exec : Branch instructions > +event:0x89 counters:cpuid um:br_misp_exec minimum:200000 name:br_misp_exec : Mispredicted branch instructions > +event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000000 name:idq_uops_not_delivered : uops not delivered to IDQ. > +event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000000 name:uops_dispatched_port : Count on which ports uops are dispatched. 
> +event:0xa2 counters:cpuid um:resource_stalls minimum:2000000 name:resource_stalls : Core resource stalls > +event:0xab counters:cpuid um:dsb2mite_switches minimum:2000000 name:dsb2mite_switches : Number of Decode Stream Buffer (DSB) to MITE switches > +event:0xac counters:cpuid um:dsb_fill minimum:2000000 name:dsb_fill : DSB fill events > +event:0xae counters:cpuid um:one minimum:10000 name:itlb : ITLB events > +event:0xb0 counters:cpuid um:offcore_requests minimum:100000 name:offcore_requests : Requests sent outside the core > +event:0xb1 counters:0,1,2,3 um:uops_dispatched minimum:2000000 name:uops_dispatched : uops dispatched > +event:0xb2 counters:cpuid um:one minimum:2000000 name:offcore_requests_buffer : Offcore requests buffer events > +event:0xb6 counters:cpuid um:one minimum:100000 name:agu_bypass_cancel : AGU bypass cancel > +event:0xbd counters:cpuid um:tlb_flush minimum:10000 name:tlb_flush : TLB flushes > +event:0xbf counters:cpuid um:l1d_blocks minimum:100000 name:l1d_blocks : L1D cache blocking events > +event:0xc0 counters:1 um:one minimum:2000000 name:inst_retired : Instructions retired > +event:0xc1 counters:cpuid um:other_assists minimum:100000 name:other_assists : Instructions that needed an assist > +event:0xc2 counters:0,1,2,3 um:uops_retired minimum:2000000 name:uops_retired : uops that actually retired. > +event:0xc3 counters:cpuid um:machine_clears minimum:100000 name:machine_clears : Number of Machine Clears detected. > +event:0xc4 counters:0,1,2,3 um:br_inst_retired minimum:400000 name:br_inst_retired : Counts branch instructions retired > +event:0xc5 counters:0,1,2,3 um:br_misp_retired minimum:400000 name:br_misp_retired : Counts mispredicted branch instructions > +event:0xca counters:0,1,2,3 um:fp_assist minimum:100000 name:fp_assist : Counts floating point assists > +event:0xcb counters:cpuid um:one minimum:100000 name:hw_interrupts : Number of hardware interrupts received by the processor. 
> +event:0xcc counters:cpuid um:x20 minimum:2000000 name:rob_misc_events : Count ROB (Register Reorder Buffer) events. > +event:0xcd counters:3 um:x02 minimum:2000000 name:mem_trans_retired : Count memory transactions > +event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000000 name:mem_uops_retired : Count uops with memory accessed retired > +event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000000 name:mem_load_uops_retired : Memory load uops. > +event:0xd2 counters:0,1,2,3 um:mem_load_uops_llc_hit_retired minimum:100000 name:mem_load_uops_llc_hit_retired : Memory load uops with LLC (Last level cache) hit > +event:0xd4 counters:0,1,2,3 um:x02 minimum:10000 name:mem_load_uops_misc_retired : Memory load uops retired > +event:0xf0 counters:cpuid um:l2_trans minimum:200000 name:l2_trans : L2 cache accesses > +event:0xf1 counters:cpuid um:l2_lines_in minimum:100000 name:l2_lines_in : L2 cache lines in > +event:0xf2 counters:cpuid um:l2_lines_out minimum:100000 name:l2_lines_out : L2 cache lines out > +event:0xf4 counters:cpuid um:x10 minimum:100000 name:sq_misc : Store queue misc events > diff --git a/events/i386/sandybridge/unit_masks b/events/i386/sandybridge/unit_masks > new file mode 100644 > index 0000000..cca6cb9 > --- /dev/null > +++ b/events/i386/sandybridge/unit_masks > @@ -0,0 +1,275 @@ > +# > +# Unit masks for the Intel "sandy-bridge" micro architecture > +# > +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs > +# > +include:i386/arch_perfmon > +name:x02 type:mandatory default:0x2 > + 0x2 No unit mask > +name:x10 type:mandatory default:0x10 > + 0x10 No unit mask > +name:x20 type:mandatory default:0x20 > + 0x20 No unit mask > +name:ld_blocks type:bitmask default:0x1 > + 0x1 data_unknown blocked loads due to store buffer blocks with unknown data. 
> + 0x2 store_forward loads blocked by overlapping with store buffer that cannot be forwarded > + 0x8 no_sr This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. > + 0x10 all_block Number of cases where any load is blocked but has no DCU miss. > +name:misalign_mem_ref type:bitmask default:0x1 > + 0x1 loads Speculative cache-line split load uops dispatched to the L1D. > + 0x2 stores Speculative cache-line split Store-address uops dispatched to L1D > +name:ld_blocks_partial type:bitmask default:0x1 > + 0x1 address_alias False dependencies in MOB due to partial compare on address > + 0x8 all_sta_block This event counts the number of times that load operations are temporarily blocked because of older stores, with addresses that are not yet known. A load operation may incur more than one block of this type. > +name:dtlb_load_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) > + 0x4 walk_duration Cycles PMH is busy with this walk > + 0x10 stlb_hit First level miss but second level hit; no page walk. > +name:int_misc type:bitmask default:0x40 > + 0x40 rat_stall_cycles Cycles Resource Allocation Table (RAT) external stall is sent to Instruction Decode Queue (IDQ) for this thread. > + 0x3 extra:cmask=1 recovery_cycles Number of cycles waiting to be recover after Nuke due to all other cases except JEClear. > + 0x3 extra:cmask=1,edge recovery_stalls_count Edge applied to recovery_cycles, thus counts occurrences. > +name:uops_issued type:bitmask default:0x1 > + 0x1 any Number of Uops issued by the Resource Allocation Table (RAT) to the Reservation Station (RS) > + 0x1 extra:cmask=1,inv stall_cycles cycles no uops issued by this thread. 
> +name:arith type:bitmask default:0x1 > + 0x1 fpu_div_active Cycles that the divider is busy with any divide or sqrt operation. > + 0x1 extra:cmask=1,edge fpu_div Number of times that the divider is actived, includes INT, SIMD and FP. > +name:l2_rqsts type:bitmask default:0x1 > + 0x1 demand_data_rd_hit Demand Data Read hit L2, no rejects > + 0x4 rfo_hit RFO requests that hit L2 cache > + 0x8 rfo_miss RFO requests that miss L2 cache > + 0x10 code_rd_hit L2 cache hits when fetching instructions, code reads. > + 0x20 code_rd_miss L2 cache misses when fetching instructions > + 0x40 pf_hit Requests from the L2 hardware prefetchers that hit L2 cache > + 0x80 pf_miss Requests from the L2 hardware prefetchers that miss L2 cache > + 0x3 all_demand_data_rd Any data read request to L2 cache > + 0xc all_rfo Any data RFO request to L2 cache > + 0x30 all_code_rd Any code read request to L2 cache > + 0xc0 all_pf Any L2 HW prefetch request to L2 cache > +name:l2_store_lock_rqsts type:bitmask default:0xf > + 0xf all RFOs that access cache lines in any state > + 0x1 miss RFO (as a result of regular RFO or Lock request) miss cache - I state > + 0x4 hit_e RFO (as a result of regular RFO or Lock request) hits cache in E state > + 0x8 hit_m RFO (as a result of regular RFO or Lock request) hits cache in M state > +name:l2_l1d_wb_rqsts type:bitmask default:0x4 > + 0x4 hit_e writebacks from L1D to L2 cache lines in E state > + 0x8 hit_m writebacks from L1D to L2 cache lines in M state > +name:l1d_pend_miss type:bitmask default:0x1 > + 0x1 pending Cycles with L1D load Misses outstanding. > + 0x1 extra:cmask=1,edge occurences This event counts the number of L1D misses outstanding occurences. 
> +name:dtlb_store_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M/1G) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) > + 0x4 walk_duration Cycles PMH is busy with this walk > + 0x10 stlb_hit First level miss but second level hit; no page walk. Only relevant if multiple levels. > +name:load_hit_pre type:bitmask default:0x1 > + 0x1 sw_pf Load dispatches that hit fill buffer allocated for S/W prefetch. > + 0x2 hw_pf Load dispatches that hit fill buffer allocated for HW prefetch. > +name:l1d type:bitmask default:0x1 > + 0x1 replacement L1D Data line replacements. > + 0x2 allocated_in_m L1D M-state Data Cache Lines Allocated > + 0x4 eviction L1D M-state Data Cache Lines Evicted due to replacement (only) > + 0x8 all_m_replacement All Modified lines evicted out of L1D > +name:partial_rat_stalls type:bitmask default:0x20 > + 0x20 flags_merge_uop Number of perf sensitive flags-merge uops added by Sandy Bridge u-arch. > + 0x40 slow_lea_window Number of cycles with at least 1 slow Load Effective Address (LEA) uop being allocated. > + 0x80 mul_single_uop Number of Multiply packed/scalar single precision uops allocated > + 0x20 extra:cmask=1 flags_merge_uop_cycles Cycles with perf sensitive flags-merge uops added by SandyBridge u-arch. > +name:resource_stalls2 type:bitmask default:0x40 > + 0x40 bob_full Cycles Allocator is stalled due Branch Order Buffer (BOB). > + 0xf all_prf_control Resource stalls2 control structures full for physical registers > + 0xc all_fl_empty Cycles with either free list is empty > + 0x4f ooo_rsrc Resource stalls2 control structures full Physical Register Reclaim Table (PRRT), Physical History Table (PHT), INT or SIMD Free List (FL), Branch Order Buffer (BOB) > +name:cpl_cycles type:bitmask default:0x1 > + 0x1 ring0 Unhalted core cycles the Thread was in Rings 0. 
> + 0x1 extra:cmask=1,edge ring0_trans Transitions from ring123 to Ring0. > + 0x2 ring123 Unhalted core cycles the Thread was in Rings 1/2/3. > +name:offcore_requests_outstanding type:bitmask default:0x1 > + 0x1 demand_data_rd Offcore outstanding Demand Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. Includes L1D data hardware prefetches. > + 0x1 extra:cmask=1 cycles_with_demand_data_rd cycles there are Offcore outstanding RD data transactions in the SuperQueue (SQ), queue to uncore. > + 0x2 demand_code_rd Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x4 demand_rfo Offcore outstanding RFO (store) transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x8 all_data_rd Offcore outstanding all cacheable Core Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x8 extra:cmask=1 cycles_with_data_rd Cycles there are Offcore outstanding all Data read transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x2 extra:cmask=1 cycles_with_demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > + 0x4 extra:cmask=1 cycles_with_demand_rfo Cycles with offcore outstanding demand RFO Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. > +name:lock_cycles type:bitmask default:0x1 > + 0x1 split_lock_uc_lock_duration Cycles in which the L1D and L2 are locked, due to a UC lock or split lock > + 0x2 cache_lock_duration cycles that theL1D is locked > +name:idq type:bitmask default:0x2 > + 0x2 empty Cycles the Instruction Decode Queue (IDQ) is empty. > + 0x4 mite_uops Number of uops delivered to Instruction Decode Queue (IDQ) from MITE path. > + 0x8 dsb_uops Number of uops delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path. 
> + 0x10 ms_dsb_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB). > + 0x20 ms_mite_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by MITE. > + 0x30 ms_uops Number of Uops were delivered into Instruction Decode Queue (IDQ) from MS, initiated by Decode Stream Buffer (DSB) or MITE. > + 0x30 extra:cmask=1 ms_cycles Number of cycles that Uops were delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB) or MITE. > + 0x4 extra:cmask=1 mite_cycles Cycles MITE is active > + 0x8 extra:cmask=1 dsb_cycles Cycles Decode Stream Buffer (DSB) is active > + 0x10 extra:cmask=1 ms_dsb_cycles Cycles Decode Stream Buffer (DSB) Microcode Sequenser (MS) is active > + 0x10 extra:cmask=1,edge ms_dsb_occur Occurences of Decode Stream Buffer (DSB) Microcode Sequenser (MS) going active > + 0x18 extra:cmask=1 all_dsb_cycles_any_uops Cycles Decode Stream Buffer (DSB) is delivering anything > + 0x18 extra:cmask=4 all_dsb_cycles_4_uops Cycles Decode Stream Buffer (DSB) is delivering 4 Uops > + 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering anything > + 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops > + 0x3c mite_all_uops Number of uops delivered to Instruction Decode Queue (IDQ) from any path. > +name:itlb_misses type:bitmask default:0x1 > + 0x1 miss_causes_a_walk Miss in all TLB levels causes an page walk of any page size (4K/2M/4M) > + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M) > + 0x4 walk_duration Cycles PMH is busy with this walk. > + 0x10 stlb_hit First level miss but second level hit; no page walk. > +name:ild_stall type:bitmask default:0x1 > + 0x1 lcp Stall "occurrences" due to length changing prefixes (LCP). > + 0x4 iq_full Stall cycles when instructions cannot be written because the Instruction Queue (IQ) is full. 
> +name:br_inst_exec type:bitmask default:0xff > + 0xff all_branches All branch instructions executed. > + 0x41 nontaken_conditional All macro conditional nontaken branch instructions. > + 0x81 taken_conditional All macro conditional taken branch instructions. > + 0x82 taken_direct_jump All macro unconditional taken branch instructions, excluding calls and indirects. > + 0x84 taken_indirect_jump_non_call_ret All taken indirect branches that are not calls nor returns. > + 0x88 taken_indirect_near_return All taken indirect branches that have a return mnemonic. > + 0x90 taken_direct_near_call All taken non-indirect calls. > + 0xa0 taken_indirect_near_call All taken indirect calls, including both register and memory indirect. > + 0xc1 all_conditional All macro conditional branch instructions. > + 0xc2 all_direct_jmp All macro unconditional branch instructions, excluding calls and indirects > + 0xc4 all_indirect_jump_non_call_ret All indirect branches that are not calls nor returns. > + 0xc8 all_indirect_near_return All indirect return branches. > + 0xd0 all_direct_near_call All non-indirect calls executed. > +name:br_misp_exec type:bitmask default:0xff > + 0xff all_branches All mispredicted branch instructions executed. > + 0x41 nontaken_conditional All nontaken mispredicted macro conditional branch instructions. > + 0x81 taken_conditional All taken mispredicted macro conditional branch instructions. > + 0x84 taken_indirect_jump_non_call_ret All taken mispredicted indirect branches that are not calls nor returns. > + 0x88 taken_return_near All taken mispredicted indirect branches that have a return mnemonic. > + 0x90 taken_direct_near_call All taken mispredicted non-indirect calls. > + 0xa0 taken_indirect_near_call All taken mispredicted indirect calls, including both register and memory indirect. > + 0xc1 all_conditional All mispredicted macro conditional branch instructions. 
> + 0xc4 all_indirect_jump_non_call_ret All mispredicted indirect branches that are not calls nor returns. > + 0xd0 all_direct_near_call All mispredicted non-indirect calls > +name:idq_uops_not_delivered type:bitmask default:0x1 > + 0x1 core Count number of non-delivered uops to Resource Allocation Table (RAT). > + 0x1 extra:cmask=4 cycles_0_uops_deliv.core Counts the cycles no uops were delivered > + 0x1 extra:cmask=3 cycles_le_1_uop_deliv.core Counts the cycles less than 1 uops were delivered > + 0x1 extra:cmask=2 cycles_le_2_uop_deliv.core Counts the cycles less than 2 uops were delivered > + 0x1 extra:cmask=1 cycles_le_3_uop_deliv.core Counts the cycles less than 3 uops were delivered > + 0x1 extra:cmask=4,inv cycles_ge_1_uop_deliv.core Cycles when 1 or more uops were delivered to the by the front end. > + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. > +name:uops_dispatched_port type:bitmask default:0x1 > + 0x1 port_0 Cycles which a Uop is dispatched on port 0 > + 0x2 port_1 Cycles which a Uop is dispatched on port 1 > + 0x4 port_2_ld Cycles which a load Uop is dispatched on port 2 > + 0x8 port_2_sta Cycles which a STA Uop is dispatched on port 2 > + 0x10 port_3_ld Cycles which a load Uop is dispatched on port 3 > + 0x20 port_3_sta Cycles which a STA Uop is dispatched on port 3 > + 0x40 port_4 Cycles which a Uop is dispatched on port 4 > + 0x80 port_5 Cycles which a Uop is dispatched on port 5 > + 0xc port_2 Uops disptached to port 2, loads and stores (speculative and retired) > + 0x30 port_3 Uops disptached to port 3, loads and stores (speculative and retired) > + 0xc port_2_core Uops disptached to port 2, loads and stores per core (speculative and retired) > + 0x30 port_3_core Uops disptached to port 3, loads and stores per core (speculative and retired) > +name:resource_stalls type:bitmask default:0x1 > + 0x1 any Cycles Allocation is stalled due to Resource Related reason. 
> + 0x2 lb Cycles Allocator is stalled due to Load Buffer full > + 0x4 rs Stall due to no eligible Reservation Station (RS) entry available. > + 0x8 sb Cycles Allocator is stalled due to Store Buffer full (not including draining from synch). > + 0x10 rob ROB full cycles. > + 0xe mem_rs Resource stalls due to LB, SB or Reservation Station (RS) being completely in use > + 0xf0 ooo_rsrc Resource stalls due to Rob being full, FCSW, MXCSR and OTHER > + 0xa lb_sb Resource stalls due to load or store buffers > +name:dsb2mite_switches type:bitmask default:0x1 > + 0x1 count Number of Decode Stream Buffer (DSB) to MITE switches > + 0x2 penalty_cycles Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. > +name:dsb_fill type:bitmask default:0x2 > + 0x2 other_cancel Count number of times a valid DSB fill has been actually cancelled for any reason. > + 0x8 exceed_dsb_lines Decode Stream Buffer (DSB) Fill encountered > 3 Decode Stream Buffer (DSB) lines. > + 0xa all_cancel Count number of times a valid Decode Stream Buffer (DSB) fill has been actually cancelled for any reason. > +name:offcore_requests type:bitmask default:0x1 > + 0x1 demand_data_rd Demand Data Read requests sent to uncore > + 0x2 demand_code_rd Offcore Code read requests. Includes Cacheable and Un-cacheables. > + 0x4 demand_rfo Offcore Demand RFOs. Includes regular RFO, Locks, ItoM. > + 0x8 all_data_rd Offcore Demand and prefetch data reads returned to the core. > +name:uops_dispatched type:bitmask default:0x1 > + 0x1 thread Counts total number of uops to be dispatched per-thread each cycle. > + 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatced to be executed on this thread. > + 0x2 core Counts total number of uops dispatched from any thread > +name:tlb_flush type:bitmask default:0x1 > + 0x1 dtlb_thread Count number of DTLB flushes of thread-specific entries. 
> + 0x20 stlb_any Count number of any STLB flushes > +name:l1d_blocks type:bitmask default:0x1 > + 0x1 ld_bank_conflict Any dispatched loads cancelled due to DCU bank conflict > + 0x5 extra:cmask=1 bank_conflict_cycles Cycles with l1d blocks due to bank conflicts > +name:other_assists type:bitmask default:0x2 > + 0x2 itlb_miss_retired Instructions that experienced an ITLB miss. Non Pebs > + 0x10 avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable Non Pebs > + 0x20 sse_to_avx Number of transitions from legacy SSE to AVX-256 when penalty applicable Non Pebs > +name:uops_retired type:bitmask default:0x1 > + 0x1 all All uops that actually retired. > + 0x2 retire_slots number of retirement slots used non PEBS > + 0x1 extra:cmask=1,inv stall_cycles Cycles no executable uops retired > + 0x1 extra:cmask=10,inv total_cycles Number of cycles using always true condition applied to non PEBS uops retired event. > +name:machine_clears type:bitmask default:0x2 > + 0x2 memory_ordering Number of Memory Ordering Machine Clears detected. > + 0x4 smc Number of Self-modifying code (SMC) Machine Clears detected. > + 0x20 maskmov Number of AVX masked mov Machine Clears detected. > +name:br_inst_retired type:bitmask default:0x1 > + 0x1 conditional Counts all taken and not taken macro conditional branch instructions. > + 0x2 near_call Counts all macro direct and indirect near calls. non PEBS > + 0x8 near_return This event counts the number of near ret instructions retired. > + 0x10 not_taken Counts all not taken macro branch instructions retired. > + 0x20 near_taken Counts the number of near branch taken instructions retired. > + 0x40 far_branch Counts the number of far branch instructions retired. 
> + 0x4 all_branches_ps Counts all taken and not taken macro branches including far branches.(Precise Event) > + 0x2 near_call_r3 Ring123 only near calls (non precise) > + 0x2 near_call_r3_ps Ring123 only near calls (precise event) > +name:br_misp_retired type:bitmask default:0x1 > + 0x1 conditional All mispredicted macro conditional branch instructions. > + 0x2 near_call All macro direct and indirect near calls > + 0x10 not_taken number of branch instructions retired that were mispredicted and not-taken. > + 0x20 taken number of branch instructions retired that were mispredicted and taken. > + 0x4 all_branches_ps all macro branches (Precise Event) > +name:fp_assist type:bitmask default:0x1e > + 0x1e extra:cmask=1 any Counts any FP_ASSIST umask was incrementing. > + 0x2 x87_output output - Numeric Overflow, Numeric Underflow, Inexact Result > + 0x4 x87_input input - Invalid Operation, Denormal Operand, SNaN Operand > + 0x8 simd_output Any output SSE* FP Assist - Numeric Overflow, Numeric Underflow. 
> + 0x10 simd_input Any input SSE* FP Assist
> +name:mem_uops_retired type:bitmask default:0x11
> + 0x11 stlb_miss_loads STLB misses due to retired loads
> + 0x12 stlb_miss_stores STLB misses due to retired stores
> + 0x21 lock_loads Locked retired loads
> + 0x41 split_loads Retired loads causing cacheline splits
> + 0x42 split_stores Retired stores causing cacheline splits
> + 0x81 all_loads Any retired loads
> + 0x82 all_stores Any retired stores
> +name:mem_load_uops_retired type:bitmask default:0x1
> + 0x1 l1_hit Load hit in nearest-level (L1D) cache
> + 0x2 l2_hit Load hit in mid-level (L2) cache
> + 0x4 llc_hit Load hit in last-level (L3) cache with no snoop needed
> + 0x40 hit_lfb A load missed L1D but hit the Fill Buffer
> +name:mem_load_uops_llc_hit_retired type:bitmask default:0x1
> + 0x1 xsnp_miss Load LLC Hit and a cross-core Snoop missed in on-pkg core cache
> + 0x2 xsnp_hit Load LLC Hit and a cross-core Snoop hits in on-pkg core cache
> + 0x4 xsnp_hitm Load had HitM Response from a core on same socket (shared LLC).
> + 0x8 xsnp_none Load hit in last-level (L3) cache with no snoop needed.
> +name:l2_trans type:bitmask default:0x80
> + 0x80 all_requests Transactions accessing L2 pipe
> + 0x1 demand_data_rd Demand Data Read requests that access L2 cache, includes L1D prefetches.
> + 0x2 rfo RFO requests that access L2 cache > + 0x4 code_rd L2 cache accesses when fetching instructions including L1D code prefetches > + 0x8 all_pf L2 or LLC HW prefetches that access L2 cache > + 0x10 l1d_wb L1D writebacks that access L2 cache > + 0x20 l2_fill L2 fill requests that access L2 cache > + 0x40 l2_wb L2 writebacks that access L2 cache > +name:l2_lines_in type:bitmask default:0x7 > + 0x7 all L2 cache lines filling L2 > + 0x1 i L2 cache lines in I state filling L2 > + 0x2 s L2 cache lines in S state filling L2 > + 0x4 e L2 cache lines in E state filling L2 > +name:l2_lines_out type:bitmask default:0x1 > + 0x1 demand_clean Clean line evicted by a demand > + 0x2 demand_dirty Dirty line evicted by a demand > + 0x4 pf_clean Clean line evicted by an L2 Prefetch > + 0x8 pf_dirty Dirty line evicted by an L2 Prefetch > + 0xa dirty_all Any Dirty line evicted > diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c > index b2ebf54..9d11b21 100644 > --- a/libop/op_cpu_type.c > +++ b/libop/op_cpu_type.c > @@ -93,6 +93,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { > { "Intel Westmere microarchitecture", "i386/westmere", CPU_WESTMERE, 4 }, > { "ARMv7 Scorpion", "arm/armv7-scorpion", CPU_ARM_SCORPION, 5 }, > { "ARMv7 ScorpionMP", "arm/armv7-scorpionmp", CPU_ARM_SCORPIONMP, 5 }, > + { "Intel Sandy Bridge microarchitecture", "i386/sandybridge", CPU_SANDYBRIDGE, 8 }, > }; > > static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); > @@ -117,6 +118,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) > case CPU_ATOM: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > return CPU_ARCH_PERFMON; > default: > /* assume processor in a class by itself */ > diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h > index 9283ec7..d6cae3a 100644 > --- a/libop/op_cpu_type.h > +++ b/libop/op_cpu_type.h > @@ -90,6 +90,7 @@ typedef enum { > CPU_WESTMERE, /* Intel Westmere microarchitecture */ > CPU_ARM_SCORPION, /**< ARM 
SCORPION */ > CPU_ARM_SCORPIONMP, /**< ARM SCORPIONMP */ > + CPU_SANDYBRIDGE, /* Intel Sandy-Bridge microarchitecture */ > MAX_CPU_TYPE > } op_cpu; > > diff --git a/libop/op_events.c b/libop/op_events.c > index 502ff01..0aa0ad3 100644 > --- a/libop/op_events.c > +++ b/libop/op_events.c > @@ -1023,6 +1023,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) > case CPU_CORE_I7: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > case CPU_MIPS_LOONGSON2: > case CPU_FAMILY12H: > case CPU_FAMILY14H: > diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h > index f1d67a5..a529dd6 100644 > --- a/libop/op_hw_specific.h > +++ b/libop/op_hw_specific.h > @@ -117,6 +117,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) > case 0x2c: /* Westmere-EP (Intel Xeon 5600 series) */ > case 0x2f: /* Westmere-EX */ > return CPU_WESTMERE; > + case 0x2a: > + case 0x2d: > + return CPU_SANDYBRIDGE; > } > } > return cpu_type; > diff --git a/utils/ophelp.c b/utils/ophelp.c > index b3aebde..f4e0653 100644 > --- a/utils/ophelp.c > +++ b/utils/ophelp.c > @@ -533,6 +533,7 @@ int main(int argc, char const * argv[]) > case CPU_CORE_I7: > case CPU_NEHALEM: > case CPU_WESTMERE: > + case CPU_SANDYBRIDGE: > case CPU_ATOM: > event_doc = > "See Intel Architecture Developer's Manual Volume 3B, Appendix A and\n" ------------------------------------------------------------------------------ Achieve unprecedented app performance and reliability What every C/C++ and Fortran developer should know. Learn how Intel has extended the reach of its next-generation tools to help boost performance applications - inlcuding clusters. http://p.sf.net/sfu/intel-dev2devmay _______________________________________________ oprofile-list mailing list opr...@li... https://lists.sourceforge.net/lists/listinfo/oprofile-list |