From: Andi K. <an...@fi...> - 2011-04-23 00:39:55
|
From: Andi Kleen <ak...@li...> Add an event list for Sandy Bridge. Modify oprofile to detect Sandy Bridges. Signed-off-by: Andi Kleen <ak...@li...> --- events/Makefile.am | 1 + events/i386/sandybridge/events | 67 +++++++++ events/i386/sandybridge/unit_masks | 275 ++++++++++++++++++++++++++++++++++++ libop/op_cpu_type.c | 2 + libop/op_cpu_type.h | 1 + libop/op_events.c | 1 + libop/op_hw_specific.h | 3 + utils/ophelp.c | 1 + 8 files changed, 351 insertions(+), 0 deletions(-) create mode 100644 events/i386/sandybridge/events create mode 100644 events/i386/sandybridge/unit_masks diff --git a/events/Makefile.am b/events/Makefile.am index 60c4164..c4101cc 100644 --- a/events/Makefile.am +++ b/events/Makefile.am @@ -18,6 +18,7 @@ event_files = \ i386/core_i7/events i386/core_i7/unit_masks \ i386/nehalem/events i386/nehalem/unit_masks \ i386/westmere/events i386/westmere/unit_masks \ + i386/sandybridge/events i386/sandybridge/unit_masks \ ia64/ia64/events ia64/ia64/unit_masks \ ia64/itanium2/events ia64/itanium2/unit_masks \ ia64/itanium/events ia64/itanium/unit_masks \ diff --git a/events/i386/sandybridge/events b/events/i386/sandybridge/events new file mode 100644 index 0000000..bf941c7 --- /dev/null +++ b/events/i386/sandybridge/events @@ -0,0 +1,67 @@ +# +# Intel "sandy-bridge" microarchitecture core events. +# +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs +# +# Note the minimum counts are not discovered experimentally and could be likely +# lowered in many cases without ill effect. 
+# +include:i386/arch_perfmon +event:0x03 counters:cpuid um:ld_blocks minimum:100000 name:ld_blocks : blocked loads +event:0x05 counters:cpuid um:misalign_mem_ref minimum:2000000 name:misalign_mem_ref : Misaligned memory references +event:0x07 counters:cpuid um:ld_blocks_partial minimum:100000 name:ld_blocks_partial : Partial loads +event:0x08 counters:cpuid um:dtlb_load_misses minimum:2000000 name:dtlb_load_misses : D-TLB misses +event:0x0d counters:cpuid um:int_misc minimum:2000000 name:int_misc : Instruction decoder events +event:0x0e counters:0,1,2,3 um:uops_issued minimum:2000000 name:uops_issued : Number of Uops issued +event:0x14 counters:cpuid um:arith minimum:2000000 name:arith : Misc ALU events +event:0x17 counters:cpuid um:one minimum:2000000 name:insts_written_to_iq : Number of instructions written to Instruction Queue (IQ) this cycle. +event:0x24 counters:cpuid um:l2_rqsts minimum:200000 name:l2_rqsts : Requests from L2 cache +event:0x27 counters:cpuid um:l2_store_lock_rqsts minimum:200000 name:l2_store_lock_rqsts : L2 cache store lock requests +event:0x28 counters:cpuid um:l2_l1d_wb_rqsts minimum:200000 name:l2_l1d_wb_rqsts : writebacks from L1D to the L2 cache +event:0x48 counters:2 um:l1d_pend_miss minimum:2000000 name:l1d_pend_miss : Cycles with L1D load Misses outstanding. 
+event:0x49 counters:cpuid um:dtlb_store_misses minimum:2000000 name:dtlb_store_misses : D-TLB store misses +event:0x4c counters:cpuid um:load_hit_pre minimum:100000 name:load_hit_pre : Load dispatches that hit fill buffer +event:0x4e counters:cpuid um:x02 minimum:2000000 name:hw_pre_req : Hardware Prefetch requests +event:0x51 counters:cpuid um:l1d minimum:2000000 name:l1d : L1D cache events +event:0x59 counters:cpuid um:partial_rat_stalls minimum:2000000 name:partial_rat_stalls : Partial RAT stalls +event:0x5b counters:0,1,2,3 um:resource_stalls2 minimum:2000000 name:resource_stalls2 : Misc resource stalls +event:0x5c counters:cpuid um:cpl_cycles minimum:2000000 name:cpl_cycles : Unhalted core cycles in specific rings +event:0x5e counters:0,1,2,3 um:one minimum:2000000 name:rs_events : Events for the reservation station +event:0x60 counters:cpuid um:offcore_requests_outstanding minimum:2000000 name:offcore_requests_outstanding : Offcore outstanding transactions +event:0x63 counters:cpuid um:lock_cycles minimum:2000000 name:lock_cycles : Cycles due to LOCK prefixes. +event:0x79 counters:0,1,2,3 um:idq minimum:2000000 name:idq : Instruction Decode Queue events +event:0x80 counters:cpuid um:x02 minimum:200000 name:icache : Instruction cache events +event:0x85 counters:cpuid um:itlb_misses minimum:2000000 name:itlb_misses : I-TLB misses +event:0x87 counters:cpuid um:ild_stall minimum:2000000 name:ild_stall : Instruction decoding stalls +event:0x88 counters:cpuid um:br_inst_exec minimum:200000 name:br_inst_exec : Branch instructions +event:0x89 counters:cpuid um:br_misp_exec minimum:200000 name:br_misp_exec : Mispredicted branch instructions +event:0x9c counters:0,1,2,3 um:idq_uops_not_delivered minimum:2000000 name:idq_uops_not_delivered : uops not delivered to IDQ. +event:0xa1 counters:cpuid um:uops_dispatched_port minimum:2000000 name:uops_dispatched_port : Count on which ports uops are dispatched. 
+event:0xa2 counters:cpuid um:resource_stalls minimum:2000000 name:resource_stalls : Core resource stalls +event:0xab counters:cpuid um:dsb2mite_switches minimum:2000000 name:dsb2mite_switches : Number of Decode Stream Buffer (DSB) to MITE switches +event:0xac counters:cpuid um:dsb_fill minimum:2000000 name:dsb_fill : DSB fill events +event:0xae counters:cpuid um:one minimum:10000 name:itlb : ITLB events +event:0xb0 counters:cpuid um:offcore_requests minimum:100000 name:offcore_requests : Requests sent outside the core +event:0xb1 counters:0,1,2,3 um:uops_dispatched minimum:2000000 name:uops_dispatched : uops dispatched +event:0xb2 counters:cpuid um:one minimum:2000000 name:offcore_requests_buffer : Offcore requests buffer events +event:0xb6 counters:cpuid um:one minimum:100000 name:agu_bypass_cancel : AGU bypass cancel +event:0xbd counters:cpuid um:tlb_flush minimum:10000 name:tlb_flush : TLB flushes +event:0xbf counters:cpuid um:l1d_blocks minimum:100000 name:l1d_blocks : L1D cache blocking events +event:0xc0 counters:1 um:one minimum:2000000 name:inst_retired : Instructions retired +event:0xc1 counters:cpuid um:other_assists minimum:100000 name:other_assists : Instructions that needed an assist +event:0xc2 counters:0,1,2,3 um:uops_retired minimum:2000000 name:uops_retired : uops that actually retired. +event:0xc3 counters:cpuid um:machine_clears minimum:100000 name:machine_clears : Number of Machine Clears detected. +event:0xc4 counters:0,1,2,3 um:br_inst_retired minimum:400000 name:br_inst_retired : Counts branch instructions retired +event:0xc5 counters:0,1,2,3 um:br_misp_retired minimum:400000 name:br_misp_retired : Counts mispredicted branch instructions +event:0xca counters:0,1,2,3 um:fp_assist minimum:100000 name:fp_assist : Counts floating point assists +event:0xcb counters:cpuid um:one minimum:100000 name:hw_interrupts : Number of hardware interrupts received by the processor. 
+event:0xcc counters:cpuid um:x20 minimum:2000000 name:rob_misc_events : Count ROB (Register Reorder Buffer) events. +event:0xcd counters:3 um:x02 minimum:2000000 name:mem_trans_retired : Count memory transactions +event:0xd0 counters:0,1,2,3 um:mem_uops_retired minimum:2000000 name:mem_uops_retired : Count uops with memory accessed retired +event:0xd1 counters:0,1,2,3 um:mem_load_uops_retired minimum:2000000 name:mem_load_uops_retired : Memory load uops. +event:0xd2 counters:0,1,2,3 um:mem_load_uops_llc_hit_retired minimum:100000 name:mem_load_uops_llc_hit_retired : Memory load uops with LLC (Last level cache) hit +event:0xd4 counters:0,1,2,3 um:x02 minimum:10000 name:mem_load_uops_misc_retired : Memory load uops retired +event:0xf0 counters:cpuid um:l2_trans minimum:200000 name:l2_trans : L2 cache accesses +event:0xf1 counters:cpuid um:l2_lines_in minimum:100000 name:l2_lines_in : L2 cache lines in +event:0xf2 counters:cpuid um:l2_lines_out minimum:100000 name:l2_lines_out : L2 cache lines out +event:0xf4 counters:cpuid um:x10 minimum:100000 name:sq_misc : Store queue misc events diff --git a/events/i386/sandybridge/unit_masks b/events/i386/sandybridge/unit_masks new file mode 100644 index 0000000..cca6cb9 --- /dev/null +++ b/events/i386/sandybridge/unit_masks @@ -0,0 +1,275 @@ +# +# Unit masks for the Intel "sandy-bridge" micro architecture +# +# See http://ark.intel.com/ for help in identifying sandy-bridge based CPUs +# +include:i386/arch_perfmon +name:x02 type:mandatory default:0x2 + 0x2 No unit mask +name:x10 type:mandatory default:0x10 + 0x10 No unit mask +name:x20 type:mandatory default:0x20 + 0x20 No unit mask +name:ld_blocks type:bitmask default:0x1 + 0x1 data_unknown blocked loads due to store buffer blocks with unknown data. 
+ 0x2 store_forward loads blocked by overlapping with store buffer that cannot be forwarded + 0x8 no_sr This event counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. + 0x10 all_block Number of cases where any load is blocked but has no DCU miss. +name:misalign_mem_ref type:bitmask default:0x1 + 0x1 loads Speculative cache-line split load uops dispatched to the L1D. + 0x2 stores Speculative cache-line split Store-address uops dispatched to L1D +name:ld_blocks_partial type:bitmask default:0x1 + 0x1 address_alias False dependencies in MOB due to partial compare on address + 0x8 all_sta_block This event counts the number of times that load operations are temporarily blocked because of older stores, with addresses that are not yet known. A load operation may incur more than one block of this type. +name:dtlb_load_misses type:bitmask default:0x1 + 0x1 miss_causes_a_walk Miss in all TLB levels causes a page walk of any page size (4K/2M/4M/1G) + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) + 0x4 walk_duration Cycles PMH is busy with this walk + 0x10 stlb_hit First level miss but second level hit; no page walk. +name:int_misc type:bitmask default:0x40 + 0x40 rat_stall_cycles Cycles Resource Allocation Table (RAT) external stall is sent to Instruction Decode Queue (IDQ) for this thread. + 0x3 extra:cmask=1 recovery_cycles Number of cycles waiting to recover after Nuke due to all other cases except JEClear. + 0x3 extra:cmask=1,edge recovery_stalls_count Edge applied to recovery_cycles, thus counts occurrences. +name:uops_issued type:bitmask default:0x1 + 0x1 any Number of Uops issued by the Resource Allocation Table (RAT) to the Reservation Station (RS) + 0x1 extra:cmask=1,inv stall_cycles cycles no uops issued by this thread. 
+name:arith type:bitmask default:0x1 + 0x1 fpu_div_active Cycles that the divider is busy with any divide or sqrt operation. + 0x1 extra:cmask=1,edge fpu_div Number of times that the divider is activated, includes INT, SIMD and FP. +name:l2_rqsts type:bitmask default:0x1 + 0x1 demand_data_rd_hit Demand Data Read hit L2, no rejects + 0x4 rfo_hit RFO requests that hit L2 cache + 0x8 rfo_miss RFO requests that miss L2 cache + 0x10 code_rd_hit L2 cache hits when fetching instructions, code reads. + 0x20 code_rd_miss L2 cache misses when fetching instructions + 0x40 pf_hit Requests from the L2 hardware prefetchers that hit L2 cache + 0x80 pf_miss Requests from the L2 hardware prefetchers that miss L2 cache + 0x3 all_demand_data_rd Any data read request to L2 cache + 0xc all_rfo Any data RFO request to L2 cache + 0x30 all_code_rd Any code read request to L2 cache + 0xc0 all_pf Any L2 HW prefetch request to L2 cache +name:l2_store_lock_rqsts type:bitmask default:0xf + 0xf all RFOs that access cache lines in any state + 0x1 miss RFO (as a result of regular RFO or Lock request) miss cache - I state + 0x4 hit_e RFO (as a result of regular RFO or Lock request) hits cache in E state + 0x8 hit_m RFO (as a result of regular RFO or Lock request) hits cache in M state +name:l2_l1d_wb_rqsts type:bitmask default:0x4 + 0x4 hit_e writebacks from L1D to L2 cache lines in E state + 0x8 hit_m writebacks from L1D to L2 cache lines in M state +name:l1d_pend_miss type:bitmask default:0x1 + 0x1 pending Cycles with L1D load Misses outstanding. + 0x1 extra:cmask=1,edge occurences This event counts the number of L1D misses outstanding occurrences. 
+name:dtlb_store_misses type:bitmask default:0x1 + 0x1 miss_causes_a_walk Miss in all TLB levels causes a page walk of any page size (4K/2M/4M/1G) + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M/1G) + 0x4 walk_duration Cycles PMH is busy with this walk + 0x10 stlb_hit First level miss but second level hit; no page walk. Only relevant if multiple levels. +name:load_hit_pre type:bitmask default:0x1 + 0x1 sw_pf Load dispatches that hit fill buffer allocated for S/W prefetch. + 0x2 hw_pf Load dispatches that hit fill buffer allocated for HW prefetch. +name:l1d type:bitmask default:0x1 + 0x1 replacement L1D Data line replacements. + 0x2 allocated_in_m L1D M-state Data Cache Lines Allocated + 0x4 eviction L1D M-state Data Cache Lines Evicted due to replacement (only) + 0x8 all_m_replacement All Modified lines evicted out of L1D +name:partial_rat_stalls type:bitmask default:0x20 + 0x20 flags_merge_uop Number of perf sensitive flags-merge uops added by Sandy Bridge u-arch. + 0x40 slow_lea_window Number of cycles with at least 1 slow Load Effective Address (LEA) uop being allocated. + 0x80 mul_single_uop Number of Multiply packed/scalar single precision uops allocated + 0x20 extra:cmask=1 flags_merge_uop_cycles Cycles with perf sensitive flags-merge uops added by SandyBridge u-arch. +name:resource_stalls2 type:bitmask default:0x40 + 0x40 bob_full Cycles Allocator is stalled due to Branch Order Buffer (BOB). + 0xf all_prf_control Resource stalls2 control structures full for physical registers + 0xc all_fl_empty Cycles with either free list is empty + 0x4f ooo_rsrc Resource stalls2 control structures full Physical Register Reclaim Table (PRRT), Physical History Table (PHT), INT or SIMD Free List (FL), Branch Order Buffer (BOB) +name:cpl_cycles type:bitmask default:0x1 + 0x1 ring0 Unhalted core cycles the Thread was in Rings 0. + 0x1 extra:cmask=1,edge ring0_trans Transitions from ring123 to Ring0. 
+ 0x2 ring123 Unhalted core cycles the Thread was in Rings 1/2/3. +name:offcore_requests_outstanding type:bitmask default:0x1 + 0x1 demand_data_rd Offcore outstanding Demand Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. Includes L1D data hardware prefetches. + 0x1 extra:cmask=1 cycles_with_demand_data_rd cycles there are Offcore outstanding RD data transactions in the SuperQueue (SQ), queue to uncore. + 0x2 demand_code_rd Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x4 demand_rfo Offcore outstanding RFO (store) transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x8 all_data_rd Offcore outstanding all cacheable Core Data Read transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x8 extra:cmask=1 cycles_with_data_rd Cycles there are Offcore outstanding all Data read transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x2 extra:cmask=1 cycles_with_demand_code_rd Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. + 0x4 extra:cmask=1 cycles_with_demand_rfo Cycles with offcore outstanding demand RFO Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. +name:lock_cycles type:bitmask default:0x1 + 0x1 split_lock_uc_lock_duration Cycles in which the L1D and L2 are locked, due to a UC lock or split lock + 0x2 cache_lock_duration cycles that the L1D is locked +name:idq type:bitmask default:0x2 + 0x2 empty Cycles the Instruction Decode Queue (IDQ) is empty. + 0x4 mite_uops Number of uops delivered to Instruction Decode Queue (IDQ) from MITE path. + 0x8 dsb_uops Number of uops delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path. + 0x10 ms_dsb_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB). 
+ 0x20 ms_mite_uops Number of Uops delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by MITE. + 0x30 ms_uops Number of Uops were delivered into Instruction Decode Queue (IDQ) from MS, initiated by Decode Stream Buffer (DSB) or MITE. + 0x30 extra:cmask=1 ms_cycles Number of cycles that Uops were delivered into Instruction Decode Queue (IDQ) when MS_Busy, initiated by Decode Stream Buffer (DSB) or MITE. + 0x4 extra:cmask=1 mite_cycles Cycles MITE is active + 0x8 extra:cmask=1 dsb_cycles Cycles Decode Stream Buffer (DSB) is active + 0x10 extra:cmask=1 ms_dsb_cycles Cycles Decode Stream Buffer (DSB) Microcode Sequencer (MS) is active + 0x10 extra:cmask=1,edge ms_dsb_occur Occurrences of Decode Stream Buffer (DSB) Microcode Sequencer (MS) going active + 0x18 extra:cmask=1 all_dsb_cycles_any_uops Cycles Decode Stream Buffer (DSB) is delivering anything + 0x18 extra:cmask=4 all_dsb_cycles_4_uops Cycles Decode Stream Buffer (DSB) is delivering 4 Uops + 0x24 extra:cmask=1 all_mite_cycles_any_uops Cycles MITE is delivering anything + 0x24 extra:cmask=4 all_mite_cycles_4_uops Cycles MITE is delivering 4 Uops + 0x3c mite_all_uops Number of uops delivered to Instruction Decode Queue (IDQ) from any path. +name:itlb_misses type:bitmask default:0x1 + 0x1 miss_causes_a_walk Miss in all TLB levels causes a page walk of any page size (4K/2M/4M) + 0x2 walk_completed Miss in all TLB levels causes a page walk that completes of any page size (4K/2M/4M) + 0x4 walk_duration Cycles PMH is busy with this walk. + 0x10 stlb_hit First level miss but second level hit; no page walk. +name:ild_stall type:bitmask default:0x1 + 0x1 lcp Stall "occurrences" due to length changing prefixes (LCP). + 0x4 iq_full Stall cycles when instructions cannot be written because the Instruction Queue (IQ) is full. +name:br_inst_exec type:bitmask default:0xff + 0xff all_branches All branch instructions executed. + 0x41 nontaken_conditional All macro conditional nontaken branch instructions. 
+ 0x81 taken_conditional All macro conditional taken branch instructions. + 0x82 taken_direct_jump All macro unconditional taken branch instructions, excluding calls and indirects. + 0x84 taken_indirect_jump_non_call_ret All taken indirect branches that are not calls nor returns. + 0x88 taken_indirect_near_return All taken indirect branches that have a return mnemonic. + 0x90 taken_direct_near_call All taken non-indirect calls. + 0xa0 taken_indirect_near_call All taken indirect calls, including both register and memory indirect. + 0xc1 all_conditional All macro conditional branch instructions. + 0xc2 all_direct_jmp All macro unconditional branch instructions, excluding calls and indirects + 0xc4 all_indirect_jump_non_call_ret All indirect branches that are not calls nor returns. + 0xc8 all_indirect_near_return All indirect return branches. + 0xd0 all_direct_near_call All non-indirect calls executed. +name:br_misp_exec type:bitmask default:0xff + 0xff all_branches All mispredicted branch instructions executed. + 0x41 nontaken_conditional All nontaken mispredicted macro conditional branch instructions. + 0x81 taken_conditional All taken mispredicted macro conditional branch instructions. + 0x84 taken_indirect_jump_non_call_ret All taken mispredicted indirect branches that are not calls nor returns. + 0x88 taken_return_near All taken mispredicted indirect branches that have a return mnemonic. + 0x90 taken_direct_near_call All taken mispredicted non-indirect calls. + 0xa0 taken_indirect_near_call All taken mispredicted indirect calls, including both register and memory indirect. + 0xc1 all_conditional All mispredicted macro conditional branch instructions. + 0xc4 all_indirect_jump_non_call_ret All mispredicted indirect branches that are not calls nor returns. + 0xd0 all_direct_near_call All mispredicted non-indirect calls +name:idq_uops_not_delivered type:bitmask default:0x1 + 0x1 core Count number of non-delivered uops to Resource Allocation Table (RAT). 
+ 0x1 extra:cmask=4 cycles_0_uops_deliv.core Counts the cycles no uops were delivered + 0x1 extra:cmask=3 cycles_le_1_uop_deliv.core Counts the cycles less than 1 uops were delivered + 0x1 extra:cmask=2 cycles_le_2_uop_deliv.core Counts the cycles less than 2 uops were delivered + 0x1 extra:cmask=1 cycles_le_3_uop_deliv.core Counts the cycles less than 3 uops were delivered + 0x1 extra:cmask=4,inv cycles_ge_1_uop_deliv.core Cycles when 1 or more uops were delivered by the front end. + 0x1 extra:cmask=1,inv cycles_fe_was_ok Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE. +name:uops_dispatched_port type:bitmask default:0x1 + 0x1 port_0 Cycles which a Uop is dispatched on port 0 + 0x2 port_1 Cycles which a Uop is dispatched on port 1 + 0x4 port_2_ld Cycles which a load Uop is dispatched on port 2 + 0x8 port_2_sta Cycles which a STA Uop is dispatched on port 2 + 0x10 port_3_ld Cycles which a load Uop is dispatched on port 3 + 0x20 port_3_sta Cycles which a STA Uop is dispatched on port 3 + 0x40 port_4 Cycles which a Uop is dispatched on port 4 + 0x80 port_5 Cycles which a Uop is dispatched on port 5 + 0xc port_2 Uops dispatched to port 2, loads and stores (speculative and retired) + 0x30 port_3 Uops dispatched to port 3, loads and stores (speculative and retired) + 0xc port_2_core Uops dispatched to port 2, loads and stores per core (speculative and retired) + 0x30 port_3_core Uops dispatched to port 3, loads and stores per core (speculative and retired) +name:resource_stalls type:bitmask default:0x1 + 0x1 any Cycles Allocation is stalled due to Resource Related reason. + 0x2 lb Cycles Allocator is stalled due to Load Buffer full + 0x4 rs Stall due to no eligible Reservation Station (RS) entry available. + 0x8 sb Cycles Allocator is stalled due to Store Buffer full (not including draining from synch). + 0x10 rob ROB full cycles. 
+ 0xe mem_rs Resource stalls due to LB, SB or Reservation Station (RS) being completely in use + 0xf0 ooo_rsrc Resource stalls due to Rob being full, FCSW, MXCSR and OTHER + 0xa lb_sb Resource stalls due to load or store buffers +name:dsb2mite_switches type:bitmask default:0x1 + 0x1 count Number of Decode Stream Buffer (DSB) to MITE switches + 0x2 penalty_cycles Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. +name:dsb_fill type:bitmask default:0x2 + 0x2 other_cancel Count number of times a valid DSB fill has been actually cancelled for any reason. + 0x8 exceed_dsb_lines Decode Stream Buffer (DSB) Fill encountered > 3 Decode Stream Buffer (DSB) lines. + 0xa all_cancel Count number of times a valid Decode Stream Buffer (DSB) fill has been actually cancelled for any reason. +name:offcore_requests type:bitmask default:0x1 + 0x1 demand_data_rd Demand Data Read requests sent to uncore + 0x2 demand_code_rd Offcore Code read requests. Includes Cacheable and Un-cacheables. + 0x4 demand_rfo Offcore Demand RFOs. Includes regular RFO, Locks, ItoM. + 0x8 all_data_rd Offcore Demand and prefetch data reads returned to the core. +name:uops_dispatched type:bitmask default:0x1 + 0x1 thread Counts total number of uops to be dispatched per-thread each cycle. + 0x1 extra:cmask=1,inv stall_cycles Counts number of cycles no uops were dispatched to be executed on this thread. + 0x2 core Counts total number of uops dispatched from any thread +name:tlb_flush type:bitmask default:0x1 + 0x1 dtlb_thread Count number of DTLB flushes of thread-specific entries. + 0x20 stlb_any Count number of any STLB flushes +name:l1d_blocks type:bitmask default:0x1 + 0x1 ld_bank_conflict Any dispatched loads cancelled due to DCU bank conflict + 0x5 extra:cmask=1 bank_conflict_cycles Cycles with l1d blocks due to bank conflicts +name:other_assists type:bitmask default:0x2 + 0x2 itlb_miss_retired Instructions that experienced an ITLB miss. 
Non Pebs + 0x10 avx_to_sse Number of transitions from AVX-256 to legacy SSE when penalty applicable Non Pebs + 0x20 sse_to_avx Number of transitions from legacy SSE to AVX-256 when penalty applicable Non Pebs +name:uops_retired type:bitmask default:0x1 + 0x1 all All uops that actually retired. + 0x2 retire_slots number of retirement slots used non PEBS + 0x1 extra:cmask=1,inv stall_cycles Cycles no executable uops retired + 0x1 extra:cmask=10,inv total_cycles Number of cycles using always true condition applied to non PEBS uops retired event. +name:machine_clears type:bitmask default:0x2 + 0x2 memory_ordering Number of Memory Ordering Machine Clears detected. + 0x4 smc Number of Self-modifying code (SMC) Machine Clears detected. + 0x20 maskmov Number of AVX masked mov Machine Clears detected. +name:br_inst_retired type:bitmask default:0x1 + 0x1 conditional Counts all taken and not taken macro conditional branch instructions. + 0x2 near_call Counts all macro direct and indirect near calls. non PEBS + 0x8 near_return This event counts the number of near ret instructions retired. + 0x10 not_taken Counts all not taken macro branch instructions retired. + 0x20 near_taken Counts the number of near branch taken instructions retired. + 0x40 far_branch Counts the number of far branch instructions retired. + 0x4 all_branches_ps Counts all taken and not taken macro branches including far branches.(Precise Event) + 0x2 near_call_r3 Ring123 only near calls (non precise) + 0x2 near_call_r3_ps Ring123 only near calls (precise event) +name:br_misp_retired type:bitmask default:0x1 + 0x1 conditional All mispredicted macro conditional branch instructions. + 0x2 near_call All macro direct and indirect near calls + 0x10 not_taken number of branch instructions retired that were mispredicted and not-taken. + 0x20 taken number of branch instructions retired that were mispredicted and taken. 
+ 0x4 all_branches_ps all macro branches (Precise Event) +name:fp_assist type:bitmask default:0x1e + 0x1e extra:cmask=1 any Counts any FP_ASSIST umask was incrementing. + 0x2 x87_output output - Numeric Overflow, Numeric Underflow, Inexact Result + 0x4 x87_input input - Invalid Operation, Denormal Operand, SNaN Operand + 0x8 simd_output Any output SSE* FP Assist - Numeric Overflow, Numeric Underflow. + 0x10 simd_input Any input SSE* FP Assist +name:mem_uops_retired type:bitmask default:0x11 + 0x11 stlb_miss_loads STLB misses dues to retired loads + 0x12 stlb_miss_stores STLB misses dues to retired stores + 0x21 lock_loads Locked retired loads + 0x41 split_loads Retired loads causing cacheline splits + 0x42 split_stores Retired stores causing cacheline splits + 0x81 all_loads Any retired loads + 0x82 all_stores Any retired stores +name:mem_load_uops_retired type:bitmask default:0x1 + 0x1 l1_hit Load hit in nearest-level (L1D) cache + 0x2 l2_hit Load hit in mid-level (L2) cache + 0x4 llc_hit Load hit in last-level (L3) cache with no snoop needed + 0x40 hit_lfb A load missed L1D but hit the Fill Buffer +name:mem_load_uops_llc_hit_retired type:bitmask default:0x1 + 0x1 xsnp_miss Load LLC Hit and a cross-core Snoop missed in on-pkg core cache + 0x2 xsnp_hit Load LLC Hit and a cross-core Snoop hits in on-pkg core cache + 0x4 xsnp_hitm Load had HitM Response from a core on same socket (shared LLC). + 0x8 xsnp_none Load hit in last-level (L3) cache with no snoop needed. +name:l2_trans type:bitmask default:0x80 + 0x80 all_requests Transactions accessing L2 pipe + 0x1 demand_data_rd Demand Data Read requests that access L2 cache, includes L1D prefetches. 
+ 0x2 rfo RFO requests that access L2 cache + 0x4 code_rd L2 cache accesses when fetching instructions including L1D code prefetches + 0x8 all_pf L2 or LLC HW prefetches that access L2 cache + 0x10 l1d_wb L1D writebacks that access L2 cache + 0x20 l2_fill L2 fill requests that access L2 cache + 0x40 l2_wb L2 writebacks that access L2 cache +name:l2_lines_in type:bitmask default:0x7 + 0x7 all L2 cache lines filling L2 + 0x1 i L2 cache lines in I state filling L2 + 0x2 s L2 cache lines in S state filling L2 + 0x4 e L2 cache lines in E state filling L2 +name:l2_lines_out type:bitmask default:0x1 + 0x1 demand_clean Clean line evicted by a demand + 0x2 demand_dirty Dirty line evicted by a demand + 0x4 pf_clean Clean line evicted by an L2 Prefetch + 0x8 pf_dirty Dirty line evicted by an L2 Prefetch + 0xa dirty_all Any Dirty line evicted diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c index b2ebf54..9d11b21 100644 --- a/libop/op_cpu_type.c +++ b/libop/op_cpu_type.c @@ -93,6 +93,7 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = { { "Intel Westmere microarchitecture", "i386/westmere", CPU_WESTMERE, 4 }, { "ARMv7 Scorpion", "arm/armv7-scorpion", CPU_ARM_SCORPION, 5 }, { "ARMv7 ScorpionMP", "arm/armv7-scorpionmp", CPU_ARM_SCORPIONMP, 5 }, + { "Intel Sandy Bridge microarchitecture", "i386/sandybridge", CPU_SANDYBRIDGE, 8 }, }; static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr); @@ -117,6 +118,7 @@ op_cpu op_cpu_base_type(op_cpu cpu_type) case CPU_ATOM: case CPU_NEHALEM: case CPU_WESTMERE: + case CPU_SANDYBRIDGE: return CPU_ARCH_PERFMON; default: /* assume processor in a class by itself */ diff --git a/libop/op_cpu_type.h b/libop/op_cpu_type.h index 9283ec7..d6cae3a 100644 --- a/libop/op_cpu_type.h +++ b/libop/op_cpu_type.h @@ -90,6 +90,7 @@ typedef enum { CPU_WESTMERE, /* Intel Westmere microarchitecture */ CPU_ARM_SCORPION, /**< ARM SCORPION */ CPU_ARM_SCORPIONMP, /**< ARM SCORPIONMP */ + CPU_SANDYBRIDGE, /* Intel Sandy-Bridge 
microarchitecture */ MAX_CPU_TYPE } op_cpu; diff --git a/libop/op_events.c b/libop/op_events.c index 0f27288..def8fe2 100644 --- a/libop/op_events.c +++ b/libop/op_events.c @@ -1023,6 +1023,7 @@ void op_default_event(op_cpu cpu_type, struct op_default_event_descr * descr) case CPU_CORE_I7: case CPU_NEHALEM: case CPU_WESTMERE: + case CPU_SANDYBRIDGE: case CPU_MIPS_LOONGSON2: case CPU_FAMILY12H: case CPU_FAMILY14H: diff --git a/libop/op_hw_specific.h b/libop/op_hw_specific.h index 4b4d55a..023515d 100644 --- a/libop/op_hw_specific.h +++ b/libop/op_hw_specific.h @@ -116,6 +116,9 @@ static inline op_cpu op_cpu_specific_type(op_cpu cpu_type) case 0x25: case 0x2c: return CPU_WESTMERE; + case 0x2a: + case 0x2d: + return CPU_SANDYBRIDGE; } } return cpu_type; diff --git a/utils/ophelp.c b/utils/ophelp.c index b3aebde..f4e0653 100644 --- a/utils/ophelp.c +++ b/utils/ophelp.c @@ -533,6 +533,7 @@ int main(int argc, char const * argv[]) case CPU_CORE_I7: case CPU_NEHALEM: case CPU_WESTMERE: + case CPU_SANDYBRIDGE: case CPU_ATOM: event_doc = "See Intel Architecture Developer's Manual Volume 3B, Appendix A and\n" -- 1.7.4.2 |