You can subscribe to this list here.
| 2002 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(1) |
Oct
(122) |
Nov
(152) |
Dec
(69) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2003 |
Jan
(6) |
Feb
(25) |
Mar
(73) |
Apr
(82) |
May
(24) |
Jun
(25) |
Jul
(10) |
Aug
(11) |
Sep
(10) |
Oct
(54) |
Nov
(203) |
Dec
(182) |
| 2004 |
Jan
(307) |
Feb
(305) |
Mar
(430) |
Apr
(312) |
May
(187) |
Jun
(342) |
Jul
(487) |
Aug
(637) |
Sep
(336) |
Oct
(373) |
Nov
(441) |
Dec
(210) |
| 2005 |
Jan
(385) |
Feb
(480) |
Mar
(636) |
Apr
(544) |
May
(679) |
Jun
(625) |
Jul
(810) |
Aug
(838) |
Sep
(634) |
Oct
(521) |
Nov
(965) |
Dec
(543) |
| 2006 |
Jan
(494) |
Feb
(431) |
Mar
(546) |
Apr
(411) |
May
(406) |
Jun
(322) |
Jul
(256) |
Aug
(401) |
Sep
(345) |
Oct
(542) |
Nov
(308) |
Dec
(481) |
| 2007 |
Jan
(427) |
Feb
(326) |
Mar
(367) |
Apr
(255) |
May
(244) |
Jun
(204) |
Jul
(223) |
Aug
(231) |
Sep
(354) |
Oct
(374) |
Nov
(497) |
Dec
(362) |
| 2008 |
Jan
(322) |
Feb
(482) |
Mar
(658) |
Apr
(422) |
May
(476) |
Jun
(396) |
Jul
(455) |
Aug
(267) |
Sep
(280) |
Oct
(253) |
Nov
(232) |
Dec
(304) |
| 2009 |
Jan
(486) |
Feb
(470) |
Mar
(458) |
Apr
(423) |
May
(696) |
Jun
(461) |
Jul
(551) |
Aug
(575) |
Sep
(134) |
Oct
(110) |
Nov
(157) |
Dec
(102) |
| 2010 |
Jan
(226) |
Feb
(86) |
Mar
(147) |
Apr
(117) |
May
(107) |
Jun
(203) |
Jul
(193) |
Aug
(238) |
Sep
(300) |
Oct
(246) |
Nov
(23) |
Dec
(75) |
| 2011 |
Jan
(133) |
Feb
(195) |
Mar
(315) |
Apr
(200) |
May
(267) |
Jun
(293) |
Jul
(353) |
Aug
(237) |
Sep
(278) |
Oct
(611) |
Nov
(274) |
Dec
(260) |
| 2012 |
Jan
(303) |
Feb
(391) |
Mar
(417) |
Apr
(441) |
May
(488) |
Jun
(655) |
Jul
(590) |
Aug
(610) |
Sep
(526) |
Oct
(478) |
Nov
(359) |
Dec
(372) |
| 2013 |
Jan
(467) |
Feb
(226) |
Mar
(391) |
Apr
(281) |
May
(299) |
Jun
(252) |
Jul
(311) |
Aug
(352) |
Sep
(481) |
Oct
(571) |
Nov
(222) |
Dec
(231) |
| 2014 |
Jan
(185) |
Feb
(329) |
Mar
(245) |
Apr
(238) |
May
(281) |
Jun
(399) |
Jul
(382) |
Aug
(500) |
Sep
(579) |
Oct
(435) |
Nov
(487) |
Dec
(256) |
| 2015 |
Jan
(338) |
Feb
(357) |
Mar
(330) |
Apr
(294) |
May
(191) |
Jun
(108) |
Jul
(142) |
Aug
(261) |
Sep
(190) |
Oct
(54) |
Nov
(83) |
Dec
(22) |
| 2016 |
Jan
(49) |
Feb
(89) |
Mar
(33) |
Apr
(50) |
May
(27) |
Jun
(34) |
Jul
(53) |
Aug
(53) |
Sep
(98) |
Oct
(206) |
Nov
(93) |
Dec
(53) |
| 2017 |
Jan
(65) |
Feb
(82) |
Mar
(102) |
Apr
(86) |
May
(187) |
Jun
(67) |
Jul
(23) |
Aug
(93) |
Sep
(65) |
Oct
(45) |
Nov
(35) |
Dec
(17) |
| 2018 |
Jan
(26) |
Feb
(35) |
Mar
(38) |
Apr
(32) |
May
(8) |
Jun
(43) |
Jul
(27) |
Aug
(30) |
Sep
(43) |
Oct
(42) |
Nov
(38) |
Dec
(67) |
| 2019 |
Jan
(32) |
Feb
(37) |
Mar
(53) |
Apr
(64) |
May
(49) |
Jun
(18) |
Jul
(14) |
Aug
(53) |
Sep
(25) |
Oct
(30) |
Nov
(49) |
Dec
(31) |
| 2020 |
Jan
(87) |
Feb
(45) |
Mar
(37) |
Apr
(51) |
May
(99) |
Jun
(36) |
Jul
(11) |
Aug
(14) |
Sep
(20) |
Oct
(24) |
Nov
(40) |
Dec
(23) |
| 2021 |
Jan
(14) |
Feb
(53) |
Mar
(85) |
Apr
(15) |
May
(19) |
Jun
(3) |
Jul
(14) |
Aug
(1) |
Sep
(57) |
Oct
(73) |
Nov
(56) |
Dec
(22) |
| 2022 |
Jan
(3) |
Feb
(22) |
Mar
(6) |
Apr
(55) |
May
(46) |
Jun
(39) |
Jul
(15) |
Aug
(9) |
Sep
(11) |
Oct
(34) |
Nov
(20) |
Dec
(36) |
| 2023 |
Jan
(79) |
Feb
(41) |
Mar
(99) |
Apr
(169) |
May
(48) |
Jun
(16) |
Jul
(16) |
Aug
(57) |
Sep
(19) |
Oct
|
Nov
|
Dec
|
| S | M | T | W | T | F | S |
|---|---|---|---|---|---|---|
|
|
|
1
(16) |
2
(10) |
3
(7) |
4
(8) |
5
(8) |
|
6
(11) |
7
(6) |
8
(14) |
9
(9) |
10
(6) |
11
(5) |
12
(5) |
|
13
(5) |
14
(8) |
15
(8) |
16
(12) |
17
(7) |
18
(7) |
19
(6) |
|
20
(7) |
21
(6) |
22
(6) |
23
(9) |
24
(13) |
25
(8) |
26
(6) |
|
27
(6) |
28
(6) |
29
(6) |
30
(7) |
31
(6) |
|
|
|
From: <sv...@va...> - 2007-05-08 22:35:27
|
Author: sewardj
Date: 2007-05-08 23:35:21 +0100 (Tue, 08 May 2007)
New Revision: 1772
Log:
When generating code for helper calls, be more aggressive about
computing values directly into argument registers, thereby avoiding
some reg-reg shuffling. This reduces the amount of code (on amd64)
generated by Cachegrind by about 6% and has zero or marginal benefit
for other tools.
Modified:
branches/CGTUNE/priv/host-amd64/isel.c
Modified: branches/CGTUNE/priv/host-amd64/isel.c
===================================================================
--- branches/CGTUNE/priv/host-amd64/isel.c 2007-05-08 18:45:59 UTC (rev 1771)
+++ branches/CGTUNE/priv/host-amd64/isel.c 2007-05-08 22:35:21 UTC (rev 1772)
@@ -372,20 +372,54 @@
//.. }
-/* Used only in doHelperCall. See big comment in doHelperCall re
- handling of register-parameter args. This function figures out
- whether evaluation of an expression might require use of a fixed
- register. If in doubt return True (safe but suboptimal).
-*/
-static
-Bool mightRequireFixedRegs ( IRExpr* e )
+/* Used only in doHelperCall. If possible, produce a single
+ instruction which computes 'e' into 'dst'. If not possible, return
+ NULL. */
+
+static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
+ HReg dst,
+ IRExpr* e )
{
- switch (e->tag) {
- case Iex_RdTmp: case Iex_Const: case Iex_Get:
- return False;
- default:
- return True;
+ vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
+
+ if (e->tag == Iex_Const) {
+ vassert(e->Iex.Const.con->tag == Ico_U64);
+ if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
+ return AMD64Instr_Alu64R(
+ Aalu_MOV,
+ AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
+ dst
+ );
+ } else {
+ return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
+ }
}
+
+ if (e->tag == Iex_RdTmp) {
+ HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ return mk_iMOVsd_RR(src, dst);
+ }
+
+ if (e->tag == Iex_Get) {
+ vassert(e->Iex.Get.ty == Ity_I64);
+ return AMD64Instr_Alu64R(
+ Aalu_MOV,
+ AMD64RMI_Mem(
+ AMD64AMode_IR(e->Iex.Get.offset,
+ hregAMD64_RBP())),
+ dst);
+ }
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_32Uto64
+ && e->Iex.Unop.arg->tag == Iex_RdTmp) {
+ HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
+ return AMD64Instr_MovZLQ(src, dst);
+ }
+
+ if (0) { ppIRExpr(e); vex_printf("\n"); }
+
+ return NULL;
}
@@ -401,7 +435,7 @@
AMD64CondCode cc;
HReg argregs[6];
HReg tmpregs[6];
- Bool go_fast;
+ AMD64Instr* fastinstrs[6];
Int n_args, i, argreg;
/* Marshal args for a call and do the call.
@@ -471,12 +505,13 @@
tmpregs[0] = tmpregs[1] = tmpregs[2] =
tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
+ fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
+ fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
+
/* First decide which scheme (slow or fast) is to be used. First
assume the fast scheme, and select slow if any contraindications
(wow) appear. */
- go_fast = True;
-
if (guard) {
if (guard->tag == Iex_Const
&& guard->Iex.Const.con->tag == Ico_U1
@@ -484,91 +519,94 @@
/* unconditional */
} else {
/* Not manifestly unconditional -- be conservative. */
- go_fast = False;
+ goto slowscheme;
}
}
- if (go_fast) {
- for (i = 0; i < n_args; i++) {
- if (mightRequireFixedRegs(args[i])) {
- go_fast = False;
- break;
- }
- }
+ /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll
+ use the slow scheme. Because this is tentative, we can't call
+   addInstr (that is, commit to) any instructions until we've
+   handled all the arguments.  So park the resulting instructions
+ in a buffer and emit that if we're successful. */
+
+ /* FAST SCHEME */
+ argreg = 0;
+ if (passBBP) {
+ fastinstrs[argreg] = mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]);
+ argreg++;
}
- /* At this point the scheme to use has been established. Generate
- code to get the arg values into the argument rregs. */
+ for (i = 0; i < n_args; i++) {
+ vassert(argreg < 6);
+ vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
+ fastinstrs[argreg]
+ = iselIntExpr_single_instruction( env, argregs[argreg], args[i] );
+ if (fastinstrs[argreg] == NULL)
+ goto slowscheme;
+ argreg++;
+ }
- if (go_fast) {
+ /* Looks like we're in luck. Emit the accumulated instructions and
+ move on to doing the call itself. */
+ vassert(argreg <= 6);
+ for (i = 0; i < argreg; i++)
+ addInstr(env, fastinstrs[i]);
- /* FAST SCHEME */
- argreg = 0;
- if (passBBP) {
- addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]));
- argreg++;
- }
+ /* Fast scheme only applies for unconditional calls. Hence: */
+ cc = Acc_ALWAYS;
- for (i = 0; i < n_args; i++) {
- vassert(argreg < 6);
- vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
- addInstr(env, AMD64Instr_Alu64R(
- Aalu_MOV,
- iselIntExpr_RMI(env, args[i]),
- argregs[argreg]
- )
- );
- argreg++;
- }
+ goto handle_call;
- /* Fast scheme only applies for unconditional calls. Hence: */
- cc = Acc_ALWAYS;
- } else {
+ /* SLOW SCHEME; move via temporaries */
+ slowscheme:
+#if 0
+if (n_args > 0) {for (i = 0; args[i]; i++) {
+ppIRExpr(args[i]); vex_printf(" "); }
+vex_printf("\n");}
+#endif
+ argreg = 0;
- /* SLOW SCHEME; move via temporaries */
- argreg = 0;
+ if (passBBP) {
+ /* This is pretty stupid; better to move directly to rdi
+ after the rest of the args are done. */
+ tmpregs[argreg] = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg]));
+ argreg++;
+ }
- if (passBBP) {
- /* This is pretty stupid; better to move directly to rdi
- after the rest of the args are done. */
- tmpregs[argreg] = newVRegI(env);
- addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg]));
- argreg++;
- }
+ for (i = 0; i < n_args; i++) {
+ vassert(argreg < 6);
+ vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
+ tmpregs[argreg] = iselIntExpr_R(env, args[i]);
+ argreg++;
+ }
- for (i = 0; i < n_args; i++) {
- vassert(argreg < 6);
- vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
- tmpregs[argreg] = iselIntExpr_R(env, args[i]);
- argreg++;
+ /* Now we can compute the condition. We can't do it earlier
+ because the argument computations could trash the condition
+ codes. Be a bit clever to handle the common case where the
+ guard is 1:Bit. */
+ cc = Acc_ALWAYS;
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional -- do nothing */
+ } else {
+ cc = iselCondCode( env, guard );
}
+ }
- /* Now we can compute the condition. We can't do it earlier
- because the argument computations could trash the condition
- codes. Be a bit clever to handle the common case where the
- guard is 1:Bit. */
- cc = Acc_ALWAYS;
- if (guard) {
- if (guard->tag == Iex_Const
- && guard->Iex.Const.con->tag == Ico_U1
- && guard->Iex.Const.con->Ico.U1 == True) {
- /* unconditional -- do nothing */
- } else {
- cc = iselCondCode( env, guard );
- }
- }
+ /* Move the args to their final destinations. */
+ for (i = 0; i < argreg; i++) {
+ /* None of these insns, including any spill code that might
+ be generated, may alter the condition codes. */
+ addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
+ }
- /* Move the args to their final destinations. */
- for (i = 0; i < argreg; i++) {
- /* None of these insns, including any spill code that might
- be generated, may alter the condition codes. */
- addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
- }
- }
-
/* Finally, the call itself. */
+ handle_call:
addInstr(env, AMD64Instr_Call(
cc,
Ptr_to_ULong(cee->addr),
|
|
From: <sv...@va...> - 2007-05-08 18:46:00
|
Author: sewardj
Date: 2007-05-08 19:45:59 +0100 (Tue, 08 May 2007)
New Revision: 1771
Log:
Handle Left64. Fixes failure on none/tests/x86/insn_sse2.
Modified:
branches/CGTUNE/priv/host-x86/isel.c
Modified: branches/CGTUNE/priv/host-x86/isel.c
===================================================================
--- branches/CGTUNE/priv/host-x86/isel.c 2007-05-08 18:00:19 UTC (rev 1770)
+++ branches/CGTUNE/priv/host-x86/isel.c 2007-05-08 18:45:59 UTC (rev 1771)
@@ -2501,6 +2501,29 @@
return;
}
+ /* Left64(e) */
+ case Iop_Left64: {
+ HReg yLo, yHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ /* yHi:yLo = arg */
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
+ /* tLo = 0 - yLo, and set carry */
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
+ /* tHi = 0 - yHi - carry */
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
+ addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
+ /* So now we have tHi:tLo = -arg. To finish off, or 'arg'
+ back in, so as to give the final result
+ tHi:tLo = arg | -arg. */
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
/* --- patterns rooted at: CmpwNEZ64 --- */
/* CmpwNEZ64(e) */
|
|
From: <sv...@va...> - 2007-05-08 18:02:01
|
Author: sewardj
Date: 2007-05-08 19:01:59 +0100 (Tue, 08 May 2007)
New Revision: 6737
Log:
Track vex r1770 (removal of Iop_Neg64/32/16/8 primops)
Modified:
branches/CGTUNE/memcheck/mc_translate.c
Modified: branches/CGTUNE/memcheck/mc_translate.c
===================================================================
--- branches/CGTUNE/memcheck/mc_translate.c 2007-05-08 13:49:13 UTC (rev 6736)
+++ branches/CGTUNE/memcheck/mc_translate.c 2007-05-08 18:01:59 UTC (rev 6737)
@@ -2483,17 +2483,6 @@
case Iop_Not1:
return vatom;
- /* Neg* really fall under the Add/Sub banner, and as such you
- might think would qualify for the 'expensive add/sub'
- treatment. However, in this case since the implied literal
- is zero (0 - arg), we just do the cheap thing anyway. */
- case Iop_Neg8:
- return mkLeft8(mce, vatom);
- case Iop_Neg16:
- return mkLeft16(mce, vatom);
- case Iop_Neg32:
- return mkLeft32(mce, vatom);
-
default:
ppIROp(op);
VG_(tool_panic)("memcheck:expr2vbits_Unop");
|
|
From: <sv...@va...> - 2007-05-08 18:00:26
|
Author: sewardj
Date: 2007-05-08 19:00:19 +0100 (Tue, 08 May 2007)
New Revision: 1770
Log:
Get rid of Iop_Neg64/32/16/8 as they are no longer used by Memcheck,
and any uses as generated by the front ends are so infrequent that
generating the equivalent Sub(0, ..) is good enough. This gets rid of
quite a few lines of code. Add isel cases for Sub(0, ..) patterns so
that the x86/amd64 backends still generate negl/negq where possible.
Modified:
branches/CGTUNE/priv/guest-ppc/toIR.c
branches/CGTUNE/priv/guest-x86/toIR.c
branches/CGTUNE/priv/host-amd64/isel.c
branches/CGTUNE/priv/host-ppc/isel.c
branches/CGTUNE/priv/host-x86/isel.c
branches/CGTUNE/priv/ir/irdefs.c
branches/CGTUNE/priv/ir/iropt.c
branches/CGTUNE/pub/libvex_ir.h
Modified: branches/CGTUNE/priv/guest-ppc/toIR.c
===================================================================
--- branches/CGTUNE/priv/guest-ppc/toIR.c 2007-05-08 13:45:27 UTC (rev 1769)
+++ branches/CGTUNE/priv/guest-ppc/toIR.c 2007-05-08 18:00:19 UTC (rev 1770)
@@ -783,7 +783,7 @@
op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8 ||
op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8 ||
op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8 ||
- op8 == Iop_Not8 || op8 == Iop_Neg8 );
+ op8 == Iop_Not8 );
adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : (ty==Ity_I32 ? 2 : 3));
return adj + op8;
}
Modified: branches/CGTUNE/priv/guest-x86/toIR.c
===================================================================
--- branches/CGTUNE/priv/guest-x86/toIR.c 2007-05-08 13:45:27 UTC (rev 1769)
+++ branches/CGTUNE/priv/guest-x86/toIR.c 2007-05-08 18:00:19 UTC (rev 1770)
@@ -684,7 +684,7 @@
|| op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
|| op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
|| op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
- || op8 == Iop_Not8 || op8 == Iop_Neg8);
+ || op8 == Iop_Not8);
adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
return adj + op8;
}
@@ -2631,7 +2631,7 @@
dst1 = newTemp(ty);
assign(dst0, mkU(ty,0));
assign(src, getIReg(sz,eregOfRM(modrm)));
- assign(dst1, unop(mkSizedOp(ty,Iop_Neg8), mkexpr(src)));
+ assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
@@ -2693,7 +2693,7 @@
dst1 = newTemp(ty);
assign(dst0, mkU(ty,0));
assign(src, mkexpr(t1));
- assign(dst1, unop(mkSizedOp(ty,Iop_Neg8), mkexpr(src)));
+ assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
storeLE( mkexpr(addr), mkexpr(dst1) );
DIP("neg%c %s\n", nameISize(sz), dis_buf);
Modified: branches/CGTUNE/priv/host-amd64/isel.c
===================================================================
--- branches/CGTUNE/priv/host-amd64/isel.c 2007-05-08 13:45:27 UTC (rev 1769)
+++ branches/CGTUNE/priv/host-amd64/isel.c 2007-05-08 18:00:19 UTC (rev 1770)
@@ -278,15 +278,22 @@
return toBool(x == y1);
}
-//.. /* Is this a 32-bit zero expression? */
-//..
-//.. static Bool isZero32 ( IRExpr* e )
-//.. {
-//.. return e->tag == Iex_Const
-//.. && e->Iex.Const.con->tag == Ico_U32
-//.. && e->Iex.Const.con->Ico.U32 == 0;
-//.. }
+/* Is this a 64-bit zero expression? */
+static Bool isZeroU64 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U64
+ && e->Iex.Const.con->Ico.U64 == 0ULL;
+}
+
+static Bool isZeroU32 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U32
+ && e->Iex.Const.con->Ico.U32 == 0;
+}
+
/* Make a int reg-reg move. */
static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
@@ -841,16 +848,17 @@
AMD64AluOp aluOp;
AMD64ShiftOp shOp;
-//..
-//.. /* Pattern: Sub32(0,x) */
-//.. if (e->Iex.Binop.op == Iop_Sub32 && isZero32(e->Iex.Binop.arg1)) {
-//.. HReg dst = newVRegI(env);
-//.. HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
-//.. addInstr(env, mk_iMOVsd_RR(reg,dst));
-//.. addInstr(env, X86Instr_Unary32(Xun_NEG,X86RM_Reg(dst)));
-//.. return dst;
-//.. }
-//..
+ /* Pattern: Sub64(0,x) */
+ /* and: Sub32(0,x) */
+ if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
+ || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
+ HReg dst = newVRegI(env);
+ HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(reg,dst));
+ addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
+ return dst;
+ }
+
/* Is it an addition or logical style op? */
switch (e->Iex.Binop.op) {
case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
@@ -1449,16 +1457,6 @@
AMD64RMI_Reg(tmp), dst));
return dst;
}
- case Iop_Neg8:
- case Iop_Neg16:
- case Iop_Neg32:
- case Iop_Neg64: {
- HReg dst = newVRegI(env);
- HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
- addInstr(env, mk_iMOVsd_RR(reg,dst));
- addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
- return dst;
- }
case Iop_CmpwNEZ64: {
HReg dst = newVRegI(env);
Modified: branches/CGTUNE/priv/host-ppc/isel.c
===================================================================
--- branches/CGTUNE/priv/host-ppc/isel.c 2007-05-08 13:45:27 UTC (rev 1769)
+++ branches/CGTUNE/priv/host-ppc/isel.c 2007-05-08 18:00:19 UTC (rev 1770)
@@ -1705,17 +1705,6 @@
addInstr(env, PPCInstr_Unary(op_clz,r_dst,r_src));
return r_dst;
}
- case Iop_Neg8:
- case Iop_Neg16:
- case Iop_Neg32:
- case Iop_Neg64: {
- HReg r_dst = newVRegI(env);
- HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
- if (op_unop == Iop_Neg64 && !mode64)
- goto irreducible;
- addInstr(env, PPCInstr_Unary(Pun_NEG,r_dst,r_src));
- return r_dst;
- }
case Iop_Left8:
case Iop_Left32:
@@ -1805,8 +1794,6 @@
case Iop_32to16:
case Iop_64to8:
/* These are no-ops. */
- if (op_unop == Iop_Neg64 && !mode64)
- goto irreducible;
return iselWordExpr_R(env, e->Iex.Unop.arg);
/* ReinterpF64asI64(e) */
@@ -2816,22 +2803,6 @@
*rLo = tLo;
return;
}
-
- case Iop_Neg64: {
- HReg yLo, yHi;
- HReg zero = newVRegI(env);
- HReg tLo = newVRegI(env);
- HReg tHi = newVRegI(env);
- iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
- addInstr(env, PPCInstr_LI(zero, 0, False/*mode32*/));
- addInstr(env, PPCInstr_AddSubC( False/*sub*/, True/*set carry*/,
- tLo, zero, yLo));
- addInstr(env, PPCInstr_AddSubC( False/*sub*/, False/*read carry*/,
- tHi, zero, yHi));
- *rHi = tHi;
- *rLo = tLo;
- return;
- }
/* ReinterpF64asI64(e) */
/* Given an IEEE754 double, produce an I64 with the same bit
Modified: branches/CGTUNE/priv/host-x86/isel.c
===================================================================
--- branches/CGTUNE/priv/host-x86/isel.c 2007-05-08 13:45:27 UTC (rev 1769)
+++ branches/CGTUNE/priv/host-x86/isel.c 2007-05-08 18:00:19 UTC (rev 1770)
@@ -120,6 +120,13 @@
&& e->Iex.Const.con->Ico.U8 == 0;
}
+static Bool isZeroU32 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U32
+ && e->Iex.Const.con->Ico.U32 == 0;
+}
+
static Bool isZeroU64 ( IRExpr* e )
{
return e->tag == Iex_Const
@@ -805,6 +812,15 @@
X86AluOp aluOp;
X86ShiftOp shOp;
+ /* Pattern: Sub32(0,x) */
+ if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
+ HReg dst = newVRegI(env);
+ HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(reg,dst));
+ addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
+ return dst;
+ }
+
/* Is it an addition or logical style op? */
switch (e->Iex.Binop.op) {
case Iop_Add8: case Iop_Add16: case Iop_Add32:
@@ -1194,15 +1210,6 @@
X86RMI_Reg(tmp), dst));
return dst;
}
- case Iop_Neg8:
- case Iop_Neg16:
- case Iop_Neg32: {
- HReg dst = newVRegI(env);
- HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
- addInstr(env, mk_iMOVsd_RR(reg,dst));
- addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
- return dst;
- }
case Iop_CmpwNEZ32: {
HReg dst = newVRegI(env);
@@ -2494,24 +2501,6 @@
return;
}
- /* Neg64(e) */
- case Iop_Neg64: {
- HReg yLo, yHi;
- HReg tLo = newVRegI(env);
- HReg tHi = newVRegI(env);
- /* yHi:yLo = arg */
- iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
- /* tLo = 0 - yLo, and set carry */
- addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
- addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
- /* tHi = 0 - yHi - carry */
- addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
- addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
- *rHi = tHi;
- *rLo = tLo;
- return;
- }
-
/* --- patterns rooted at: CmpwNEZ64 --- */
/* CmpwNEZ64(e) */
Modified: branches/CGTUNE/priv/ir/irdefs.c
===================================================================
--- branches/CGTUNE/priv/ir/irdefs.c 2007-05-08 13:45:27 UTC (rev 1769)
+++ branches/CGTUNE/priv/ir/irdefs.c 2007-05-08 18:00:19 UTC (rev 1770)
@@ -217,11 +217,6 @@
case Iop_CmpORD64U: vex_printf("CmpORD64U"); return;
case Iop_CmpORD64S: vex_printf("CmpORD64S"); return;
- case Iop_Neg8: vex_printf("Neg8"); return;
- case Iop_Neg16: vex_printf("Neg16"); return;
- case Iop_Neg32: vex_printf("Neg32"); return;
- case Iop_Neg64: vex_printf("Neg64"); return;
-
case Iop_DivU32: vex_printf("DivU32"); return;
case Iop_DivS32: vex_printf("DivS32"); return;
case Iop_DivU64: vex_printf("DivU64"); return;
@@ -1525,14 +1520,13 @@
case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
BINARY(Ity_I64,Ity_I8, Ity_I64);
- case Iop_Not8: case Iop_Neg8:
+ case Iop_Not8:
UNARY(Ity_I8, Ity_I8);
- case Iop_Not16: case Iop_Neg16:
+ case Iop_Not16:
UNARY(Ity_I16, Ity_I16);
- case Iop_Not32: case Iop_Neg32:
+ case Iop_Not32:
UNARY(Ity_I32, Ity_I32);
- case Iop_Neg64:
case Iop_Not64:
case Iop_CmpNEZ32x2: case Iop_CmpNEZ16x4: case Iop_CmpNEZ8x8:
UNARY(Ity_I64, Ity_I64);
Modified: branches/CGTUNE/priv/ir/iropt.c
===================================================================
--- branches/CGTUNE/priv/ir/iropt.c 2007-05-08 13:45:27 UTC (rev 1769)
+++ branches/CGTUNE/priv/ir/iropt.c 2007-05-08 18:00:19 UTC (rev 1770)
@@ -1011,19 +1011,6 @@
notBool(e->Iex.Unop.arg->Iex.Const.con->Ico.U1)));
break;
- case Iop_Neg64:
- e2 = IRExpr_Const(IRConst_U64(
- - (e->Iex.Unop.arg->Iex.Const.con->Ico.U64)));
- break;
- case Iop_Neg32:
- e2 = IRExpr_Const(IRConst_U32(
- - (e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
- break;
- case Iop_Neg8:
- e2 = IRExpr_Const(IRConst_U8(toUChar(
- - (e->Iex.Unop.arg->Iex.Const.con->Ico.U8))));
- break;
-
case Iop_64to8: {
ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
w64 &= 0xFFULL;
Modified: branches/CGTUNE/pub/libvex_ir.h
===================================================================
--- branches/CGTUNE/pub/libvex_ir.h 2007-05-08 13:45:27 UTC (rev 1769)
+++ branches/CGTUNE/pub/libvex_ir.h 2007-05-08 18:00:19 UTC (rev 1770)
@@ -422,7 +422,6 @@
Iop_CmpNE8, Iop_CmpNE16, Iop_CmpNE32, Iop_CmpNE64,
/* Tags for unary ops */
Iop_Not8, Iop_Not16, Iop_Not32, Iop_Not64,
- Iop_Neg8, Iop_Neg16, Iop_Neg32, Iop_Neg64,
/* -- Ordering not important after here. -- */
|
|
From: <sv...@va...> - 2007-05-08 13:49:14
|
Author: sewardj
Date: 2007-05-08 14:49:13 +0100 (Tue, 08 May 2007)
New Revision: 6736
Log:
Hook up Memcheck to the new Left and CmpwNEZ primops defined in vex r1769.
Modified:
branches/CGTUNE/memcheck/mc_translate.c
Modified: branches/CGTUNE/memcheck/mc_translate.c
===================================================================
--- branches/CGTUNE/memcheck/mc_translate.c 2007-05-08 12:07:52 UTC (rev 6735)
+++ branches/CGTUNE/memcheck/mc_translate.c 2007-05-08 13:49:13 UTC (rev 6736)
@@ -356,38 +356,22 @@
static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
tl_assert(isShadowAtom(mce,a1));
- /* It's safe to duplicate a1 since it's only an atom */
- return assignNew(mce, Ity_I8,
- binop(Iop_Or8, a1,
- assignNew(mce, Ity_I8,
- unop(Iop_Neg8, a1))));
+ return assignNew(mce, Ity_I8, unop(Iop_Left8, a1));
}
static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
tl_assert(isShadowAtom(mce,a1));
- /* It's safe to duplicate a1 since it's only an atom */
- return assignNew(mce, Ity_I16,
- binop(Iop_Or16, a1,
- assignNew(mce, Ity_I16,
- unop(Iop_Neg16, a1))));
+ return assignNew(mce, Ity_I16, unop(Iop_Left16, a1));
}
static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
tl_assert(isShadowAtom(mce,a1));
- /* It's safe to duplicate a1 since it's only an atom */
- return assignNew(mce, Ity_I32,
- binop(Iop_Or32, a1,
- assignNew(mce, Ity_I32,
- unop(Iop_Neg32, a1))));
+ return assignNew(mce, Ity_I32, unop(Iop_Left32, a1));
}
static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
tl_assert(isShadowAtom(mce,a1));
- /* It's safe to duplicate a1 since it's only an atom */
- return assignNew(mce, Ity_I64,
- binop(Iop_Or64, a1,
- assignNew(mce, Ity_I64,
- unop(Iop_Neg64, a1))));
+ return assignNew(mce, Ity_I64, unop(Iop_Left64, a1));
}
/* --------- 'Improvement' functions for AND/OR. --------- */
@@ -502,14 +486,28 @@
static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
{
- IRType ty;
+ IRType src_ty;
IRAtom* tmp1;
/* Note, dst_ty is a shadow type, not an original type. */
/* First of all, collapse vbits down to a single bit. */
tl_assert(isShadowAtom(mce,vbits));
- ty = typeOfIRExpr(mce->bb->tyenv, vbits);
- tmp1 = NULL;
- switch (ty) {
+ src_ty = typeOfIRExpr(mce->bb->tyenv, vbits);
+
+ /* Fast-track some common cases */
+ if (src_ty == Ity_I32 && dst_ty == Ity_I32)
+ return assignNew(mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
+
+ if (src_ty == Ity_I64 && dst_ty == Ity_I64)
+ return assignNew(mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
+
+ if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
+ IRAtom* tmp = assignNew(mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
+ return assignNew(mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
+ }
+
+ /* Else do it the slow way .. */
+ tmp1 = NULL;
+ switch (src_ty) {
case Ity_I1:
tmp1 = vbits;
break;
@@ -536,7 +534,7 @@
break;
}
default:
- ppIRType(ty);
+ ppIRType(src_ty);
VG_(tool_panic)("mkPCastTo(1)");
}
tl_assert(tmp1);
@@ -1260,12 +1258,30 @@
IRAtom* mkLazyN ( MCEnv* mce,
IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
{
- Int i;
+ Int i;
IRAtom* here;
- IRAtom* curr = definedOfType(Ity_I32);
+ IRAtom* curr;
+ IRType mergeTy;
+ IRType mergeTy64 = True;
+
+ /* Decide on the type of the merge intermediary. If all relevant
+ args are I64, then it's I64. In all other circumstances, use
+ I32. */
for (i = 0; exprvec[i]; i++) {
tl_assert(i < 32);
tl_assert(isOriginalAtom(mce, exprvec[i]));
+ if (cee->mcx_mask & (1<<i))
+ continue;
+ if (typeOfIRExpr(mce->bb->tyenv, exprvec[i]) != Ity_I64)
+ mergeTy64 = False;
+ }
+
+ mergeTy = mergeTy64 ? Ity_I64 : Ity_I32;
+ curr = definedOfType(mergeTy);
+
+ for (i = 0; exprvec[i]; i++) {
+ tl_assert(i < 32);
+ tl_assert(isOriginalAtom(mce, exprvec[i]));
/* Only take notice of this arg if the callee's mc-exclusion
mask does not say it is to be excluded. */
if (cee->mcx_mask & (1<<i)) {
@@ -1275,8 +1291,10 @@
} else {
/* calculate the arg's definedness, and pessimistically merge
it in. */
- here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, exprvec[i]) );
- curr = mkUifU32(mce, here, curr);
+ here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i]) );
+ curr = mergeTy64
+ ? mkUifU64(mce, here, curr)
+ : mkUifU32(mce, here, curr);
}
}
return mkPCastTo(mce, finalVtype, curr );
|
|
From: <sv...@va...> - 2007-05-08 13:45:29
|
Author: sewardj
Date: 2007-05-08 14:45:27 +0100 (Tue, 08 May 2007)
New Revision: 1769
Log:
This commit provides a bunch of enhancements to the IR optimiser
(iropt) and to the various backend instruction selectors.
Unfortunately the changes are interrelated and cannot easily be
committed in pieces in any meaningful way. Between them and the
already-committed register allocation enhancements (r1765, r1767)
performance of Memcheck is improved by 0%-10%. Improvements are also
applicable to other tools to lesser extents.
Main changes are:
* Add new IR primops Iop_Left64/32/16/8 and Iop_CmpwNEZ64/32/16/8
which Memcheck uses to express some primitive operations on
definedness (V) bits:
Left(x) = set all bits to the left of the rightmost 1 bit to 1
CmpwNEZ(x) = if x == 0 then 0 else 0xFF...FF
Left and CmpwNEZ are detailed in the Usenix 2005 paper (in which
CmpwNEZ is called PCast). The new primops expose opportunities for
IR optimisation at tree-build time. Prior to this change Memcheck
expressed Left and CmpwNEZ in terms of lower level primitives
(logical or, negation, compares, various casts) which was simpler
but hindered further optimisation.
* Enhance the IR optimiser's tree builder so it can rewrite trees
as they are constructed, according to useful identities, for example:
CmpwNEZ64( Or64 ( CmpwNEZ64(x), y ) ) --> CmpwNEZ64( Or64( x, y ) )
which gets rid of a CmpwNEZ64 operation - a win as they are relatively
expensive. See functions fold_IRExpr_Binop and fold_IRExpr_Unop.
Allowing the tree builder to rewrite trees also makes it possible to
have a single implementation of certain transformation rules which
were previously duplicated in the x86, amd64 and ppc instruction
selectors. For example
32to1(1Uto32(x)) --> x
This simplifies the instruction selectors and gives a central place
to put such IR-level transformations, which is a Good Thing.
* Various minor refinements to the instruction selectors:
- ppc64 generates 32Sto64 into 1 instruction instead of 2
- x86 can now generate movsbl
- x86 handles 64-bit integer Mux0X better for cases typically
arising from Memchecking of FP code
- misc other patterns handled better
Overall these changes are a straight win - vex generates less code,
and does so a bit faster since its register allocator has to chew
through fewer instructions. The main risk is that of correctness:
making Left and CmpwNEZ explicit, and adding rewrite rules for them,
is a substantial change in the way Memcheck deals with undefined value
tracking, and I am concerned to ensure that the changes do not cause
false negatives. I _think_ it's all correct so far.
Modified:
branches/CGTUNE/priv/host-amd64/isel.c
branches/CGTUNE/priv/host-ppc/hdefs.c
branches/CGTUNE/priv/host-ppc/isel.c
branches/CGTUNE/priv/host-x86/hdefs.c
branches/CGTUNE/priv/host-x86/isel.c
branches/CGTUNE/priv/ir/irdefs.c
branches/CGTUNE/priv/ir/iropt.c
branches/CGTUNE/priv/main/vex_util.c
branches/CGTUNE/pub/libvex_ir.h
Modified: branches/CGTUNE/priv/host-amd64/isel.c
===================================================================
--- branches/CGTUNE/priv/host-amd64/isel.c 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/priv/host-amd64/isel.c 2007-05-08 13:45:27 UTC (rev 1769)
@@ -1460,6 +1460,43 @@
return dst;
}
+ case Iop_CmpwNEZ64: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,dst));
+ addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
+ AMD64RMI_Reg(src), dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
+ return dst;
+ }
+
+ case Iop_CmpwNEZ32: {
+ HReg src = newVRegI(env);
+ HReg dst = newVRegI(env);
+ HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(pre,src));
+ addInstr(env, AMD64Instr_MovZLQ(src,src));
+ addInstr(env, mk_iMOVsd_RR(src,dst));
+ addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
+ AMD64RMI_Reg(src), dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
+ return dst;
+ }
+
+ case Iop_Left8:
+ case Iop_Left16:
+ case Iop_Left32:
+ case Iop_Left64: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src, dst));
+ addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
+ return dst;
+ }
+
case Iop_V128to32: {
HReg dst = newVRegI(env);
HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
@@ -1965,11 +2002,7 @@
static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
{
MatchInfo mi;
-//.. DECLARE_PATTERN(p_1Uto32_then_32to1);
-//.. DECLARE_PATTERN(p_1Sto32_then_32to1);
- DECLARE_PATTERN(p_1Uto64_then_64to1);
-
vassert(e);
vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
@@ -2002,30 +2035,6 @@
/* --- patterns rooted at: 64to1 --- */
- /* 64to1(1Uto64(expr1)) ==> expr1 */
- DEFINE_PATTERN( p_1Uto64_then_64to1,
- unop(Iop_64to1, unop(Iop_1Uto64, bind(0))) );
- if (matchIRExpr(&mi,p_1Uto64_then_64to1,e)) {
- IRExpr* expr1 = mi.bindee[0];
- return iselCondCode(env, expr1);
- }
-
-//.. /* 32to1(1Uto32(expr1)) -- the casts are pointless, ignore them */
-//.. DEFINE_PATTERN(p_1Uto32_then_32to1,
-//.. unop(Iop_32to1,unop(Iop_1Uto32,bind(0))));
-//.. if (matchIRExpr(&mi,p_1Uto32_then_32to1,e)) {
-//.. IRExpr* expr1 = mi.bindee[0];
-//.. return iselCondCode(env, expr1);
-//.. }
-//..
-//.. /* 32to1(1Sto32(expr1)) -- the casts are pointless, ignore them */
-//.. DEFINE_PATTERN(p_1Sto32_then_32to1,
-//.. unop(Iop_32to1,unop(Iop_1Sto32,bind(0))));
-//.. if (matchIRExpr(&mi,p_1Sto32_then_32to1,e)) {
-//.. IRExpr* expr1 = mi.bindee[0];
-//.. return iselCondCode(env, expr1);
-//.. }
-
/* 64to1 */
if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
@@ -2168,53 +2177,6 @@
}
}
-//.. /* CmpNE64(1Sto64(b), 0) ==> b */
-//.. {
-//.. DECLARE_PATTERN(p_CmpNE64_1Sto64);
-//.. DEFINE_PATTERN(
-//.. p_CmpNE64_1Sto64,
-//.. binop(Iop_CmpNE64, unop(Iop_1Sto64,bind(0)), mkU64(0)));
-//.. if (matchIRExpr(&mi, p_CmpNE64_1Sto64, e)) {
-//.. return iselCondCode(env, mi.bindee[0]);
-//.. }
-//.. }
-//..
-//.. /* CmpNE64(x, 0) */
-//.. {
-//.. DECLARE_PATTERN(p_CmpNE64_x_zero);
-//.. DEFINE_PATTERN(
-//.. p_CmpNE64_x_zero,
-//.. binop(Iop_CmpNE64, bind(0), mkU64(0)) );
-//.. if (matchIRExpr(&mi, p_CmpNE64_x_zero, e)) {
-//.. HReg hi, lo;
-//.. IRExpr* x = mi.bindee[0];
-//.. HReg tmp = newVRegI(env);
-//.. iselInt64Expr( &hi, &lo, env, x );
-//.. addInstr(env, mk_iMOVsd_RR(hi, tmp));
-//.. addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
-//.. return Xcc_NZ;
-//.. }
-//.. }
-//..
-//.. /* CmpNE64 */
-//.. if (e->tag == Iex_Binop
-//.. && e->Iex.Binop.op == Iop_CmpNE64) {
-//.. HReg hi1, hi2, lo1, lo2;
-//.. HReg tHi = newVRegI(env);
-//.. HReg tLo = newVRegI(env);
-//.. iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
-//.. iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
-//.. addInstr(env, mk_iMOVsd_RR(hi1, tHi));
-//.. addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
-//.. addInstr(env, mk_iMOVsd_RR(lo1, tLo));
-//.. addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
-//.. addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
-//.. switch (e->Iex.Binop.op) {
-//.. case Iop_CmpNE64: return Xcc_NZ;
-//.. default: vpanic("iselCondCode(x86): CmpXX64");
-//.. }
-//.. }
-
ppIRExpr(e);
vpanic("iselCondCode(amd64)");
}
Modified: branches/CGTUNE/priv/host-ppc/hdefs.c
===================================================================
--- branches/CGTUNE/priv/host-ppc/hdefs.c 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/priv/host-ppc/hdefs.c 2007-05-08 13:45:27 UTC (rev 1769)
@@ -2706,7 +2706,13 @@
/* srawi (PPC32 p507) */
UInt n = srcR->Prh.Imm.imm16;
vassert(!srcR->Prh.Imm.syned);
- vassert(n > 0 && n < 32);
+ /* In 64-bit mode, we allow right shifts by zero bits
+ as that is a handy way to sign extend the lower 32
+ bits into the upper 32 bits. */
+ if (mode64)
+ vassert(n >= 0 && n < 32);
+ else
+ vassert(n > 0 && n < 32);
p = mkFormX(p, 31, r_srcL, r_dst, n, 824, 0);
} else {
/* sraw (PPC32 p506) */
Modified: branches/CGTUNE/priv/host-ppc/isel.c
===================================================================
--- branches/CGTUNE/priv/host-ppc/isel.c 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/priv/host-ppc/isel.c 2007-05-08 13:45:27 UTC (rev 1769)
@@ -1569,8 +1569,7 @@
return r_dst;
}
case Iop_8Sto64:
- case Iop_16Sto64:
- case Iop_32Sto64: {
+ case Iop_16Sto64: {
HReg r_dst = newVRegI(env);
HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
UShort amt = toUShort(op_unop==Iop_8Sto64 ? 56 :
@@ -1584,6 +1583,17 @@
r_dst, r_dst, PPCRH_Imm(False,amt)));
return r_dst;
}
+ case Iop_32Sto64: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ vassert(mode64);
+ /* According to the IBM docs, in 64 bit mode, srawi r,r,0
+ sign extends the lower 32 bits into the upper 32 bits. */
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ r_dst, r_src, PPCRH_Imm(False,0)));
+ return r_dst;
+ }
case Iop_Not8:
case Iop_Not16:
case Iop_Not32:
@@ -1707,6 +1717,40 @@
return r_dst;
}
+ case Iop_Left8:
+ case Iop_Left32:
+ case Iop_Left64: {
+ HReg r_src, r_dst;
+ if (op_unop == Iop_Left64 && !mode64)
+ goto irreducible;
+ r_dst = newVRegI(env);
+ r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Unary(Pun_NEG,r_dst,r_src));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_dst, r_dst, PPCRH_Reg(r_src)));
+ return r_dst;
+ }
+
+ case Iop_CmpwNEZ32: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Unary(Pun_NEG,r_dst,r_src));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_dst, r_dst, PPCRH_Reg(r_src)));
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False, 31)));
+ return r_dst;
+ }
+
+ case Iop_CmpwNEZ64: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ if (!mode64) goto irreducible;
+ addInstr(env, PPCInstr_Unary(Pun_NEG,r_dst,r_src));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_dst, r_dst, PPCRH_Reg(r_src)));
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, False/*64bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False, 63)));
+ return r_dst;
+ }
+
case Iop_V128to32: {
HReg r_aligned16;
HReg dst = newVRegI(env);
@@ -2685,6 +2729,24 @@
if (e->tag == Iex_Unop) {
switch (e->Iex.Unop.op) {
+ /* CmpwNEZ64(e) */
+ case Iop_CmpwNEZ64: {
+ HReg argHi, argLo;
+ HReg tmp1 = newVRegI(env);
+ HReg tmp2 = newVRegI(env);
+ iselInt64Expr(&argHi, &argLo, env, e->Iex.Unop.arg);
+ /* tmp1 = argHi | argLo */
+ addInstr(env, PPCInstr_Alu(Palu_OR, tmp1, argHi, PPCRH_Reg(argLo)));
+ /* tmp2 = (tmp1 | -tmp1) >>s 31 */
+ addInstr(env, PPCInstr_Unary(Pun_NEG,tmp2,tmp1));
+ addInstr(env, PPCInstr_Alu(Palu_OR, tmp2, tmp2, PPCRH_Reg(tmp1)));
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ tmp2, tmp2, PPCRH_Imm(False, 31)));
+ *rHi = tmp2;
+ *rLo = tmp2; /* yes, really tmp2 */
+ return;
+ }
+
/* 32Sto64(e) */
case Iop_32Sto64: {
HReg tHi = newVRegI(env);
Modified: branches/CGTUNE/priv/host-x86/hdefs.c
===================================================================
--- branches/CGTUNE/priv/host-x86/hdefs.c 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/priv/host-x86/hdefs.c 2007-05-08 13:45:27 UTC (rev 1769)
@@ -1612,7 +1612,7 @@
/* The given instruction reads the specified vreg exactly once, and
that vreg is currently located at the given spill offset. If
- possible, return a variant of the instruction which instead
+ possible, return a variant of the instruction to one which instead
references the spill slot directly. */
X86Instr* directReload_X86( X86Instr* i, HReg vreg, Short spill_off )
@@ -2407,6 +2407,13 @@
p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
goto done;
}
+ if (i->Xin.LoadEX.szSmall == 1 && i->Xin.LoadEX.syned) {
+ /* movsbl */
+ *p++ = 0x0F;
+ *p++ = 0xBE;
+ p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
+ goto done;
+ }
break;
case Xin_Set32:
Modified: branches/CGTUNE/priv/host-x86/isel.c
===================================================================
--- branches/CGTUNE/priv/host-x86/isel.c 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/priv/host-x86/isel.c 2007-05-08 13:45:27 UTC (rev 1769)
@@ -120,7 +120,14 @@
&& e->Iex.Const.con->Ico.U8 == 0;
}
+static Bool isZeroU64 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U64
+ && e->Iex.Const.con->Ico.U64 == 0ULL;
+}
+
/*---------------------------------------------------------*/
/*--- ISelEnv ---*/
/*---------------------------------------------------------*/
@@ -730,7 +737,6 @@
static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
{
MatchInfo mi;
- DECLARE_PATTERN(p_32to1_then_1Uto8);
IRType ty = typeOfIRExpr(env->type_env,e);
vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
@@ -1011,21 +1017,53 @@
/* --------- UNARY OP --------- */
case Iex_Unop: {
+
/* 1Uto8(32to1(expr32)) */
- DEFINE_PATTERN(p_32to1_then_1Uto8,
- unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
- if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
- IRExpr* expr32 = mi.bindee[0];
- HReg dst = newVRegI(env);
- HReg src = iselIntExpr_R(env, expr32);
- addInstr(env, mk_iMOVsd_RR(src,dst) );
- addInstr(env, X86Instr_Alu32R(Xalu_AND,
- X86RMI_Imm(1), dst));
- return dst;
+ if (e->Iex.Unop.op == Iop_1Uto8) {
+ DECLARE_PATTERN(p_32to1_then_1Uto8);
+ DEFINE_PATTERN(p_32to1_then_1Uto8,
+ unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
+ if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
+ IRExpr* expr32 = mi.bindee[0];
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, expr32);
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, X86Instr_Alu32R(Xalu_AND,
+ X86RMI_Imm(1), dst));
+ return dst;
+ }
}
+ /* 8Uto32(LDle(expr32)) */
+ if (e->Iex.Unop.op == Iop_8Uto32) {
+ DECLARE_PATTERN(p_LDle8_then_8Uto32);
+ DEFINE_PATTERN(p_LDle8_then_8Uto32,
+ unop(Iop_8Uto32,
+ IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
+ HReg dst = newVRegI(env);
+ X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
+ return dst;
+ }
+ }
+
+ /* 8Sto32(LDle(expr32)) */
+ if (e->Iex.Unop.op == Iop_8Sto32) {
+ DECLARE_PATTERN(p_LDle8_then_8Sto32);
+ DEFINE_PATTERN(p_LDle8_then_8Sto32,
+ unop(Iop_8Sto32,
+ IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
+ HReg dst = newVRegI(env);
+ X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
+ return dst;
+ }
+ }
+
/* 16Uto32(LDle(expr32)) */
- {
+ if (e->Iex.Unop.op == Iop_16Uto32) {
DECLARE_PATTERN(p_LDle16_then_16Uto32);
DEFINE_PATTERN(p_LDle16_then_16Uto32,
unop(Iop_16Uto32,
@@ -1038,6 +1076,34 @@
}
}
+ /* 8Uto32(GET:I8) */
+ if (e->Iex.Unop.op == Iop_8Uto32) {
+ if (e->Iex.Unop.arg->tag == Iex_Get) {
+ HReg dst;
+ X86AMode* amode;
+ vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
+ dst = newVRegI(env);
+ amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
+ hregX86_EBP());
+ addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
+ return dst;
+ }
+ }
+
+ /* 16to32(GET:I16) */
+ if (e->Iex.Unop.op == Iop_16Uto32) {
+ if (e->Iex.Unop.arg->tag == Iex_Get) {
+ HReg dst;
+ X86AMode* amode;
+ vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
+ dst = newVRegI(env);
+ amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
+ hregX86_EBP());
+ addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
+ return dst;
+ }
+ }
+
switch (e->Iex.Unop.op) {
case Iop_8Uto16:
case Iop_8Uto32:
@@ -1138,6 +1204,27 @@
return dst;
}
+ case Iop_CmpwNEZ32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,dst));
+ addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(src), dst));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
+ return dst;
+ }
+ case Iop_Left8:
+ case Iop_Left16:
+ case Iop_Left32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src, dst));
+ addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
+ return dst;
+ }
+
case Iop_V128to32: {
HReg dst = newVRegI(env);
HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
@@ -1547,9 +1634,6 @@
static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
{
MatchInfo mi;
- DECLARE_PATTERN(p_32to1);
- DECLARE_PATTERN(p_1Uto32_then_32to1);
- DECLARE_PATTERN(p_1Sto32_then_32to1);
vassert(e);
vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
@@ -1582,28 +1666,9 @@
/* --- patterns rooted at: 32to1 --- */
- /* 32to1(1Uto32(e)) ==> e */
- DEFINE_PATTERN(p_1Uto32_then_32to1,
- unop(Iop_32to1,unop(Iop_1Uto32,bind(0))));
- if (matchIRExpr(&mi,p_1Uto32_then_32to1,e)) {
- IRExpr* expr1 = mi.bindee[0];
- return iselCondCode(env, expr1);
- }
-
- /* 32to1(1Sto32(e)) ==> e */
- DEFINE_PATTERN(p_1Sto32_then_32to1,
- unop(Iop_32to1,unop(Iop_1Sto32,bind(0))));
- if (matchIRExpr(&mi,p_1Sto32_then_32to1,e)) {
- IRExpr* expr1 = mi.bindee[0];
- return iselCondCode(env, expr1);
- }
-
- /* 32to1(expr32) */
- DEFINE_PATTERN(p_32to1,
- unop(Iop_32to1,bind(0))
- );
- if (matchIRExpr(&mi,p_32to1,e)) {
- X86RM* rm = iselIntExpr_RM(env, mi.bindee[0]);
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_32to1) {
+ X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
addInstr(env, X86Instr_Test32(1,rm));
return Xcc_NZ;
}
@@ -1630,16 +1695,6 @@
/* --- patterns rooted at: CmpNEZ32 --- */
- /* CmpNEZ32(1Sto32(b)) ==> b */
- {
- DECLARE_PATTERN(p_CmpNEZ32_1Sto32);
- DEFINE_PATTERN(p_CmpNEZ32_1Sto32,
- unop(Iop_CmpNEZ32, unop(Iop_1Sto32,bind(0))));
- if (matchIRExpr(&mi, p_CmpNEZ32_1Sto32, e)) {
- return iselCondCode(env, mi.bindee[0]);
- }
- }
-
/* CmpNEZ32(And32(x,y)) */
{
DECLARE_PATTERN(p_CmpNEZ32_And32);
@@ -1670,6 +1725,16 @@
}
}
+ /* CmpNEZ32(GET(..):I32) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ32
+ && e->Iex.Unop.arg->tag == Iex_Get) {
+ X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
+ hregX86_EBP());
+ addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
+ return Xcc_NZ;
+ }
+
/* CmpNEZ32(x) */
if (e->tag == Iex_Unop
&& e->Iex.Unop.op == Iop_CmpNEZ32) {
@@ -1681,17 +1746,6 @@
/* --- patterns rooted at: CmpNEZ64 --- */
- /* CmpNEZ64(1Sto64(b)) ==> b */
- {
- DECLARE_PATTERN(p_CmpNEZ64_1Sto64);
- DEFINE_PATTERN(
- p_CmpNEZ64_1Sto64,
- unop(Iop_CmpNEZ64, unop(Iop_1Sto64,bind(0))));
- if (matchIRExpr(&mi, p_CmpNEZ64_1Sto64, e)) {
- return iselCondCode(env, mi.bindee[0]);
- }
- }
-
/* CmpNEZ64(Or64(x,y)) */
{
DECLARE_PATTERN(p_CmpNEZ64_Or64);
@@ -1839,6 +1893,7 @@
/* DO NOT CALL THIS DIRECTLY ! */
static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
{
+ MatchInfo mi;
HWord fn = 0; /* helper fn for most SIMD64 stuff */
vassert(e);
vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
@@ -1915,18 +1970,59 @@
return;
}
- /* 64-bit Mux0X */
+ /* 64-bit Mux0X: Mux0X(g, expr, 0:I64) */
+ if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.exprX)) {
+ X86RM* r8;
+ HReg e0Lo, e0Hi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
+ r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
+ addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
+ addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
+ addInstr(env, X86Instr_Test32(0xFF, r8));
+ addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tHi));
+ addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tLo));
+ add_to_esp(env, 4);
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+ /* 64-bit Mux0X: Mux0X(g, 0:I64, expr) */
+ if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.expr0)) {
+ X86RM* r8;
+ HReg e0Lo, e0Hi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.exprX);
+ r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
+ addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
+ addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
+ addInstr(env, X86Instr_Test32(0xFF, r8));
+ addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tHi));
+ addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tLo));
+ add_to_esp(env, 4);
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64-bit Mux0X: Mux0X(g, expr, expr) */
if (e->tag == Iex_Mux0X) {
- X86RM* rm8;
- HReg e0Lo, e0Hi, eXLo, eXHi;
- HReg tLo = newVRegI(env);
- HReg tHi = newVRegI(env);
+ X86RM* r8;
+ HReg e0Lo, e0Hi, eXLo, eXHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
- rm8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
- addInstr(env, X86Instr_Test32(0xFF, rm8));
+ r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ addInstr(env, X86Instr_Test32(0xFF, r8));
/* This assumes the first cmov32 doesn't trash the condition
codes, so they are still available for the second cmov32 */
addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
@@ -1992,10 +2088,10 @@
: e->Iex.Binop.op==Iop_And64 ? Xalu_AND
: Xalu_XOR;
iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
addInstr(env, mk_iMOVsd_RR(xHi, tHi));
+ addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
addInstr(env, mk_iMOVsd_RR(xLo, tLo));
- iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
- addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
*rHi = tHi;
*rLo = tLo;
@@ -2416,6 +2512,65 @@
return;
}
+ /* --- patterns rooted at: CmpwNEZ64 --- */
+
+ /* CmpwNEZ64(e) */
+ case Iop_CmpwNEZ64: {
+
+ DECLARE_PATTERN(p_CmpwNEZ64_Or64);
+ DEFINE_PATTERN(p_CmpwNEZ64_Or64,
+ unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
+ if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
+ /* CmpwNEZ64(Or64(x,y)) */
+ HReg xHi,xLo,yHi,yLo;
+ HReg xBoth = newVRegI(env);
+ HReg merged = newVRegI(env);
+ HReg tmp2 = newVRegI(env);
+
+ iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
+ addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(xLo),xBoth));
+
+ iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
+ addInstr(env, mk_iMOVsd_RR(yHi,merged));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(yLo),merged));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(xBoth),merged));
+
+ /* tmp2 = (merged | -merged) >>s 31 */
+ addInstr(env, mk_iMOVsd_RR(merged,tmp2));
+ addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(merged), tmp2));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
+ *rHi = tmp2;
+ *rLo = tmp2;
+ return;
+ } else {
+ /* CmpwNEZ64(e) */
+ HReg srcLo, srcHi;
+ HReg tmp1 = newVRegI(env);
+ HReg tmp2 = newVRegI(env);
+ /* srcHi:srcLo = arg */
+ iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
+ /* tmp1 = srcHi | srcLo */
+ addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(srcLo), tmp1));
+ /* tmp2 = (tmp1 | -tmp1) >>s 31 */
+ addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
+ addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(tmp1), tmp2));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
+ *rHi = tmp2;
+ *rLo = tmp2;
+ return;
+ }
+ }
+
/* ReinterpF64asI64(e) */
/* Given an IEEE754 double, produce an I64 with the same bit
pattern. */
@@ -2829,12 +2984,12 @@
if (e->tag == Iex_Mux0X) {
if (ty == Ity_F64
&& typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
- X86RM* rm8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
- HReg rX = iselDblExpr(env, e->Iex.Mux0X.exprX);
- HReg r0 = iselDblExpr(env, e->Iex.Mux0X.expr0);
- HReg dst = newVRegF(env);
+ X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ HReg rX = iselDblExpr(env, e->Iex.Mux0X.exprX);
+ HReg r0 = iselDblExpr(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegF(env);
addInstr(env, X86Instr_FpUnary(Xfp_MOV,rX,dst));
- addInstr(env, X86Instr_Test32(0xFF, rm8));
+ addInstr(env, X86Instr_Test32(0xFF, r8));
addInstr(env, X86Instr_FpCMov(Xcc_Z,r0,dst));
return dst;
}
@@ -3350,12 +3505,12 @@
} /* if (e->tag == Iex_Binop) */
if (e->tag == Iex_Mux0X) {
- X86RM* rm8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
- HReg rX = iselVecExpr(env, e->Iex.Mux0X.exprX);
- HReg r0 = iselVecExpr(env, e->Iex.Mux0X.expr0);
- HReg dst = newVRegV(env);
+ X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ HReg rX = iselVecExpr(env, e->Iex.Mux0X.exprX);
+ HReg r0 = iselVecExpr(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegV(env);
addInstr(env, mk_vMOVsd_RR(rX,dst));
- addInstr(env, X86Instr_Test32(0xFF, rm8));
+ addInstr(env, X86Instr_Test32(0xFF, r8));
addInstr(env, X86Instr_SseCMov(Xcc_Z,r0,dst));
return dst;
}
Modified: branches/CGTUNE/priv/ir/irdefs.c
===================================================================
--- branches/CGTUNE/priv/ir/irdefs.c 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/priv/ir/irdefs.c 2007-05-08 13:45:27 UTC (rev 1769)
@@ -203,6 +203,14 @@
case Iop_CmpNEZ32: vex_printf("CmpNEZ32"); return;
case Iop_CmpNEZ64: vex_printf("CmpNEZ64"); return;
+ case Iop_CmpwNEZ32: vex_printf("CmpwNEZ32"); return;
+ case Iop_CmpwNEZ64: vex_printf("CmpwNEZ64"); return;
+
+ case Iop_Left8: vex_printf("Left8"); return;
+ case Iop_Left16: vex_printf("Left16"); return;
+ case Iop_Left32: vex_printf("Left32"); return;
+ case Iop_Left64: vex_printf("Left64"); return;
+
case Iop_CmpORD32U: vex_printf("CmpORD32U"); return;
case Iop_CmpORD32S: vex_printf("CmpORD32S"); return;
@@ -1547,6 +1555,11 @@
case Iop_CmpNEZ32: UNARY_COMPARISON(Ity_I32);
case Iop_CmpNEZ64: UNARY_COMPARISON(Ity_I64);
+ case Iop_Left8: UNARY(Ity_I8, Ity_I8);
+ case Iop_Left16: UNARY(Ity_I16,Ity_I16);
+ case Iop_CmpwNEZ32: case Iop_Left32: UNARY(Ity_I32,Ity_I32);
+ case Iop_CmpwNEZ64: case Iop_Left64: UNARY(Ity_I64,Ity_I64);
+
case Iop_MullU8: case Iop_MullS8:
BINARY(Ity_I8,Ity_I8, Ity_I16);
case Iop_MullU16: case Iop_MullS16:
Modified: branches/CGTUNE/priv/ir/iropt.c
===================================================================
--- branches/CGTUNE/priv/ir/iropt.c 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/priv/ir/iropt.c 2007-05-08 13:45:27 UTC (rev 1769)
@@ -1072,6 +1072,39 @@
)));
break;
+ case Iop_CmpwNEZ32: {
+ UInt w32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
+ if (w32 == 0)
+ e2 = IRExpr_Const(IRConst_U32( 0 ));
+ else
+ e2 = IRExpr_Const(IRConst_U32( 0xFFFFFFFF ));
+ break;
+ }
+ case Iop_CmpwNEZ64: {
+ ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
+ if (w64 == 0)
+ e2 = IRExpr_Const(IRConst_U64( 0 ));
+ else
+ e2 = IRExpr_Const(IRConst_U64( 0xFFFFFFFFFFFFFFFFULL ));
+ break;
+ }
+
+ case Iop_Left32: {
+ UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
+ Int s32 = (Int)(u32 & 0xFFFFFFFF);
+ s32 = (s32 | (-s32));
+ e2 = IRExpr_Const( IRConst_U32( (UInt)s32 ));
+ break;
+ }
+
+ case Iop_Left64: {
+ ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
+ Long s64 = (Long)u64;
+ s64 = (s64 | (-s64));
+ e2 = IRExpr_Const( IRConst_U64( (ULong)s64 ));
+ break;
+ }
+
default:
goto unhandled;
}
@@ -1465,13 +1498,20 @@
e2 = IRExpr_Const(IRConst_U32(0));
} else
- /* And32(0,x) ==> 0 */
- if (e->Iex.Binop.op == Iop_And32
+ /* And32/Shl32(0,x) ==> 0 */
+ if ((e->Iex.Binop.op == Iop_And32 || e->Iex.Binop.op == Iop_Shl32)
&& e->Iex.Binop.arg1->tag == Iex_Const
&& e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 == 0) {
e2 = IRExpr_Const(IRConst_U32(0));
} else
+ /* Or8(0,x) ==> x */
+ if (e->Iex.Binop.op == Iop_Or8
+ && e->Iex.Binop.arg1->tag == Iex_Const
+ && e->Iex.Binop.arg1->Iex.Const.con->Ico.U8 == 0) {
+ e2 = e->Iex.Binop.arg2;
+ } else
+
/* Or32(0,x) ==> x */
if (e->Iex.Binop.op == Iop_Or32
&& e->Iex.Binop.arg1->tag == Iex_Const
@@ -3698,6 +3738,94 @@
'single-shot', so once a binding is used, it is marked as no longer
available, by setting its .bindee field to NULL. */
+static inline Bool is_Unop ( IRExpr* e, IROp op ) {
+ return e->tag == Iex_Unop && e->Iex.Unop.op == op;
+}
+static inline Bool is_Binop ( IRExpr* e, IROp op ) {
+ return e->tag == Iex_Binop && e->Iex.Binop.op == op;
+}
+
+static IRExpr* fold_IRExpr_Binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ switch (op) {
+ case Iop_Or32:
+ /* Or32( CmpwNEZ32(x), CmpwNEZ32(y) ) --> CmpwNEZ32( Or32( x, y ) ) */
+ if (is_Unop(a1, Iop_CmpwNEZ32) && is_Unop(a2, Iop_CmpwNEZ32))
+ return IRExpr_Unop( Iop_CmpwNEZ32,
+ IRExpr_Binop( Iop_Or32, a1->Iex.Unop.arg,
+ a2->Iex.Unop.arg ) );
+ break;
+ default:
+ break;
+ }
+ /* no reduction rule applies */
+ return IRExpr_Binop( op, a1, a2 );
+}
+
+static IRExpr* fold_IRExpr_Unop ( IROp op, IRExpr* aa )
+{
+ switch (op) {
+ case Iop_CmpwNEZ64:
+ /* CmpwNEZ64( Or64 ( CmpwNEZ64(x), y ) ) --> CmpwNEZ64( Or64( x, y ) ) */
+ if (is_Binop(aa, Iop_Or64)
+ && is_Unop(aa->Iex.Binop.arg1, Iop_CmpwNEZ64))
+ return fold_IRExpr_Unop(
+ Iop_CmpwNEZ64,
+ IRExpr_Binop(Iop_Or64,
+ aa->Iex.Binop.arg1->Iex.Unop.arg,
+ aa->Iex.Binop.arg2));
+ /* CmpwNEZ64( Or64 ( x, CmpwNEZ64(y) ) ) --> CmpwNEZ64( Or64( x, y ) ) */
+ if (is_Binop(aa, Iop_Or64)
+ && is_Unop(aa->Iex.Binop.arg2, Iop_CmpwNEZ64))
+ return fold_IRExpr_Unop(
+ Iop_CmpwNEZ64,
+ IRExpr_Binop(Iop_Or64,
+ aa->Iex.Binop.arg1,
+ aa->Iex.Binop.arg2->Iex.Unop.arg));
+ break;
+ case Iop_CmpNEZ64:
+ /* CmpNEZ64( Left64(x) ) --> CmpNEZ64(x) */
+ if (is_Unop(aa, Iop_Left64))
+ return IRExpr_Unop(Iop_CmpNEZ64, aa->Iex.Unop.arg);
+ break;
+ case Iop_CmpwNEZ32:
+ /* CmpwNEZ32( CmpwNEZ32 ( x ) ) --> CmpwNEZ32 ( x ) */
+ if (is_Unop(aa, Iop_CmpwNEZ32))
+ return IRExpr_Unop( Iop_CmpwNEZ32, aa->Iex.Unop.arg );
+ break;
+ case Iop_CmpNEZ32:
+ /* CmpNEZ32( Left32(x) ) --> CmpNEZ32(x) */
+ if (is_Unop(aa, Iop_Left32))
+ return IRExpr_Unop(Iop_CmpNEZ32, aa->Iex.Unop.arg);
+ break;
+ case Iop_Left32:
+ /* Left32( Left32(x) ) --> Left32(x) */
+ if (is_Unop(aa, Iop_Left32))
+ return IRExpr_Unop( Iop_Left32, aa->Iex.Unop.arg );
+ break;
+ case Iop_32to1:
+ /* 32to1( 1Uto32 ( x ) ) --> x */
+ if (is_Unop(aa, Iop_1Uto32))
+ return aa->Iex.Unop.arg;
+ /* 32to1( CmpwNEZ32 ( x )) --> CmpNEZ32(x) */
+ if (is_Unop(aa, Iop_CmpwNEZ32))
+ return IRExpr_Unop( Iop_CmpNEZ32, aa->Iex.Unop.arg );
+ break;
+ case Iop_64to1:
+ /* 64to1( 1Uto64 ( x ) ) --> x */
+ if (is_Unop(aa, Iop_1Uto64))
+ return aa->Iex.Unop.arg;
+ /* 64to1( CmpwNEZ64 ( x )) --> CmpNEZ64(x) */
+ if (is_Unop(aa, Iop_CmpwNEZ64))
+ return IRExpr_Unop( Iop_CmpNEZ64, aa->Iex.Unop.arg );
+ break;
+ default:
+ break;
+ }
+ /* no reduction rule applies */
+ return IRExpr_Unop( op, aa );
+}
+
static IRExpr* atbSubst_Expr ( ATmpInfo* env, IRExpr* e )
{
IRExpr* e2;
@@ -3740,13 +3868,13 @@
atbSubst_Expr(env, e->Iex.Triop.arg3)
);
case Iex_Binop:
- return IRExpr_Binop(
+ return fold_IRExpr_Binop(
e->Iex.Binop.op,
atbSubst_Expr(env, e->Iex.Binop.arg1),
atbSubst_Expr(env, e->Iex.Binop.arg2)
);
case Iex_Unop:
- return IRExpr_Unop(
+ return fold_IRExpr_Unop(
e->Iex.Unop.op,
atbSubst_Expr(env, e->Iex.Unop.arg)
);
Modified: branches/CGTUNE/priv/main/vex_util.c
===================================================================
--- branches/CGTUNE/priv/main/vex_util.c 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/priv/main/vex_util.c 2007-05-08 13:45:27 UTC (rev 1769)
@@ -441,6 +441,10 @@
PAD(len1); PUT('0'); PUT('x'); PUTSTR(intbuf); PAD(len3);
break;
}
+ case '%': {
+ PUT('%');
+ break;
+ }
default:
/* no idea what it is. Print the format literally and
move on. */
Modified: branches/CGTUNE/pub/libvex_ir.h
===================================================================
--- branches/CGTUNE/pub/libvex_ir.h 2007-05-07 18:36:48 UTC (rev 1768)
+++ branches/CGTUNE/pub/libvex_ir.h 2007-05-08 13:45:27 UTC (rev 1769)
@@ -445,6 +445,8 @@
/* As a sop to Valgrind-Memcheck, the following are useful. */
Iop_CmpNEZ8, Iop_CmpNEZ16, Iop_CmpNEZ32, Iop_CmpNEZ64,
+ Iop_CmpwNEZ32, Iop_CmpwNEZ64, /* all-0s -> all-0s; other -> all-1s */
+ Iop_Left8, Iop_Left16, Iop_Left32, Iop_Left64, /* \x -> x | -x */
/* PowerPC-style 3-way integer comparisons. Without them it is
difficult to simulate PPC efficiently.
|
|
From: <js...@ac...> - 2007-05-08 13:44:08
|
Nightly build on minnie ( SuSE 10.0, ppc32 ) started at 2007-05-08 09:00:01 BST Results unchanged from 24 hours ago Checking out valgrind source tree ... done Configuring valgrind ... done Building valgrind ... done Running regression tests ... failed Regression test results follow == 219 tests, 10 stderr failures, 6 stdout failures, 0 posttest failures == memcheck/tests/leak-tree (stderr) memcheck/tests/leakotron (stdout) memcheck/tests/pointer-trace (stderr) memcheck/tests/stack_changes (stderr) memcheck/tests/xml1 (stderr) none/tests/faultstatus (stderr) none/tests/fdleak_cmsg (stderr) none/tests/mremap (stderr) none/tests/mremap2 (stdout) none/tests/ppc32/jm-fp (stdout) none/tests/ppc32/jm-fp (stderr) none/tests/ppc32/round (stdout) none/tests/ppc32/round (stderr) none/tests/ppc32/test_fx (stdout) none/tests/ppc32/test_fx (stderr) none/tests/ppc32/test_gx (stdout) |
|
From: <sv...@va...> - 2007-05-08 12:07:53
|
Author: weidendo
Date: 2007-05-08 13:07:52 +0100 (Tue, 08 May 2007)
New Revision: 6735
Log:
Update
Not really a bug, but the callgrind doc improvement is useful
on the 3.2 branch as well.
Modified:
trunk/docs/internals/3_2_BUGSTATUS.txt
Modified: trunk/docs/internals/3_2_BUGSTATUS.txt
===================================================================
--- trunk/docs/internals/3_2_BUGSTATUS.txt 2007-05-08 11:58:23 UTC (rev 6734)
+++ trunk/docs/internals/3_2_BUGSTATUS.txt 2007-05-08 12:07:52 UTC (rev 6735)
@@ -42,6 +42,8 @@
r6601 r6712 32 n-i-bz callgrind: Fix threads display
of "callgrind_control -s"
+r6734 pending n-i-nz Callgrind: improve documentation
+
r6622 r6713 32 n-i-bz .eh_frame crud for m_trampoline.S fns
r6630 pending n-i-bz add additional ptrace reason codes
|
|
From: <sv...@va...> - 2007-05-08 11:58:24
|
Author: weidendo
Date: 2007-05-08 12:58:23 +0100 (Tue, 08 May 2007)
New Revision: 6734
Log:
Callgrind: improve documentation
* Looks a little bit more like the Cachegrind manual
(at least in front)
* Removed the out-of-place general section about profiling
and gprof. Perhaps something like this can be put at
another place
* Notes about Callgrinds problems with call tracing on PPC
* Include usage of callgrind_annotate, and note its lack of
cycle detection
Modified:
trunk/callgrind/docs/cl-manual.xml
Modified: trunk/callgrind/docs/cl-manual.xml
===================================================================
--- trunk/callgrind/docs/cl-manual.xml 2007-05-08 09:20:25 UTC (rev 6733)
+++ trunk/callgrind/docs/cl-manual.xml 2007-05-08 11:58:23 UTC (rev 6734)
@@ -10,11 +10,13 @@
<sect1 id="cl-manual.use" xreflabel="Overview">
<title>Overview</title>
-<para>Callgrind is a Valgrind tool for profiling programs.
-The collected data consists of
-the number of instructions executed on a run, their relationship
+<para>Callgrind is a Valgrind tool for profiling programs
+with the ability to construct a call graph from the execution.
+By default, the collected data consists of
+the number of instructions executed, their attribution
to source lines, and
-call relationship among functions together with call counts.
+call relationship among functions together with number of
+actually executed calls.
Optionally, a cache simulator (similar to cachegrind) can produce
further information about the memory access behavior of the application.
</para>
@@ -27,7 +29,7 @@
<term><command>callgrind_annotate</command></term>
<listitem>
<para>This command reads in the profile data, and prints a
- sorted lists of functions, optionally with annotation.</para>
+ sorted lists of functions, optionally with source annotation.</para>
<!--
<para>You can read the manpage here: <xref
linkend="callgrind-annotate"/>.</para>
@@ -44,8 +46,8 @@
<para>This command enables you to interactively observe and control
the status of currently running applications, without stopping
the application. You can
- get statistics information, the current stack trace, and request
- zeroing of counters, and dumping of profiles data.</para>
+ get statistics information as well as the current stack trace, and
+ you can request zeroing of counters or dumping of profile data.</para>
<!--
<para>You can read the manpage here: <xref linkend="callgrind-control"/>.</para>
-->
@@ -57,94 +59,55 @@
<computeroutput>--tool=callgrind</computeroutput> on the Valgrind
command line.</para>
+ <sect2 id="cl-manual.functionality" xreflabel="Functionality">
+ <title>Functionality</title>
+
+<para>Cachegrind provides a flat profile: event counts (reads, misses etc.)
+attributed to functions exactly represent events which happened while the
+function itself was running, which also is called <emphasis>self</emphasis>
+or <emphasis>exclusive</emphasis> cost. In addition, Callgrind further
+attributes call sites inside functions with event counts for events which
+happened while the call was active, ie. while code was executed which actually
+was called from the given call site. Adding these call costs to the self cost of
+a function gives the so called <emphasis>inclusive</emphasis> cost.
+As an example, inclusive cost of <computeroutput>main()</computeroutput> should
+be almost 100 percent (apart from any cost spent in startup before main, such as
+initialization of the run time linker or construction of global C++ objects).
+</para>
+
+<para>Together with the call graph, this allows you to see the call chains starting
+from <computeroutput>main()</computeroutput>, inside which most of the
+events were happening. This especially is useful for functions called from
+multiple call sites, and where any optimization makes sense only by changing
+code in the caller (e.g. by reducing the call count).</para>
+
<para>Callgrind's cache simulation is based on the
-<ulink url="&cg-tool-url;">Cachegrind tool</ulink> of the
-<ulink url="&vg-url;">Valgrind</ulink> package. Read
+<ulink url="&cg-tool-url;">Cachegrind tool</ulink>. Read
<ulink url="&cg-doc-url;">Cachegrind's documentation</ulink> first;
this page describes the features supported in addition to
Cachegrind's features.</para>
-</sect1>
+<para>Callgrinds ability to trace function call varies with the ISA of the
+platform it is run on. Its usage was specially tailored for x86 and amd64,
+and unfortunately, it currently happens to show quite bad call/return detection
+in PPC32/64 code (this is because there are only jump/branch instructions
+in the PPC ISA, and Callgrind has to rely on heuristics).</para>
-
-<sect1 id="cl-manual.purpose" xreflabel="Purpose">
-<title>Purpose</title>
-
-
- <sect2 id="cl-manual.devel"
- xreflabel="Profiling as part of Application Development">
- <title>Profiling as part of Application Development</title>
-
- <para>With application development, a common step is
- to improve runtime performance. To not waste time on
- optimizing functions which are rarely used, one needs to know
- in which parts of the program most of the time is spent.</para>
-
- <para>This is done with a technique called profiling. The program
- is run under control of a profiling tool, which gives the time
- distribution of executed functions in the run. After examination
- of the program's profile, it should be clear if and where optimization
- is useful. Afterwards, one should verify any runtime changes by another
- profile run.</para>
-
</sect2>
+ <sect2 id="cl-manual.basics" xreflabel="Basic Usage">
+ <title>Basic Usage</title>
- <sect2 id="cl-manual.tools" xreflabel="Profiling Tools">
- <title>Profiling Tools</title>
+ <para>As with Cachegrind, you probably want to compile with debugging info
+ (the -g flag), but with optimization turned on.</para>
- <para>Most widely known is the GCC profiling tool <command>GProf</command>:
- one needs to compile an application with the compiler option
- <computeroutput>-pg</computeroutput>. Running the program generates
- a file <computeroutput>gmon.out</computeroutput>, which can be
- transformed into human readable form with the command line tool
- <computeroutput>gprof</computeroutput>. A disadvantage here is the
- the need to recompile everything, and also the need to statically link the
- executable.</para>
-
- <para>Another profiling tool is <command>Cachegrind</command>, part
- of <ulink url="&vg-url;">Valgrind</ulink>. It uses the processor
- emulation of Valgrind to run the executable, and catches all memory
- accesses, which are used to drive a cache simulator.
- The program does not need to be
- recompiled, it can use shared libraries and plugins, and the profile
- measurement doesn't influence the memory access behaviour.
- The trace includes
- the number of instruction/data memory accesses and 1st/2nd level
- cache misses, and relates it to source lines and functions of the
- run program. A disadvantage is the slowdown involved in the
- processor emulation, around 50 times slower.</para>
-
- <para>Cachegrind can only deliver a flat profile. There is no call
- relationship among the functions of an application stored. Thus,
- inclusive costs, i.e. costs of a function including the cost of all
- functions called from there, cannot be calculated. Callgrind extends
- Cachegrind by including call relationship and exact event counts
- spent while doing a call.</para>
-
- <para>Because Callgrind (and Cachegrind) is based on simulation, the
- slowdown due to processing the synthetic runtime events does not
- influence the results. See <xref linkend="cl-manual.usage"/> for more
- details on the possibilities.</para>
-
- </sect2>
-
-</sect1>
-
-
-<sect1 id="cl-manual.usage" xreflabel="Usage">
-<title>Usage</title>
-
- <sect2 id="cl-manual.basics" xreflabel="Basics">
- <title>Basics</title>
-
<para>To start a profile run for a program, execute:
<screen>callgrind [callgrind options] your-program [program options]</screen>
</para>
<para>While the simulation is running, you can observe execution with
<screen>callgrind_control -b</screen>
- This will print out a current backtrace. To annotate the backtrace with
+ This will print out the current backtrace. To annotate the backtrace with
event counts, run
<screen>callgrind_control -e -b</screen>
</para>
@@ -152,26 +115,73 @@
<para>After program termination, a profile data file named
<computeroutput>callgrind.out.pid</computeroutput>
is generated with <emphasis>pid</emphasis> being the process ID
- of the execution of this profile run.</para>
-
- <para>The data file contains information about the calls made in the
+ of the execution of this profile run.
+ The data file contains information about the calls made in the
program among the functions executed, together with events of type
<command>Instruction Read Accesses</command> (Ir).</para>
+ <para>To generate a function-by-function summary from the profile
+ data file, use
+ <screen>callgrind_annotate [options] callgrind.out.pid</screen>
+ This summary is similar to the output you get from a Cachegrind
+ run with <computeroutput>cg_annotate</computeroutput>: the list
+ of functions is ordered by exclusive cost of functions, which also
+ are the ones that are shown.
+ Important for the additional features of Callgrind are
+ the following two options:</para>
+
+ <itemizedlist>
+ <listitem>
+ <para><option>--inclusive=yes</option>: Instead of using
+ exclusive cost of functions as sorting order, use and show
+ inclusive cost.</para>
+ </listitem>
+
+ <listitem>
+ <para><option>--tree=both</option>: Interleaved into the
+ ordered list of function, show the callers and the callees
+ of each function. In these lines, which represents executed
+ calls, the cost gives the number of events spent in the call.
+ Indented, above each given function, there is the list of callers,
+ and below, the list of callees. The sum of events in calls to
+ a given function (caller lines), as well as the sum of events in
+ calls from the function (callee lines) together with the self
+ cost, gives the total inclusive cost of the function.</para>
+ </listitem>
+ </itemizedlist>
+
+ <para>Use <option>--auto=yes</option> to get annotated source code
+ for all relevant functions for which the source can be found. In
+ addition to source annotation as produced by
+ <computeroutput>cg_annotate</computeroutput>, you will see the
+ annotated call sites with call counts. For all other options, look
+ up the manual for <computeroutput>cg_annotate</computeroutput>.
+ </para>
+
+ <para>For better call graph browsing experience, it is highly recommended
+ to use <ulink url="&cl-gui;">KCachegrind</ulink>. If your code happens
+ to spent relevant fractions of cost in <emphasis>cycles</emphasis> (sets
+ of functions calling each other in a recursive manner), you have to
+ use KCachegrind, as <computeroutput>callgrind_annotate</computeroutput>
+ currently does not do any cycle detection, which is important to get correct
+ results in this case.</para>
+
<para>If you are additionally interested in measuring the
- cache behaviour of your
+ cache behavior of your
program, use Callgrind with the option
<option><xref linkend="opt.simulate-cache"/>=yes.</option>
- This will further slow down the run approximately by a factor of 2.</para>
+ However, expect a further slow down approximately by a factor of 2.</para>
<para>If the program section you want to profile is somewhere in the
middle of the run, it is beneficial to
<emphasis>fast forward</emphasis> to this section without any
- profiling at all, and switch it on later. This is achieved by using
+ profiling at all, and switch profiling on later. This is achieved by using
<option><xref linkend="opt.instr-atstart"/>=no</option>
and interactively use
<computeroutput>callgrind_control -i on</computeroutput> before the
- interesting code section is about to be executed.</para>
+ interesting code section is about to be executed. To exactly specify
+ the code position where profiling should start, use the client request
+ <computeroutput>CALLGRIND_START_INSTRUMENTATION</computeroutput>.</para>
<para>If you want to be able to see assembler annotation, specify
<option><xref linkend="opt.dump-instr"/>=yes</option>. This will produce
@@ -184,12 +194,16 @@
</sect2>
+</sect1>
+<sect1 id="cl-manual.usage" xreflabel="Advanced Usage">
+<title>Advanced Usage</title>
+
<sect2 id="cl-manual.dumps"
xreflabel="Multiple dumps from one program run">
<title>Multiple profiling dumps from one program run</title>
- <para>Often, you aren't interested in time characteristics of a full
+ <para>Often, you are not interested in characteristics of a full
program run, but only of a small part of it (e.g. execution of one
algorithm). If there are multiple algorithms or one algorithm
running with different input data, it's even useful to get different
|
|
From: <sv...@va...> - 2007-05-08 09:20:28
|
Author: sewardj
Date: 2007-05-08 10:20:25 +0100 (Tue, 08 May 2007)
New Revision: 6733
Log:
Add branch-misprediction profiling to Cachegrind. When the (new) flag
--branch-sim=yes is specified, Cachegrind simulates a simple indirect
branch predictor and a conditional branch predictor. The latter
considers both the branch instruction's address and the behaviour of
the last few conditional branches. Return stack prediction is not
modelled.
The new counted events are: conditional branches (Bc), mispredicted
conditional branches (Bcm), indirect branches (Bi) and mispredicted
indirect branches (Bim). Postprocessing tools (cg_annotate, cg_merge)
handle the new events as you would expect. Note that branch
simulation is not enabled by default as it gives a 20%-25% slowdown,
so you need to ask for it explicitly using --branch-sim=yes.
Added:
trunk/cachegrind/cg_branchpred.c
Modified:
trunk/NEWS
trunk/cachegrind/Makefile.am
trunk/cachegrind/cg_main.c
trunk/cachegrind/docs/cg-manual.xml
trunk/cachegrind/tests/filter_stderr
Modified: trunk/NEWS
===================================================================
--- trunk/NEWS 2007-05-06 13:22:23 UTC (rev 6732)
+++ trunk/NEWS 2007-05-08 09:20:25 UTC (rev 6733)
@@ -6,6 +6,11 @@
Other user-visible changes:
+- Cachegrind has been extended to do branch-misprediction profiling.
+ Both conditional and indirect branches are profiled. The default
+ behaviour of Cachegrind is unchanged. To use the new functionality,
+ give the option --branch-sim=yes.
+
- A new suppression kind has been introduced: "Jump". This is for
suppressing jump-to-invalid-address errors. Previously you had to use an
"Addr1" suppression, which didn't make much sense.
Modified: trunk/cachegrind/Makefile.am
===================================================================
--- trunk/cachegrind/Makefile.am 2007-05-06 13:22:23 UTC (rev 6732)
+++ trunk/cachegrind/Makefile.am 2007-05-08 09:20:25 UTC (rev 6733)
@@ -2,7 +2,7 @@
bin_SCRIPTS = cg_annotate
-noinst_HEADERS = cg_arch.h cg_sim.c
+noinst_HEADERS = cg_arch.h cg_sim.c cg_branchpred.c
noinst_PROGRAMS =
if VGP_X86_LINUX
Added: trunk/cachegrind/cg_branchpred.c
===================================================================
--- trunk/cachegrind/cg_branchpred.c (rev 0)
+++ trunk/cachegrind/cg_branchpred.c 2007-05-08 09:20:25 UTC (rev 6733)
@@ -0,0 +1,154 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Branch predictor simulation cg_branchpred.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Cachegrind, a Valgrind tool for cache
+ profiling programs.
+
+ Copyright (C) 2002-2007 Nicholas Nethercote
+ nj...@va...
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+
+/* This file contains the actual branch predictor simulator and its
+ associated state. As with cg_sim.c it is #included directly into
+ cg_main.c. It provides:
+
+ - a taken/not-taken predictor for conditional branches
+ - a branch target address predictor for indirect branches
+
+ Function return-address prediction is not modelled, on the basis
+ that return stack predictors almost always predict correctly, and
+ also that it is difficult for Valgrind to robustly identify
+ function calls and returns.
+*/
+
+/* How many bits at the bottom of an instruction address are
+ guaranteed to be zero? */
+#if defined(VGA_ppc32) || defined(VGA_ppc64)
+# define N_IADDR_LO_ZERO_BITS 2
+#elif defined(VGA_x86) || defined(VGA_amd64)
+# define N_IADDR_LO_ZERO_BITS 0
+#else
+# error "Unsupported architecture"
+#endif
+
+
+/* Get a taken/not-taken prediction for the instruction (presumably a
+ conditional branch) at instr_addr. Once that's done, update the
+ predictor state based on whether or not it was actually taken, as
+ indicated by 'taken'. Finally, return 1 for a mispredict and 0 for
+ a successful predict.
+
+ The predictor is an array of 16k (== 2^14) 2-bit saturating
+ counters. Given the address of the branch instruction, the array
+ index to use is computed both from the low order bits of the branch
+ instruction's address, and the global history - that is, from the
+ taken/not-taken behaviour of the most recent few branches. This
+ makes the predictor able to correlate this branch's behaviour with
+ that of other branches.
+
+ TODO: use predictor written by someone who understands this stuff.
+ Perhaps it would be better to move to a standard GShare predictor
+ and/or tournament predictor.
+*/
+/* The index is composed of N_HIST bits at the top and N_IADD bits at
+ the bottom. These numbers chosen somewhat arbitrarily, but note
+ that making N_IADD_BITS too small (eg 4) can cause large amounts of
+ aliasing, and hence misprediction, particularly if the history bits
+ are mostly unchanging. */
+#define N_HIST_BITS 7
+#define N_IADD_BITS 7
+
+#define N_BITS (N_HIST_BITS + N_IADD_BITS)
+#define N_COUNTERS (1 << N_BITS)
+
+static UWord shift_register = 0; /* Contains global history */
+static UChar counters[N_COUNTERS]; /* Counter array; presumably auto-zeroed */
+
+
+static ULong do_cond_branch_predict ( Addr instr_addr, Word takenW )
+{
+ UWord index;
+ Bool predicted_taken, actually_taken, mispredict;
+
+ const UWord hist_mask = (1 << N_HIST_BITS) - 1;
+ const UWord iadd_mask = (1 << N_IADD_BITS) - 1;
+ UWord hist_bits = shift_register & hist_mask;
+ UWord iadd_bits = (instr_addr >> N_IADDR_LO_ZERO_BITS)
+ & iadd_mask;
+
+ tl_assert(hist_bits <= hist_mask);
+ tl_assert(iadd_bits <= iadd_mask);
+ index = (hist_bits << N_IADD_BITS) | iadd_bits;
+ tl_assert(index < N_COUNTERS);
+ if (0) VG_(printf)("index = %d\n", (Int)index);
+
+ tl_assert(takenW <= 1);
+ predicted_taken = counters[ index ] >= 2;
+ actually_taken = takenW > 0;
+
+ mispredict = (actually_taken && (!predicted_taken))
+ || ((!actually_taken) && predicted_taken);
+
+ shift_register <<= 1;
+ shift_register |= (actually_taken ? 1 : 0);
+
+ if (actually_taken) {
+ if (counters[index] < 3)
+ counters[index]++;
+ } else {
+ if (counters[index] > 0)
+ counters[index]--;
+ }
+
+ tl_assert(counters[index] <= 3);
+
+ return mispredict ? 1 : 0;
+}
+
+
+/* A very simple indirect branch predictor. Use the branch's address
+ to index a table which records the previous target address for this
+ branch (or whatever aliased with it) and use that as the
+ prediction. */
+#define N_BTAC_BITS 9
+#define N_BTAC (1 << N_BTAC_BITS)
+static Addr btac[N_BTAC]; /* BTAC; presumably auto-zeroed */
+
+static ULong do_ind_branch_predict ( Addr instr_addr, Addr actual )
+{
+ Bool mispredict;
+ const UWord mask = (1 << N_BTAC_BITS) - 1;
+ UWord index = (instr_addr >> N_IADDR_LO_ZERO_BITS)
+ & mask;
+ tl_assert(index < N_BTAC);
+ mispredict = btac[index] != actual;
+ btac[index] = actual;
+ return mispredict ? 1 : 0;
+}
+
+
+/*--------------------------------------------------------------------*/
+/*--- end cg_branchpred.c ---*/
+/*--------------------------------------------------------------------*/
+
Modified: trunk/cachegrind/cg_main.c
===================================================================
--- trunk/cachegrind/cg_main.c 2007-05-06 13:22:23 UTC (rev 6732)
+++ trunk/cachegrind/cg_main.c 2007-05-08 09:20:25 UTC (rev 6733)
@@ -48,6 +48,7 @@
#include "cg_arch.h"
#include "cg_sim.c"
+#include "cg_branchpred.c"
/*------------------------------------------------------------*/
/*--- Constants ---*/
@@ -61,16 +62,31 @@
#define FN_LEN 256
/*------------------------------------------------------------*/
+/*--- Options ---*/
+/*------------------------------------------------------------*/
+
+static Bool clo_cache_sim = True; /* do cache simulation? */
+static Bool clo_branch_sim = False; /* do branch simulation? */
+
+/*------------------------------------------------------------*/
/*--- Types and Data Structures ---*/
/*------------------------------------------------------------*/
-typedef struct _CC CC;
-struct _CC {
- ULong a;
- ULong m1;
- ULong m2;
-};
+typedef
+ struct {
+ ULong a; /* total # memory accesses of this kind */
+ ULong m1; /* misses in the first level cache */
+ ULong m2; /* misses in the second level cache */
+ }
+ CacheCC;
+typedef
+ struct {
+ ULong b; /* total # branches of this kind */
+ ULong mp; /* number of branches mispredicted */
+ }
+ BranchCC;
+
//------------------------------------------------------------
// Primary data structure #1: CC table
// - Holds the per-source-line hit/miss stats, grouped by file/function/line.
@@ -85,13 +101,14 @@
}
CodeLoc;
-typedef struct _LineCC LineCC;
-struct _LineCC {
- CodeLoc loc;
- CC Ir;
- CC Dr;
- CC Dw;
-};
+typedef struct {
+ CodeLoc loc; /* Source location that these counts pertain to */
+ CacheCC Ir; /* Insn read counts */
+ CacheCC Dr; /* Data read counts */
+ CacheCC Dw; /* Data write/modify counts */
+ BranchCC Bc; /* Conditional branch counts */
+ BranchCC Bi; /* Indirect branch counts */
+} LineCC;
// First compare file, then fn, then line.
static Word cmp_CodeLoc_LineCC(void *vloc, void *vcc)
@@ -246,6 +263,10 @@
lineCC->Dw.a = 0;
lineCC->Dw.m1 = 0;
lineCC->Dw.m2 = 0;
+ lineCC->Bc.b = 0;
+ lineCC->Bc.mp = 0;
+ lineCC->Bi.b = 0;
+ lineCC->Bi.mp = 0;
VG_(OSet_Insert)(CC_table, lineCC);
}
@@ -351,6 +372,32 @@
n->parent->Dw.a++;
}
+/* For branches, we consult two different predictors, one which
+ predicts taken/untaken for conditional branches, and the other
+ which predicts the branch target address for indirect branches
+ (jump-to-register style ones). */
+
+static VG_REGPARM(2)
+void log_cond_branch(InstrInfo* n, Word taken)
+{
+ //VG_(printf)("cbrnch: CCaddr=0x%010lx, taken=0x%010lx\n",
+ // n, taken);
+ n->parent->Bc.b++;
+ n->parent->Bc.mp
+ += (1 & do_cond_branch_predict(n->instr_addr, taken));
+}
+
+static VG_REGPARM(2)
+void log_ind_branch(InstrInfo* n, UWord actual_dst)
+{
+ //VG_(printf)("ibrnch: CCaddr=0x%010lx, dst=0x%010lx\n",
+ // n, actual_dst);
+ n->parent->Bi.b++;
+ n->parent->Bi.mp
+ += (1 & do_ind_branch_predict(n->instr_addr, actual_dst));
+}
+
+
/*------------------------------------------------------------*/
/*--- Instrumentation types and structures ---*/
/*------------------------------------------------------------*/
@@ -389,18 +436,68 @@
IRAtom;
typedef
- enum { Event_Ir, Event_Dr, Event_Dw, Event_Dm }
- EventKind;
+ enum {
+ Ev_Ir, // Instruction read
+ Ev_Dr, // Data read
+ Ev_Dw, // Data write
+ Ev_Dm, // Data modify (read then write)
+ Ev_Bc, // branch conditional
+ Ev_Bi // branch indirect (to unknown destination)
+ }
+ EventTag;
typedef
struct {
- EventKind ekind; // All
- InstrInfo* inode; // All; inode for this event's instruction
- Int datasize; // Dr/Dw/Dm only
- IRAtom* dataEA; // Dr/Dw/Dm only; IR ATOM ONLY
+ EventTag tag;
+ InstrInfo* inode;
+ union {
+ struct {
+ } Ir;
+ struct {
+ IRAtom* ea;
+ Int szB;
+ } Dr;
+ struct {
+ IRAtom* ea;
+ Int szB;
+ } Dw;
+ struct {
+ IRAtom* ea;
+ Int szB;
+ } Dm;
+ struct {
+ IRAtom* taken; /* :: Ity_I1 */
+ } Bc;
+ struct {
+ IRAtom* dst;
+ } Bi;
+ } Ev;
}
Event;
+static void init_Event ( Event* ev ) {
+ VG_(memset)(ev, 0, sizeof(Event));
+}
+
+static IRAtom* get_Event_dea ( Event* ev ) {
+ switch (ev->tag) {
+ case Ev_Dr: return ev->Ev.Dr.ea;
+ case Ev_Dw: return ev->Ev.Dw.ea;
+ case Ev_Dm: return ev->Ev.Dm.ea;
+ default: tl_assert(0);
+ }
+}
+
+static Int get_Event_dszB ( Event* ev ) {
+ switch (ev->tag) {
+ case Ev_Dr: return ev->Ev.Dr.szB;
+ case Ev_Dw: return ev->Ev.Dw.szB;
+ case Ev_Dm: return ev->Ev.Dm.szB;
+ default: tl_assert(0);
+ }
+}
+
+
/* Up to this many unnotified events are allowed. Number is
arbitrary. Larger numbers allow more event merging to occur, but
potentially induce more spilling due to extending live ranges of
@@ -470,25 +567,35 @@
static void showEvent ( Event* ev )
{
- switch (ev->ekind) {
- case Event_Ir:
+ switch (ev->tag) {
+ case Ev_Ir:
VG_(printf)("Ir %p\n", ev->inode);
break;
- case Event_Dr:
- VG_(printf)("Dr %p %d EA=", ev->inode, ev->datasize);
- ppIRExpr(ev->dataEA);
+ case Ev_Dr:
+ VG_(printf)("Dr %p %d EA=", ev->inode, ev->Ev.Dr.szB);
+ ppIRExpr(ev->Ev.Dr.ea);
VG_(printf)("\n");
break;
- case Event_Dw:
- VG_(printf)("Dw %p %d EA=", ev->inode, ev->datasize);
- ppIRExpr(ev->dataEA);
+ case Ev_Dw:
+ VG_(printf)("Dw %p %d EA=", ev->inode, ev->Ev.Dw.szB);
+ ppIRExpr(ev->Ev.Dw.ea);
VG_(printf)("\n");
break;
- case Event_Dm:
- VG_(printf)("Dm %p %d EA=", ev->inode, ev->datasize);
- ppIRExpr(ev->dataEA);
+ case Ev_Dm:
+ VG_(printf)("Dm %p %d EA=", ev->inode, ev->Ev.Dm.szB);
+ ppIRExpr(ev->Ev.Dm.ea);
VG_(printf)("\n");
break;
+ case Ev_Bc:
+ VG_(printf)("Bc %p GA=", ev->inode);
+ ppIRExpr(ev->Ev.Bc.taken);
+ VG_(printf)("\n");
+ break;
+ case Ev_Bi:
+ VG_(printf)("Bi %p DST=", ev->inode);
+ ppIRExpr(ev->Ev.Bi.dst);
+ VG_(printf)("\n");
+ break;
default:
tl_assert(0);
break;
@@ -552,34 +659,42 @@
/* Decide on helper fn to call and args to pass it, and advance
i appropriately. */
- switch (ev->ekind) {
- case Event_Ir:
- /* Merge with a following Dr/Dm if it is from this insn. */
- if (ev2 && (ev2->ekind == Event_Dr || ev2->ekind == Event_Dm)) {
+ switch (ev->tag) {
+ case Ev_Ir:
+ /* Merge an Ir with a following Dr/Dm. */
+ if (ev2 && (ev2->tag == Ev_Dr || ev2->tag == Ev_Dm)) {
+ /* Why is this true? It's because we're merging an Ir
+ with a following Dr or Dm. The Ir derives from the
+ instruction's IMark and the Dr/Dm from data
+ references which follow it. In short it holds
+ because each insn starts with an IMark, hence an
+ Ev_Ir, and so these Dr/Dm must pertain to the
+ immediately preceding Ir. Same applies to analogous
+ assertions in the subsequent cases. */
tl_assert(ev2->inode == ev->inode);
helperName = "log_1I_1Dr_cache_access";
helperAddr = &log_1I_1Dr_cache_access;
argv = mkIRExprVec_3( i_node_expr,
- ev2->dataEA,
- mkIRExpr_HWord( ev2->datasize ) );
+ get_Event_dea(ev2),
+ mkIRExpr_HWord( get_Event_dszB(ev2) ) );
regparms = 3;
i += 2;
}
- /* Merge with a following Dw if it is from this insn. */
+ /* Merge an Ir with a following Dw. */
else
- if (ev2 && ev2->ekind == Event_Dw) {
+ if (ev2 && ev2->tag == Ev_Dw) {
tl_assert(ev2->inode == ev->inode);
helperName = "log_1I_1Dw_cache_access";
helperAddr = &log_1I_1Dw_cache_access;
argv = mkIRExprVec_3( i_node_expr,
- ev2->dataEA,
- mkIRExpr_HWord( ev2->datasize ) );
+ get_Event_dea(ev2),
+ mkIRExpr_HWord( get_Event_dszB(ev2) ) );
regparms = 3;
i += 2;
}
- /* Merge with two following Irs if possible. */
+ /* Merge an Ir with two following Irs. */
else
- if (ev2 && ev3 && ev2->ekind == Event_Ir && ev3->ekind == Event_Ir)
+ if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir)
{
helperName = "log_3I_0D_cache_access";
helperAddr = &log_3I_0D_cache_access;
@@ -589,9 +704,9 @@
regparms = 3;
i += 3;
}
- /* Merge with a following Ir if possible. */
+ /* Merge an Ir with one following Ir. */
else
- if (ev2 && ev2->ekind == Event_Ir) {
+ if (ev2 && ev2->tag == Ev_Ir) {
helperName = "log_2I_0D_cache_access";
helperAddr = &log_2I_0D_cache_access;
argv = mkIRExprVec_2( i_node_expr,
@@ -601,10 +716,6 @@
}
/* No merging possible; emit as-is. */
else {
- // Assertion: this Event_Ir must be the last one in the
- // events buffer, otherwise it would have been merged with a
- // following event.
- tl_assert(!ev2 && !ev3);
helperName = "log_1I_0D_cache_access";
helperAddr = &log_1I_0D_cache_access;
argv = mkIRExprVec_1( i_node_expr );
@@ -612,25 +723,43 @@
i++;
}
break;
- case Event_Dr:
- case Event_Dm:
+ case Ev_Dr:
+ case Ev_Dm:
+ /* Data read or modify */
helperName = "log_0I_1Dr_cache_access";
helperAddr = &log_0I_1Dr_cache_access;
argv = mkIRExprVec_3( i_node_expr,
- ev->dataEA,
- mkIRExpr_HWord( ev->datasize ) );
+ get_Event_dea(ev),
+ mkIRExpr_HWord( get_Event_dszB(ev) ) );
regparms = 3;
i++;
break;
- case Event_Dw:
+ case Ev_Dw:
+ /* Data write */
helperName = "log_0I_1Dw_cache_access";
helperAddr = &log_0I_1Dw_cache_access;
argv = mkIRExprVec_3( i_node_expr,
- ev->dataEA,
- mkIRExpr_HWord( ev->datasize ) );
+ get_Event_dea(ev),
+ mkIRExpr_HWord( get_Event_dszB(ev) ) );
regparms = 3;
i++;
break;
+ case Ev_Bc:
+ /* Conditional branch */
+ helperName = "log_cond_branch";
+ helperAddr = &log_cond_branch;
+ argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
+ regparms = 2;
+ i++;
+ break;
+ case Ev_Bi:
+ /* Branch to an unknown destination */
+ helperName = "log_ind_branch";
+ helperAddr = &log_ind_branch;
+ argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
+ regparms = 2;
+ i++;
+ break;
default:
tl_assert(0);
}
@@ -655,10 +784,9 @@
flushEvents(cgs);
tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
evt = &cgs->events[cgs->events_used];
- evt->ekind = Event_Ir;
+ init_Event(evt);
+ evt->tag = Ev_Ir;
evt->inode = inode;
- evt->datasize = 0;
- evt->dataEA = NULL; /*paranoia*/
cgs->events_used++;
}
@@ -668,14 +796,17 @@
Event* evt;
tl_assert(isIRAtom(ea));
tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE);
+ if (!clo_cache_sim)
+ return;
if (cgs->events_used == N_EVENTS)
flushEvents(cgs);
tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
evt = &cgs->events[cgs->events_used];
- evt->ekind = Event_Dr;
- evt->inode = inode;
- evt->datasize = datasize;
- evt->dataEA = ea;
+ init_Event(evt);
+ evt->tag = Ev_Dr;
+ evt->inode = inode;
+ evt->Ev.Dr.szB = datasize;
+ evt->Ev.Dr.ea = ea;
cgs->events_used++;
}
@@ -688,15 +819,18 @@
tl_assert(isIRAtom(ea));
tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE);
+ if (!clo_cache_sim)
+ return;
+
/* Is it possible to merge this write with the preceding read? */
lastEvt = &cgs->events[cgs->events_used-1];
if (cgs->events_used > 0
- && lastEvt->ekind == Event_Dr
- && lastEvt->datasize == datasize
- && lastEvt->inode == inode
- && eqIRAtom(lastEvt->dataEA, ea))
+ && lastEvt->tag == Ev_Dr
+ && lastEvt->Ev.Dr.szB == datasize
+ && lastEvt->inode == inode
+ && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
{
- lastEvt->ekind = Event_Dm;
+ lastEvt->tag = Ev_Dm;
return;
}
@@ -705,13 +839,54 @@
flushEvents(cgs);
tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
evt = &cgs->events[cgs->events_used];
- evt->ekind = Event_Dw;
- evt->inode = inode;
- evt->datasize = datasize;
- evt->dataEA = ea;
+ init_Event(evt);
+ evt->tag = Ev_Dw;
+ evt->inode = inode;
+ evt->Ev.Dw.szB = datasize;
+ evt->Ev.Dw.ea = ea;
cgs->events_used++;
}
+static
+void addEvent_Bc ( CgState* cgs, InstrInfo* inode, IRAtom* guard )
+{
+ Event* evt;
+ tl_assert(isIRAtom(guard));
+ tl_assert(typeOfIRExpr(cgs->sbOut->tyenv, guard)
+ == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+ if (!clo_branch_sim)
+ return;
+ if (cgs->events_used == N_EVENTS)
+ flushEvents(cgs);
+ tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
+ evt = &cgs->events[cgs->events_used];
+ init_Event(evt);
+ evt->tag = Ev_Bc;
+ evt->inode = inode;
+ evt->Ev.Bc.taken = guard;
+ cgs->events_used++;
+}
+
+static
+void addEvent_Bi ( CgState* cgs, InstrInfo* inode, IRAtom* whereTo )
+{
+ Event* evt;
+ tl_assert(isIRAtom(whereTo));
+ tl_assert(typeOfIRExpr(cgs->sbOut->tyenv, whereTo)
+ == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+ if (!clo_branch_sim)
+ return;
+ if (cgs->events_used == N_EVENTS)
+ flushEvents(cgs);
+ tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
+ evt = &cgs->events[cgs->events_used];
+ init_Event(evt);
+ evt->tag = Ev_Bi;
+ evt->inode = inode;
+ evt->Ev.Bi.dst = whereTo;
+ cgs->events_used++;
+}
+
////////////////////////////////////////////////////////////
@@ -749,8 +924,13 @@
tl_assert(i < sbIn->stmts_used);
st = sbIn->stmts[i];
tl_assert(Ist_IMark == st->tag);
- cia = st->Ist.IMark.addr;
+ cia = st->Ist.IMark.addr;
+ isize = st->Ist.IMark.len;
+ // If Vex fails to decode an instruction, the size will be zero.
+ // Pretend otherwise.
+ if (isize == 0) isize = VG_MIN_INSTR_SZB;
+
// Set up running state and get block info
tl_assert(closure->readdr == vge->base[0]);
cgs.events_used = 0;
@@ -840,11 +1020,66 @@
break;
}
- case Ist_Exit:
+ case Ist_Exit: {
+ /* Stuff to widen the guard expression to a host word, so
+ we can pass it to the branch predictor simulation
+ functions easily. */
+ Bool inverted;
+ Addr64 nia, sea;
+ IRConst* dst;
+ IROp tyW = hWordTy;
+ IROp widen = tyW==Ity_I32 ? Iop_1Uto32 : Iop_1Uto64;
+ IROp opXOR = tyW==Ity_I32 ? Iop_Xor32 : Iop_Xor64;
+ IRTemp guard1 = newIRTemp(cgs.sbOut->tyenv, Ity_I1);
+ IRTemp guardW = newIRTemp(cgs.sbOut->tyenv, tyW);
+ IRTemp guard = newIRTemp(cgs.sbOut->tyenv, tyW);
+ IRExpr* one = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
+ : IRExpr_Const(IRConst_U64(1));
+
+ /* First we need to figure out whether the side exit got
+ inverted by the ir optimiser. To do that, figure out
+ the next (fallthrough) instruction's address and the
+ side exit address and see if they are the same. */
+ nia = cia + (Addr64)isize;
+ if (tyW == Ity_I32)
+ nia &= 0xFFFFFFFFULL;
+
+ /* Side exit address */
+ dst = st->Ist.Exit.dst;
+ if (tyW == Ity_I32) {
+ tl_assert(dst->tag == Ico_U32);
+ sea = (Addr64)(UInt)dst->Ico.U32;
+ } else {
+ tl_assert(tyW == Ity_I64);
+ tl_assert(dst->tag == Ico_U64);
+ sea = dst->Ico.U64;
+ }
+
+ inverted = nia == sea;
+
+ /* Widen the guard expression. */
+ addStmtToIRSB( cgs.sbOut,
+ IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
+ addStmtToIRSB( cgs.sbOut,
+ IRStmt_WrTmp( guardW,
+ IRExpr_Unop(widen,
+ IRExpr_RdTmp(guard1))) );
+ /* If the exit is inverted, invert the sense of the guard. */
+ addStmtToIRSB(
+ cgs.sbOut,
+ IRStmt_WrTmp(
+ guard,
+ inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
+ : IRExpr_RdTmp(guardW)
+ ));
+ /* And post the event. */
+ addEvent_Bc( &cgs, curr_inode, IRExpr_RdTmp(guard) );
+
/* We may never reach the next statement, so need to flush
all outstanding transactions now. */
flushEvents( &cgs );
break;
+ }
default:
tl_assert(0);
@@ -860,6 +1095,26 @@
}
}
+ /* Deal with branches to unknown destinations. Except ignore ones
+ which are function returns as we assume the return stack
+ predictor never mispredicts. */
+ if (sbIn->jumpkind == Ijk_Boring) {
+ if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
+ switch (sbIn->next->tag) {
+ case Iex_Const:
+ break; /* boring - branch to known address */
+ case Iex_RdTmp:
+ /* looks like an indirect branch (branch to unknown) */
+ addEvent_Bi( &cgs, curr_inode, sbIn->next );
+ break;
+ default:
+ /* shouldn't happen - if the incoming IR is properly
+ flattened, should only have tmp and const cases to
+ consider. */
+ tl_assert(0);
+ }
+ }
+
/* At the end of the bb. Flush outstandings. */
flushEvents( &cgs );
@@ -983,9 +1238,11 @@
// Total reads/writes/misses. Calculated during CC traversal at the end.
// All auto-zeroed.
-static CC Ir_total;
-static CC Dr_total;
-static CC Dw_total;
+static CacheCC Ir_total;
+static CacheCC Dr_total;
+static CacheCC Dw_total;
+static BranchCC Bc_total;
+static BranchCC Bi_total;
// The output file base name specified by the user using the
// --cachegrind-out-file switch. This is combined with the
@@ -1044,7 +1301,21 @@
}
}
// "events:" line
- VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw\n");
+ if (clo_cache_sim && clo_branch_sim) {
+ VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
+ "Bc Bcm Bi Bim\n");
+ }
+ else if (clo_cache_sim && !clo_branch_sim) {
+ VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
+ "\n");
+ }
+ else if (!clo_cache_sim && clo_branch_sim) {
+ VG_(sprintf)(buf, "\nevents: Ir "
+ "Bc Bcm Bi Bim\n");
+ }
+ else
+ tl_assert(0); /* can't happen */
+
VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
// Traverse every lineCC
@@ -1076,11 +1347,38 @@
}
// Print the LineCC
- VG_(sprintf)(buf, "%u %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
- lineCC->loc.line,
- lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
- lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
- lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2);
+ if (clo_cache_sim && clo_branch_sim) {
+ VG_(sprintf)(buf, "%u %llu %llu %llu"
+ " %llu %llu %llu"
+ " %llu %llu %llu"
+ " %llu %llu %llu %llu\n",
+ lineCC->loc.line,
+ lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
+ lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
+ lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2,
+ lineCC->Bc.b, lineCC->Bc.mp,
+ lineCC->Bi.b, lineCC->Bi.mp);
+ }
+ else if (clo_cache_sim && !clo_branch_sim) {
+ VG_(sprintf)(buf, "%u %llu %llu %llu"
+ " %llu %llu %llu"
+ " %llu %llu %llu\n",
+ lineCC->loc.line,
+ lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
+ lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
+ lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2);
+ }
+ else if (!clo_cache_sim && clo_branch_sim) {
+ VG_(sprintf)(buf, "%u %llu"
+ " %llu %llu %llu %llu\n",
+ lineCC->loc.line,
+ lineCC->Ir.a,
+ lineCC->Bc.b, lineCC->Bc.mp,
+ lineCC->Bi.b, lineCC->Bi.mp);
+ }
+ else
+ tl_assert(0);
+
VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
// Update summary stats
@@ -1093,17 +1391,48 @@
Dw_total.a += lineCC->Dw.a;
Dw_total.m1 += lineCC->Dw.m1;
Dw_total.m2 += lineCC->Dw.m2;
+ Bc_total.b += lineCC->Bc.b;
+ Bc_total.mp += lineCC->Bc.mp;
+ Bi_total.b += lineCC->Bi.b;
+ Bi_total.mp += lineCC->Bi.mp;
distinct_lines++;
}
// Summary stats must come after rest of table, since we calculate them
- // during traversal. */
- VG_(sprintf)(buf, "summary: "
- "%llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
- Ir_total.a, Ir_total.m1, Ir_total.m2,
- Dr_total.a, Dr_total.m1, Dr_total.m2,
- Dw_total.a, Dw_total.m1, Dw_total.m2);
+ // during traversal. */
+ if (clo_cache_sim && clo_branch_sim) {
+ VG_(sprintf)(buf, "summary:"
+ " %llu %llu %llu"
+ " %llu %llu %llu"
+ " %llu %llu %llu"
+ " %llu %llu %llu %llu\n",
+ Ir_total.a, Ir_total.m1, Ir_total.m2,
+ Dr_total.a, Dr_total.m1, Dr_total.m2,
+ Dw_total.a, Dw_total.m1, Dw_total.m2,
+ Bc_total.b, Bc_total.mp,
+ Bi_total.b, Bi_total.mp);
+ }
+ else if (clo_cache_sim && !clo_branch_sim) {
+ VG_(sprintf)(buf, "summary:"
+ " %llu %llu %llu"
+ " %llu %llu %llu"
+ " %llu %llu %llu\n",
+ Ir_total.a, Ir_total.m1, Ir_total.m2,
+ Dr_total.a, Dr_total.m1, Dr_total.m2,
+ Dw_total.a, Dw_total.m1, Dw_total.m2);
+ }
+ else if (!clo_cache_sim && clo_branch_sim) {
+ VG_(sprintf)(buf, "summary:"
+ " %llu"
+ " %llu %llu %llu %llu\n",
+ Ir_total.a,
+ Bc_total.b, Bc_total.mp,
+ Bi_total.b, Bi_total.mp);
+ }
+ else
+ tl_assert(0);
+
VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
VG_(close)(fd);
}
@@ -1123,12 +1452,17 @@
{
static Char buf1[128], buf2[128], buf3[128], buf4[123], fmt[128];
- CC D_total;
+ CacheCC D_total;
+ BranchCC B_total;
ULong L2_total_m, L2_total_mr, L2_total_mw,
L2_total, L2_total_r, L2_total_w;
- Int l1, l2, l3;
+ Int l1, l2, l3, l4;
Int p;
+ /* Running with both cache and branch simulation disabled is not
+ allowed (checked during command line option processing). */
+ tl_assert(clo_cache_sim || clo_branch_sim);
+
fprint_CC_table_and_calc_totals();
if (VG_(clo_verbosity) == 0)
@@ -1139,76 +1473,106 @@
l1 = ULong_width(Ir_total.a);
l2 = ULong_width(Dr_total.a);
l3 = ULong_width(Dw_total.a);
+ l4 = ULong_width(Bc_total.b + Bi_total.b);
/* Make format string, getting width right for numbers */
VG_(sprintf)(fmt, "%%s %%,%dllu", l1);
+ /* Always print this */
VG_(message)(Vg_UserMsg, fmt, "I refs: ", Ir_total.a);
- VG_(message)(Vg_UserMsg, fmt, "I1 misses: ", Ir_total.m1);
- VG_(message)(Vg_UserMsg, fmt, "L2i misses: ", Ir_total.m2);
- p = 100;
+ /* If cache profiling is enabled, show D access numbers and all
+ miss numbers */
+ if (clo_cache_sim) {
+ VG_(message)(Vg_UserMsg, fmt, "I1 misses: ", Ir_total.m1);
+ VG_(message)(Vg_UserMsg, fmt, "L2i misses: ", Ir_total.m2);
- if (0 == Ir_total.a) Ir_total.a = 1;
- VG_(percentify)(Ir_total.m1, Ir_total.a, 2, l1+1, buf1);
- VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);
+ p = 100;
- VG_(percentify)(Ir_total.m2, Ir_total.a, 2, l1+1, buf1);
- VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
- VG_(message)(Vg_UserMsg, "");
+ if (0 == Ir_total.a) Ir_total.a = 1;
+ VG_(percentify)(Ir_total.m1, Ir_total.a, 2, l1+1, buf1);
+ VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);
- /* D cache results. Use the D_refs.rd and D_refs.wr values to determine the
- * width of columns 2 & 3. */
- D_total.a = Dr_total.a + Dw_total.a;
- D_total.m1 = Dr_total.m1 + Dw_total.m1;
- D_total.m2 = Dr_total.m2 + Dw_total.m2;
+ VG_(percentify)(Ir_total.m2, Ir_total.a, 2, l1+1, buf1);
+ VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
+ VG_(message)(Vg_UserMsg, "");
- /* Make format string, getting width right for numbers */
- VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu rd + %%,%dllu wr)", l1, l2, l3);
+ /* D cache results. Use the D_refs.rd and D_refs.wr values to
+ * determine the width of columns 2 & 3. */
+ D_total.a = Dr_total.a + Dw_total.a;
+ D_total.m1 = Dr_total.m1 + Dw_total.m1;
+ D_total.m2 = Dr_total.m2 + Dw_total.m2;
- VG_(message)(Vg_UserMsg, fmt, "D refs: ",
- D_total.a, Dr_total.a, Dw_total.a);
- VG_(message)(Vg_UserMsg, fmt, "D1 misses: ",
- D_total.m1, Dr_total.m1, Dw_total.m1);
- VG_(message)(Vg_UserMsg, fmt, "L2d misses: ",
- D_total.m2, Dr_total.m2, Dw_total.m2);
+ /* Make format string, getting width right for numbers */
+ VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu rd + %%,%dllu wr)", l1, l2, l3);
- p = 10;
+ VG_(message)(Vg_UserMsg, fmt, "D refs: ",
+ D_total.a, Dr_total.a, Dw_total.a);
+ VG_(message)(Vg_UserMsg, fmt, "D1 misses: ",
+ D_total.m1, Dr_total.m1, Dw_total.m1);
+ VG_(message)(Vg_UserMsg, fmt, "L2d misses: ",
+ D_total.m2, Dr_total.m2, Dw_total.m2);
- if (0 == D_total.a) D_total.a = 1;
- if (0 == Dr_total.a) Dr_total.a = 1;
- if (0 == Dw_total.a) Dw_total.a = 1;
- VG_(percentify)( D_total.m1, D_total.a, 1, l1+1, buf1);
- VG_(percentify)(Dr_total.m1, Dr_total.a, 1, l2+1, buf2);
- VG_(percentify)(Dw_total.m1, Dw_total.a, 1, l3+1, buf3);
- VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);
+ p = 10;
- VG_(percentify)( D_total.m2, D_total.a, 1, l1+1, buf1);
- VG_(percentify)(Dr_total.m2, Dr_total.a, 1, l2+1, buf2);
- VG_(percentify)(Dw_total.m2, Dw_total.a, 1, l3+1, buf3);
- VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
- VG_(message)(Vg_UserMsg, "");
+ if (0 == D_total.a) D_total.a = 1;
+ if (0 == Dr_total.a) Dr_total.a = 1;
+ if (0 == Dw_total.a) Dw_total.a = 1;
+ VG_(percentify)( D_total.m1, D_total.a, 1, l1+1, buf1);
+ VG_(percentify)(Dr_total.m1, Dr_total.a, 1, l2+1, buf2);
+ VG_(percentify)(Dw_total.m1, Dw_total.a, 1, l3+1, buf3);
+ VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);
- /* L2 overall results */
+ VG_(percentify)( D_total.m2, D_total.a, 1, l1+1, buf1);
+ VG_(percentify)(Dr_total.m2, Dr_total.a, 1, l2+1, buf2);
+ VG_(percentify)(Dw_total.m2, Dw_total.a, 1, l3+1, buf3);
+ VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
+ VG_(message)(Vg_UserMsg, "");
- L2_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
- L2_total_r = Dr_total.m1 + Ir_total.m1;
- L2_total_w = Dw_total.m1;
- VG_(message)(Vg_UserMsg, fmt, "L2 refs: ",
- L2_total, L2_total_r, L2_total_w);
+ /* L2 overall results */
- L2_total_m = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
- L2_total_mr = Dr_total.m2 + Ir_total.m2;
- L2_total_mw = Dw_total.m2;
- VG_(message)(Vg_UserMsg, fmt, "L2 misses: ",
- L2_total_m, L2_total_mr, L2_total_mw);
+ L2_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
+ L2_total_r = Dr_total.m1 + Ir_total.m1;
+ L2_total_w = Dw_total.m1;
+ VG_(message)(Vg_UserMsg, fmt, "L2 refs: ",
+ L2_total, L2_total_r, L2_total_w);
- VG_(percentify)(L2_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
- VG_(percentify)(L2_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
- VG_(percentify)(L2_total_mw, Dw_total.a, 1, l3+1, buf3);
- VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )", buf1, buf2,buf3);
+ L2_total_m = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
+ L2_total_mr = Dr_total.m2 + Ir_total.m2;
+ L2_total_mw = Dw_total.m2;
+ VG_(message)(Vg_UserMsg, fmt, "L2 misses: ",
+ L2_total_m, L2_total_mr, L2_total_mw);
+ VG_(percentify)(L2_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
+ VG_(percentify)(L2_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
+ VG_(percentify)(L2_total_mw, Dw_total.a, 1, l3+1, buf3);
+ VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )", buf1, buf2,buf3);
+ }
+ /* If branch profiling is enabled, show branch overall results. */
+ if (clo_branch_sim) {
+ /* Make format string, getting width right for numbers */
+ VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)", l1, l2, l3);
+
+ if (0 == Bc_total.b) Bc_total.b = 1;
+ if (0 == Bi_total.b) Bi_total.b = 1;
+ B_total.b = Bc_total.b + Bi_total.b;
+ B_total.mp = Bc_total.mp + Bi_total.mp;
+
+ VG_(message)(Vg_UserMsg, "");
+ VG_(message)(Vg_UserMsg, fmt, "Branches: ",
+ B_total.b, Bc_total.b, Bi_total.b);
+
+ VG_(message)(Vg_UserMsg, fmt, "Mispredicts: ",
+ B_total.mp, Bc_total.mp, Bi_total.mp);
+
+ VG_(percentify)(B_total.mp, B_total.b, 1, l1+1, buf1);
+ VG_(percentify)(Bc_total.mp, Bc_total.b, 1, l2+1, buf2);
+ VG_(percentify)(Bi_total.mp, Bi_total.b, 1, l3+1, buf3);
+
+ VG_(message)(Vg_UserMsg, "Mispred rate: %s (%s + %s )", buf1, buf2,buf3);
+ }
+
// Various stats
if (VG_(clo_verbosity) > 1) {
Int debug_lookups = full_debugs + fn_debugs +
@@ -1318,6 +1682,8 @@
else if (VG_CLO_STREQN(22, arg, "--cachegrind-out-file=")) {
cachegrind_out_file_basename = &arg[22];
}
+ else VG_BOOL_CLO(arg, "--cache-sim", clo_cache_sim)
+ else VG_BOOL_CLO(arg, "--branch-sim", clo_branch_sim)
else
return False;
@@ -1330,6 +1696,8 @@
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
+" --cache-sim=yes|no [yes] collect cache stats?\n"
+" --branch-sim=yes|no [no] collect branch prediction stats?\n"
" --cachegrind-out-file=<file> write profile data to <file>.<pid>\n"
" [cachegrind.out.<pid>]\n"
);
@@ -1354,7 +1722,7 @@
{
VG_(details_name) ("Cachegrind");
VG_(details_version) (NULL);
- VG_(details_description) ("an I1/D1/L2 cache profiler");
+ VG_(details_description) ("a cache and branch-prediction profiler");
VG_(details_copyright_author)(
"Copyright (C) 2002-2007, and GNU GPL'd, by Nicholas Nethercote et al.");
VG_(details_bug_reports_to) (VG_BUGS_TO);
@@ -1376,6 +1744,15 @@
cache_t I1c, D1c, L2c;
Int filename_szB;
+ /* Can't disable both cache and branch profiling */
+ if ((!clo_cache_sim) && (!clo_branch_sim)) {
+ VG_(message)(Vg_DebugMsg,
+ "ERROR: --cache-sim=no --branch-sim=no is not allowed.");
+ VG_(message)(Vg_DebugMsg,
+ "You must select cache profiling, or branch profiling, or both.");
+ VG_(exit)(2);
+ }
+
/* Get working directory */
tl_assert( VG_(getcwd)(base_dir, VKI_PATH_MAX) );
Modified: trunk/cachegrind/docs/cg-manual.xml
===================================================================
--- trunk/cachegrind/docs/cg-manual.xml 2007-05-06 13:22:23 UTC (rev 6732)
+++ trunk/cachegrind/docs/cg-manual.xml 2007-05-08 09:20:25 UTC (rev 6733)
@@ -5,18 +5,23 @@
<chapter id="cg-manual" xreflabel="Cachegrind: a cache-miss profiler">
-<title>Cachegrind: a cache profiler</title>
+<title>Cachegrind: a cache and branch profiler</title>
<sect1 id="cg-manual.cache" xreflabel="Cache profiling">
-<title>Cache profiling</title>
+<title>Cache and branch profiling</title>
<para>To use this tool, you must specify
<computeroutput>--tool=cachegrind</computeroutput> on the
Valgrind command line.</para>
-<para>Cachegrind is a tool for doing cache simulations and
-annotating your source line-by-line with the number of cache
-misses. In particular, it records:</para>
+<para>Cachegrind is a tool for finding places where programs
+interact badly with typical modern superscalar processors
+and run slowly as a result.
+In particular, it will do a cache simulation of your program,
+and optionally a branch-predictor simulation, and can
+then annotate your source line-by-line with the number of cache
+misses and branch mispredictions. The following statistics are
+collected:</para>
<itemizedlist>
<listitem>
<para>L1 instruction cache reads and misses;</para>
@@ -29,18 +34,31 @@
<para>L2 unified cache reads and read misses, writes and
writes misses.</para>
</listitem>
+ <listitem>
+ <para>Conditional branches and mispredicted conditional branches.</para>
+ </listitem>
+ <listitem>
+ <para>Indirect branches and mispredicted indirect branches. An
+ indirect branch is a jump or call to a destination only known at
+ run time.</para>
+ </listitem>
</itemizedlist>
<para>On a modern machine, an L1 miss will typically cost
-around 10 cycles, and an L2 miss can cost as much as 200
-cycles. Detailed cache profiling can be very useful for improving
-the performance of your program.</para>
+around 10 cycles, an L2 miss can cost as much as 200
+cycles, and a mispredicted branch costs in the region of 10
+to 30 cycles. Detailed cache and branch profiling can be very useful
+for improving the performance of your program.</para>
<para>Also, since one instruction cache read is performed per
instruction executed, you can find out how many instructions are
executed per line, which can be useful for traditional profiling
and test coverage.</para>
+<para>Branch profiling is not enabled by default. To use it, you must
+additionally specify <computeroutput>--branch-sim=yes</computeroutput>
+on the command line.</para>
+
<para>Any feedback, bug-fixes, suggestions, etc, welcome.</para>
@@ -67,6 +85,11 @@
<computeroutput>pid</computeroutput> is the program's process
id.</para>
+ <para>Branch prediction statistics are not collected by default.
+ To do so, add the flag
+ <computeroutput>--branch-sim=yes</computeroutput>.
+ </para>
+
<para>This step should be done every time you want to collect
information about a new program, a changed program, or about
the same program with different input.</para>
@@ -208,6 +231,49 @@
</sect2>
+
+<sect2 id="branch-sim" xreflabel="Branch simulation specifics">
+<title>Branch simulation specifics</title>
+
+<para>Cachegrind simulates branch predictors intended to be
+typical of mainstream desktop/server processors of around 2004.</para>
+
+<para>Conditional branches are predicted using an array of 16384 2-bit
+saturating counters. The array index used for a branch instruction is
+computed partly from the low-order bits of the branch instruction's
+address and partly using the taken/not-taken behaviour of the last few
+conditional branches. As a result the predictions for any specific
+branch depend both on its own history and the behaviour of previous
+branches. This is a standard technique for improving prediction
+accuracy.</para>
+
+<para>For indirect branches (that is, jumps to unknown destinations)
+Cachegrind uses a simple branch target address predictor. Targets are
+predicted using an array of 512 entries indexed by the low order 9
+bits of the branch instruction's address. Each branch is predicted to
+jump to the same address it did last time. Any other behaviour causes
+a mispredict.</para>
+
+<para>More recent processors have better branch predictors, in
+particular better indirect branch predictors. Cachegrind's predictor
+design is deliberately conservative so as to be representative of the
+large installed base of processors which pre-date widespread
+deployment of more sophisticated indirect branch predictors. In
+particular, late model Pentium 4s (Prescott), Pentium M, Core and Core
+2 have more sophisticated indirect branch predictors than modelled by
+Cachegrind. </para>
+
+<para>Cachegrind does not simulate a return stack predictor. It
+assumes that processors perfectly predict function return addresses,
+an assumption which is probably close to being true.</para>
+
+<para>See Hennessy and Patterson's classic text "Computer
+Architecture: A Quantitative Approach", 4th edition (2007), Section
+2.3 (pages 80-89) for background on modern branch predictors.</para>
+
+</sect2>
+
+
</sect1>
@@ -377,6 +443,31 @@
</listitem>
</varlistentry>
+ <varlistentry id="opt.cache-sim" xreflabel="--cache-sim">
+ <term>
+ <option><![CDATA[--cache-sim=no|yes [yes] ]]></option>
+ </term>
+ <listitem>
+ <para>Enables or disables collection of cache access and miss
+ counts.</para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry id="opt.branch-sim" xreflabel="--branch-sim">
+ <term>
+ <option><![CDATA[--branch-sim=no|yes [no] ]]></option>
+ </term>
+ <listitem>
+ <para>Enables or disables collection of branch instruction and
+ misprediction counts. By default this is disabled as it
+ slows Cachegrind down by approximately 25%. Note that you
+ cannot specify <computeroutput>--cache-sim=no</computeroutput>
+ and <computeroutput>--branch-sim=no</computeroutput>
+ together, as that would leave Cachegrind with no
+ information to collect.</para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
<!-- end of xi:include in the manpage -->
@@ -495,6 +586,22 @@
<para><computeroutput>D2mw</computeroutput>: L2 cache data
write misses</para>
</listitem>
+ <listitem>
+ <para><computeroutput>Bc</computeroutput>: Conditional branches
+ executed</para>
+ </listitem>
+ <listitem>
+ <para><computeroutput>Bcm</computeroutput>: Conditional branches
+ mispredicted</para>
+ </listitem>
+ <listitem>
+ <para><computeroutput>Bi</computeroutput>: Indirect branches
+ executed</para>
+ </listitem>
+ <listitem>
+    <para><computeroutput>Bim</computeroutput>: Indirect branches
+    mispredicted</para>
+ </listitem>
</itemizedlist>
<para>Note that D1 total accesses is given by
Modified: trunk/cachegrind/tests/filter_stderr
===================================================================
--- trunk/cachegrind/tests/filter_stderr 2007-05-06 13:22:23 UTC (rev 6732)
+++ trunk/cachegrind/tests/filter_stderr 2007-05-08 09:20:25 UTC (rev 6733)
@@ -5,7 +5,7 @@
$dir/../../tests/filter_stderr_basic |
# Remove "Cachegrind, ..." line and the following copyright line.
-sed "/^Cachegrind, an I1\/D1\/L2 cache profiler./ , /./ d" |
+sed "/^Cachegrind, a cache and branch-prediction profiler./ , /./ d" |
# Remove numbers from I/D/L2 "refs:" lines
sed "s/\(\(I\|D\|L2\) *refs:\)[ 0-9,()+rdw]*$/\1/" |
|
|
From: Tom H. <th...@cy...> - 2007-05-08 02:24:06
|
Nightly build on dellow ( x86_64, Fedora Core 6 ) started at 2007-05-08 03:10:04 BST Results differ from 24 hours ago Checking out valgrind source tree ... done Configuring valgrind ... done Building valgrind ... done Running regression tests ... failed Regression test results follow == 292 tests, 4 stderr failures, 2 stdout failures, 0 posttest failures == memcheck/tests/pointer-trace (stderr) memcheck/tests/x86/scalar (stderr) memcheck/tests/xml1 (stderr) none/tests/mremap (stderr) none/tests/mremap2 (stdout) none/tests/pth_detached (stdout) ================================================= == Results from 24 hours ago == ================================================= Checking out valgrind source tree ... done Configuring valgrind ... done Building valgrind ... done Running regression tests ... failed Regression test results follow == 292 tests, 4 stderr failures, 1 stdout failure, 0 posttest failures == memcheck/tests/pointer-trace (stderr) memcheck/tests/x86/scalar (stderr) memcheck/tests/xml1 (stderr) none/tests/mremap (stderr) none/tests/mremap2 (stdout) ================================================= == Difference between 24 hours ago and now == ================================================= *** old.short Tue May 8 03:16:51 2007 --- new.short Tue May 8 03:23:59 2007 *************** *** 8,10 **** ! == 292 tests, 4 stderr failures, 1 stdout failure, 0 posttest failures == memcheck/tests/pointer-trace (stderr) --- 8,10 ---- ! == 292 tests, 4 stderr failures, 2 stdout failures, 0 posttest failures == memcheck/tests/pointer-trace (stderr) *************** *** 14,15 **** --- 14,16 ---- none/tests/mremap2 (stdout) + none/tests/pth_detached (stdout) |
|
From: Tom H. <th...@cy...> - 2007-05-08 02:23:31
|
Nightly build on gill ( x86_64, Fedora Core 2 ) started at 2007-05-08 03:00:06 BST Results unchanged from 24 hours ago Checking out valgrind source tree ... done Configuring valgrind ... done Building valgrind ... done Running regression tests ... failed Regression test results follow == 294 tests, 6 stderr failures, 1 stdout failure, 0 posttest failures == memcheck/tests/pointer-trace (stderr) memcheck/tests/stack_switch (stderr) memcheck/tests/x86/scalar (stderr) memcheck/tests/x86/scalar_supp (stderr) none/tests/fdleak_fcntl (stderr) none/tests/mremap (stderr) none/tests/mremap2 (stdout) |
|
From: Tom H. <th...@cy...> - 2007-05-08 02:17:33
|
Nightly build on lloyd ( x86_64, Fedora Core 3 ) started at 2007-05-08 03:05:03 BST Results unchanged from 24 hours ago Checking out valgrind source tree ... done Configuring valgrind ... done Building valgrind ... done Running regression tests ... failed Regression test results follow == 292 tests, 6 stderr failures, 1 stdout failure, 0 posttest failures == memcheck/tests/pointer-trace (stderr) memcheck/tests/stack_switch (stderr) memcheck/tests/x86/scalar (stderr) memcheck/tests/x86/scalar_supp (stderr) memcheck/tests/xml1 (stderr) none/tests/mremap (stderr) none/tests/mremap2 (stdout) |
|
From: <js...@ac...> - 2007-05-08 00:00:55
|
Nightly build on g5 ( SuSE 10.1, ppc970 ) started at 2007-05-08 02:00:01 CEST Results differ from 24 hours ago Checking out valgrind source tree ... done Configuring valgrind ... done Building valgrind ... done Running regression tests ... failed Regression test results follow == 226 tests, 6 stderr failures, 2 stdout failures, 0 posttest failures == memcheck/tests/deep_templates (stdout) memcheck/tests/leak-cycle (stderr) memcheck/tests/leak-tree (stderr) memcheck/tests/pointer-trace (stderr) none/tests/faultstatus (stderr) none/tests/fdleak_cmsg (stderr) none/tests/mremap (stderr) none/tests/mremap2 (stdout) ================================================= == Results from 24 hours ago == ================================================= Checking out valgrind source tree ... done Configuring valgrind ... done Building valgrind ... done Running regression tests ... failed Regression test results follow == 226 tests, 6 stderr failures, 3 stdout failures, 0 posttest failures == memcheck/tests/deep_templates (stdout) memcheck/tests/leak-cycle (stderr) memcheck/tests/leak-tree (stderr) memcheck/tests/pointer-trace (stderr) none/tests/faultstatus (stderr) none/tests/fdleak_cmsg (stderr) none/tests/mremap (stderr) none/tests/mremap2 (stdout) none/tests/res_search (stdout) ================================================= == Difference between 24 hours ago and now == ================================================= *** old.short Tue May 8 02:08:57 2007 --- new.short Tue May 8 02:16:58 2007 *************** *** 8,10 **** ! == 226 tests, 6 stderr failures, 3 stdout failures, 0 posttest failures == memcheck/tests/deep_templates (stdout) --- 8,10 ---- ! == 226 tests, 6 stderr failures, 2 stdout failures, 0 posttest failures == memcheck/tests/deep_templates (stdout) *************** *** 17,19 **** none/tests/mremap2 (stdout) - none/tests/res_search (stdout) --- 17,18 ---- |