|
From: <sv...@va...> - 2011-05-01 18:47:23
|
Author: sewardj
Date: 2011-05-01 19:47:10 +0100 (Sun, 01 May 2011)
New Revision: 2138
Log:
Improvements to condition code handling on ARM.
(1) guest_arm_spechelper: add another spec rule for
armg_calculate_condition. Add spec rules for
armg_calculate_flag_c and armg_calculate_flag_v.
(2) guest_arm_toIR.c: when storing oldC (shifter carry out) and
oldV values in the thunk, ensure that the top 31 bits
are zero. This improves the effectiveness of the new spec
rules (1) by avoiding getting into situations where we have
Mux0X(c, x, And32(x,1)), where in fact x has bits 31:1 as
zero. iropt can't fold that out. So make sure the spec
rules don't generate any unnecessary And32(x,1); hence the
above becomes Mux0X(c, x, x) which iropt can reduce simply
to "x".
(3) armg_calculate_flags_nzcv: assert condition (2) wherever
possible.
These changes drastically improve --tool=none performance in some
cases (eg, perf/fbench with softfloat is doubled in speed).
Modified:
trunk/priv/guest_arm_defs.h
trunk/priv/guest_arm_helpers.c
trunk/priv/guest_arm_toIR.c
Modified: trunk/priv/guest_arm_defs.h
===================================================================
--- trunk/priv/guest_arm_defs.h 2011-05-01 18:36:51 UTC (rev 2137)
+++ trunk/priv/guest_arm_defs.h 2011-05-01 18:47:10 UTC (rev 2138)
@@ -148,6 +148,10 @@
that the definedness of the stored flags always depends on
all 3 DEP values.
+ Fields carrying only 1 or 2 bits of useful information (old_C,
+ shifter_co, old_V, oldC:oldV) must have their top 31 or 30 bits
+ (respectively) zero. The text "31x0:" or "30x0:" denotes this.
+
A summary of the field usages is:
OP DEP1 DEP2 DEP3
@@ -156,11 +160,11 @@
OP_COPY current NZCV unused unused
OP_ADD argL argR unused
OP_SUB argL argR unused
- OP_ADC argL argR old_C
- OP_SBB argL argR old_C
- OP_LOGIC result shifter_co old_V
- OP_MUL result unused old_C:old_V
- OP_MULL resLO32 resHI32 old_C:old_V
+ OP_ADC argL argR 31x0:old_C
+ OP_SBB argL argR 31x0:old_C
+ OP_LOGIC result 31x0:shifter_co 31x0:old_V
+ OP_MUL result unused 30x0:old_C:old_V
+ OP_MULL resLO32 resHI32 30x0:old_C:old_V
*/
enum {
Modified: trunk/priv/guest_arm_helpers.c
===================================================================
--- trunk/priv/guest_arm_helpers.c 2011-05-01 18:36:51 UTC (rev 2137)
+++ trunk/priv/guest_arm_helpers.c 2011-05-01 18:47:10 UTC (rev 2138)
@@ -110,6 +110,7 @@
UInt argL = cc_dep1;
UInt argR = cc_dep2;
UInt oldC = cc_dep3;
+ vassert((oldC & ~1) == 0);
UInt res = (argL + argR) + oldC;
UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
@@ -127,6 +128,7 @@
UInt argL = cc_dep1;
UInt argR = cc_dep2;
UInt oldC = cc_dep3;
+ vassert((oldC & ~1) == 0);
UInt res = argL - argR - (oldC ^ 1);
UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
@@ -143,11 +145,13 @@
/* (res, shco, oldV) */
UInt res = cc_dep1;
UInt shco = cc_dep2;
+ vassert((shco & ~1) == 0);
UInt oldV = cc_dep3;
+ vassert((oldV & ~1) == 0);
UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
- UInt cf = lshift( shco & 1, ARMG_CC_SHIFT_C );
- UInt vf = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+ UInt cf = lshift( shco, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( oldV, ARMG_CC_SHIFT_V );
return nf | zf | cf | vf;
}
case ARMG_CC_OP_MUL: {
@@ -155,10 +159,11 @@
UInt res = cc_dep1;
UInt oldC = (cc_dep3 >> 1) & 1;
UInt oldV = (cc_dep3 >> 0) & 1;
+ vassert((cc_dep3 & ~3) == 0);
UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
- UInt cf = lshift( oldC & 1, ARMG_CC_SHIFT_C );
- UInt vf = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+ UInt cf = lshift( oldC, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( oldV, ARMG_CC_SHIFT_V );
return nf | zf | cf | vf;
}
case ARMG_CC_OP_MULL: {
@@ -167,10 +172,11 @@
UInt resHi32 = cc_dep2;
UInt oldC = (cc_dep3 >> 1) & 1;
UInt oldV = (cc_dep3 >> 0) & 1;
+ vassert((cc_dep3 & ~3) == 0);
UInt nf = lshift( resHi32 & (1<<31), ARMG_CC_SHIFT_N - 31 );
UInt zf = lshift( (resHi32|resLo32) == 0, ARMG_CC_SHIFT_Z );
- UInt cf = lshift( oldC & 1, ARMG_CC_SHIFT_C );
- UInt vf = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+ UInt cf = lshift( oldC, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( oldV, ARMG_CC_SHIFT_V );
return nf | zf | cf | vf;
}
default:
@@ -185,7 +191,7 @@
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate the C flag from the thunk components, in the lowest bit
- of the word (bit 0). */
+ of the word (bit 0). Bits 31:1 of the returned value are zero. */
UInt armg_calculate_flag_c ( UInt cc_op, UInt cc_dep1,
UInt cc_dep2, UInt cc_dep3 )
{
@@ -196,7 +202,7 @@
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate the V flag from the thunk components, in the lowest bit
- of the word (bit 0). */
+ of the word (bit 0). Bits 31:1 of the returned value are zero. */
UInt armg_calculate_flag_v ( UInt cc_op, UInt cc_dep1,
UInt cc_dep2, UInt cc_dep3 )
{
@@ -221,7 +227,7 @@
/* Calculate the specified condition from the thunk components, in the
lowest bit of the word (bit 0). */
extern
-UInt armg_calculate_condition ( UInt cond_n_op /* ARMCondcode << 4 | cc_op */,
+UInt armg_calculate_condition ( UInt cond_n_op /* (ARMCondcode << 4) | cc_op */,
UInt cc_dep1,
UInt cc_dep2, UInt cc_dep3 )
{
@@ -332,12 +338,17 @@
/* --------- specialising "armg_calculate_condition" --------- */
if (vex_streq(function_name, "armg_calculate_condition")) {
- /* specialise calls to above "armg_calculate condition" function */
- IRExpr *cond_n_op, *cc_dep1, *cc_dep2;
+
+ /* specialise calls to the "armg_calculate_condition" function.
+ Not sure whether this is strictly necessary, but: the
+ replacement IR must produce only the values 0 or 1. Bits
+ 31:1 are required to be zero. */
+ IRExpr *cond_n_op, *cc_dep1, *cc_dep2, *cc_ndep;
vassert(arity == 4);
- cond_n_op = args[0]; /* ARMCondcode << 4 | ARMG_CC_OP_* */
+ cond_n_op = args[0]; /* (ARMCondcode << 4) | ARMG_CC_OP_* */
cc_dep1 = args[1];
cc_dep2 = args[2];
+ cc_ndep = args[3];
/*---------------- SUB ----------------*/
@@ -384,7 +395,27 @@
binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
}
+ /*---------------- SBB ----------------*/
+
+ if (isU32(cond_n_op, (ARMCondHS << 4) | ARMG_CC_OP_SBB)) {
+ /* This seems to happen a lot in softfloat code, eg __divdf3+140 */
+ /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+ /* HS after SBB (same as C after SBB below)
+ --> oldC ? (argL >=u argR) : (argL >u argR)
+ --> oldC ? (argR <=u argL) : (argR <u argL)
+ */
+ return
+ IRExpr_Mux0X(
+ unop(Iop_32to8, cc_ndep),
+ /* case oldC == 0 */
+ unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1)),
+ /* case oldC != 0 */
+ unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1))
+ );
+ }
+
/*---------------- LOGIC ----------------*/
+
if (isU32(cond_n_op, (ARMCondEQ << 4) | ARMG_CC_OP_LOGIC)) {
/* EQ after LOGIC --> test res == 0 */
return unop(Iop_1Uto32,
@@ -397,6 +428,7 @@
}
/*----------------- AL -----------------*/
+
/* A critically important case for Thumb code.
What we're trying to spot is the case where cond_n_op is an
@@ -443,6 +475,107 @@
}
}
+ /* --------- specialising "armg_calculate_flag_c" --------- */
+
+ else
+ if (vex_streq(function_name, "armg_calculate_flag_c")) {
+
+ /* specialise calls to the "armg_calculate_flag_c" function.
+ Note that the returned value must be either 0 or 1; nonzero
+ bits 31:1 are not allowed. In turn, incoming oldV and oldC
+ values (from the thunk) are assumed to have bits 31:1
+ clear. */
+ IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+ vassert(arity == 4);
+ cc_op = args[0]; /* ARMG_CC_OP_* */
+ cc_dep1 = args[1];
+ cc_dep2 = args[2];
+ cc_ndep = args[3];
+
+ if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
+ /* Thunk args are (result, shco, oldV) */
+ /* C after LOGIC --> shco */
+ return cc_dep2;
+ }
+
+ if (isU32(cc_op, ARMG_CC_OP_SUB)) {
+ /* Thunk args are (argL, argR, unused) */
+ /* C after SUB --> argL >=u argR
+ --> argR <=u argL */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLE32U, cc_dep2, cc_dep1));
+ }
+
+ if (isU32(cc_op, ARMG_CC_OP_SBB)) {
+ /* This happens occasionally in softfloat code, eg __divdf3+140 */
+ /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+ /* C after SBB (same as HS after SBB above)
+ --> oldC ? (argL >=u argR) : (argL >u argR)
+ --> oldC ? (argR <=u argL) : (argR <u argL)
+ */
+ return
+ IRExpr_Mux0X(
+ unop(Iop_32to8, cc_ndep),
+ /* case oldC == 0 */
+ unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1)),
+ /* case oldC != 0 */
+ unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1))
+ );
+ }
+
+ }
+
+ /* --------- specialising "armg_calculate_flag_v" --------- */
+
+ else
+ if (vex_streq(function_name, "armg_calculate_flag_v")) {
+
+ /* specialise calls to the "armg_calculate_flag_v" function.
+ Note that the returned value must be either 0 or 1; nonzero
+ bits 31:1 are not allowed. In turn, incoming oldV and oldC
+ values (from the thunk) are assumed to have bits 31:1
+ clear. */
+ IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+ vassert(arity == 4);
+ cc_op = args[0]; /* ARMG_CC_OP_* */
+ cc_dep1 = args[1];
+ cc_dep2 = args[2];
+ cc_ndep = args[3];
+
+ if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
+ /* Thunk args are (result, shco, oldV) */
+ /* V after LOGIC --> oldV */
+ return cc_ndep;
+ }
+
+ if (isU32(cc_op, ARMG_CC_OP_SBB)) {
+ /* This happens occasionally in softfloat code, eg __divdf3+140 */
+ /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
+ /* V after SBB
+ --> let res = argL - argR - (oldC ^ 1)
+ in (argL ^ argR) & (argL ^ res) & 1
+ */
+ return
+ binop(
+ Iop_And32,
+ binop(
+ Iop_And32,
+ // argL ^ argR
+ binop(Iop_Xor32, cc_dep1, cc_dep2),
+ // argL ^ (argL - argR - (oldC ^ 1))
+ binop(Iop_Xor32,
+ cc_dep1,
+ binop(Iop_Sub32,
+ binop(Iop_Sub32, cc_dep1, cc_dep2),
+ binop(Iop_Xor32, cc_ndep, mkU32(1)))
+ )
+ ),
+ mkU32(1)
+ );
+ }
+
+ }
+
# undef unop
# undef binop
# undef mkU32
Modified: trunk/priv/guest_arm_toIR.c
===================================================================
--- trunk/priv/guest_arm_toIR.c 2011-05-01 18:36:51 UTC (rev 2137)
+++ trunk/priv/guest_arm_toIR.c 2011-05-01 18:47:10 UTC (rev 2138)
@@ -1088,14 +1088,13 @@
static IRExpr* mk_armg_calculate_condition_dyn ( IRExpr* cond )
{
vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I32);
- /* And 'cond' had better produce a value in which only bits 7:4
- bits are nonzero. However, obviously we can't assert for
- that. */
+ /* And 'cond' had better produce a value in which only bits 7:4 are
+ nonzero. However, obviously we can't assert for that. */
/* So what we're constructing for the first argument is
- "(cond << 4) | stored-operation-operation". However,
- as per comments above, must be supplied pre-shifted to this
- function.
+ "(cond << 4) | stored-operation".
+ However, as per comments above, 'cond' must be supplied
+ pre-shifted to this function.
This pairing scheme requires that the ARM_CC_OP_ values all fit
in 4 bits. Hence we are passing a (COND, OP) pair in the lowest
@@ -1700,6 +1699,12 @@
The calling convention for res and newC is a bit funny. They could
be passed by value, but instead are passed by ref.
+
+ The C (shco) value computed must be zero in bits 31:1, as the IR
+ optimisations for flag handling (guest_arm_spechelper) rely on
+ that, and the slow-path handlers (armg_calculate_flags_nzcv) assert
+ for it. Same applies to all these functions that compute shco
+ after a shift or rotate, not just this one.
*/
static void compute_result_and_C_after_LSL_by_imm5 (
@@ -1751,7 +1756,7 @@
/* mux0X(amt == 0,
mux0X(amt < 32,
0,
- Rm[(32-amt) & 31])
+ Rm[(32-amt) & 31]),
oldC)
*/
/* About the best you can do is pray that iropt is able
@@ -1767,16 +1772,19 @@
unop(Iop_1Uto8,
binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
mkU32(0),
- binop(Iop_Shr32,
- mkexpr(rMt),
- unop(Iop_32to8,
- binop(Iop_And32,
- binop(Iop_Sub32,
- mkU32(32),
- mkexpr(amtT)),
- mkU32(31)
- )
- )
+ binop(Iop_And32,
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Sub32,
+ mkU32(32),
+ mkexpr(amtT)),
+ mkU32(31)
+ )
+ )
+ ),
+ mkU32(1)
)
),
mkexpr(oldC)
@@ -1862,7 +1870,7 @@
/* mux0X(amt == 0,
mux0X(amt < 32,
0,
- Rm[(amt-1) & 31])
+ Rm[(amt-1) & 31]),
oldC)
*/
IRTemp oldC = newTemp(Ity_I32);
@@ -1876,16 +1884,19 @@
unop(Iop_1Uto8,
binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
mkU32(0),
- binop(Iop_Shr32,
- mkexpr(rMt),
- unop(Iop_32to8,
- binop(Iop_And32,
- binop(Iop_Sub32,
- mkexpr(amtT),
- mkU32(1)),
- mkU32(31)
- )
- )
+ binop(Iop_And32,
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Sub32,
+ mkexpr(amtT),
+ mkU32(1)),
+ mkU32(31)
+ )
+ )
+ ),
+ mkU32(1)
)
),
mkexpr(oldC)
@@ -1984,20 +1995,26 @@
IRExpr_Mux0X(
unop(Iop_1Uto8,
binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
- binop(Iop_Shr32,
- mkexpr(rMt),
- mkU8(31)
+ binop(Iop_And32,
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ mkU8(31)
+ ),
+ mkU32(1)
),
- binop(Iop_Shr32,
- mkexpr(rMt),
- unop(Iop_32to8,
- binop(Iop_And32,
- binop(Iop_Sub32,
- mkexpr(amtT),
- mkU32(1)),
- mkU32(31)
- )
- )
+ binop(Iop_And32,
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Sub32,
+ mkexpr(amtT),
+ mkU32(1)),
+ mkU32(31)
+ )
+ )
+ ),
+ mkU32(1)
)
),
mkexpr(oldC)
|