|
From: Jeremy F. <je...@go...> - 2002-10-03 04:42:13
|
Hi,
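This patch makes FPU state changes lazy: the FPU state is loaded only
before the first FPU instruction that needs it, and is saved back only
before a jump or an emitted call. As a result there should usually be
only one save/restore pair per basic block, rather than one pair around
every FPU instruction. With this change in place, FPU-intensive programs
(in my case, some 3D code using OpenGL) are significantly faster.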
Rather than adding the fplive argument to emitUInstr(), I considered
adding another bit to regs_live_before/after to signify FP state
liveness. That would have been a little more invasive, and it wasn't
clear whether I should maintain such a bit in emitUInstr() or add the
logic to the register allocator.
J
Index: coregrind/vg_from_ucode.c
===================================================================
RCS file: /cvsroot/valgrind/valgrind/coregrind/vg_from_ucode.c,v
retrieving revision 1.15
diff -u -r1.15 vg_from_ucode.c
--- coregrind/vg_from_ucode.c 2 Oct 2002 13:26:34 -0000 1.15
+++ coregrind/vg_from_ucode.c 3 Oct 2002 04:38:21 -0000
@@ -1808,18 +1808,14 @@
UChar second_byte_masked,
Int reg )
{
- emit_get_fpu_state();
emit_fpu_regmem ( first_byte, second_byte_masked, reg );
- emit_put_fpu_state();
}
static void synth_fpu_no_mem ( UChar first_byte,
UChar second_byte )
{
- emit_get_fpu_state();
emit_fpu_no_mem ( first_byte, second_byte );
- emit_put_fpu_state();
}
@@ -1961,7 +1957,7 @@
return (u->flags_w != FlagsEmpty);
}
-static void emitUInstr ( UCodeBlock* cb, Int i, RRegSet regs_live_before )
+static Bool emitUInstr ( UCodeBlock* cb, Int i, RRegSet regs_live_before, Bool fplive )
{
Int old_emitted_code_used;
UInstr* u = &cb->instrs[i];
@@ -2299,6 +2295,10 @@
case JMP: {
vg_assert(u->tag2 == NoValue);
vg_assert(u->tag1 == RealReg || u->tag1 == Literal);
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
if (u->cond == CondAlways) {
switch (u->tag1) {
case RealReg:
@@ -2353,6 +2353,10 @@
vg_assert(u->size == 0);
if (readFlagUse ( u ))
emit_get_eflags();
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
VG_(synth_call) ( False, u->val1 );
if (writeFlagUse ( u ))
emit_put_eflags();
@@ -2375,6 +2379,10 @@
else vg_assert(u->tag3 == NoValue);
vg_assert(u->size == 0);
+ if (fplive) {
+ emit_put_fpu_state();
+ fplive = False;
+ }
VG_(synth_ccall) ( u->lit32, u->argc, u->regparms_n, argv, tagv,
ret_reg, regs_live_before, u->regs_live_after );
break;
@@ -2397,6 +2405,10 @@
case FPU_W:
vg_assert(u->tag1 == Lit16);
vg_assert(u->tag2 == RealReg);
+ if (!fplive) {
+ emit_get_fpu_state();
+ fplive = True;
+ }
synth_fpu_regmem ( (u->val1 >> 8) & 0xFF,
u->val1 & 0xFF,
u->val2 );
@@ -2407,6 +2419,10 @@
vg_assert(u->tag2 == NoValue);
if (readFlagUse ( u ))
emit_get_eflags();
+ if (!fplive) {
+ emit_get_fpu_state();
+ fplive = True;
+ }
synth_fpu_no_mem ( (u->val1 >> 8) & 0xFF,
u->val1 & 0xFF );
if (writeFlagUse ( u ))
@@ -2430,6 +2446,8 @@
vg_assert(u->opcode < 100);
histogram[u->opcode].counts++;
histogram[u->opcode].size += (emitted_code_used - old_emitted_code_used);
+
+ return fplive;
}
@@ -2439,17 +2457,17 @@
{
Int i;
UChar regs_live_before = 0; /* No regs live at BB start */
-
+ Bool fplive = False; /* FPU state not loaded */
+
emitted_code_used = 0;
emitted_code_size = 500; /* reasonable initial size */
emitted_code = VG_(arena_malloc)(VG_AR_JITTER, emitted_code_size);
if (dis) VG_(printf)("Generated x86 code:\n");
-
+
for (i = 0; i < cb->used; i++) {
UInstr* u = &cb->instrs[i];
if (cb->instrs[i].opcode != NOP) {
-
/* Check on the sanity of this insn. */
Bool sane = VG_(saneUInstr)( False, False, u );
if (!sane) {
@@ -2457,10 +2475,12 @@
VG_(up_UInstr)( i, u );
}
vg_assert(sane);
- emitUInstr( cb, i, regs_live_before );
+ fplive = emitUInstr( cb, i, regs_live_before, fplive );
}
regs_live_before = u->regs_live_after;
}
+ vg_assert(!fplive); /* FPU state must be saved by end of BB */
+
if (dis) VG_(printf)("\n");
/* Returns a pointer to the emitted code. This will have to be
|