From: Jeremy F. <je...@go...> - 2002-10-03 04:42:13
|
Hi,

This patch makes FPU state changes lazy, so there should only be one
save/restore pair per basic block. With this change in place,
FPU-intensive programs (in my case, some 3D code using OpenGL) are
significantly faster.

Rather than adding the fplive argument to emitUInstr(), I considered
adding another bit to regs_live_before/after which signifies FP state
liveness. That was a little more invasive, and it wasn't clear whether
I should maintain such a bit in emitUInstr or add the logic to the
register allocator.

    J

Index: coregrind/vg_from_ucode.c
===================================================================
RCS file: /cvsroot/valgrind/valgrind/coregrind/vg_from_ucode.c,v
retrieving revision 1.15
diff -u -r1.15 vg_from_ucode.c
--- coregrind/vg_from_ucode.c 2 Oct 2002 13:26:34 -0000 1.15
+++ coregrind/vg_from_ucode.c 3 Oct 2002 04:38:21 -0000
@@ -1808,18 +1808,14 @@
                                UChar second_byte_masked,
                                Int reg )
 {
-   emit_get_fpu_state();
    emit_fpu_regmem ( first_byte, second_byte_masked, reg );
-   emit_put_fpu_state();
 }
 
 
 static void synth_fpu_no_mem ( UChar first_byte,
                                UChar second_byte )
 {
-   emit_get_fpu_state();
    emit_fpu_no_mem ( first_byte, second_byte );
-   emit_put_fpu_state();
 }
 
 
@@ -1961,7 +1957,7 @@
    return (u->flags_w != FlagsEmpty);
 }
 
-static void emitUInstr ( UCodeBlock* cb, Int i, RRegSet regs_live_before )
+static Bool emitUInstr ( UCodeBlock* cb, Int i, RRegSet regs_live_before, Bool fplive )
 {
    Int old_emitted_code_used;
    UInstr* u = &cb->instrs[i];
@@ -2299,6 +2295,10 @@
       case JMP: {
          vg_assert(u->tag2 == NoValue);
          vg_assert(u->tag1 == RealReg || u->tag1 == Literal);
+         if (fplive) {
+            emit_put_fpu_state();
+            fplive = False;
+         }
          if (u->cond == CondAlways) {
             switch (u->tag1) {
                case RealReg:
@@ -2353,6 +2353,10 @@
          vg_assert(u->size == 0);
          if (readFlagUse ( u ))
             emit_get_eflags();
+         if (fplive) {
+            emit_put_fpu_state();
+            fplive = False;
+         }
          VG_(synth_call) ( False, u->val1 );
          if (writeFlagUse ( u ))
             emit_put_eflags();
@@ -2375,6 +2379,10 @@
          else
             vg_assert(u->tag3 == NoValue);
          vg_assert(u->size == 0);
+         if (fplive) {
+            emit_put_fpu_state();
+            fplive = False;
+         }
          VG_(synth_ccall) ( u->lit32, u->argc, u->regparms_n, argv, tagv,
                             ret_reg, regs_live_before, u->regs_live_after );
          break;
@@ -2397,6 +2405,10 @@
       case FPU_W:
          vg_assert(u->tag1 == Lit16);
          vg_assert(u->tag2 == RealReg);
+         if (!fplive) {
+            emit_get_fpu_state();
+            fplive = True;
+         }
          synth_fpu_regmem ( (u->val1 >> 8) & 0xFF,
                             u->val1 & 0xFF,
                             u->val2 );
@@ -2407,6 +2419,10 @@
          vg_assert(u->tag2 == NoValue);
          if (readFlagUse ( u ))
             emit_get_eflags();
+         if (!fplive) {
+            emit_get_fpu_state();
+            fplive = True;
+         }
          synth_fpu_no_mem ( (u->val1 >> 8) & 0xFF,
                             u->val1 & 0xFF );
          if (writeFlagUse ( u ))
@@ -2430,6 +2446,8 @@
    vg_assert(u->opcode < 100);
    histogram[u->opcode].counts++;
    histogram[u->opcode].size += (emitted_code_used - old_emitted_code_used);
+
+   return fplive;
 }
 
 
@@ -2439,17 +2457,17 @@
 {
    Int i;
    UChar regs_live_before = 0; /* No regs live at BB start */
-   
+   Bool fplive = False; /* FPU state not loaded */
+
    emitted_code_used = 0;
    emitted_code_size = 500; /* reasonable initial size */
    emitted_code = VG_(arena_malloc)(VG_AR_JITTER, emitted_code_size);
 
    if (dis) VG_(printf)("Generated x86 code:\n");
-   
+
    for (i = 0; i < cb->used; i++) {
       UInstr* u = &cb->instrs[i];
       if (cb->instrs[i].opcode != NOP) {
-
          /* Check on the sanity of this insn. */
          Bool sane = VG_(saneUInstr)( False, False, u );
          if (!sane) {
@@ -2457,10 +2475,12 @@
             VG_(up_UInstr)( i, u );
          }
          vg_assert(sane);
-         emitUInstr( cb, i, regs_live_before );
+         fplive = emitUInstr( cb, i, regs_live_before, fplive );
       }
       regs_live_before = u->regs_live_after;
    }
+   vg_assert(!fplive); /* FPU state must be saved by end of BB */
+
    if (dis) VG_(printf)("\n");
 
    /* Returns a pointer to the emitted code. This will have to be
|
From: Jeremy F. <je...@go...> - 2002-10-03 04:48:31
|
On Wed, 2002-10-02 at 21:42, Jeremy Fitzhardinge wrote:
> Hi,
>
> This patch makes FPU state changes lazy, so there should only be one
> save/restore pair per basic block.

Oh, for safety's sake, it should also probably have:

      default:
         if (VG_(needs).extended_UCode) {
+           if (fplive) {
+              emit_put_fpu_state();
+              fplive = False;
+           }
            SK_(emit_XUInstr)(u, regs_live_before);
         } else {
            VG_(printf)("\nError:\n"
                        " unhandled opcode: %u. Perhaps "
                        " VG_(needs).extended_UCode should be set?\n",
                        u->opcode);
            VG_(pp_UInstr)(0,u);
            VG_(core_panic)("emitUInstr: unimplemented opcode");
         }

    J
|