|
From: <sv...@va...> - 2007-03-25 04:15:02
|
Author: sewardj
Date: 2007-03-25 05:14:58 +0100 (Sun, 25 Mar 2007)
New Revision: 1744
Log:
x86 back end: use 80-bit loads/stores for floating point spills rather
than 64-bit ones, to reduce accuracy loss. To support this, in
reg-alloc, allocate 2 64-bit spill slots for each HRcFlt64 vreg
instead of just 1.
Modified:
trunk/priv/host-generic/h_generic_regs.h
trunk/priv/host-generic/reg_alloc2.c
trunk/priv/host-x86/hdefs.c
Modified: trunk/priv/host-generic/h_generic_regs.h
===================================================================
--- trunk/priv/host-generic/h_generic_regs.h 2007-03-21 00:21:56 UTC (rev 1743)
+++ trunk/priv/host-generic/h_generic_regs.h 2007-03-25 04:14:58 UTC (rev 1744)
@@ -87,10 +87,17 @@
available on any specific host. For example on x86, the available
classes are: Int32, Flt64, Vec128 only.
- IMPORTANT NOTE: Vec128 is the only >= 128-bit-sized class, and
- reg_alloc2.c handles it specially when assigning spill slots. If
- you add another 128-bit or larger regclass, you must remember to
- update reg_alloc2.c accordingly.
+ IMPORTANT NOTE: reg_alloc2.c needs to know how much space is needed to spill
+ each class of register. It has the following knowledge hardwired in:
+
+ HRcInt32 32 bits
+ HRcInt64 64 bits
+ HRcFlt64 80 bits (on x86 these are spilled by fstpt/fldt)
+ HRcVec64 64 bits
+ HRcVec128 128 bits
+
+ If you add another regclass, you must remember to update
+ reg_alloc2.c accordingly.
*/
typedef
enum {
Modified: trunk/priv/host-generic/reg_alloc2.c
===================================================================
--- trunk/priv/host-generic/reg_alloc2.c 2007-03-21 00:21:56 UTC (rev 1743)
+++ trunk/priv/host-generic/reg_alloc2.c 2007-03-25 04:14:58 UTC (rev 1744)
@@ -778,8 +778,9 @@
/* --------- Stage 3: allocate spill slots. --------- */
- /* Each spill slot is 8 bytes long. For 128-bit vregs
- we have to allocate two spill slots.
+ /* Each spill slot is 8 bytes long. For vregs which take more than
+ 64 bits to spill (classes Flt64 and Vec128), we have to allocate
+ two spill slots.
Do a rank-based allocation of vregs to spill slot numbers. We
put as few values as possible in spill slows, but nevertheless
@@ -799,14 +800,31 @@
continue;
}
- /* The spill slots are 64 bits in size. That means, to spill a
- Vec128-class vreg, we'll need to find two adjacent spill
- slots to use. Note, this special-casing needs to happen for
- all 128-bit sized register classes. Currently though
- HRcVector is the only such class. */
+ /* The spill slots are 64 bits in size. As per the comment on
+ definition of HRegClass in h_generic_regs.h, that means, to
+ spill a vreg of class Flt64 or Vec128, we'll need to find two
+ adjacent spill slots to use. Note, this logic needs to be kept
+ in sync with the size info on the definition of HRegClass. */
- if (vreg_lrs[j].reg_class != HRcVec128) {
+ if (vreg_lrs[j].reg_class == HRcVec128
+ || vreg_lrs[j].reg_class == HRcFlt64) {
+ /* Find two adjacent free slots which between them provide
+ up to 128 bits in which to spill the vreg. */
+
+ for (k = 0; k < N_SPILL64S-1; k++)
+ if (ss_busy_until_before[k] <= vreg_lrs[j].live_after
+ && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after)
+ break;
+ if (k == N_SPILL64S-1) {
+ vpanic("LibVEX_N_SPILL_BYTES is too low. "
+ "Increase and recompile.");
+ }
+ ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
+ ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
+
+ } else {
+
/* The ordinary case -- just find a single spill slot. */
/* Find the lowest-numbered spill slot which is available at
@@ -821,22 +839,6 @@
}
ss_busy_until_before[k] = vreg_lrs[j].dead_before;
- } else {
-
- /* Find two adjacent free slots in which to spill a 128-bit
- vreg. */
-
- for (k = 0; k < N_SPILL64S-1; k++)
- if (ss_busy_until_before[k] <= vreg_lrs[j].live_after
- && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after)
- break;
- if (k == N_SPILL64S-1) {
- vpanic("LibVEX_N_SPILL_BYTES is too low. "
- "Increase and recompile.");
- }
- ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
- ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
-
}
/* This reflects LibVEX's hard-wired knowledge of the baseBlock
Modified: trunk/priv/host-x86/hdefs.c
===================================================================
--- trunk/priv/host-x86/hdefs.c 2007-03-21 00:21:56 UTC (rev 1743)
+++ trunk/priv/host-x86/hdefs.c 2007-03-25 04:14:58 UTC (rev 1744)
@@ -737,7 +737,7 @@
i->Xin.FpLdSt.sz = sz;
i->Xin.FpLdSt.reg = reg;
i->Xin.FpLdSt.addr = addr;
- vassert(sz == 4 || sz == 8);
+ vassert(sz == 4 || sz == 8 || sz == 10);
return i;
}
X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,
@@ -1005,12 +1005,14 @@
break;
case Xin_FpLdSt:
if (i->Xin.FpLdSt.isLoad) {
- vex_printf("gld%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
+ vex_printf("gld%c " , i->Xin.FpLdSt.sz==10 ? 'T'
+ : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
ppX86AMode(i->Xin.FpLdSt.addr);
vex_printf(", ");
ppHRegX86(i->Xin.FpLdSt.reg);
} else {
- vex_printf("gst%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
+ vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T'
+ : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
ppHRegX86(i->Xin.FpLdSt.reg);
vex_printf(", ");
ppX86AMode(i->Xin.FpLdSt.addr);
@@ -1558,7 +1560,7 @@
case HRcInt32:
return X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
case HRcFlt64:
- return X86Instr_FpLdSt ( False/*store*/, 8, rreg, am );
+ return X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
case HRcVec128:
return X86Instr_SseLdSt ( False/*store*/, rreg, am );
default:
@@ -1578,7 +1580,7 @@
case HRcInt32:
return X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
case HRcFlt64:
- return X86Instr_FpLdSt ( True/*load*/, 8, rreg, am );
+ return X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
case HRcVec128:
return X86Instr_SseLdSt ( True/*load*/, rreg, am );
default:
@@ -2497,14 +2499,27 @@
goto done;
case Xin_FpLdSt:
- vassert(i->Xin.FpLdSt.sz == 4 || i->Xin.FpLdSt.sz == 8);
if (i->Xin.FpLdSt.isLoad) {
/* Load from memory into %fakeN.
- --> ffree %st(7) ; fld{s/l} amode ; fstp st(N+1)
+ --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1)
*/
p = do_ffree_st7(p);
- *p++ = toUChar(i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD);
- p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+ switch (i->Xin.FpLdSt.sz) {
+ case 4:
+ *p++ = 0xD9;
+ p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ case 8:
+ *p++ = 0xDD;
+ p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ case 10:
+ *p++ = 0xDB;
+ p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ default:
+ vpanic("emitX86Instr(FpLdSt,load)");
+ }
p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
goto done;
} else {
@@ -2513,8 +2528,22 @@
*/
p = do_ffree_st7(p);
p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
- *p++ = toUChar(i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD);
- p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+ switch (i->Xin.FpLdSt.sz) {
+ case 4:
+ *p++ = 0xD9;
+ p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ case 8:
+ *p++ = 0xDD;
+ p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ case 10:
+ *p++ = 0xDB;
+ p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ default:
+ vpanic("emitX86Instr(FpLdSt,store)");
+ }
goto done;
}
break;
|