|
From: <sv...@va...> - 2007-01-10 04:59:37
|
Author: sewardj
Date: 2007-01-10 04:59:33 +0000 (Wed, 10 Jan 2007)
New Revision: 1722
Log:
Implement FXSAVE on amd64. Mysteriously my Athlon64 does not seem to
write all the fields that the AMD documentation says it should: it
skips FOP, RIP and RDP, so vex's implementation writes zeroes there.
Modified:
trunk/priv/guest-amd64/gdefs.h
trunk/priv/guest-amd64/ghelpers.c
trunk/priv/guest-amd64/toIR.c
Modified: trunk/priv/guest-amd64/gdefs.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/priv/guest-amd64/gdefs.h 2007-01-09 15:20:07 UTC (rev 1721)
+++ trunk/priv/guest-amd64/gdefs.h 2007-01-10 04:59:33 UTC (rev 1722)
@@ -150,6 +150,8 @@
=20
extern void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* );
=20
+extern void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State*, HWord );
+
extern ULong amd64g_dirtyhelper_RDTSC ( void );
=20
//extern void amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* );
@@ -166,7 +168,6 @@
//extern VexEmWarn=20
// amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord );
=20
-//extern void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State*, HWord );
=20
=20
/*---------------------------------------------------------*/
Modified: trunk/priv/guest-amd64/ghelpers.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/priv/guest-amd64/ghelpers.c 2007-01-09 15:20:07 UTC (rev 1721)
+++ trunk/priv/guest-amd64/ghelpers.c 2007-01-10 04:59:33 UTC (rev 1722)
@@ -1391,6 +1391,162 @@
}
=20
=20
+/* Create an x87 FPU state from the guest state, as close as
+ we can approximate it. */
+static
+void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
+ /*OUT*/UChar* x87_state )
+{
+ Int i, stno, preg;
+ UInt tagw;
+ ULong* vexRegs =3D (ULong*)(&vex_state->guest_FPREG[0]);
+ UChar* vexTags =3D (UChar*)(&vex_state->guest_FPTAG[0]);
+ Fpu_State* x87 =3D (Fpu_State*)x87_state;
+ UInt ftop =3D vex_state->guest_FTOP;
+ UInt c3210 =3D vex_state->guest_FC3210;
+
+ for (i =3D 0; i < 14; i++)
+ x87->env[i] =3D 0;
+
+ x87->env[1] =3D x87->env[3] =3D x87->env[5] =3D x87->env[13] =3D 0xFF=
FF;
+ x87->env[FP_ENV_STAT]=20
+ =3D toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
+ x87->env[FP_ENV_CTRL]=20
+ =3D toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
+
+ /* Dump the register stack in ST order. */
+ tagw =3D 0;
+ for (stno =3D 0; stno < 8; stno++) {
+ preg =3D (stno + ftop) & 7;
+ if (vexTags[preg] =3D=3D 0) {
+ /* register is empty */
+ tagw |=3D (3 << (2*preg));
+ convert_f64le_to_f80le( (UChar*)&vexRegs[preg],=20
+ &x87->reg[10*stno] );
+ } else {
+ /* register is full. */
+ tagw |=3D (0 << (2*preg));
+ convert_f64le_to_f80le( (UChar*)&vexRegs[preg],=20
+ &x87->reg[10*stno] );
+ }
+ }
+ x87->env[FP_ENV_TAG] =3D toUShort(tagw);
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+/* NOTE: only handles 32-bit format (no REX.W on the insn) */
+void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
+{
+ /* Derived from values obtained from
+ vendor_id : AuthenticAMD
+ cpu family : 15
+ model : 12
+ model name : AMD Athlon(tm) 64 Processor 3200+
+ stepping : 0
+ cpu MHz : 2200.000
+ cache size : 512 KB
+ */
+ /* Somewhat roundabout, but at least it's simple. */
+ Fpu_State tmp;
+ UShort* addrS =3D (UShort*)addr;
+ UChar* addrC =3D (UChar*)addr;
+ U128* xmm =3D (U128*)(addr + 160);
+ UInt mxcsr;
+ UShort fp_tags;
+ UInt summary_tags;
+ Int r, stno;
+ UShort *srcS, *dstS;
+
+ do_get_x87( gst, (UChar*)&tmp );
+ mxcsr =3D amd64g_create_mxcsr( gst->guest_SSEROUND );
+
+ /* Now build the proper fxsave image from the x87 image we just
+ made. */
+
+ addrS[0] =3D tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
+ addrS[1] =3D tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
+
+ /* set addrS[2] in an endian-independent way */
+ summary_tags =3D 0;
+ fp_tags =3D tmp.env[FP_ENV_TAG];
+ for (r =3D 0; r < 8; r++) {
+ if ( ((fp_tags >> (2*r)) & 3) !=3D 3 )
+ summary_tags |=3D (1 << r);
+ }
+ addrC[4] =3D toUChar(summary_tags); /* FTW: tag summary byte */
+ addrC[5] =3D 0; /* pad */
+
+ /* FOP: faulting fpu opcode. From experimentation, the real CPU
+ does not write this field. (?!) */
+ addrS[3] =3D 0; /* BOGUS */
+
+ /* RIP (Last x87 instruction pointer). From experimentation, the
+ real CPU does not write this field. (?!) */
+ addrS[4] =3D 0; /* BOGUS */
+ addrS[5] =3D 0; /* BOGUS */
+ addrS[6] =3D 0; /* BOGUS */
+ addrS[7] =3D 0; /* BOGUS */
+
+ /* RDP (Last x87 data pointer). From experimentation, the real CPU
+ does not write this field. (?!) */
+ addrS[8] =3D 0; /* BOGUS */
+ addrS[9] =3D 0; /* BOGUS */
+ addrS[10] =3D 0; /* BOGUS */
+ addrS[11] =3D 0; /* BOGUS */
+
+ addrS[12] =3D toUShort(mxcsr); /* MXCSR */
+ addrS[13] =3D toUShort(mxcsr >> 16);
+
+ addrS[14] =3D 0xFFFF; /* MXCSR mask (lo16) */
+ addrS[15] =3D 0x0000; /* MXCSR mask (hi16) */
+
+ /* Copy in the FP registers, in ST order. */
+ for (stno =3D 0; stno < 8; stno++) {
+ srcS =3D (UShort*)(&tmp.reg[10*stno]);
+ dstS =3D (UShort*)(&addrS[16 + 8*stno]);
+ dstS[0] =3D srcS[0];
+ dstS[1] =3D srcS[1];
+ dstS[2] =3D srcS[2];
+ dstS[3] =3D srcS[3];
+ dstS[4] =3D srcS[4];
+ dstS[5] =3D 0;
+ dstS[6] =3D 0;
+ dstS[7] =3D 0;
+ }
+
+ /* That's the first 160 bytes of the image done. Now only %xmm0
+ .. %xmm15 remain to be copied. If the host is big-endian, these
+ need to be byte-swapped. */
+ vassert(host_is_little_endian());
+
+# define COPY_U128(_dst,_src) \
+ do { _dst[0] =3D _src[0]; _dst[1] =3D _src[1]; \
+ _dst[2] =3D _src[2]; _dst[3] =3D _src[3]; } \
+ while (0)
+
+ COPY_U128( xmm[0], gst->guest_XMM0 );
+ COPY_U128( xmm[1], gst->guest_XMM1 );
+ COPY_U128( xmm[2], gst->guest_XMM2 );
+ COPY_U128( xmm[3], gst->guest_XMM3 );
+ COPY_U128( xmm[4], gst->guest_XMM4 );
+ COPY_U128( xmm[5], gst->guest_XMM5 );
+ COPY_U128( xmm[6], gst->guest_XMM6 );
+ COPY_U128( xmm[7], gst->guest_XMM7 );
+ COPY_U128( xmm[8], gst->guest_XMM8 );
+ COPY_U128( xmm[9], gst->guest_XMM9 );
+ COPY_U128( xmm[10], gst->guest_XMM10 );
+ COPY_U128( xmm[11], gst->guest_XMM11 );
+ COPY_U128( xmm[12], gst->guest_XMM12 );
+ COPY_U128( xmm[13], gst->guest_XMM13 );
+ COPY_U128( xmm[14], gst->guest_XMM14 );
+ COPY_U128( xmm[15], gst->guest_XMM15 );
+
+# undef COPY_U128
+}
+
+
/* DIRTY HELPER (writes guest state) */
/* Initialise the x87 FPU state as per 'finit'. */
void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
Modified: trunk/priv/guest-amd64/toIR.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/priv/guest-amd64/toIR.c 2007-01-09 15:20:07 UTC (rev 1721)
+++ trunk/priv/guest-amd64/toIR.c 2007-01-10 04:59:33 UTC (rev 1722)
@@ -8427,90 +8427,87 @@
=20
insn =3D (UChar*)&guest_code[delta];
=20
-//.. /* Treat fxsave specially. It should be doable even on an SSE0
-//.. (Pentium-II class) CPU. Hence be prepared to handle it on
-//.. any subarchitecture variant.
-//.. */
-//..=20
-//.. /* 0F AE /0 =3D FXSAVE m512 -- write x87 and SSE state to memory=
*/
-//.. if (sz =3D=3D 4 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xAE
-//.. && !epartIsReg(insn[2]) && gregOfRM(insn[2]) =3D=3D 0) {
-//.. modrm =3D getUChar(delta+2);
-//.. vassert(sz =3D=3D 4);
-//.. vassert(!epartIsReg(modrm));
-//..=20
-//.. addr =3D disAMode ( &alen, sorb, delta+2, dis_buf );
-//.. delta +=3D 2+alen;
-//..=20
-//.. DIP("fxsave %s\n", dis_buf);
-//..=20
-//.. /* Uses dirty helper:=20
-//.. void x86g_do_FXSAVE ( VexGuestX86State*, UInt ) */
-//.. IRDirty* d =3D unsafeIRDirty_0_N (=20
-//.. 0/*regparms*/,=20
-//.. "x86g_dirtyhelper_FXSAVE",=20
-//.. &x86g_dirtyhelper_FXSAVE,
-//.. mkIRExprVec_1( mkexpr(addr) )
-//.. );
-//.. d->needsBBP =3D True;
-//..=20
-//.. /* declare we're writing memory */
-//.. d->mFx =3D Ifx_Write;
-//.. d->mAddr =3D mkexpr(addr);
-//.. d->mSize =3D 512;
-//..=20
-//.. /* declare we're reading guest state */
-//.. d->nFxState =3D 7;
-//..=20
-//.. d->fxState[0].fx =3D Ifx_Read;
-//.. d->fxState[0].offset =3D OFFB_FTOP;
-//.. d->fxState[0].size =3D sizeof(UInt);
-//..=20
-//.. d->fxState[1].fx =3D Ifx_Read;
-//.. d->fxState[1].offset =3D OFFB_FPREGS;
-//.. d->fxState[1].size =3D 8 * sizeof(ULong);
-//..=20
-//.. d->fxState[2].fx =3D Ifx_Read;
-//.. d->fxState[2].offset =3D OFFB_FPTAGS;
-//.. d->fxState[2].size =3D 8 * sizeof(UChar);
-//..=20
-//.. d->fxState[3].fx =3D Ifx_Read;
-//.. d->fxState[3].offset =3D OFFB_FPROUND;
-//.. d->fxState[3].size =3D sizeof(UInt);
-//..=20
-//.. d->fxState[4].fx =3D Ifx_Read;
-//.. d->fxState[4].offset =3D OFFB_FC3210;
-//.. d->fxState[4].size =3D sizeof(UInt);
-//..=20
-//.. d->fxState[5].fx =3D Ifx_Read;
-//.. d->fxState[5].offset =3D OFFB_XMM0;
-//.. d->fxState[5].size =3D 8 * sizeof(U128);
-//..=20
-//.. d->fxState[6].fx =3D Ifx_Read;
-//.. d->fxState[6].offset =3D OFFB_SSEROUND;
-//.. d->fxState[6].size =3D sizeof(UInt);
-//..=20
-//.. /* Be paranoid ... this assertion tries to ensure the 8 %xmm
-//.. images are packed back-to-back. If not, the value of
-//.. d->fxState[5].size is wrong. */
-//.. vassert(16 =3D=3D sizeof(U128));
-//.. vassert(OFFB_XMM7 =3D=3D (OFFB_XMM0 + 7 * 16));
-//..=20
-//.. stmt( IRStmt_Dirty(d) );
-//..=20
-//.. goto decode_success;
-//.. }
-//..=20
-//.. /* ------ SSE decoder main ------ */
-//..=20
-//.. /* Skip parts of the decoder which don't apply given the stated
-//.. guest subarchitecture. */
-//.. if (subarch =3D=3D VexSubArchX86_sse0)
-//.. goto after_sse_decoders;
-//.. =20
-//.. /* Otherwise we must be doing sse1 or sse2, so we can at least t=
ry
-//.. for SSE1 here. */
+ /* FXSAVE is spuriously at the start here only because it is
+ thusly placed in guest-x86/toIR.c. */
=20
+ /* 0F AE /0 =3D FXSAVE m512 -- write x87 and SSE state to memory.
+ Note that REX.W 0F AE /0 writes a slightly different format and
+ we don't handle that here. */
+ if (haveNo66noF2noF3(pfx) && sz =3D=3D 4=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xAE
+ && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) =3D=3D 0) {
+ IRDirty* d;
+ modrm =3D getUChar(delta+2);
+ vassert(sz =3D=3D 4);
+ vassert(!epartIsReg(modrm));
+ /* REX.W must not be set. That should be assured us by sz =3D=3D =
4
+ above. */
+ vassert(!(pfx & PFX_REXW));
+
+ addr =3D disAMode ( &alen, pfx, delta+2, dis_buf, 0 );
+ delta +=3D 2+alen;
+
+ DIP("fxsave %s\n", dis_buf);
+
+ /* Uses dirty helper:=20
+ void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State*, HWord ) */
+ d =3D unsafeIRDirty_0_N (=20
+ 0/*regparms*/,=20
+ "amd64g_dirtyhelper_FXSAVE",=20
+ &amd64g_dirtyhelper_FXSAVE,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP =3D True;
+
+ /* declare we're writing memory */
+ d->mFx =3D Ifx_Write;
+ d->mAddr =3D mkexpr(addr);
+ d->mSize =3D 512;
+
+ /* declare we're reading guest state */
+ d->nFxState =3D 7;
+
+ d->fxState[0].fx =3D Ifx_Read;
+ d->fxState[0].offset =3D OFFB_FTOP;
+ d->fxState[0].size =3D sizeof(UInt);
+
+ d->fxState[1].fx =3D Ifx_Read;
+ d->fxState[1].offset =3D OFFB_FPREGS;
+ d->fxState[1].size =3D 8 * sizeof(ULong);
+
+ d->fxState[2].fx =3D Ifx_Read;
+ d->fxState[2].offset =3D OFFB_FPTAGS;
+ d->fxState[2].size =3D 8 * sizeof(UChar);
+
+ d->fxState[3].fx =3D Ifx_Read;
+ d->fxState[3].offset =3D OFFB_FPROUND;
+ d->fxState[3].size =3D sizeof(ULong);
+
+ d->fxState[4].fx =3D Ifx_Read;
+ d->fxState[4].offset =3D OFFB_FC3210;
+ d->fxState[4].size =3D sizeof(ULong);
+
+ d->fxState[5].fx =3D Ifx_Read;
+ d->fxState[5].offset =3D OFFB_XMM0;
+ d->fxState[5].size =3D 16 * sizeof(U128);
+
+ d->fxState[6].fx =3D Ifx_Read;
+ d->fxState[6].offset =3D OFFB_SSEROUND;
+ d->fxState[6].size =3D sizeof(ULong);
+
+ /* Be paranoid ... this assertion tries to ensure the 16 %xmm
+ images are packed back-to-back. If not, the value of
+ d->fxState[5].size is wrong. */
+ vassert(16 =3D=3D sizeof(U128));
+ vassert(OFFB_XMM15 =3D=3D (OFFB_XMM0 + 15 * 16));
+
+ stmt( IRStmt_Dirty(d) );
+
+ goto decode_success;
+ }
+
+ /* ------ SSE decoder main ------ */
+
/* 0F 58 =3D ADDPS -- add 32Fx4 from R/M to R */
if (haveNo66noF2noF3(pfx) && sz =3D=3D 4=20
&& insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x58) {
|