From 8584e9930268c920db51b0123de902c468d10d88 Mon Sep 17 00:00:00 2001
Message-Id: <8584e9930268c920db51b0123de902c468d10d88.1320837391.git.trast@student.ethz.ch>
From: Thomas Rast <trast@student.ethz.ch>
Date: Tue, 8 Nov 2011 21:50:11 +0100
Subject: [PATCH] Proper rounding on amd64

Changes the affected IROps (x87 and SSE floating-point arithmetic
operations) to take a rounding-mode argument, and updates the amd64
frontend (guest_amd64_toIR.c) and backend (host_amd64_isel.c) to fill
it in from the guest's actual FPU / MXCSR rounding mode, instead of
the previous get_FAKE_roundingmode() which hard-coded Irrm_NEAREST.
Bitwise and min/max style operations, which do not round, pass NULL.
---
 priv/guest_amd64_toIR.c |  286 +++++++++++++++++++++++++++--------------------
 priv/host_amd64_isel.c  |  104 +++++++++++++----
 priv/ir_defs.c          |   38 ++++---
 pub/libvex_ir.h         |   27 +++--
 4 files changed, 285 insertions(+), 170 deletions(-)

diff --git a/priv/guest_amd64_toIR.c b/priv/guest_amd64_toIR.c
index 40c46f5..8eb6cc3 100644
--- a/priv/guest_amd64_toIR.c
+++ b/priv/guest_amd64_toIR.c
@@ -46,11 +46,6 @@
  
    * no FP exceptions, except for handling stack over/underflow
  
-   * FP rounding mode observed only for float->int conversions and
-     int->float conversions which could lose accuracy, and for
-     float-to-float rounding.  For all other operations,
-     round-to-nearest is used, regardless.
- 
    * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
      simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
      even when it isn't.
@@ -250,11 +245,27 @@ static void assign ( IRTemp dst, IRExpr* e )
    return IRExpr_Unop(op, a);
 }
 
+static IRExpr* unop_maybe_round ( IROp op, IRExpr* round, IRExpr* a )
+{
+   if (round)
+      return IRExpr_Binop(op, round, a);
+   else
+      return IRExpr_Unop(op, a);
+}
+
 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
 {
    return IRExpr_Binop(op, a1, a2);
 }
 
+static IRExpr* binop_maybe_round ( IROp op, IRExpr* round, IRExpr* a1, IRExpr* a2 )
+{
+   if (round)
+      return IRExpr_Triop(op, round, a1, a2);
+   else
+      return IRExpr_Binop(op, a1, a2);
+}
+
 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
 {
    return IRExpr_Triop(op, a1, a2, a3);
@@ -4612,12 +4623,6 @@ static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
    return binop( Iop_And32, get_fpround(), mkU32(3) );
 }
 
-static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
-{
-   return mkU32(Irrm_NEAREST);
-}
-
-
 /* --------- Get/set FP register tag bytes. --------- */
 
 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
@@ -4755,21 +4760,20 @@ void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
    if (dbl) {
       put_ST_UNCHECKED(0, 
          triop( op, 
-                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                get_roundingmode(),
                 get_ST(0), 
                 loadLE(Ity_F64,mkexpr(addr))
          ));
    } else {
       put_ST_UNCHECKED(0, 
          triop( op, 
-                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                get_roundingmode(),
                 get_ST(0), 
                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
          ));
    }
 }
 
-
 /* ST(0) = mem64/32(addr) `op` ST(0)
    Need to check ST(0)'s tag on read, but not on write.
 */
@@ -4781,14 +4785,14 @@ void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
    if (dbl) {
       put_ST_UNCHECKED(0, 
          triop( op, 
-                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                get_roundingmode(),
                 loadLE(Ity_F64,mkexpr(addr)),
                 get_ST(0)
          ));
    } else {
       put_ST_UNCHECKED(0, 
          triop( op, 
-                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                get_roundingmode(),
                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
                 get_ST(0)
          ));
@@ -4807,7 +4811,7 @@ void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
    put_ST_UNCHECKED( 
       st_dst, 
       triop( op, 
-             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+             get_roundingmode(),
              get_ST(st_dst), 
              get_ST(st_src) ) 
    );
@@ -4826,7 +4830,7 @@ void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
    put_ST_UNCHECKED( 
       st_dst, 
       triop( op, 
-             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+             get_roundingmode(),
              get_ST(st_src), 
              get_ST(st_dst) ) 
    );
@@ -5328,7 +5332,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("f2xm1\n");
                put_ST_UNCHECKED(0, 
                   binop(Iop_2xm1F64, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(0)));
                break;
 
@@ -5336,7 +5340,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("fyl2x\n");
                put_ST_UNCHECKED(1, 
                   triop(Iop_Yl2xF64,
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(1), 
                         get_ST(0)));
                fp_pop();
@@ -5346,7 +5350,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("ftan\n");
                put_ST_UNCHECKED(0, 
                   binop(Iop_TanF64, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(0)));
                fp_push();
                put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
@@ -5357,7 +5361,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("fpatan\n");
                put_ST_UNCHECKED(1, 
                   triop(Iop_AtanF64,
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(1), 
                         get_ST(0)));
                fp_pop();
@@ -5409,13 +5413,13 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                assign( a2, get_ST(1) );
                put_ST_UNCHECKED(0,
                   triop(Iop_PRem1F64,
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         mkexpr(a1),
                         mkexpr(a2)));
                put_C3210(
                   unop(Iop_32Uto64,
                   triop(Iop_PRem1C3210F64,
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         mkexpr(a1),
                         mkexpr(a2)) ));
                break;
@@ -5436,13 +5440,13 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                assign( a2, get_ST(1) );
                put_ST_UNCHECKED(0,
                   triop(Iop_PRemF64,
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         mkexpr(a1),
                         mkexpr(a2)));
                put_C3210(
                   unop(Iop_32Uto64,
                   triop(Iop_PRemC3210F64,
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         mkexpr(a1),
                         mkexpr(a2)) ));
                break;
@@ -5452,7 +5456,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("fyl2xp1\n");
                put_ST_UNCHECKED(1, 
                   triop(Iop_Yl2xp1F64,
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(1), 
                         get_ST(0)));
                fp_pop();
@@ -5462,7 +5466,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("fsqrt\n");
                put_ST_UNCHECKED(0, 
                   binop(Iop_SqrtF64, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(0)));
                break;
 
@@ -5472,12 +5476,12 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("fsincos\n");
                put_ST_UNCHECKED(0, 
                   binop(Iop_SinF64, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         mkexpr(a1)));
                fp_push();
                put_ST(0, 
                   binop(Iop_CosF64, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         mkexpr(a1)));
                clear_C2(); /* HACK */
                break;
@@ -5493,7 +5497,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("fscale\n");
                put_ST_UNCHECKED(0, 
                   triop(Iop_ScaleF64,
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(0), 
                         get_ST(1)));
                break;
@@ -5502,7 +5506,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("fsin\n");
                put_ST_UNCHECKED(0, 
                   binop(Iop_SinF64, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(0)));
                clear_C2(); /* HACK */
                break;
@@ -5511,7 +5515,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                DIP("fcos\n");
                put_ST_UNCHECKED(0, 
                   binop(Iop_CosF64, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(0)));
                clear_C2(); /* HACK */
                break;
@@ -5568,7 +5572,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
             do_fop_m32:
                put_ST_UNCHECKED(0, 
                   triop(fop, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(0),
                         unop(Iop_I32StoF64,
                              loadLE(Ity_I32, mkexpr(addr)))));
@@ -5577,7 +5581,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
             do_foprev_m32:
                put_ST_UNCHECKED(0, 
                   triop(fop, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         unop(Iop_I32StoF64,
                              loadLE(Ity_I32, mkexpr(addr))),
                         get_ST(0)));
@@ -6244,7 +6248,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
             do_fop_m16:
                put_ST_UNCHECKED(0, 
                   triop(fop, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         get_ST(0),
                         unop(Iop_I32StoF64,
                              unop(Iop_16Sto32, 
@@ -6254,7 +6258,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
             do_foprev_m16:
                put_ST_UNCHECKED(0, 
                   triop(fop, 
-                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+                        get_roundingmode(),
                         unop(Iop_I32StoF64,
                              unop(Iop_16Sto32, 
                                   loadLE(Ity_I16, mkexpr(addr)))),
@@ -8107,7 +8111,8 @@ static ULong dis_SSE_E_to_G_all_wrk (
                 VexAbiInfo* vbi,
                 Prefix pfx, Long delta, 
                 HChar* opname, IROp op,
-                Bool   invertG
+                Bool   invertG,
+		IRExpr* round
              )
 {
    HChar   dis_buf[50];
@@ -8119,8 +8124,8 @@ static ULong dis_SSE_E_to_G_all_wrk (
                 : getXMMReg(gregOfRexRM(pfx,rm));
    if (epartIsReg(rm)) {
       putXMMReg( gregOfRexRM(pfx,rm), 
-                 binop(op, gpart,
-                           getXMMReg(eregOfRexRM(pfx,rm))) );
+		 binop_maybe_round(op, round, gpart,
+				   getXMMReg(eregOfRexRM(pfx,rm))) );
       DIP("%s %s,%s\n", opname,
                         nameXMMReg(eregOfRexRM(pfx,rm)),
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8128,8 +8133,8 @@ static ULong dis_SSE_E_to_G_all_wrk (
    } else {
       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
       putXMMReg( gregOfRexRM(pfx,rm), 
-                 binop(op, gpart,
-                           loadLE(Ity_V128, mkexpr(addr))) );
+                 binop_maybe_round(op, round, gpart,
+				   loadLE(Ity_V128, mkexpr(addr))) );
       DIP("%s %s,%s\n", opname,
                         dis_buf,
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8143,9 +8148,10 @@ static ULong dis_SSE_E_to_G_all_wrk (
 static
 ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
                            Prefix pfx, Long delta, 
-                           HChar* opname, IROp op )
+                           HChar* opname, IROp op,
+			   IRExpr* round )
 {
-   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
+   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False, round );
 }
 
 /* All lanes SSE binary operation, G = (not G) `op` E. */
@@ -8153,9 +8159,10 @@ ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
 static
 ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
                                 Prefix pfx, Long delta, 
-                                HChar* opname, IROp op )
+                                HChar* opname, IROp op,
+				IRExpr* round )
 {
-   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
+   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True, round );
 }
 
 
@@ -8163,7 +8170,8 @@ ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
 
 static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
                                    Prefix pfx, Long delta, 
-                                   HChar* opname, IROp op )
+                                   HChar* opname, IROp op,
+				   IRExpr* round )
 {
    HChar   dis_buf[50];
    Int     alen;
@@ -8172,8 +8180,8 @@ static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
    if (epartIsReg(rm)) {
       putXMMReg( gregOfRexRM(pfx,rm), 
-                 binop(op, gpart,
-                           getXMMReg(eregOfRexRM(pfx,rm))) );
+                 binop_maybe_round(op, round, gpart,
+				   getXMMReg(eregOfRexRM(pfx,rm))) );
       DIP("%s %s,%s\n", opname,
                         nameXMMReg(eregOfRexRM(pfx,rm)),
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8186,7 +8194,7 @@ static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
       assign( epart, unop( Iop_32UtoV128,
                            loadLE(Ity_I32, mkexpr(addr))) );
       putXMMReg( gregOfRexRM(pfx,rm), 
-                 binop(op, gpart, mkexpr(epart)) );
+                 binop_maybe_round(op, round, gpart, mkexpr(epart)) );
       DIP("%s %s,%s\n", opname,
                         dis_buf,
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8199,7 +8207,8 @@ static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
 
 static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
                                    Prefix pfx, Long delta, 
-                                   HChar* opname, IROp op )
+                                   HChar* opname, IROp op,
+				   IRExpr* round )
 {
    HChar   dis_buf[50];
    Int     alen;
@@ -8207,9 +8216,9 @@ static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
    UChar   rm = getUChar(delta);
    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
    if (epartIsReg(rm)) {
-      putXMMReg( gregOfRexRM(pfx,rm), 
-                 binop(op, gpart,
-                           getXMMReg(eregOfRexRM(pfx,rm))) );
+      putXMMReg( gregOfRexRM(pfx,rm),
+                 binop_maybe_round(op, round, gpart,
+				   getXMMReg(eregOfRexRM(pfx,rm))) );
       DIP("%s %s,%s\n", opname,
                         nameXMMReg(eregOfRexRM(pfx,rm)),
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8222,7 +8231,7 @@ static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
       assign( epart, unop( Iop_64UtoV128,
                            loadLE(Ity_I64, mkexpr(addr))) );
       putXMMReg( gregOfRexRM(pfx,rm), 
-                 binop(op, gpart, mkexpr(epart)) );
+                 binop_maybe_round(op, round, gpart, mkexpr(epart)) );
       DIP("%s %s,%s\n", opname,
                         dis_buf,
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8236,7 +8245,8 @@ static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
 static ULong dis_SSE_E_to_G_unary_all ( 
                 VexAbiInfo* vbi,
                 Prefix pfx, Long delta, 
-                HChar* opname, IROp op
+                HChar* opname, IROp op,
+		IRExpr* round
              )
 {
    HChar   dis_buf[50];
@@ -8245,7 +8255,7 @@ static ULong dis_SSE_E_to_G_unary_all (
    UChar   rm = getUChar(delta);
    if (epartIsReg(rm)) {
       putXMMReg( gregOfRexRM(pfx,rm), 
-                 unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
+                 unop_maybe_round(op, round, getXMMReg(eregOfRexRM(pfx,rm))) );
       DIP("%s %s,%s\n", opname,
                         nameXMMReg(eregOfRexRM(pfx,rm)),
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8253,7 +8263,7 @@ static ULong dis_SSE_E_to_G_unary_all (
    } else {
       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
       putXMMReg( gregOfRexRM(pfx,rm), 
-                 unop(op, loadLE(Ity_V128, mkexpr(addr))) );
+                 unop_maybe_round(op, round, loadLE(Ity_V128, mkexpr(addr))) );
       DIP("%s %s,%s\n", opname,
                         dis_buf,
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8267,7 +8277,8 @@ static ULong dis_SSE_E_to_G_unary_all (
 static ULong dis_SSE_E_to_G_unary_lo32 ( 
                 VexAbiInfo* vbi,
                 Prefix pfx, Long delta, 
-                HChar* opname, IROp op
+                HChar* opname, IROp op,
+		IRExpr* round
              )
 {
    /* First we need to get the old G value and patch the low 32 bits
@@ -8284,9 +8295,9 @@ static ULong dis_SSE_E_to_G_unary_lo32 (
    if (epartIsReg(rm)) {
       assign( oldG1, 
               binop( Iop_SetV128lo32,
-                     mkexpr(oldG0),
-                     getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
-      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
+		     mkexpr(oldG0),
+		     getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
+      putXMMReg( gregOfRexRM(pfx,rm), unop_maybe_round(op, round, mkexpr(oldG1)) );
       DIP("%s %s,%s\n", opname,
                         nameXMMReg(eregOfRexRM(pfx,rm)),
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8295,9 +8306,9 @@ static ULong dis_SSE_E_to_G_unary_lo32 (
       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
       assign( oldG1, 
               binop( Iop_SetV128lo32,
-                     mkexpr(oldG0),
-                     loadLE(Ity_I32, mkexpr(addr)) ));
-      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
+		     mkexpr(oldG0),
+		     loadLE(Ity_I32, mkexpr(addr)) ));
+      putXMMReg( gregOfRexRM(pfx,rm), unop_maybe_round(op, round, mkexpr(oldG1)) );
       DIP("%s %s,%s\n", opname,
                         dis_buf,
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8311,7 +8322,8 @@ static ULong dis_SSE_E_to_G_unary_lo32 (
 static ULong dis_SSE_E_to_G_unary_lo64 ( 
                 VexAbiInfo* vbi,
                 Prefix pfx, Long delta, 
-                HChar* opname, IROp op
+                HChar* opname, IROp op,
+		IRExpr* round
              )
 {
    /* First we need to get the old G value and patch the low 64 bits
@@ -8330,7 +8342,7 @@ static ULong dis_SSE_E_to_G_unary_lo64 (
               binop( Iop_SetV128lo64,
                      mkexpr(oldG0),
                      getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
-      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
+      putXMMReg( gregOfRexRM(pfx,rm), unop_maybe_round(op, round, mkexpr(oldG1)) );
       DIP("%s %s,%s\n", opname,
                         nameXMMReg(eregOfRexRM(pfx,rm)),
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -8341,7 +8353,7 @@ static ULong dis_SSE_E_to_G_unary_lo64 (
               binop( Iop_SetV128lo64,
                      mkexpr(oldG0),
                      loadLE(Ity_I64, mkexpr(addr)) ));
-      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
+      putXMMReg( gregOfRexRM(pfx,rm), unop_maybe_round(op, round, mkexpr(oldG1)) );
       DIP("%s %s,%s\n", opname,
                         dis_buf,
                         nameXMMReg(gregOfRexRM(pfx,rm)) );
@@ -9411,28 +9423,30 @@ DisResult disInstr_AMD64_WRK (
    /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x58) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addps", Iop_Add32Fx4 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addps", Iop_Add32Fx4,
+				  get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
    if (haveF3no66noF2(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x58) {
-      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "addss", Iop_Add32F0x4 );
+      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "addss", Iop_Add32F0x4,
+				   get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* 0F 55 = ANDNPS -- G = (not G) and E */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x55) {
-      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnps", Iop_AndV128 );
+      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnps", Iop_AndV128, NULL );
       goto decode_success;
    }
 
    /* 0F 54 = ANDPS -- G = G and E */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x54) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andps", Iop_AndV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andps", Iop_AndV128, NULL );
       goto decode_success;
    }
 
@@ -9701,14 +9715,16 @@ DisResult disInstr_AMD64_WRK (
    /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x5E) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divps", Iop_Div32Fx4 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divps", Iop_Div32Fx4,
+				  get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
    if (haveF3no66noF2(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x5E) {
-      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "divss", Iop_Div32F0x4 );
+      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "divss", Iop_Div32F0x4,
+				   get_sse_roundingmode()  );
       goto decode_success;
    }
 
@@ -9775,28 +9791,28 @@ DisResult disInstr_AMD64_WRK (
    /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x5F) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxps", Iop_Max32Fx4 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxps", Iop_Max32Fx4, NULL );
       goto decode_success;
    }
 
    /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
    if (haveF3no66noF2(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x5F) {
-      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "maxss", Iop_Max32F0x4 );
+      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "maxss", Iop_Max32F0x4, NULL );
       goto decode_success;
    }
 
    /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x5D) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minps", Iop_Min32Fx4 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minps", Iop_Min32Fx4, NULL );
       goto decode_success;
    }
 
    /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
    if (haveF3no66noF2(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x5D) {
-      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "minss", Iop_Min32F0x4 );
+      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "minss", Iop_Min32F0x4, NULL );
       goto decode_success;
    }
 
@@ -10070,21 +10086,23 @@ DisResult disInstr_AMD64_WRK (
    /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x59) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulps", Iop_Mul32Fx4 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulps", Iop_Mul32Fx4,
+				  get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
    if (haveF3no66noF2(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x59) {
-      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "mulss", Iop_Mul32F0x4 );
+      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "mulss", Iop_Mul32F0x4,
+				   get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* 0F 56 = ORPS -- G = G and E */
    if (haveNo66noF2noF3(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x56) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orps", Iop_OrV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orps", Iop_OrV128, NULL );
       goto decode_success;
    }
 
@@ -10350,7 +10368,8 @@ DisResult disInstr_AMD64_WRK (
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x53) {
       delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2, 
-                                        "rcpps", Iop_Recip32Fx4 );
+                                        "rcpps", Iop_Recip32Fx4,
+					get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -10358,7 +10377,8 @@ DisResult disInstr_AMD64_WRK (
    if (haveF3no66noF2(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x53) {
       delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2, 
-                                         "rcpss", Iop_Recip32F0x4 );
+                                         "rcpss", Iop_Recip32F0x4,
+					 get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -10366,7 +10386,8 @@ DisResult disInstr_AMD64_WRK (
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x52) {
       delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2, 
-                                        "rsqrtps", Iop_RSqrt32Fx4 );
+                                        "rsqrtps", Iop_RSqrt32Fx4,
+					get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -10374,7 +10395,8 @@ DisResult disInstr_AMD64_WRK (
    if (haveF3no66noF2(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x52) {
       delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2, 
-                                         "rsqrtss", Iop_RSqrt32F0x4 );
+                                         "rsqrtss", Iop_RSqrt32F0x4,
+					 get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -10443,7 +10465,8 @@ DisResult disInstr_AMD64_WRK (
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x51) {
       delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2, 
-                                        "sqrtps", Iop_Sqrt32Fx4 );
+                                        "sqrtps", Iop_Sqrt32Fx4,
+					get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -10451,7 +10474,8 @@ DisResult disInstr_AMD64_WRK (
    if (haveF3no66noF2(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x51) {
       delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2, 
-                                         "sqrtss", Iop_Sqrt32F0x4 );
+                                         "sqrtss", Iop_Sqrt32F0x4,
+					 get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -10485,14 +10509,16 @@ DisResult disInstr_AMD64_WRK (
    /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x5C) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subps", Iop_Sub32Fx4 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subps", Iop_Sub32Fx4,
+				  get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
    if (haveF3no66noF2(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x5C) {
-      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "subss", Iop_Sub32F0x4 );
+      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "subss", Iop_Sub32F0x4,
+				   get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -10540,7 +10566,7 @@ DisResult disInstr_AMD64_WRK (
    /* 0F 57 = XORPS -- G = G and E */
    if (haveNo66noF2noF3(pfx) && sz == 4 
        && insn[0] == 0x0F && insn[1] == 0x57) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorps", Iop_XorV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorps", Iop_XorV128, NULL );
       goto decode_success;
    }
 
@@ -10556,7 +10582,8 @@ DisResult disInstr_AMD64_WRK (
    if (have66noF2noF3(pfx) 
        && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
        && insn[0] == 0x0F && insn[1] == 0x58) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addpd", Iop_Add64Fx2 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addpd", Iop_Add64Fx2,
+				  get_sse_roundingmode() );
       goto decode_success;
    }
  
@@ -10564,21 +10591,22 @@ DisResult disInstr_AMD64_WRK (
    if (haveF2no66noF3(pfx) 
        && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
        && insn[0] == 0x0F && insn[1] == 0x58) {
-      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "addsd", Iop_Add64F0x2 );
+      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "addsd", Iop_Add64F0x2,
+					get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* 66 0F 55 = ANDNPD -- G = (not G) and E */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x55) {
-      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnpd", Iop_AndV128 );
+      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnpd", Iop_AndV128, NULL );
       goto decode_success;
    }
 
    /* 66 0F 54 = ANDPD -- G = G and E */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x54) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andpd", Iop_AndV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andpd", Iop_AndV128, NULL );
       goto decode_success;
    }
 
@@ -11151,14 +11179,16 @@ DisResult disInstr_AMD64_WRK (
    /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x5E) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divpd", Iop_Div64Fx2 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divpd", Iop_Div64Fx2,
+				  get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
    if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x5E) {
       vassert(sz == 4);
-      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "divsd", Iop_Div64F0x2 );
+      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "divsd", Iop_Div64F0x2,
+					get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -11179,28 +11209,28 @@ DisResult disInstr_AMD64_WRK (
    /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x5F) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxpd", Iop_Max64Fx2 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxpd", Iop_Max64Fx2, NULL );
       goto decode_success;
    }
 
    /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
    if (haveF2no66noF3(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x5F) {
-      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "maxsd", Iop_Max64F0x2 );
+      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "maxsd", Iop_Max64F0x2, NULL );
       goto decode_success;
    }
 
    /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x5D) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minpd", Iop_Min64Fx2 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minpd", Iop_Min64Fx2, NULL );
       goto decode_success;
    }
 
    /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
    if (haveF2no66noF3(pfx) && sz == 4
        && insn[0] == 0x0F && insn[1] == 0x5D) {
-      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "minsd", Iop_Min64F0x2 );
+      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "minsd", Iop_Min64F0x2, NULL );
       goto decode_success;
    }
 
@@ -11688,7 +11718,7 @@ DisResult disInstr_AMD64_WRK (
    if (have66noF2noF3(pfx) 
        && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
        && insn[0] == 0x0F && insn[1] == 0x59) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulpd", Iop_Mul64Fx2 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulpd", Iop_Mul64Fx2, get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -11696,14 +11726,14 @@ DisResult disInstr_AMD64_WRK (
    if (haveF2no66noF3(pfx) 
        && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
        && insn[0] == 0x0F && insn[1] == 0x59) {
-      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "mulsd", Iop_Mul64F0x2 );
+      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "mulsd", Iop_Mul64F0x2, get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* 66 0F 56 = ORPD -- G = G and E */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x56) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orpd", Iop_OrV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orpd", Iop_OrV128, NULL );
       goto decode_success;
    }
 
@@ -11761,7 +11791,8 @@ DisResult disInstr_AMD64_WRK (
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x51) {
       delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2, 
-                                        "sqrtpd", Iop_Sqrt64Fx2 );
+                                        "sqrtpd", Iop_Sqrt64Fx2,
+                                        get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -11769,14 +11800,16 @@ DisResult disInstr_AMD64_WRK (
    if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x51) {
       vassert(sz == 4);
       delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta+2, 
-                                         "sqrtsd", Iop_Sqrt64F0x2 );
+                                         "sqrtsd", Iop_Sqrt64F0x2,
+                                         get_sse_roundingmode() );
       goto decode_success;
    }
 
    /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x5C) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subpd", Iop_Sub64Fx2 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subpd", Iop_Sub64Fx2,
+                                  get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -11784,7 +11817,8 @@ DisResult disInstr_AMD64_WRK (
    if (haveF2no66noF3(pfx) 
        && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
        && insn[0] == 0x0F && insn[1] == 0x5C) {
-      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "subsd", Iop_Sub64F0x2 );
+      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "subsd", Iop_Sub64F0x2,
+                                   get_sse_roundingmode() );
       goto decode_success;
    }
 
@@ -11839,7 +11873,7 @@ DisResult disInstr_AMD64_WRK (
    /* 66 0F 57 = XORPD -- G = G xor E */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0x57) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorpd", Iop_XorV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorpd", Iop_XorV128, NULL );
       goto decode_success;
    }
 
@@ -11947,14 +11981,14 @@ DisResult disInstr_AMD64_WRK (
    /* 66 0F DB = PAND */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0xDB) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pand", Iop_AndV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pand", Iop_AndV128, NULL );
       goto decode_success;
    }
 
    /* 66 0F DF = PANDN */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0xDF) {
-      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "pandn", Iop_AndV128 );
+      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "pandn", Iop_AndV128, NULL );
       goto decode_success;
    }
 
@@ -12297,7 +12331,7 @@ DisResult disInstr_AMD64_WRK (
    /* 66 0F EB = POR */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0xEB) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "por", Iop_OrV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "por", Iop_OrV128, NULL );
       goto decode_success;
    }
 
@@ -12882,7 +12916,7 @@ DisResult disInstr_AMD64_WRK (
    /* 66 0F EF = PXOR */
    if (have66noF2noF3(pfx) && sz == 2 
        && insn[0] == 0x0F && insn[1] == 0xEF) {
-      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pxor", Iop_XorV128 );
+      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pxor", Iop_XorV128, NULL );
       goto decode_success;
    }
 
@@ -13032,8 +13066,8 @@ DisResult disInstr_AMD64_WRK (
 
       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
 
-      assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
-      assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
+      assign( addV, triop(Iop_Add32Fx4, get_sse_roundingmode(), mkexpr(gV), mkexpr(eV)) );
+      assign( subV, triop(Iop_Sub32Fx4, get_sse_roundingmode(), mkexpr(gV), mkexpr(eV)) );
 
       breakup128to32s( addV, &a3, &a2, &a1, &a0 );
       breakup128to32s( subV, &s3, &s2, &s1, &s0 );
@@ -13068,8 +13102,8 @@ DisResult disInstr_AMD64_WRK (
 
       assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
 
-      assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
-      assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
+      assign( addV, triop(Iop_Add64Fx2, get_sse_roundingmode(), mkexpr(gV), mkexpr(eV)) );
+      assign( subV, triop(Iop_Sub64Fx2, get_sse_roundingmode(), mkexpr(gV), mkexpr(eV)) );
 
       assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
       assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
@@ -13115,7 +13149,8 @@ DisResult disInstr_AMD64_WRK (
       assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
 
       putXMMReg( gregOfRexRM(pfx,modrm), 
-                 binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4, 
+                 triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
+                       get_sse_roundingmode(),
                        mkexpr(leftV), mkexpr(rightV) ) );
       goto decode_success;
    }
@@ -13160,7 +13195,8 @@ DisResult disInstr_AMD64_WRK (
       assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
 
       putXMMReg( gregOfRexRM(pfx,modrm), 
-                 binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2, 
+                 triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
+                       get_sse_roundingmode(),
                        mkexpr(leftV), mkexpr(rightV) ) );
       goto decode_success;
    }
@@ -14265,11 +14301,13 @@ DisResult disInstr_AMD64_WRK (
       UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
 
       assign( and_vec, binop( Iop_AndV128,
-                              binop( Iop_Mul64Fx2,
+                              triop( Iop_Mul64Fx2,
+                                     get_sse_roundingmode(),
                                      mkexpr(dst_vec), mkexpr(src_vec) ),
                               mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );
 
-      assign( sum_vec, binop( Iop_Add64F0x2,
+      assign( sum_vec, triop( Iop_Add64F0x2,
+                              get_sse_roundingmode(),
                               binop( Iop_InterleaveHI64x2,
                                      mkexpr(and_vec), mkexpr(and_vec) ),
                               binop( Iop_InterleaveLO64x2,
@@ -14330,12 +14368,13 @@ DisResult disInstr_AMD64_WRK (
 
       assign( tmp_prod_vec, 
               binop( Iop_AndV128, 
-                     binop( Iop_Mul32Fx4, mkexpr(xmm1_vec), mkexpr(xmm2_vec) ), 
+                     triop( Iop_Mul32Fx4, get_sse_roundingmode(), mkexpr(xmm1_vec), mkexpr(xmm2_vec) ), 
                      mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
       breakup128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
       assign( prod_vec, mk128from32s( v3, v1, v2, v0 ) );
 
-      assign( sum_vec, binop( Iop_Add32Fx4,
+      assign( sum_vec, triop( Iop_Add32Fx4,
+                              get_sse_roundingmode(),
                               binop( Iop_InterleaveHI32x4, 
                                      mkexpr(prod_vec), mkexpr(prod_vec) ), 
                               binop( Iop_InterleaveLO32x4, 
@@ -14343,7 +14382,8 @@ DisResult disInstr_AMD64_WRK (
 
       putXMMReg( gregOfRexRM(pfx, modrm), 
                  binop( Iop_AndV128, 
-                        binop( Iop_Add32Fx4,
+                        triop( Iop_Add32Fx4,
+                               get_sse_roundingmode(),
                                binop( Iop_InterleaveHI32x4,
                                       mkexpr(sum_vec), mkexpr(sum_vec) ), 
                                binop( Iop_InterleaveLO32x4,
diff --git a/priv/host_amd64_isel.c b/priv/host_amd64_isel.c
index bcd213f..dd48afc 100644
--- a/priv/host_amd64_isel.c
+++ b/priv/host_amd64_isel.c
@@ -3004,9 +3004,9 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
          HReg argL = iselDblExpr(env, e->Iex.Triop.arg2);
          HReg argR = iselDblExpr(env, e->Iex.Triop.arg3);
          addInstr(env, mk_vMOVsd_RR(argL, dst));
-         /* XXXROUNDINGFIXME */
-         /* set roundingmode here */
+         set_SSE_rounding_mode(env, e->Iex.Triop.arg1);
          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
+         set_SSE_rounding_default(env);
          return dst;
       }
    }
@@ -3062,9 +3062,8 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
                        False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
 
-      /* do it */
-      /* XXXROUNDINGFIXME */
-      /* set roundingmode here */
+      set_FPU_rounding_mode(env, e->Iex.Triop.arg1);
+
       switch (e->Iex.Triop.op) {
          case Iop_ScaleF64: 
             addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
@@ -3091,6 +3090,9 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
       /* save result */
       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
+
+      set_FPU_rounding_default(env);
+
       return dst;
    }
 
@@ -3155,8 +3157,7 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
          addInstr(env, AMD64Instr_A87Free(nNeeded));
          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
-         /* XXXROUNDINGFIXME */
-         /* set roundingmode here */
+         set_FPU_rounding_mode(env, e->Iex.Binop.arg1);
          addInstr(env, AMD64Instr_A87FpOp(fpop));
          if (e->Iex.Binop.op==Iop_TanF64) {
             /* get rid of the extra 1.0 that fptan pushes */
@@ -3164,6 +3165,7 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
          }
          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
+         set_FPU_rounding_default(env);
          return dst;
       }
    }
@@ -3474,12 +3476,8 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
       case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
       case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
       case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
-      case Iop_Add32Fx4:   op = Asse_ADDF;   goto do_32Fx4;
-      case Iop_Div32Fx4:   op = Asse_DIVF;   goto do_32Fx4;
       case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
       case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
-      case Iop_Mul32Fx4:   op = Asse_MULF;   goto do_32Fx4;
-      case Iop_Sub32Fx4:   op = Asse_SUBF;   goto do_32Fx4;
       do_32Fx4:
       {
          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
@@ -3494,12 +3492,8 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
       case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
       case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
       case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
-      case Iop_Add64Fx2:   op = Asse_ADDF;   goto do_64Fx2;
-      case Iop_Div64Fx2:   op = Asse_DIVF;   goto do_64Fx2;
       case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
       case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
-      case Iop_Mul64Fx2:   op = Asse_MULF;   goto do_64Fx2;
-      case Iop_Sub64Fx2:   op = Asse_SUBF;   goto do_64Fx2;
       do_64Fx2:
       {
          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
@@ -3514,12 +3508,8 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
       case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
       case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
       case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
-      case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
-      case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
       case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
       case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
-      case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
-      case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
       do_32F0x4: {
          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
@@ -3533,12 +3523,8 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
       case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
       case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
       case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
-      case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
-      case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
       case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
       case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
-      case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
-      case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
       do_64F0x2: {
          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
@@ -3775,6 +3761,78 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
    } /* switch (e->Iex.Binop.op) */
    } /* if (e->tag == Iex_Binop) */
 
+   if (e->tag == Iex_Triop) {
+   switch (e->Iex.Triop.op) {
+
+      case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2_rounded;
+      case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2_rounded;
+      case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2_rounded;
+      case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2_rounded;
+      do_64F0x2_rounded:
+      {
+         HReg argL = iselVecExpr(env, e->Iex.Triop.arg2);
+         HReg argR = iselVecExpr(env, e->Iex.Triop.arg3);
+         HReg dst = newVRegV(env);
+         addInstr(env, mk_vMOVsd_RR(argL, dst));
+         set_SSE_rounding_mode(env, e->Iex.Triop.arg1);
+         addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
+         set_SSE_rounding_default(env);
+         return dst;
+      }
+
+      case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4_rounded;
+      case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4_rounded;
+      case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4_rounded;
+      case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4_rounded;
+      do_32F0x4_rounded:
+      {
+         HReg argL = iselVecExpr(env, e->Iex.Triop.arg2);
+         HReg argR = iselVecExpr(env, e->Iex.Triop.arg3);
+         HReg dst = newVRegV(env);
+         addInstr(env, mk_vMOVsd_RR(argL, dst));
+         set_SSE_rounding_mode(env, e->Iex.Triop.arg1);
+         addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
+         set_SSE_rounding_default(env);
+         return dst;
+      }
+
+      case Iop_Mul64Fx2:   op = Asse_MULF;   goto do_64Fx2_rounded;
+      case Iop_Sub64Fx2:   op = Asse_SUBF;   goto do_64Fx2_rounded;
+      case Iop_Add64Fx2:   op = Asse_ADDF;   goto do_64Fx2_rounded;
+      case Iop_Div64Fx2:   op = Asse_DIVF;   goto do_64Fx2_rounded;
+      do_64Fx2_rounded:
+      {
+         HReg argL = iselVecExpr(env, e->Iex.Triop.arg2);
+         HReg argR = iselVecExpr(env, e->Iex.Triop.arg3);
+         HReg dst = newVRegV(env);
+         addInstr(env, mk_vMOVsd_RR(argL, dst));
+         set_SSE_rounding_mode(env, e->Iex.Triop.arg1);
+         addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
+         set_SSE_rounding_default(env);
+         return dst;
+      }
+
+      case Iop_Add32Fx4:   op = Asse_ADDF;   goto do_32Fx4_rounded;
+      case Iop_Div32Fx4:   op = Asse_DIVF;   goto do_32Fx4_rounded;
+      case Iop_Mul32Fx4:   op = Asse_MULF;   goto do_32Fx4_rounded;
+      case Iop_Sub32Fx4:   op = Asse_SUBF;   goto do_32Fx4_rounded;
+      do_32Fx4_rounded:
+      {
+         HReg argL = iselVecExpr(env, e->Iex.Triop.arg2);
+         HReg argR = iselVecExpr(env, e->Iex.Triop.arg3);
+         HReg dst = newVRegV(env);
+         addInstr(env, mk_vMOVsd_RR(argL, dst));
+         set_SSE_rounding_mode(env, e->Iex.Triop.arg1);
+         addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
+         set_SSE_rounding_default(env);
+         return dst;
+      }
+
+      default:
+         break;
+   } /* switch (e->Iex.Triop.op) */
+   } /* if (e->tag == Iex_Triop) */
+
    if (e->tag == Iex_Mux0X) {
       HReg r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
       HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
diff --git a/priv/ir_defs.c b/priv/ir_defs.c
index 06708a2..073a659 100644
--- a/priv/ir_defs.c
+++ b/priv/ir_defs.c
@@ -2341,10 +2341,14 @@ void typeOfPrimop ( IROp op,
       case Iop_RoundF32x4_RP:
       case Iop_RoundF32x4_RN:
       case Iop_RoundF32x4_RZ:
+         UNARY(Ity_V128, Ity_V128);
+
       case Iop_Abs32Fx4:
-      case Iop_Rsqrte32Fx4:
       case Iop_Rsqrte32x4:
          UNARY(Ity_V128, Ity_V128);
+
+      case Iop_Rsqrte32Fx4:
+         BINARY(ity_RMode, Ity_V128, Ity_V128);
 
       case Iop_64HLtoV128:
          BINARY(Ity_I64,Ity_I64, Ity_V128);
@@ -2396,19 +2398,11 @@ void typeOfPrimop ( IROp op,
       case Iop_CmpEQ64F0x2: case Iop_CmpLT64F0x2:
       case Iop_CmpLE32F0x4: case Iop_CmpUN32F0x4:
       case Iop_CmpLE64F0x2: case Iop_CmpUN64F0x2:
-      case Iop_Add32Fx4: case Iop_Add32F0x4:
-      case Iop_Add64Fx2: case Iop_Add64F0x2:
-      case Iop_Div32Fx4: case Iop_Div32F0x4:
-      case Iop_Div64Fx2: case Iop_Div64F0x2:
       case Iop_Max32Fx4: case Iop_Max32F0x4:
       case Iop_PwMax32Fx4: case Iop_PwMin32Fx4:
       case Iop_Max64Fx2: case Iop_Max64F0x2:
       case Iop_Min32Fx4: case Iop_Min32F0x4:
       case Iop_Min64Fx2: case Iop_Min64F0x2:
-      case Iop_Mul32Fx4: case Iop_Mul32F0x4:
-      case Iop_Mul64Fx2: case Iop_Mul64F0x2:
-      case Iop_Sub32Fx4: case Iop_Sub32F0x4:
-      case Iop_Sub64Fx2: case Iop_Sub64F0x2:
       case Iop_AndV128: case Iop_OrV128: case Iop_XorV128:
       case Iop_Add8x16:   case Iop_Add16x8:   
       case Iop_Add32x4:   case Iop_Add64x2:
@@ -2470,6 +2464,16 @@ void typeOfPrimop ( IROp op,
       case Iop_Rsqrts32Fx4:
          BINARY(Ity_V128,Ity_V128, Ity_V128);
 
+      case Iop_Mul32Fx4: case Iop_Mul32F0x4:
+      case Iop_Mul64Fx2: case Iop_Mul64F0x2:
+      case Iop_Sub32Fx4: case Iop_Sub32F0x4:
+      case Iop_Sub64Fx2: case Iop_Sub64F0x2:
+      case Iop_Add32Fx4: case Iop_Add32F0x4:
+      case Iop_Add64Fx2: case Iop_Add64F0x2:
+      case Iop_Div32Fx4: case Iop_Div32F0x4:
+      case Iop_Div64Fx2: case Iop_Div64F0x2:
+         TERNARY(ity_RMode, Ity_V128, Ity_V128, Ity_V128);
+
       case Iop_PolynomialMull8x8:
       case Iop_Mull8Ux8: case Iop_Mull8Sx8:
       case Iop_Mull16Ux4: case Iop_Mull16Sx4:
@@ -2477,13 +2481,6 @@ void typeOfPrimop ( IROp op,
          BINARY(Ity_I64, Ity_I64, Ity_V128);
 
       case Iop_NotV128:
-      case Iop_Recip32Fx4: case Iop_Recip32F0x4:
-      case Iop_Recip32x4:
-      case Iop_Recip64Fx2: case Iop_Recip64F0x2:
-      case Iop_RSqrt32Fx4: case Iop_RSqrt32F0x4:
-      case Iop_RSqrt64Fx2: case Iop_RSqrt64F0x2:
-      case Iop_Sqrt32Fx4:  case Iop_Sqrt32F0x4:
-      case Iop_Sqrt64Fx2:  case Iop_Sqrt64F0x2:
       case Iop_CmpNEZ8x16: case Iop_CmpNEZ16x8:
       case Iop_CmpNEZ32x4: case Iop_CmpNEZ64x2:
       case Iop_Cnt8x16:
@@ -2498,6 +2495,15 @@ void typeOfPrimop ( IROp op,
       case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4:
+      case Iop_Recip32x4:
          UNARY(Ity_V128, Ity_V128);
 
+      case Iop_Recip32Fx4: case Iop_Recip32F0x4:
+      case Iop_Recip64Fx2: case Iop_Recip64F0x2:
+      case Iop_RSqrt32Fx4: case Iop_RSqrt32F0x4:
+      case Iop_RSqrt64Fx2: case Iop_RSqrt64F0x2:
+      case Iop_Sqrt32Fx4:  case Iop_Sqrt32F0x4:
+      case Iop_Sqrt64Fx2:  case Iop_Sqrt64F0x2:
+         BINARY(ity_RMode, Ity_V128, Ity_V128);
+
       case Iop_ShlV128: case Iop_ShrV128:
       case Iop_ShlN8x16: case Iop_ShlN16x8: 
       case Iop_ShlN32x4: case Iop_ShlN64x2:
diff --git a/pub/libvex_ir.h b/pub/libvex_ir.h
index 04c9fa9..1bce377 100644
--- a/pub/libvex_ir.h
+++ b/pub/libvex_ir.h
@@ -987,12 +987,15 @@
 
       /* --- 32x4 vector FP --- */
 
-      /* binary */
+      /* :: IRRoundingMode(I32) x V128 x V128 -> V128 */
       Iop_Add32Fx4, Iop_Sub32Fx4, Iop_Mul32Fx4, Iop_Div32Fx4, 
+      /* :: V128 x V128 -> V128 */
       Iop_Max32Fx4, Iop_Min32Fx4,
+      /* :: I64 x I64 -> I64 (64-bit NEON); not given a rounding mode by this patch */
       Iop_Add32Fx2, Iop_Sub32Fx2,
       /* Note: For the following compares, the ppc and arm front-ends assume a
          nan in a lane of either argument returns zero for that lane. */
+      /* :: V128 x V128 -> V128 */
       Iop_CmpEQ32Fx4, Iop_CmpLT32Fx4, Iop_CmpLE32Fx4, Iop_CmpUN32Fx4,
       Iop_CmpGT32Fx4, Iop_CmpGE32Fx4,
 
@@ -1002,27 +1005,32 @@
       /* Pairwise Max and Min. See integer pairwise operations for details. */
       Iop_PwMax32Fx4, Iop_PwMin32Fx4,
 
-      /* unary */
+      /* :: IRRoundingMode(I32) x V128 -> V128 */
       Iop_Sqrt32Fx4, Iop_RSqrt32Fx4,
+      /* :: V128 -> V128 */
       Iop_Neg32Fx4,
 
       /* Vector Reciprocal Estimate finds an approximate reciprocal of each
       element in the operand vector, and places the results in the destination
       vector.  */
+      /* :: IRRoundingMode(I32) x V128 -> V128 */
       Iop_Recip32Fx4,
 
       /* Vector Reciprocal Step computes (2.0 - arg1 * arg2).
          Note, that if one of the arguments is zero and another one is infinity
          of arbitrary sign the result of the operation is 2.0. */
+      /* :: V128 x V128 -> V128 */
       Iop_Recps32Fx4,
 
       /* Vector Reciprocal Square Root Estimate finds an approximate reciprocal
          square root of each element in the operand vector. */
+      /* :: IRRoundingMode(I32) x V128 -> V128 */
       Iop_Rsqrte32Fx4,
 
       /* Vector Reciprocal Square Root Step computes (3.0 - arg1 * arg2) / 2.0.
          Note, that of one of the arguments is zero and another one is infiinty
          of arbitrary sign the result of the operation is 1.5. */
+      /* :: V128 x V128 -> V128 */
       Iop_Rsqrts32Fx4,
 
 
@@ -1050,22 +1058,24 @@
       /* In binary cases, upper 3/4 is copied from first operand.  In
          unary cases, upper 3/4 is copied from the operand. */
 
-      /* binary */
+      /* :: IRRoundingMode(I32) x V128 x V128 -> V128 */
       Iop_Add32F0x4, Iop_Sub32F0x4, Iop_Mul32F0x4, Iop_Div32F0x4, 
+      /* :: V128 x V128 -> V128 */
       Iop_Max32F0x4, Iop_Min32F0x4,
       Iop_CmpEQ32F0x4, Iop_CmpLT32F0x4, Iop_CmpLE32F0x4, Iop_CmpUN32F0x4, 
 
-      /* unary */
+      /* :: IRRoundingMode(I32) x V128 -> V128 */
       Iop_Recip32F0x4, Iop_Sqrt32F0x4, Iop_RSqrt32F0x4,
 
       /* --- 64x2 vector FP --- */
 
-      /* binary */
+      /* :: IRRoundingMode(I32) x V128 x V128 -> V128 */
       Iop_Add64Fx2, Iop_Sub64Fx2, Iop_Mul64Fx2, Iop_Div64Fx2, 
       Iop_Max64Fx2, Iop_Min64Fx2,
+      /* :: V128 x V128 -> V128 */
       Iop_CmpEQ64Fx2, Iop_CmpLT64Fx2, Iop_CmpLE64Fx2, Iop_CmpUN64Fx2, 
 
-      /* unary */
+      /* :: IRRoundingMode(I32) x V128 -> V128 */
       Iop_Recip64Fx2, Iop_Sqrt64Fx2, Iop_RSqrt64Fx2,
 
       /* --- 64x2 lowest-lane-only scalar FP --- */
@@ -1073,12 +1083,13 @@
       /* In binary cases, upper half is copied from first operand.  In
          unary cases, upper half is copied from the operand. */
 
-      /* binary */
+      /* :: IRRoundingMode(I32) x V128 x V128 -> V128 */
       Iop_Add64F0x2, Iop_Sub64F0x2, Iop_Mul64F0x2, Iop_Div64F0x2, 
       Iop_Max64F0x2, Iop_Min64F0x2,
+      /* :: V128 x V128 -> V128 */
       Iop_CmpEQ64F0x2, Iop_CmpLT64F0x2, Iop_CmpLE64F0x2, Iop_CmpUN64F0x2, 
 
-      /* unary */
+      /* :: IRRoundingMode(I32) x V128 -> V128 */
       Iop_Recip64F0x2, Iop_Sqrt64F0x2, Iop_RSqrt64F0x2,
 
       /* --- pack / unpack --- */
-- 
1.7.8.rc0.302.g61121

