|
From: <sv...@va...> - 2013-03-27 11:37:50
|
sewardj 2013-03-27 11:37:33 +0000 (Wed, 27 Mar 2013)
New Revision: 2702
Log:
AMD64: Add support for AVX2, BMI1, BMI2 and FMA instructions (VEX side).
Fixes #305728. (Jakub Jelinek, ja...@re...)
Added files:
trunk/priv/host_generic_maddf.c
trunk/priv/host_generic_maddf.h
trunk/priv/host_generic_simd256.c
trunk/priv/host_generic_simd256.h
Modified files:
trunk/priv/guest_amd64_defs.h
trunk/priv/guest_amd64_helpers.c
trunk/priv/guest_amd64_toIR.c
trunk/priv/host_amd64_isel.c
trunk/priv/ir_defs.c
trunk/priv/main_main.c
trunk/pub/libvex.h
trunk/pub/libvex_basictypes.h
trunk/pub/libvex_ir.h
Modified: trunk/priv/ir_defs.c (+90 -0)
===================================================================
--- trunk/priv/ir_defs.c 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/priv/ir_defs.c 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -1036,6 +1036,68 @@
case Iop_NotV256: vex_printf("NotV256"); return;
case Iop_CmpNEZ64x4: vex_printf("CmpNEZ64x4"); return;
case Iop_CmpNEZ32x8: vex_printf("CmpNEZ32x8"); return;
+ case Iop_CmpNEZ16x16: vex_printf("CmpNEZ16x16"); return;
+ case Iop_CmpNEZ8x32: vex_printf("CmpNEZ8x32"); return;
+
+ case Iop_Add8x32: vex_printf("Add8x32"); return;
+ case Iop_Add16x16: vex_printf("Add16x16"); return;
+ case Iop_Add32x8: vex_printf("Add32x8"); return;
+ case Iop_Add64x4: vex_printf("Add64x4"); return;
+ case Iop_Sub8x32: vex_printf("Sub8x32"); return;
+ case Iop_Sub16x16: vex_printf("Sub16x16"); return;
+ case Iop_Sub32x8: vex_printf("Sub32x8"); return;
+ case Iop_Sub64x4: vex_printf("Sub64x4"); return;
+ case Iop_QAdd8Ux32: vex_printf("QAdd8Ux32"); return;
+ case Iop_QAdd16Ux16: vex_printf("QAdd16Ux16"); return;
+ case Iop_QAdd8Sx32: vex_printf("QAdd8Sx32"); return;
+ case Iop_QAdd16Sx16: vex_printf("QAdd16Sx16"); return;
+ case Iop_QSub8Ux32: vex_printf("QSub8Ux32"); return;
+ case Iop_QSub16Ux16: vex_printf("QSub16Ux16"); return;
+ case Iop_QSub8Sx32: vex_printf("QSub8Sx32"); return;
+ case Iop_QSub16Sx16: vex_printf("QSub16Sx16"); return;
+
+ case Iop_Mul16x16: vex_printf("Mul16x16"); return;
+ case Iop_Mul32x8: vex_printf("Mul32x8"); return;
+ case Iop_MulHi16Ux16: vex_printf("MulHi16Ux16"); return;
+ case Iop_MulHi16Sx16: vex_printf("MulHi16Sx16"); return;
+
+ case Iop_Avg8Ux32: vex_printf("Avg8Ux32"); return;
+ case Iop_Avg16Ux16: vex_printf("Avg16Ux16"); return;
+
+ case Iop_Max8Sx32: vex_printf("Max8Sx32"); return;
+ case Iop_Max16Sx16: vex_printf("Max16Sx16"); return;
+ case Iop_Max32Sx8: vex_printf("Max32Sx8"); return;
+ case Iop_Max8Ux32: vex_printf("Max8Ux32"); return;
+ case Iop_Max16Ux16: vex_printf("Max16Ux16"); return;
+ case Iop_Max32Ux8: vex_printf("Max32Ux8"); return;
+
+ case Iop_Min8Sx32: vex_printf("Min8Sx32"); return;
+ case Iop_Min16Sx16: vex_printf("Min16Sx16"); return;
+ case Iop_Min32Sx8: vex_printf("Min32Sx8"); return;
+ case Iop_Min8Ux32: vex_printf("Min8Ux32"); return;
+ case Iop_Min16Ux16: vex_printf("Min16Ux16"); return;
+ case Iop_Min32Ux8: vex_printf("Min32Ux8"); return;
+
+ case Iop_CmpEQ8x32: vex_printf("CmpEQ8x32"); return;
+ case Iop_CmpEQ16x16: vex_printf("CmpEQ16x16"); return;
+ case Iop_CmpEQ32x8: vex_printf("CmpEQ32x8"); return;
+ case Iop_CmpEQ64x4: vex_printf("CmpEQ64x4"); return;
+ case Iop_CmpGT8Sx32: vex_printf("CmpGT8Sx32"); return;
+ case Iop_CmpGT16Sx16: vex_printf("CmpGT16Sx16"); return;
+ case Iop_CmpGT32Sx8: vex_printf("CmpGT32Sx8"); return;
+ case Iop_CmpGT64Sx4: vex_printf("CmpGT64Sx4"); return;
+
+ case Iop_ShlN16x16: vex_printf("ShlN16x16"); return;
+ case Iop_ShlN32x8: vex_printf("ShlN32x8"); return;
+ case Iop_ShlN64x4: vex_printf("ShlN64x4"); return;
+ case Iop_ShrN16x16: vex_printf("ShrN16x16"); return;
+ case Iop_ShrN32x8: vex_printf("ShrN32x8"); return;
+ case Iop_ShrN64x4: vex_printf("ShrN64x4"); return;
+ case Iop_SarN16x16: vex_printf("SarN16x16"); return;
+ case Iop_SarN32x8: vex_printf("SarN32x8"); return;
+
+ case Iop_Perm32x8: vex_printf("Perm32x8"); return;
+
default: vpanic("ppIROp(1)");
}
@@ -3001,6 +3063,26 @@
case Iop_XorV256:
case Iop_Max32Fx8: case Iop_Min32Fx8:
case Iop_Max64Fx4: case Iop_Min64Fx4:
+ case Iop_Add8x32: case Iop_Add16x16:
+ case Iop_Add32x8: case Iop_Add64x4:
+ case Iop_Sub8x32: case Iop_Sub16x16:
+ case Iop_Sub32x8: case Iop_Sub64x4:
+ case Iop_Mul16x16: case Iop_Mul32x8:
+ case Iop_MulHi16Ux16: case Iop_MulHi16Sx16:
+ case Iop_Avg8Ux32: case Iop_Avg16Ux16:
+ case Iop_Max8Sx32: case Iop_Max16Sx16: case Iop_Max32Sx8:
+ case Iop_Max8Ux32: case Iop_Max16Ux16: case Iop_Max32Ux8:
+ case Iop_Min8Sx32: case Iop_Min16Sx16: case Iop_Min32Sx8:
+ case Iop_Min8Ux32: case Iop_Min16Ux16: case Iop_Min32Ux8:
+ case Iop_CmpEQ8x32: case Iop_CmpEQ16x16:
+ case Iop_CmpEQ32x8: case Iop_CmpEQ64x4:
+ case Iop_CmpGT8Sx32: case Iop_CmpGT16Sx16:
+ case Iop_CmpGT32Sx8: case Iop_CmpGT64Sx4:
+ case Iop_QAdd8Ux32: case Iop_QAdd16Ux16:
+ case Iop_QAdd8Sx32: case Iop_QAdd16Sx16:
+ case Iop_QSub8Ux32: case Iop_QSub16Ux16:
+ case Iop_QSub8Sx32: case Iop_QSub16Sx16:
+ case Iop_Perm32x8:
BINARY(Ity_V256,Ity_V256, Ity_V256);
case Iop_V256toV128_1: case Iop_V256toV128_0:
@@ -3014,9 +3096,17 @@
case Iop_Sqrt32Fx8:
case Iop_Sqrt64Fx4:
case Iop_Recip32Fx8:
+ case Iop_CmpNEZ8x32: case Iop_CmpNEZ16x16:
case Iop_CmpNEZ64x4: case Iop_CmpNEZ32x8:
UNARY(Ity_V256, Ity_V256);
+ case Iop_ShlN16x16: case Iop_ShlN32x8:
+ case Iop_ShlN64x4:
+ case Iop_ShrN16x16: case Iop_ShrN32x8:
+ case Iop_ShrN64x4:
+ case Iop_SarN16x16: case Iop_SarN32x8:
+ BINARY(Ity_V256,Ity_I8, Ity_V256);
+
default:
ppIROp(op);
vpanic("typeOfPrimop");
Added: trunk/priv/host_generic_simd256.h (+55 -0)
===================================================================
--- trunk/priv/host_generic_simd256.h 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/priv/host_generic_simd256.h 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -0,0 +1,55 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd256.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2012 OpenWorks GbR
+ in...@op...
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 256-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ as clean helper functions from IR.
+
+ These will get called from generated code and therefore should be
+ well behaved -- no floating point or mmx insns, just straight
+ integer code.
+
+ Each function implements the correspondingly-named IR primop.
+*/
+
+#ifndef __VEX_HOST_GENERIC_SIMD256_H
+#define __VEX_HOST_GENERIC_SIMD256_H
+
+#include "libvex_basictypes.h"
+
+extern VEX_REGPARM(3)
+ void h_generic_calc_Perm32x8 ( /*OUT*/V256*, V256*, V256* );
+
+#endif /* ndef __VEX_HOST_GENERIC_SIMD256_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd256.h ---*/
+/*---------------------------------------------------------------*/
Added: trunk/priv/host_generic_simd256.c (+57 -0)
===================================================================
--- trunk/priv/host_generic_simd256.c 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/priv/host_generic_simd256.c 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -0,0 +1,57 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd256.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2012 OpenWorks GbR
+ in...@op...
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 256-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ from IR. */
+
+#include "libvex_basictypes.h"
+#include "host_generic_simd256.h"
+
+
+void VEX_REGPARM(3)
+ h_generic_calc_Perm32x8 ( /*OUT*/V256* res,
+ V256* argL, V256* argR )
+{
+ res->w32[0] = argL->w32[ argR->w32[0] & 7 ];
+ res->w32[1] = argL->w32[ argR->w32[1] & 7 ];
+ res->w32[2] = argL->w32[ argR->w32[2] & 7 ];
+ res->w32[3] = argL->w32[ argR->w32[3] & 7 ];
+ res->w32[4] = argL->w32[ argR->w32[4] & 7 ];
+ res->w32[5] = argL->w32[ argR->w32[5] & 7 ];
+ res->w32[6] = argL->w32[ argR->w32[6] & 7 ];
+ res->w32[7] = argL->w32[ argR->w32[7] & 7 ];
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd256.c ---*/
+/*---------------------------------------------------------------*/
Added: trunk/priv/host_generic_maddf.h (+48 -0)
===================================================================
--- trunk/priv/host_generic_maddf.h 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/priv/host_generic_maddf.h 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -0,0 +1,48 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_maddf.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ Compute x * y + z as ternary operation.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Jakub Jelinek <ja...@re...>, 2010.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/* Generic helper functions for doing FMA, i.e. compute x * y + z
+ as ternary operation.
+ These are purely back-end entities and cannot be seen/referenced
+ from IR. */
+
+#ifndef __VEX_HOST_GENERIC_MADDF_H
+#define __VEX_HOST_GENERIC_MADDF_H
+
+#include "libvex_basictypes.h"
+
+extern VEX_REGPARM(3)
+ void h_generic_calc_MAddF32 ( /*OUT*/Float*, Float*, Float*, Float* );
+
+extern VEX_REGPARM(3)
+ void h_generic_calc_MAddF64 ( /*OUT*/Double*, Double*, Double*,
+ Double* );
+
+#endif /* ndef __VEX_HOST_GENERIC_MADDF_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_maddf.h --*/
+/*---------------------------------------------------------------*/
Added: trunk/priv/host_generic_maddf.c (+320 -0)
===================================================================
--- trunk/priv/host_generic_maddf.c 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/priv/host_generic_maddf.c 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -0,0 +1,320 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_maddf.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ Compute x * y + z as ternary operation.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Jakub Jelinek <ja...@re...>, 2010.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+/* Generic helper functions for doing FMA, i.e. compute x * y + z
+ as ternary operation.
+ These are purely back-end entities and cannot be seen/referenced
+ from IR. */
+
+#include "libvex_basictypes.h"
+#include "host_generic_maddf.h"
+#include "main_util.h"
+
+/* This implementation relies on Double being more than twice as
+ precise as Float and uses rounding to odd in order to avoid problems
+ with double rounding.
+ See a paper by Boldo and Melquiond:
+ http://www.lri.fr/~melquion/doc/08-tc.pdf */
+
+#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))
+
+#if defined(__x86_64__) && defined(__SSE2_MATH__)
+# define ENV_TYPE unsigned int
+/* Save current rounding mode into ENV, hold exceptions, set rounding
+ mode to rounding toward zero. */
+# define ROUNDTOZERO(env) \
+ do { \
+ unsigned int mxcsr; \
+ __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
+ (env) = mxcsr; \
+ mxcsr = (mxcsr | 0x7f80) & ~0x3f; \
+ __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
+ } while (0)
+/* Restore exceptions from ENV, return if inexact exception has been raised
+ since ROUNDTOZERO. */
+# define RESET_TESTINEXACT(env) \
+ ({ \
+ unsigned int mxcsr, ret; \
+ __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
+ ret = (mxcsr >> 5) & 1; \
+ mxcsr = (mxcsr & 0x3d) | (env); \
+ __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
+ ret; \
+ })
+/* Return if inexact exception has been raised since ROUNDTOZERO. */
+# define TESTINEXACT() \
+ ({ \
+ unsigned int mxcsr; \
+ __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
+ (mxcsr >> 5) & 1; \
+ })
+#endif
+
+#define DBL_MANT_DIG 53
+#define IEEE754_DOUBLE_BIAS 0x3ff
+
+union vg_ieee754_double {
+ Double d;
+
+ /* This is the IEEE 754 double-precision format. */
+ struct {
+#ifdef VKI_BIG_ENDIAN
+ unsigned int negative:1;
+ unsigned int exponent:11;
+ unsigned int mantissa0:20;
+ unsigned int mantissa1:32;
+#else
+ unsigned int mantissa1:32;
+ unsigned int mantissa0:20;
+ unsigned int exponent:11;
+ unsigned int negative:1;
+#endif
+ } ieee;
+};
+
+void VEX_REGPARM(3)
+ h_generic_calc_MAddF32 ( /*OUT*/Float* res,
+ Float* argX, Float* argY, Float* argZ )
+{
+#ifndef ENV_TYPE
+ /* Lame fallback implementation. */
+ *res = *argX * *argY + *argZ;
+#else
+ ENV_TYPE env;
+ /* Multiplication is always exact. */
+ Double temp = (Double) *argX * (Double) *argY;
+ union vg_ieee754_double u;
+
+ ROUNDTOZERO (env);
+
+ /* Perform addition with round to odd. */
+ u.d = temp + (Double) *argZ;
+ /* Ensure the addition is not scheduled after fetestexcept call. */
+ FORCE_EVAL (u.d);
+
+ /* Reset rounding mode and test for inexact simultaneously. */
+ int j = RESET_TESTINEXACT (env);
+
+ if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
+ u.ieee.mantissa1 |= j;
+
+ /* And finally truncation with round to nearest. */
+ *res = (Float) u.d;
+#endif
+}
+
+
+void VEX_REGPARM(3)
+ h_generic_calc_MAddF64 ( /*OUT*/Double* res,
+ Double* argX, Double* argY, Double* argZ )
+{
+#ifndef ENV_TYPE
+ /* Lame fallback implementation. */
+ *res = *argX * *argY + *argZ;
+#else
+ Double x = *argX, y = *argY, z = *argZ;
+ union vg_ieee754_double u, v, w;
+ int adjust = 0;
+ u.d = x;
+ v.d = y;
+ w.d = z;
+ if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
+ >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
+ || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
+ || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
+ || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
+ || UNLIKELY (u.ieee.exponent + v.ieee.exponent
+ <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
+ /* If z is Inf, but x and y are finite, the result should be
+ z rather than NaN. */
+ if (w.ieee.exponent == 0x7ff
+ && u.ieee.exponent != 0x7ff
+ && v.ieee.exponent != 0x7ff) {
+ *res = (z + x) + y;
+ return;
+ }
+ /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
+ or if x * y is less than half of DBL_DENORM_MIN,
+ compute as x * y + z. */
+ if (u.ieee.exponent == 0x7ff
+ || v.ieee.exponent == 0x7ff
+ || w.ieee.exponent == 0x7ff
+ || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
+ || u.ieee.exponent + v.ieee.exponent
+ < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
+ *res = x * y + z;
+ return;
+ }
+ if (u.ieee.exponent + v.ieee.exponent
+ >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
+ /* Compute 1p-53 times smaller result and multiply
+ at the end. */
+ if (u.ieee.exponent > v.ieee.exponent)
+ u.ieee.exponent -= DBL_MANT_DIG;
+ else
+ v.ieee.exponent -= DBL_MANT_DIG;
+ /* If x + y exponent is very large and z exponent is very small,
+ it doesn't matter if we don't adjust it. */
+ if (w.ieee.exponent > DBL_MANT_DIG)
+ w.ieee.exponent -= DBL_MANT_DIG;
+ adjust = 1;
+ } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
+ /* Similarly.
+ If z exponent is very large and x and y exponents are
+ very small, it doesn't matter if we don't adjust it. */
+ if (u.ieee.exponent > v.ieee.exponent) {
+ if (u.ieee.exponent > DBL_MANT_DIG)
+ u.ieee.exponent -= DBL_MANT_DIG;
+ } else if (v.ieee.exponent > DBL_MANT_DIG)
+ v.ieee.exponent -= DBL_MANT_DIG;
+ w.ieee.exponent -= DBL_MANT_DIG;
+ adjust = 1;
+ } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
+ u.ieee.exponent -= DBL_MANT_DIG;
+ if (v.ieee.exponent)
+ v.ieee.exponent += DBL_MANT_DIG;
+ else
+ v.d *= 0x1p53;
+ } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
+ v.ieee.exponent -= DBL_MANT_DIG;
+ if (u.ieee.exponent)
+ u.ieee.exponent += DBL_MANT_DIG;
+ else
+ u.d *= 0x1p53;
+ } else /* if (u.ieee.exponent + v.ieee.exponent
+ <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
+ if (u.ieee.exponent > v.ieee.exponent)
+ u.ieee.exponent += 2 * DBL_MANT_DIG;
+ else
+ v.ieee.exponent += 2 * DBL_MANT_DIG;
+ if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
+ if (w.ieee.exponent)
+ w.ieee.exponent += 2 * DBL_MANT_DIG;
+ else
+ w.d *= 0x1p106;
+ adjust = -1;
+ }
+ /* Otherwise x * y should just affect inexact
+ and nothing else. */
+ }
+ x = u.d;
+ y = v.d;
+ z = w.d;
+ }
+ /* Multiplication m1 + m2 = x * y using Dekker's algorithm. */
+# define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
+ Double x1 = x * C;
+ Double y1 = y * C;
+ Double m1 = x * y;
+ x1 = (x - x1) + x1;
+ y1 = (y - y1) + y1;
+ Double x2 = x - x1;
+ Double y2 = y - y1;
+ Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
+# undef C
+
+ /* Addition a1 + a2 = z + m1 using Knuth's algorithm. */
+ Double a1 = z + m1;
+ Double t1 = a1 - z;
+ Double t2 = a1 - t1;
+ t1 = m1 - t1;
+ t2 = z - t2;
+ Double a2 = t1 + t2;
+
+ ENV_TYPE env;
+ ROUNDTOZERO (env);
+
+ /* Perform m2 + a2 addition with round to odd. */
+ u.d = a2 + m2;
+
+ if (UNLIKELY (adjust < 0)) {
+ if ((u.ieee.mantissa1 & 1) == 0)
+ u.ieee.mantissa1 |= TESTINEXACT ();
+ v.d = a1 + u.d;
+ /* Ensure the addition is not scheduled after fetestexcept call. */
+ FORCE_EVAL (v.d);
+ }
+
+ /* Reset rounding mode and test for inexact simultaneously. */
+ int j = RESET_TESTINEXACT (env) != 0;
+
+ if (LIKELY (adjust == 0)) {
+ if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
+ u.ieee.mantissa1 |= j;
+ /* Result is a1 + u.d. */
+ *res = a1 + u.d;
+ } else if (LIKELY (adjust > 0)) {
+ if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
+ u.ieee.mantissa1 |= j;
+ /* Result is a1 + u.d, scaled up. */
+ *res = (a1 + u.d) * 0x1p53;
+ } else {
+ /* If a1 + u.d is exact, the only rounding happens during
+ scaling down. */
+ if (j == 0) {
+ *res = v.d * 0x1p-106;
+ return;
+ }
+ /* If result rounded to zero is not subnormal, no double
+ rounding will occur. */
+ if (v.ieee.exponent > 106) {
+ *res = (a1 + u.d) * 0x1p-106;
+ return;
+ }
+ /* If v.d * 0x1p-106 with round to zero is a subnormal above
+ or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
+ down just by 1 bit, which means v.ieee.mantissa1 |= j would
+ change the round bit, not sticky or guard bit.
+ v.d * 0x1p-106 never normalizes by shifting up,
+ so round bit plus sticky bit should be already enough
+ for proper rounding. */
+ if (v.ieee.exponent == 106) {
+ /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
+ v.ieee.mantissa1 & 1 is the round bit and j is our sticky
+ bit. In round-to-nearest 001 rounds down like 00,
+ 011 rounds up, even though 01 rounds down (thus we need
+ to adjust), 101 rounds down like 10 and 111 rounds up
+ like 11. */
+ if ((v.ieee.mantissa1 & 3) == 1) {
+ v.d *= 0x1p-106;
+ if (v.ieee.negative)
+ *res = v.d - 0x1p-1074;
+ else
+ *res = v.d + 0x1p-1074;
+ } else
+ *res = v.d * 0x1p-106;
+ return;
+ }
+ v.ieee.mantissa1 |= j;
+ *res = v.d * 0x1p-106;
+ return;
+ }
+#endif
+}
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_maddf.c --*/
+/*---------------------------------------------------------------*/
Modified: trunk/priv/host_amd64_isel.c (+356 -1)
===================================================================
--- trunk/priv/host_amd64_isel.c 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/priv/host_amd64_isel.c 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -43,6 +43,8 @@
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_generic_simd128.h"
+#include "host_generic_simd256.h"
+#include "host_generic_maddf.h"
#include "host_amd64_defs.h"
@@ -2531,6 +2533,73 @@
return dst;
}
+ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
+ /* Sigh ... very rough code. Could do much better. */
+ /* Get the 128-bit literal 00---0 10---0 into a register
+ and xor it with the value to be negated. */
+ HReg r1 = newVRegI(env);
+ HReg dst = newVRegV(env);
+ HReg tmp = newVRegV(env);
+ HReg src = iselFltExpr(env, e->Iex.Unop.arg);
+ AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
+ addInstr(env, mk_vMOVsd_RR(src,tmp));
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
+ addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
+ addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
+ addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
+ add_to_rsp(env, 16);
+ return dst;
+ }
+
+ if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
+ IRQop *qop = e->Iex.Qop.details;
+ HReg dst = newVRegV(env);
+ HReg argX = iselFltExpr(env, qop->arg2);
+ HReg argY = iselFltExpr(env, qop->arg3);
+ HReg argZ = iselFltExpr(env, qop->arg4);
+ /* XXXROUNDINGFIXME */
+ /* set roundingmode here */
+ /* subq $16, %rsp -- make a space*/
+ sub_from_rsp(env, 16);
+ /* Prepare 4 arg regs:
+ leaq 0(%rsp), %rdi
+ leaq 4(%rsp), %rsi
+ leaq 8(%rsp), %rdx
+ leaq 12(%rsp), %rcx
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
+ hregAMD64_RSI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
+ hregAMD64_RDX()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
+ hregAMD64_RCX()));
+ /* Store the three args, at (%rsi), (%rdx) and (%rcx):
+ movss %argX, 0(%rsi)
+ movss %argY, 0(%rdx)
+ movss %argZ, 0(%rcx)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
+ AMD64AMode_IR(0, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
+ AMD64AMode_IR(0, hregAMD64_RDX())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
+ AMD64AMode_IR(0, hregAMD64_RCX())));
+ /* call the helper */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+ (ULong)(HWord)h_generic_calc_MAddF32,
+ 4, RetLocNone ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
+ AMD64AMode_IR(0, hregAMD64_RSP())));
+ /* and finally, clear the space */
+ add_to_rsp(env, 16);
+ return dst;
+ }
+
ppIRExpr(e);
vpanic("iselFltExpr_wrk");
}
@@ -2662,6 +2731,54 @@
}
}
+ if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
+ IRQop *qop = e->Iex.Qop.details;
+ HReg dst = newVRegV(env);
+ HReg argX = iselDblExpr(env, qop->arg2);
+ HReg argY = iselDblExpr(env, qop->arg3);
+ HReg argZ = iselDblExpr(env, qop->arg4);
+ /* XXXROUNDINGFIXME */
+ /* set roundingmode here */
+ /* subq $32, %rsp -- make a space*/
+ sub_from_rsp(env, 32);
+ /* Prepare 4 arg regs:
+ leaq 0(%rsp), %rdi
+ leaq 8(%rsp), %rsi
+ leaq 16(%rsp), %rdx
+ leaq 24(%rsp), %rcx
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
+ hregAMD64_RSI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
+ hregAMD64_RDX()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
+ hregAMD64_RCX()));
+ /* Store the three args, at (%rsi), (%rdx) and (%rcx):
+ movsd %argX, 0(%rsi)
+ movsd %argY, 0(%rdx)
+ movsd %argZ, 0(%rcx)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
+ AMD64AMode_IR(0, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
+ AMD64AMode_IR(0, hregAMD64_RDX())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
+ AMD64AMode_IR(0, hregAMD64_RCX())));
+ /* call the helper */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+ (ULong)(HWord)h_generic_calc_MAddF64,
+ 4, RetLocNone ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
+ AMD64AMode_IR(0, hregAMD64_RSP())));
+ /* and finally, clear the space */
+ add_to_rsp(env, 32);
+ return dst;
+ }
+
if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
@@ -3478,6 +3595,7 @@
static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
ISelEnv* env, IRExpr* e )
{
+ HWord fn = 0; /* address of helper fn, if required */
vassert(e);
IRType ty = typeOfIRExpr(env->type_env,e);
vassert(ty == Ity_V256);
@@ -3599,6 +3717,8 @@
}
case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
+ case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
+ case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
do_CmpNEZ_vector:
{
HReg argHi, argLo;
@@ -3673,6 +3793,37 @@
case Iop_AndV256: op = Asse_AND; goto do_SseReRg;
case Iop_OrV256: op = Asse_OR; goto do_SseReRg;
case Iop_XorV256: op = Asse_XOR; goto do_SseReRg;
+ case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg;
+ case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg;
+ case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg;
+ case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg;
+ case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg;
+ case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg;
+ case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg;
+ case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg;
+ case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg;
+ case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg;
+ case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg;
+ case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg;
+ case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg;
+ case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg;
+ case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
+ case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
+ case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg;
+ case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg;
+ case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg;
+ case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg;
+ case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
+ case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
+ case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg;
+ case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg;
+ case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg;
+ case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg;
+ case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg;
+ case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg;
+ case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg;
+ case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg;
+ case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg;
do_SseReRg:
{
HReg argLhi, argLlo, argRhi, argRlo;
@@ -3689,12 +3840,198 @@
return;
}
+ case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
+ case Iop_ShlN32x8: op = Asse_SHL32; goto do_SseShift;
+ case Iop_ShlN64x4: op = Asse_SHL64; goto do_SseShift;
+ case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
+ case Iop_SarN32x8: op = Asse_SAR32; goto do_SseShift;
+ case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
+ case Iop_ShrN32x8: op = Asse_SHR32; goto do_SseShift;
+ case Iop_ShrN64x4: op = Asse_SHR64; goto do_SseShift;
+ do_SseShift: {
+ /* Vector shift of a V256 by a scalar amount. Materialise the
+ amount as a 128-bit value on the stack (push 0, then push the
+ amount, so the amount ends up in the low 64 bits), load it
+ into xmm reg 'ereg', and apply the same SSE shift to each
+ 128-bit half of the source. */
+ HReg gregHi, gregLo;
+ iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
+ AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
+ HReg ereg = newVRegV(env);
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
+ addInstr(env, AMD64Instr_Push(rmi));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
+ addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
+ addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
+ addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
+ addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
+ /* drop the 16 bytes pushed above */
+ add_to_rsp(env, 16);
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
case Iop_V128HLtoV256: {
*rHi = iselVecExpr(env, e->Iex.Binop.arg1);
*rLo = iselVecExpr(env, e->Iex.Binop.arg2);
return;
}
+ /* 256-bit ops with no direct SSE equivalent: call a generic
+ 128-bit helper once per 128-bit half of the operands. */
+ case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4;
+ goto do_SseAssistedBinary;
+ case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4;
+ goto do_SseAssistedBinary;
+ case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4;
+ goto do_SseAssistedBinary;
+ case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4;
+ goto do_SseAssistedBinary;
+ case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4;
+ goto do_SseAssistedBinary;
+ case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8;
+ goto do_SseAssistedBinary;
+ case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8;
+ goto do_SseAssistedBinary;
+ case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16;
+ goto do_SseAssistedBinary;
+ case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16;
+ goto do_SseAssistedBinary;
+ case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2;
+ goto do_SseAssistedBinary;
+ case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
+ goto do_SseAssistedBinary;
+ do_SseAssistedBinary: {
+ /* RRRufff! RRRufff code is what we're generating here. Oh
+ well. */
+ /* Scratch-area layout, relative to the 16-aligned 'argp':
+ 0..15 hi result, 16..31 argLhi, 32..47 argRhi,
+ 48..63 lo result, 64..79 argLlo, 80..95 argRlo.
+ The helper is called twice: once for the hi halves, once
+ for the lo halves. */
+ vassert(fn != 0);
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
+ HReg argLhi, argLlo, argRhi, argRlo;
+ iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
+ iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
+ HReg argp = newVRegI(env);
+ /* subq $160, %rsp -- make a space */
+ sub_from_rsp(env, 160);
+ /* leaq 48(%rsp), %r_argp -- point into it */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+ argp));
+ /* andq $-16, %r_argp -- 16-align the pointer */
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm( ~(UInt)15 ),
+ argp));
+ /* Prepare 3 arg regs:
+ leaq 0(%r_argp), %rdi
+ leaq 16(%r_argp), %rsi
+ leaq 32(%r_argp), %rdx
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
+ hregAMD64_RSI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
+ hregAMD64_RDX()));
+ /* Store the two high args, at (%rsi) and (%rdx):
+ movupd %argLhi, 0(%rsi)
+ movupd %argRhi, 0(%rdx)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
+ AMD64AMode_IR(0, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
+ AMD64AMode_IR(0, hregAMD64_RDX())));
+ /* Store the two low args, at 48(%rsi) and 48(%rdx):
+ movupd %argLlo, 48(%rsi)
+ movupd %argRlo, 48(%rdx)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
+ AMD64AMode_IR(48, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
+ AMD64AMode_IR(48, hregAMD64_RDX())));
+ /* call the helper, for the high halves */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone ));
+ /* Prepare 3 arg regs, now pointing at the low halves:
+ leaq 48(%r_argp), %rdi
+ leaq 64(%r_argp), %rsi
+ leaq 80(%r_argp), %rdx
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
+ hregAMD64_RSI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
+ hregAMD64_RDX()));
+ /* call the helper again, for the low halves */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
+ AMD64AMode_IR(0, argp)));
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
+ AMD64AMode_IR(48, argp)));
+ /* and finally, clear the space */
+ add_to_rsp(env, 160);
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
+ case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8;
+ goto do_SseAssistedBinary256;
+ do_SseAssistedBinary256: {
+ /* RRRufff! RRRufff code is what we're generating here. Oh
+ well. */
+ /* Unlike do_SseAssistedBinary, the helper here operates on
+ whole 256-bit values. Scratch-area layout, relative to the
+ 16-aligned 'argp': 0..31 result, 32..63 argL, 64..95 argR;
+ a single call computes the full result. */
+ vassert(fn != 0);
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
+ HReg argLhi, argLlo, argRhi, argRlo;
+ iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
+ iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
+ HReg argp = newVRegI(env);
+ /* subq $160, %rsp -- make a space */
+ sub_from_rsp(env, 160);
+ /* leaq 48(%rsp), %r_argp -- point into it */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+ argp));
+ /* andq $-16, %r_argp -- 16-align the pointer */
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm( ~(UInt)15 ),
+ argp));
+ /* Prepare 3 arg regs:
+ leaq 0(%r_argp), %rdi
+ leaq 32(%r_argp), %rsi
+ leaq 64(%r_argp), %rdx
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
+ hregAMD64_RSI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
+ hregAMD64_RDX()));
+ /* Store the two args, at (%rsi) and (%rdx), low half first:
+ movupd %argLlo, 0(%rsi)
+ movupd %argLhi, 16(%rsi)
+ movupd %argRlo, 0(%rdx)
+ movupd %argRhi, 16(%rdx)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
+ AMD64AMode_IR(0, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
+ AMD64AMode_IR(16, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
+ AMD64AMode_IR(0, hregAMD64_RDX())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
+ AMD64AMode_IR(16, hregAMD64_RDX())));
+ /* call the helper */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
+ AMD64AMode_IR(0, argp)));
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
+ AMD64AMode_IR(16, argp)));
+ /* and finally, clear the space */
+ add_to_rsp(env, 160);
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
default:
break;
} /* switch (e->Iex.Binop.op) */
@@ -3725,6 +4062,22 @@
return;
}
+ if (e->tag == Iex_ITE) {
+ /* Conditional select (ITE) of a V256 value: copy the 'iftrue'
+ halves into the destination regs, then conditionally overwrite
+ them with the 'iffalse' halves under the negated condition
+ (hence cc ^ 1). */
+ HReg r1Hi, r1Lo, r0Hi, r0Lo;
+ iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
+ iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
+ addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
+ AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
+ addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
+ addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
//avx_fail:
vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
@@ -4303,7 +4656,9 @@
| VEX_HWCAPS_AMD64_CX16
| VEX_HWCAPS_AMD64_LZCNT
| VEX_HWCAPS_AMD64_AVX
- | VEX_HWCAPS_AMD64_RDTSCP)));
+ | VEX_HWCAPS_AMD64_RDTSCP
+ | VEX_HWCAPS_AMD64_BMI
+ | VEX_HWCAPS_AMD64_AVX2)));
/* Make up an initial environment to use. */
env = LibVEX_Alloc(sizeof(ISelEnv));
Modified: trunk/pub/libvex_ir.h (+28 -1)
===================================================================
--- trunk/pub/libvex_ir.h 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/pub/libvex_ir.h 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -1514,8 +1514,35 @@
Iop_NotV256,
/* MISC (vector integer cmp != 0) */
- Iop_CmpNEZ32x8, Iop_CmpNEZ64x4,
+ Iop_CmpNEZ8x32, Iop_CmpNEZ16x16, Iop_CmpNEZ32x8, Iop_CmpNEZ64x4,
+ Iop_Add8x32, Iop_Add16x16, Iop_Add32x8, Iop_Add64x4,
+ Iop_Sub8x32, Iop_Sub16x16, Iop_Sub32x8, Iop_Sub64x4,
+
+ Iop_CmpEQ8x32, Iop_CmpEQ16x16, Iop_CmpEQ32x8, Iop_CmpEQ64x4,
+ Iop_CmpGT8Sx32, Iop_CmpGT16Sx16, Iop_CmpGT32Sx8, Iop_CmpGT64Sx4,
+
+ Iop_ShlN16x16, Iop_ShlN32x8, Iop_ShlN64x4,
+ Iop_ShrN16x16, Iop_ShrN32x8, Iop_ShrN64x4,
+ Iop_SarN16x16, Iop_SarN32x8,
+
+ Iop_Max8Sx32, Iop_Max16Sx16, Iop_Max32Sx8,
+ Iop_Max8Ux32, Iop_Max16Ux16, Iop_Max32Ux8,
+ Iop_Min8Sx32, Iop_Min16Sx16, Iop_Min32Sx8,
+ Iop_Min8Ux32, Iop_Min16Ux16, Iop_Min32Ux8,
+
+ Iop_Mul16x16, Iop_Mul32x8,
+ Iop_MulHi16Ux16, Iop_MulHi16Sx16,
+
+ Iop_QAdd8Ux32, Iop_QAdd16Ux16,
+ Iop_QAdd8Sx32, Iop_QAdd16Sx16,
+ Iop_QSub8Ux32, Iop_QSub16Ux16,
+ Iop_QSub8Sx32, Iop_QSub16Sx16,
+
+ Iop_Avg8Ux32, Iop_Avg16Ux16,
+
+ Iop_Perm32x8,
+
/* ------------------ 256-bit SIMD FP. ------------------ */
Iop_Add64Fx4,
Iop_Sub64Fx4,
Modified: trunk/pub/libvex.h (+7 -5)
===================================================================
--- trunk/pub/libvex.h 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/pub/libvex.h 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -79,11 +79,13 @@
/* amd64: baseline capability is SSE2, with cmpxchg8b but not
cmpxchg16b. */
-#define VEX_HWCAPS_AMD64_SSE3 (1<<5) /* SSE3 support */
-#define VEX_HWCAPS_AMD64_CX16 (1<<6) /* cmpxchg16b support */
-#define VEX_HWCAPS_AMD64_LZCNT (1<<7) /* SSE4a LZCNT insn */
-#define VEX_HWCAPS_AMD64_AVX (1<<8) /* AVX instructions */
-#define VEX_HWCAPS_AMD64_RDTSCP (1<<9) /* RDTSCP instruction */
+#define VEX_HWCAPS_AMD64_SSE3 (1<<5) /* SSE3 support */
+#define VEX_HWCAPS_AMD64_CX16 (1<<6) /* cmpxchg16b support */
+#define VEX_HWCAPS_AMD64_LZCNT (1<<7) /* SSE4a LZCNT insn */
+#define VEX_HWCAPS_AMD64_AVX (1<<8) /* AVX instructions */
+#define VEX_HWCAPS_AMD64_RDTSCP (1<<9) /* RDTSCP instruction */
+#define VEX_HWCAPS_AMD64_BMI (1<<10) /* BMI1 instructions */
+#define VEX_HWCAPS_AMD64_AVX2 (1<<11) /* AVX2 instructions */
/* ppc32: baseline capability is integer only */
#define VEX_HWCAPS_PPC32_F (1<<8) /* basic (non-optional) FP */
Modified: trunk/pub/libvex_basictypes.h (+10 -0)
===================================================================
--- trunk/pub/libvex_basictypes.h 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/pub/libvex_basictypes.h 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -75,6 +75,16 @@
}
V128;
+/* A union for doing 256-bit vector primitives conveniently.
+ All four views alias the same 32 bytes of storage. */
+typedef
+ union {
+ UChar w8[32]; /* viewed as 32 x 8-bit lanes */
+ UShort w16[16]; /* viewed as 16 x 16-bit lanes */
+ UInt w32[8]; /* viewed as 8 x 32-bit lanes */
+ ULong w64[4]; /* viewed as 4 x 64-bit lanes */
+ }
+ V256;
+
/* Floating point. */
typedef float Float; /* IEEE754 single-precision (32-bit) value */
typedef double Double; /* IEEE754 double-precision (64-bit) value */
Modified: trunk/priv/guest_amd64_toIR.c (+3983 -26)
===================================================================
--- trunk/priv/guest_amd64_toIR.c 2013-03-26 13:53:18 +00:00 (rev 2701)
+++ trunk/priv/guest_amd64_toIR.c 2013-03-27 11:37:33 +00:00 (rev 2702)
@@ -1290,6 +1290,38 @@
}
+static
+IRExpr* getIRegV ( Int sz, Prefix pfx )
+{
+ /* Read the integer register named by the VEX.vvvv prefix field.
+ For a 4-byte read, fetch the whole 64-bit register and narrow,
+ since the guest state slot is 64 bits wide. */
+ if (sz == 4) {
+ sz = 8;
+ return unop(Iop_64to32,
+ IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
+ szToITy(sz) ));
+ } else {
+ return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
+ szToITy(sz) );
+ }
+}
+
+static
+void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
+{
+ /* Write 'e' (which must have the IR type matching 'sz') to the
+ integer register named by the VEX.vvvv prefix field. A 32-bit
+ write is zero-extended to 64 bits, per amd64 semantics. */
+ vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
+ if (sz == 4) {
+ e = unop(Iop_32Uto64,e);
+ }
+ stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
+}
+
+static
+const HChar* nameIRegV ( Int sz, Prefix pfx )
+{
+ /* Printable name of the integer register named by VEX.vvvv,
+ for disassembly (DIP) output. */
+ return nameIReg( sz, getVexNvvvv(pfx), False );
+}
+
+
+
/* Produce the guest state offset for a reference to the 'e' register
field in a modrm byte, taking into account REX (or its absence),
and the size of the access. eregOfRexRM will assert if mod_reg_rm
@@ -2677,6 +2709,88 @@
}
+/* Similarly for VSIB addressing. This returns just the addend,
+ and fills in *rI and *vscale with the register number of the vector
+ index and its multiplicand. */
+static
+IRTemp disAVSIBMode ( /*OUT*/Int* len,
+ VexAbiInfo* vbi, Prefix pfx, Long delta,
+ /*OUT*/HChar* buf, /*OUT*/UInt* rI,
+ IRType ty, /*OUT*/Int* vscale )
+{
+ UChar mod_reg_rm = getUChar(delta);
+ const HChar *vindex;
+
+ *len = 0;
+ *rI = 0;
+ *vscale = 0;
+ buf[0] = (UChar)0;
+ /* VSIB addressing requires a memory operand with a SIB byte
+ (rm field == 4); anything else is invalid here. */
+ if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
+ return IRTemp_INVALID;
+
+ /* Decode the SIB byte. The index selects an XMM/YMM register
+ (per 'ty'), not an integer register. */
+ UChar sib = getUChar(delta+1);
+ UChar scale = toUChar((sib >> 6) & 3);
+ UChar index_r = toUChar((sib >> 3) & 7);
+ UChar base_r = toUChar(sib & 7);
+ Long d = 0;
+ /* correct since #(R13) == 8 + #(RBP) */
+ Bool base_is_BPor13 = toBool(base_r == R_RBP);
+ delta += 2;
+ *len = 2;
+
+ *rI = index_r | (getRexX(pfx) << 3);
+ if (ty == Ity_V128)
+ vindex = nameXMMReg(*rI);
+ else
+ vindex = nameYMMReg(*rI);
+ *vscale = 1<<scale;
+
+ /* 'mod' selects the displacement form, as for an ordinary SIB
+ amode: 0 = none (or disp32-only when base is BP/R13),
+ 1 = disp8, 2 = disp32. */
+ switch (mod_reg_rm >> 6) {
+ case 0:
+ if (base_is_BPor13) {
+ /* No base register: address is just disp32. */
+ d = getSDisp32(delta);
+ *len += 4;
+ if (scale == 0) {
+ DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
+ } else {
+ DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
+ }
+ return disAMode_copy2tmp( mkU64(d) );
+ } else {
+ if (scale == 0) {
+ DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
+ nameIRegRexB(8,pfx,base_r), vindex);
+ } else {
+ DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
+ nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
+ }
+ }
+ break;
+ case 1:
+ d = getSDisp8(delta);
+ *len += 1;
+ goto have_disp;
+ case 2:
+ d = getSDisp32(delta);
+ *len += 4;
+ have_disp:
+ if (scale == 0) {
+ DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
+ nameIRegRexB(8,pfx,base_r), vindex);
+ } else {
+ DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
+ nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
+ }
+ break;
+ }
+
+ /* Return only base (+ displacement); per the contract above, the
+ scaled vector index is applied per element by the caller. */
+ if (!d)
+ return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
+ return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
+ mkU64(d)) );
+}
+
+
/* Figure out the number of (insn-stream) bytes constituting the amode
beginning at delta. Is useful for getting hold of literals beyond
the end of the amode before it has been disassembled. */
@@ -2822,7 +2936,7 @@
&& offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
if (False && op8 == Iop_Sub8)
vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
- putIRegG(size,pfx,rm, mkU(ty,0));
+ putIRegG(size,pfx,rm, mkU(ty,0));
}
assign( dst0, getIRegG(size,pfx,rm) );
@@ -3734,7 +3848,7 @@
/* Write the result back, if non-BT. */
if (gregLO3ofRM(modrm) != 4 /* BT */) {
if (epartIsReg(modrm)) {
- putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
+ putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
} else {
if (pfx & PFX_LOCK) {
casLE( mkexpr(t_addr),
@@ -3931,7 +4045,7 @@
} else {
addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
/* we have to inform disAMode of any immediate
- bytes used */
+ bytes used */
gregLO3ofRM(modrm)==0/*TEST*/
? imin(4,sz)
: 0
@@ -4212,9 +4326,9 @@
putIReg64(R_RSP, mkexpr(t2) );
storeLE( mkexpr(t2), mkexpr(t3) );
break;
- } else {
+ } else {
goto unhandled; /* awaiting test case */
- }
+ }
default:
unhandled:
*decode_OK = False;
@@ -4673,6 +4787,34 @@
}
+/* Generate an IR sequence to do a count-trailing-zeroes operation on
+ the supplied IRTemp, and return a new IRTemp holding the result.
+ 'ty' may be Ity_I16, Ity_I32 or Ity_I64 only. In the case where
+ the argument is zero, return the number of bits in the word (the
+ natural semantics). */
+static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
+{
+ vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
+
+ /* Widen to 64 bits so a single Ctz64 covers all operand sizes. */
+ IRTemp src64 = newTemp(Ity_I64);
+ assign(src64, widenUto64( mkexpr(src) ));
+
+ // Ctz64 has undefined semantics when its input is zero, so
+ // special-case around that.
+ IRTemp res64 = newTemp(Ity_I64);
+ assign(res64,
+ IRExpr_ITE(
+ binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
+ mkU64(8 * sizeofIRType(ty)),
+ unop(Iop_Ctz64, mkexpr(src64))
+ ));
+
+ /* Narrow back to the operand width; the count is at most 64 so
+ it always fits. */
+ IRTemp res = newTemp(ty);
+ assign(res, narrowTo(ty, mkexpr(res64)));
+ return res;
+}
+
+
/*------------------------------------------------------------*/
/*--- ---*/
/*--- x87 FLOATING POINT INSTRUCTIONS ---*/
@@ -5248,7 +5390,7 @@
issue. If needed, side-exit to the next insn,
reporting the warning, so that Valgrind's dispatcher
sees the warning. */
- assign(ew, unop(Iop_64to32,mkexpr(w64)) );
+ assign(ew, unop(Iop_64to32,mkexpr(w64)) );
put_emwarn( mkexpr(ew) );
stmt(
IRStmt_Exit(
@@ -7512,7 +7654,7 @@
binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
));
- /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
+ /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
assign( res64,
binop(Iop_Shr64,
binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
@@ -8142,8 +8284,7 @@
putIRegG(sz, pfx, rm, mkexpr(tmpd));
putIRegE(sz, pfx, rm, mkexpr(tmpt1));
DIP("xadd%c %s, %s\n",
- nameISize(sz), nameIRegG(sz,pfx,rm),
- nameIRegE(sz,pfx,rm));
+ nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
*decode_ok = True;
return 1+delta0;
}
@@ -8570,7 +8711,7 @@
}
putXMMReg( gregOfRexRM(pfx,rm),
eLeft ? binop(op, epart, gpart)
- : binop(op, gpart, epart) );
+ : binop(op, gpart, epart) );
return delta;
}
@@ -8743,7 +8884,7 @@
? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
: /*sz==4*/
unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
- )
+ )
);
delta += alen+1;
DIP("%s $%d,%s,%s\n", opname,
@@ -9267,6 +9408,31 @@
return math_PABS_XMM(aa, 1);
}
+/* YMM version of math_PABS_XMM. */
+static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
+{
+ /* Split the V256 into two V128 halves, do PABS on each, and
+ reassemble. */
+ IRTemp res = newTemp(Ity_V256);
+ IRTemp aaHi = IRTemp_INVALID;
+ IRTemp aaLo = IRTemp_INVALID;
+ breakupV256toV128s(aa, &aaHi, &aaLo);
+ assign(res, binop(Iop_V128HLtoV256,
+ mkexpr(math_PABS_XMM(aaHi, laneszB)),
+ mkexpr(math_PABS_XMM(aaLo, laneszB))));
+ return res;
+}
+
+/* Lane-size-specialised (4/2/1 byte) wrappers around
+ math_PABS_YMM, usable where a one-argument function is needed. */
+static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
+ return math_PABS_YMM(aa, 4);
+}
+
+static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
+ return math_PABS_YMM(aa, 2);
+}
+
+static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
+ return math_PABS_YMM(aa, 1);
+}
+
static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
IRTemp lo64, Long byteShift )
{
@@ -9634,6 +9800,47 @@
}
+static Long dis_PSHUFD_32x8 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
+{
+ /* Decode and translate 256-bit VPSHUFD: shuffle the eight 32-bit
+ lanes of the source according to the imm8 'order', independently
+ within each 128-bit half. */
+ Int order;
+ Int alen = 0;
+ HChar dis_buf[50];
+ IRTemp sV = newTemp(Ity_V256);
+ UChar modrm = getUChar(delta);
+ IRTemp addr = IRTemp_INVALID;
+ UInt rG = gregOfRexRM(pfx,modrm);
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx,modrm);
+ assign( sV, getYMMReg(rE) );
+ order = (Int)getUChar(delta+1);
+ delta += 1+1;
+ DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
+ 1/*byte after the amode*/ );
+ assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
+ order = (Int)getUChar(delta+alen);
+ delta += alen+1;
+ DIP("vpshufd $%d,%s,%s\n", order, dis_buf, nameYMMReg(rG));
+ }
+
+ IRTemp s[8];
+ s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
+ breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
+ &s[3], &s[2], &s[1], &s[0] );
+
+ /* Each 2-bit field of 'order' selects a source lane; the '4 +'
+ rows pick only from the upper 128-bit half, the '0 +' rows only
+ from the lower half (in-lane shuffle). */
+ putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
+ s[4 + ((order>>4)&3)],
+ s[4 + ((order>>2)&3)],
+ s[4 + ((order>>0)&3)],
+ s[0 + ((order>>6)&3)],
+ s[0 + ((order>>4)&3)],
+ s[0 + ((order>>2)&3)],
+ s[0 + ((order>>0)&3)] ) );
+ return delta;
+}
+
+
static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
{
IRTemp dV = newTemp(Ity_V128);
@@ -10280,6 +10487,28 @@
}
+static Long dis_PMOVMSKB_256 ( VexAbiInfo* vbi, Prefix pfx,
+ Long delta )
+{
+ /* 256-bit VPMOVMSKB: collect the most significant bit of each of
+ the 32 bytes of the YMM source into a 32-bit integer register.
+ Done as GetMSBs8x16 on each 128-bit half, then concatenated with
+ the upper half's bits in positions 31:16. */
+ UChar modrm = getUChar(delta);
+ vassert(epartIsReg(modrm)); /* ensured by caller */
+ UInt rE = eregOfRexRM(pfx,modrm);
+ UInt rG = gregOfRexRM(pfx,modrm);
+ IRTemp t0 = newTemp(Ity_V128);
+ IRTemp t1 = newTemp(Ity_V128);
+ IRTemp t2 = newTemp(Ity_I16);
+ IRTemp t3 = newTemp(Ity_I16);
+ assign(t0, getYMMRegLane128(rE, 0));
+ assign(t1, getYMMRegLane128(rE, 1));
+ assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
+ assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
+ putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
+ DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
+ delta += 1;
+ return delta;
+}
+
+
/* FIXME: why not just use InterleaveLO / InterleaveHI? I think the
relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
/* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
@@ -10542,6 +10771,22 @@
}
+static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
+{
+ /* This is a really poor translation -- could be improved if
+ performance critical */
+ IRTemp sHi, sLo, dHi, dLo;
+ sHi = sLo = dHi = dLo = IRTemp_INVALID;
+ breakupV256toV128s( dV, &dHi, &dLo);
+ breakupV256toV128s( sV, &sHi, &sLo);
+...
[truncated message content] |