|
From: <sv...@va...> - 2013-03-27 11:42:16
|
sewardj 2013-03-27 11:42:05 +0000 (Wed, 27 Mar 2013)
New Revision: 13339
Log:
Testcase additions pertaining to #305728, which added support for
AVX2, BMI1, BMI2 and FMA instructions.
(Jakub Jelinek, ja...@re...)
Added files:
trunk/none/tests/amd64/avx2-1.c
trunk/none/tests/amd64/avx2-1.stderr.exp
trunk/none/tests/amd64/avx2-1.stdout.exp
trunk/none/tests/amd64/avx2-1.vgtest
trunk/none/tests/amd64/bmi.c
trunk/none/tests/amd64/bmi.stderr.exp
trunk/none/tests/amd64/bmi.stdout.exp
trunk/none/tests/amd64/bmi.vgtest
trunk/none/tests/amd64/fma.c
trunk/none/tests/amd64/fma.stderr.exp
trunk/none/tests/amd64/fma.stdout.exp
trunk/none/tests/amd64/fma.vgtest
Modified files:
trunk/none/tests/amd64/Makefile.am
trunk/none/tests/amd64/avx-1.c
trunk/none/tests/amd64/avx-1.stdout.exp
Added: trunk/none/tests/amd64/bmi.vgtest (+3 -0)
===================================================================
--- trunk/none/tests/amd64/bmi.vgtest 2013-03-27 11:40:02 +00:00 (rev 13338)
+++ trunk/none/tests/amd64/bmi.vgtest 2013-03-27 11:42:05 +00:00 (rev 13339)
@@ -0,0 +1,3 @@
+prog: bmi
+prereq: ../../../tests/x86_amd64_features amd64-avx
+vgopts: -q
Added: trunk/none/tests/amd64/fma.stdout.exp (+1 -0)
===================================================================
--- trunk/none/tests/amd64/fma.stdout.exp 2013-03-27 11:40:02 +00:00 (rev 13338)
+++ trunk/none/tests/amd64/fma.stdout.exp 2013-03-27 11:42:05 +00:00 (rev 13339)
@@ -0,0 +1 @@
+Testing successful
Added: trunk/none/tests/amd64/fma.c (+1431 -0)
===================================================================
--- trunk/none/tests/amd64/fma.c 2013-03-27 11:40:02 +00:00 (rev 13338)
+++ trunk/none/tests/amd64/fma.c 2013-03-27 11:42:05 +00:00 (rev 13339)
@@ -0,0 +1,1431 @@
+#include <stdio.h>
+#include <string.h>
+
+#define N 64 /* element count of every test array; also the loop bound used by the test functions */
+struct float_test { /* single-precision test data: operands x, y, z, the value the results are compared against, and a scratch output array */
+ float x[N], y[N], z[N], expected[N], res[N];
+} ft __attribute__((aligned (32))); /* 32-byte aligned: the packed tests access these arrays with vmovaps on xmm and ymm registers */
+
+struct double_test { /* double-precision counterpart of struct float_test, read by test_fma() */
+ double x[N], y[N], z[N], expected[N], res[N];
+} dt __attribute__((aligned (32))); /* NOTE(review): presumably aligned for packed loads as with ft; the packed double tests are outside this view -- confirm */
+
+float plus_zero, plus_infty, minus_infty, nan_value; /* NOTE(review): not referenced in the visible part of this file; presumably special values used when the test vectors are built -- confirm */
+
+static int testf( float x, float y ) /* compare two floats by bit pattern; returns 0 when they match and non-zero when they differ */
+{
+ unsigned int a, b;
+ memcpy( &a, &x, sizeof (a) ); /* copy the raw bits of x into an integer so they can be masked */
+ memcpy( &b, &y, sizeof (b) );
+ if ((a & 0x7fc00000U) == 0x7fc00000U) /* x has all exponent bits and the top mantissa bit set, i.e. it is a quiet NaN */
+ return (b & 0x7fc00000U) != 0x7fc00000U; /* then y only has to be a quiet NaN as well; sign and payload bits are ignored */
+ return memcmp( &a, &b, sizeof (a) ) != 0; /* otherwise every bit must match, so +0.0 and -0.0 count as different */
+}
+
+static int test_fmaf( void ) /* run the single-precision FMA instructions over the ft vectors; returns 0 if every result matched ft.expected, non-zero otherwise */
+{
+ int res = 0, i, j;
+ float w;
+ for (i = 0; i < N; i++) { /* phase 1: scalar (ss) forms, one element at a time */
+ int thisres = 0;
+ __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i])); /* each 132/213/231 operand ordering is issued twice: last source in a register, then in memory */
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ if (thisres)
+ printf( "Failure 1 %d %a %a\n", i, w, ft.expected[i] ); /* w holds only the result of the last variant executed */
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] ); /* the vfnmsub result is negated before comparing against the same expected value */
+ __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ if (thisres)
+ printf( "Failure 2 %d %a %a\n", i, w, ft.expected[i] ); /* prints w itself, although the comparisons above used -w */
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ ft.z[i] = -ft.z[i]; /* flip the sign of every addend in place so the subtracting forms can be checked against the same expected values */
+ for (i = 0; i < N; i++) { /* phase 2: scalar vfmsub / vfnmadd with z negated */
+ int thisres = 0;
+ __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
+ thisres |= testf( w, ft.expected[i] );
+ if (thisres)
+ printf( "Failure 3 %d %a %a\n", i, w, ft.expected[i] );
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
+ thisres |= testf( -w, ft.expected[i] );
+ if (thisres)
+ printf( "Failure 4 %d %a %a\n", i, w, ft.expected[i] );
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ ft.z[i] = -ft.z[i]; /* second flip: z is back to its original sign */
+ for (i = 0; i < N; i += 4) { /* phase 3: packed (ps) forms on xmm registers, four floats per step */
+ int thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
+ "vfmadd132ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9"); /* NOTE(review): this and every packed asm below stores to ft.res through %0 without an output operand or a "memory" clobber; GCC's extended-asm rules appear to require one -- confirm */
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
+ "vfmadd132ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
+ "vfmadd213ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
+ "vfmadd213ps (%3), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
+ "vfmadd231ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
+ "vfmadd231ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 5 %d", i );
+ for (j = 0; j < 4; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] ); /* ft.res holds only the output of the last variant executed */
+ printf( "\n" );
+ }
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
+ "vfnmsub132ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
+ "vfnmsub132ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
+ "vfnmsub213ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
+ "vfnmsub213ps (%3), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
+ "vfnmsub231ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
+ "vfnmsub231ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 6 %d", i );
+ for (j = 0; j < 4; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ ft.z[i] = -ft.z[i]; /* every z negated again for the packed vfmsub / vfnmadd checks */
+ for (i = 0; i < N; i += 4) { /* phase 4: packed xmm vfmsub / vfnmadd with z negated */
+ int thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
+ "vfmsub132ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
+ "vfmsub132ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
+ "vfmsub213ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
+ "vfmsub213ps (%3), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
+ "vfmsub231ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
+ "vfmsub231ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 7 %d", i );
+ for (j = 0; j < 4; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
+ "vfnmadd132ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
+ "vfnmadd132ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
+ "vfnmadd213ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
+ "vfnmadd213ps (%3), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
+ "vfnmadd231ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
+ "vfnmadd231ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 8 %d", i );
+ for (j = 0; j < 4; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 1; i < N; i += 2)
+ ft.z[i] = -ft.z[i]; /* flip only odd-indexed z: odd lanes are back to the original sign, even lanes stay negated, for the alternating vfmaddsub */
+ for (i = 0; i < N; i += 4) { /* phase 5: packed xmm vfmaddsub */
+ int thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
+ "vfmaddsub132ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
+ "vfmaddsub132ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
+ "vfmaddsub213ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
+ "vfmaddsub213ps (%3), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
+ "vfmaddsub231ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
+ "vfmaddsub231ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 9 %d", i );
+ for (j = 0; j < 4; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ ft.z[i] = -ft.z[i]; /* flip all lanes: even lanes now carry the original sign and odd lanes the negated one, for vfmsubadd */
+ for (i = 0; i < N; i += 4) { /* phase 6: packed xmm vfmsubadd */
+ int thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
+ "vfmsubadd132ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
+ "vfmsubadd132ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
+ "vfmsubadd213ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
+ "vfmsubadd213ps (%3), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
+ "vfmsubadd231ps %%xmm7, %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
+ "vfmsubadd231ps (%2), %%xmm8, %%xmm9;"
+ "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 4; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 10 %d", i );
+ for (j = 0; j < 4; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 1; i < N; i += 2)
+ ft.z[i] = -ft.z[i]; /* flip odd lanes: every z is back to its original sign before the 256-bit passes */
+ for (i = 0; i < N; i += 8) { /* phase 7: the same packed checks on ymm registers, eight floats per step */
+ int thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
+ "vfmadd132ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9"); /* NOTE(review): ymm7-9 are written but the clobber list names xmm7-9; presumably GCC treats them as the same registers -- confirm */
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
+ "vfmadd132ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
+ "vfmadd213ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
+ "vfmadd213ps (%3), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
+ "vfmadd231ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
+ "vfmadd231ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 11 %d", i );
+ for (j = 0; j < 8; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
+ "vfnmsub132ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
+ "vfnmsub132ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
+ "vfnmsub213ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
+ "vfnmsub213ps (%3), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
+ "vfnmsub231ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
+ "vfnmsub231ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 12 %d", i );
+ for (j = 0; j < 8; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ ft.z[i] = -ft.z[i]; /* every z negated for the ymm vfmsub / vfnmadd checks */
+ for (i = 0; i < N; i += 8) { /* phase 8: packed ymm vfmsub / vfnmadd with z negated */
+ int thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
+ "vfmsub132ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
+ "vfmsub132ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
+ "vfmsub213ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
+ "vfmsub213ps (%3), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
+ "vfmsub231ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
+ "vfmsub231ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 13 %d", i );
+ for (j = 0; j < 8; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
+ "vfnmadd132ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
+ "vfnmadd132ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
+ "vfnmadd213ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
+ "vfnmadd213ps (%3), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
+ "vfnmadd231ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
+ "vfnmadd231ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 14 %d", i );
+ for (j = 0; j < 8; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 1; i < N; i += 2)
+ ft.z[i] = -ft.z[i]; /* flip odd lanes: odd lanes original sign, even lanes negated, for the ymm vfmaddsub */
+ for (i = 0; i < N; i += 8) { /* phase 9: packed ymm vfmaddsub */
+ int thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
+ "vfmaddsub132ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
+ "vfmaddsub132ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
+ "vfmaddsub213ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
+ "vfmaddsub213ps (%3), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
+ "vfmaddsub231ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
+ "vfmaddsub231ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 15 %d", i );
+ for (j = 0; j < 8; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ ft.z[i] = -ft.z[i]; /* flip all lanes: even lanes original sign, odd lanes negated, for the ymm vfmsubadd */
+ for (i = 0; i < N; i += 8) { /* phase 10: packed ymm vfmsubadd */
+ int thisres = 0;
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
+ "vfmsubadd132ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
+ "vfmsubadd132ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
+ "vfmsubadd213ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
+ "vfmsubadd213ps (%3), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
+ "vfmsubadd231ps %%ymm7, %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
+ "vfmsubadd231ps (%2), %%ymm8, %%ymm9;"
+ "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
+ "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 8; j++)
+ thisres |= testf( ft.res[i+j], ft.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 16 %d", i );
+ for (j = 0; j < 8; j++)
+ printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 1; i < N; i += 2)
+ ft.z[i] = -ft.z[i]; /* final odd-lane flip leaves ft.z with the signs it had on entry */
+ return res; /* 0 only if no comparison in any phase reported a mismatch */
+}
+
+static int test( double x, double y )  /* Compare x with y by bit pattern: returns 0 when they match, 1 otherwise. */
+{
+ unsigned long long a, b;  /* bit images of x and y */
+ memcpy( &a, &x, sizeof (a) );  /* copy the bytes of x into an integer; assumes double and unsigned long long have the same size */
+ memcpy( &b, &y, sizeof (b) );  /* same for y */
+ if ((a & 0x7ff8000000000000ULL) == 0x7ff8000000000000ULL)  /* x has all exponent bits and the top mantissa bit set (the IEEE-754 binary64 quiet-NaN pattern) */
+ return (b & 0x7ff8000000000000ULL) != 0x7ff8000000000000ULL;  /* then y matches if it has those same bits set; sign and remaining mantissa bits are ignored */
+ return memcmp( &a, &b, sizeof (a) ) != 0;  /* otherwise the two bit images must be identical, so 0.0 and -0.0 do not match */
+}
+
+static int test_fma( void )
+{
+ int res = 0, i, j;
+ double w;
+ for (i = 0; i < N; i++) {
+ int thisres = 0;
+ __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ if (thisres)
+ printf( "Failure 1 %d %a %a\n", i, w, dt.expected[i] );
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ if (thisres)
+ printf( "Failure 2 %d %a %a\n", i, w, dt.expected[i] );
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ dt.z[i] = -dt.z[i];
+ for (i = 0; i < N; i++) {
+ int thisres = 0;
+ __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
+ thisres |= test( w, dt.expected[i] );
+ if (thisres)
+ printf( "Failure 3 %d %a %a\n", i, w, dt.expected[i] );
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
+ thisres |= test( -w, dt.expected[i] );
+ if (thisres)
+ printf( "Failure 4 %d %a %a\n", i, w, dt.expected[i] );
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ dt.z[i] = -dt.z[i];
+ for (i = 0; i < N; i += 2) {
+ int thisres = 0;
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
+ "vfmadd132pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
+ "vfmadd132pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
+ "vfmadd213pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
+ "vfmadd213pd (%3), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
+ "vfmadd231pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
+ "vfmadd231pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 5 %d", i );
+ for (j = 0; j < 2; j++)
+ printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
+ "vfnmsub132pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
+ "vfnmsub132pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
+ "vfnmsub213pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
+ "vfnmsub213pd (%3), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
+ "vfnmsub231pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
+ "vfnmsub231pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 6 %d", i );
+ for (j = 0; j < 2; j++)
+ printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 0; i < N; i++)
+ dt.z[i] = -dt.z[i];
+ for (i = 0; i < N; i += 2) {
+ int thisres = 0;
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
+ "vfmsub132pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
+ "vfmsub132pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
+ "vfmsub213pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
+ "vfmsub213pd (%3), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
+ "vfmsub231pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
+ "vfmsub231pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 7 %d", i );
+ for (j = 0; j < 2; j++)
+ printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ thisres = 0;
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
+ "vfnmadd132pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
+ "vfnmadd132pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
+ "vfnmadd213pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
+ "vfnmadd213pd (%3), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
+ "vfnmadd231pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
+ "vfnmadd231pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( -dt.res[i+j], dt.expected[i+j] );
+ if (thisres) {
+ printf( "Failure 8 %d", i );
+ for (j = 0; j < 2; j++)
+ printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
+ printf( "\n" );
+ }
+ res |= thisres;
+ }
+ for (i = 1; i < N; i += 2)
+ dt.z[i] = -dt.z[i];
+ for (i = 0; i < N; i += 2) {
+ int thisres = 0;
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
+ "vfmaddsub132pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
+ "vfmaddsub132pd (%2), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
+ "vfmaddsub213pd %%xmm7, %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
+ for (j = 0; j < 2; j++)
+ thisres |= test( dt.res[i+j], dt.expected[i+j] );
+ __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
+ "vfmaddsub213pd (%3), %%xmm8, %%xmm9;"
+ "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
+ "...
[truncated message content] |