|
From: Harold N. <ha...@al...> - 2005-11-06 14:06:43
|
As promised, here is a quick SSE3 program that generates an unhandled
instruction.
This program does complex multiplication of two arrays, which is basically
what SSE3 is designed to do.
It chokes on _mm_moveldup_ps, which is the first SSE3 instruction in the
program.
I compiled this program with g++-4.0.2 -g -msse3
Valgrind version is from subversion as of November 5, 2005.
My kernel is 2.6.12-gentoo-r10.
My arch is amd64. The chip is AMD 64 x2.
The output of valgrind is:
==15797== Memcheck, a memory error detector.
==15797== Copyright (C) 2002-2005, and GNU GPL'd, by Julian Seward et al.
==15797== Using LibVEX rev 1404, a library for dynamic binary translation.
==15797== Copyright (C) 2004-2005, and GNU GPL'd, by OpenWorks LLP.
==15797== Using valgrind-3.1.SVN, a dynamic binary instrumentation framework.
==15797== Copyright (C) 2000-2005, and GNU GPL'd, by Julian Seward et al.
==15797== For more details, rerun with: -v
==15797==
vex amd64->IR: unhandled instruction bytes: 0xF3 0xF 0x12 0x0
==15797== Your program just tried to execute an instruction that Valgrind
==15797== did not recognise. This might be because your program has a bug
==15797== and erroneously jumped to a non-code location. If you are running
==15797== Memcheck, you might have just seen a warning about a bad jump,
==15797== which is a good indication that this is so. Or it might be
==15797== because the instruction is unimplemented in Valgrind; if you
==15797== think this is the case, or you are not sure, please let us know.
==15797==
==15797== Process terminating with default action of signal 4 (SIGILL)
==15797== Illegal opcode at address 0x4007FE
==15797== at 0x4007FE: _mm_moveldup_ps(__vector float) (pmmintrin.h:74)
==15797== by 0x4008EC: sse3_example(float*, float*, int) (analyzeFuncs.cpp:43)
==15797== by 0x400A64: main (analyzeFuncs.cpp:18)
==15797==
==15797== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 5 from 2)
==15797== malloc/free: in use at exit: 24576 bytes in 3 blocks.
==15797== malloc/free: 3 allocs, 0 frees, 24576 bytes allocated.
==15797== For counts of detected errors, rerun with: -v
==15797== searching for pointers to 3 not-freed blocks.
==15797== checked 217936 bytes.
==15797==
==15797== LEAK SUMMARY:
==15797== definitely lost: 0 bytes in 0 blocks.
==15797== possibly lost: 0 bytes in 0 blocks.
==15797== still reachable: 24576 bytes in 3 blocks.
==15797== suppressed: 0 bytes in 0 blocks.
==15797== Reachable blocks (those to which a pointer was found) are not shown.
==15797== To see them, rerun with: --show-reachable=yes
Illegal instruction
-----------------START SSE3 PROGRAM --------------------
#include <malloc.h>
#include <math.h>
#include <stdlib.h>
#include <xmmintrin.h> // SSE3
#include <pmmintrin.h> // SSE3
// Number of complex samples in each test buffer; main() allocates
// 2 * npoints floats per buffer (interleaved real/imaginary parts).
static const int npoints = 1024;

// Complex multiplication kernel under test; defined further down.
void sse3_example(float* In, float* Out, int points);
// Test driver: fills In with npoints interleaved complex samples
// (cos(i), sin(i)), runs sse3_example() over them, then releases the buffers.
//
// Returns 0 on success, 1 if either buffer could not be allocated.
int main(int argc, char *argv[])
{
    float *In  = (float *) malloc(2 * npoints * sizeof(float));
    float *Out = (float *) malloc(2 * npoints * sizeof(float));

    // Command-line arguments are not used.
    (void) argc;
    (void) argv;

    if (In == NULL || Out == NULL) {
        // free(NULL) is a no-op, so both can be released unconditionally.
        free(In);
        free(Out);
        return 1;
    }

    for (int i = 0; i < npoints; i++) {
        In[2 * i]     = (float) cos((double) i);   // real part
        In[2 * i + 1] = (float) sin((double) i);   // imaginary part
    }

    sse3_example(In, Out, npoints);

    free(Out);
    free(In);
    return 0;
}
// Multiplies every complex sample of In by the matching entry of a
// coefficient table, using the SSE3 intrinsics _mm_moveldup_ps,
// _mm_movehdup_ps and _mm_addsub_ps, and stores the products in Out.
//
//   In, Out : arrays of `points` complex values stored interleaved as
//             (re, im, re, im, ...); accessed with unaligned loads/stores.
//   points  : number of complex values; nothing is done when points <= 0.
//
// The coefficient table is filled with a copy of In (placeholder
// initialisation), so each output is currently In[k] * In[k].
// If the table cannot be allocated the function returns without
// touching Out.
#if defined(__GNUC__) && !defined(__SSE3__)
// Lets GCC/Clang emit SSE3 for this one function even when the translation
// unit is built without -msse3.
__attribute__((target("sse3")))
#endif
void sse3_example(
float* In,
float* Out,
int points
) {
    // Set to non-zero to negate the sine terms, i.e. multiply by the
    // complex conjugate of the coefficient instead.
    const int conjugate = 0;

    if (points <= 0)
        return;

    const int n = 2 * points;        // total number of floats
    const int n_vec = n - (n % 4);   // floats covered by whole 4-float vectors

    // 256-byte aligned, which satisfies the 16-byte alignment that
    // _mm_load_ps requires for every coef[i] with i a multiple of 4.
    float *coef = (float *) _mm_malloc(n * sizeof(float), 256);
    if (coef == NULL)
        return;

    // Initialise the whole table, so no load below reads indeterminate data.
    for (int j = 0; j < n; j++)
        coef[j] = In[j];

    const __m128 minus_one = _mm_set1_ps(-1.0f);

    int i;
    for (i = 0; i < n_vec; i += 4) {
        const __m128 data = _mm_loadu_ps(&In[i]);     // (re0, im0, re1, im1)
        const __m128 cs   = _mm_load_ps(&coef[i]);    // (c0,  s0,  c1,  s1)

        // SSE3: duplicate the even / odd lanes.
        const __m128 cos_dup = _mm_moveldup_ps(cs);   // (c0, c0, c1, c1)
        __m128 sin_dup       = _mm_movehdup_ps(cs);   // (s0, s0, s1, s1)

        // Packed multiply so that all four sine lanes are negated.
        if (conjugate)
            sin_dup = _mm_mul_ps(sin_dup, minus_one);

        const __m128 by_cos = _mm_mul_ps(cos_dup, data);  // (re*c, im*c, ...)
        __m128 by_sin       = _mm_mul_ps(sin_dup, data);  // (re*s, im*s, ...)

        // 0xB1 swaps the two floats inside each pair: (im*s, re*s, ...)
        by_sin = _mm_shuffle_ps(by_sin, by_sin, 0xB1);

        // SSE3: subtract in even lanes, add in odd lanes:
        // (re*c - im*s, im*c + re*s, ...)
        _mm_storeu_ps(&Out[i], _mm_addsub_ps(by_cos, by_sin));
    }

    // An odd `points` leaves one complex value that does not fill a vector;
    // handle it in scalar code instead of running past the buffers.
    if (i < n) {
        const float re = In[i];
        const float im = In[i + 1];
        const float c  = coef[i];
        const float s  = conjugate ? -coef[i + 1] : coef[i + 1];

        Out[i]     = re * c - im * s;
        Out[i + 1] = im * c + re * s;
    }

    _mm_free(coef);
} // sse3_example
--------------END SSE3 PROGRAM---------------------
|