From: <axl...@us...> - 2009-01-04 16:38:05
|
Revision: 143 http://hgengine.svn.sourceforge.net/hgengine/?rev=143&view=rev Author: axlecrusher Date: 2009-01-04 16:37:53 +0000 (Sun, 04 Jan 2009) Log Message: ----------- First stage of overhauling mercury math to squeeze out a little more speed. Use our own FloatRow to define rows for matrices instead of float arrays. We can define these differently if we are using SSE or not. So far this makes matrix multiplies about 14% faster. Modified Paths: -------------- Mercury2/src/MercuryMath.cpp Mercury2/src/MercuryMath.h Mercury2/src/MercuryMatrix.cpp Mercury2/src/MercuryMatrix.h Modified: Mercury2/src/MercuryMath.cpp =================================================================== --- Mercury2/src/MercuryMath.cpp 2009-01-03 17:50:45 UTC (rev 142) +++ Mercury2/src/MercuryMath.cpp 2009-01-04 16:37:53 UTC (rev 143) @@ -4,6 +4,11 @@ //Generic Math functions. Compile these if you can not use optimized functions. +void ZeroFloatRow(FloatRow& r) +{ + Copy4f(&r, (FloatRow){ 0.0f, 0.0f, 0.0f, 0.0f }); +} + void Mul4f(const float* first, const float* second, float* out) { out[0] = first[0] * second[0]; @@ -80,8 +85,14 @@ ((float*)dest)[15] = ((float*)source)[15]; } -void R_ConcatTransforms4 ( const float* in1, const float* in2, float* out) +void R_ConcatTransforms4 ( const FloatRow* in1a, const FloatRow* in2a, FloatRow* outa) { + float *in1, *in2, *out; + + in1 = (float*)in1a; + in2 = (float*)in2a; + out = (float*)outa; + out[0] = in1[0] * in2[0] + in1[1] * in2[4] + in1[2] * in2[8] + in1[3] * in2[12]; out[1] = in1[0] * in2[1] + in1[1] * in2[5] + @@ -128,9 +139,8 @@ } #else -#include <xmmintrin.h> -inline __m128 Hadd4(__m128 x); +//inline __m128 Hadd4(__m128 x); __m128 Hadd4(__m128 x) { //add the low and high components of x @@ -225,33 +235,27 @@ _mm_store_ps((float*)&(((float*)dest)[12]), xmm[3]); } -void R_ConcatTransforms4 ( const float* in1, const float* in2, float* out) +void R_ConcatTransforms4 ( const FloatRow* in1, const FloatRow* in2, FloatRow* out) { - unsigned int 
x, y; - __m128 xmm[8]; - - xmm[1] = _mm_load_ps((float*)&(in2[0])); - xmm[3] = _mm_load_ps((float*)&(in2[4])); - xmm[5] = _mm_load_ps((float*)&(in2[8])); - xmm[7] = _mm_load_ps((float*)&(in2[12])); - + unsigned int y; + __m128 xmm[4]; + for (y = 0; y < 4; ++y) { - xmm[0] = _mm_set_ps1(in1[(y*4)+0]); - xmm[2] = _mm_set_ps1(in1[(y*4)+1]); - xmm[4] = _mm_set_ps1(in1[(y*4)+2]); - xmm[6] = _mm_set_ps1(in1[(y*4)+3]); + //load columns + xmm[3] = _mm_shuffle_ps (in1[y], in1[y], 0xff); + xmm[2] = _mm_shuffle_ps (in1[y], in1[y], 0xaa); + xmm[1] = _mm_shuffle_ps (in1[y], in1[y], 0x55); + xmm[0] = _mm_shuffle_ps (in1[y], in1[y], 0x00); - xmm[0] = _mm_mul_ps( xmm[0], xmm[1] ); - xmm[2] = _mm_mul_ps( xmm[2], xmm[3] ); - xmm[4] = _mm_mul_ps( xmm[4], xmm[5] ); - xmm[6] = _mm_mul_ps( xmm[6], xmm[7] ); + xmm[0] = _mm_mul_ps( xmm[0], in2[0] ); + xmm[1] = _mm_mul_ps( xmm[1], in2[1] ); + xmm[2] = _mm_mul_ps( xmm[2], in2[2] ); + xmm[3] = _mm_mul_ps( xmm[3], in2[3] ); - xmm[0] = _mm_add_ps( xmm[0], xmm[2] ); - xmm[4] = _mm_add_ps( xmm[4], xmm[6] ); - xmm[0] = _mm_add_ps( xmm[0], xmm[4] ); - - _mm_store_ps(&(out[(y*4)]), xmm[0]); + xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); + xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); + out[y] = _mm_add_ps( xmm[0], xmm[2] ); } } @@ -288,6 +292,11 @@ _mm_store_ps(out, tmp); } +void ZeroFloatRow(FloatRow& r) +{ + r = (FloatRow)_mm_setzero_ps(); +} + #endif /* Modified: Mercury2/src/MercuryMath.h =================================================================== --- Mercury2/src/MercuryMath.h 2009-01-03 17:50:45 UTC (rev 142) +++ Mercury2/src/MercuryMath.h 2009-01-04 16:37:53 UTC (rev 143) @@ -3,6 +3,15 @@ #include <math.h> +#ifdef USE_SSE +#include <xmmintrin.h> +typedef __m128 FloatRow __attribute__((aligned(16))); +#else +typedef float FloatRow[4]; +#endif + +void ZeroFloatRow(FloatRow& r); + #define DEGRAD 0.01745329251994329576f //degree to radian #define RADDEG 57.2957795130823208767f //radian to degree #define Q_PI 3.14159265358979323846f @@ -42,7 +51,8 @@ 
void Copy4f( void * dest, const void * source ); void Copy8f( void * dest, const void * source ); void Copy16f( void * dest, const void * source ); -void R_ConcatTransforms4 ( const float* in1, const float* in2, float* out ); +//void R_ConcatTransforms4 ( const float* in1, const float* in2, float* out ); +void R_ConcatTransforms4 ( const FloatRow* in1, const FloatRow* in2, FloatRow* out ); void VectorMultiply4f(const float *m, float *p, float *out ); #endif Modified: Mercury2/src/MercuryMatrix.cpp =================================================================== --- Mercury2/src/MercuryMatrix.cpp 2009-01-03 17:50:45 UTC (rev 142) +++ Mercury2/src/MercuryMatrix.cpp 2009-01-04 16:37:53 UTC (rev 143) @@ -25,56 +25,47 @@ void MercuryMatrix::Zero() { - m_matrix[0][0] = 0; - m_matrix[0][1] = 0; - m_matrix[0][2] = 0; - m_matrix[0][3] = 0; - - m_matrix[1][0] = 0; - m_matrix[1][1] = 0; - m_matrix[1][2] = 0; - m_matrix[1][3] = 0; - - m_matrix[2][0] = 0; - m_matrix[2][1] = 0; - m_matrix[2][2] = 0; - m_matrix[2][3] = 0; - - m_matrix[3][0] = 0; - m_matrix[3][1] = 0; - m_matrix[3][2] = 0; - m_matrix[3][3] = 0; + ZeroFloatRow( m_matrix[0] ); + ZeroFloatRow( m_matrix[1] ); + ZeroFloatRow( m_matrix[2] ); + ZeroFloatRow( m_matrix[3] ); } void MercuryMatrix::Identity() { - m_matrix[0][0] = 1; - m_matrix[0][1] = 0; - m_matrix[0][2] = 0; - m_matrix[0][3] = 0; + Copy4f(&m_matrix[0], (void*)&((FloatRow){ 1.0f, 0.0f, 0.0f, 0.0f })); + Copy4f(&m_matrix[1], (void*)&((FloatRow){ 0.0f, 1.0f, 0.0f, 0.0f })); + Copy4f(&m_matrix[2], (void*)&((FloatRow){ 0.0f, 0.0f, 1.0f, 0.0f })); + Copy4f(&m_matrix[3], (void*)&((FloatRow){ 0.0f, 0.0f, 0.0f, 1.0f })); +/* + (*this)[0][0] = 1; + (*this)[0][1] = 0; + (*this)[0][2] = 0; + (*this)[0][3] = 0; - m_matrix[1][0] = 0; - m_matrix[1][1] = 1; - m_matrix[1][2] = 0; - m_matrix[1][3] = 0; + (*this)[1][0] = 0; + (*this)[1][1] = 1; + (*this)[1][2] = 0; + (*this)[1][3] = 0; - m_matrix[2][0] = 0; - m_matrix[2][1] = 0; - m_matrix[2][2] = 1; - m_matrix[2][3] = 0; 
+ (*this)[2][0] = 0; + (*this)[2][1] = 0; + (*this)[2][2] = 1; + (*this)[2][3] = 0; - m_matrix[3][0] = 0; - m_matrix[3][1] = 0; - m_matrix[3][2] = 0; - m_matrix[3][3] = 1; + (*this)[3][0] = 0; + (*this)[3][1] = 0; + (*this)[3][2] = 0; + (*this)[3][3] = 1; + */ } void MercuryMatrix::Translate(float x, float y, float z) { MercuryMatrix m; - m.m_matrix[0][3] = x; - m.m_matrix[1][3] = y; - m.m_matrix[2][3] = z; + m[0][3] = x; + m[1][3] = y; + m[2][3] = z; *this *= m; } @@ -95,25 +86,25 @@ //Row major //manually transposed - matrix.m_matrix[0][0] = cy*cz; - matrix.m_matrix[1][0] = (sx*sy*cz)-(cx*sz); - matrix.m_matrix[2][0] = (cx*sy*cz)+(sx*sz); - matrix.m_matrix[3][0] = 0; + matrix[0][0] = cy*cz; + matrix[1][0] = (sx*sy*cz)-(cx*sz); + matrix[2][0] = (cx*sy*cz)+(sx*sz); + matrix[3][0] = 0; - matrix.m_matrix[0][1] = cy*sz; - matrix.m_matrix[1][1] = (sx*sy*sz)+(cx*cz); - matrix.m_matrix[2][1] = (cx*sy*sz)-(sx*cz); - matrix.m_matrix[3][1] = 0; + matrix[0][1] = cy*sz; + matrix[1][1] = (sx*sy*sz)+(cx*cz); + matrix[2][1] = (cx*sy*sz)-(sx*cz); + matrix[3][1] = 0; - matrix.m_matrix[0][2] = -sy; - matrix.m_matrix[1][2] = sx*cy; - matrix.m_matrix[2][2] = cx*cy; - matrix.m_matrix[3][2] = 0; + matrix[0][2] = -sy; + matrix[1][2] = sx*cy; + matrix[2][2] = cx*cy; + matrix[3][2] = 0; - matrix.m_matrix[0][3] = 0; - matrix.m_matrix[1][3] = 0; - matrix.m_matrix[2][3] = 0; - matrix.m_matrix[3][3] = 1; + matrix[0][3] = 0; + matrix[1][3] = 0; + matrix[2][3] = 0; + matrix[3][3] = 1; *this *= matrix; } @@ -127,25 +118,25 @@ float y = iy/absin; float z = iz/absin; - m_matrix[0][0] = x*x*(1-c)+c; - m_matrix[0][1] = x*y*(1-c)-z*s; - m_matrix[0][2] = x*z*(1-c)+y*s; - m_matrix[0][3] = 0; + (*this)[0][0] = x*x*(1-c)+c; + (*this)[0][1] = x*y*(1-c)-z*s; + (*this)[0][2] = x*z*(1-c)+y*s; + (*this)[0][3] = 0; - m_matrix[1][0] = y*x*(1-c)+z*s; - m_matrix[1][1] = y*y*(1-c)+c; - m_matrix[1][2] = y*z*(1-c)-x*s; - m_matrix[1][3] = 0; + (*this)[1][0] = y*x*(1-c)+z*s; + (*this)[1][1] = y*y*(1-c)+c; + 
(*this)[1][2] = y*z*(1-c)-x*s; + (*this)[1][3] = 0; - m_matrix[2][0] = x*z*(1-c)-y*s; - m_matrix[2][1] = y*z*(1-c)+x*s; - m_matrix[2][2] = z*z*(1-c)+c; - m_matrix[2][3] = 0; + (*this)[2][0] = x*z*(1-c)-y*s; + (*this)[2][1] = y*z*(1-c)+x*s; + (*this)[2][2] = z*z*(1-c)+c; + (*this)[2][3] = 0; - m_matrix[3][0] = 0; - m_matrix[3][1] = 0; - m_matrix[3][2] = 0; - m_matrix[3][3] = 1; + (*this)[3][0] = 0; + (*this)[3][1] = 0; + (*this)[3][2] = 0; + (*this)[3][3] = 1; } void MercuryMatrix::Transotale( float tX, float tY, float tZ, float rX, float rY, float rZ, float sX, float sY, float sZ ) @@ -165,25 +156,25 @@ //Row major //manually transposed - matrix.m_matrix[0][0] = sX*cy*cz; - matrix.m_matrix[1][0] = sX*((sx*sy*cz)-(cx*sz)); - matrix.m_matrix[2][0] = sX*((cx*sy*cz)+(sx*sz)); - matrix.m_matrix[3][0] = 0; + matrix[0][0] = sX*cy*cz; + matrix[1][0] = sX*((sx*sy*cz)-(cx*sz)); + matrix[2][0] = sX*((cx*sy*cz)+(sx*sz)); + matrix[3][0] = 0; - matrix.m_matrix[0][1] = sY*cy*sz; - matrix.m_matrix[1][1] = sY*((sx*sy*sz)+(cx*cz)); - matrix.m_matrix[2][1] = sY*((cx*sy*sz)-(sx*cz)); - matrix.m_matrix[3][1] = 0; + matrix[0][1] = sY*cy*sz; + matrix[1][1] = sY*((sx*sy*sz)+(cx*cz)); + matrix[2][1] = sY*((cx*sy*sz)-(sx*cz)); + matrix[3][1] = 0; - matrix.m_matrix[0][2] = sZ*(-sy); - matrix.m_matrix[1][2] = sZ*sx*cy; - matrix.m_matrix[2][2] = sZ*cx*cy; - matrix.m_matrix[3][2] = 0; + matrix[0][2] = sZ*(-sy); + matrix[1][2] = sZ*sx*cy; + matrix[2][2] = sZ*cx*cy; + matrix[3][2] = 0; - matrix.m_matrix[0][3] = tX; - matrix.m_matrix[1][3] = tY; - matrix.m_matrix[2][3] = tZ; - matrix.m_matrix[3][3] = 1; + matrix[0][3] = tX; + matrix[1][3] = tY; + matrix[2][3] = tZ; + matrix[3][3] = 1; *this *= matrix; } @@ -193,9 +184,9 @@ { MercuryMatrix m; - m.m_matrix[0][0] = x; - m.m_matrix[1][1] = y; - m.m_matrix[2][2] = z; + m[0][0] = x; + m[1][1] = y; + m[2][2] = z; *this *= m; } @@ -203,14 +194,16 @@ MercuryMatrix MercuryMatrix::operator*(const MercuryMatrix& m) const { MercuryMatrix r(*this); - 
R_ConcatTransforms4 ( (float*)&m_matrix, (float*)&m.m_matrix, (float*)&r.m_matrix); +// R_ConcatTransforms4 ( (float*)&m_matrix, (float*)&m.m_matrix, (float*)&r.m_matrix); + R_ConcatTransforms4 ( m_matrix, m.m_matrix, r.m_matrix); return r; } MercuryMatrix& MercuryMatrix::operator*=(const MercuryMatrix& m) { MercuryMatrix r(*this); - R_ConcatTransforms4 ( (float*)&r.m_matrix, (float*)&m.m_matrix, (float*)&m_matrix); +// R_ConcatTransforms4 ( (float*)&r.m_matrix, (float*)&m.m_matrix, (float*)&m_matrix); + R_ConcatTransforms4 ( r.m_matrix, m.m_matrix, m_matrix); return *this; } @@ -218,7 +211,7 @@ { for (int i = 0; i < 4; ++i) { - printf( "%f %f %f %f\n", m_matrix[i][0], m_matrix[i][1], m_matrix[i][2], m_matrix[i][3] ); + printf( "%f %f %f %f\n", (*this)[i][0], (*this)[i][1], (*this)[i][2], (*this)[i][3] ); } printf("\n"); } Modified: Mercury2/src/MercuryMatrix.h =================================================================== --- Mercury2/src/MercuryMatrix.h 2009-01-03 17:50:45 UTC (rev 142) +++ Mercury2/src/MercuryMatrix.h 2009-01-04 16:37:53 UTC (rev 143) @@ -15,13 +15,14 @@ { private: ///[row][column] (The internal matrix) - float m_matrix[4][4]; +// float m_matrix[4][4]; + FloatRow m_matrix[4]; public: MercuryMatrix(); inline MercuryMatrix(const MercuryMatrix& m) { *this = m; } - inline float* operator[](unsigned int i) { return m_matrix[i]; } + inline float* operator[](unsigned int i) { return (float*)&m_matrix[i]; } ///Allow typecasting to float * for use in APIs - inline const float* operator[](unsigned int i) const { return m_matrix[i]; } + inline const float* operator[](unsigned int i) const { return (float*)&m_matrix[i]; } const MercuryMatrix& operator=(const MercuryMatrix& m); const MercuryMatrix& operator=(const float* m); inline float* Ptr() { return (float*)&m_matrix; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |