From: <axl...@us...> - 2010-05-18 19:58:27
|
Revision: 735
http://hgengine.svn.sourceforge.net/hgengine/?rev=735&view=rev
Author: axlecrusher
Date: 2010-05-18 19:58:20 +0000 (Tue, 18 May 2010)

Log Message:
-----------
Faster matrix transpose and memcpy apparently is faster, when optimize build

Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Mercury2/src/MercuryMath.h
Mercury2/src/MercuryMatrix.cpp
Mercury2/src/MercuryMatrix.h

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMath.cpp 2010-05-18 19:58:20 UTC (rev 735)
@@ -2,10 +2,15 @@
 #include "MercuryMath.h"

 //the SSE version of this was really slow, this is quicker
-void TransposeMatrix( FloatRow* m )
+
+#ifndef USE_SSE
+
+//Generic Math functions. Compile these if you can not use optimized functions.
+
+void TransposeMatrix( const FloatRow* matrix, FloatRow* out )
 {
 	float tmp;
-	float *_m = (float*)m;
+	float *_m = (float*)out;

 	tmp = _m[1];
 	_m[1] = _m[4];
@@ -30,10 +35,6 @@
 	_m[14] = tmp;
 }

-#ifndef USE_SSE
-
-//Generic Math functions. Compile these if you can not use optimized functions.
-
 void ZeroFloatRow(FloatRow& r)
 {
 	Copy4f(&r, &gfrZero );
@@ -71,21 +72,6 @@
 	Copy4f(out,r);
 }

-void Copy4f( void * dest, const void * source )
-{
-	COPY<float,4>((float*)source, (float*)dest);
-}
-
-void Copy8f( void * dest, const void * source )
-{
-	COPY<float,8>((float*)source, (float*)dest);
-}
-
-void Copy16f( void * dest, const void * source )
-{
-	COPY<float,16>((float*)source, (float*)dest);
-}
-
 void MatrixMultiply4f ( const FloatRow* in1a, const FloatRow* in2a, FloatRow* outa)
 {
 	const float *in1 = *in1a;
@@ -216,30 +202,6 @@
 	_mm_store_ps(out,o);
 }

-void Copy4f( void * dest, const void * source )
-{
-	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
-}
-
-void Copy8f( void * dest, const void * source )
-{
-	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
-	_mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
-}
-
-void Copy16f( void * dest, const void * source )
-{
-/*	_mm_stream_si128((__m128i*)dest,((__m128i*)source)[0]);
-	_mm_stream_si128(&((__m128i*)dest)[1],((__m128i*)source)[1]);
-	_mm_stream_si128(&((__m128i*)dest)[2],((__m128i*)source)[2]);
-	_mm_stream_si128(&((__m128i*)dest)[3],((__m128i*)source)[3]);
-*/
-	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
-	_mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
-	_mm_stream_ps(((float*)dest)+8,((__m128*)source)[2]);
-	_mm_stream_ps(((float*)dest)+12,((__m128*)source)[3]);
-}
-
 void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
 {
 	unsigned int y = 0;
@@ -399,7 +361,7 @@

 	__m128 r[2];

-	r[0] = _mm_set_ps(0.0f,0.0f,0.0f,1.0f);
+	r[0] = _mm_set_ss(1.0f);
 	_mm_storer_ps(m+12,r[0]); //reverse r[0]

 	r[1] = _mm_shuffle_ps(r[0], r[0], _MM_SHUFFLE(1,1,0,1));
@@ -409,9 +371,42 @@
 	_mm_store_ps(m+4, r[1]);
 }

+void TransposeMatrix( const FloatRow* matrix, FloatRow* out )
+{
+	//compiler acts better when we send in 2 parameter rather than 1
+	__m128 m[4],r[4];
+
+	m[0] = _mm_load_ps(matrix[0]);
+	m[1] = _mm_load_ps(matrix[1]);
+	m[2] = _mm_load_ps(matrix[2]);
+	m[3] = _mm_load_ps(matrix[3]);
+
+	r[0] = _mm_movelh_ps(m[0],m[1]);
+	r[1] = _mm_movelh_ps(m[2],m[3]);
+	r[2] = _mm_movehl_ps(m[1],m[0]);
+	r[3] = _mm_movehl_ps(m[3],m[2]);
+	//done with m matrix, we can reuse it now
+
+	m[0] = _mm_shuffle_ps(r[0], r[0], _MM_SHUFFLE(3,1,2,0)); //produce beginning of new row 0 and 1
+	m[1] = _mm_shuffle_ps(r[1], r[1], _MM_SHUFFLE(3,1,2,0)); //produce ending of new row 0 and 1
+	m[2] = _mm_shuffle_ps(r[2], r[2], _MM_SHUFFLE(3,1,2,0)); //produce beginning of new row 2 and 3
+	m[3] = _mm_shuffle_ps(r[3], r[3], _MM_SHUFFLE(3,1,2,0)); //produce ending of new row 2 and 3
+
+	r[0] = _mm_movelh_ps(m[0],m[1]); //row 0 is done
+	r[2] = _mm_movelh_ps(m[2],m[3]); //row 2 is done
+	r[1] = _mm_movehl_ps(m[1],m[0]); //row 1 is done
+	r[3] = _mm_movehl_ps(m[3],m[2]); //row 3 is done
+
+	_mm_store_ps(out[0], r[0]);
+	_mm_store_ps(out[1], r[1]);
+	_mm_store_ps(out[2], r[2]);
+	_mm_store_ps(out[3], r[3]);
+}
+
 void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
 {
-	__m128 a,b,c,d,r;//using more registers is faster
+	//using more registers is faster(8 maximum)
+	__m128 a,b,c,d,r;
 	__m128 t1,t2;

 	//unaligned load, vectors are not aligned

Modified: Mercury2/src/MercuryMath.h
===================================================================
--- Mercury2/src/MercuryMath.h 2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMath.h 2010-05-18 19:58:20 UTC (rev 735)
@@ -102,18 +102,22 @@
 void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
 void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
 void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Copy4f( void * dest, const void * source );
-void Copy8f( void * dest, const void * source );
-void Copy16f( void * dest, const void * source );
+inline void Copy4f( void * dest, const void * source ) { memcpy(dest,source,16); }
+inline void Copy8f( void * dest, const void * source ) { memcpy(dest,source,32); }
+inline void Copy16f( void * dest, const void * source ) { memcpy(dest,source,64); }
 void MatrixMultiply4f ( const FloatRow* in1, const FloatRow* in2, FloatRow* out );
 void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
-void TransposeMatrix( FloatRow* m );
+void TransposeMatrix(const FloatRow* matrix, FloatRow* out);
 void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
 void LoadIdentity(FloatRow* matrix);

 //http://graphics.stanford.edu/~seander/bithacks.html
-#define SetBit(x,mask,t) ((x & ~mask) | (-t & mask)) /*superscalar CPU version*/
-#define GetBit(x,mask) ((x & mask)>0)
+inline unsigned int SetBit(unsigned int x, unsigned int mask, bool t)
+{
+	#pragma warning( disable : 4804 )
+	return ((x & ~mask) | (-t & mask)); /*superscalar CPU version*/
+}
+inline bool GetBit(unsigned int x, unsigned int mask) { return ((x & mask)>0); }

 //void Float2FloatRow(const float* f, FloatRow& r);
 //void FloatRow2Float(const FloatRow& fr, float* f);

Modified: Mercury2/src/MercuryMatrix.cpp
===================================================================
--- Mercury2/src/MercuryMatrix.cpp 2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMatrix.cpp 2010-05-18 19:58:20 UTC (rev 735)
@@ -25,6 +25,18 @@

 	for (unsigned int i = 0; i < rows;i++)
 		m_free.push_back( m_data.Buffer()+i );
+/*
+	//test matrix transpose
+	MercuryMatrix test;
+	for (int i = 0; i < 16; ++i)
+		test.Ptr()[i] = i+1;
+
+	LOG.Write("before transpose\n");
+	test.Print();
+	test.Transpose();
+	LOG.Write("after Transpose\n");
+	test.Print();
+	*/
 }

 FloatRow* MercuryMatrixMemory::GetNewMatrix()
@@ -45,26 +57,10 @@
 	MSemaphoreLock lock(&m_lock);
 	m_free.push_back((MatrixArray*)m);
 }
-/*
-VC_ALIGN(16) float base_matrix_identity[16] CC_ALIGN(16) = {
-	1.0f, 0.0f, 0.0f, 0.0f,
-	0.0f, 1.0f, 0.0f, 0.0f,
-	0.0f, 0.0f, 1.0f, 0.0f,
-	0.0f, 0.0f, 0.0f, 1.0f };
-*/
+
 MercuryMatrix::MercuryMatrix()
 	:m_matrix(0)
-{/*
-#ifdef USE_SSE
-	m_matrix[0] = _mm_load1_ps( &base_matrix_identity[0] );
-	m_matrix[1] = _mm_load1_ps( &base_matrix_identity[4] );
-	m_matrix[2] = _mm_load1_ps( &base_matrix_identity[8] );
-	m_matrix[3] = _mm_load1_ps( &base_matrix_identity[12] );
-#else
-	Copy16f(m_matrix[0], base_matrix_identity );
-#endif
-*/
-// *this = Identity();
+{
 	m_matrix = MercuryMatrixMemory::GetInstance().GetNewMatrix();
 	LoadIdentity();
 }
@@ -292,6 +288,14 @@
 	return r;
 }

+void MercuryMatrix::Transpose()
+{
+	//we know we will be operating on this data so try to go get it, 3-4x increase in speed.
+	PREFETCH((const char*)m_matrix,_MM_HINT_NTA);
+	TransposeMatrix( m_matrix, m_matrix );
+}
+
+
 MercuryMatrix MercuryMatrix::IdentityMatrix;

 /*

Modified: Mercury2/src/MercuryMatrix.h
===================================================================
--- Mercury2/src/MercuryMatrix.h 2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMatrix.h 2010-05-18 19:58:20 UTC (rev 735)
@@ -77,7 +77,7 @@
 	inline void Scale(const MercuryVertex& v) { Scale(v[0], v[1], v[2]); }

 	void Transotale( float tX, float tY, float tZ, float rX, float rY, float rZ, float sX, float sY, float sZ );
-	inline void Transpose() { TransposeMatrix( m_matrix ); }
+	void Transpose();
 	void Zero();

 	static const MercuryMatrix& Identity();

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|