|
From: <axl...@us...> - 2010-05-18 19:58:27
|
Revision: 735
http://hgengine.svn.sourceforge.net/hgengine/?rev=735&view=rev
Author: axlecrusher
Date: 2010-05-18 19:58:20 +0000 (Tue, 18 May 2010)
Log Message:
-----------
Faster matrix transpose, and memcpy is apparently faster in optimized builds
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Mercury2/src/MercuryMath.h
Mercury2/src/MercuryMatrix.cpp
Mercury2/src/MercuryMatrix.h
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMath.cpp 2010-05-18 19:58:20 UTC (rev 735)
@@ -2,10 +2,15 @@
#include "MercuryMath.h"
//the SSE version of this was really slow, this is quicker
-void TransposeMatrix( FloatRow* m )
+
+#ifndef USE_SSE
+
+//Generic Math functions. Compile these if you can not use optimized functions.
+
+void TransposeMatrix( const FloatRow* matrix, FloatRow* out )
{
float tmp;
- float *_m = (float*)m;
+ float *_m = (float*)out;
tmp = _m[1];
_m[1] = _m[4];
@@ -30,10 +35,6 @@
_m[14] = tmp;
}
-#ifndef USE_SSE
-
-//Generic Math functions. Compile these if you can not use optimized functions.
-
void ZeroFloatRow(FloatRow& r)
{
Copy4f(&r, &gfrZero );
@@ -71,21 +72,6 @@
Copy4f(out,r);
}
-void Copy4f( void * dest, const void * source )
-{
- COPY<float,4>((float*)source, (float*)dest);
-}
-
-void Copy8f( void * dest, const void * source )
-{
- COPY<float,8>((float*)source, (float*)dest);
-}
-
-void Copy16f( void * dest, const void * source )
-{
- COPY<float,16>((float*)source, (float*)dest);
-}
-
void MatrixMultiply4f ( const FloatRow* in1a, const FloatRow* in2a, FloatRow* outa)
{
const float *in1 = *in1a;
@@ -216,30 +202,6 @@
_mm_store_ps(out,o);
}
-void Copy4f( void * dest, const void * source )
-{
- _mm_stream_ps(((float*)dest),((__m128*)source)[0]);
-}
-
-void Copy8f( void * dest, const void * source )
-{
- _mm_stream_ps(((float*)dest),((__m128*)source)[0]);
- _mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
-}
-
-void Copy16f( void * dest, const void * source )
-{
-/* _mm_stream_si128((__m128i*)dest,((__m128i*)source)[0]);
- _mm_stream_si128(&((__m128i*)dest)[1],((__m128i*)source)[1]);
- _mm_stream_si128(&((__m128i*)dest)[2],((__m128i*)source)[2]);
- _mm_stream_si128(&((__m128i*)dest)[3],((__m128i*)source)[3]);
-*/
- _mm_stream_ps(((float*)dest),((__m128*)source)[0]);
- _mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
- _mm_stream_ps(((float*)dest)+8,((__m128*)source)[2]);
- _mm_stream_ps(((float*)dest)+12,((__m128*)source)[3]);
-}
-
void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
{
unsigned int y = 0;
@@ -399,7 +361,7 @@
__m128 r[2];
- r[0] = _mm_set_ps(0.0f,0.0f,0.0f,1.0f);
+ r[0] = _mm_set_ss(1.0f);
_mm_storer_ps(m+12,r[0]); //reverse r[0]
r[1] = _mm_shuffle_ps(r[0], r[0], _MM_SHUFFLE(1,1,0,1));
@@ -409,9 +371,42 @@
_mm_store_ps(m+4, r[1]);
}
+void TransposeMatrix( const FloatRow* matrix, FloatRow* out )
+{
+ //writes the transpose of the 4x4 matrix into out; separate source/destination parameters reportedly let the compiler generate better code than a single in-place parameter. All four rows are loaded before any store, so out may be the same matrix. Uses aligned loads/stores (_mm_load_ps/_mm_store_ps), so both pointers must be 16-byte aligned.
+ __m128 m[4],r[4];
+
+ m[0] = _mm_load_ps(matrix[0]);
+ m[1] = _mm_load_ps(matrix[1]);
+ m[2] = _mm_load_ps(matrix[2]);
+ m[3] = _mm_load_ps(matrix[3]);
+
+ r[0] = _mm_movelh_ps(m[0],m[1]);
+ r[1] = _mm_movelh_ps(m[2],m[3]);
+ r[2] = _mm_movehl_ps(m[1],m[0]);
+ r[3] = _mm_movehl_ps(m[3],m[2]);
+ //the source rows held in m are no longer needed, so m is reused for the shuffled halves
+
+ m[0] = _mm_shuffle_ps(r[0], r[0], _MM_SHUFFLE(3,1,2,0)); //swap middle lanes: low half = first two elements of new row 0, high half = first two of new row 1
+ m[1] = _mm_shuffle_ps(r[1], r[1], _MM_SHUFFLE(3,1,2,0)); //swap middle lanes: low half = last two elements of new row 0, high half = last two of new row 1
+ m[2] = _mm_shuffle_ps(r[2], r[2], _MM_SHUFFLE(3,1,2,0)); //swap middle lanes: low half = first two elements of new row 2, high half = first two of new row 3
+ m[3] = _mm_shuffle_ps(r[3], r[3], _MM_SHUFFLE(3,1,2,0)); //swap middle lanes: low half = last two elements of new row 2, high half = last two of new row 3
+
+ r[0] = _mm_movelh_ps(m[0],m[1]); //new row 0 (source column 0) is complete
+ r[2] = _mm_movelh_ps(m[2],m[3]); //new row 2 (source column 2) is complete
+ r[1] = _mm_movehl_ps(m[1],m[0]); //new row 1 (source column 1) is complete
+ r[3] = _mm_movehl_ps(m[3],m[2]); //new row 3 (source column 3) is complete
+
+ _mm_store_ps(out[0], r[0]);
+ _mm_store_ps(out[1], r[1]);
+ _mm_store_ps(out[2], r[2]);
+ _mm_store_ps(out[3], r[3]);
+}
+
void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
{
- __m128 a,b,c,d,r;//using more registers is faster
+ //using more registers is faster(8 maximum)
+ __m128 a,b,c,d,r;
__m128 t1,t2;
//unaligned load, vectors are not aligned
Modified: Mercury2/src/MercuryMath.h
===================================================================
--- Mercury2/src/MercuryMath.h 2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMath.h 2010-05-18 19:58:20 UTC (rev 735)
@@ -102,18 +102,22 @@
void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Copy4f( void * dest, const void * source );
-void Copy8f( void * dest, const void * source );
-void Copy16f( void * dest, const void * source );
+inline void Copy4f( void * dest, const void * source ) { memcpy(dest,source,16); } //copies 16 bytes (4 floats when float is 4 bytes); memcpy requires that dest and source do not overlap
+inline void Copy8f( void * dest, const void * source ) { memcpy(dest,source,32); } //copies 32 bytes (8 floats when float is 4 bytes); memcpy requires that dest and source do not overlap
+inline void Copy16f( void * dest, const void * source ) { memcpy(dest,source,64); } //copies 64 bytes (16 floats when float is 4 bytes); memcpy requires that dest and source do not overlap
void MatrixMultiply4f ( const FloatRow* in1, const FloatRow* in2, FloatRow* out );
void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
-void TransposeMatrix( FloatRow* m );
+void TransposeMatrix(const FloatRow* matrix, FloatRow* out);
void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
void LoadIdentity(FloatRow* matrix);
//http://graphics.stanford.edu/~seander/bithacks.html
-#define SetBit(x,mask,t) ((x & ~mask) | (-t & mask)) /*superscalar CPU version*/
-#define GetBit(x,mask) ((x & mask)>0)
+inline unsigned int SetBit(unsigned int x, unsigned int mask, bool t)
+{
+ //returns x with the bits selected by mask set (t true) or cleared (t false); negating an explicit unsigned conversion of t gives all-ones or zero without negating a bool, so no warning pragma is needed
+ return ((x & ~mask) | ((0u - static_cast<unsigned int>(t)) & mask)); /*superscalar CPU version*/
+}
+inline bool GetBit(unsigned int x, unsigned int mask) { return ((x & mask)>0); } //true when x has at least one of the bits in mask set
//void Float2FloatRow(const float* f, FloatRow& r);
//void FloatRow2Float(const FloatRow& fr, float* f);
Modified: Mercury2/src/MercuryMatrix.cpp
===================================================================
--- Mercury2/src/MercuryMatrix.cpp 2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMatrix.cpp 2010-05-18 19:58:20 UTC (rev 735)
@@ -25,6 +25,18 @@
for (unsigned int i = 0; i < rows;i++)
m_free.push_back( m_data.Buffer()+i );
+/*
+ //test matrix transpose
+ MercuryMatrix test;
+ for (int i = 0; i < 16; ++i)
+ test.Ptr()[i] = i+1;
+
+ LOG.Write("before transpose\n");
+ test.Print();
+ test.Transpose();
+ LOG.Write("after Transpose\n");
+ test.Print();
+ */
}
FloatRow* MercuryMatrixMemory::GetNewMatrix()
@@ -45,26 +57,10 @@
MSemaphoreLock lock(&m_lock);
m_free.push_back((MatrixArray*)m);
}
-/*
-VC_ALIGN(16) float base_matrix_identity[16] CC_ALIGN(16) = {
- 1.0f, 0.0f, 0.0f, 0.0f,
- 0.0f, 1.0f, 0.0f, 0.0f,
- 0.0f, 0.0f, 1.0f, 0.0f,
- 0.0f, 0.0f, 0.0f, 1.0f };
-*/
+
MercuryMatrix::MercuryMatrix()
:m_matrix(0)
-{/*
-#ifdef USE_SSE
- m_matrix[0] = _mm_load1_ps( &base_matrix_identity[0] );
- m_matrix[1] = _mm_load1_ps( &base_matrix_identity[4] );
- m_matrix[2] = _mm_load1_ps( &base_matrix_identity[8] );
- m_matrix[3] = _mm_load1_ps( &base_matrix_identity[12] );
-#else
- Copy16f(m_matrix[0], base_matrix_identity );
-#endif
-*/
-// *this = Identity();
+{
m_matrix = MercuryMatrixMemory::GetInstance().GetNewMatrix();
LoadIdentity();
}
@@ -292,6 +288,14 @@
return r;
}
+void MercuryMatrix::Transpose()
+{
+ //prefetch the matrix data right before operating on it; the original author reports a 3-4x speedup from doing so
+ PREFETCH((const char*)m_matrix,_MM_HINT_NTA);
+ TransposeMatrix( m_matrix, m_matrix ); //in place: m_matrix is passed as both source and destination
+}
+
+
MercuryMatrix MercuryMatrix::IdentityMatrix;
/*
Modified: Mercury2/src/MercuryMatrix.h
===================================================================
--- Mercury2/src/MercuryMatrix.h 2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMatrix.h 2010-05-18 19:58:20 UTC (rev 735)
@@ -77,7 +77,7 @@
inline void Scale(const MercuryVertex& v) { Scale(v[0], v[1], v[2]); }
void Transotale( float tX, float tY, float tZ, float rX, float rY, float rZ, float sX, float sY, float sZ );
- inline void Transpose() { TransposeMatrix( m_matrix ); }
+ void Transpose();
void Zero();
static const MercuryMatrix& Identity();
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|