|
From: <axl...@us...> - 2010-05-18 14:48:40
|
Revision: 734
http://hgengine.svn.sourceforge.net/hgengine/?rev=734&view=rev
Author: axlecrusher
Date: 2010-05-18 14:48:33 +0000 (Tue, 18 May 2010)
Log Message:
-----------
Faster matrix multiplication (7.3 time samples down to 1.8 in the profiler).
Much faster LoadIdentity function (6.58 to 0.34 time samples in the profiler).
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Mercury2/src/MercuryMath.h
Mercury2/src/MercuryMatrix.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-05-17 23:58:27 UTC (rev 733)
+++ Mercury2/src/MercuryMath.cpp 2010-05-18 14:48:33 UTC (rev 734)
@@ -242,42 +242,114 @@
void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
{
- unsigned int y;
- __m128 xmm[4], a[4], b[4];
+ unsigned int y = 0; //only referenced by the commented-out loop below
+ __m128 xmm[4], b[4]; //b: the four rows of in2; xmm: the elements of the current in1 row, one broadcast per register
-// PREFETCH(in1, _MM_HINT_T0);
-// PREFETCH(in2, _MM_HINT_T1);
-// PREFETCH(out, _MM_HINT_T1);
+ b[3] = _mm_load_ps(in1[0]); //use b3 as temporary storage for matrix1 row1
+ //each output row is out[y] = sum over k of in1[y][k] * in2[k]; _mm_load_ps/_mm_store_ps require 16-byte-aligned addresses
+ //start loading matrix2
b[0] = _mm_load_ps(in2[0]);
b[1] = _mm_load_ps(in2[1]);
b[2] = _mm_load_ps(in2[2]);
- b[3] = _mm_load_ps(in2[3]);
+ //shuffle masks 0x00/0x55/0xaa/0xff replicate element 0/1/2/3 of the source into all four lanes
+ //load row1 of matrix1 into columns
+ xmm[0] = _mm_shuffle_ps (b[3], b[3], 0x00);
+ xmm[1] = _mm_shuffle_ps (b[3], b[3], 0x55);
+ xmm[2] = _mm_shuffle_ps (b[3], b[3], 0xaa);
+ xmm[3] = _mm_shuffle_ps (b[3], b[3], 0xff);
- for (y = 0; y < 4; ++y)
+ b[3] = _mm_load_ps(in2[3]); //finish loading matrix2, do not change b after this!
+/* rolled form of the unrolled passes below, kept for reference; NOTE(review): its last iteration would load in1[4]
+ do
{
- a[y] = _mm_load_ps(in1[y]);
-
- //load rows as columns
- xmm[3] = _mm_shuffle_ps (a[y], a[y], 0xff);
- xmm[2] = _mm_shuffle_ps (a[y], a[y], 0xaa);
- xmm[1] = _mm_shuffle_ps (a[y], a[y], 0x55);
- xmm[0] = _mm_shuffle_ps (a[y], a[y], 0x00);
-
xmm[0] = _mm_mul_ps( xmm[0], b[0] );
xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+ xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+ xmm[1] = _mm_load_ps(in1[y+1]); //load next row, shuffle this last
+
xmm[2] = _mm_mul_ps( xmm[2], b[2] );
xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+ xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
- xmm[0] = _mm_add_ps( xmm[0], xmm[1] );
- xmm[2] = _mm_add_ps( xmm[2], xmm[3] );
- a[y] = _mm_add_ps( xmm[0], xmm[2] );
- }
+ xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+ _mm_store_ps(out[y++], xmm[2]);
- //try to use the CPU's write-combining
- _mm_store_ps(out[0], a[0]);
- _mm_store_ps(out[1], a[1]);
- _mm_store_ps(out[2], a[2]);
- _mm_store_ps(out[3], a[3]);
+ xmm[3] = _mm_shuffle_ps (xmm[1], xmm[1], 0xff);
+ xmm[0] = _mm_shuffle_ps (xmm[1], xmm[1], 0x00);
+ xmm[2] = _mm_shuffle_ps (xmm[1], xmm[1], 0xaa);
+ xmm[1] = _mm_shuffle_ps (xmm[1], xmm[1], 0x55);
+ } while (y < 4);
+*/
+ //manually unroll loop, much faster!!
+ //loop 1
+ xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+ xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+ xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+ xmm[1] = _mm_load_ps(in1[1]); //load next row, shuffle this last
+
+ xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+ xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+ xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
+
+ xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+ _mm_store_ps(out[0], xmm[2]);
+ //broadcast each element of the row just loaded; xmm[1] is overwritten last because it is the source of all four shuffles
+ xmm[3] = _mm_shuffle_ps (xmm[1], xmm[1], 0xff);
+ xmm[0] = _mm_shuffle_ps (xmm[1], xmm[1], 0x00);
+ xmm[2] = _mm_shuffle_ps (xmm[1], xmm[1], 0xaa);
+ xmm[1] = _mm_shuffle_ps (xmm[1], xmm[1], 0x55);
+
+ //loop2
+ xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+ xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+ xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+ xmm[1] = _mm_load_ps(in1[2]); //load next row, shuffle this last
+
+ xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+ xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+ xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
+
+ xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+ _mm_store_ps(out[1], xmm[2]);
+
+ xmm[3] = _mm_shuffle_ps (xmm[1], xmm[1], 0xff);
+ xmm[0] = _mm_shuffle_ps (xmm[1], xmm[1], 0x00);
+ xmm[2] = _mm_shuffle_ps (xmm[1], xmm[1], 0xaa);
+ xmm[1] = _mm_shuffle_ps (xmm[1], xmm[1], 0x55);
+
+ //loop3
+ xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+ xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+ xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+ xmm[1] = _mm_load_ps(in1[3]); //load next row, shuffle this last
+
+ xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+ xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+ xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
+
+ xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+ _mm_store_ps(out[2], xmm[2]);
+
+ xmm[3] = _mm_shuffle_ps (xmm[1], xmm[1], 0xff);
+ xmm[0] = _mm_shuffle_ps (xmm[1], xmm[1], 0x00);
+ xmm[2] = _mm_shuffle_ps (xmm[1], xmm[1], 0xaa);
+ xmm[1] = _mm_shuffle_ps (xmm[1], xmm[1], 0x55);
+
+ //loop4
+ xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+ xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+ xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+ //last pass: no further row of in1 is loaded here
+ xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+ xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+ xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
+
+ xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+ _mm_store_ps(out[3], xmm[2]);
}
//This is an SSE matrix vector multiply, see the standard C++ code
@@ -321,6 +393,22 @@
}
*/
+void LoadIdentity(FloatRow* matrix)
+{
+ float *m = (float*)matrix;
+ //writes a 4x4 identity through m, four floats per store at m, m+4, m+8 and m+12 (rows 0..3, assuming FloatRow is four floats); _mm_store_ps/_mm_storer_ps require 16-byte-aligned addresses
+ __m128 r[2];
+ //r[0] and r[1] hold the first two identity rows; storing them in reverse lane order yields the last two
+ r[0] = _mm_set_ps(0.0f,0.0f,0.0f,1.0f); //arguments run from lane 3 down to lane 0, so r[0] = (1,0,0,0)
+ _mm_storer_ps(m+12,r[0]); //reverse r[0]: row 3 = (0,0,0,1)
+ //_MM_SHUFFLE(1,1,0,1) selects lanes 1,0,1,1 of r[0] for result lanes 0..3, so r[1] = (0,1,0,0)
+ r[1] = _mm_shuffle_ps(r[0], r[0], _MM_SHUFFLE(1,1,0,1));
+ _mm_storer_ps(m+8,r[1]); //reverse r[1]: row 2 = (0,0,1,0)
+ //rows 0 and 1 are stored in plain lane order
+ _mm_store_ps(m, r[0]);
+ _mm_store_ps(m+4, r[1]);
+}
+
void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
{
__m128 a,b,c,d,r;//using more registers is faster
Modified: Mercury2/src/MercuryMath.h
===================================================================
--- Mercury2/src/MercuryMath.h 2010-05-17 23:58:27 UTC (rev 733)
+++ Mercury2/src/MercuryMath.h 2010-05-18 14:48:33 UTC (rev 734)
@@ -109,6 +109,7 @@
void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
void TransposeMatrix( FloatRow* m );
void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
+void LoadIdentity(FloatRow* matrix); //stores the 4x4 identity into matrix; defined in MercuryMath.cpp
//http://graphics.stanford.edu/~seander/bithacks.html
#define SetBit(x,mask,t) ((x & ~mask) | (-t & mask)) /*superscalar CPU version*/
Modified: Mercury2/src/MercuryMatrix.cpp
===================================================================
--- Mercury2/src/MercuryMatrix.cpp 2010-05-17 23:58:27 UTC (rev 733)
+++ Mercury2/src/MercuryMatrix.cpp 2010-05-18 14:48:33 UTC (rev 734)
@@ -126,9 +126,7 @@
void MercuryMatrix::LoadIdentity()
{
- for (uint8_t x=0;x<4;++x)
- for (uint8_t y=0;y<4;++y)
- m_matrix[x][y] = (x==y)?1.0f:0.0f;
+ ::LoadIdentity(m_matrix); //the leading :: selects the free function LoadIdentity(FloatRow*) instead of this member
}
void MercuryMatrix::Translate(float x, float y, float z)
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|