[Hgengine-cvs] SF.net SVN: hgengine:[734] Mercury2/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 734
          http://hgengine.svn.sourceforge.net/hgengine/?rev=734&view=rev
Author:   axlecrusher
Date:     2010-05-18 14:48:33 +0000 (Tue, 18 May 2010)

Log Message:
-----------
Faster matrix multiplication (7.3 time samples down to 1.8 in the profiler).
Much faster LoadIdentity function (6.58 to 0.34 time samples in the profiler).

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp
    Mercury2/src/MercuryMath.h
    Mercury2/src/MercuryMatrix.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================

--- Mercury2/src/MercuryMath.cpp	2010-05-17 23:58:27 UTC (rev 733)
+++ Mercury2/src/MercuryMath.cpp	2010-05-18 14:48:33 UTC (rev 734)
@@ -242,42 +242,114 @@
 
 void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
 {
-	unsigned int y;
-	__m128 xmm[4], a[4], b[4];
+	unsigned int y = 0; //only referenced by the commented-out loop below
+	__m128 xmm[4], b[4]; //b = rows of in2, xmm = scratch; _mm_load_ps/_mm_store_ps need 16-byte aligned rows
 
-//	PREFETCH(in1, _MM_HINT_T0);
-//	PREFETCH(in2, _MM_HINT_T1);
-//	PREFETCH(out, _MM_HINT_T1);
+	b[3] = _mm_load_ps(in1[0]); //use b3 as temporary storage for matrix1 row1 (in1[0])
+
+	//start loading matrix2 (in2); each out[y] is the sum over k of in1[y][k] * in2[k]
 	b[0] = _mm_load_ps(in2[0]);
 	b[1] = _mm_load_ps(in2[1]);
 	b[2] = _mm_load_ps(in2[2]);
-	b[3] = _mm_load_ps(in2[3]);
+		
+	//load row1 of matrix1 into columns: xmm[k] = in1[0][k] broadcast to all four lanes
+	xmm[0] = _mm_shuffle_ps (b[3], b[3], 0x00);
+	xmm[1] = _mm_shuffle_ps (b[3], b[3], 0x55);
+	xmm[2] = _mm_shuffle_ps (b[3], b[3], 0xaa);
+	xmm[3] = _mm_shuffle_ps (b[3], b[3], 0xff);
 
-	for (y = 0; y < 4; ++y)
+	b[3] = _mm_load_ps(in2[3]); //finish loading matrix2, do not change b after this!
+/*
+	do
 	{
-		a[y] = _mm_load_ps(in1[y]);
-		
-		//load rows as columns
-		xmm[3] = _mm_shuffle_ps (a[y], a[y], 0xff);
-		xmm[2] = _mm_shuffle_ps (a[y], a[y], 0xaa);
-		xmm[1] = _mm_shuffle_ps (a[y], a[y], 0x55);
-		xmm[0] = _mm_shuffle_ps (a[y], a[y], 0x00);
-
 		xmm[0] = _mm_mul_ps( xmm[0], b[0] );
 		xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+		xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+		xmm[1] = _mm_load_ps(in1[y+1]); //load next row, shuffle this last
+
 		xmm[2] = _mm_mul_ps( xmm[2], b[2] );
 		xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+		xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
 
-		xmm[0] = _mm_add_ps( xmm[0], xmm[1] );
-		xmm[2] = _mm_add_ps( xmm[2], xmm[3] );
-		a[y] = _mm_add_ps( xmm[0], xmm[2] );
-	}
+		xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+		_mm_store_ps(out[y++], xmm[2]);
 
-	//try to use the CPU's write-combining
-	_mm_store_ps(out[0], a[0]);
-	_mm_store_ps(out[1], a[1]);
-	_mm_store_ps(out[2], a[2]);
-	_mm_store_ps(out[3], a[3]);
+		xmm[3] = _mm_shuffle_ps (xmm[1], xmm[1], 0xff);
+		xmm[0] = _mm_shuffle_ps (xmm[1], xmm[1], 0x00);
+		xmm[2] = _mm_shuffle_ps (xmm[1], xmm[1], 0xaa);
+		xmm[1] = _mm_shuffle_ps (xmm[1], xmm[1], 0x55); 
+	}	while (y < 4);
+*/
+	//manually unroll loop (the commented-out loop above is the rolled form), much faster!!
+	//loop 1: computes out[0] and prepares the broadcasts of in1[1]
+	xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+	xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+	xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+	xmm[1] = _mm_load_ps(in1[1]); //load next row, shuffle this last
+
+	xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+	xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+	xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
+
+	xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+	_mm_store_ps(out[0], xmm[2]); //stored only after in1[1] has already been loaded
+
+	xmm[3] = _mm_shuffle_ps (xmm[1], xmm[1], 0xff);
+	xmm[0] = _mm_shuffle_ps (xmm[1], xmm[1], 0x00);
+	xmm[2] = _mm_shuffle_ps (xmm[1], xmm[1], 0xaa);
+	xmm[1] = _mm_shuffle_ps (xmm[1], xmm[1], 0x55); //xmm1 is overwritten last because it is the shuffle source
+
+	//loop2: computes out[1] and prepares the broadcasts of in1[2]
+	xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+	xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+	xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+	xmm[1] = _mm_load_ps(in1[2]); //load next row, shuffle this last
+
+	xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+	xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+	xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
+
+	xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+	_mm_store_ps(out[1], xmm[2]);
+
+	xmm[3] = _mm_shuffle_ps (xmm[1], xmm[1], 0xff);
+	xmm[0] = _mm_shuffle_ps (xmm[1], xmm[1], 0x00);
+	xmm[2] = _mm_shuffle_ps (xmm[1], xmm[1], 0xaa);
+	xmm[1] = _mm_shuffle_ps (xmm[1], xmm[1], 0x55);
+
+	//loop3: computes out[2] and prepares the broadcasts of in1[3]
+	xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+	xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+	xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+	xmm[1] = _mm_load_ps(in1[3]); //load next row, shuffle this last
+
+	xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+	xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+	xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
+
+	xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+	_mm_store_ps(out[2], xmm[2]);
+
+	xmm[3] = _mm_shuffle_ps (xmm[1], xmm[1], 0xff);
+	xmm[0] = _mm_shuffle_ps (xmm[1], xmm[1], 0x00);
+	xmm[2] = _mm_shuffle_ps (xmm[1], xmm[1], 0xaa);
+	xmm[1] = _mm_shuffle_ps (xmm[1], xmm[1], 0x55);
+
+	//loop4: computes out[3]; there is no further row of in1 to load
+	xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+	xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+	xmm[0] = _mm_add_ps( xmm[0], xmm[1] ); //done with xmm1
+
+	xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+	xmm[3] = _mm_mul_ps( xmm[3], b[3] );
+	xmm[2] = _mm_add_ps( xmm[2], xmm[3] ); //done with xmm3
+
+	xmm[2] = _mm_add_ps( xmm[0], xmm[2] ); //final result
+	_mm_store_ps(out[3], xmm[2]);
 }
 
 //This is an SSE matrix vector multiply, see the standard C++ code
@@ -321,6 +393,22 @@
 }
 */
 
+void LoadIdentity(FloatRow* matrix) //writes the 4x4 identity into matrix using four aligned SSE stores
+{
+	float *m = (float*)matrix; //matrix is addressed as 16 contiguous floats: rows at m, m+4, m+8, m+12
+
+	__m128 r[2];
+
+	r[0] = _mm_set_ps(0.0f,0.0f,0.0f,1.0f); //arguments run high lane to low lane, so memory order is (1,0,0,0)
+	_mm_storer_ps(m+12,r[0]); //reverse r[0]: row 3 = (0,0,0,1)
+
+	r[1] = _mm_shuffle_ps(r[0], r[0], _MM_SHUFFLE(1,1,0,1)); //picks lanes 1,0,1,1 of r[0]: memory order (0,1,0,0)
+	_mm_storer_ps(m+8,r[1]); //reverse r[1]: row 2 = (0,0,1,0)
+
+	_mm_store_ps(m, r[0]); //row 0 = (1,0,0,0)
+	_mm_store_ps(m+4, r[1]); //row 1 = (0,1,0,0)
+}
+
 void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
 {
 	__m128 a,b,c,d,r;//using more registers is faster

Modified: Mercury2/src/MercuryMath.h
===================================================================
--- Mercury2/src/MercuryMath.h	2010-05-17 23:58:27 UTC (rev 733)
+++ Mercury2/src/MercuryMath.h	2010-05-18 14:48:33 UTC (rev 734)
@@ -109,6 +109,7 @@
 void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
 void TransposeMatrix( FloatRow* m );
 void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
+void LoadIdentity(FloatRow* matrix); //overwrites the 4 rows at matrix with the identity matrix
 
 //http://graphics.stanford.edu/~seander/bithacks.html
 #define SetBit(x,mask,t) ((x & ~mask) | (-t & mask)) /*superscalar CPU version*/

Modified: Mercury2/src/MercuryMatrix.cpp
===================================================================
--- Mercury2/src/MercuryMatrix.cpp	2010-05-17 23:58:27 UTC (rev 733)
+++ Mercury2/src/MercuryMatrix.cpp	2010-05-18 14:48:33 UTC (rev 734)
@@ -126,9 +126,7 @@
 
 void MercuryMatrix::LoadIdentity()
 {
-	for (uint8_t x=0;x<4;++x)
-		for (uint8_t y=0;y<4;++y)
-			m_matrix[x][y] = (x==y)?1.0f:0.0f;
+	::LoadIdentity(m_matrix); //leading :: selects the free LoadIdentity(FloatRow*) from MercuryMath.h, not this member
 }
 
 void MercuryMatrix::Translate(float x, float y, float z)


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.