[Hgengine-cvs] SF.net SVN: hgengine:[735] Mercury2/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 735
          http://hgengine.svn.sourceforge.net/hgengine/?rev=735&view=rev
Author:   axlecrusher
Date:     2010-05-18 19:58:20 +0000 (Tue, 18 May 2010)

Log Message:
-----------
Faster matrix transpose; memcpy is apparently faster as well in optimized builds

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp
    Mercury2/src/MercuryMath.h
    Mercury2/src/MercuryMatrix.cpp
    Mercury2/src/MercuryMatrix.h

Modified: Mercury2/src/MercuryMath.cpp
===================================================================

--- Mercury2/src/MercuryMath.cpp	2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMath.cpp	2010-05-18 19:58:20 UTC (rev 735)
@@ -2,10 +2,15 @@
 #include "MercuryMath.h"
 
 //the SSE version of this was really slow, this is quicker
-void TransposeMatrix( FloatRow* m )
+
+#ifndef USE_SSE
+
+//Generic Math functions. Compile these if you can not use optimized functions.
+
+void TransposeMatrix( const FloatRow* matrix, FloatRow* out )
 {
 	float tmp;
-	float *_m = (float*)m;
+	float *_m = (float*)out;
 	
 	tmp = _m[1];
 	_m[1] = _m[4];
@@ -30,10 +35,6 @@
 	_m[14] = tmp;
 }
 
-#ifndef USE_SSE
-
-//Generic Math functions. Compile these if you can not use optimized functions.
-
 void ZeroFloatRow(FloatRow& r)
 {
 	Copy4f(&r, &gfrZero );
@@ -71,21 +72,6 @@
 	Copy4f(out,r);
 }
 
-void Copy4f( void * dest, const void * source )
-{
-	COPY<float,4>((float*)source, (float*)dest);
-}
-
-void Copy8f( void * dest, const void * source )
-{
-	COPY<float,8>((float*)source, (float*)dest);
-}
-
-void Copy16f( void * dest, const void * source )
-{
-	COPY<float,16>((float*)source, (float*)dest);
-}
-
 void MatrixMultiply4f ( const FloatRow* in1a, const FloatRow* in2a, FloatRow* outa)
 {
 	const float *in1 = *in1a;
@@ -216,30 +202,6 @@
 	_mm_store_ps(out,o);
 }
 
-void Copy4f( void * dest, const void * source )
-{
-	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
-}
-
-void Copy8f( void * dest, const void * source )
-{
-	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
-	_mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
-}
-
-void Copy16f( void * dest, const void * source )
-{
-/*	_mm_stream_si128((__m128i*)dest,((__m128i*)source)[0]);
-	_mm_stream_si128(&((__m128i*)dest)[1],((__m128i*)source)[1]);
-	_mm_stream_si128(&((__m128i*)dest)[2],((__m128i*)source)[2]);
-	_mm_stream_si128(&((__m128i*)dest)[3],((__m128i*)source)[3]);
-*/
-	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
-	_mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
-	_mm_stream_ps(((float*)dest)+8,((__m128*)source)[2]);
-	_mm_stream_ps(((float*)dest)+12,((__m128*)source)[3]);
-}
-
 void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
 {
 	unsigned int y = 0;
@@ -399,7 +361,7 @@
 
 	__m128 r[2];
 
-	r[0] = _mm_set_ps(0.0f,0.0f,0.0f,1.0f);
+	r[0] = _mm_set_ss(1.0f);
 	_mm_storer_ps(m+12,r[0]); //reverse r[0]
 
 	r[1] = _mm_shuffle_ps(r[0], r[0], _MM_SHUFFLE(1,1,0,1));
@@ -409,9 +371,42 @@
 	_mm_store_ps(m+4, r[1]);
 }
 
+void TransposeMatrix( const FloatRow* matrix, FloatRow* out )
+{
+	//the compiler does better when we pass in 2 parameters rather than 1
+	__m128 m[4],r[4];
+
+	m[0] = _mm_load_ps(matrix[0]);
+	m[1] = _mm_load_ps(matrix[1]);
+	m[2] = _mm_load_ps(matrix[2]);
+	m[3] = _mm_load_ps(matrix[3]);
+
+	r[0] = _mm_movelh_ps(m[0],m[1]);
+	r[1] = _mm_movelh_ps(m[2],m[3]);
+	r[2] = _mm_movehl_ps(m[1],m[0]);
+	r[3] = _mm_movehl_ps(m[3],m[2]);
+	//done with m matrix, we can reuse it now
+
+	m[0] = _mm_shuffle_ps(r[0], r[0], _MM_SHUFFLE(3,1,2,0)); //produce beginning of new row 0 and 1
+	m[1] = _mm_shuffle_ps(r[1], r[1], _MM_SHUFFLE(3,1,2,0)); //produce ending of new row 0 and 1
+	m[2] = _mm_shuffle_ps(r[2], r[2], _MM_SHUFFLE(3,1,2,0)); //produce beginning of new row 2 and 3
+	m[3] = _mm_shuffle_ps(r[3], r[3], _MM_SHUFFLE(3,1,2,0)); //produce ending of new row 2 and 3
+
+	r[0] = _mm_movelh_ps(m[0],m[1]); //row 0 is done
+	r[2] = _mm_movelh_ps(m[2],m[3]); //row 2 is done
+	r[1] = _mm_movehl_ps(m[1],m[0]); //row 1 is done
+	r[3] = _mm_movehl_ps(m[3],m[2]); //row 3 is done
+
+	_mm_store_ps(out[0], r[0]);
+	_mm_store_ps(out[1], r[1]);
+	_mm_store_ps(out[2], r[2]);
+	_mm_store_ps(out[3], r[3]);
+}
+
 void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
 {
-	__m128 a,b,c,d,r;//using more registers is faster
+	//using more registers is faster(8 maximum)
+	__m128 a,b,c,d,r;
 	__m128 t1,t2;
 	
 	//unaligned load, vectors are not aligned

Modified: Mercury2/src/MercuryMath.h
===================================================================
--- Mercury2/src/MercuryMath.h	2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMath.h	2010-05-18 19:58:20 UTC (rev 735)
@@ -102,18 +102,22 @@
 void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
 void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
 void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Copy4f( void * dest, const void * source );
-void Copy8f( void * dest, const void * source );
-void Copy16f( void * dest, const void * source );
+inline void Copy4f( void * dest, const void * source ) { memcpy(dest,source,16); }
+inline void Copy8f( void * dest, const void * source ) { memcpy(dest,source,32); }
+inline void Copy16f( void * dest, const void * source ) { memcpy(dest,source,64); }
 void MatrixMultiply4f ( const FloatRow* in1, const FloatRow* in2, FloatRow* out );
 void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
-void TransposeMatrix( FloatRow* m );
+void TransposeMatrix(const FloatRow* matrix, FloatRow* out);
 void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
 void LoadIdentity(FloatRow* matrix);
 
 //http://graphics.stanford.edu/~seander/bithacks.html
-#define SetBit(x,mask,t) ((x & ~mask) | (-t & mask)) /*superscalar CPU version*/
-#define GetBit(x,mask) ((x & mask)>0)
+inline unsigned int SetBit(unsigned int x, unsigned int mask, bool t)
+{
+	#pragma warning( disable : 4804 )
+	return ((x & ~mask) | (-t & mask));  /*superscalar CPU version*/
+}
+inline bool GetBit(unsigned int x, unsigned int mask) { return ((x & mask)>0); }
 
 //void Float2FloatRow(const float* f, FloatRow& r);
 //void FloatRow2Float(const FloatRow& fr, float* f);

Modified: Mercury2/src/MercuryMatrix.cpp
===================================================================
--- Mercury2/src/MercuryMatrix.cpp	2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMatrix.cpp	2010-05-18 19:58:20 UTC (rev 735)
@@ -25,6 +25,18 @@
 
 	for (unsigned int i = 0; i < rows;i++)
 		m_free.push_back( m_data.Buffer()+i );
+/*
+	//test matrix transpose
+	MercuryMatrix test;
+	for (int i = 0; i < 16; ++i)
+		test.Ptr()[i] = i+1;
+
+	LOG.Write("before transpose\n");
+	test.Print();
+	test.Transpose();
+	LOG.Write("after Transpose\n");
+	test.Print();
+	*/
 }
 
 FloatRow* MercuryMatrixMemory::GetNewMatrix()
@@ -45,26 +57,10 @@
 	MSemaphoreLock lock(&m_lock);
 	m_free.push_back((MatrixArray*)m);
 }
-/*
-VC_ALIGN(16) float base_matrix_identity[16] CC_ALIGN(16) = {
-	1.0f, 0.0f, 0.0f, 0.0f,
-	0.0f, 1.0f, 0.0f, 0.0f,
-	0.0f, 0.0f, 1.0f, 0.0f,
-	0.0f, 0.0f, 0.0f, 1.0f };
-*/
+
 MercuryMatrix::MercuryMatrix()
 	:m_matrix(0)
-{/*
-#ifdef USE_SSE
-	m_matrix[0] = _mm_load1_ps( &base_matrix_identity[0] );
-	m_matrix[1] = _mm_load1_ps( &base_matrix_identity[4] );
-	m_matrix[2] = _mm_load1_ps( &base_matrix_identity[8] );
-	m_matrix[3] = _mm_load1_ps( &base_matrix_identity[12] );
-#else
-	Copy16f(m_matrix[0], base_matrix_identity );
-#endif
-*/
-//	*this = Identity();
+{
 	m_matrix = MercuryMatrixMemory::GetInstance().GetNewMatrix();
 	LoadIdentity();
 }
@@ -292,6 +288,14 @@
 	return r;
 }
 
+void MercuryMatrix::Transpose()
+{
+	//we know we will be operating on this data so try to go get it, 3-4x increase in speed.
+	PREFETCH((const char*)m_matrix,_MM_HINT_NTA);
+	TransposeMatrix( m_matrix, m_matrix );
+}
+
+
 MercuryMatrix MercuryMatrix::IdentityMatrix;
 
 /* 

Modified: Mercury2/src/MercuryMatrix.h
===================================================================
--- Mercury2/src/MercuryMatrix.h	2010-05-18 14:48:33 UTC (rev 734)
+++ Mercury2/src/MercuryMatrix.h	2010-05-18 19:58:20 UTC (rev 735)
@@ -77,7 +77,7 @@
 	inline void Scale(const MercuryVertex& v) { Scale(v[0], v[1], v[2]); }
 	
 	void Transotale( float tX, float tY, float tZ, float rX, float rY, float rZ, float sX, float sY, float sZ );
-	inline void Transpose() { TransposeMatrix( m_matrix ); }
+	void Transpose();
 
 	void Zero();
 	static const MercuryMatrix& Identity();


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.