Thread: [Hgengine-cvs] SF.net SVN: hgengine:[139] Mercury2/src/MercuryMath.cpp

Status: Alpha

Brought to you by: axlecrusher, cnlohr

hgengine-cvs

[Hgengine-cvs] SF.net SVN: hgengine:[139] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2009-01-02 01:20:36

Revision: 139
          http://hgengine.svn.sourceforge.net/hgengine/?rev=139&view=rev
Author:   axlecrusher
Date:     2009-01-02 01:20:29 +0000 (Fri, 02 Jan 2009)

Log Message:
-----------
fix not defined

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2009-01-02 01:19:05 UTC (rev 138)
+++ Mercury2/src/MercuryMath.cpp	2009-01-02 01:20:29 UTC (rev 139)
@@ -1,6 +1,6 @@
 #include "MercuryMath.h"
 
-#if !defined( USE_SSE )
+#ifndef USE_SSE
 
 //Generic Math functions. Compile these if you can not use optimized functions.
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[146] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2009-01-04 18:32:07

Revision: 146
          http://hgengine.svn.sourceforge.net/hgengine/?rev=146&view=rev
Author:   axlecrusher
Date:     2009-01-04 18:32:00 +0000 (Sun, 04 Jan 2009)

Log Message:
-----------
convert non optimized functions to FloatRow

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2009-01-04 18:07:42 UTC (rev 145)
+++ Mercury2/src/MercuryMath.cpp	2009-01-04 18:32:00 UTC (rev 146)
@@ -9,36 +9,36 @@
 	Copy4f(&r, (FloatRow){ 0.0f, 0.0f, 0.0f, 0.0f });
 }
 
-void Mul4f(const float* first, const float* second, float* out)
+void Mul4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
 {
-    out[0] = first[0] * second[0];
-    out[1] = first[1] * second[1];
-    out[2] = first[2] * second[2];
-    out[3] = first[3] * second[3];
+    ((float*)out)[0] = ((float*)first)[0] * ((float*)second)[0];
+    ((float*)out)[1] = ((float*)first)[1] * ((float*)second)[1];
+    ((float*)out)[2] = ((float*)first)[2] * ((float*)second)[2];
+    ((float*)out)[3] = ((float*)first)[3] * ((float*)second)[3];
 }
 
-void Div4f(const float* first, const float* second, float* out)
+void Div4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
 {
-    out[0] = first[0] / second[0];
-    out[1] = first[1] / second[1];
-    out[2] = first[2] / second[2];
-    out[3] = first[3] / second[3];
+    ((float*)out)[0] = ((float*)first)[0] / ((float*)second)[0];
+    ((float*)out)[1] = ((float*)first)[1] / ((float*)second)[1];
+    ((float*)out)[2] = ((float*)first)[2] / ((float*)second)[2];
+    ((float*)out)[3] = ((float*)first)[3] / ((float*)second)[3];
 }
 
-void Add4f(const float* first, const float* second, float* out)
+void Add4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
 {
-    out[0] = first[0] + second[0];
-    out[1] = first[1] + second[1];
-    out[2] = first[2] + second[2];
-    out[3] = first[3] + second[3];
+    ((float*)out)[0] = ((float*)first)[0] + ((float*)second)[0];
+    ((float*)out)[1] = ((float*)first)[1] + ((float*)second)[1];
+    ((float*)out)[2] = ((float*)first)[2] + ((float*)second)[2];
+    ((float*)out)[3] = ((float*)first)[3] + ((float*)second)[3];
 }
 
-void Sub4f(const float* first, const float* second, float* out)
+void Sub4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
 {
-    out[0] = first[0] - second[0];
-    out[1] = first[1] - second[1];
-    out[2] = first[2] - second[2];
-    out[3] = first[3] - second[3];
+    ((float*)out)[0] = ((float*)first)[0] - ((float*)second)[0];
+    ((float*)out)[1] = ((float*)first)[1] - ((float*)second)[1];
+    ((float*)out)[2] = ((float*)first)[2] - ((float*)second)[2];
+    ((float*)out)[3] = ((float*)first)[3] - ((float*)second)[3];
 }
 
 void Copy4f( void * dest, const void * source )
@@ -215,7 +215,7 @@
 	
 	for (y = 0; y < 4; ++y)
 	{
-		//load columns
+		//load rows as columns
 		xmm[3] = _mm_shuffle_ps (in1[y], in1[y], 0xff);
 		xmm[2] = _mm_shuffle_ps (in1[y], in1[y], 0xaa);
 		xmm[1] = _mm_shuffle_ps (in1[y], in1[y], 0x55);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[161] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2009-03-02 19:51:33

Revision: 161
          http://hgengine.svn.sourceforge.net/hgengine/?rev=161&view=rev
Author:   axlecrusher
Date:     2009-03-02 19:51:29 +0000 (Mon, 02 Mar 2009)

Log Message:
-----------
fix

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2009-03-02 19:46:08 UTC (rev 160)
+++ Mercury2/src/MercuryMath.cpp	2009-03-02 19:51:29 UTC (rev 161)
@@ -172,10 +172,11 @@
 
 void Float2FloatRow(const float* f, FloatRow* r)
 {
-	*r[0] = f[0];
-	*r[1] = f[1];
-	*r[2] = f[2];
-	*r[3] = f[3];
+	float* row = (float*)r;
+	row[0] = f[0];
+	row[1] = f[1];
+	row[2] = f[2];
+	row[3] = f[3];
 }
 
 void FloatRow2Float( const FloatRow* fr, float* f)


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[166] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2009-03-02 21:07:22

Revision: 166
          http://hgengine.svn.sourceforge.net/hgengine/?rev=166&view=rev
Author:   axlecrusher
Date:     2009-03-02 21:07:12 +0000 (Mon, 02 Mar 2009)

Log Message:
-----------
fix broken SSE vector multiply

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2009-03-02 20:36:05 UTC (rev 165)
+++ Mercury2/src/MercuryMath.cpp	2009-03-02 21:07:12 UTC (rev 166)
@@ -4,7 +4,7 @@
 void TransposeMatrix( FloatRow* m )
 {
 	float tmp;
-	float *_m = *m;
+	float *_m = (float*)m;
 	
 	tmp = _m[1];
 	_m[1] = _m[4];
@@ -284,14 +284,14 @@
 	
 	//compute term 1 and term 2 and store them in the low order
 	//of outxmm[0]
-	out[0] = Hadd4( _mm_mul_ps( matrix[1], *p ) );
-	tmp = Hadd4( _mm_mul_ps( matrix[2], *p ) );
+	out[0] = Hadd4( _mm_mul_ps( matrix[0], *p ) );
+	tmp = Hadd4( _mm_mul_ps( matrix[1], *p ) );
 	out[0] = _mm_unpacklo_ps(out[0], tmp);
 
 	//compute term 3 and term 4 and store them in the high order
 	//of outxmm[1]
-	out[1] = Hadd4( _mm_mul_ps( matrix[3], *p ) );
-	tmp = Hadd4( _mm_mul_ps( matrix[4], *p ) );
+	out[1] = Hadd4( _mm_mul_ps( matrix[2], *p ) );
+	tmp = Hadd4( _mm_mul_ps( matrix[3], *p ) );
 	out[1] = _mm_unpacklo_ps(out[1], tmp);
 
 	//shuffle the low order of outxmm[0] into the loworder of tmp
@@ -304,9 +304,9 @@
 	r = (FloatRow)_mm_setzero_ps();
 }
 
-FloatRow Float2FloatRow(const float* f, , FloatRow* r)
+void Float2FloatRow(const float* f, FloatRow* r)
 {
-	r = _mm_load_ps( f );
+	*r = _mm_load_ps( f );
 }
 
 void FloatRow2Float( const FloatRow* fr, float* f)


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[164] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2009-03-02 20:19:56

Revision: 164
          http://hgengine.svn.sourceforge.net/hgengine/?rev=164&view=rev
Author:   axlecrusher
Date:     2009-03-02 20:19:52 +0000 (Mon, 02 Mar 2009)

Log Message:
-----------
dereference not cast

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2009-03-02 20:07:37 UTC (rev 163)
+++ Mercury2/src/MercuryMath.cpp	2009-03-02 20:19:52 UTC (rev 164)
@@ -4,7 +4,7 @@
 void TransposeMatrix( FloatRow* m )
 {
 	float tmp;
-	float *_m = (float*)m;
+	float *_m = *m;
 	
 	tmp = _m[1];
 	_m[1] = _m[4];
@@ -40,34 +40,34 @@
 
 void Mul4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
 {
-    ((float*)out)[0] = ((float*)first)[0] * ((float*)second)[0];
-    ((float*)out)[1] = ((float*)first)[1] * ((float*)second)[1];
-    ((float*)out)[2] = ((float*)first)[2] * ((float*)second)[2];
-    ((float*)out)[3] = ((float*)first)[3] * ((float*)second)[3];
+    (*out)[0] = (*first)[0] * (*second)[0];
+    (*out)[1] = (*first)[1] * (*second)[1];
+    (*out)[2] = (*first)[2] * (*second)[2];
+    (*out)[3] = (*first)[3] * (*second)[3];
 }
 
 void Div4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
 {
-    ((float*)out)[0] = ((float*)first)[0] / ((float*)second)[0];
-    ((float*)out)[1] = ((float*)first)[1] / ((float*)second)[1];
-    ((float*)out)[2] = ((float*)first)[2] / ((float*)second)[2];
-    ((float*)out)[3] = ((float*)first)[3] / ((float*)second)[3];
+    (*out)[0] = (*first)[0] / (*second)[0];
+    (*out)[1] = (*first)[1] / (*second)[1];
+    (*out)[2] = (*first)[2] / (*second)[2];
+    (*out)[3] = (*first)[3] / (*second)[3];
 }
 
 void Add4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
 {
-    ((float*)out)[0] = ((float*)first)[0] + ((float*)second)[0];
-    ((float*)out)[1] = ((float*)first)[1] + ((float*)second)[1];
-    ((float*)out)[2] = ((float*)first)[2] + ((float*)second)[2];
-    ((float*)out)[3] = ((float*)first)[3] + ((float*)second)[3];
+    (*out)[0] = (*first)[0] + (*second)[0];
+    (*out)[1] = (*first)[1] + (*second)[1];
+    (*out)[2] = (*first)[2] + (*second)[2];
+    (*out)[3] = (*first)[3] + (*second)[3];
 }
 
 void Sub4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
 {
-    ((float*)out)[0] = ((float*)first)[0] - ((float*)second)[0];
-    ((float*)out)[1] = ((float*)first)[1] - ((float*)second)[1];
-    ((float*)out)[2] = ((float*)first)[2] - ((float*)second)[2];
-    ((float*)out)[3] = ((float*)first)[3] - ((float*)second)[3];
+    (*out)[0] = (*first)[0] - (*second)[0];
+    (*out)[1] = (*first)[1] - (*second)[1];
+    (*out)[2] = (*first)[2] - (*second)[2];
+    (*out)[3] = (*first)[3] - (*second)[3];
 }
 
 void Copy4f( void * dest, const void * source )
@@ -116,12 +116,10 @@
 
 void MatrixMultiply4f ( const FloatRow* in1a, const FloatRow* in2a, FloatRow* outa)
 {
-	float *in1, *in2, *out;
+	const float *in1 = *in1a;
+	const float *in2 = *in2a;
+	float *out = *outa;
 	
-	in1 = (float*)in1a;
-	in2 = (float*)in2a;
-	out = (float*)outa;
-	
 	out[0] = in1[0] * in2[0] + in1[1] * in2[4] +
 				in1[2] * in2[8] + in1[3] * in2[12];
 	out[1] = in1[0] * in2[1] + in1[1] * in2[5] +
@@ -161,9 +159,9 @@
 
 void VectorMultiply4f( const FloatRow* matrix, const FloatRow* pa, FloatRow* outa )
 {
-	float *m = (float*)matrix;
-	float *p = (float*)pa;
-	float *out = (float*)outa;
+	const float *m = *matrix;
+	const float *p = *pa;
+	float *out = *outa;
 	out[0] = p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3];
 	out[1] = p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7];
 	out[2] = p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11];
@@ -172,7 +170,7 @@
 
 void Float2FloatRow(const float* f, FloatRow* r)
 {
-	float* row = (float*)r;
+	float* row = *r;
 	row[0] = f[0];
 	row[1] = f[1];
 	row[2] = f[2];


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[169] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2009-03-03 02:06:54

Revision: 169
          http://hgengine.svn.sourceforge.net/hgengine/?rev=169&view=rev
Author:   axlecrusher
Date:     2009-03-03 02:06:45 +0000 (Tue, 03 Mar 2009)

Log Message:
-----------
Fix nonSSE compile

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2009-03-03 02:02:34 UTC (rev 168)
+++ Mercury2/src/MercuryMath.cpp	2009-03-03 02:06:45 UTC (rev 169)
@@ -157,11 +157,11 @@
 				in1[14] * in2[11] + in1[15] * in2[15];
 }
 
-void VectorMultiply4f( const FloatRow* matrix, const FloatRow* pa, FloatRow* outa )
+void VectorMultiply4f( const FloatRow* matrix, const FloatRow& pa, FloatRow& outa )
 {
 	const float *m = *matrix;
-	const float *p = *pa;
-	float *out = *outa;
+	const float *p = pa;
+	float *out = outa;
 	out[0] = p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3];
 	out[1] = p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7];
 	out[2] = p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11];


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[700] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2010-04-25 22:34:03

Revision: 700
          http://hgengine.svn.sourceforge.net/hgengine/?rev=700&view=rev
Author:   axlecrusher
Date:     2010-04-25 22:33:57 +0000 (Sun, 25 Apr 2010)

Log Message:
-----------
proper command for copying data with SSE. It tries to avoid polluting caches

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2010-04-25 20:16:49 UTC (rev 699)
+++ Mercury2/src/MercuryMath.cpp	2010-04-25 22:33:57 UTC (rev 700)
@@ -184,38 +184,21 @@
 
 void Copy4f( void * dest, const void * source )
 {
-	__m128 xmm;
-
-	xmm = _mm_load_ps((float*)source);
-	_mm_store_ps((float*)dest, xmm);
+	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
 }
 
 void Copy8f( void * dest, const void * source )
 {
-	__m128 xmm[2];
-
-	xmm[0] = _mm_load_ps((float*)source);
-	_mm_store_ps((float*)dest, xmm[0]);
-
-	xmm[1] = _mm_load_ps((float*)&(((float*)source)[4]));
-	_mm_store_ps((float*)&(((float*)dest)[4]), xmm[1]);
+	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
+	_mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
 }
 
 void Copy16f( void * dest, const void * source )
 {
-	__m128 xmm[4];
-
-	xmm[0] = _mm_load_ps((float*)source);
-	_mm_store_ps((float*)dest, xmm[0]);
-
-	xmm[1] = _mm_load_ps((float*)&(((float*)source)[4]));
-	_mm_store_ps((float*)&(((float*)dest)[4]), xmm[1]);
-
-	xmm[2] = _mm_load_ps((float*)&(((float*)source)[8]));
-	_mm_store_ps((float*)&(((float*)dest)[8]), xmm[2]);
-
-	xmm[3] = _mm_load_ps((float*)&(((float*)source)[12]));
-	_mm_store_ps((float*)&(((float*)dest)[12]), xmm[3]);
+	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
+	_mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
+	_mm_stream_ps(((float*)dest)+8,((__m128*)source)[2]);
+	_mm_stream_ps(((float*)dest)+12,((__m128*)source)[3]);
 }
 
 void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[703] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2010-04-27 18:16:19

Revision: 703
          http://hgengine.svn.sourceforge.net/hgengine/?rev=703&view=rev
Author:   axlecrusher
Date:     2010-04-27 18:16:10 +0000 (Tue, 27 Apr 2010)

Log Message:
-----------
fix windows compile

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2010-04-27 00:16:04 UTC (rev 702)
+++ Mercury2/src/MercuryMath.cpp	2010-04-27 18:16:10 UTC (rev 703)
@@ -275,7 +275,8 @@
 
 	c = _mm_shuffle_ps(r2, r2, 0xc9);
 	d = _mm_shuffle_ps(r1, r1, 0xd2);
-	r -= _mm_mul_ps( c, d );
+	a = _mm_mul_ps( c, d );
+	a = _mm_sub_ps(r,a);
 	result = r;
 }
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[708] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2010-04-28 02:26:38

Revision: 708
          http://hgengine.svn.sourceforge.net/hgengine/?rev=708&view=rev
Author:   axlecrusher
Date:     2010-04-28 02:26:32 +0000 (Wed, 28 Apr 2010)

Log Message:
-----------
fix bug

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2010-04-28 02:05:26 UTC (rev 707)
+++ Mercury2/src/MercuryMath.cpp	2010-04-28 02:26:32 UTC (rev 708)
@@ -276,7 +276,7 @@
 	c = _mm_shuffle_ps(r2, r2, 0xc9);
 	d = _mm_shuffle_ps(r1, r1, 0xd2);
 	a = _mm_mul_ps( c, d );
-	a = _mm_sub_ps(r,a);
+	r = _mm_sub_ps(r,a);
 	result = r;
 }
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[716] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2010-05-02 14:32:49

Revision: 716
          http://hgengine.svn.sourceforge.net/hgengine/?rev=716&view=rev
Author:   axlecrusher
Date:     2010-05-02 14:32:43 +0000 (Sun, 02 May 2010)

Log Message:
-----------
fix SSE since FloatRow is no longer __m128

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2010-04-29 23:53:04 UTC (rev 715)
+++ Mercury2/src/MercuryMath.cpp	2010-05-02 14:32:43 UTC (rev 716)
@@ -164,22 +164,38 @@
 
 void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
-	out = _mm_mul_ps( first, second );
+	__m128 a,b,o;
+	a = _mm_load_ps(first);
+	b = _mm_load_ps(second);
+	o = _mm_mul_ps( a, b );
+	_mm_store_ps(out,o);
 }
 
 void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
-	out = _mm_div_ps( first, second );
+	__m128 a,b,o;
+	a = _mm_load_ps(first);
+	b = _mm_load_ps(second);
+	o = _mm_div_ps( a, b );
+	_mm_store_ps(out,o);
 }
 
 void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
-	out = _mm_add_ps( first, second );
+	__m128 a,b,o;
+	a = _mm_load_ps(first);
+	b = _mm_load_ps(second);
+	o = _mm_add_ps( a, b );
+	_mm_store_ps(out,o);
 }
 
 void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
-	out = _mm_sub_ps( first, second );
+	__m128 a,b,o;
+	a = _mm_load_ps(first);
+	b = _mm_load_ps(second);
+	o = _mm_sub_ps( a, b );
+	_mm_store_ps(out,o);
 }
 
 void Copy4f( void * dest, const void * source )
@@ -195,6 +211,11 @@
 
 void Copy16f( void * dest, const void * source )
 {
+/*	_mm_stream_si128((__m128i*)dest,((__m128i*)source)[0]);
+	_mm_stream_si128(&((__m128i*)dest)[1],((__m128i*)source)[1]);
+	_mm_stream_si128(&((__m128i*)dest)[2],((__m128i*)source)[2]);
+	_mm_stream_si128(&((__m128i*)dest)[3],((__m128i*)source)[3]);
+*/
 	_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
 	_mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
 	_mm_stream_ps(((float*)dest)+8,((__m128*)source)[2]);
@@ -204,52 +225,68 @@
 void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
 {
 	unsigned int y;
-	__m128 xmm[4];
+	__m128 xmm[4], a[4], b[4];
 
 //	PREFETCH(in1, _MM_HINT_T0);
 //	PREFETCH(in2, _MM_HINT_T1);
 //	PREFETCH(out, _MM_HINT_T1);
+	b[0] = _mm_load_ps(in2[0]);
+	b[1] = _mm_load_ps(in2[1]);
+	b[2] = _mm_load_ps(in2[2]);
+	b[3] = _mm_load_ps(in2[3]);
 
 	for (y = 0; y < 4; ++y)
 	{
+		a[y] = _mm_load_ps(in1[y]);
+		
 		//load rows as columns
-		xmm[3] = _mm_shuffle_ps (in1[y], in1[y], 0xff);
-		xmm[2] = _mm_shuffle_ps (in1[y], in1[y], 0xaa);
-		xmm[1] = _mm_shuffle_ps (in1[y], in1[y], 0x55);
-		xmm[0] = _mm_shuffle_ps (in1[y], in1[y], 0x00);
+		xmm[3] = _mm_shuffle_ps (a[y], a[y], 0xff);
+		xmm[2] = _mm_shuffle_ps (a[y], a[y], 0xaa);
+		xmm[1] = _mm_shuffle_ps (a[y], a[y], 0x55);
+		xmm[0] = _mm_shuffle_ps (a[y], a[y], 0x00);
 
-		xmm[0] = _mm_mul_ps( xmm[0], in2[0] );
-		xmm[1] = _mm_mul_ps( xmm[1], in2[1] );
-		xmm[2] = _mm_mul_ps( xmm[2], in2[2] );
-		xmm[3] = _mm_mul_ps( xmm[3], in2[3] );
+		xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+		xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+		xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+		xmm[3] = _mm_mul_ps( xmm[3], b[3] );
 
 		xmm[0] = _mm_add_ps( xmm[0], xmm[1] );
 		xmm[2] = _mm_add_ps( xmm[2], xmm[3] );
-		out[y] = _mm_add_ps( xmm[0], xmm[2] );
+		a[y] = _mm_add_ps( xmm[0], xmm[2] );
 	}
+
+	//try to use the CPU's write-combining
+	_mm_store_ps(out[0], a[0]);
+	_mm_store_ps(out[1], a[1]);
+	_mm_store_ps(out[2], a[2]);
+	_mm_store_ps(out[3], a[3]);
 }
 
 //This is an SSE matrix vector multiply, see the standard C++ code
 //for a clear algorithim.  This seems like it works.
 void VectorMultiply4f( const FloatRow* matrix, const FloatRow& p, FloatRow& out )
 {
-	__m128 tmp, XY;
+	__m128 tmp,tmp2, XY, pp;
 	
+	pp=_mm_load_ps(p);
+	
 	//compute term X and term Y and store them in the low order of XY
-	XY = Hadd4( _mm_mul_ps( matrix[0], p ) ); //compute X
-	tmp = Hadd4( _mm_mul_ps( matrix[1], p ) ); //compute Y
+	XY = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[0]), pp ) ); //compute X
+	tmp = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[1]), pp ) ); //compute Y
 	XY = _mm_unpacklo_ps(XY, tmp);
 
 	//compute term Z and term W and store them in the low order of out
-	out = Hadd4( _mm_mul_ps( matrix[2], p ) ); //compute Z
-	tmp = Hadd4( _mm_mul_ps( matrix[3], p ) ); //compute W
-	out = _mm_unpacklo_ps(out, tmp);
+	tmp2 = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[2]), pp ) ); //compute Z
+	tmp = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[3]), pp ) ); //compute W
+	pp = _mm_unpacklo_ps(tmp2, tmp);
 
 	//shuffle the low order of XY into the loworder of out
 	//and shuffle the low order of out into the high order of out
-	out = _mm_movelh_ps(XY, out);
+	tmp = _mm_movelh_ps(XY, pp);
+	
+	_mm_store_ps(out, tmp);
 }
-
+/*
 void ZeroFloatRow(FloatRow& r)
 {
 	r = _mm_setzero_ps();
@@ -264,20 +301,25 @@
 {
 	_mm_store_ps( f, r );
 }
-
+*/
 void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
 {
 	__m128 a,b,c,d,r;//using more registers is faster
+	__m128 t1,t2;
+	
+	t1 = _mm_load_ps(r1);
+	t2 = _mm_load_ps(r2);
 
-	a = _mm_shuffle_ps(r1, r1, 0xc9);
-	b = _mm_shuffle_ps(r2, r2, 0xd2);
+	a = _mm_shuffle_ps(t1, t1, 0xc9);
+	b = _mm_shuffle_ps(t2, t2, 0xd2);
 	r = _mm_mul_ps( a, b );
 
-	c = _mm_shuffle_ps(r2, r2, 0xc9);
-	d = _mm_shuffle_ps(r1, r1, 0xd2);
+	c = _mm_shuffle_ps(t2, t2, 0xc9);
+	d = _mm_shuffle_ps(t1, t1, 0xd2);
 	a = _mm_mul_ps( c, d );
 	r = _mm_sub_ps(r,a);
-	result = r;
+
+	_mm_store_ps(result, r);
 }
 
 #endif


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Hgengine-cvs] SF.net SVN: hgengine:[717] Mercury2/src/MercuryMath.cpp

From: <axl...@us...> - 2010-05-03 02:07:30

Revision: 717
          http://hgengine.svn.sourceforge.net/hgengine/?rev=717&view=rev
Author:   axlecrusher
Date:     2010-05-03 02:07:24 +0000 (Mon, 03 May 2010)

Log Message:
-----------
Don't write results directly to output reference. Use local variables and copy the results when the calculations are finished. This allows the input and output to be the same address.

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp

Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp	2010-05-02 14:32:43 UTC (rev 716)
+++ Mercury2/src/MercuryMath.cpp	2010-05-03 02:07:24 UTC (rev 717)
@@ -41,26 +41,34 @@
 
 void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
+	FloatRow r;
 	for (uint8_t i = 0; i < 4; ++i)
-	    out[i] = first[i] * second[i];
+	    r[i] = first[i] * second[i];
+	Copy4f(out,r);
 }
 
 void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
+	FloatRow r;
 	for (uint8_t i = 0; i < 4; ++i)
 		out[i] = first[i] / second[i];
+	Copy4f(out,r);
 }
 
 void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
+	FloatRow r;
 	for (uint8_t i = 0; i < 4; ++i)
 		out[i] = first[i] + second[i];
+	Copy4f(out,r);
 }
 
 void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
+	FloatRow r;
 	for (uint8_t i = 0; i < 4; ++i)
 		out[i] = first[i] - second[i];
+	Copy4f(out,r);
 }
 
 void Copy4f( void * dest, const void * source )
@@ -82,56 +90,61 @@
 {
 	const float *in1 = *in1a;
 	const float *in2 = *in2a;
-	float *out = *outa;
+	FloatRow r[4];
 	
-	out[0] = in1[0] * in2[0] + in1[1] * in2[4] +
+	(*r)[0] = in1[0] * in2[0] + in1[1] * in2[4] +
 				in1[2] * in2[8] + in1[3] * in2[12];
-	out[1] = in1[0] * in2[1] + in1[1] * in2[5] +
+	(*r)[1] = in1[0] * in2[1] + in1[1] * in2[5] +
 				in1[2] * in2[9] + in1[3] * in2[13];
-	out[2] = in1[0] * in2[2] + in1[1] * in2[6] +
+	(*r)[2] = in1[0] * in2[2] + in1[1] * in2[6] +
 				in1[2] * in2[10] + in1[3] * in2[14];
-	out[3] = in1[0] * in2[3] + in1[1] * in2[7] +
+	(*r)[3] = in1[0] * in2[3] + in1[1] * in2[7] +
 				in1[2] * in2[11] + in1[3] * in2[15];
 
-	out[4] = in1[4] * in2[0] + in1[5] * in2[4] +
+	(*r)[4] = in1[4] * in2[0] + in1[5] * in2[4] +
 				in1[6] * in2[8] + in1[7] * in2[12];
-	out[5] = in1[4] * in2[1] + in1[5] * in2[5] +
+	(*r)[5] = in1[4] * in2[1] + in1[5] * in2[5] +
 				in1[6] * in2[9] + in1[7] * in2[13];
-	out[6] = in1[4] * in2[2] + in1[5] * in2[6] +
+	(*r)[6] = in1[4] * in2[2] + in1[5] * in2[6] +
 				in1[6] * in2[10] + in1[7] * in2[14];
-	out[7] = in1[4] * in2[3] + in1[5] * in2[7] +
+	(*r)[7] = in1[4] * in2[3] + in1[5] * in2[7] +
 				in1[6] * in2[11] + in1[7] * in2[15];
 
-	out[8] = in1[8] * in2[0] + in1[9] * in2[4] +
+	(*r)[8] = in1[8] * in2[0] + in1[9] * in2[4] +
 				in1[10] * in2[8] + in1[11] * in2[12];
-	out[9] = in1[8] * in2[1] + in1[9] * in2[5] +
+	(*r)[9] = in1[8] * in2[1] + in1[9] * in2[5] +
 				in1[10] * in2[9] + in1[11] * in2[13];
-	out[10] = in1[8] * in2[2] + in1[9] * in2[6] +
+	(*r)[10] = in1[8] * in2[2] + in1[9] * in2[6] +
 				in1[10] * in2[10] + in1[11] * in2[14];
-	out[11] = in1[8] * in2[3] + in1[9] * in2[7] +
+	(*r)[11] = in1[8] * in2[3] + in1[9] * in2[7] +
 				in1[10] * in2[11] + in1[11] * in2[15];
 
-	out[12] = in1[12] * in2[0] + in1[13] * in2[4] +
+	(*r)[12] = in1[12] * in2[0] + in1[13] * in2[4] +
 				in1[14] * in2[8] + in1[15] * in2[12];
-	out[13] = in1[12] * in2[1] + in1[13] * in2[5] +
+	(*r)[13] = in1[12] * in2[1] + in1[13] * in2[5] +
 				in1[14] * in2[9] + in1[15] * in2[13];
-	out[14] = in1[12] * in2[2] + in1[13] * in2[6] +
+	(*r)[14] = in1[12] * in2[2] + in1[13] * in2[6] +
 				in1[14] * in2[10] + in1[15] * in2[14];
-	out[15] = in1[12] * in2[3] + in1[13] * in2[7] +
+	(*r)[15] = in1[12] * in2[3] + in1[13] * in2[7] +
 				in1[14] * in2[11] + in1[15] * in2[15];
+
+	Copy16f(outa,r);
 }
 
 void VectorMultiply4f( const FloatRow* matrix, const FloatRow& pa, FloatRow& outa )
 {
+	FloatRow r;
 	const float *m = *matrix;
 	const float *p = pa;
-	float *out = outa;
-	out[0] = p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3];
-	out[1] = p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7];
-	out[2] = p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11];
-	out[3] = p[0] * m[12] + p[1] * m[13] + p[2] * m[14] + p[3] * m[15];
+	
+	r[0] = p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3];
+	r[1] = p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7];
+	r[2] = p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11];
+	r[3] = p[0] * m[12] + p[1] * m[13] + p[2] * m[14] + p[3] * m[15];
+	
+	Copy4f(outa,r);
 }
-
+/*
 void Float2FloatRow(const float* f, FloatRow& r)
 {
 	for (uint8_t i = 0; i < 4; ++i)
@@ -143,12 +156,16 @@
 	for (uint8_t i = 0; i < 4; ++i)
 		f[i] = r[i];
 }
-
+*/
 void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
 {
-	result[0] = r1[1]*r2[2] - r1[2]*r2[1];
-	result[1] = r1[2]*r2[0] - r1[0]*r2[2];
-	result[2] = r1[0]*r2[1] - r1[1]*r2[0];
+	FloatRow r;
+	
+	r[0] = r1[1]*r2[2] - r1[2]*r2[1];
+	r[1] = r1[2]*r2[0] - r1[0]*r2[2];
+	r[2] = r1[0]*r2[1] - r1[1]*r2[0];
+	
+	Copy4f(result,r);
 }
 
 #else


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.