|
From: <axl...@us...> - 2009-01-02 01:20:36
|
Revision: 139
http://hgengine.svn.sourceforge.net/hgengine/?rev=139&view=rev
Author: axlecrusher
Date: 2009-01-02 01:20:29 +0000 (Fri, 02 Jan 2009)
Log Message:
-----------
fix not defined
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2009-01-02 01:19:05 UTC (rev 138)
+++ Mercury2/src/MercuryMath.cpp 2009-01-02 01:20:29 UTC (rev 139)
@@ -1,6 +1,6 @@
#include "MercuryMath.h"
-#if !defined( USE_SSE )
+#ifndef USE_SSE
//Generic Math functions. Compile these if you can not use optimized functions.
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2009-01-04 18:32:07
|
Revision: 146
http://hgengine.svn.sourceforge.net/hgengine/?rev=146&view=rev
Author: axlecrusher
Date: 2009-01-04 18:32:00 +0000 (Sun, 04 Jan 2009)
Log Message:
-----------
convert non optimized functions to FloatRow
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2009-01-04 18:07:42 UTC (rev 145)
+++ Mercury2/src/MercuryMath.cpp 2009-01-04 18:32:00 UTC (rev 146)
@@ -9,36 +9,36 @@
Copy4f(&r, (FloatRow){ 0.0f, 0.0f, 0.0f, 0.0f });
}
-void Mul4f(const float* first, const float* second, float* out)
+void Mul4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
{
- out[0] = first[0] * second[0];
- out[1] = first[1] * second[1];
- out[2] = first[2] * second[2];
- out[3] = first[3] * second[3];
+ ((float*)out)[0] = ((float*)first)[0] * ((float*)second)[0];
+ ((float*)out)[1] = ((float*)first)[1] * ((float*)second)[1];
+ ((float*)out)[2] = ((float*)first)[2] * ((float*)second)[2];
+ ((float*)out)[3] = ((float*)first)[3] * ((float*)second)[3];
}
-void Div4f(const float* first, const float* second, float* out)
+void Div4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
{
- out[0] = first[0] / second[0];
- out[1] = first[1] / second[1];
- out[2] = first[2] / second[2];
- out[3] = first[3] / second[3];
+ ((float*)out)[0] = ((float*)first)[0] / ((float*)second)[0];
+ ((float*)out)[1] = ((float*)first)[1] / ((float*)second)[1];
+ ((float*)out)[2] = ((float*)first)[2] / ((float*)second)[2];
+ ((float*)out)[3] = ((float*)first)[3] / ((float*)second)[3];
}
-void Add4f(const float* first, const float* second, float* out)
+void Add4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
{
- out[0] = first[0] + second[0];
- out[1] = first[1] + second[1];
- out[2] = first[2] + second[2];
- out[3] = first[3] + second[3];
+ ((float*)out)[0] = ((float*)first)[0] + ((float*)second)[0];
+ ((float*)out)[1] = ((float*)first)[1] + ((float*)second)[1];
+ ((float*)out)[2] = ((float*)first)[2] + ((float*)second)[2];
+ ((float*)out)[3] = ((float*)first)[3] + ((float*)second)[3];
}
-void Sub4f(const float* first, const float* second, float* out)
+void Sub4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
{
- out[0] = first[0] - second[0];
- out[1] = first[1] - second[1];
- out[2] = first[2] - second[2];
- out[3] = first[3] - second[3];
+ ((float*)out)[0] = ((float*)first)[0] - ((float*)second)[0];
+ ((float*)out)[1] = ((float*)first)[1] - ((float*)second)[1];
+ ((float*)out)[2] = ((float*)first)[2] - ((float*)second)[2];
+ ((float*)out)[3] = ((float*)first)[3] - ((float*)second)[3];
}
void Copy4f( void * dest, const void * source )
@@ -215,7 +215,7 @@
for (y = 0; y < 4; ++y)
{
- //load columns
+ //load rows as columns
xmm[3] = _mm_shuffle_ps (in1[y], in1[y], 0xff);
xmm[2] = _mm_shuffle_ps (in1[y], in1[y], 0xaa);
xmm[1] = _mm_shuffle_ps (in1[y], in1[y], 0x55);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2009-03-02 19:51:33
|
Revision: 161
http://hgengine.svn.sourceforge.net/hgengine/?rev=161&view=rev
Author: axlecrusher
Date: 2009-03-02 19:51:29 +0000 (Mon, 02 Mar 2009)
Log Message:
-----------
fix
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2009-03-02 19:46:08 UTC (rev 160)
+++ Mercury2/src/MercuryMath.cpp 2009-03-02 19:51:29 UTC (rev 161)
@@ -172,10 +172,11 @@
void Float2FloatRow(const float* f, FloatRow* r)
{
- *r[0] = f[0];
- *r[1] = f[1];
- *r[2] = f[2];
- *r[3] = f[3];
+ float* row = (float*)r;
+ row[0] = f[0];
+ row[1] = f[1];
+ row[2] = f[2];
+ row[3] = f[3];
}
void FloatRow2Float( const FloatRow* fr, float* f)
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2009-03-02 21:07:22
|
Revision: 166
http://hgengine.svn.sourceforge.net/hgengine/?rev=166&view=rev
Author: axlecrusher
Date: 2009-03-02 21:07:12 +0000 (Mon, 02 Mar 2009)
Log Message:
-----------
fix broken SSE vector multiply
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2009-03-02 20:36:05 UTC (rev 165)
+++ Mercury2/src/MercuryMath.cpp 2009-03-02 21:07:12 UTC (rev 166)
@@ -4,7 +4,7 @@
void TransposeMatrix( FloatRow* m )
{
float tmp;
- float *_m = *m;
+ float *_m = (float*)m;
tmp = _m[1];
_m[1] = _m[4];
@@ -284,14 +284,14 @@
//compute term 1 and term 2 and store them in the low order
//of outxmm[0]
- out[0] = Hadd4( _mm_mul_ps( matrix[1], *p ) );
- tmp = Hadd4( _mm_mul_ps( matrix[2], *p ) );
+ out[0] = Hadd4( _mm_mul_ps( matrix[0], *p ) );
+ tmp = Hadd4( _mm_mul_ps( matrix[1], *p ) );
out[0] = _mm_unpacklo_ps(out[0], tmp);
//compute term 3 and term 4 and store them in the high order
//of outxmm[1]
- out[1] = Hadd4( _mm_mul_ps( matrix[3], *p ) );
- tmp = Hadd4( _mm_mul_ps( matrix[4], *p ) );
+ out[1] = Hadd4( _mm_mul_ps( matrix[2], *p ) );
+ tmp = Hadd4( _mm_mul_ps( matrix[3], *p ) );
out[1] = _mm_unpacklo_ps(out[1], tmp);
//shuffle the low order of outxmm[0] into the loworder of tmp
@@ -304,9 +304,9 @@
r = (FloatRow)_mm_setzero_ps();
}
-FloatRow Float2FloatRow(const float* f, , FloatRow* r)
+void Float2FloatRow(const float* f, FloatRow* r)
{
- r = _mm_load_ps( f );
+ *r = _mm_load_ps( f );
}
void FloatRow2Float( const FloatRow* fr, float* f)
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2009-03-02 20:19:56
|
Revision: 164
http://hgengine.svn.sourceforge.net/hgengine/?rev=164&view=rev
Author: axlecrusher
Date: 2009-03-02 20:19:52 +0000 (Mon, 02 Mar 2009)
Log Message:
-----------
dereference not cast
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2009-03-02 20:07:37 UTC (rev 163)
+++ Mercury2/src/MercuryMath.cpp 2009-03-02 20:19:52 UTC (rev 164)
@@ -4,7 +4,7 @@
void TransposeMatrix( FloatRow* m )
{
float tmp;
- float *_m = (float*)m;
+ float *_m = *m;
tmp = _m[1];
_m[1] = _m[4];
@@ -40,34 +40,34 @@
void Mul4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
{
- ((float*)out)[0] = ((float*)first)[0] * ((float*)second)[0];
- ((float*)out)[1] = ((float*)first)[1] * ((float*)second)[1];
- ((float*)out)[2] = ((float*)first)[2] * ((float*)second)[2];
- ((float*)out)[3] = ((float*)first)[3] * ((float*)second)[3];
+ (*out)[0] = (*first)[0] * (*second)[0];
+ (*out)[1] = (*first)[1] * (*second)[1];
+ (*out)[2] = (*first)[2] * (*second)[2];
+ (*out)[3] = (*first)[3] * (*second)[3];
}
void Div4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
{
- ((float*)out)[0] = ((float*)first)[0] / ((float*)second)[0];
- ((float*)out)[1] = ((float*)first)[1] / ((float*)second)[1];
- ((float*)out)[2] = ((float*)first)[2] / ((float*)second)[2];
- ((float*)out)[3] = ((float*)first)[3] / ((float*)second)[3];
+ (*out)[0] = (*first)[0] / (*second)[0];
+ (*out)[1] = (*first)[1] / (*second)[1];
+ (*out)[2] = (*first)[2] / (*second)[2];
+ (*out)[3] = (*first)[3] / (*second)[3];
}
void Add4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
{
- ((float*)out)[0] = ((float*)first)[0] + ((float*)second)[0];
- ((float*)out)[1] = ((float*)first)[1] + ((float*)second)[1];
- ((float*)out)[2] = ((float*)first)[2] + ((float*)second)[2];
- ((float*)out)[3] = ((float*)first)[3] + ((float*)second)[3];
+ (*out)[0] = (*first)[0] + (*second)[0];
+ (*out)[1] = (*first)[1] + (*second)[1];
+ (*out)[2] = (*first)[2] + (*second)[2];
+ (*out)[3] = (*first)[3] + (*second)[3];
}
void Sub4f(const FloatRow* first, const FloatRow* second, FloatRow* out)
{
- ((float*)out)[0] = ((float*)first)[0] - ((float*)second)[0];
- ((float*)out)[1] = ((float*)first)[1] - ((float*)second)[1];
- ((float*)out)[2] = ((float*)first)[2] - ((float*)second)[2];
- ((float*)out)[3] = ((float*)first)[3] - ((float*)second)[3];
+ (*out)[0] = (*first)[0] - (*second)[0];
+ (*out)[1] = (*first)[1] - (*second)[1];
+ (*out)[2] = (*first)[2] - (*second)[2];
+ (*out)[3] = (*first)[3] - (*second)[3];
}
void Copy4f( void * dest, const void * source )
@@ -116,12 +116,10 @@
void MatrixMultiply4f ( const FloatRow* in1a, const FloatRow* in2a, FloatRow* outa)
{
- float *in1, *in2, *out;
+ const float *in1 = *in1a;
+ const float *in2 = *in2a;
+ float *out = *outa;
- in1 = (float*)in1a;
- in2 = (float*)in2a;
- out = (float*)outa;
-
out[0] = in1[0] * in2[0] + in1[1] * in2[4] +
in1[2] * in2[8] + in1[3] * in2[12];
out[1] = in1[0] * in2[1] + in1[1] * in2[5] +
@@ -161,9 +159,9 @@
void VectorMultiply4f( const FloatRow* matrix, const FloatRow* pa, FloatRow* outa )
{
- float *m = (float*)matrix;
- float *p = (float*)pa;
- float *out = (float*)outa;
+ const float *m = *matrix;
+ const float *p = *pa;
+ float *out = *outa;
out[0] = p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3];
out[1] = p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7];
out[2] = p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11];
@@ -172,7 +170,7 @@
void Float2FloatRow(const float* f, FloatRow* r)
{
- float* row = (float*)r;
+ float* row = *r;
row[0] = f[0];
row[1] = f[1];
row[2] = f[2];
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2009-03-03 02:06:54
|
Revision: 169
http://hgengine.svn.sourceforge.net/hgengine/?rev=169&view=rev
Author: axlecrusher
Date: 2009-03-03 02:06:45 +0000 (Tue, 03 Mar 2009)
Log Message:
-----------
Fix nonSSE compile
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2009-03-03 02:02:34 UTC (rev 168)
+++ Mercury2/src/MercuryMath.cpp 2009-03-03 02:06:45 UTC (rev 169)
@@ -157,11 +157,11 @@
in1[14] * in2[11] + in1[15] * in2[15];
}
-void VectorMultiply4f( const FloatRow* matrix, const FloatRow* pa, FloatRow* outa )
+void VectorMultiply4f( const FloatRow* matrix, const FloatRow& pa, FloatRow& outa )
{
const float *m = *matrix;
- const float *p = *pa;
- float *out = *outa;
+ const float *p = pa;
+ float *out = outa;
out[0] = p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3];
out[1] = p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7];
out[2] = p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11];
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2010-04-25 22:34:03
|
Revision: 700
http://hgengine.svn.sourceforge.net/hgengine/?rev=700&view=rev
Author: axlecrusher
Date: 2010-04-25 22:33:57 +0000 (Sun, 25 Apr 2010)
Log Message:
-----------
proper command for copying data with SSE. It tries to avoid polluting caches
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-04-25 20:16:49 UTC (rev 699)
+++ Mercury2/src/MercuryMath.cpp 2010-04-25 22:33:57 UTC (rev 700)
@@ -184,38 +184,21 @@
void Copy4f( void * dest, const void * source )
{
- __m128 xmm;
-
- xmm = _mm_load_ps((float*)source);
- _mm_store_ps((float*)dest, xmm);
+ _mm_stream_ps(((float*)dest),((__m128*)source)[0]);
}
void Copy8f( void * dest, const void * source )
{
- __m128 xmm[2];
-
- xmm[0] = _mm_load_ps((float*)source);
- _mm_store_ps((float*)dest, xmm[0]);
-
- xmm[1] = _mm_load_ps((float*)&(((float*)source)[4]));
- _mm_store_ps((float*)&(((float*)dest)[4]), xmm[1]);
+ _mm_stream_ps(((float*)dest),((__m128*)source)[0]);
+ _mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
}
void Copy16f( void * dest, const void * source )
{
- __m128 xmm[4];
-
- xmm[0] = _mm_load_ps((float*)source);
- _mm_store_ps((float*)dest, xmm[0]);
-
- xmm[1] = _mm_load_ps((float*)&(((float*)source)[4]));
- _mm_store_ps((float*)&(((float*)dest)[4]), xmm[1]);
-
- xmm[2] = _mm_load_ps((float*)&(((float*)source)[8]));
- _mm_store_ps((float*)&(((float*)dest)[8]), xmm[2]);
-
- xmm[3] = _mm_load_ps((float*)&(((float*)source)[12]));
- _mm_store_ps((float*)&(((float*)dest)[12]), xmm[3]);
+ _mm_stream_ps(((float*)dest),((__m128*)source)[0]);
+ _mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
+ _mm_stream_ps(((float*)dest)+8,((__m128*)source)[2]);
+ _mm_stream_ps(((float*)dest)+12,((__m128*)source)[3]);
}
void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2010-04-27 18:16:19
|
Revision: 703
http://hgengine.svn.sourceforge.net/hgengine/?rev=703&view=rev
Author: axlecrusher
Date: 2010-04-27 18:16:10 +0000 (Tue, 27 Apr 2010)
Log Message:
-----------
fix windows compile
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-04-27 00:16:04 UTC (rev 702)
+++ Mercury2/src/MercuryMath.cpp 2010-04-27 18:16:10 UTC (rev 703)
@@ -275,7 +275,8 @@
c = _mm_shuffle_ps(r2, r2, 0xc9);
d = _mm_shuffle_ps(r1, r1, 0xd2);
- r -= _mm_mul_ps( c, d );
+ a = _mm_mul_ps( c, d );
+ a = _mm_sub_ps(r,a);
result = r;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2010-04-28 02:26:38
|
Revision: 708
http://hgengine.svn.sourceforge.net/hgengine/?rev=708&view=rev
Author: axlecrusher
Date: 2010-04-28 02:26:32 +0000 (Wed, 28 Apr 2010)
Log Message:
-----------
fix bug
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-04-28 02:05:26 UTC (rev 707)
+++ Mercury2/src/MercuryMath.cpp 2010-04-28 02:26:32 UTC (rev 708)
@@ -276,7 +276,7 @@
c = _mm_shuffle_ps(r2, r2, 0xc9);
d = _mm_shuffle_ps(r1, r1, 0xd2);
a = _mm_mul_ps( c, d );
- a = _mm_sub_ps(r,a);
+ r = _mm_sub_ps(r,a);
result = r;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2010-05-02 14:32:49
|
Revision: 716
http://hgengine.svn.sourceforge.net/hgengine/?rev=716&view=rev
Author: axlecrusher
Date: 2010-05-02 14:32:43 +0000 (Sun, 02 May 2010)
Log Message:
-----------
fix SSE since FloatRow is no longer __m128
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-04-29 23:53:04 UTC (rev 715)
+++ Mercury2/src/MercuryMath.cpp 2010-05-02 14:32:43 UTC (rev 716)
@@ -164,22 +164,38 @@
void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
- out = _mm_mul_ps( first, second );
+ __m128 a,b,o;
+ a = _mm_load_ps(first);
+ b = _mm_load_ps(second);
+ o = _mm_mul_ps( a, b );
+ _mm_store_ps(out,o);
}
void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
- out = _mm_div_ps( first, second );
+ __m128 a,b,o;
+ a = _mm_load_ps(first);
+ b = _mm_load_ps(second);
+ o = _mm_div_ps( a, b );
+ _mm_store_ps(out,o);
}
void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
- out = _mm_add_ps( first, second );
+ __m128 a,b,o;
+ a = _mm_load_ps(first);
+ b = _mm_load_ps(second);
+ o = _mm_add_ps( a, b );
+ _mm_store_ps(out,o);
}
void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
- out = _mm_sub_ps( first, second );
+ __m128 a,b,o;
+ a = _mm_load_ps(first);
+ b = _mm_load_ps(second);
+ o = _mm_sub_ps( a, b );
+ _mm_store_ps(out,o);
}
void Copy4f( void * dest, const void * source )
@@ -195,6 +211,11 @@
void Copy16f( void * dest, const void * source )
{
+/* _mm_stream_si128((__m128i*)dest,((__m128i*)source)[0]);
+ _mm_stream_si128(&((__m128i*)dest)[1],((__m128i*)source)[1]);
+ _mm_stream_si128(&((__m128i*)dest)[2],((__m128i*)source)[2]);
+ _mm_stream_si128(&((__m128i*)dest)[3],((__m128i*)source)[3]);
+*/
_mm_stream_ps(((float*)dest),((__m128*)source)[0]);
_mm_stream_ps(((float*)dest)+4,((__m128*)source)[1]);
_mm_stream_ps(((float*)dest)+8,((__m128*)source)[2]);
@@ -204,52 +225,68 @@
void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
{
unsigned int y;
- __m128 xmm[4];
+ __m128 xmm[4], a[4], b[4];
// PREFETCH(in1, _MM_HINT_T0);
// PREFETCH(in2, _MM_HINT_T1);
// PREFETCH(out, _MM_HINT_T1);
+ b[0] = _mm_load_ps(in2[0]);
+ b[1] = _mm_load_ps(in2[1]);
+ b[2] = _mm_load_ps(in2[2]);
+ b[3] = _mm_load_ps(in2[3]);
for (y = 0; y < 4; ++y)
{
+ a[y] = _mm_load_ps(in1[y]);
+
//load rows as columns
- xmm[3] = _mm_shuffle_ps (in1[y], in1[y], 0xff);
- xmm[2] = _mm_shuffle_ps (in1[y], in1[y], 0xaa);
- xmm[1] = _mm_shuffle_ps (in1[y], in1[y], 0x55);
- xmm[0] = _mm_shuffle_ps (in1[y], in1[y], 0x00);
+ xmm[3] = _mm_shuffle_ps (a[y], a[y], 0xff);
+ xmm[2] = _mm_shuffle_ps (a[y], a[y], 0xaa);
+ xmm[1] = _mm_shuffle_ps (a[y], a[y], 0x55);
+ xmm[0] = _mm_shuffle_ps (a[y], a[y], 0x00);
- xmm[0] = _mm_mul_ps( xmm[0], in2[0] );
- xmm[1] = _mm_mul_ps( xmm[1], in2[1] );
- xmm[2] = _mm_mul_ps( xmm[2], in2[2] );
- xmm[3] = _mm_mul_ps( xmm[3], in2[3] );
+ xmm[0] = _mm_mul_ps( xmm[0], b[0] );
+ xmm[1] = _mm_mul_ps( xmm[1], b[1] );
+ xmm[2] = _mm_mul_ps( xmm[2], b[2] );
+ xmm[3] = _mm_mul_ps( xmm[3], b[3] );
xmm[0] = _mm_add_ps( xmm[0], xmm[1] );
xmm[2] = _mm_add_ps( xmm[2], xmm[3] );
- out[y] = _mm_add_ps( xmm[0], xmm[2] );
+ a[y] = _mm_add_ps( xmm[0], xmm[2] );
}
+
+ //try to use the CPU's write-combining
+ _mm_store_ps(out[0], a[0]);
+ _mm_store_ps(out[1], a[1]);
+ _mm_store_ps(out[2], a[2]);
+ _mm_store_ps(out[3], a[3]);
}
//This is an SSE matrix vector multiply, see the standard C++ code
//for a clear algorithim. This seems like it works.
void VectorMultiply4f( const FloatRow* matrix, const FloatRow& p, FloatRow& out )
{
- __m128 tmp, XY;
+ __m128 tmp,tmp2, XY, pp;
+ pp=_mm_load_ps(p);
+
//compute term X and term Y and store them in the low order of XY
- XY = Hadd4( _mm_mul_ps( matrix[0], p ) ); //compute X
- tmp = Hadd4( _mm_mul_ps( matrix[1], p ) ); //compute Y
+ XY = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[0]), pp ) ); //compute X
+ tmp = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[1]), pp ) ); //compute Y
XY = _mm_unpacklo_ps(XY, tmp);
//compute term Z and term W and store them in the low order of out
- out = Hadd4( _mm_mul_ps( matrix[2], p ) ); //compute Z
- tmp = Hadd4( _mm_mul_ps( matrix[3], p ) ); //compute W
- out = _mm_unpacklo_ps(out, tmp);
+ tmp2 = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[2]), pp ) ); //compute Z
+ tmp = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[3]), pp ) ); //compute W
+ pp = _mm_unpacklo_ps(tmp2, tmp);
//shuffle the low order of XY into the loworder of out
//and shuffle the low order of out into the high order of out
- out = _mm_movelh_ps(XY, out);
+ tmp = _mm_movelh_ps(XY, pp);
+
+ _mm_store_ps(out, tmp);
}
-
+/*
void ZeroFloatRow(FloatRow& r)
{
r = _mm_setzero_ps();
@@ -264,20 +301,25 @@
{
_mm_store_ps( f, r );
}
-
+*/
void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
{
__m128 a,b,c,d,r;//using more registers is faster
+ __m128 t1,t2;
+
+ t1 = _mm_load_ps(r1);
+ t2 = _mm_load_ps(r2);
- a = _mm_shuffle_ps(r1, r1, 0xc9);
- b = _mm_shuffle_ps(r2, r2, 0xd2);
+ a = _mm_shuffle_ps(t1, t1, 0xc9);
+ b = _mm_shuffle_ps(t2, t2, 0xd2);
r = _mm_mul_ps( a, b );
- c = _mm_shuffle_ps(r2, r2, 0xc9);
- d = _mm_shuffle_ps(r1, r1, 0xd2);
+ c = _mm_shuffle_ps(t2, t2, 0xc9);
+ d = _mm_shuffle_ps(t1, t1, 0xd2);
a = _mm_mul_ps( c, d );
r = _mm_sub_ps(r,a);
- result = r;
+
+ _mm_store_ps(result, r);
}
#endif
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <axl...@us...> - 2010-05-03 02:07:30
|
Revision: 717
http://hgengine.svn.sourceforge.net/hgengine/?rev=717&view=rev
Author: axlecrusher
Date: 2010-05-03 02:07:24 +0000 (Mon, 03 May 2010)
Log Message:
-----------
Don't write results directly to output reference. Use local variables and copy the results when the calculations are finished. This allows the input and output to be the same address.
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-05-02 14:32:43 UTC (rev 716)
+++ Mercury2/src/MercuryMath.cpp 2010-05-03 02:07:24 UTC (rev 717)
@@ -41,26 +41,34 @@
void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
+ FloatRow r;
for (uint8_t i = 0; i < 4; ++i)
- out[i] = first[i] * second[i];
+ r[i] = first[i] * second[i];
+ Copy4f(out,r);
}
void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
+ FloatRow r;
for (uint8_t i = 0; i < 4; ++i)
out[i] = first[i] / second[i];
+ Copy4f(out,r);
}
void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
+ FloatRow r;
for (uint8_t i = 0; i < 4; ++i)
out[i] = first[i] + second[i];
+ Copy4f(out,r);
}
void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
+ FloatRow r;
for (uint8_t i = 0; i < 4; ++i)
out[i] = first[i] - second[i];
+ Copy4f(out,r);
}
void Copy4f( void * dest, const void * source )
@@ -82,56 +90,61 @@
{
const float *in1 = *in1a;
const float *in2 = *in2a;
- float *out = *outa;
+ FloatRow r[4];
- out[0] = in1[0] * in2[0] + in1[1] * in2[4] +
+ (*r)[0] = in1[0] * in2[0] + in1[1] * in2[4] +
in1[2] * in2[8] + in1[3] * in2[12];
- out[1] = in1[0] * in2[1] + in1[1] * in2[5] +
+ (*r)[1] = in1[0] * in2[1] + in1[1] * in2[5] +
in1[2] * in2[9] + in1[3] * in2[13];
- out[2] = in1[0] * in2[2] + in1[1] * in2[6] +
+ (*r)[2] = in1[0] * in2[2] + in1[1] * in2[6] +
in1[2] * in2[10] + in1[3] * in2[14];
- out[3] = in1[0] * in2[3] + in1[1] * in2[7] +
+ (*r)[3] = in1[0] * in2[3] + in1[1] * in2[7] +
in1[2] * in2[11] + in1[3] * in2[15];
- out[4] = in1[4] * in2[0] + in1[5] * in2[4] +
+ (*r)[4] = in1[4] * in2[0] + in1[5] * in2[4] +
in1[6] * in2[8] + in1[7] * in2[12];
- out[5] = in1[4] * in2[1] + in1[5] * in2[5] +
+ (*r)[5] = in1[4] * in2[1] + in1[5] * in2[5] +
in1[6] * in2[9] + in1[7] * in2[13];
- out[6] = in1[4] * in2[2] + in1[5] * in2[6] +
+ (*r)[6] = in1[4] * in2[2] + in1[5] * in2[6] +
in1[6] * in2[10] + in1[7] * in2[14];
- out[7] = in1[4] * in2[3] + in1[5] * in2[7] +
+ (*r)[7] = in1[4] * in2[3] + in1[5] * in2[7] +
in1[6] * in2[11] + in1[7] * in2[15];
- out[8] = in1[8] * in2[0] + in1[9] * in2[4] +
+ (*r)[8] = in1[8] * in2[0] + in1[9] * in2[4] +
in1[10] * in2[8] + in1[11] * in2[12];
- out[9] = in1[8] * in2[1] + in1[9] * in2[5] +
+ (*r)[9] = in1[8] * in2[1] + in1[9] * in2[5] +
in1[10] * in2[9] + in1[11] * in2[13];
- out[10] = in1[8] * in2[2] + in1[9] * in2[6] +
+ (*r)[10] = in1[8] * in2[2] + in1[9] * in2[6] +
in1[10] * in2[10] + in1[11] * in2[14];
- out[11] = in1[8] * in2[3] + in1[9] * in2[7] +
+ (*r)[11] = in1[8] * in2[3] + in1[9] * in2[7] +
in1[10] * in2[11] + in1[11] * in2[15];
- out[12] = in1[12] * in2[0] + in1[13] * in2[4] +
+ (*r)[12] = in1[12] * in2[0] + in1[13] * in2[4] +
in1[14] * in2[8] + in1[15] * in2[12];
- out[13] = in1[12] * in2[1] + in1[13] * in2[5] +
+ (*r)[13] = in1[12] * in2[1] + in1[13] * in2[5] +
in1[14] * in2[9] + in1[15] * in2[13];
- out[14] = in1[12] * in2[2] + in1[13] * in2[6] +
+ (*r)[14] = in1[12] * in2[2] + in1[13] * in2[6] +
in1[14] * in2[10] + in1[15] * in2[14];
- out[15] = in1[12] * in2[3] + in1[13] * in2[7] +
+ (*r)[15] = in1[12] * in2[3] + in1[13] * in2[7] +
in1[14] * in2[11] + in1[15] * in2[15];
+
+ Copy16f(outa,r);
}
void VectorMultiply4f( const FloatRow* matrix, const FloatRow& pa, FloatRow& outa )
{
+ FloatRow r;
const float *m = *matrix;
const float *p = pa;
- float *out = outa;
- out[0] = p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3];
- out[1] = p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7];
- out[2] = p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11];
- out[3] = p[0] * m[12] + p[1] * m[13] + p[2] * m[14] + p[3] * m[15];
+
+ r[0] = p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3];
+ r[1] = p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7];
+ r[2] = p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11];
+ r[3] = p[0] * m[12] + p[1] * m[13] + p[2] * m[14] + p[3] * m[15];
+
+ Copy4f(outa,r);
}
-
+/*
void Float2FloatRow(const float* f, FloatRow& r)
{
for (uint8_t i = 0; i < 4; ++i)
@@ -143,12 +156,16 @@
for (uint8_t i = 0; i < 4; ++i)
f[i] = r[i];
}
-
+*/
void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result)
{
- result[0] = r1[1]*r2[2] - r1[2]*r2[1];
- result[1] = r1[2]*r2[0] - r1[0]*r2[2];
- result[2] = r1[0]*r2[1] - r1[1]*r2[0];
+ FloatRow r;
+
+ r[0] = r1[1]*r2[2] - r1[2]*r2[1];
+ r[1] = r1[2]*r2[0] - r1[0]*r2[2];
+ r[2] = r1[0]*r2[1] - r1[1]*r2[0];
+
+ Copy4f(result,r);
}
#else
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|