|
From: <axl...@us...> - 2010-11-13 21:41:34
|
Revision: 764
http://hgengine.svn.sourceforge.net/hgengine/?rev=764&view=rev
Author: axlecrusher
Date: 2010-11-13 21:41:24 +0000 (Sat, 13 Nov 2010)
Log Message:
-----------
Use unaligned memory operations
Modified Paths:
--------------
Mercury2/src/MercuryMath.cpp
Mercury2/src/MercuryMath.h
Modified: Mercury2/src/MercuryMath.cpp
===================================================================
--- Mercury2/src/MercuryMath.cpp 2010-11-13 21:40:52 UTC (rev 763)
+++ Mercury2/src/MercuryMath.cpp 2010-11-13 21:41:24 UTC (rev 764)
@@ -176,37 +176,37 @@
void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
__m128 a,b,o;
- a = _mm_load_ps(first);
- b = _mm_load_ps(second);
+ a = _mm_loadu_ps(first);
+ b = _mm_loadu_ps(second);
o = _mm_mul_ps( a, b );
- _mm_store_ps(out,o);
+ _mm_storeu_ps(out,o);
}
void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
__m128 a,b,o;
- a = _mm_load_ps(first);
- b = _mm_load_ps(second);
+ a = _mm_loadu_ps(first);
+ b = _mm_loadu_ps(second);
o = _mm_div_ps( a, b );
- _mm_store_ps(out,o);
+ _mm_storeu_ps(out,o);
}
void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
__m128 a,b,o;
- a = _mm_load_ps(first);
- b = _mm_load_ps(second);
+ a = _mm_loadu_ps(first);
+ b = _mm_loadu_ps(second);
o = _mm_add_ps( a, b );
- _mm_store_ps(out,o);
+ _mm_storeu_ps(out,o);
}
void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
{
__m128 a,b,o;
- a = _mm_load_ps(first);
- b = _mm_load_ps(second);
+ a = _mm_loadu_ps(first);
+ b = _mm_loadu_ps(second);
o = _mm_sub_ps( a, b );
- _mm_store_ps(out,o);
+ _mm_storeu_ps(out,o);
}
void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
@@ -327,7 +327,13 @@
{
__m128 tmp,tmp2, XY, pp;
- pp=_mm_load_ps(p);
+ //load and loadu seem to run at nearly the same speed
+ //see benchmark file
+ pp=_mm_loadu_ps(p);
+// pp=_mm_load_ps(p);
+
+ //this function can run long so try to move the output clocer to the CPU while function is running
+ PREFETCH((const char*)out,_MM_HINT_T1);
//compute term X and term Y and store them in the low order of XY
XY = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[0]), pp ) ); //compute X
@@ -343,7 +349,7 @@
//and shuffle the low order of out into the high order of out
tmp = _mm_movelh_ps(XY, pp);
- _mm_store_ps(out, tmp);
+ _mm_storeu_ps(out, tmp);
}
/*
void ZeroFloatRow(FloatRow& r)
Modified: Mercury2/src/MercuryMath.h
===================================================================
--- Mercury2/src/MercuryMath.h 2010-11-13 21:40:52 UTC (rev 763)
+++ Mercury2/src/MercuryMath.h 2010-11-13 21:41:24 UTC (rev 764)
@@ -1,157 +1,158 @@
-#ifndef _MERCURYMATH_H
+#ifndef _MERCURYMATH_H
#define _MERCURYMATH_H
-
-#include <math.h>
-#include <string.h>
-
-#ifdef HGENGINE
-#ifndef WIN32
-#include <configuration.h>
-#endif
-#endif
-
+
+#include <math.h>
+#include <string.h>
+
+#ifdef HGENGINE
+#ifndef WIN32
+#include <configuration.h>
+#endif
+#endif
+
#if defined(__GNUC__)
#define VC_ALIGN(n)
#define CC_ALIGN(n) __attribute__((aligned(n)))
#else
#define VC_ALIGN(n) __declspec(align(n))
#define CC_ALIGN(n)
-#endif
-
-#ifdef USE_SSE
-#include <xmmintrin.h>
-#define PREFETCH(a,sel) _mm_prefetch(a,sel); //prefetch a cache line (64 bytes)
-#else
-#define PREFETCH(a,sel) ; //prefetch a cache line (64 bytes)
-#endif
-/*
-VC_ALIGN(16) class FloatRow
-{
- public:
- inline operator float*() { return (float*)&m_floats; }
- inline operator const float*() const { return (const float*)&m_floats; }
-
-#ifndef USE_SSE
- float m_floats[4];
-#else
- inline FloatRow& operator=(const __m128& f) { m_floats=f; return *this; }
-
- inline operator __m128&() { return m_floats; }
- inline operator const __m128&() const { return m_floats; }
-// __m128 m_floats __attribute__((aligned(16)));
- __m128 m_floats;
-#endif
-} CC_ALIGN(16);
-*/
-
-typedef VC_ALIGN(16) float FloatRow[4] CC_ALIGN(16);
-
-#ifdef WIN32
-#include <limits>
-#define INFINITY (std::numeric_limits<float>::infinity())
-#else
-#define MAXINT (0x7FFFFFFF)
-#endif
-
-
-void ZeroFloatRow(FloatRow& r);
-
-#define DEGRAD 0.01745329251994329576f //degree to radian
-#define RADDEG 57.2957795130823208767f //radian to degree
-#define Q_PI 3.14159265358979323846f
-
-template<typename t, unsigned i>
-inline void COPY(const t* s, t*d)
-{
- d[i-1] = s[i-1];
- COPY<t,i-1>(s,d);
-}
-
-template<> inline void COPY<float,0>(const float* s, float* d)
-{
- d[0] = s[0];
-}
-
-#if defined(WIN32)
-//In win32, sin works faster than sinf and similar functions
-#define SIN( x ) float( sin ( x ) )
-#define COS( x ) float( cos ( x ) )
-#define ATAN2( x, y ) float( atan2( x, y ) )
-#define ASIN( x ) float( asin ( x ) )
-#define ACOS( x ) float( acos ( x ) )
-#define SQRT( x ) float( sqrt ( x ) )
-#define TAN( x ) float( tan ( x ) )
-#define ABS( x ) float( ((x<0)?(-x):(x)) )
-inline int LRINTF(float x) { int r = (int)x; (x-r)>=0.5?++r:0; return r; };
-#else
-//On other OSes in general, sinf works faster than floating a sin
-#define SIN( x ) sinf( x )
-#define COS( x ) cosf( x )
-#define ATAN2( x, y ) atan2f( x, y )
-#define ASIN( x ) asinf ( x )
-#define ACOS( x ) acosf ( x )
-#define SQRT( x ) sqrtf( x )
-#define TAN( x ) tanf( x )
-#define ABS( x ) ((x<0)?(-x):(x))
-#define LRINTF( x ) lrintf( x )
-#endif
-
-#define SQ(x) ((x)*(x));
-
-//#define DotProduct(x,y) ((x)[0]*(y)[0]+(x)[1]*(y)[1]+(x)[2]*(y)[2])
-
-void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-inline void Copy4f( void * dest, const void * source ) { memcpy(dest,source,16); }
-inline void Copy8f( void * dest, const void * source ) { memcpy(dest,source,32); }
-inline void Copy16f( void * dest, const void * source ) { memcpy(dest,source,64); }
-void MatrixMultiply4f ( const FloatRow* in1, const FloatRow* in2, FloatRow* out );
-void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
-void TransposeMatrix(const FloatRow* matrix, FloatRow* out);
-void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
-void LoadIdentity(FloatRow* matrix);
-
-//http://graphics.stanford.edu/~seander/bithacks.html
-inline unsigned int SetBit(unsigned int x, unsigned int mask, bool t)
-{
-#if defined(WIN32)
- #pragma warning( disable : 4804 )
-#endif
- return ((x & ~mask) | (-t & mask)); /*superscalar CPU version*/
-}
-inline bool GetBit(unsigned int x, unsigned int mask) { return ((x & mask)>0); }
-
-//void Float2FloatRow(const float* f, FloatRow& r);
-//void FloatRow2Float(const FloatRow& fr, float* f);
-
-const FloatRow gfrZero = { 0.f, 0.f, 0.f, 0.f };
-
-#endif
-
-/*
- * (c) 2006 Joshua Allen
- * All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, and/or sell copies of the Software, and to permit persons to
- * whom the Software is furnished to do so, provided that the above
- * copyright notice(s) and this permission notice appear in all copies of
- * the Software and that both the above copyright notice(s) and this
- * permission notice appear in supporting documentation.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
- * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
- * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
- * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
+#endif
+
+#ifdef USE_SSE
+#include <xmmintrin.h>
+#define PREFETCH(a,sel) _mm_prefetch(a,sel); //prefetch a cache line (64 bytes)
+#else
+#define PREFETCH(a,sel) ; //prefetch a cache line (64 bytes)
+#endif
+/*
+VC_ALIGN(16) class FloatRow
+{
+ public:
+ inline operator float*() { return (float*)&m_floats; }
+ inline operator const float*() const { return (const float*)&m_floats; }
+
+#ifndef USE_SSE
+ float m_floats[4];
+#else
+ inline FloatRow& operator=(const __m128& f) { m_floats=f; return *this; }
+
+ inline operator __m128&() { return m_floats; }
+ inline operator const __m128&() const { return m_floats; }
+// __m128 m_floats __attribute__((aligned(16)));
+ __m128 m_floats;
+#endif
+} CC_ALIGN(16);
+*/
+
+typedef VC_ALIGN(16) float FloatRow[4] CC_ALIGN(16);
+//typedef float FloatRow[4];
+
+#ifdef WIN32
+#include <limits>
+#define INFINITY (std::numeric_limits<float>::infinity())
+#else
+#define MAXINT (0x7FFFFFFF)
+#endif
+
+
+void ZeroFloatRow(FloatRow& r);
+
+#define DEGRAD 0.01745329251994329576f //degree to radian
+#define RADDEG 57.2957795130823208767f //radian to degree
+#define Q_PI 3.14159265358979323846f
+
+template<typename t, unsigned i>
+inline void COPY(const t* s, t*d)
+{
+ d[i-1] = s[i-1];
+ COPY<t,i-1>(s,d);
+}
+
+template<> inline void COPY<float,0>(const float* s, float* d)
+{
+ d[0] = s[0];
+}
+
+#if defined(WIN32)
+//In win32, sin works faster than sinf and similar functions
+#define SIN( x ) float( sin ( x ) )
+#define COS( x ) float( cos ( x ) )
+#define ATAN2( x, y ) float( atan2( x, y ) )
+#define ASIN( x ) float( asin ( x ) )
+#define ACOS( x ) float( acos ( x ) )
+#define SQRT( x ) float( sqrt ( x ) )
+#define TAN( x ) float( tan ( x ) )
+#define ABS( x ) float( ((x<0)?(-x):(x)) )
+inline int LRINTF(float x) { int r = (int)x; (x-r)>=0.5?++r:0; return r; };
+#else
+//On other OSes in general, sinf works faster than floating a sin
+#define SIN( x ) sinf( x )
+#define COS( x ) cosf( x )
+#define ATAN2( x, y ) atan2f( x, y )
+#define ASIN( x ) asinf ( x )
+#define ACOS( x ) acosf ( x )
+#define SQRT( x ) sqrtf( x )
+#define TAN( x ) tanf( x )
+#define ABS( x ) ((x<0)?(-x):(x))
+#define LRINTF( x ) lrintf( x )
+#endif
+
+#define SQ(x) ((x)*(x));
+
+//#define DotProduct(x,y) ((x)[0]*(y)[0]+(x)[1]*(y)[1]+(x)[2]*(y)[2])
+
+void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
+void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
+void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
+void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
+inline void Copy4f( void * dest, const void * source ) { memcpy(dest,source,16); }
+inline void Copy8f( void * dest, const void * source ) { memcpy(dest,source,32); }
+inline void Copy16f( void * dest, const void * source ) { memcpy(dest,source,64); }
+void MatrixMultiply4f ( const FloatRow* in1, const FloatRow* in2, FloatRow* out );
+void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
+void TransposeMatrix(const FloatRow* matrix, FloatRow* out);
+void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
+void LoadIdentity(FloatRow* matrix);
+
+//http://graphics.stanford.edu/~seander/bithacks.html
+inline unsigned int SetBit(unsigned int x, unsigned int mask, bool t)
+{
+#if defined(WIN32)
+ #pragma warning( disable : 4804 )
+#endif
+ return ((x & ~mask) | (-t & mask)); /*superscalar CPU version*/
+}
+inline bool GetBit(unsigned int x, unsigned int mask) { return ((x & mask)>0); }
+
+//void Float2FloatRow(const float* f, FloatRow& r);
+//void FloatRow2Float(const FloatRow& fr, float* f);
+
+const FloatRow gfrZero = { 0.f, 0.f, 0.f, 0.f };
+
+#endif
+
+/*
+ * (c) 2006 Joshua Allen
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, and/or sell copies of the Software, and to permit persons to
+ * whom the Software is furnished to do so, provided that the above
+ * copyright notice(s) and this permission notice appear in all copies of
+ * the Software and that both the above copyright notice(s) and this
+ * permission notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|