[Hgengine-cvs] SF.net SVN: hgengine:[764] Mercury2/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 764
          http://hgengine.svn.sourceforge.net/hgengine/?rev=764&view=rev
Author:   axlecrusher
Date:     2010-11-13 21:41:24 +0000 (Sat, 13 Nov 2010)

Log Message:
-----------
Use unaligned memory operations

Modified Paths:
--------------
    Mercury2/src/MercuryMath.cpp
    Mercury2/src/MercuryMath.h

Modified: Mercury2/src/MercuryMath.cpp
===================================================================

--- Mercury2/src/MercuryMath.cpp	2010-11-13 21:40:52 UTC (rev 763)
+++ Mercury2/src/MercuryMath.cpp	2010-11-13 21:41:24 UTC (rev 764)
@@ -176,37 +176,37 @@
 void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
 	__m128 a,b,o;
-	a = _mm_load_ps(first);
-	b = _mm_load_ps(second);
+	a = _mm_loadu_ps(first);
+	b = _mm_loadu_ps(second);
 	o = _mm_mul_ps( a, b );
-	_mm_store_ps(out,o);
+	_mm_storeu_ps(out,o);
 }
 
 void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
 	__m128 a,b,o;
-	a = _mm_load_ps(first);
-	b = _mm_load_ps(second);
+	a = _mm_loadu_ps(first);
+	b = _mm_loadu_ps(second);
 	o = _mm_div_ps( a, b );
-	_mm_store_ps(out,o);
+	_mm_storeu_ps(out,o);
 }
 
 void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
 	__m128 a,b,o;
-	a = _mm_load_ps(first);
-	b = _mm_load_ps(second);
+	a = _mm_loadu_ps(first);
+	b = _mm_loadu_ps(second);
 	o = _mm_add_ps( a, b );
-	_mm_store_ps(out,o);
+	_mm_storeu_ps(out,o);
 }
 
 void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out)
 {
 	__m128 a,b,o;
-	a = _mm_load_ps(first);
-	b = _mm_load_ps(second);
+	a = _mm_loadu_ps(first);
+	b = _mm_loadu_ps(second);
 	o = _mm_sub_ps( a, b );
-	_mm_store_ps(out,o);
+	_mm_storeu_ps(out,o);
 }
 
 void MatrixMultiply4f( const FloatRow* in1, const FloatRow* in2, FloatRow* out)
@@ -327,7 +327,13 @@
 {
 	__m128 tmp,tmp2, XY, pp;
 	
-	pp=_mm_load_ps(p);
+	//load and loadu seem to run at nearly the same speed
+	//see benchmark file
+	pp=_mm_loadu_ps(p);
+//	pp=_mm_load_ps(p);
+
+	//this function can run long, so try to move the output closer to the CPU while the function is running
+	PREFETCH((const char*)out,_MM_HINT_T1);
 	
 	//compute term X and term Y and store them in the low order of XY
 	XY = Hadd4( _mm_mul_ps( _mm_load_ps(matrix[0]), pp ) ); //compute X
@@ -343,7 +349,7 @@
 	//and shuffle the low order of out into the high order of out
 	tmp = _mm_movelh_ps(XY, pp);
 	
-	_mm_store_ps(out, tmp);
+	_mm_storeu_ps(out, tmp);
 }
 /*
 void ZeroFloatRow(FloatRow& r)

Modified: Mercury2/src/MercuryMath.h
===================================================================
--- Mercury2/src/MercuryMath.h	2010-11-13 21:40:52 UTC (rev 763)
+++ Mercury2/src/MercuryMath.h	2010-11-13 21:41:24 UTC (rev 764)
@@ -1,157 +1,158 @@
-#ifndef _MERCURYMATH_H
+#ifndef _MERCURYMATH_H
 #define _MERCURYMATH_H
-
-#include <math.h>
-#include <string.h>
-
-#ifdef HGENGINE
-#ifndef WIN32
-#include <configuration.h>
-#endif
-#endif
-
+
+#include <math.h>
+#include <string.h>
+
+#ifdef HGENGINE
+#ifndef WIN32
+#include <configuration.h>
+#endif
+#endif
+
 #if defined(__GNUC__)
 #define VC_ALIGN(n)
 #define CC_ALIGN(n) __attribute__((aligned(n)))
 #else
 #define VC_ALIGN(n) __declspec(align(n)) 
 #define CC_ALIGN(n)
-#endif
-
-#ifdef USE_SSE
-#include <xmmintrin.h>
-#define PREFETCH(a,sel) _mm_prefetch(a,sel); //prefetch a cache line (64 bytes)
-#else
-#define PREFETCH(a,sel) ; //prefetch a cache line (64 bytes)
-#endif
-/*
-VC_ALIGN(16) class FloatRow
-{
-	public:
-		inline operator float*() { return (float*)&m_floats; }
-		inline operator const float*() const { return (const float*)&m_floats; }
-		
-#ifndef USE_SSE
-		float m_floats[4];
-#else
-		inline FloatRow& operator=(const __m128& f) { m_floats=f; return *this; }
-		
-		inline operator __m128&() { return m_floats; }
-		inline operator const __m128&() const { return m_floats; }
-//		__m128 m_floats __attribute__((aligned(16)));
-		__m128 m_floats;
-#endif
-} CC_ALIGN(16);
-*/
-
-typedef VC_ALIGN(16) float FloatRow[4] CC_ALIGN(16);
-
-#ifdef WIN32
-#include <limits>
-#define INFINITY  (std::numeric_limits<float>::infinity())
-#else
-#define MAXINT	(0x7FFFFFFF)
-#endif
-
-
-void ZeroFloatRow(FloatRow& r);
-
-#define DEGRAD	0.01745329251994329576f		//degree to radian
-#define RADDEG	57.2957795130823208767f		//radian to degree
-#define	Q_PI	3.14159265358979323846f
-
-template<typename t, unsigned i>
-inline void COPY(const t* s, t*d)
-{
-	d[i-1] = s[i-1];
-	COPY<t,i-1>(s,d);
-}
-
-template<> inline void COPY<float,0>(const float* s, float* d)
-{
-	d[0] = s[0];
-}
-
-#if defined(WIN32)
-//In win32, sin works faster than sinf and similar functions
-#define SIN( x )		float( sin ( x ) )
-#define COS( x )		float( cos ( x ) )
-#define ATAN2( x, y )	float( atan2( x, y ) )
-#define ASIN( x )		float( asin ( x ) )
-#define ACOS( x )		float( acos ( x ) )
-#define SQRT( x )		float( sqrt ( x ) )
-#define	TAN( x )		float( tan ( x ) )
-#define ABS( x )		float( ((x<0)?(-x):(x)) )
-inline int LRINTF(float x) { int r = (int)x; (x-r)>=0.5?++r:0; return r; };
-#else
-//On other OSes in general, sinf works faster than floating a sin
-#define SIN( x )		sinf( x )
-#define COS( x )		cosf( x )
-#define ATAN2( x, y )	atan2f( x, y )
-#define ASIN( x )		asinf ( x )
-#define ACOS( x )		acosf ( x )
-#define SQRT( x )		sqrtf( x )
-#define TAN( x )		tanf( x )
-#define ABS( x )		((x<0)?(-x):(x))
-#define LRINTF( x )     lrintf( x )
-#endif
-
-#define SQ(x) ((x)*(x));
-
-//#define DotProduct(x,y) ((x)[0]*(y)[0]+(x)[1]*(y)[1]+(x)[2]*(y)[2])
-
-void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
-inline void Copy4f( void * dest, const void * source ) { memcpy(dest,source,16); }
-inline void Copy8f( void * dest, const void * source ) { memcpy(dest,source,32); }
-inline void Copy16f( void * dest, const void * source ) { memcpy(dest,source,64); }
-void MatrixMultiply4f ( const FloatRow* in1, const FloatRow* in2, FloatRow* out );
-void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
-void TransposeMatrix(const FloatRow* matrix, FloatRow* out);
-void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
-void LoadIdentity(FloatRow* matrix);
-
-//http://graphics.stanford.edu/~seander/bithacks.html
-inline unsigned int SetBit(unsigned int x, unsigned int mask, bool t)
-{
-#if defined(WIN32)
-	#pragma warning( disable : 4804 )
-#endif
-	return ((x & ~mask) | (-t & mask));  /*superscalar CPU version*/
-}
-inline bool GetBit(unsigned int x, unsigned int mask) { return ((x & mask)>0); }
-
-//void Float2FloatRow(const float* f, FloatRow& r);
-//void FloatRow2Float(const FloatRow& fr, float* f);
-
-const FloatRow gfrZero = { 0.f, 0.f, 0.f, 0.f };
-
-#endif
-
-/*
- * (c) 2006 Joshua Allen
- * All rights reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, and/or sell copies of the Software, and to permit persons to
- * whom the Software is furnished to do so, provided that the above
- * copyright notice(s) and this permission notice appear in all copies of
- * the Software and that both the above copyright notice(s) and this
- * permission notice appear in supporting documentation.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
- * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
- * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
- * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
+#endif
+
+#ifdef USE_SSE
+#include <xmmintrin.h>
+#define PREFETCH(a,sel) _mm_prefetch(a,sel); //prefetch a cache line (64 bytes)
+#else
+#define PREFETCH(a,sel) ; //prefetch a cache line (64 bytes)
+#endif
+/*
+VC_ALIGN(16) class FloatRow
+{
+	public:
+		inline operator float*() { return (float*)&m_floats; }
+		inline operator const float*() const { return (const float*)&m_floats; }
+		
+#ifndef USE_SSE
+		float m_floats[4];
+#else
+		inline FloatRow& operator=(const __m128& f) { m_floats=f; return *this; }
+		
+		inline operator __m128&() { return m_floats; }
+		inline operator const __m128&() const { return m_floats; }
+//		__m128 m_floats __attribute__((aligned(16)));
+		__m128 m_floats;
+#endif
+} CC_ALIGN(16);
+*/
+
+typedef VC_ALIGN(16) float FloatRow[4] CC_ALIGN(16);
+//typedef float FloatRow[4];
+
+#ifdef WIN32
+#include <limits>
+#define INFINITY  (std::numeric_limits<float>::infinity())
+#else
+#define MAXINT	(0x7FFFFFFF)
+#endif
+
+
+void ZeroFloatRow(FloatRow& r);
+
+#define DEGRAD	0.01745329251994329576f		//degree to radian
+#define RADDEG	57.2957795130823208767f		//radian to degree
+#define	Q_PI	3.14159265358979323846f
+
+template<typename t, unsigned i>
+inline void COPY(const t* s, t*d)
+{
+	d[i-1] = s[i-1];
+	COPY<t,i-1>(s,d);
+}
+
+template<> inline void COPY<float,0>(const float* s, float* d)
+{
+	d[0] = s[0];
+}
+
+#if defined(WIN32)
+//In win32, sin works faster than sinf and similar functions
+#define SIN( x )		float( sin ( x ) )
+#define COS( x )		float( cos ( x ) )
+#define ATAN2( x, y )	float( atan2( x, y ) )
+#define ASIN( x )		float( asin ( x ) )
+#define ACOS( x )		float( acos ( x ) )
+#define SQRT( x )		float( sqrt ( x ) )
+#define	TAN( x )		float( tan ( x ) )
+#define ABS( x )		float( ((x<0)?(-x):(x)) )
+inline int LRINTF(float x) { int r = (int)x; (x-r)>=0.5?++r:0; return r; };
+#else
+//On other OSes in general, sinf works faster than casting the result of sin to float
+#define SIN( x )		sinf( x )
+#define COS( x )		cosf( x )
+#define ATAN2( x, y )	atan2f( x, y )
+#define ASIN( x )		asinf ( x )
+#define ACOS( x )		acosf ( x )
+#define SQRT( x )		sqrtf( x )
+#define TAN( x )		tanf( x )
+#define ABS( x )		((x<0)?(-x):(x))
+#define LRINTF( x )     lrintf( x )
+#endif
+
+#define SQ(x) ((x)*(x));
+
+//#define DotProduct(x,y) ((x)[0]*(y)[0]+(x)[1]*(y)[1]+(x)[2]*(y)[2])
+
+void Mul4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
+void Div4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
+void Add4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
+void Sub4f(const FloatRow& first, const FloatRow& second, FloatRow& out);
+inline void Copy4f( void * dest, const void * source ) { memcpy(dest,source,16); }
+inline void Copy8f( void * dest, const void * source ) { memcpy(dest,source,32); }
+inline void Copy16f( void * dest, const void * source ) { memcpy(dest,source,64); }
+void MatrixMultiply4f ( const FloatRow* in1, const FloatRow* in2, FloatRow* out );
+void VectorMultiply4f(const FloatRow* matrix, const FloatRow& p, FloatRow& out );
+void TransposeMatrix(const FloatRow* matrix, FloatRow* out);
+void MMCrossProduct( const FloatRow& r1, const FloatRow& r2, FloatRow& result);
+void LoadIdentity(FloatRow* matrix);
+
+//http://graphics.stanford.edu/~seander/bithacks.html
+inline unsigned int SetBit(unsigned int x, unsigned int mask, bool t)
+{
+#if defined(WIN32)
+	#pragma warning( disable : 4804 )
+#endif
+	return ((x & ~mask) | (-t & mask));  /*superscalar CPU version*/
+}
+inline bool GetBit(unsigned int x, unsigned int mask) { return ((x & mask)>0); }
+
+//void Float2FloatRow(const float* f, FloatRow& r);
+//void FloatRow2Float(const FloatRow& fr, float* f);
+
+const FloatRow gfrZero = { 0.f, 0.f, 0.f, 0.f };
+
+#endif
+
+/*
+ * (c) 2006 Joshua Allen
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, and/or sell copies of the Software, and to permit persons to
+ * whom the Software is furnished to do so, provided that the above
+ * copyright notice(s) and this permission notice appear in all copies of
+ * the Software and that both the above copyright notice(s) and this
+ * permission notice appear in supporting documentation.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+ * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.