[Mplayerxp-cvslog] SF.net SVN: mplayerxp:[130] mplayerxp
From: <nic...@us...> - 2010-01-28 15:43:41
Revision: 130
          http://mplayerxp.svn.sourceforge.net/mplayerxp/?rev=130&view=rev
Author:   nickols_k
Date:     2010-01-28 15:43:33 +0000 (Thu, 28 Jan 2010)

Log Message:
-----------
minor speedup and fixes

Modified Paths:
--------------
    TOOLS/asmopt.c
    TOOLS/asmopt_template.h
    mplayerxp/libvo/aclib_template.c
    mplayerxp/libvo/osd_template.c
    mplayerxp/postproc/dsp_accel.h
    mplayerxp/pvector/pvector.h
    mplayerxp/pvector/pvector_f32_x86.h
    mplayerxp/pvector/pvector_int_x86.h

Modified: TOOLS/asmopt.c
===================================================================
--- TOOLS/asmopt.c	2010-01-28 15:41:48 UTC (rev 129)
+++ TOOLS/asmopt.c	2010-01-28 15:43:33 UTC (rev 130)
@@ -32,7 +32,7 @@
 #define INIT_ARRAYS(x) \
 {\
     for(i=0; i<x; i++) srca[i] = i; \
-    for(i=0; i<x; i++) src[i] = i+64; \
+    for(i=0; i<x; i++) src[i] = i-61; \
     for(i=0; i<x; i++) dsta[i] = i+128; \
 }
 
@@ -44,7 +44,7 @@
     gettimeofday(&tv,&tz);
 //    s=tv.tv_usec;s*=0.000001;s+=tv.tv_sec;
     return (tv.tv_sec*1000000+tv.tv_usec);
-}
+}
 
 static inline unsigned long long int read_tsc( void )
 {

Modified: TOOLS/asmopt_template.h
===================================================================
--- TOOLS/asmopt_template.h	2010-01-28 15:41:48 UTC (rev 129)
+++ TOOLS/asmopt_template.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -17,7 +17,7 @@
 #endif
     uint8_t *out_data = dstbase;
     uint8_t *in_data = src;
-    
+
     unsigned i,len;
     i = 0;
     len = asize;
@@ -30,19 +30,13 @@
     for(;i<len;i+=__IVEC_SIZE){
 	__ivec ind,itmp[2];
 	ind = _ivec_loadu(&((uint8_t *)in_data)[i]);
-#if 0 /* slower but portable on non-x86 CPUs version */
-	itmp[0]= _ivec_sll_s16_imm(_ivec_u16_from_lou8(ind),8);
-	itmp[1]= _ivec_sll_s16_imm(_ivec_u16_from_hiu8(ind),8);
-#else
-	itmp[0]= _ivec_interleave_lo_u8(izero,ind);
-	itmp[1]= _ivec_interleave_hi_u8(izero,ind);
-#endif
+	itmp[0]= _ivec_s16_from_s8(ind,&itmp[1]);
 	_ivec_storea(&((uint16_t*)out_data)[i],itmp[0]);
 	_ivec_storea(&((uint16_t*)out_data)[i+__IVEC_SIZE/2],itmp[1]);
     }
 #endif
     for(;i<len;i++)
-	((uint16_t*)out_data)[i]=((uint16_t)((uint8_t*)in_data)[i])<<8;
+	((int16_t*)out_data)[i]=((int16_t)((int8_t*)in_data)[i]);
 #ifdef HAVE_INT_PVECTOR
     _ivec_empty();
     _ivec_sfence();

Modified: mplayerxp/libvo/aclib_template.c
===================================================================
--- mplayerxp/libvo/aclib_template.c	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/libvo/aclib_template.c	2010-01-28 15:43:33 UTC (rev 130)
@@ -158,6 +158,7 @@
 }
 
 #undef MEM_STORE
+#undef MEM_SFENCE
 #define MEM_STORE _ivec_stream
 #define MEM_SFENCE _ivec_sfence();
 static inline void * PVECTOR_RENAME(fast_stream_copy)(void * to, const void * from, size_t len)
@@ -167,6 +168,7 @@
 }
 
 #undef MEM_STORE
+#undef MEM_SFENCE
 #define MEM_STORE _ivec_storea
 #define MEM_SFENCE
 static inline void * PVECTOR_RENAME(fast_memcpy)(void * to, const void * from, size_t len)

Modified: mplayerxp/libvo/osd_template.c
===================================================================
--- mplayerxp/libvo/osd_template.c	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/libvo/osd_template.c	2010-01-28 15:43:33 UTC (rev 130)
@@ -64,10 +64,11 @@
 	_ivec_prefetch(&src[x]);
 	_ivec_prefetch(&srca[x]);
 /* MOVNTDQ: #GP(0) - If memory operand is not aligned on a 16-byte boundary */
+	if(!IVEC_ALIGNED(dstbase))
 	for(;x<w;x++){
 	    unsigned char *dst=&dstbase[x];
 	    if(srca[x]) *dst=((dstbase[x]*srca[x])>>8)+src[x];
-	    if((((long)dst)&(__IVEC_SIZE-1))==0) break; /* align on sizeof(MMREG) boundary */
+	    if(IVEC_ALIGNED(dst)) break; /* align on sizeof(MMREG) boundary */
 	}
 	if((w-x)>=__IVEC_SIZE)
 	for(;x<w;x+=__IVEC_SIZE){
@@ -76,13 +77,17 @@
 	    _ivec_prefetch(&src[x+__IVEC_SIZE*4]);
 	    _ivec_prefetch(&srca[x+__IVEC_SIZE*4]);
 	    vdest = _ivec_loada(&dstbase[x]);
-	    vsrc = _ivec_loadu(&src[x]);
-	    vsrca = _ivec_loadu(&srca[x]);
+	    if(IVEC_ALIGNED(&src[x]))
+		vsrc = _ivec_loada(&src[x]);
+	    else
+		vsrc = _ivec_loadu(&src[x]);
+	    if(IVEC_ALIGNED(&srca[x]))
+		vsrca = _ivec_loada(&srca[x]);
+	    else
+		vsrca = _ivec_loadu(&srca[x]);
 	    vmsk = _ivec_not(_ivec_cmpeq_s8(vsrca,vzero));
-	    vt[0] = _ivec_u16_from_lou8(vdest);
-	    vt[1] = _ivec_u16_from_hiu8(vdest);
-	    vt[2] = _ivec_u16_from_lou8(vsrca);
-	    vt[3] = _ivec_u16_from_hiu8(vsrca);
+	    vt[0] = _ivec_u16_from_u8(vdest,&vt[1]);
+	    vt[2] = _ivec_u16_from_u8(vsrca,&vt[3]);
 	    vt[0] = _ivec_srl_s16_imm(_ivec_mullo_s16(vt[0],vt[2]),8);
 	    vt[1] = _ivec_srl_s16_imm(_ivec_mullo_s16(vt[1],vt[3]),8);
 	    vt[0] = _ivec_add_s8(_ivec_u8_from_u16(vt[0],vt[1]),vsrc);
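The osd_template.c hunks above swap the open-coded pointer test for the new IVEC_ALIGNED() macro and pick aligned or unaligned loads per source at run time; only the destination has to be forced onto a 16-byte boundary, because aligned and non-temporal stores fault otherwise. For readers unfamiliar with the pattern, here is a minimal stand-alone sketch in plain SSE2 (ad-hoc names, not the project's _ivec_ wrappers):

/* Scalar iterations run until the destination is 16-byte aligned; each
 * source then picks an aligned or unaligned load. Illustrative only. */
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stddef.h>

#define VEC_ALIGNED(p) ((((uintptr_t)(const void *)(p)) & 15) == 0)

static void copy_bytes_sketch(uint8_t *dst, const uint8_t *src, size_t n)
{
    size_t i = 0;
    if (!VEC_ALIGNED(dst))                            /* scalar prologue */
        for (; i < n && !VEC_ALIGNED(dst + i); i++)
            dst[i] = src[i];
    for (; i + 16 <= n; i += 16) {                    /* vector body */
        __m128i v = VEC_ALIGNED(src + i)
                  ? _mm_load_si128((const __m128i *)(src + i))
                  : _mm_loadu_si128((const __m128i *)(src + i));
        _mm_store_si128((__m128i *)(dst + i), v);     /* dst is aligned here */
    }
    for (; i < n; i++)                                /* scalar epilogue */
        dst[i] = src[i];
}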
Modified: mplayerxp/postproc/dsp_accel.h
===================================================================
--- mplayerxp/postproc/dsp_accel.h	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/postproc/dsp_accel.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -3,27 +3,22 @@
 static void __FASTCALL__ PVECTOR_RENAME(int8_to_int16)(const int8_t* in_data, int16_t* out_data, unsigned len, int final)
 {
-#ifdef HAVE_INT_PVECTOR
-    __ivec izero = _ivec_setzero();
-#endif
     unsigned i;
     i = 0;
 #ifdef HAVE_INT_PVECTOR
+    if(!IVEC_ALIGNED(out_data))
     for(;i<len;i++) {
 	((uint16_t*)out_data)[i]=((uint16_t)((const uint8_t*)in_data)[i])<<8;
-	if((((long)out_data)&(__IVEC_SIZE-1))==0) break;
+	if(IVEC_ALIGNED(out_data)) break;
     }
     if((len-i)>=__IVEC_SIZE)
     for(;i<len;i+=__IVEC_SIZE){
 	__ivec ind,itmp[2];
-	ind = _ivec_loadu(&((const uint8_t *)in_data)[i]);
-#if 0 /* slower but portable on non-x86 CPUs version */
-	itmp[0]= _ivec_sll_s16_imm(_ivec_u16_from_lou8(ind),8);
-	itmp[1]= _ivec_sll_s16_imm(_ivec_u16_from_hiu8(ind),8);
-#else
-	itmp[0]= _ivec_interleave_lo_u8(izero,ind);
-	itmp[1]= _ivec_interleave_hi_u8(izero,ind);
-#endif
+	if(IVEC_ALIGNED(in_data))
+	    ind = _ivec_loada(&((const uint8_t *)in_data)[i]);
+	else
+	    ind = _ivec_loadu(&((const uint8_t *)in_data)[i]);
+	itmp[0] = _ivec_scale_u16_from_u8(ind,&itmp[1]);
 	if(final) {
 	    _ivec_stream(&((uint16_t*)out_data)[i],itmp[0]);
 	    _ivec_stream(&((uint16_t*)out_data)[i+__IVEC_SIZE/2],itmp[1]);
@@ -44,16 +39,23 @@
     unsigned i;
     i = 0;
 #ifdef HAVE_INT_PVECTOR
+    if(!IVEC_ALIGNED(out_data))
     for(;i<len;i++) {
 	((uint8_t*)out_data)[i]=(uint8_t)((((const uint16_t*)in_data)[i])>>8);
-	if((((long)out_data)&(__IVEC_SIZE-1))==0) break;
+	if(IVEC_ALIGNED(out_data)) break;
     }
     if((len-i)>=__IVEC_SIZE)
     for(;i<len;i+=__IVEC_SIZE){
 	__ivec outd,itmp[2];
-	itmp[0] = _ivec_sra_s16_imm(_ivec_loadu(&((const uint16_t*)in_data)[i]),8);
-	itmp[1] = _ivec_sra_s16_imm(_ivec_loadu(&((const uint16_t*)in_data)[i+__IVEC_SIZE/2]),8);
-	outd = _ivec_s8_from_s16(itmp[0],itmp[1]);
+	if(IVEC_ALIGNED(in_data)) {
+	    itmp[0] = _ivec_loada(&((const uint16_t*)in_data)[i]);
+	    itmp[1] = _ivec_loada(&((const uint16_t*)in_data)[i+__IVEC_SIZE/2]);
+	}
+	else {
+	    itmp[0] = _ivec_loadu(&((const uint16_t*)in_data)[i]);
+	    itmp[1] = _ivec_loadu(&((const uint16_t*)in_data)[i+__IVEC_SIZE/2]);
+	}
+	outd = _ivec_scale_s8_from_s16(itmp[0],itmp[1]);
 	if(final)
 	    _ivec_stream(&((uint8_t*)out_data)[i],outd);
 	else
@@ -69,7 +71,6 @@
 static void __FASTCALL__ PVECTOR_RENAME(int16_to_int32)(const int16_t* in_data, int32_t* out_data, unsigned len, int final)
 {
 #ifdef HAVE_INT_PVECTOR
-    __ivec izero = _ivec_setzero();
     unsigned len_mm,j;
 #endif
     unsigned i;
@@ -77,22 +78,20 @@
 #ifdef HAVE_INT_PVECTOR
     j=0;
     len_mm=len&(~(__IVEC_SIZE-1));
+    if(!IVEC_ALIGNED(out_data))
     for(;i<len;i++,j+=2){
 	((uint32_t*)out_data)[i]=((uint32_t)((const uint16_t*)in_data)[i])<<16;
-	if((((long)out_data)&(__IVEC_SIZE-1))==0) break;
+	if(IVEC_ALIGNED(out_data)) break;
     }
     if((len_mm-i)>=__IVEC_SIZE)
     for(;i<len_mm;i+=__IVEC_SIZE/2,j+=__IVEC_SIZE) {
 	__ivec ind,tmp[2];
-	ind = _ivec_loadu(&((const uint8_t *)in_data)[j]);
-#if 0 /* slower but portable on non-x86 CPUs version */
-	tmp[0]= _ivec_sll_s32_imm(_ivec_u32_from_lou16(ind),16);
-	tmp[1]= _ivec_sll_s32_imm(_ivec_u32_from_hiu16(ind),16);
-#else
-	tmp[0]= _ivec_interleave_lo_u16(izero,ind);
-	tmp[1]= _ivec_interleave_hi_u16(izero,ind);
-#endif
+	if(IVEC_ALIGNED(in_data))
+	    ind = _ivec_loada(&((const uint8_t *)in_data)[j]);
+	else
+	    ind = _ivec_loadu(&((const uint8_t *)in_data)[j]);
+	tmp[0]= _ivec_scale_u32_from_u16(ind,&tmp[1]);
 	if(final) {
 	    _ivec_stream(&((uint8_t *)out_data)[j*2],tmp[0]);
 	    _ivec_stream(&((uint8_t *)out_data)[j*2+__IVEC_SIZE],tmp[1]);
@@ -117,17 +116,23 @@
     i=0;
 #ifdef HAVE_INT_PVECTOR
     j=0;
+    if(!IVEC_ALIGNED(out_data))
     for(;i<len;i++,j+=2){
 	((uint16_t*)out_data)[i]=(uint16_t)((((const uint32_t*)in_data)[i])>>16);
-	if((((long)out_data)&(__IVEC_SIZE-1))==0) break;
+	if(IVEC_ALIGNED(out_data)) break;
     }
     if((len-i)>=__IVEC_SIZE)
     for(;i<len;i+=__IVEC_SIZE/2,j+=__IVEC_SIZE) {
 	__ivec ind[2],tmp;
-	ind[0]= _ivec_sra_s32_imm(_ivec_loadu(&((const uint8_t *)in_data)[j*2]),16);
-	ind[1]= _ivec_sra_s32_imm(_ivec_loadu(&((const uint8_t *)in_data)[j*2+__IVEC_SIZE]),16);
-	tmp = _ivec_s16_from_s32(ind[0],ind[1]);
+	if(IVEC_ALIGNED(in_data)) {
+	    ind[0]=_ivec_loada(&((const uint8_t *)in_data)[j*2]);
+	    ind[1]=_ivec_loada(&((const uint8_t *)in_data)[j*2+__IVEC_SIZE]);
+	} else {
+	    ind[0]=_ivec_loadu(&((const uint8_t *)in_data)[j*2]);
+	    ind[1]=_ivec_loadu(&((const uint8_t *)in_data)[j*2+__IVEC_SIZE]);
+	}
+	tmp = _ivec_scale_s16_from_s32(ind[0],ind[1]);
 	if(final)
 	    _ivec_stream(&((uint8_t *)out_data)[j],tmp);
 	else
@@ -314,23 +319,27 @@
     i=0;
 #ifdef HAVE_F32_PVECTOR
     int_max = _f32vec_broadcast(INT32_MAX-1);
-    /* SSE engine sometime has unpredictable behaviour. So downscale volume on 1% here. */
-    plus1 = _f32vec_broadcast(+0.99);
-    minus1= _f32vec_broadcast(-0.99);
+    /* SSE float2int engine doesn't have SATURATION functionality.
+       So CLAMP volume on 0.0002% here. */
+    plus1 = _f32vec_broadcast(+0.999998);
+    minus1= _f32vec_broadcast(-0.999998);
+    if(!F32VEC_ALIGNED(out))
     for(;i<len;i++) {
 	ftmp=((const float*)in)[i];
 	SATURATE(ftmp,-1.0,+1.0);
 	((int32_t*)out)[i]=(int32_t)lrintf((INT_MAX-1)*ftmp);
-	if((((long)out)&(__F32VEC_SIZE-1))==0) break;
+	if(F32VEC_ALIGNED(out)) break;
     }
     _ivec_empty();
     len_mm=len&(~(__F32VEC_SIZE-1));
     if((len_mm-i)>=__F32VEC_SIZE/sizeof(float))
     for(;i<len_mm;i+=__F32VEC_SIZE/sizeof(float)) {
 	__f32vec tmp;
-	tmp = _f32vec_loadu(&((const float*)in)[i]);
-	tmp = _f32vec_min(tmp,plus1);
-	tmp = _f32vec_max(tmp,minus1);
+	if(F32VEC_ALIGNED(in))
+	    tmp = _f32vec_loada(&((const float*)in)[i]);
+	else
+	    tmp = _f32vec_loadu(&((const float*)in)[i]);
+	tmp = _f32vec_clamp(tmp,minus1,plus1);
 	tmp = _f32vec_mul(int_max,tmp);
 	if(final)
 	    _f32vec_to_s32_stream(&((int32_t*)out)[i],tmp);
@@ -354,15 +363,20 @@
 #endif
     register unsigned i=0;
 #ifdef HAVE_F32_PVECTOR
+    if(!F32VEC_ALIGNED(out))
     for(;i<len;i++) {
 	((float*)out)[i]=(1.0/INT_MAX)*((float)((const int32_t*)in)[i]);
-	if((((long)out)&(__F32VEC_SIZE-1))==0) break;
+	if(F32VEC_ALIGNED(out)) break;
     }
     _ivec_empty();
     if((len-i)>=__F32VEC_SIZE)
     for(;i<len;i+=__F32VEC_SIZE/sizeof(float)) {
 	__f32vec tmp;
-	tmp = _f32vec_mul(rev_imax,_f32vec_from_s32u(&((const int32_t*)in)[i]));
+	if(F32VEC_ALIGNED(in))
+	    tmp = _f32vec_from_s32a(&((const int32_t*)in)[i]);
+	else
+	    tmp = _f32vec_from_s32u(&((const int32_t*)in)[i]);
+	tmp = _f32vec_mul(rev_imax,tmp);
 	if(final)
 	    _f32vec_stream(&((float*)out)[i],tmp);
 	else
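One note on the float32_to_int32 hunk above: the clamp is tightened from +/-0.99 to +/-0.999998 because the SSE float-to-int conversion does not saturate -- an out-of-range result comes back as the indefinite value 0x80000000 rather than INT32_MAX -- so samples are pinned just inside +/-1.0 before scaling. A stand-alone sketch of that clamp-then-scale step in plain SSE2 (ad-hoc function name and loop shape; the constants mirror the hunk):

/* Clamp each sample into [-0.999998, +0.999998], scale by INT32_MAX-1,
 * then convert.  Illustrative only, not the project's code. */
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <math.h>       /* lrintf */

static void f32_to_s32_sketch(const float *in, int32_t *out, unsigned len)
{
    const __m128 hi  = _mm_set1_ps(+0.999998f);
    const __m128 lo  = _mm_set1_ps(-0.999998f);
    const __m128 amp = _mm_set1_ps((float)(INT32_MAX - 1));
    unsigned i = 0;
    for (; i + 4 <= len; i += 4) {
        __m128 v = _mm_loadu_ps(&in[i]);
        v = _mm_max_ps(_mm_min_ps(v, hi), lo);       /* clamp(v, lo, hi)   */
        v = _mm_mul_ps(v, amp);                      /* scale to int range */
        _mm_storeu_si128((__m128i *)&out[i], _mm_cvtps_epi32(v));
    }
    for (; i < len; i++) {                           /* scalar tail        */
        float f = in[i];
        if (f > +0.999998f) f = +0.999998f;
        if (f < -0.999998f) f = -0.999998f;
        out[i] = (int32_t)lrintf(f * (float)(INT32_MAX - 1));
    }
}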
Modified: mplayerxp/pvector/pvector.h
===================================================================
--- mplayerxp/pvector/pvector.h	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/pvector/pvector.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -47,6 +47,10 @@
 #undef HAVE_F32_PVECTOR
 #endif
 
+#undef IVEC_ALIGNED
+#define IVEC_ALIGNED(p) ((((long)((void *)(p)))&(__IVEC_SIZE-1))==0)
+#undef F32VEC_ALIGNED
+#define F32VEC_ALIGNED(p) ((((long)((void *)(p)))&(__F32VEC_SIZE-1))==0)
 
 /*
    ABBREVIATION:
@@ -130,15 +134,22 @@
     __ivec _ivec_interleave_lo_u32(__ivec s1, _ivec_ s2);
     __ivec _ivec_interleave_hi_u32(__ivec s1, _ivec_ s2);
 
-    __ivec _ivec_u16_from_lou8(__ivec s);  // Convert lo part of mvec from U8 to U16
-    __ivec _ivec_u16_from_hiu8(__ivec s);  // Convert hi part of mvec from U8 to U16
-    __ivec _ivec_u32_from_lou16(__ivec s); // Convert lo part of mvec from U16 to U32
-    __ivec _ivec_u32_from_hiu16(__ivec s); // Convert hi part of mvec from U16 to U32
+    __ivec _ivec_u16_from_u8(__ivec s,__ivec *hipart);  // Convert ivec from U8 to U16
+    __ivec _ivec_u32_from_u16(__ivec s,__ivec *hipart); // Convert ivec from U16 to U32
+    __ivec _ivec_s16_from_s8(__ivec s,__ivec *hipart);  // Convert ivec from S8 to S16
+    __ivec _ivec_s32_from_s16(__ivec s,__ivec *hipart); // Convert ivec from S16 to S32
+    __ivec _ivec_scale_u16_from_u8(__ivec s,__ivec *hipart);  // Convert ivec from U8 to U16 and shift left on 8-bit
+    __ivec _ivec_scale_u32_from_u16(__ivec s,__ivec *hipart); // Convert ivec from U16 to U32 and shift left on 16-bit
+
     __ivec _ivec_s16_from_s32(__ivec s1,__ivec s2); // Convert from S32 to S16
     __ivec _ivec_s8_from_s16(__ivec s1,__ivec s2);  // Convert from S16 to S8
     __ivec _ivec_u8_from_u16(__ivec s1,__ivec s2);  // Convert from U16 to U8
+    __ivec _ivec_scale_s16_from_s32(__ivec s1,__ivec s2); // Convert from S32 to S16 and shift right on 16-bit
+    __ivec _ivec_scale_s8_from_s16(__ivec s1,__ivec s2);  // Convert from S16 to S8 and shift right on 8-bit
+    __ivec _ivec_scale_u8_from_u16(__ivec s1,__ivec s2);  // Convert from U16 to U8 and shift right on 8-bit
+
     ARITHMETIC engine:
     ------------------
     __ivec _ivec_add_s8(__ivec s1,__ivec s2); // Add S8
@@ -202,4 +213,5 @@
     ---------------
     __f32vec _f32vec_min(__f32vec f1, __f32vec f2); // MIN(f1,f2)
     __f32vec _f32vec_max(__f32vec f1, __f32vec f2); // MAX(f1,f2)
+    __f32vec _f32vec_clamp(__f32vec f1, __f32vec minval,__f32vec maxval); // CLAMP(f1,minval,maxval);
 */
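The conversion helpers documented above now return the widened low half directly and hand the high half back through *hipart, so one call replaces the old lo/hi pair; the scale_* variants additionally shift the result. In plain SSE2 the unsigned forms reduce to interleaving with zero, and putting the zero on the other side of each lane yields the shifted variant. An illustrative stand-alone sketch (ad-hoc names, little-endian x86 assumed; not the project's implementation):

#include <emmintrin.h>

/* u16_from_u8: zero-extend 16 u8 lanes into 2x8 u16 lanes. */
static __m128i u16_from_u8_sketch(__m128i s, __m128i *hipart)
{
    const __m128i zero = _mm_setzero_si128();
    *hipart = _mm_unpackhi_epi8(s, zero);  /* upper 8 bytes -> u16 */
    return _mm_unpacklo_epi8(s, zero);     /* lower 8 bytes -> u16 */
}

/* scale_u16_from_u8: same widening, but the zero lands in the LOW byte of
 * each 16-bit lane, so every value comes out shifted left by 8 (x * 256). */
static __m128i scale_u16_from_u8_sketch(__m128i s, __m128i *hipart)
{
    const __m128i zero = _mm_setzero_si128();
    *hipart = _mm_unpackhi_epi8(zero, s);
    return _mm_unpacklo_epi8(zero, s);
}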
Modified: mplayerxp/pvector/pvector_f32_x86.h
===================================================================
--- mplayerxp/pvector/pvector_f32_x86.h	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/pvector/pvector_f32_x86.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -317,3 +317,12 @@
 }
 #undef _f32vec_min
 #define _f32vec_min PVECTOR_RENAME(f32_min)
+
+extern __inline __f32vec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(f32_clamp)(__f32vec f1,__f32vec minval,__f32vec maxval)
+{
+    return _f32vec_max(_f32vec_min(f1,maxval),minval);
+}
+#undef _f32vec_clamp
+#define _f32vec_clamp PVECTOR_RENAME(f32_clamp)
+
Modified: mplayerxp/pvector/pvector_int_x86.h
===================================================================
--- mplayerxp/pvector/pvector_int_x86.h	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/pvector/pvector_int_x86.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -414,33 +414,50 @@
 #define _ivec_interleave_hi_u32 PVECTOR_RENAME(interleave_hi_u32)
 
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
-PVECTOR_RENAME(u16_from_lou8)(__ivec s)
+PVECTOR_RENAME(u16_from_u8)(__ivec s,__ivec *hipart)
 {
-    return _ivec_interleave_lo_u8(s,_ivec_setzero());
+    __ivec filler = _ivec_setzero();
+    *hipart = _ivec_interleave_hi_u8(s,filler);
+    return _ivec_interleave_lo_u8(s,filler);
 }
-#undef _ivec_u16_from_lou8
-#define _ivec_u16_from_lou8 PVECTOR_RENAME(u16_from_lou8)
+#undef _ivec_u16_from_u8
+#define _ivec_u16_from_u8 PVECTOR_RENAME(u16_from_u8)
+
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
-PVECTOR_RENAME(u16_from_hiu8)(__ivec s)
+PVECTOR_RENAME(u32_from_u16)(__ivec s,__ivec *hipart)
 {
-    return _ivec_interleave_hi_u8(s,_ivec_setzero());
+    __ivec filler = _ivec_setzero();
+    *hipart = _ivec_interleave_hi_u16(s,filler);
+    return _ivec_interleave_lo_u16(s,filler);
 }
-#undef _ivec_u16_from_hiu8
-#define _ivec_u16_from_hiu8 PVECTOR_RENAME(u16_from_hiu8)
+#undef _ivec_u32_from_u16
+#define _ivec_u32_from_u16 PVECTOR_RENAME(u32_from_u16)
+
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
-PVECTOR_RENAME(u32_from_lou16)(__ivec s)
+PVECTOR_RENAME(s16_from_s8)(__ivec s,__ivec* hipart)
 {
-    return _ivec_interleave_lo_u16(s,_ivec_setzero());
+    const __ivec izero = _ivec_setzero();
+    __ivec filler;
+    filler = _ivec_cmpgt_s8(izero,s);
+    *hipart = _ivec_interleave_hi_u8(s,filler);
+    return _ivec_interleave_lo_u8(s,filler);
 }
-#undef _ivec_u32_from_lou16
-#define _ivec_u32_from_lou16 PVECTOR_RENAME(u32_from_lou16)
+#undef _ivec_s16_from_s8
+#define _ivec_s16_from_s8 PVECTOR_RENAME(s16_from_s8)
+
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
-PVECTOR_RENAME(u32_from_hiu16)(__ivec s)
+PVECTOR_RENAME(s32_from_s16)(__ivec s,__ivec* hipart)
 {
-    return _ivec_interleave_hi_u16(s,_ivec_setzero());
+    const __ivec izero = _ivec_setzero();
+    __ivec filler;
+    filler = _ivec_cmpgt_s16(izero,s);
+    *hipart = _ivec_interleave_hi_u16(s,filler);
+    return _ivec_interleave_lo_u16(s,filler);
 }
-#undef _ivec_u32_from_hiu16
-#define _ivec_u32_from_hiu16 PVECTOR_RENAME(u32_from_hiu16)
+#undef _ivec_s32_from_s16
+#define _ivec_s32_from_s16 PVECTOR_RENAME(s32_from_s16)
+
+
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
 PVECTOR_RENAME(s16_from_s32)(__ivec s1, __ivec s2)
 {
@@ -840,3 +857,61 @@
 }
 #undef _ivec_srl_s64_imm
 #define _ivec_srl_s64_imm PVECTOR_RENAME(srl_s64_imm)
+
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_u16_from_u8)(__ivec s,__ivec *hipart)
+{
+#if 0 /* slower but portable on non-x86 CPUs version */
+    __ivec tmp[2];
+    tmp[0] = _ivec_u16_from_u8(s,&tmp[1]);
+    *hipart = _ivec_sll_s16_imm(tmp[1],8);
+    return _ivec_sll_s16_imm(tmp[0],8);
+#else
+    __ivec filler = _ivec_setzero();
+    *hipart = _ivec_interleave_hi_u8(filler,s);
+    return _ivec_interleave_lo_u8(filler,s);
+#endif
+}
+#undef _ivec_scale_u16_from_u8
+#define _ivec_scale_u16_from_u8 PVECTOR_RENAME(scale_u16_from_u8)
+
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_u32_from_u16)(__ivec s,__ivec *hipart)
+{
+#if 0 /* slower but portable on non-x86 CPUs version */
+    __ivec tmp[2];
+    tmp[0] = _ivec_u32_from_u16(s,&tmp[1]);
+    *hipart = _ivec_sll_s32_imm(tmp[1],16);
+    return _ivec_sll_s32_imm(tmp[0],16);
+#else
+    __ivec filler = _ivec_setzero();
+    *hipart = _ivec_interleave_hi_u16(filler,s);
+    return _ivec_interleave_lo_u16(filler,s);
+#endif
+}
+#undef _ivec_scale_u32_from_u16
+#define _ivec_scale_u32_from_u16 PVECTOR_RENAME(scale_u32_from_u16)
+
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_s16_from_s32)(__ivec s1, __ivec s2)
+{
+    return _ivec_s16_from_s32(_ivec_sra_s32_imm(s1,16),_ivec_sra_s32_imm(s2,16));
+}
+#undef _ivec_scale_s16_from_s32
+#define _ivec_scale_s16_from_s32 PVECTOR_RENAME(scale_s16_from_s32)
+
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_s8_from_s16)(__ivec s1, __ivec s2)
+{
+    return _ivec_s8_from_s16(_ivec_sra_s16_imm(s1,8),_ivec_sra_s16_imm(s2,8));
+}
+#undef _ivec_scale_s8_from_s16
+#define _ivec_scale_s8_from_s16 PVECTOR_RENAME(scale_s8_from_s16)
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_u8_from_u16)(__ivec s1, __ivec s2)
+{
+    return _ivec_u8_from_u16(_ivec_sra_s16_imm(s1,8),_ivec_sra_s16_imm(s2,8));
+}
+#undef _ivec_scale_u8_from_u16
+#define _ivec_scale_u8_from_u16 PVECTOR_RENAME(scale_u8_from_u16)
+
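The new s16_from_s8/s32_from_s16 helpers above sign-extend without a dedicated widening instruction: comparing zero against the input yields an all-ones mask exactly in the negative lanes, and interleaving that mask in as the high part reproduces two's-complement sign extension. A small self-contained check of the same idiom in plain SSE2 (ad-hoc names, little-endian x86 assumed; an illustration, not the project's code):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i s16_from_s8_sketch(__m128i s, __m128i *hipart)
{
    __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), s); /* 0xFF where s < 0 */
    *hipart = _mm_unpackhi_epi8(s, sign);  /* upper 8 bytes, sign-extended */
    return _mm_unpacklo_epi8(s, sign);     /* lower 8 bytes, sign-extended */
}

int main(void)
{
    int8_t  in[16] = { -128, -61, -2, -1, 0, 1, 2, 127, 5, -5, 10, -10, 64, -64, 100, -100 };
    int16_t out[16];
    __m128i hi, lo = s16_from_s8_sketch(_mm_loadu_si128((const __m128i *)in), &hi);
    _mm_storeu_si128((__m128i *)&out[0], lo);
    _mm_storeu_si128((__m128i *)&out[8], hi);
    for (int i = 0; i < 16; i++)
        printf("%4d -> %6d\n", in[i], out[i]);  /* every value is preserved */
    return 0;
}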