[Mplayerxp-cvslog] SF.net SVN: mplayerxp:[130] mplayerxp
From: <nic...@us...> - 2010-01-28 15:43:41
Revision: 130
          http://mplayerxp.svn.sourceforge.net/mplayerxp/?rev=130&view=rev
Author:   nickols_k
Date:     2010-01-28 15:43:33 +0000 (Thu, 28 Jan 2010)

Log Message:
-----------
minor speedup and fixes

Modified Paths:
--------------
    TOOLS/asmopt.c
    TOOLS/asmopt_template.h
    mplayerxp/libvo/aclib_template.c
    mplayerxp/libvo/osd_template.c
    mplayerxp/postproc/dsp_accel.h
    mplayerxp/pvector/pvector.h
    mplayerxp/pvector/pvector_f32_x86.h
    mplayerxp/pvector/pvector_int_x86.h

Modified: TOOLS/asmopt.c
===================================================================
--- TOOLS/asmopt.c	2010-01-28 15:41:48 UTC (rev 129)
+++ TOOLS/asmopt.c	2010-01-28 15:43:33 UTC (rev 130)
@@ -32,7 +32,7 @@
 #define INIT_ARRAYS(x) \
 {\
     for(i=0; i<x; i++) srca[i] = i; \
-    for(i=0; i<x; i++) src[i] = i+64; \
+    for(i=0; i<x; i++) src[i] = i-61; \
     for(i=0; i<x; i++) dsta[i] = i+128; \
 }
 
@@ -44,7 +44,7 @@
     gettimeofday(&tv,&tz);
 //    s=tv.tv_usec;s*=0.000001;s+=tv.tv_sec;
     return (tv.tv_sec*1000000+tv.tv_usec);
-}
+}
 
 static inline unsigned long long int read_tsc( void )
 {

Modified: TOOLS/asmopt_template.h
===================================================================
--- TOOLS/asmopt_template.h	2010-01-28 15:41:48 UTC (rev 129)
+++ TOOLS/asmopt_template.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -17,7 +17,7 @@
 #endif
     uint8_t *out_data = dstbase;
     uint8_t *in_data = src;
-    
+
     unsigned i,len;
     i = 0;
     len = asize;
@@ -30,19 +30,13 @@
     for(;i<len;i+=__IVEC_SIZE){
 	__ivec ind,itmp[2];
 	ind = _ivec_loadu(&((uint8_t *)in_data)[i]);
-#if 0 /* slower but portable on non-x86 CPUs version */
-	itmp[0]= _ivec_sll_s16_imm(_ivec_u16_from_lou8(ind),8);
-	itmp[1]= _ivec_sll_s16_imm(_ivec_u16_from_hiu8(ind),8);
-#else
-	itmp[0]= _ivec_interleave_lo_u8(izero,ind);
-	itmp[1]= _ivec_interleave_hi_u8(izero,ind);
-#endif
+	itmp[0]= _ivec_s16_from_s8(ind,&itmp[1]);
 	_ivec_storea(&((uint16_t*)out_data)[i],itmp[0]);
 	_ivec_storea(&((uint16_t*)out_data)[i+__IVEC_SIZE/2],itmp[1]);
     }
 #endif
     for(;i<len;i++)
-	((uint16_t*)out_data)[i]=((uint16_t)((uint8_t*)in_data)[i])<<8;
+	((int16_t*)out_data)[i]=((int16_t)((int8_t*)in_data)[i]);
 #ifdef HAVE_INT_PVECTOR
     _ivec_empty();
     _ivec_sfence();

Modified: mplayerxp/libvo/aclib_template.c
===================================================================
--- mplayerxp/libvo/aclib_template.c	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/libvo/aclib_template.c	2010-01-28 15:43:33 UTC (rev 130)
@@ -158,6 +158,7 @@
 }
 
 #undef MEM_STORE
+#undef MEM_SFENCE
 #define MEM_STORE _ivec_stream
 #define MEM_SFENCE _ivec_sfence();
 static inline void * PVECTOR_RENAME(fast_stream_copy)(void * to, const void * from, size_t len)
@@ -167,6 +168,7 @@
 }
 
 #undef MEM_STORE
+#undef MEM_SFENCE
 #define MEM_STORE _ivec_storea
 #define MEM_SFENCE
 static inline void * PVECTOR_RENAME(fast_memcpy)(void * to, const void * from, size_t len)

Modified: mplayerxp/libvo/osd_template.c
===================================================================
--- mplayerxp/libvo/osd_template.c	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/libvo/osd_template.c	2010-01-28 15:43:33 UTC (rev 130)
@@ -64,10 +64,11 @@
 	_ivec_prefetch(&src[x]);
 	_ivec_prefetch(&srca[x]);
 /* MOVNTDQ: #GP(0) - If memory operand is not aligned on a 16-byte boundary */
+	if(!IVEC_ALIGNED(dstbase))
 	for(;x<w;x++){
 	    unsigned char *dst=&dstbase[x];
 	    if(srca[x]) *dst=((dstbase[x]*srca[x])>>8)+src[x];
-	    if((((long)dst)&(__IVEC_SIZE-1))==0) break; /* align on sizeof(MMREG) boundary */
+	    if(IVEC_ALIGNED(dst)) break; /* align on sizeof(MMREG) boundary */
 	}
 	if((w-x)>=__IVEC_SIZE)
 	for(;x<w;x+=__IVEC_SIZE){
@@ -76,13 +77,17 @@
 	    _ivec_prefetch(&src[x+__IVEC_SIZE*4]);
 	    _ivec_prefetch(&srca[x+__IVEC_SIZE*4]);
 	    vdest = _ivec_loada(&dstbase[x]);
-	    vsrc = _ivec_loadu(&src[x]);
-	    vsrca = _ivec_loadu(&srca[x]);
+	    if(IVEC_ALIGNED(&src[x]))
+		vsrc = _ivec_loada(&src[x]);
+	    else
+		vsrc = _ivec_loadu(&src[x]);
+	    if(IVEC_ALIGNED(&srca[x]))
+		vsrca = _ivec_loada(&srca[x]);
+	    else
+		vsrca = _ivec_loadu(&srca[x]);
 	    vmsk = _ivec_not(_ivec_cmpeq_s8(vsrca,vzero));
-	    vt[0] = _ivec_u16_from_lou8(vdest);
-	    vt[1] = _ivec_u16_from_hiu8(vdest);
-	    vt[2] = _ivec_u16_from_lou8(vsrca);
-	    vt[3] = _ivec_u16_from_hiu8(vsrca);
+	    vt[0] = _ivec_u16_from_u8(vdest,&vt[1]);
+	    vt[2] = _ivec_u16_from_u8(vsrca,&vt[3]);
 	    vt[0] = _ivec_srl_s16_imm(_ivec_mullo_s16(vt[0],vt[2]),8);
 	    vt[1] = _ivec_srl_s16_imm(_ivec_mullo_s16(vt[1],vt[3]),8);
 	    vt[0] = _ivec_add_s8(_ivec_u8_from_u16(vt[0],vt[1]),vsrc);
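The osd_template.c hunks above swap the open-coded pointer test for the new IVEC_ALIGNED() macro and pick aligned or unaligned loads per source at run time; only the destination has to be forced onto a 16-byte boundary, because aligned and non-temporal stores fault otherwise. For readers unfamiliar with the pattern, here is a minimal stand-alone sketch in plain SSE2 (ad-hoc names, not the project's _ivec_ wrappers):

/* Scalar iterations run until the destination is 16-byte aligned; each
 * source then picks an aligned or unaligned load. Illustrative only. */
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stddef.h>

#define VEC_ALIGNED(p) ((((uintptr_t)(const void *)(p)) & 15) == 0)

static void copy_bytes_sketch(uint8_t *dst, const uint8_t *src, size_t n)
{
    size_t i = 0;
    if (!VEC_ALIGNED(dst))                            /* scalar prologue */
        for (; i < n && !VEC_ALIGNED(dst + i); i++)
            dst[i] = src[i];
    for (; i + 16 <= n; i += 16) {                    /* vector body */
        __m128i v = VEC_ALIGNED(src + i)
                  ? _mm_load_si128((const __m128i *)(src + i))
                  : _mm_loadu_si128((const __m128i *)(src + i));
        _mm_store_si128((__m128i *)(dst + i), v);     /* dst is aligned here */
    }
    for (; i < n; i++)                                /* scalar epilogue */
        dst[i] = src[i];
}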
Modified: mplayerxp/postproc/dsp_accel.h
===================================================================
--- mplayerxp/postproc/dsp_accel.h	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/postproc/dsp_accel.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -3,27 +3,22 @@
 static void __FASTCALL__ PVECTOR_RENAME(int8_to_int16)(const int8_t* in_data, int16_t* out_data, unsigned len, int final)
 {
-#ifdef HAVE_INT_PVECTOR
-    __ivec izero = _ivec_setzero();
-#endif
     unsigned i;
     i = 0;
 #ifdef HAVE_INT_PVECTOR
+    if(!IVEC_ALIGNED(out_data))
     for(;i<len;i++) {
 	((uint16_t*)out_data)[i]=((uint16_t)((const uint8_t*)in_data)[i])<<8;
-	if((((long)out_data)&(__IVEC_SIZE-1))==0) break;
+	if(IVEC_ALIGNED(out_data)) break;
     }
     if((len-i)>=__IVEC_SIZE)
     for(;i<len;i+=__IVEC_SIZE){
 	__ivec ind,itmp[2];
-	ind = _ivec_loadu(&((const uint8_t *)in_data)[i]);
-#if 0 /* slower but portable on non-x86 CPUs version */
-	itmp[0]= _ivec_sll_s16_imm(_ivec_u16_from_lou8(ind),8);
-	itmp[1]= _ivec_sll_s16_imm(_ivec_u16_from_hiu8(ind),8);
-#else
-	itmp[0]= _ivec_interleave_lo_u8(izero,ind);
-	itmp[1]= _ivec_interleave_hi_u8(izero,ind);
-#endif
+	if(IVEC_ALIGNED(in_data))
+	    ind = _ivec_loada(&((const uint8_t *)in_data)[i]);
+	else
+	    ind = _ivec_loadu(&((const uint8_t *)in_data)[i]);
+	itmp[0] = _ivec_scale_u16_from_u8(ind,&itmp[1]);
 	if(final) {
 	    _ivec_stream(&((uint16_t*)out_data)[i],itmp[0]);
 	    _ivec_stream(&((uint16_t*)out_data)[i+__IVEC_SIZE/2],itmp[1]);
@@ -44,16 +39,23 @@
     unsigned i;
     i = 0;
 #ifdef HAVE_INT_PVECTOR
+    if(!IVEC_ALIGNED(out_data))
     for(;i<len;i++) {
 	((uint8_t*)out_data)[i]=(uint8_t)((((const uint16_t*)in_data)[i])>>8);
-	if((((long)out_data)&(__IVEC_SIZE-1))==0) break;
+	if(IVEC_ALIGNED(out_data)) break;
     }
     if((len-i)>=__IVEC_SIZE)
     for(;i<len;i+=__IVEC_SIZE){
 	__ivec outd,itmp[2];
-	itmp[0] = _ivec_sra_s16_imm(_ivec_loadu(&((const uint16_t*)in_data)[i]),8);
-	itmp[1] = _ivec_sra_s16_imm(_ivec_loadu(&((const uint16_t*)in_data)[i+__IVEC_SIZE/2]),8);
-	outd = _ivec_s8_from_s16(itmp[0],itmp[1]);
+	if(IVEC_ALIGNED(in_data)) {
+	    itmp[0] = _ivec_loada(&((const uint16_t*)in_data)[i]);
+	    itmp[1] = _ivec_loada(&((const uint16_t*)in_data)[i+__IVEC_SIZE/2]);
+	}
+	else {
+	    itmp[0] = _ivec_loadu(&((const uint16_t*)in_data)[i]);
+	    itmp[1] = _ivec_loadu(&((const uint16_t*)in_data)[i+__IVEC_SIZE/2]);
+	}
+	outd = _ivec_scale_s8_from_s16(itmp[0],itmp[1]);
 	if(final)
 	    _ivec_stream(&((uint8_t*)out_data)[i],outd);
 	else
@@ -69,7 +71,6 @@
 static void __FASTCALL__ PVECTOR_RENAME(int16_to_int32)(const int16_t* in_data, int32_t* out_data, unsigned len, int final)
 {
 #ifdef HAVE_INT_PVECTOR
-    __ivec izero = _ivec_setzero();
     unsigned len_mm,j;
 #endif
     unsigned i;
@@ -77,22 +78,20 @@
 #ifdef HAVE_INT_PVECTOR
     j=0;
     len_mm=len&(~(__IVEC_SIZE-1));
+    if(!IVEC_ALIGNED(out_data))
     for(;i<len;i++,j+=2){
 	((uint32_t*)out_data)[i]=((uint32_t)((const uint16_t*)in_data)[i])<<16;
-	if((((long)out_data)&(__IVEC_SIZE-1))==0) break;
+	if(IVEC_ALIGNED(out_data)) break;
     }
     if((len_mm-i)>=__IVEC_SIZE)
     for(;i<len_mm;i+=__IVEC_SIZE/2,j+=__IVEC_SIZE) {
 	__ivec ind,tmp[2];
-	ind = _ivec_loadu(&((const uint8_t *)in_data)[j]);
-#if 0 /* slower but portable on non-x86 CPUs version */
-	tmp[0]= _ivec_sll_s32_imm(_ivec_u32_from_lou16(ind),16);
-	tmp[1]= _ivec_sll_s32_imm(_ivec_u32_from_hiu16(ind),16);
-#else
-	tmp[0]= _ivec_interleave_lo_u16(izero,ind);
-	tmp[1]= _ivec_interleave_hi_u16(izero,ind);
-#endif
+	if(IVEC_ALIGNED(in_data))
+	    ind = _ivec_loada(&((const uint8_t *)in_data)[j]);
+	else
+	    ind = _ivec_loadu(&((const uint8_t *)in_data)[j]);
+	tmp[0]= _ivec_scale_u32_from_u16(ind,&tmp[1]);
 	if(final) {
 	    _ivec_stream(&((uint8_t *)out_data)[j*2],tmp[0]);
 	    _ivec_stream(&((uint8_t *)out_data)[j*2+__IVEC_SIZE],tmp[1]);
@@ -117,17 +116,23 @@
     i=0;
 #ifdef HAVE_INT_PVECTOR
     j=0;
+    if(!IVEC_ALIGNED(out_data))
     for(;i<len;i++,j+=2){
 	((uint16_t*)out_data)[i]=(uint16_t)((((const uint32_t*)in_data)[i])>>16);
-	if((((long)out_data)&(__IVEC_SIZE-1))==0) break;
+	if(IVEC_ALIGNED(out_data)) break;
     }
     if((len-i)>=__IVEC_SIZE)
     for(;i<len;i+=__IVEC_SIZE/2,j+=__IVEC_SIZE) {
 	__ivec ind[2],tmp;
-	ind[0]= _ivec_sra_s32_imm(_ivec_loadu(&((const uint8_t *)in_data)[j*2]),16);
-	ind[1]= _ivec_sra_s32_imm(_ivec_loadu(&((const uint8_t *)in_data)[j*2+__IVEC_SIZE]),16);
-	tmp = _ivec_s16_from_s32(ind[0],ind[1]);
+	if(IVEC_ALIGNED(in_data)) {
+	    ind[0]=_ivec_loada(&((const uint8_t *)in_data)[j*2]);
+	    ind[1]=_ivec_loada(&((const uint8_t *)in_data)[j*2+__IVEC_SIZE]);
+	} else {
+	    ind[0]=_ivec_loadu(&((const uint8_t *)in_data)[j*2]);
+	    ind[1]=_ivec_loadu(&((const uint8_t *)in_data)[j*2+__IVEC_SIZE]);
+	}
+	tmp = _ivec_scale_s16_from_s32(ind[0],ind[1]);
 	if(final)
 	    _ivec_stream(&((uint8_t *)out_data)[j],tmp);
 	else
@@ -314,23 +319,27 @@
     i=0;
 #ifdef HAVE_F32_PVECTOR
     int_max = _f32vec_broadcast(INT32_MAX-1);
-    /* SSE engine sometime has unpredictable behaviour. So downscale volume on 1% here. */
-    plus1 = _f32vec_broadcast(+0.99);
-    minus1= _f32vec_broadcast(-0.99);
+    /* SSE float2int engine doesn't have SATURATION functionality.
+       So CLAMP volume on 0.0002% here. */
+    plus1 = _f32vec_broadcast(+0.999998);
+    minus1= _f32vec_broadcast(-0.999998);
+    if(!F32VEC_ALIGNED(out))
     for(;i<len;i++) {
 	ftmp=((const float*)in)[i];
 	SATURATE(ftmp,-1.0,+1.0);
 	((int32_t*)out)[i]=(int32_t)lrintf((INT_MAX-1)*ftmp);
-	if((((long)out)&(__F32VEC_SIZE-1))==0) break;
+	if(F32VEC_ALIGNED(out)) break;
     }
     _ivec_empty();
     len_mm=len&(~(__F32VEC_SIZE-1));
     if((len_mm-i)>=__F32VEC_SIZE/sizeof(float))
     for(;i<len_mm;i+=__F32VEC_SIZE/sizeof(float)) {
 	__f32vec tmp;
-	tmp = _f32vec_loadu(&((const float*)in)[i]);
-	tmp = _f32vec_min(tmp,plus1);
-	tmp = _f32vec_max(tmp,minus1);
+	if(F32VEC_ALIGNED(in))
+	    tmp = _f32vec_loada(&((const float*)in)[i]);
+	else
+	    tmp = _f32vec_loadu(&((const float*)in)[i]);
+	tmp = _f32vec_clamp(tmp,minus1,plus1);
 	tmp = _f32vec_mul(int_max,tmp);
 	if(final)
 	    _f32vec_to_s32_stream(&((int32_t*)out)[i],tmp);
@@ -354,15 +363,20 @@
 #endif
     register unsigned i=0;
 #ifdef HAVE_F32_PVECTOR
+    if(!F32VEC_ALIGNED(out))
     for(;i<len;i++) {
 	((float*)out)[i]=(1.0/INT_MAX)*((float)((const int32_t*)in)[i]);
-	if((((long)out)&(__F32VEC_SIZE-1))==0) break;
+	if(F32VEC_ALIGNED(out)) break;
     }
     _ivec_empty();
     if((len-i)>=__F32VEC_SIZE)
     for(;i<len;i+=__F32VEC_SIZE/sizeof(float)) {
 	__f32vec tmp;
-	tmp = _f32vec_mul(rev_imax,_f32vec_from_s32u(&((const int32_t*)in)[i]));
+	if(F32VEC_ALIGNED(in))
+	    tmp = _f32vec_from_s32a(&((const int32_t*)in)[i]);
+	else
+	    tmp = _f32vec_from_s32u(&((const int32_t*)in)[i]);
+	tmp = _f32vec_mul(rev_imax,tmp);
 	if(final)
 	    _f32vec_stream(&((float*)out)[i],tmp);
 	else
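One note on the float32_to_int32 hunk above: the clamp is tightened from +/-0.99 to +/-0.999998 because the SSE float-to-int conversion does not saturate -- an out-of-range result comes back as the indefinite value 0x80000000 rather than INT32_MAX -- so samples are pinned just inside +/-1.0 before scaling. A stand-alone sketch of that clamp-then-scale step in plain SSE2 (ad-hoc function name and loop shape; the constants mirror the hunk):

/* Clamp each sample into [-0.999998, +0.999998], scale by INT32_MAX-1,
 * then convert.  Illustrative only, not the project's code. */
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <math.h>       /* lrintf */

static void f32_to_s32_sketch(const float *in, int32_t *out, unsigned len)
{
    const __m128 hi  = _mm_set1_ps(+0.999998f);
    const __m128 lo  = _mm_set1_ps(-0.999998f);
    const __m128 amp = _mm_set1_ps((float)(INT32_MAX - 1));
    unsigned i = 0;
    for (; i + 4 <= len; i += 4) {
        __m128 v = _mm_loadu_ps(&in[i]);
        v = _mm_max_ps(_mm_min_ps(v, hi), lo);       /* clamp(v, lo, hi)   */
        v = _mm_mul_ps(v, amp);                      /* scale to int range */
        _mm_storeu_si128((__m128i *)&out[i], _mm_cvtps_epi32(v));
    }
    for (; i < len; i++) {                           /* scalar tail        */
        float f = in[i];
        if (f > +0.999998f) f = +0.999998f;
        if (f < -0.999998f) f = -0.999998f;
        out[i] = (int32_t)lrintf(f * (float)(INT32_MAX - 1));
    }
}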
Modified: mplayerxp/pvector/pvector.h
===================================================================
--- mplayerxp/pvector/pvector.h	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/pvector/pvector.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -47,6 +47,10 @@
 #undef HAVE_F32_PVECTOR
 #endif
 
+#undef IVEC_ALIGNED
+#define IVEC_ALIGNED(p) ((((long)((void *)(p)))&(__IVEC_SIZE-1))==0)
+#undef F32VEC_ALIGNED
+#define F32VEC_ALIGNED(p) ((((long)((void *)(p)))&(__F32VEC_SIZE-1))==0)
 
 /*
    ABBREVIATION:
@@ -130,15 +134,22 @@
     __ivec _ivec_interleave_lo_u32(__ivec s1, _ivec_ s2);
     __ivec _ivec_interleave_hi_u32(__ivec s1, _ivec_ s2);
 
-    __ivec _ivec_u16_from_lou8(__ivec s);  // Convert lo part of mvec from U8 to U16
-    __ivec _ivec_u16_from_hiu8(__ivec s);  // Convert hi part of mvec from U8 to U16
-    __ivec _ivec_u32_from_lou16(__ivec s); // Convert lo part of mvec from U16 to U32
-    __ivec _ivec_u32_from_hiu16(__ivec s); // Convert hi part of mvec from U16 to U32
+    __ivec _ivec_u16_from_u8(__ivec s,__ivec *hipart);  // Convert ivec from U8 to U16
+    __ivec _ivec_u32_from_u16(__ivec s,__ivec *hipart); // Convert ivec from U16 to U32
+    __ivec _ivec_s16_from_s8(__ivec s,__ivec *hipart);  // Convert ivec from S8 to S16
+    __ivec _ivec_s32_from_s16(__ivec s,__ivec *hipart); // Convert ivec from S16 to S32
+    __ivec _ivec_scale_u16_from_u8(__ivec s,__ivec *hipart);  // Convert ivec from U8 to U16 and shift left on 8-bit
+    __ivec _ivec_scale_u32_from_u16(__ivec s,__ivec *hipart); // Convert ivec from U16 to U32 and shift left on 16-bit
+
     __ivec _ivec_s16_from_s32(__ivec s1,__ivec s2); // Convert from S32 to S16
     __ivec _ivec_s8_from_s16(__ivec s1,__ivec s2);  // Convert from S16 to S8
     __ivec _ivec_u8_from_u16(__ivec s1,__ivec s2);  // Convert from U16 to U8
+    __ivec _ivec_scale_s16_from_s32(__ivec s1,__ivec s2); // Convert from S32 to S16 and shift right on 16-bit
+    __ivec _ivec_scale_s8_from_s16(__ivec s1,__ivec s2);  // Convert from S16 to S8 and shift right on 8-bit
+    __ivec _ivec_scale_u8_from_u16(__ivec s1,__ivec s2);  // Convert from U16 to U8 and shift right on 8-bit
+
     ARITHMETIC engine:
     ------------------
     __ivec _ivec_add_s8(__ivec s1,__ivec s2); // Add S8
@@ -202,4 +213,5 @@
     ---------------
     __f32vec _f32vec_min(__f32vec f1, __f32vec f2); // MIN(f1,f2)
     __f32vec _f32vec_max(__f32vec f1, __f32vec f2); // MAX(f1,f2)
+    __f32vec _f32vec_clamp(__f32vec f1, __f32vec minval,__f32vec maxval); // CLAMP(f1,minval,maxval);
 */
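The conversion helpers documented above now return the widened low half directly and hand the high half back through *hipart, so one call replaces the old lo/hi pair; the scale_* variants additionally shift the result. In plain SSE2 the unsigned forms reduce to interleaving with zero, and putting the zero on the other side of each lane yields the shifted variant. An illustrative stand-alone sketch (ad-hoc names, little-endian x86 assumed; not the project's implementation):

#include <emmintrin.h>

/* u16_from_u8: zero-extend 16 u8 lanes into 2x8 u16 lanes. */
static __m128i u16_from_u8_sketch(__m128i s, __m128i *hipart)
{
    const __m128i zero = _mm_setzero_si128();
    *hipart = _mm_unpackhi_epi8(s, zero);  /* upper 8 bytes -> u16 */
    return _mm_unpacklo_epi8(s, zero);     /* lower 8 bytes -> u16 */
}

/* scale_u16_from_u8: same widening, but the zero lands in the LOW byte of
 * each 16-bit lane, so every value comes out shifted left by 8 (x * 256). */
static __m128i scale_u16_from_u8_sketch(__m128i s, __m128i *hipart)
{
    const __m128i zero = _mm_setzero_si128();
    *hipart = _mm_unpackhi_epi8(zero, s);
    return _mm_unpacklo_epi8(zero, s);
}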
Modified: mplayerxp/pvector/pvector_f32_x86.h
===================================================================
--- mplayerxp/pvector/pvector_f32_x86.h	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/pvector/pvector_f32_x86.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -317,3 +317,12 @@
 }
 #undef _f32vec_min
 #define _f32vec_min PVECTOR_RENAME(f32_min)
+
+extern __inline __f32vec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(f32_clamp)(__f32vec f1,__f32vec minval,__f32vec maxval)
+{
+    return _f32vec_max(_f32vec_min(f1,maxval),minval);
+}
+#undef _f32vec_clamp
+#define _f32vec_clamp PVECTOR_RENAME(f32_clamp)
+
Modified: mplayerxp/pvector/pvector_int_x86.h
===================================================================
--- mplayerxp/pvector/pvector_int_x86.h	2010-01-28 15:41:48 UTC (rev 129)
+++ mplayerxp/pvector/pvector_int_x86.h	2010-01-28 15:43:33 UTC (rev 130)
@@ -414,33 +414,50 @@
 #define _ivec_interleave_hi_u32 PVECTOR_RENAME(interleave_hi_u32)
 
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
-PVECTOR_RENAME(u16_from_lou8)(__ivec s)
+PVECTOR_RENAME(u16_from_u8)(__ivec s,__ivec *hipart)
 {
-    return _ivec_interleave_lo_u8(s,_ivec_setzero());
+    __ivec filler = _ivec_setzero();
+    *hipart = _ivec_interleave_hi_u8(s,filler);
+    return _ivec_interleave_lo_u8(s,filler);
 }
-#undef _ivec_u16_from_lou8
-#define _ivec_u16_from_lou8 PVECTOR_RENAME(u16_from_lou8)
+#undef _ivec_u16_from_u8
+#define _ivec_u16_from_u8 PVECTOR_RENAME(u16_from_u8)
+
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
-PVECTOR_RENAME(u16_from_hiu8)(__ivec s)
+PVECTOR_RENAME(u32_from_u16)(__ivec s,__ivec *hipart)
 {
-    return _ivec_interleave_hi_u8(s,_ivec_setzero());
+    __ivec filler = _ivec_setzero();
+    *hipart = _ivec_interleave_hi_u16(s,filler);
+    return _ivec_interleave_lo_u16(s,filler);
 }
-#undef _ivec_u16_from_hiu8
-#define _ivec_u16_from_hiu8 PVECTOR_RENAME(u16_from_hiu8)
+#undef _ivec_u32_from_u16
+#define _ivec_u32_from_u16 PVECTOR_RENAME(u32_from_u16)
+
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
-PVECTOR_RENAME(u32_from_lou16)(__ivec s)
+PVECTOR_RENAME(s16_from_s8)(__ivec s,__ivec* hipart)
 {
-    return _ivec_interleave_lo_u16(s,_ivec_setzero());
+    const __ivec izero = _ivec_setzero();
+    __ivec filler;
+    filler = _ivec_cmpgt_s8(izero,s);
+    *hipart = _ivec_interleave_hi_u8(s,filler);
+    return _ivec_interleave_lo_u8(s,filler);
 }
-#undef _ivec_u32_from_lou16
-#define _ivec_u32_from_lou16 PVECTOR_RENAME(u32_from_lou16)
+#undef _ivec_s16_from_s8
+#define _ivec_s16_from_s8 PVECTOR_RENAME(s16_from_s8)
+
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
-PVECTOR_RENAME(u32_from_hiu16)(__ivec s)
+PVECTOR_RENAME(s32_from_s16)(__ivec s,__ivec* hipart)
 {
-    return _ivec_interleave_hi_u16(s,_ivec_setzero());
+    const __ivec izero = _ivec_setzero();
+    __ivec filler;
+    filler = _ivec_cmpgt_s16(izero,s);
+    *hipart = _ivec_interleave_hi_u16(s,filler);
+    return _ivec_interleave_lo_u16(s,filler);
 }
-#undef _ivec_u32_from_hiu16
-#define _ivec_u32_from_hiu16 PVECTOR_RENAME(u32_from_hiu16)
+#undef _ivec_s32_from_s16
+#define _ivec_s32_from_s16 PVECTOR_RENAME(s32_from_s16)
+
+
 extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
 PVECTOR_RENAME(s16_from_s32)(__ivec s1, __ivec s2)
 {
@@ -840,3 +857,61 @@
 }
 #undef _ivec_srl_s64_imm
 #define _ivec_srl_s64_imm PVECTOR_RENAME(srl_s64_imm)
+
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_u16_from_u8)(__ivec s,__ivec *hipart)
+{
+#if 0 /* slower but portable on non-x86 CPUs version */
+    __ivec tmp[2];
+    tmp[0] = _ivec_u16_from_u8(s,&tmp[1]);
+    *hipart = _ivec_sll_s16_imm(tmp[1],8);
+    return _ivec_sll_s16_imm(tmp[0],8);
+#else
+    __ivec filler = _ivec_setzero();
+    *hipart = _ivec_interleave_hi_u8(filler,s);
+    return _ivec_interleave_lo_u8(filler,s);
+#endif
+}
+#undef _ivec_scale_u16_from_u8
+#define _ivec_scale_u16_from_u8 PVECTOR_RENAME(scale_u16_from_u8)
+
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_u32_from_u16)(__ivec s,__ivec *hipart)
+{
+#if 0 /* slower but portable on non-x86 CPUs version */
+    __ivec tmp[2];
+    tmp[0] = _ivec_u32_from_u16(s,&tmp[1]);
+    *hipart = _ivec_sll_s32_imm(tmp[1],16);
+    return _ivec_sll_s32_imm(tmp[0],16);
+#else
+    __ivec filler = _ivec_setzero();
+    *hipart = _ivec_interleave_hi_u16(filler,s);
+    return _ivec_interleave_lo_u16(filler,s);
+#endif
+}
+#undef _ivec_scale_u32_from_u16
+#define _ivec_scale_u32_from_u16 PVECTOR_RENAME(scale_u32_from_u16)
+
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_s16_from_s32)(__ivec s1, __ivec s2)
+{
+    return _ivec_s16_from_s32(_ivec_sra_s32_imm(s1,16),_ivec_sra_s32_imm(s2,16));
+}
+#undef _ivec_scale_s16_from_s32
+#define _ivec_scale_s16_from_s32 PVECTOR_RENAME(scale_s16_from_s32)
+
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_s8_from_s16)(__ivec s1, __ivec s2)
+{
+    return _ivec_s8_from_s16(_ivec_sra_s16_imm(s1,8),_ivec_sra_s16_imm(s2,8));
+}
+#undef _ivec_scale_s8_from_s16
+#define _ivec_scale_s8_from_s16 PVECTOR_RENAME(scale_s8_from_s16)
+extern __inline __ivec __attribute__((__gnu_inline__, __always_inline__))
+PVECTOR_RENAME(scale_u8_from_u16)(__ivec s1, __ivec s2)
+{
+    return _ivec_u8_from_u16(_ivec_sra_s16_imm(s1,8),_ivec_sra_s16_imm(s2,8));
+}
+#undef _ivec_scale_u8_from_u16
+#define _ivec_scale_u8_from_u16 PVECTOR_RENAME(scale_u8_from_u16)
+
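The new s16_from_s8/s32_from_s16 helpers above sign-extend without a dedicated widening instruction: comparing zero against the input yields an all-ones mask exactly in the negative lanes, and interleaving that mask in as the high part reproduces two's-complement sign extension. A small self-contained check of the same idiom in plain SSE2 (ad-hoc names, little-endian x86 assumed; an illustration, not the project's code):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i s16_from_s8_sketch(__m128i s, __m128i *hipart)
{
    __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), s); /* 0xFF where s < 0 */
    *hipart = _mm_unpackhi_epi8(s, sign);  /* upper 8 bytes, sign-extended */
    return _mm_unpacklo_epi8(s, sign);     /* lower 8 bytes, sign-extended */
}

int main(void)
{
    int8_t  in[16] = { -128, -61, -2, -1, 0, 1, 2, 127, 5, -5, 10, -10, 64, -64, 100, -100 };
    int16_t out[16];
    __m128i hi, lo = s16_from_s8_sketch(_mm_loadu_si128((const __m128i *)in), &hi);
    _mm_storeu_si128((__m128i *)&out[0], lo);
    _mm_storeu_si128((__m128i *)&out[8], hi);
    for (int i = 0; i < 16; i++)
        printf("%4d -> %6d\n", in[i], out[i]);  /* every value is preserved */
    return 0;
}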