[Mplayerxp-cvslog] SF.net SVN: mplayerxp:[112] mplayerxp
Brought to you by:
olov
From: <nic...@us...> - 2010-01-19 17:51:24
|
Revision: 112 http://mplayerxp.svn.sourceforge.net/mplayerxp/?rev=112&view=rev Author: nickols_k Date: 2010-01-19 17:51:18 +0000 (Tue, 19 Jan 2010) Log Message: ----------- cumulative pathc: enable slices in vf_fmtcvt + pvector based MMX optimization Modified Paths: -------------- TOOLS/Makefile TOOLS/asmopt.c etc/codecs.conf mplayerxp/libmpcodecs/dec_video.c mplayerxp/postproc/dsp.c mplayerxp/postproc/dsp_accel.h mplayerxp/postproc/vf.c mplayerxp/postproc/vf_scale.c Added Paths: ----------- mplayerxp/postproc/dsp_accelf.h Modified: TOOLS/Makefile =================================================================== --- TOOLS/Makefile 2010-01-18 18:37:48 UTC (rev 111) +++ TOOLS/Makefile 2010-01-19 17:51:18 UTC (rev 112) @@ -1,6 +1,6 @@ -include ../mplayerxp/config.mak +include ../mplayerxp/mp_config.mak -CFLAGS = -I.. +CFLAGS = -g3 -I.. OBJS = bios2dump$(EXESUF) \ mem2dump$(EXESUF) \ @@ -12,7 +12,7 @@ ../mplayerxp/libmpdemux/freesdp/libfreesdp.a endif MP_LIBS += ../mplayerxp/libmpcodecs/libmpcodecs.a ../mplayerxp/libao2/libao2.a ../mplayerxp/postproc/libpostproc.a \ -../mplayerxp/postproc/libmenu/libmenu.a ../mplayerxp/input/libinput.a ../mplayerxp/libvo/libvo.a \ +../mplayerxp/input/libinput.a ../mplayerxp/libvo/libvo.a \ ../mplayerxp/osdep/libosdep.a ../mplayerxp/mp_msg.o ../mplayerxp/nls/libnls.a ../mplayerxp/cpudetect.o COMMON_LIBS = $(MP_LIBS) $(EXTRALIBS) -lm Modified: TOOLS/asmopt.c =================================================================== --- TOOLS/asmopt.c 2010-01-18 18:37:48 UTC (rev 111) +++ TOOLS/asmopt.c 2010-01-19 17:51:18 UTC (rev 112) @@ -20,66 +20,48 @@ #include "../mplayerxp/mp_config.h" #include "../mplayerxp/cpudetect.h" -#if defined( ARCH_X86 ) || defined(ARCH_X86_64) -#define CAN_COMPILE_X86_ASM -#endif +#undef OPTIMIZE_AVX +#undef OPTIMIZE_SSE4 +#undef OPTIMIZE_SSSE3 +#undef OPTIMIZE_SSE3 +#undef OPTIMIZE_SSE2 +#undef OPTIMIZE_SSE +#undef OPTIMIZE_MMX2 +#undef OPTIMIZE_MMX +#define RENAME(a) a ## _C +#include "asmopt_template.h" -#ifdef CAN_COMPILE_X86_ASM -#undef HAVE_MMX -#undef HAVE_MMX2 -#undef HAVE_SSE2 -#undef HAVE_SSE3 -#undef HAVE_SSSE3 -#undef HAVE_SSE4 -#undef HAVE_AVX - -/*MMX versions*/ +#ifdef __MMX__ +#define OPTIMIZE_MMX #undef RENAME -#define HAVE_MMX -#undef HAVE_MMX2 #define RENAME(a) a ## _MMX #include "asmopt_template.h" - -/*MMX2 versions*/ +#endif +#ifdef __SSE__ +#define OPTIMIZE_MMX2 #undef RENAME -#define HAVE_MMX -#define HAVE_MMX2 #define RENAME(a) a ## _MMX2 #include "asmopt_template.h" - -/*SSE2 versions*/ +#endif +#ifdef __SSE2__ +#define OPTIMIZE_SSE2 #undef RENAME -#define HAVE_MMX -#define HAVE_MMX2 -#define HAVE_SSE -#define HAVE_SSE2 #define RENAME(a) a ## _SSE2 #include "asmopt_template.h" - -/*SSE2 versions*/ +#endif +#ifdef __SSE3__ +#define OPTIMIZE_SSE3 #undef RENAME -#define HAVE_MMX -#define HAVE_MMX2 -#define HAVE_SSE -#define HAVE_SSE2 -#define HAVE_SSE3 #define RENAME(a) a ## _SSE3 #include "asmopt_template.h" - #endif - -/* generic version */ +#ifdef __SSE4_1__ +#define OPTIMIZE_SSE4 #undef RENAME -#undef HAVE_MMX -#undef HAVE_MMX2 -#undef HAVE_SSE2 -#undef HAVE_SSE3 -#undef HAVE_SSSE3 -#undef HAVE_SSE4 -#undef HAVE_AVX -#define RENAME(a) a ## _C +#define RENAME(a) a ## _SSE4 #include "asmopt_template.h" +#endif #define ARR_SIZE (1024*64*2)*10 unsigned verbose=1; @@ -155,11 +137,17 @@ gCpuCaps.hasSSE, gCpuCaps.hasSSE2); test_simd("asmopt.gen" ,"GENERIC:",convert_C); -#ifdef CAN_COMPILE_X86_ASM // ordered per speed fasterst first +#ifdef __SSE3__ if(gCpuCaps.hasSSE3) test_simd("asmopt.sse3","SSE3 :",convert_SSE3); +#endif +#ifdef __SSE2__ if(gCpuCaps.hasSSE2) test_simd("asmopt.sse2","SSE2 :",convert_SSE2); +#endif +#ifdef __MMX2__ if(gCpuCaps.hasMMX2) test_simd("asmopt.mmx2","MMX2 :",convert_MMX2); +#endif +#ifdef __MMX__ if(gCpuCaps.hasMMX) test_simd("asmopt.mmx", "MMX :",convert_MMX); #endif return 0; Modified: etc/codecs.conf =================================================================== --- etc/codecs.conf 2010-01-18 18:37:48 UTC (rev 111) +++ etc/codecs.conf 2010-01-19 17:51:18 UTC (rev 112) @@ -2219,7 +2219,7 @@ videocodec ffwmv3 info "Windows Media Video 9 DMO" - status untested + status working fourcc WMV3,wmv3 fourcc WMVP,wmvp driver ffmpeg Modified: mplayerxp/libmpcodecs/dec_video.c =================================================================== --- mplayerxp/libmpcodecs/dec_video.c 2010-01-18 18:37:48 UTC (rev 111) +++ mplayerxp/libmpcodecs/dec_video.c 2010-01-19 17:51:18 UTC (rev 112) @@ -167,8 +167,9 @@ if(enable_gomp) { smp_num_cpus=omp_get_num_procs(); use_vf_threads=0; - MSG_DBG2("[mpdec] vf_flags=%08X num_cpus=%u\n",sh_video->vf_flags,smp_num_cpus); if(((sh_video->vf_flags&MPDEC_THREAD_COND)==MPDEC_THREAD_COND) && (smp_num_cpus>1)) use_vf_threads=1; + if(use_vf_threads) + MSG_STATUS("[mpdec] will perform parallel video-filter on %u CPUs\n",smp_num_cpus); } #else MSG_V("[mpdec] GOMP was not compiled-in! Using single threaded video filtering!\n"); Modified: mplayerxp/postproc/dsp.c =================================================================== --- mplayerxp/postproc/dsp.c 2010-01-18 18:37:48 UTC (rev 111) +++ mplayerxp/postproc/dsp.c 2010-01-19 17:51:18 UTC (rev 112) @@ -20,6 +20,59 @@ #endif #include "dsp.h" +extern uint32_t load24bit(void* data, int pos); +extern void store24bit(void* data, int pos, uint32_t expanded_value); + +/* MMX optimized stugff */ +#include <limits.h> +#include "../mp_config.h" +#include "../cpudetect.h" + +#undef OPTIMIZE_AVX +#undef OPTIMIZE_SSE4 +#undef OPTIMIZE_SSSE3 +#undef OPTIMIZE_SSE3 +#undef OPTIMIZE_SSE2 +#undef OPTIMIZE_SSE +#undef OPTIMIZE_MMX2 +#undef OPTIMIZE_MMX +#define RENAME(a) a ## _c +#include "dsp_accel.h" +#include "dsp_accelf.h" + +#ifndef __x86_64__ +#ifdef __MMX__ +#define OPTIMIZE_MMX +#undef RENAME +#define RENAME(a) a ## _MMX +#include "dsp_accel.h" +#endif +#ifdef __SSE__ +#define OPTIMIZE_MMX2 +#undef RENAME +#define RENAME(a) a ## _MMX2 +#include "dsp_accel.h" +#endif +#endif //__x86_64__ +#ifdef __SSE2__ +#define OPTIMIZE_SSE2 +#undef RENAME +#define RENAME(a) a ## _SSE2 +#include "dsp_accel.h" +#endif +#ifdef __SSE3__ +#define OPTIMIZE_SSE3 +#undef RENAME +#define RENAME(a) a ## _SSE3 +#include "dsp_accel.h" +#endif +#ifdef __SSE4_1__ +#define OPTIMIZE_SSE4 +#undef RENAME +#define RENAME(a) a ## _SSE4 +#include "dsp_accel.h" +#endif + /****************************************************************************** * FIR filter implementations ******************************************************************************/ @@ -199,7 +252,7 @@ g += w[end-i-1] * (t3 + t2); // Total gain in filter w[end-i-1] = w[n-end+i] = w[end-i-1] * (t2 - t3); } - } + } else{ // Band stop if (!o) // Band stop filters must have odd length return -1; @@ -219,9 +272,9 @@ // Normalize gain g=1/g; - for (i=0; i<n; i++) + for (i=0; i<n; i++) w[i] *= g; - + return 0; } @@ -244,7 +297,7 @@ int i; // Counters int j; _ftype_t t; // g * w[i] - + // Sanity check if(l<1 || k<1 || !w || !pw) return -1; @@ -277,7 +330,7 @@ /* Pre-warp the coefficients of a numerator or denominator. Note that a0 is assumed to be 1, so there is no wrapping - of it. + of it. */ void __FASTCALL__ prewarp(_ftype_t* a, _ftype_t fc, _ftype_t fs) { @@ -560,7 +613,7 @@ *w++ = 0.2810638602 - 0.5208971735*cos(k1*(_ftype_t)i) + 0.1980389663*cos(k2*(_ftype_t)i); } -/* Computes the 0th order modified Bessel function of the first kind. +/* Computes the 0th order modified Bessel function of the first kind. // (Needed to compute Kaiser window) // // y = sum( (x/(2*n))^2 ) @@ -654,81 +707,30 @@ bp->prev = bp->pprev = 0.0; } -extern uint32_t load24bit(void* data, int pos); -extern void store24bit(void* data, int pos, uint32_t expanded_value); - - -/* MMX optimized stugff */ -#include <limits.h> -#include "../mp_config.h" -#include "../cpudetect.h" - -#undef HAVE_MMX -#undef HAVE_MMX2 -#undef HAVE_3DNOW -#undef HAVE_3DNOW2 -#undef HAVE_SSE -#define RENAME(a) a ## _c -#include "dsp_accel.h" - -#if defined( ARCH_X86 ) || defined(ARCH_X86_64) -#define CAN_COMPILE_X86_ASM +static void __FASTCALL__ init_change_bps(const void* in, void* out, unsigned len, unsigned inbps, unsigned outbps) +{ +#ifdef __SSE4_1__ + if(gCpuCaps.hasSSE41) change_bps = change_bps_SSE4; + else #endif -#ifdef CAN_COMPILE_X86_ASM -//MMX versions -#ifdef CAN_COMPILE_MMX -#undef RENAME -#define HAVE_MMX -#undef HAVE_MMX2 -#undef HAVE_3DNOW -#define RENAME(a) a ## _MMX -#include "dsp_accel.h" +#ifdef __SSE3__ + if(gCpuCaps.hasSSE3) change_bps = change_bps_SSE3; + else #endif - -//3DNow! versions -#ifdef CAN_COMPILE_3DNOW -#undef RENAME -#define HAVE_MMX -#undef HAVE_MMX2 -#define HAVE_3DNOW -#define RENAME(a) a ## _3DNow -#include "dsp_accel.h" +#ifdef __SSE2__ + if(gCpuCaps.hasSSE2) change_bps = change_bps_SSE2; + else #endif - -//3DNowEx! versions -#ifdef CAN_COMPILE_3DNOW2 -#undef RENAME -#define HAVE_MMX -#define HAVE_MMX2 -#define HAVE_3DNOW -#define HAVE_3DNOW2 -#define RENAME(a) a ## _3DNowEx -#include "dsp_accel.h" -#endif - -//MMX2 versions -#ifdef CAN_COMPILE_MMX2 -#undef RENAME -#define HAVE_MMX -#define HAVE_MMX2 -#undef HAVE_3DNOW -#define RENAME(a) a ## _MMX2 -#include "dsp_accel.h" -#endif - -#endif - -static void __FASTCALL__ init_change_bps(const void* in, void* out, unsigned len, unsigned inbps, unsigned outbps) -{ -#ifdef CAN_COMPILE_MMX -/* disable these functions for speed reason ! +#ifndef __x86_64__ +#ifdef __SSE__ if(gCpuCaps.hasMMX2) change_bps = change_bps_MMX2; else - if(gCpuCaps.has3DNow) change_bps = change_bps_3DNow; - else */ +#endif +#ifdef __MMX__ if(gCpuCaps.hasMMX) change_bps = change_bps_MMX; else -#endif //CAN_COMPILE_X86_ASM +#endif +#endif /* __x86_64__ */ change_bps = change_bps_c; (*change_bps)(in,out,len,inbps,outbps); } @@ -736,7 +738,7 @@ static void __FASTCALL__ init_float2int(void* in, void* out, int len, int bps) { -#ifdef CAN_COMPILE_X86_ASM +#if 0 #ifdef CAN_COMPILE_3DNOW2 if(gCpuCaps.has3DNowExt) float2int = float2int_3DNowEx; else @@ -753,7 +755,7 @@ static void __FASTCALL__ init_int2float(void* in, void* out, int len, int bps) { -#ifdef CAN_COMPILE_X86_ASM +#if 0 #ifdef CAN_COMPILE_3DNOW2 if(gCpuCaps.has3DNowExt) int2float = int2float_3DNowEx; else @@ -771,16 +773,20 @@ static int32_t __FASTCALL__ FIR_i16_init(int16_t *x,int16_t *w) { -#ifdef CAN_COMPILE_X86_ASM -#ifdef CAN_COMPILE_MMX2 +#ifdef __SSE2__ + if(gCpuCaps.hasSSE2) FIR_i16 = FIR_i16_SSE2; + else +#endif +#ifndef __x86_64__ +#ifdef __SSE__ if(gCpuCaps.hasMMX2) FIR_i16 = FIR_i16_MMX2; else #endif -#ifdef CAN_COMPILE_MMX +#ifdef __MMX__ if(gCpuCaps.hasMMX) FIR_i16 = FIR_i16_MMX; else #endif -#endif /*CAN_COMPILE_X86_ASM*/ +#endif /*__x86_64__*/ FIR_i16 = FIR_i16_c; return (*FIR_i16)(x,w); } @@ -788,7 +794,7 @@ static float __FASTCALL__ FIR_f32_init(float *x,float *w) { -#ifdef CAN_COMPILE_X86_ASM +#if 0 // if(gCpuCaps.hasSSE) FIR_f32 = FIR_f32_SSE; // else #ifdef CAN_COMPILE_3DNOW Modified: mplayerxp/postproc/dsp_accel.h =================================================================== --- mplayerxp/postproc/dsp_accel.h 2010-01-18 18:37:48 UTC (rev 111) +++ mplayerxp/postproc/dsp_accel.h 2010-01-19 17:51:18 UTC (rev 112) @@ -1,18 +1,64 @@ /* DSP acceleration routines */ -#include "../mmx_defs.h" +#include "pvector/pvector.h" +#ifdef HAVE_INT_PVECTOR +static __inline __m64 __attribute__((__gnu_inline__, __always_inline__)) +RENAME(_m_load)(const void *__P) +{ + return *(const __m64 *)__P; +} +#undef _m_load +#define _m_load RENAME(_m_load) +static __inline __m64 __attribute__((__gnu_inline__, __always_inline__)) +RENAME(_m_load_half)(const void *__P) +{ + return _mm_cvtsi32_si64 (*(const int *)__P); +} +#undef _m_load_half +#define _m_load_half RENAME(_m_load_half) + +static __inline void __attribute__((__gnu_inline__, __always_inline__)) +RENAME(_m_store)(void *__P, __m64 src) +{ + *(__m64 *)__P = src; +} +#undef _m_store +#define _m_store RENAME(_m_store) + +static __inline void __attribute__((__gnu_inline__, __always_inline__)) +RENAME(_m_store_half)(void *__P, __m64 src) +{ + *(int *)__P = _mm_cvtsi64_si32(src); +} +#undef _m_store_half +#define _m_store_half RENAME(_m_store_half) + +static __inline void __attribute__((__gnu_inline__, __always_inline__)) +RENAME(_m_movntq)(void *__P, __m64 src) +{ +#ifdef HAVE_MMX2 + _mm_stream_pi(__P,src); +#else + _m_store(__P,src); +#endif +} +#undef _m_movntq +#define _m_movntq RENAME(_m_movntq) +#endif + static void __FASTCALL__ RENAME(change_bps)(const void* in_data, void* out_data, unsigned len, unsigned inbps, unsigned outbps) { -#ifdef HAVE_MMX - unsigned len_mm; +#ifdef HAVE_INT_PVECTOR + __ivec izero = _ivec_setzero(); + unsigned len_mm,j; #endif - register unsigned i; + unsigned i; // Change the number of bits switch(inbps){ case 1: switch(outbps){ case 2: - i=0; + i=0; for(;i<len;i++) ((uint16_t*)out_data)[i]=((uint16_t)((uint8_t*)in_data)[i])<<8; break; @@ -23,7 +69,7 @@ ((uint8_t*)out_data)[3*i+2]=(((uint8_t*)in_data)[i]); break; case 4: - i=0; + i=0; for(;i<len;i++) ((uint32_t*)out_data)[i]=((uint32_t)((uint8_t*)in_data)[i])<<24; break; @@ -32,7 +78,7 @@ case 2: switch(outbps){ case 1: - i=0; + i=0; for(;i<len;i++) ((uint8_t*)out_data)[i]=(uint8_t)((((uint16_t*)in_data)[i])>>8); break; @@ -44,48 +90,27 @@ break; case 4: i=0; -#ifdef HAVE_MMX - len_mm=len&(~0xF); - for(;i<len_mm;i+=16) +#ifdef HAVE_INT_PVECTOR + j=0; + len_mm=len&(~(__IVEC_SIZE-1)); + for(;i<len;i++,j+=2){ + ((uint32_t*)out_data)[i]=((uint32_t)((uint16_t*)in_data)[i])<<16; + if((((long)out_data)&(__IVEC_SIZE-1))==0) break; + } + if((len_mm-i)>=__IVEC_SIZE) + for(;i<len_mm;i+=__IVEC_SIZE/2,j+=__IVEC_SIZE) { - __asm __volatile( - "movq (%1), %%mm0\n\t" - "movq 8(%1), %%mm1\n\t" - "movq 16(%1), %%mm2\n\t" - "movq 24(%1), %%mm3\n\t" - "pxor %%mm4, %%mm4\n\t" - "pxor %%mm5, %%mm5\n\t" - "pxor %%mm6, %%mm6\n\t" - "pxor %%mm7, %%mm7\n\t" - "punpcklwd %%mm0, %%mm4\n\t" - "punpckhwd %%mm0, %%mm5\n\t" - "punpcklwd %%mm1, %%mm6\n\t" - "punpckhwd %%mm1, %%mm7\n\t" - MOVNTQ" %%mm4, (%0)\n\t" - MOVNTQ" %%mm5, 8(%0)\n\t" - MOVNTQ" %%mm6, 16(%0)\n\t" - MOVNTQ" %%mm7, 24(%0)\n\t" - "pxor %%mm4, %%mm4\n\t" - "pxor %%mm5, %%mm5\n\t" - "pxor %%mm6, %%mm6\n\t" - "pxor %%mm7, %%mm7\n\t" - "punpcklwd %%mm2, %%mm4\n\t" - "punpckhwd %%mm2, %%mm5\n\t" - "punpcklwd %%mm3, %%mm6\n\t" - "punpckhwd %%mm3, %%mm7\n\t" - MOVNTQ" %%mm4, 32(%0)\n\t" - MOVNTQ" %%mm5, 40(%0)\n\t" - MOVNTQ" %%mm6, 48(%0)\n\t" - MOVNTQ" %%mm7, 56(%0)\n\t" - ::"r"(&(((uint32_t*)out_data)[i])),"r"(&(((uint16_t*)in_data)[i])) - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED + __ivec ind,tmp[2]; + ind = _ivec_loadu(&((uint8_t *)in_data)[j]); +#if 0 /* slower but portable on non-x86 CPUs version */ + tmp[0]= _ivec_sll_s32_imm(_ivec_u32_from_lou16(ind),16); + tmp[1]= _ivec_sll_s32_imm(_ivec_u32_from_hiu16(ind),16); +#else + tmp[0]= _ivec_interleave_lo_u16(izero,ind); + tmp[1]= _ivec_interleave_hi_u16(izero,ind); #endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); + _ivec_storea(&((uint8_t *)out_data)[j*2],tmp[0]); + _ivec_storea(&((uint8_t *)out_data)[j*2+__IVEC_SIZE],tmp[1]); } #endif for(;i<len;i++) @@ -120,50 +145,27 @@ case 4: switch(outbps){ case 1: - i=0; + i=0; for(;i<len;i++) ((uint8_t*)out_data)[i]=(uint8_t)((((uint32_t*)in_data)[i])>>24); break; case 2: i=0; -#ifdef HAVE_MMX - len_mm=len&(~0xF); - for(;i<len_mm;i+=16) +#ifdef HAVE_INT_PVECTOR + j=0; + len_mm=len&(~(__IVEC_SIZE-1)); + for(;i<len;i++,j+=2){ + ((uint16_t*)out_data)[i]=(uint16_t)((((uint32_t*)in_data)[i])>>16); + if((((long)out_data)&(__IVEC_SIZE-1))==0) break; + } + if((len-i)>=__IVEC_SIZE) + for(;i<len_mm;i+=__IVEC_SIZE/2,j+=__IVEC_SIZE) { - __asm __volatile( - "movq (%1), %%mm0\n\t" - "movq 8(%1), %%mm1\n\t" - "movq 16(%1), %%mm2\n\t" - "movq 24(%1), %%mm3\n\t" - "movq 32(%1), %%mm4\n\t" - "movq 40(%1), %%mm5\n\t" - "movq 48(%1), %%mm6\n\t" - "movq 56(%1), %%mm7\n\t" - "psrad $16, %%mm0\n\t" - "psrad $16, %%mm1\n\t" - "psrad $16, %%mm2\n\t" - "psrad $16, %%mm3\n\t" - "psrad $16, %%mm4\n\t" - "psrad $16, %%mm5\n\t" - "psrad $16, %%mm6\n\t" - "psrad $16, %%mm7\n\t" - "packssdw %%mm1, %%mm0\n\t" - "packssdw %%mm3, %%mm2\n\t" - "packssdw %%mm5, %%mm4\n\t" - "packssdw %%mm7, %%mm6\n\t" - MOVNTQ" %%mm0, (%0)\n\t" - MOVNTQ" %%mm2, 8(%0)\n\t" - MOVNTQ" %%mm4, 16(%0)\n\t" - MOVNTQ" %%mm6, 24(%0)\n\t" - ::"r"(&(((uint16_t*)out_data)[i])),"r"(&(((uint32_t*)in_data)[i])) - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); + __ivec ind[2],tmp; + ind[0]= _ivec_sra_s32_imm(_ivec_loadu(&((uint8_t *)in_data)[j*2]),16); + ind[1]= _ivec_sra_s32_imm(_ivec_loadu(&((uint8_t *)in_data)[j*2+__IVEC_SIZE]),16); + tmp = _ivec_s16_from_s32(ind[0],ind[1]); + _ivec_storea(&((uint8_t *)out_data)[j],tmp); } #endif for(;i<len;i++) @@ -176,364 +178,40 @@ ((uint8_t*)out_data)[3*i+2]=(((uint8_t*)in_data)[4*i+3]); break; } - break; + break; } -#ifdef HAVE_MMX -#ifndef HAVE_MMX1 - asm volatile(SFENCE:::"memory"); +#ifdef HAVE_INT_PVECTOR + _ivec_sfence(); + _ivec_empty(); #endif - asm volatile(EMMS:: - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); -#endif /* HAVE_MMX */ } -static void __FASTCALL__ RENAME(float2int)(void* in, void* out, int len, int bps) +static int32_t __FASTCALL__ RENAME(FIR_i16)(int16_t *x,int16_t *w) { -#ifdef HAVE_3DNOW - unsigned len_mm; - float tmp_f32[2]; -#endif - float ftmp; - register int i; - switch(bps){ - case(1): - for(i=0;i<len;i++) { - ftmp=((float*)in)[i]; - SATURATE(ftmp,-1.0,+1.0); - ((int8_t*)out)[i]=(int8_t)lrintf(SCHAR_MAX*ftmp); - } - break; - case(2): - i=0; -#ifdef HAVE_3DNOW - len_mm=len&(~15); - tmp_f32[0]= - tmp_f32[1]=SHRT_MAX; - for(;i<len_mm;i+=16) - { - __asm __volatile( - PREFETCH" 64(%1)\n\t" - PREFETCHW" 32(%0)\n\t" - "movq (%1), %%mm0\n\t" - "movq 8(%1), %%mm1\n\t" - "movq 16(%1), %%mm2\n\t" - "movq 24(%1), %%mm3\n\t" - "movq 32(%1), %%mm4\n\t" - "movq 40(%1), %%mm5\n\t" - "movq 48(%1), %%mm6\n\t" - "movq 56(%1), %%mm7\n\t" - "pfmul %2, %%mm0\n\t" - "pfmul %2, %%mm1\n\t" - "pfmul %2, %%mm2\n\t" - "pfmul %2, %%mm3\n\t" - "pfmul %2, %%mm4\n\t" - "pfmul %2, %%mm5\n\t" - "pfmul %2, %%mm6\n\t" - "pfmul %2, %%mm7\n\t" - "pf2id %%mm0, %%mm0\n\t" - "pf2id %%mm1, %%mm1\n\t" - "pf2id %%mm2, %%mm2\n\t" - "pf2id %%mm3, %%mm3\n\t" - "pf2id %%mm4, %%mm4\n\t" - "pf2id %%mm5, %%mm5\n\t" - "pf2id %%mm6, %%mm6\n\t" - "pf2id %%mm7, %%mm7\n\t" - "packssdw %%mm1, %%mm0\n\t" - "packssdw %%mm3, %%mm2\n\t" - "packssdw %%mm5, %%mm4\n\t" - "packssdw %%mm7, %%mm6\n\t" - "movq %%mm0, (%0)\n\t" - "movq %%mm2, 8(%0)\n\t" - "movq %%mm4, 16(%0)\n\t" - "movq %%mm6, 24(%0)" - ::"r"(&(((uint16_t*)out)[i])),"r"(&(((float*)in)[i])),"m"(tmp_f32[0]) - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); - } -#endif - for(;i<len;i++) { - ftmp=((float*)in)[i]; - SATURATE(ftmp,-1.0,+1.0); - ((int16_t*)out)[i]=(int16_t)lrintf(SHRT_MAX*ftmp); - } - break; - case(3): - for(i=0;i<len;i++) { - ftmp=((float*)in)[i]; - SATURATE(ftmp,-1.0,+1.0); - store24bit(out, i, (int32_t)lrintf((INT_MAX-1)*ftmp)); - } - break; - case(4): - i=0; -#ifdef HAVE_3DNOW - len_mm=len&(~15); - tmp_f32[0]= - tmp_f32[1]=INT_MAX; - for(;i<len_mm;i+=16) - { - __asm __volatile( - PREFETCH" 64(%1)\n\t" - PREFETCHW" 64(%0)\n\t" - "movq (%1), %%mm0\n\t" - "movq 8(%1), %%mm1\n\t" - "movq 16(%1), %%mm2\n\t" - "movq 24(%1), %%mm3\n\t" - "movq 32(%1), %%mm4\n\t" - "movq 40(%1), %%mm5\n\t" - "movq 48(%1), %%mm6\n\t" - "movq 56(%1), %%mm7\n\t" - "pfmul %2, %%mm0\n\t" - "pfmul %2, %%mm1\n\t" - "pfmul %2, %%mm2\n\t" - "pfmul %2, %%mm3\n\t" - "pfmul %2, %%mm4\n\t" - "pfmul %2, %%mm5\n\t" - "pfmul %2, %%mm6\n\t" - "pfmul %2, %%mm7\n\t" - "pf2id %%mm0, %%mm0\n\t" - "pf2id %%mm1, %%mm1\n\t" - "pf2id %%mm2, %%mm2\n\t" - "pf2id %%mm3, %%mm3\n\t" - "pf2id %%mm4, %%mm4\n\t" - "pf2id %%mm5, %%mm5\n\t" - "pf2id %%mm6, %%mm6\n\t" - "pf2id %%mm7, %%mm7\n\t" - "movq %%mm0, (%0)\n\t" - "movq %%mm1, 8(%0)\n\t" - "movq %%mm2, 16(%0)\n\t" - "movq %%mm3, 24(%0)\n\t" - "movq %%mm4, 32(%0)\n\t" - "movq %%mm5, 40(%0)\n\t" - "movq %%mm6, 48(%0)\n\t" - "movq %%mm7, 56(%0)" - ::"r"(&(((uint32_t*)out)[i])),"r"(&(((float*)in)[i])),"m"(tmp_f32[0]) - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); - } -#endif - for(;i<len;i++) { - ftmp=((float*)in)[i]; - SATURATE(ftmp,-1.0,+1.0); - ((int32_t*)out)[i]=(int32_t)lrintf((INT_MAX-1)*ftmp); - } - break; - } -#ifdef HAVE_3DNOW - asm volatile(EMMS:: - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); -#endif -} +#ifdef OPTIMIZE_MMX + __m64 mm[8]; + mm[0] = _m_load(&w[0]); + mm[1] = _m_load(&w[4]); + mm[2] = _m_load(&w[8]); + mm[3] = _m_load(&w[12]); -static void __FASTCALL__ RENAME(int2float)(void* in, void* out, int len, int bps) -{ -#ifdef HAVE_3DNOW - unsigned len_mm; - float tmp_f32[2]; -#endif - register int i; - switch(bps){ - case(1): - for(i=0;i<len;i++) - ((float*)out)[i]=(1.0/SCHAR_MAX)*((float)((int8_t*)in)[i]); - break; - case(2): - i=0; -#ifdef HAVE_3DNOW - tmp_f32[0]= - tmp_f32[1]=1.0/INT_MAX; - len_mm=len&(~15); - for(;i<len_mm;i+=16) - { - __asm __volatile( - PREFETCH" 32(%1)\n\t" - PREFETCHW" 64(%0)\n\t" - "movq (%1), %%mm0\n\t" - "movq 8(%1), %%mm1\n\t" - "movq 16(%1), %%mm2\n\t" - "movq 24(%1), %%mm3\n\t" - "pxor %%mm4, %%mm4\n\t" - "pxor %%mm5, %%mm5\n\t" - "pxor %%mm6, %%mm6\n\t" - "pxor %%mm7, %%mm7\n\t" - "punpcklwd %%mm0, %%mm4\n\t" - "punpckhwd %%mm0, %%mm5\n\t" - "punpcklwd %%mm1, %%mm6\n\t" - "punpckhwd %%mm1, %%mm7\n\t" - "pi2fd %%mm4, %%mm4\n\t" - "pi2fd %%mm5, %%mm5\n\t" - "pi2fd %%mm6, %%mm6\n\t" - "pi2fd %%mm7, %%mm7\n\t" - "pfmul %2, %%mm4\n\t" - "pfmul %2, %%mm5\n\t" - "pfmul %2, %%mm6\n\t" - "pfmul %2, %%mm7\n\t" - "movq %%mm4, (%0)\n\t" - "movq %%mm5, 8(%0)\n\t" - "movq %%mm6, 16(%0)\n\t" - "movq %%mm7, 24(%0)\n\t" - "pxor %%mm4, %%mm4\n\t" - "pxor %%mm5, %%mm5\n\t" - "pxor %%mm6, %%mm6\n\t" - "pxor %%mm7, %%mm7\n\t" - "punpcklwd %%mm2, %%mm4\n\t" - "punpckhwd %%mm2, %%mm5\n\t" - "punpcklwd %%mm3, %%mm6\n\t" - "punpckhwd %%mm3, %%mm7\n\t" - "pi2fd %%mm4, %%mm4\n\t" - "pi2fd %%mm5, %%mm5\n\t" - "pi2fd %%mm6, %%mm6\n\t" - "pi2fd %%mm7, %%mm7\n\t" - "pfmul %2, %%mm4\n\t" - "pfmul %2, %%mm5\n\t" - "pfmul %2, %%mm6\n\t" - "pfmul %2, %%mm7\n\t" - "movq %%mm4, 32(%0)\n\t" - "movq %%mm5, 40(%0)\n\t" - "movq %%mm6, 48(%0)\n\t" - "movq %%mm7, 56(%0)\n\t" - "femms" - ::"r"(&(((float*)out)[i])),"r"(&(((int16_t*)in)[i])),"m"(tmp_f32[0]) - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); - } -#endif - for(;i<len;i++) - ((float*)out)[i]=(1.0/SHRT_MAX)*((float)((int16_t*)in)[i]); - break; - case(3): - for(i=0;i<len;i++) - ((float*)out)[i]=(1.0/INT_MAX)*((float)((int32_t)load24bit(in, i))); - break; - case(4): - i=0; -#ifdef HAVE_3DNOW - tmp_f32[0]= - tmp_f32[1]=1.0/INT_MAX; - len_mm=len&(~15); - for(;i<len_mm;i+=16) - { - __asm __volatile( - PREFETCH" 64(%1)\n\t" - PREFETCHW" 64(%0)\n\t" - "movq (%1), %%mm0\n\t" - "movq 8(%1), %%mm1\n\t" - "movq 16(%1), %%mm2\n\t" - "movq 24(%1), %%mm3\n\t" - "movq 32(%1), %%mm4\n\t" - "movq 40(%1), %%mm5\n\t" - "movq 48(%1), %%mm6\n\t" - "movq 56(%1), %%mm7\n\t" - "pi2fd %%mm0, %%mm0\n\t" - "pi2fd %%mm1, %%mm1\n\t" - "pi2fd %%mm2, %%mm2\n\t" - "pi2fd %%mm3, %%mm3\n\t" - "pi2fd %%mm4, %%mm4\n\t" - "pi2fd %%mm5, %%mm5\n\t" - "pi2fd %%mm6, %%mm6\n\t" - "pi2fd %%mm7, %%mm7\n\t" - "pfmul %2, %%mm0\n\t" - "pfmul %2, %%mm1\n\t" - "pfmul %2, %%mm2\n\t" - "pfmul %2, %%mm3\n\t" - "pfmul %2, %%mm4\n\t" - "pfmul %2, %%mm5\n\t" - "pfmul %2, %%mm6\n\t" - "pfmul %2, %%mm7\n\t" - "movq %%mm0, (%0)\n\t" - "movq %%mm1, 8(%0)\n\t" - "movq %%mm2, 16(%0)\n\t" - "movq %%mm3, 24(%0)\n\t" - "movq %%mm4, 32(%0)\n\t" - "movq %%mm5, 40(%0)\n\t" - "movq %%mm6, 48(%0)\n\t" - "movq %%mm7, 56(%0)\n\t" - "femms" - ::"r"(&(((float*)out)[i])),"r"(&(((int32_t*)in)[i])),"m"(tmp_f32[0]) - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); - } -#endif - for(;i<len;i++) - ((float*)out)[i]=(1.0/INT_MAX)*((float)((int32_t*)in)[i]); - break; - } -} + mm[4] = _m_pmaddwd(mm[0],_m_load(&x[0])); + mm[5] = _m_pmaddwd(mm[1],_m_load(&x[4])); + mm[6] = _m_pmaddwd(mm[2],_m_load(&x[8])); + mm[7] = _m_pmaddwd(mm[3],_m_load(&x[12])); -static int32_t __FASTCALL__ RENAME(FIR_i16)(int16_t *x,int16_t *w) -{ -#ifdef HAVE_MMX - int32_t rval; - __asm __volatile( - "movq (%1), %%mm0\n\t" - "movq 8(%1), %%mm1\n\t" - "movq 16(%1), %%mm2\n\t" - "movq 24(%1), %%mm3\n\t" - "pmaddwd (%2), %%mm0\n\t" - "pmaddwd 8(%2), %%mm1\n\t" - "pmaddwd 16(%2), %%mm2\n\t" - "pmaddwd 24(%2), %%mm3\n\t" - "paddd %%mm1, %%mm0\n\t" - "paddd %%mm3, %%mm2\n\t" - "paddd %%mm2, %%mm0\n\t" -#ifdef HAVE_MMX2 - "pshufw $0xFE, %%mm0, %%mm1\n\t" + mm[0] = _m_paddd(mm[4],mm[5]); + mm[1] = _m_paddd(mm[6],mm[7]); + mm[2] = _m_paddd(mm[0],mm[1]); +#ifdef OPTIMIZE_MMX2 + mm[0] = _m_pshufw(mm[2],0xFE); #else - "movq %%mm0, %%mm1\n\t" - "psrlq $32, %%mm1\n\t" + mm[0] = mm[2]; + mm[0] = _m_psrlqi(mm[0],32); #endif - "paddd %%mm1, %%mm0\n\t" - "psrld $16, %%mm0\n\t" - "movd %%mm0, %0\n\t" - "emms" - :"=&r"(rval):"r"(w),"r"(x) - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); - return rval; + mm[0] = _m_paddd(mm[0],mm[2]); + mm[0] = _m_psrldi(mm[0],16); + return _mm_cvtsi64_si32(mm[0]); #else return ( w[0] *x[0] +w[1] *x[1] +w[2] *x[2] +w[3] *x[3] + w[4] *x[4] +w[5] *x[5] +w[6] *x[6] +w[7] *x[7] @@ -542,51 +220,3 @@ #endif } -static float __FASTCALL__ RENAME(FIR_f32)(float *x,float *w) -{ -#ifdef HAVE_3DNOW - float rval; - __asm __volatile( - "movq (%1), %%mm0\n\t" - "movq 8(%1), %%mm1\n\t" - "movq 16(%1), %%mm2\n\t" - "movq 24(%1), %%mm3\n\t" - "movq 32(%1), %%mm4\n\t" - "movq 40(%1), %%mm5\n\t" - "movq 48(%1), %%mm6\n\t" - "movq 56(%1), %%mm7\n\t" - "pfmul (%2), %%mm0\n\t" - "pfmul 8(%2), %%mm1\n\t" - "pfmul 16(%2), %%mm2\n\t" - "pfmul 24(%2), %%mm3\n\t" - "pfmul 32(%2), %%mm4\n\t" - "pfmul 40(%2), %%mm5\n\t" - "pfmul 48(%2), %%mm6\n\t" - "pfmul 56(%2), %%mm7\n\t" - "pfadd %%mm1, %%mm0\n\t" - "pfadd %%mm3, %%mm2\n\t" - "pfadd %%mm5, %%mm4\n\t" - "pfadd %%mm7, %%mm6\n\t" - "pfadd %%mm2, %%mm0\n\t" - "pfadd %%mm6, %%mm4\n\t" - "pfadd %%mm4, %%mm0\n\t" - "pfacc %%mm0, %%mm0\n\t" - "movd %%mm0, %0\n\t" - "femms" - :"=&r"(rval):"r"(w),"r"(x) - :"memory" -#ifdef FPU_CLOBBERED - ,FPU_CLOBBERED -#endif -#ifdef MMX_CLOBBERED - ,MMX_CLOBBERED -#endif - ); - return rval; -#else - return ( w[0] *x[0] +w[1] *x[1] +w[2] *x[2] +w[3] *x[3] - + w[4] *x[4] +w[5] *x[5] +w[6] *x[6] +w[7] *x[7] - + w[8] *x[8] +w[9] *x[9] +w[10]*x[10]+w[11]*x[11] - + w[12]*x[12]+w[13]*x[13]+w[14]*x[14]+w[15]*x[15] ); -#endif -} Added: mplayerxp/postproc/dsp_accelf.h =================================================================== --- mplayerxp/postproc/dsp_accelf.h (rev 0) +++ mplayerxp/postproc/dsp_accelf.h 2010-01-19 17:51:18 UTC (rev 112) @@ -0,0 +1,354 @@ +/* DSP floating-point acceleration routines */ + +static void __FASTCALL__ RENAME(float2int)(void* in, void* out, int len, int bps) +{ +#ifdef HAVE_3DNOW + unsigned len_mm; + float tmp_f32[2]; +#endif + float ftmp; + register int i; + switch(bps){ + case(1): + for(i=0;i<len;i++) { + ftmp=((float*)in)[i]; + SATURATE(ftmp,-1.0,+1.0); + ((int8_t*)out)[i]=(int8_t)lrintf(SCHAR_MAX*ftmp); + } + break; + case(2): + i=0; +#ifdef HAVE_3DNOW + len_mm=len&(~15); + tmp_f32[0]= + tmp_f32[1]=SHRT_MAX; + for(;i<len_mm;i+=16) + { + __asm __volatile( + PREFETCH" 64(%1)\n\t" + PREFETCHW" 32(%0)\n\t" + "movq (%1), %%mm0\n\t" + "movq 8(%1), %%mm1\n\t" + "movq 16(%1), %%mm2\n\t" + "movq 24(%1), %%mm3\n\t" + "movq 32(%1), %%mm4\n\t" + "movq 40(%1), %%mm5\n\t" + "movq 48(%1), %%mm6\n\t" + "movq 56(%1), %%mm7\n\t" + "pfmul %2, %%mm0\n\t" + "pfmul %2, %%mm1\n\t" + "pfmul %2, %%mm2\n\t" + "pfmul %2, %%mm3\n\t" + "pfmul %2, %%mm4\n\t" + "pfmul %2, %%mm5\n\t" + "pfmul %2, %%mm6\n\t" + "pfmul %2, %%mm7\n\t" + "pf2id %%mm0, %%mm0\n\t" + "pf2id %%mm1, %%mm1\n\t" + "pf2id %%mm2, %%mm2\n\t" + "pf2id %%mm3, %%mm3\n\t" + "pf2id %%mm4, %%mm4\n\t" + "pf2id %%mm5, %%mm5\n\t" + "pf2id %%mm6, %%mm6\n\t" + "pf2id %%mm7, %%mm7\n\t" + "packssdw %%mm1, %%mm0\n\t" + "packssdw %%mm3, %%mm2\n\t" + "packssdw %%mm5, %%mm4\n\t" + "packssdw %%mm7, %%mm6\n\t" + "movq %%mm0, (%0)\n\t" + "movq %%mm2, 8(%0)\n\t" + "movq %%mm4, 16(%0)\n\t" + "movq %%mm6, 24(%0)" + ::"r"(&(((uint16_t*)out)[i])),"r"(&(((float*)in)[i])),"m"(tmp_f32[0]) + :"memory" +#ifdef FPU_CLOBBERED + ,FPU_CLOBBERED +#endif +#ifdef MMX_CLOBBERED + ,MMX_CLOBBERED +#endif + ); + } +#endif + for(;i<len;i++) { + ftmp=((float*)in)[i]; + SATURATE(ftmp,-1.0,+1.0); + ((int16_t*)out)[i]=(int16_t)lrintf(SHRT_MAX*ftmp); + } + break; + case(3): + for(i=0;i<len;i++) { + ftmp=((float*)in)[i]; + SATURATE(ftmp,-1.0,+1.0); + store24bit(out, i, (int32_t)lrintf((INT_MAX-1)*ftmp)); + } + break; + case(4): + i=0; +#ifdef HAVE_3DNOW + len_mm=len&(~15); + tmp_f32[0]= + tmp_f32[1]=INT_MAX; + for(;i<len_mm;i+=16) + { + __asm __volatile( + PREFETCH" 64(%1)\n\t" + PREFETCHW" 64(%0)\n\t" + "movq (%1), %%mm0\n\t" + "movq 8(%1), %%mm1\n\t" + "movq 16(%1), %%mm2\n\t" + "movq 24(%1), %%mm3\n\t" + "movq 32(%1), %%mm4\n\t" + "movq 40(%1), %%mm5\n\t" + "movq 48(%1), %%mm6\n\t" + "movq 56(%1), %%mm7\n\t" + "pfmul %2, %%mm0\n\t" + "pfmul %2, %%mm1\n\t" + "pfmul %2, %%mm2\n\t" + "pfmul %2, %%mm3\n\t" + "pfmul %2, %%mm4\n\t" + "pfmul %2, %%mm5\n\t" + "pfmul %2, %%mm6\n\t" + "pfmul %2, %%mm7\n\t" + "pf2id %%mm0, %%mm0\n\t" + "pf2id %%mm1, %%mm1\n\t" + "pf2id %%mm2, %%mm2\n\t" + "pf2id %%mm3, %%mm3\n\t" + "pf2id %%mm4, %%mm4\n\t" + "pf2id %%mm5, %%mm5\n\t" + "pf2id %%mm6, %%mm6\n\t" + "pf2id %%mm7, %%mm7\n\t" + "movq %%mm0, (%0)\n\t" + "movq %%mm1, 8(%0)\n\t" + "movq %%mm2, 16(%0)\n\t" + "movq %%mm3, 24(%0)\n\t" + "movq %%mm4, 32(%0)\n\t" + "movq %%mm5, 40(%0)\n\t" + "movq %%mm6, 48(%0)\n\t" + "movq %%mm7, 56(%0)" + ::"r"(&(((uint32_t*)out)[i])),"r"(&(((float*)in)[i])),"m"(tmp_f32[0]) + :"memory" +#ifdef FPU_CLOBBERED + ,FPU_CLOBBERED +#endif +#ifdef MMX_CLOBBERED + ,MMX_CLOBBERED +#endif + ); + } +#endif + for(;i<len;i++) { + ftmp=((float*)in)[i]; + SATURATE(ftmp,-1.0,+1.0); + ((int32_t*)out)[i]=(int32_t)lrintf((INT_MAX-1)*ftmp); + } + break; + } +#ifdef HAVE_3DNOW + asm volatile(EMMS:: + :"memory" +#ifdef FPU_CLOBBERED + ,FPU_CLOBBERED +#endif +#ifdef MMX_CLOBBERED + ,MMX_CLOBBERED +#endif + ); +#endif +} + +static void __FASTCALL__ RENAME(int2float)(void* in, void* out, int len, int bps) +{ +#ifdef HAVE_3DNOW + unsigned len_mm; + float tmp_f32[2]; +#endif + register int i; + switch(bps){ + case(1): + for(i=0;i<len;i++) + ((float*)out)[i]=(1.0/SCHAR_MAX)*((float)((int8_t*)in)[i]); + break; + case(2): + i=0; +#ifdef HAVE_3DNOW + tmp_f32[0]= + tmp_f32[1]=1.0/INT_MAX; + len_mm=len&(~15); + for(;i<len_mm;i+=16) + { + __asm __volatile( + PREFETCH" 32(%1)\n\t" + PREFETCHW" 64(%0)\n\t" + "movq (%1), %%mm0\n\t" + "movq 8(%1), %%mm1\n\t" + "movq 16(%1), %%mm2\n\t" + "movq 24(%1), %%mm3\n\t" + "pxor %%mm4, %%mm4\n\t" + "pxor %%mm5, %%mm5\n\t" + "pxor %%mm6, %%mm6\n\t" + "pxor %%mm7, %%mm7\n\t" + "punpcklwd %%mm0, %%mm4\n\t" + "punpckhwd %%mm0, %%mm5\n\t" + "punpcklwd %%mm1, %%mm6\n\t" + "punpckhwd %%mm1, %%mm7\n\t" + "pi2fd %%mm4, %%mm4\n\t" + "pi2fd %%mm5, %%mm5\n\t" + "pi2fd %%mm6, %%mm6\n\t" + "pi2fd %%mm7, %%mm7\n\t" + "pfmul %2, %%mm4\n\t" + "pfmul %2, %%mm5\n\t" + "pfmul %2, %%mm6\n\t" + "pfmul %2, %%mm7\n\t" + "movq %%mm4, (%0)\n\t" + "movq %%mm5, 8(%0)\n\t" + "movq %%mm6, 16(%0)\n\t" + "movq %%mm7, 24(%0)\n\t" + "pxor %%mm4, %%mm4\n\t" + "pxor %%mm5, %%mm5\n\t" + "pxor %%mm6, %%mm6\n\t" + "pxor %%mm7, %%mm7\n\t" + "punpcklwd %%mm2, %%mm4\n\t" + "punpckhwd %%mm2, %%mm5\n\t" + "punpcklwd %%mm3, %%mm6\n\t" + "punpckhwd %%mm3, %%mm7\n\t" + "pi2fd %%mm4, %%mm4\n\t" + "pi2fd %%mm5, %%mm5\n\t" + "pi2fd %%mm6, %%mm6\n\t" + "pi2fd %%mm7, %%mm7\n\t" + "pfmul %2, %%mm4\n\t" + "pfmul %2, %%mm5\n\t" + "pfmul %2, %%mm6\n\t" + "pfmul %2, %%mm7\n\t" + "movq %%mm4, 32(%0)\n\t" + "movq %%mm5, 40(%0)\n\t" + "movq %%mm6, 48(%0)\n\t" + "movq %%mm7, 56(%0)\n\t" + "femms" + ::"r"(&(((float*)out)[i])),"r"(&(((int16_t*)in)[i])),"m"(tmp_f32[0]) + :"memory" +#ifdef FPU_CLOBBERED + ,FPU_CLOBBERED +#endif +#ifdef MMX_CLOBBERED + ,MMX_CLOBBERED +#endif + ); + } +#endif + for(;i<len;i++) + ((float*)out)[i]=(1.0/SHRT_MAX)*((float)((int16_t*)in)[i]); + break; + case(3): + for(i=0;i<len;i++) + ((float*)out)[i]=(1.0/INT_MAX)*((float)((int32_t)load24bit(in, i))); + break; + case(4): + i=0; +#ifdef HAVE_3DNOW + tmp_f32[0]= + tmp_f32[1]=1.0/INT_MAX; + len_mm=len&(~15); + for(;i<len_mm;i+=16) + { + __asm __volatile( + PREFETCH" 64(%1)\n\t" + PREFETCHW" 64(%0)\n\t" + "movq (%1), %%mm0\n\t" + "movq 8(%1), %%mm1\n\t" + "movq 16(%1), %%mm2\n\t" + "movq 24(%1), %%mm3\n\t" + "movq 32(%1), %%mm4\n\t" + "movq 40(%1), %%mm5\n\t" + "movq 48(%1), %%mm6\n\t" + "movq 56(%1), %%mm7\n\t" + "pi2fd %%mm0, %%mm0\n\t" + "pi2fd %%mm1, %%mm1\n\t" + "pi2fd %%mm2, %%mm2\n\t" + "pi2fd %%mm3, %%mm3\n\t" + "pi2fd %%mm4, %%mm4\n\t" + "pi2fd %%mm5, %%mm5\n\t" + "pi2fd %%mm6, %%mm6\n\t" + "pi2fd %%mm7, %%mm7\n\t" + "pfmul %2, %%mm0\n\t" + "pfmul %2, %%mm1\n\t" + "pfmul %2, %%mm2\n\t" + "pfmul %2, %%mm3\n\t" + "pfmul %2, %%mm4\n\t" + "pfmul %2, %%mm5\n\t" + "pfmul %2, %%mm6\n\t" + "pfmul %2, %%mm7\n\t" + "movq %%mm0, (%0)\n\t" + "movq %%mm1, 8(%0)\n\t" + "movq %%mm2, 16(%0)\n\t" + "movq %%mm3, 24(%0)\n\t" + "movq %%mm4, 32(%0)\n\t" + "movq %%mm5, 40(%0)\n\t" + "movq %%mm6, 48(%0)\n\t" + "movq %%mm7, 56(%0)\n\t" + "femms" + ::"r"(&(((float*)out)[i])),"r"(&(((int32_t*)in)[i])),"m"(tmp_f32[0]) + :"memory" +#ifdef FPU_CLOBBERED + ,FPU_CLOBBERED +#endif +#ifdef MMX_CLOBBERED + ,MMX_CLOBBERED +#endif + ); + } +#endif + for(;i<len;i++) + ((float*)out)[i]=(1.0/INT_MAX)*((float)((int32_t*)in)[i]); + break; + } +} + +static float __FASTCALL__ RENAME(FIR_f32)(float *x,float *w) +{ +#ifdef HAVE_3DNOW + float rval; + __asm __volatile( + "movq (%1), %%mm0\n\t" + "movq 8(%1), %%mm1\n\t" + "movq 16(%1), %%mm2\n\t" + "movq 24(%1), %%mm3\n\t" + "movq 32(%1), %%mm4\n\t" + "movq 40(%1), %%mm5\n\t" + "movq 48(%1), %%mm6\n\t" + "movq 56(%1), %%mm7\n\t" + "pfmul (%2), %%mm0\n\t" + "pfmul 8(%2), %%mm1\n\t" + "pfmul 16(%2), %%mm2\n\t" + "pfmul 24(%2), %%mm3\n\t" + "pfmul 32(%2), %%mm4\n\t" + "pfmul 40(%2), %%mm5\n\t" + "pfmul 48(%2), %%mm6\n\t" + "pfmul 56(%2), %%mm7\n\t" + "pfadd %%mm1, %%mm0\n\t" + "pfadd %%mm3, %%mm2\n\t" + "pfadd %%mm5, %%mm4\n\t" + "pfadd %%mm7, %%mm6\n\t" + "pfadd %%mm2, %%mm0\n\t" + "pfadd %%mm6, %%mm4\n\t" + "pfadd %%mm4, %%mm0\n\t" + "pfacc %%mm0, %%mm0\n\t" + "movd %%mm0, %0\n\t" + "femms" + :"=&r"(rval):"r"(w),"r"(x) + :"memory" +#ifdef FPU_CLOBBERED + ,FPU_CLOBBERED +#endif +#ifdef MMX_CLOBBERED + ,MMX_CLOBBERED +#endif + ); + return rval; +#else + return ( w[0] *x[0] +w[1] *x[1] +w[2] *x[2] +w[3] *x[3] + + w[4] *x[4] +w[5] *x[5] +w[6] *x[6] +w[7] *x[7] + + w[8] *x[8] +w[9] *x[9] +w[10]*x[10]+w[11]*x[11] + + w[12]*x[12]+w[13]*x[13]+w[14]*x[14]+w[15]*x[15] ); +#endif +} Property changes on: mplayerxp/postproc/dsp_accelf.h ___________________________________________________________________ Added: svn:eol-style + native Modified: mplayerxp/postproc/vf.c =================================================================== --- mplayerxp/postproc/vf.c 2010-01-18 18:37:48 UTC (rev 111) +++ mplayerxp/postproc/vf.c 2010-01-19 17:51:18 UTC (rev 112) @@ -618,7 +618,7 @@ { MSG_V("vf_reinit->config %i %i %s=> %i %i %s\n",sw,sh,vo_format_name(sfourcc),w,h,vo_format_name(fmt)); _saved=_this->prev; - vf_scaler=vf_open_filter(_this,sh_video,"scale",NULL); + vf_scaler=vf_open_filter(_this,sh_video,(w==sw&&h==sh)?"fmtcvt":"scale",NULL); if(vf_scaler) { void *sfnc; Modified: mplayerxp/postproc/vf_scale.c =================================================================== --- mplayerxp/postproc/vf_scale.c 2010-01-18 18:37:48 UTC (rev 111) +++ mplayerxp/postproc/vf_scale.c 2010-01-19 17:51:18 UTC (rev 112) @@ -118,11 +118,18 @@ static void __FASTCALL__ print_conf(struct vf_instance_s* vf) { - MSG_INFO("[vf_scale]: scaling [%dx%d,%s] -> [%dx%d,%s]\n", + MSG_INFO("[vf_scale]: in[%dx%d,%s] -> out[%dx%d,%s]\n", vf->priv->sw,vf->priv->sh,vo_format_name(vf->priv->sfmt), vf->priv->w,vf->priv->h,vo_format_name(vf->priv->ofmt)); } +static void __FASTCALL__ print_conf_fmtcvt(struct vf_instance_s* vf) +{ + MSG_INFO("[vf_fmtcvt]: video[%dx%d] in[%s] -> out[%s]\n", + vf->priv->sw,vf->priv->sh,vo_format_name(vf->priv->sfmt), + vo_format_name(vf->priv->ofmt)); +} + static int __FASTCALL__ config(struct vf_instance_s* vf, int width, int height, int d_width, int d_height, unsigned int flags, unsigned int outfmt,void *tune){ @@ -272,7 +279,7 @@ vf->priv->palette[4*i+3]=0; } break; } - case IMGFMT_RGB4: + case IMGFMT_RGB4: case IMGFMT_RG4B: { int i; vf->priv->palette=malloc(4*16); @@ -321,7 +328,7 @@ } } -static int __FASTCALL__ put_slice(struct vf_instance_s* vf, mp_image_t *mpi){ +static int __FASTCALL__ put_frame(struct vf_instance_s* vf, mp_image_t *mpi){ mp_image_t *dmpi;//=mpi->priv; uint8_t *planes[3]; int stride[3]; @@ -342,7 +349,7 @@ stride[2]=mpi->stride[2]; } } - MSG_DBG2("vf_scale.put_slice was called\n"); + MSG_DBG2("vf_scale.put_frame was called\n"); dmpi=vf_get_image(vf->next,vf->priv->fmt, MP_IMGTYPE_TEMP, MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_PREFER_ALIGNED_STRIDE, vf->priv->w, vf->priv->h); @@ -350,6 +357,51 @@ return vf_next_put_slice(vf,dmpi); } +static int __FASTCALL__ put_slice(struct vf_instance_s* vf, mp_image_t *mpi){ + mp_image_t *dmpi;//=mpi->priv; + uint8_t *planes[3],*dplanes[3]; + int stride[3],newy,newh; + planes[0]=mpi->planes[0]; + stride[0]=mpi->stride[0]; + if(mpi->flags&MP_IMGFLAG_PLANAR){ + if(mpi->flags&MP_IMGFLAG_SWAPPED){ + // I420/IYUV (Y,U,V) + planes[1]=mpi->planes[2]; + planes[2]=mpi->planes[1]; + stride[1]=mpi->stride[2]; + stride[2]=mpi->stride[1]; + } else { + // YV12,YVU9,IF09 (Y,V,U) + planes[1]=mpi->planes[1]; + planes[2]=mpi->planes[2]; + stride[1]=mpi->stride[1]; + stride[2]=mpi->stride[2]; + } + } + MSG_DBG2("vf_scale.put_slice was called[%i %i]\n",mpi->y, mpi->h); + dmpi=vf_get_image(vf->next,vf->priv->fmt, + MP_IMGTYPE_TEMP, MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_PREFER_ALIGNED_STRIDE, + vf->priv->w, vf->priv->h); + /* Try to fake first slice*/ + dplanes[0] = dmpi->planes[0]; + if(mpi->flags&MP_IMGFLAG_PLANAR) { + dplanes[1] = dmpi->planes[1]; + dplanes[2] = dmpi->planes[2]; + } + planes[0] += mpi->y*mpi->stride[0]; + dplanes[0] += mpi->y*dmpi->stride[0]; + if(mpi->flags&MP_IMGFLAG_PLANAR){ + planes[1] += (mpi->y>>mpi->chroma_y_shift)*mpi->stride[1]; + planes[2] += (mpi->y>>mpi->chroma_y_shift)*mpi->stride[2]; + dplanes[1]+= (mpi->y>>dmpi->chroma_y_shift)*dmpi->stride[0]; + dplanes[1]+= (mpi->y>>dmpi->chroma_y_shift)*dmpi->stride[0]; + } + scale(vf->priv->ctx, vf->priv->ctx2, planes, stride, 0, mpi->h, dplanes, dmpi->stride, vf->priv->interlaced); + dmpi->y = mpi->y; + dmpi->h = mpi->h; + return vf_next_put_slice(vf,dmpi); +} + static int __FASTCALL__ control(struct vf_instance_s* vf, int request, void* data){ int *table; int *inv_table; @@ -468,7 +520,7 @@ static int __FASTCALL__ vf_open(vf_instance_t *vf,const char* args){ vf->config=config; - vf->put_slice=put_slice; + vf->put_slice=put_frame; vf->query_format=query_format; vf->control= control; vf->uninit=uninit; @@ -499,6 +551,13 @@ return 1; } +static int __FASTCALL__ vf_open_fmtcvt(vf_instance_t *vf,const char* args){ + int retval = vf_open(vf,args); + vf->put_slice=put_slice; + vf->print_conf=print_conf_fmtcvt; + return retval; +} + //global sws_flags from the command line int sws_flags=2; @@ -600,8 +659,8 @@ "fmtcvt", "A'rpi", "", - VF_FLAGS_THREADS, - vf_open + VF_FLAGS_THREADS|VF_FLAGS_SLICES, + vf_open_fmtcvt }; //===========================================================================// This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |