From: Charles 'B. K. <kr...@us...> - 2001-02-07 02:57:38
|
Update of /cvsroot/libdv/libdv In directory usw-pr-cvs1:/tmp/cvs-serv16852 Modified Files: .cvsignore ChangeLog Makefile.vanilla YUY2.c YUY2.h acconfig.h configure.in display.c dv.c dv_types.h idct_248.c idct_248.h parse.c quant.c quant.h Log Message: Integrate patch from Stefan Lucke for various speedups: - replace pointer dereferences with indexed accesses. - merge 248 idct prescale into 248 quant. - change IEC PAL (420) to generate YUY2 instead of YV12. Is this really faster? If so, I am surprised. I have added a configure option to allow choosing which version to build (YUY2 is default), so I can verify the performance comparison. - Fix to Makefile.vanilla to include util.c in build. - still to do: it looks like there are incompatible versions of popt around. Need to add a configure check to figure out whether filename needs to be declared as a const char *. Index: .cvsignore =================================================================== RCS file: /cvsroot/libdv/libdv/.cvsignore,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -r1.8 -r1.9 *** .cvsignore 2001/01/20 21:20:51 1.8 --- .cvsignore 2001/02/07 02:57:55 1.9 *************** *** 37,38 **** --- 37,39 ---- testbitstream testvlc + ppmqscale \ No newline at end of file Index: ChangeLog =================================================================== RCS file: /cvsroot/libdv/libdv/ChangeLog,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -r1.16 -r1.17 *** ChangeLog 2001/02/07 00:43:31 1.16 --- ChangeLog 2001/02/07 02:57:55 1.17 *************** *** 1,4 **** --- 1,22 ---- 2001-02-06 Charles 'Buck' Krasic <kr...@ac...> + * Integrate patch from Stefan Lucke for various speedups: + + - replace pointer dereferences with indexed accesses. + + - merge 248 idct prescale into 248 quant. + + - change IEC PAL (420) to generate YUY2 instead of YV12. + + Is this really faster? If so, I am suprised. 
I have added a + configure option to allow choosing which version to build + (YUY2 is default), so I can verify the performance comparison. + + - Fix to Makefile.vanilla to include util.c in build. + + - still to do: it looks like there are incompatable versions of + popt around. Need to add a configure check to figure out + whether filename needs to be declared as a const char *. + * Change license banner to read "a free DV codec" instead of "a free decoder". Index: Makefile.vanilla =================================================================== RCS file: /cvsroot/libdv/libdv/Makefile.vanilla,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -r1.4 -r1.5 *** Makefile.vanilla 2001/01/15 02:12:30 1.4 --- Makefile.vanilla 2001/02/07 02:57:55 1.5 *************** *** 29,33 **** asm = vlc_x86.S quant_x86.S idct_block_mmx.S dct_block_mmx.S rgbtoyuv.S ! gensources=dv.c dct.c idct_248.c weighting.c quant.c vlc.c place.c parse.c bitstream.c YUY2.c YV12.c rgb.c oss.c audio.c genobjects=$(gensources:.c=.o) $(asm:.S=.o) --- 29,33 ---- asm = vlc_x86.S quant_x86.S idct_block_mmx.S dct_block_mmx.S rgbtoyuv.S ! 
gensources=dv.c dct.c idct_248.c weighting.c quant.c vlc.c place.c parse.c bitstream.c YUY2.c YV12.c rgb.c oss.c audio.c util.c genobjects=$(gensources:.c=.o) $(asm:.S=.o) Index: YUY2.c =================================================================== RCS file: /cvsroot/libdv/libdv/YUY2.c,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -r1.7 -r1.8 *** YUY2.c 2001/02/07 00:43:31 1.7 --- YUY2.c 2001/02/07 02:57:55 1.8 *************** *** 174,177 **** --- 174,225 ---- } /* dv_mb411_right_YUY2 */ + /* ---------------------------------------------------------------------------- + */ + void + dv_mb420_YUY2 (dv_macroblock_t *mb, guchar **pixels, gint16 *pitches) { + dv_coeff_t *Y [4], *Ytmp0, *cr_frame, *cb_frame; + unsigned char *pyuv, + *pwyuv0, *pwyuv1, + cb, cr; + int i, j, col, row, inc_l2, inc_l4; + + pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]); + + Y [0] = mb->b[0].coeffs; + Y [1] = mb->b[1].coeffs; + Y [2] = mb->b[2].coeffs; + Y [3] = mb->b[3].coeffs; + cr_frame = mb->b[4].coeffs; + cb_frame = mb->b[5].coeffs; + inc_l2 = pitches[0]; + inc_l4 = pitches[0]*2; + + for (j = 0; j < 4; j += 2) { // Two rows of blocks j, j+1 + for (row = 0; row < 8; row+=2) { // 4 pairs of two rows + pwyuv0 = pyuv; + pwyuv1 = pyuv + inc_l2; + for (i = 0; i < 2; ++i) { // Two columns of blocks + Ytmp0 = Y[j + i]; + for (col = 0; col < 4; ++col) { // 4 spans of 2x2 pixels + cb = uvlut [*cb_frame++]; // +128; + cr = uvlut [*cr_frame++]; // +128 + + *pwyuv0++ = ylut [*Ytmp0++]; + *pwyuv0++ = cb; + *pwyuv0++ = ylut [*Ytmp0++]; + *pwyuv0++ = cr; + + *pwyuv1++ = ylut [*(Ytmp0 + 6)]; + *pwyuv1++ = cb; + *pwyuv1++ = ylut [*(Ytmp0 + 7)]; + *pwyuv1++ = cr; + } + Y[j + i] = Ytmp0 + 8; + } + pyuv += inc_l4; + } + } + } + #if ARCH_X86 *************** *** 349,352 **** --- 397,498 ---- emms(); } /* dv_mb411_right_YUY2_mmx */ + + /* ---------------------------------------------------------------------------- + */ + void + dv_mb420_YUY2_mmx (dv_macroblock_t *mb, guchar **pixels, 
gint16 *pitches) { + dv_coeff_t *Y [4], *Ytmp0, *cr_frame, *cb_frame; + unsigned char *pyuv, + *pwyuv0, *pwyuv1; + int i, j, row, inc_l2, inc_l4; + + pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]); + + Y [0] = mb->b[0].coeffs; + Y [1] = mb->b[1].coeffs; + Y [2] = mb->b[2].coeffs; + Y [3] = mb->b[3].coeffs; + cr_frame = mb->b[4].coeffs; + cb_frame = mb->b[5].coeffs; + inc_l2 = pitches[0]; + inc_l4 = pitches[0]*2; + + movq_m2r (mmx_0x7f94s, mm6); + movq_m2r (mmx_0x7f24s, mm5); + + for (j = 0; j < 4; j += 2) { // Two rows of blocks j, j+1 + for (row = 0; row < 8; row+=2) { // 4 pairs of two rows + pwyuv0 = pyuv; + pwyuv1 = pyuv + inc_l2; + for (i = 0; i < 2; ++i) { // Two columns of blocks + Ytmp0 = Y[j + i]; + + /* ------------------------------------------------------------------- + */ + movq_m2r (*cb_frame, mm2); + paddw_m2r (mmx_0x0080s, mm2); + + psllw_i2r (8, mm2); + movq_m2r (*cr_frame, mm3); + + paddw_m2r (mmx_0x0080s, mm3); + psllw_i2r (8, mm3); + + movq_r2r (mm2, mm4); + punpcklwd_r2r (mm3, mm4); + + /* ------------------------------------------------------------------- + */ + movq_m2r (Ytmp0[0], mm0); + paddsw_r2r (mm6, mm0); + psubusw_r2r (mm5, mm0); + paddw_m2r (mmx_0x0010s, mm0); + por_r2r (mm4, mm0); + + movq_m2r (Ytmp0[8], mm1); + paddsw_r2r (mm6, mm1); + + movq_r2m (mm0, pwyuv0[0]); + + psubusw_r2r (mm5, mm1); + paddw_m2r (mmx_0x0010s, mm1); + por_r2r (mm4, mm1); + + movq_r2m (mm1, pwyuv1[0]); + + movq_r2r (mm2, mm4); + + punpckhwd_r2r (mm3, mm4); + + movq_m2r (Ytmp0[4], mm0); + paddsw_r2r (mm6, mm0); + psubusw_r2r (mm5, mm0); + paddw_m2r (mmx_0x0010s, mm0); + por_r2r (mm4, mm0); + + movq_m2r (Ytmp0[12], mm1); + paddsw_r2r (mm6, mm1); + + movq_r2m (mm0, pwyuv0[8]); + + psubusw_r2r (mm5, mm1); + paddw_m2r (mmx_0x0010s, mm1); + por_r2r (mm4, mm1); + + movq_r2m (mm1, pwyuv1[8]); + + movq_r2r (mm2, mm4); + + pwyuv0 += 16; + pwyuv1 += 16; + cb_frame += 4; + cr_frame += 4; + Y[j + i] = Ytmp0 + 16; + } + pyuv += inc_l4; + } + } + emms (); + } #endif 
// ARCH_X86 Index: YUY2.h =================================================================== RCS file: /cvsroot/libdv/libdv/YUY2.h,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -r1.7 -r1.8 *** YUY2.h 2001/02/07 00:43:31 1.7 --- YUY2.h 2001/02/07 02:57:55 1.8 *************** *** 53,56 **** --- 53,57 ---- extern void dv_mb411_YUY2(dv_macroblock_t *mb, guchar *pixels, gint pitch); extern void dv_mb411_right_YUY2(dv_macroblock_t *mb, guchar *pixels, gint pitch); + extern void dv_mb420_YUY2(dv_macroblock_t *mb, guchar **pixels, gint16 *pitch); #if ARCH_X86 *************** *** 58,61 **** --- 59,63 ---- extern void dv_mb411_YUY2_mmx(dv_macroblock_t *mb, guchar *pixels, gint pitch); extern void dv_mb411_right_YUY2_mmx(dv_macroblock_t *mb, guchar *pixels, gint pitch); + extern void dv_mb420_YUY2_mmx(dv_macroblock_t *mb, guchar **pixels, gint16 *pitch); #endif // ARCH_X86 Index: acconfig.h =================================================================== RCS file: /cvsroot/libdv/libdv/acconfig.h,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -r1.3 -r1.4 *** acconfig.h 2001/01/19 01:49:57 1.3 --- acconfig.h 2001/02/07 02:57:55 1.4 *************** *** 39,41 **** --- 39,44 ---- #define DEBUG 0 + /* Define to decide which YUV format to use for 420 blocks in IEC PAL */ + #define YUV_420_USE_YV12 0 + #endif // DV_ACCONFIG_H Index: configure.in =================================================================== RCS file: /cvsroot/libdv/libdv/configure.in,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -r1.11 -r1.12 *** configure.in 2001/01/21 05:41:40 1.11 --- configure.in 2001/02/07 02:57:55 1.12 *************** *** 64,67 **** --- 64,82 ---- fi + AC_ARG_WITH(pal-yuv, + [ --with-pal-yuv=(YV12|YUY2) Select YUV format for IEC PAL (YUY2 by default) ], + [ + case "$with_pal_yuv" in + YV12) + AC_MSG_RESULT(YV12) + AC_DEFINE(YUV_420_USE_YV12) + ;; + *) + AC_MSG_RESULT(YUY2) + ;; + esac + ] + ) + AC_MSG_CHECKING(whether to enable debugging) 
AC_ARG_WITH(debug, *************** *** 84,92 **** ;; esac ! ], ! [ AC_MSG_RESULT(default=yes) ! AC_DEFINE(DEBUG) ! CFLAGS="$CFLAGS -g" ] ) AM_CONDITIONAL(HOST_X86, $arch_x86) --- 99,110 ---- ;; esac ! ], ! [ AC_MSG_RESULT(default=yes) ! AC_DEFINE(DEBUG) ! CFLAGS="$CFLAGS -g" ! ] ) + + AM_CONDITIONAL(HOST_X86, $arch_x86) Index: display.c =================================================================== RCS file: /cvsroot/libdv/libdv/display.c,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -r1.13 -r1.14 *** display.c 2001/02/07 00:43:31 1.13 --- display.c 2001/02/07 02:57:55 1.14 *************** *** 499,509 **** --- 499,520 ---- case e_dv_sample_411: case e_dv_sample_422: + #if ! YUV_420_USE_YV12 + case e_dv_sample_420: + #endif dv_dpy->format = DV_FOURCC_YUY2; + #if 0 dv_dpy->len = dv_dpy->width * dv_dpy->height * 2; + #else + /* don't spare with space. just allocate enough + */ + dv_dpy->len = 720 * 576 * 4; + #endif break; + #if YUV_420_USE_YV12 case e_dv_sample_420: dv_dpy->format = DV_FOURCC_YV12; dv_dpy->len = (dv_dpy->width * dv_dpy->height * 3) / 2; break; + #endif default: /* Not possible */ Index: dv.c =================================================================== RCS file: /cvsroot/libdv/libdv/dv.c,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -r1.16 -r1.17 *** dv.c 2001/02/07 00:43:31 1.16 --- dv.c 2001/02/07 02:57:55 1.17 *************** *** 46,49 **** --- 46,57 ---- #endif + #if YUV_420_USE_YV12 + #define DV_MB420_YUV(a,b,c) dv_mb420_YV12 (a,b,c) + #define DV_MB420_YUV_MMX(a,b,c) dv_mb420_YV12_mmx(a,b,c) + #else + #define DV_MB420_YUV(a,b,c) dv_mb420_YUY2 (a,b,c) + #define DV_MB420_YUV_MMX(a,b,c) dv_mb420_YV12_mmx(a,b,c) + #endif + #if HAVE_LIBPOPT static void *************** *** 60,83 **** #endif // HAVE_LIBPOPT - static void - convert_coeffs(dv_block_t *bl) { - int i; - for(i=0; - i<64; - i++) { - bl->coeffs248[i] = bl->coeffs[i]; - } // for - } // convert_coeffs - - static void - convert_coeffs_prime(dv_block_t 
*bl) { - int i; - for(i=0; - i<64; - i++) { - bl->coeffs[i] = bl->coeffs248[i]; - } // for - } // convert_coeffs_prime - dv_decoder_t * dv_decoder_new(void) { --- 68,71 ---- *************** *** 142,146 **** dv->use_mmx = mmx_ok(); #endif ! weight_init(); dct_init(); dv_dct_248_init(); --- 130,134 ---- dv->use_mmx = mmx_ok(); #endif ! weight_init(); dct_init(); dv_dct_248_init(); *************** *** 148,151 **** --- 136,140 ---- dv_parse_init(); dv_place_init(); + dv_quant_init (dv); dv_rgb_init(); dv_YUY2_init(); *************** *** 155,178 **** static inline void dv_decode_macroblock(dv_decoder_t *dv, dv_macroblock_t *mb, guint quality) { ! dv_block_t *bl; ! gint b; ! for (b=0,bl = mb->b; ! b<((quality & DV_QUALITY_COLOR) ? 6 : 4); ! b++,bl++) { ! if (bl->dct_mode == DV_DCT_248) { ! quant_248_inverse(bl->coeffs,mb->qno,bl->class_no); ! weight_248_inverse(bl->coeffs); ! convert_coeffs(bl); ! dv_idct_248(bl->coeffs248); ! convert_coeffs_prime(bl); } else { #if ARCH_X86 ! quant_88_inverse_x86(bl->coeffs,mb->qno,bl->class_no); ! weight_88_inverse(bl->coeffs); ! idct_88(bl->coeffs); #else // ARCH_X86 ! quant_88_inverse(bl->coeffs,mb->qno,bl->class_no); ! weight_88_inverse(bl->coeffs); ! idct_88(bl->coeffs); #endif // ARCH_X86 } // else --- 144,164 ---- static inline void dv_decode_macroblock(dv_decoder_t *dv, dv_macroblock_t *mb, guint quality) { ! gint i; ! for (i=0; ! i<((quality & DV_QUALITY_COLOR) ? 6 : 4); ! i++) { ! if (mb->b[i].dct_mode == DV_DCT_248) { ! dv_248_coeff_t co248[64]; ! ! quant_248_inverse (mb->b[i].coeffs, mb->qno, mb->b[i].class_no, co248); ! dv_idct_248 (co248, mb->b[i].coeffs); } else { #if ARCH_X86 ! quant_88_inverse_x86(mb->b[i].coeffs,mb->qno,mb->b[i].class_no); ! idct_88(mb->b[i].coeffs); #else // ARCH_X86 ! quant_88_inverse(mb->b[i].coeffs,mb->qno,mb->b[i].class_no); ! weight_88_inverse(mb->b[i].coeffs); ! idct_88(mb->b[i].coeffs); #endif // ARCH_X86 } // else *************** *** 191,200 **** b<((quality & DV_QUALITY_COLOR) ? 
6 : 4); b++,bl++) { ! if (bl->dct_mode == DV_DCT_248) { ! quant_248_inverse(bl->coeffs,mb->qno,bl->class_no); ! weight_248_inverse(bl->coeffs); ! convert_coeffs(bl); ! dv_idct_248(bl->coeffs248); ! convert_coeffs_prime(bl); } else { #if ARCH_X86 --- 177,185 ---- b<((quality & DV_QUALITY_COLOR) ? 6 : 4); b++,bl++) { ! if (bl->dct_mode == DV_DCT_248) { ! dv_248_coeff_t co248[64]; ! ! quant_248_inverse (mb->b[b].coeffs, mb->qno, mb->b[b].class_no, co248); ! dv_idct_248 (co248, mb->b[b].coeffs); } else { #if ARCH_X86 *************** *** 288,292 **** } // else } else { ! dv_mb420_YV12_mmx(mb, pixels, pitches); } // else } else { --- 273,277 ---- } // else } else { ! DV_MB420_YUV_MMX(mb, pixels, pitches); } // else } else { *************** *** 298,302 **** } // else } else { ! dv_mb420_YV12(mb, pixels, pitches); } // else } // else --- 283,287 ---- } // else } else { ! DV_MB420_YUV(mb, pixels, pitches); } // else } // else *************** *** 318,322 **** } // else } else { ! dv_mb420_YV12_mmx(mb, pixels, pitches); } // else } else { --- 303,307 ---- } // else } else { ! DV_MB420_YUV_MMX(mb, pixels, pitches); } // else } else { *************** *** 328,332 **** } // else } else { ! dv_mb420_YV12(mb, pixels, pitches); } // else } // else --- 313,317 ---- } // else } else { ! DV_MB420_YUV(mb, pixels, pitches); } // else } // else *************** *** 345,349 **** } // else } else { ! dv_mb420_YV12(mb, pixels, pitches); } // else } /* dv_render_macroblock_yuv */ --- 330,334 ---- } // else } else { ! DV_MB420_YUV(mb, pixels, pitches); } // else } /* dv_render_macroblock_yuv */ *************** *** 363,367 **** } // else } else { ! dv_mb420_YV12(mb, pixels, pitches); } // else } // for --- 348,352 ---- } // else } else { ! 
DV_MB420_YUV(mb, pixels, pitches); } // else } // for Index: dv_types.h =================================================================== RCS file: /cvsroot/libdv/libdv/dv_types.h,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -r1.5 -r1.6 *** dv_types.h 2001/02/07 00:43:31 1.5 --- dv_types.h 2001/02/07 02:57:55 1.6 *************** *** 157,162 **** typedef struct { dv_coeff_t coeffs[64] __attribute__ ((aligned (8))); ! dv_248_coeff_t coeffs248[64]; ! gint dct_mode; gint class_no; gint8 *reorder; --- 157,161 ---- typedef struct { dv_coeff_t coeffs[64] __attribute__ ((aligned (8))); ! gint dct_mode; gint class_no; gint8 *reorder; Index: idct_248.c =================================================================== RCS file: /cvsroot/libdv/libdv/idct_248.c,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -r1.5 -r1.6 *** idct_248.c 2001/02/07 00:43:31 1.5 --- idct_248.c 2001/02/07 02:57:55 1.6 *************** *** 48,52 **** #define IDCT_248_UNIT_TEST 0 ! static dv_248_coeff_t dv_idct_248_prescale[64]; /* --- 48,52 ---- #define IDCT_248_UNIT_TEST 0 ! dv_248_coeff_t dv_idct_248_prescale[64]; /* *************** *** 150,171 **** #define DIV_FOUR(A) ((A) / 4) ! void dv_idct_248(dv_248_coeff_t *x248) { dv_248_coeff_t tmp[64]; ! dv_248_coeff_t *in, *out; dv_248_coeff_t u,v,w,z; dv_248_coeff_t in0, in1, in2, in3, in4, in5, in6, in7; gint i; ! ! #if ! IDCT_248_UNIT_TEST ! /* prescale - 64 mults */ ! for(i=0; i<64; i++) { ! x248[i] *= dv_idct_248_prescale[i]; ! } // for ! #endif // ! IDCT_248_UNIT_TEST // Now, tmp = inv(h2) * inv(g2) * (prescale = inv(d2) * x248 * d) // 32 mults, 64 adds, 80 shifts, 16 negates in = x248; ! out = tmp; #if IDCT_248_UNIT_TEST printf("\nt0:\n"); --- 150,165 ---- #define DIV_FOUR(A) ((A) / 4) ! void dv_idct_248(dv_248_coeff_t *x248, dv_coeff_t *out) { dv_248_coeff_t tmp[64]; ! dv_248_coeff_t *in, *lhs; dv_248_coeff_t u,v,w,z; dv_248_coeff_t in0, in1, in2, in3, in4, in5, in6, in7; gint i; ! 
// Now, tmp = inv(h2) * inv(g2) * (prescale = inv(d2) * x248 * d) // 32 mults, 64 adds, 80 shifts, 16 negates in = x248; ! lhs = tmp; #if IDCT_248_UNIT_TEST printf("\nt0:\n"); *************** *** 180,206 **** w = in[1*8+i]; z = in[3*8+i]; ! out[0*8+i] = DIV_FOUR(u) + DIV_TWO(v); ! out[1*8+i] = DIV_FOUR(u) - DIV_TWO(v); ! out[2*8+i] = fixed_multiply(w,beta0) + fixed_multiply(z,beta1); ! out[3*8+i] = -(DIV_TWO(w+z)); u = in[4*8+i]; v = in[6*8+i]; w = in[5*8+i]; z = in[7*8+i]; ! out[4*8+i] = DIV_FOUR(u) + DIV_TWO(v); ! out[5*8+i] = DIV_FOUR(u) - DIV_TWO(v); ! out[6*8+i] = fixed_multiply(w,beta0) + fixed_multiply(z,beta1); ! out[7*8+i] = -(DIV_TWO(w+z)); } // for #if IDCT_248_UNIT_TEST printf("\nt1:\n"); for(i=0;i<64; i++) { ! printf("%d ", (out[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST in = tmp; ! out = x248; ! // Do out = inv(f) * inv(L2) * in (butterfly) // 192 adds, 64 shifts for(i=0; i<8; i++) { --- 174,200 ---- w = in[1*8+i]; z = in[3*8+i]; ! lhs[0*8+i] = DIV_FOUR(u) + DIV_TWO(v); ! lhs[1*8+i] = DIV_FOUR(u) - DIV_TWO(v); ! lhs[2*8+i] = fixed_multiply(w,beta0) + fixed_multiply(z,beta1); ! lhs[3*8+i] = -(DIV_TWO(w+z)); u = in[4*8+i]; v = in[6*8+i]; w = in[5*8+i]; z = in[7*8+i]; ! lhs[4*8+i] = DIV_FOUR(u) + DIV_TWO(v); ! lhs[5*8+i] = DIV_FOUR(u) - DIV_TWO(v); ! lhs[6*8+i] = fixed_multiply(w,beta0) + fixed_multiply(z,beta1); ! lhs[7*8+i] = -(DIV_TWO(w+z)); } // for #if IDCT_248_UNIT_TEST printf("\nt1:\n"); for(i=0;i<64; i++) { ! printf("%d ", (lhs[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST in = tmp; ! lhs = x248; ! // Do lhs = inv(f) * inv(L2) * in (butterfly) // 192 adds, 64 shifts for(i=0; i<8; i++) { *************** *** 209,262 **** w = in[8*4+i]; z = in[8*7+i]; ! out[8*0+i] = DIV_FOUR(u - v + w - z); ! out[8*1+i] = DIV_FOUR(u - v - w + z); ! out[8*6+i] = DIV_FOUR(u + v + w + z); ! 
out[8*7+i] = DIV_FOUR(u + v - w - z); u = in[8*1+i]; v = in[8*2+i]; w = in[8*5+i]; z = in[8*6+i]; ! out[i+8*2] = DIV_FOUR(u + v + w + z); ! out[i+8*3] = DIV_FOUR(u + v - w - z); ! out[i+8*4] = DIV_FOUR(u - v + w - z); ! out[i+8*5] = DIV_FOUR(u - v - w + z); } // for #if IDCT_248_UNIT_TEST printf("\nt2:\n"); for(i=0;i<64; i++) { ! printf("%d ", (out[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST in = x248; ! out = tmp; ! // Do out = in * p * b1 * b2 * m // 48 mults, 48 adds for(i=0; i<8; i++) { ! out[i*8+0] = in[i*8+0]; ! out[i*8+1] = in[i*8+4]; u = in[i*8+2]; v = in[i*8+6]; ! out[i*8+2] = fixed_multiply(u - v,beta2); ! out[i*8+3] = u + v; u = in[i*8+1]; v = in[i*8+3]; w = in[i*8+5]; z = in[i*8+7]; ! out[i*8+4] = fixed_multiply(u - z,beta3) + fixed_multiply(v - w,beta4); ! out[i*8+5] = fixed_multiply(u - v - w + z,beta2); ! out[i*8+6] = fixed_multiply(u - z,beta4) + fixed_multiply(w - v,beta3); ! out[i*8+7] = u + v + w + z; } // for #if IDCT_248_UNIT_TEST printf("\nt3:\n"); for(i=0;i<64; i++) { ! printf("%d ", (out[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST ! in = out; ! out = x248; ! // Do out = in * a1 * a2 * a3 (butterflys...) // 272 adds (will gcc factor some of these out?) for(i=0; i<8; i++) { --- 203,256 ---- w = in[8*4+i]; z = in[8*7+i]; ! lhs[8*0+i] = DIV_FOUR(u - v + w - z); ! lhs[8*1+i] = DIV_FOUR(u - v - w + z); ! lhs[8*6+i] = DIV_FOUR(u + v + w + z); ! lhs[8*7+i] = DIV_FOUR(u + v - w - z); u = in[8*1+i]; v = in[8*2+i]; w = in[8*5+i]; z = in[8*6+i]; ! lhs[i+8*2] = DIV_FOUR(u + v + w + z); ! lhs[i+8*3] = DIV_FOUR(u + v - w - z); ! lhs[i+8*4] = DIV_FOUR(u - v + w - z); ! lhs[i+8*5] = DIV_FOUR(u - v - w + z); } // for #if IDCT_248_UNIT_TEST printf("\nt2:\n"); for(i=0;i<64; i++) { ! printf("%d ", (lhs[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST in = x248; ! lhs = tmp; ! 
// Do lhs = in * p * b1 * b2 * m // 48 mults, 48 adds for(i=0; i<8; i++) { ! lhs[i*8+0] = in[i*8+0]; ! lhs[i*8+1] = in[i*8+4]; u = in[i*8+2]; v = in[i*8+6]; ! lhs[i*8+2] = fixed_multiply(u - v,beta2); ! lhs[i*8+3] = u + v; u = in[i*8+1]; v = in[i*8+3]; w = in[i*8+5]; z = in[i*8+7]; ! lhs[i*8+4] = fixed_multiply(u - z,beta3) + fixed_multiply(v - w,beta4); ! lhs[i*8+5] = fixed_multiply(u - v - w + z,beta2); ! lhs[i*8+6] = fixed_multiply(u - z,beta4) + fixed_multiply(w - v,beta3); ! lhs[i*8+7] = u + v + w + z; } // for #if IDCT_248_UNIT_TEST printf("\nt3:\n"); for(i=0;i<64; i++) { ! printf("%d ", (lhs[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST ! in = lhs; ! lhs = x248; ! // Do lhs = in * a1 * a2 * a3 (butterflys...) // 272 adds (will gcc factor some of these out?) for(i=0; i<8; i++) { *************** *** 269,290 **** in6 = in[i*8+6]; in7 = in[i*8+7]; ! out[i*8+0] = in0 + in1 + in2 + in3 + in6 + in7; ! out[i*8+1] = in0 - in1 + in2 + in5 + in6; ! out[i*8+2] = in0 - in1 - in2 - in4 + in5; ! out[i*8+3] = in0 + in1 - in2 - in3 - in4; ! out[i*8+4] = in0 + in1 - in2 - in3 + in4; ! out[i*8+5] = in0 - in1 - in2 + in4 - in5; ! out[i*8+6] = in0 - in1 + in2 - in5 - in6; ! out[i*8+7] = in0 + in1 + in2 + in3 - in6 - in7; } // for #if IDCT_248_UNIT_TEST printf("\nout:\n"); for(i=0;i<64; i++) { ! printf("%d ", (out[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST ! for(i=0; i<64; i++) ! out[i] = (out[i] + 0x2000) >> 14; } // dv_idct_248 --- 263,284 ---- in6 = in[i*8+6]; in7 = in[i*8+7]; ! lhs[i*8+0] = in0 + in1 + in2 + in3 + in6 + in7; ! lhs[i*8+1] = in0 - in1 + in2 + in5 + in6; ! lhs[i*8+2] = in0 - in1 - in2 - in4 + in5; ! lhs[i*8+3] = in0 + in1 - in2 - in3 - in4; ! lhs[i*8+4] = in0 + in1 - in2 - in3 + in4; ! lhs[i*8+5] = in0 - in1 - in2 + in4 - in5; ! lhs[i*8+6] = in0 - in1 + in2 - in5 - in6; ! 
lhs[i*8+7] = in0 + in1 + in2 + in3 - in6 - in7; } // for #if IDCT_248_UNIT_TEST printf("\nout:\n"); for(i=0;i<64; i++) { ! printf("%d ", (lhs[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST ! for(i=0; i<64; i++) ! out [i] = (lhs[i] + 0x2000) >> 14; } // dv_idct_248 Index: idct_248.h =================================================================== RCS file: /cvsroot/libdv/libdv/idct_248.h,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -r1.5 -r1.6 *** idct_248.h 2001/02/07 00:43:31 1.5 --- idct_248.h 2001/02/07 02:57:55 1.6 *************** *** 34,39 **** #endif extern void dv_dct_248_init(void); ! extern void dv_idct_248(dv_248_coeff_t *x248); #ifdef __cplusplus --- 34,41 ---- #endif + extern dv_248_coeff_t dv_idct_248_prescale[64]; + extern void dv_dct_248_init(void); ! extern void dv_idct_248(dv_248_coeff_t *x248,dv_coeff_t *out); #ifdef __cplusplus Index: parse.c =================================================================== RCS file: /cvsroot/libdv/libdv/parse.c,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -r1.18 -r1.19 *** parse.c 2001/02/07 00:43:31 1.18 --- parse.c 2001/02/07 02:57:55 1.19 *************** *** 167,184 **** // Scan the blocks of a macroblock. We're looking to find the next // block from which unused space was borrowed ! static gboolean dv_find_mb_unused_bits(dv_macroblock_t *mb, dv_block_t **lender) { ! dv_block_t *bl; gint b; ! for(b=0,bl=mb->b; ! b<6; ! b++,bl++) { ! if((bl->eob) && /* an incomplete block can only "borrow" bits * from other blocks that are themselves * already completely decoded */ ! (bl->end > bl->offset) && // the lender must have unused bits ! (!bl->mark)) { // the lender musn't already be lending... ! bl->mark = TRUE; ! *lender = bl; return(TRUE); } // if --- 167,182 ---- // Scan the blocks of a macroblock. We're looking to find the next // block from which unused space was borrowed ! static inline ! 
gboolean dv_find_mb_unused_bits(dv_macroblock_t *mb, dv_block_t **lender) { gint b; ! for(b=0; b<6; b++) { ! if((mb->b[b].eob) && /* an incomplete block can only "borrow" bits * from other blocks that are themselves * already completely decoded */ ! (mb->b[b].end > mb->b[b].offset) && // the lender must have unused bits ! (!mb->b[b].mark)) { // the lender musn't already be lending... ! mb->b[b].mark = TRUE; ! *lender = &mb->b[b]; return(TRUE); } // if Index: quant.c =================================================================== RCS file: /cvsroot/libdv/libdv/quant.c,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -r1.10 -r1.11 *** quant.c 2001/02/07 00:43:31 1.10 --- quant.c 2001/02/07 02:57:55 1.11 *************** *** 31,38 **** #if ARCH_X86 ! #include "mmx.h" #endif #include "quant.h" static guint8 dv_88_areas[64] = { --- 31,39 ---- #if ARCH_X86 ! #include <mmx.h> #endif #include "quant.h" + #include "idct_248.h" static guint8 dv_88_areas[64] = { *************** *** 126,133 **** }; ! guint8 dv_quant_offset[4] = { 6,3,0,1 }; ! extern void quant_x86(dv_coeff_t *block,int qno,int class); void quant(dv_coeff_t *block,int qno,int class) { --- 127,162 ---- }; ! guint8 dv_quant_offset[4] = { 6,3,0,1 }; ! guint32 dv_quant_248_mul_tab [2] [22] [64]; ! guint32 dv_quant_88_mul_tab [2] [22] [64]; ! ! extern void quant_x86(dv_coeff_t *block,int qno,int class); ! extern void quant_248_inverse_std(dv_coeff_t *block,int qno,int class,dv_248_coeff_t *co); ! extern void quant_248_inverse_mmx(dv_coeff_t *block,int qno,int class,dv_248_coeff_t *co); ! 
void (*quant_248_inverse) (dv_coeff_t *block,int qno,int class,dv_248_coeff_t *co); + void + dv_quant_init (dv_decoder_t *dv) + { + int ex, qno, i; + + for (ex = 0; ex < 2; ++ex) { + for (qno = 0; qno < 22; ++qno) { + for (i = 0; i < 64; ++i) { + dv_quant_248_mul_tab [ex] [qno] [i] = + (1 << (dv_quant_shifts [qno] [dv_248_areas [i]] + ex)) * dv_idct_248_prescale[i]; + } + dv_quant_248_mul_tab [ex] [qno] [0] = dv_idct_248_prescale[0]; + } + } + quant_248_inverse = quant_248_inverse_std; + #if ARCH_X86 + if (dv->use_mmx) { + quant_248_inverse = quant_248_inverse_mmx; + } + #endif + } + void quant(dv_coeff_t *block,int qno,int class) { *************** *** 173,177 **** } ! void quant_248_inverse(dv_coeff_t *block,int qno,int class) { int i; guint8 *pq; /* pointer to the four quantization --- 202,207 ---- } ! void ! quant_248_inverse_std(dv_coeff_t *block,int qno,int class,dv_248_coeff_t *co) { int i; guint8 *pq; /* pointer to the four quantization *************** *** 182,186 **** one more place */ pq = dv_quant_shifts[qno + dv_quant_offset[class]]; for (i = 1; i < 64; i++) ! block[i] <<= (pq[dv_248_areas[i]] + extra); } --- 212,228 ---- one more place */ pq = dv_quant_shifts[qno + dv_quant_offset[class]]; + co [0] = block [0] * dv_idct_248_prescale[0]; for (i = 1; i < 64; i++) ! co [i] = (block[i] << (pq[dv_248_areas[i]] + extra)) * dv_idct_248_prescale[i]; ! } ! ! void ! quant_248_inverse_mmx(dv_coeff_t *block,int qno,int class,dv_248_coeff_t *co) { ! int i; ! guint32 *pm; ! ! pm = dv_quant_248_mul_tab [class == 3] [qno + dv_quant_offset[class]]; ! for (i = 0; i < 64; i++) { ! co [i] = block [i] * pm [i]; ! 
} } Index: quant.h =================================================================== RCS file: /cvsroot/libdv/libdv/quant.h,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -r1.7 -r1.8 *** quant.h 2001/02/07 00:43:31 1.7 --- quant.h 2001/02/07 02:57:55 1.8 *************** *** 17,23 **** extern void quant(dv_coeff_t *block,int qno,int class); extern void quant_88_inverse(dv_coeff_t *block,int qno,int class); ! extern void quant_248_inverse(dv_coeff_t *block,int qno,int class); extern void quant_88_inverse_x86(dv_coeff_t *block,int qno,int class); ! #ifdef __cplusplus } --- 17,24 ---- extern void quant(dv_coeff_t *block,int qno,int class); extern void quant_88_inverse(dv_coeff_t *block,int qno,int class); ! extern void (*quant_248_inverse) (dv_coeff_t *block,int qno,int class, ! dv_248_coeff_t *co); extern void quant_88_inverse_x86(dv_coeff_t *block,int qno,int class); ! extern void dv_quant_init (dv_decoder_t *dv); #ifdef __cplusplus } |