[Mplayerxp-cvslog] SF.net SVN: mplayerxp:[107] mplayerxp

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 107
          http://mplayerxp.svn.sourceforge.net/mplayerxp/?rev=107&view=rev
Author:   nickols_k
Date:     2010-01-18 15:55:23 +0000 (Mon, 18 Jan 2010)

Log Message:
-----------
fastmemcpy-related improvements

Modified Paths:
--------------
    DOCS/mplayerxp.1
    mplayerxp/cfg-mplayer.h
    mplayerxp/dec_ahead.h
    mplayerxp/libmpcodecs/ad_mp3.c
    mplayerxp/libmpcodecs/dec_video.c
    mplayerxp/libmpcodecs/vd_ffmpeg.c
    mplayerxp/libvo/aclib.c
    mplayerxp/libvo/aclib_template.c
    mplayerxp/libvo/dri_vo.h
    mplayerxp/libvo/fastmemcpy.h
    mplayerxp/libvo/osd.c
    mplayerxp/libvo/osd.h
    mplayerxp/libvo/osd_template.c
    mplayerxp/libvo/video_out.c
    mplayerxp/libvo/video_out.h
    mplayerxp/libvo/vo_x11.c
    mplayerxp/libvo/vosub_vidix.c
    mplayerxp/mp_image.h
    mplayerxp/postproc/af_export.c
    mplayerxp/postproc/af_scaletempo.c
    mplayerxp/postproc/libmenu/menu.c
    mplayerxp/postproc/vf_delogo.c
    mplayerxp/postproc/vf_dint.c
    mplayerxp/postproc/vf_down3dright.c
    mplayerxp/postproc/vf_expand.c
    mplayerxp/postproc/vf_il.c
    mplayerxp/postproc/vf_mirror.c
    mplayerxp/postproc/vf_noise.c
    mplayerxp/postproc/vf_panscan.c
    mplayerxp/postproc/vf_rectangle.c
    mplayerxp/postproc/vf_scale.c
    mplayerxp/postproc/vf_softpulldown.c
    mplayerxp/postproc/vf_unsharp.c
    mplayerxp/postproc/vf_vo.c
    mplayerxp/postproc/vf_yvu9.c

Removed Paths:
-------------
    mplayerxp/libvo/aclib_x86_64.h

Modified: DOCS/mplayerxp.1
===================================================================

--- DOCS/mplayerxp.1	2010-01-17 18:46:44 UTC (rev 106)
+++ DOCS/mplayerxp.1	2010-01-18 15:55:23 UTC (rev 107)
@@ -210,7 +210,7 @@
 .TP
 modprobe dhahelper
 .TP
-mplayerxp -vo xvidix -core.xp -video.bm -video.fs -video.zoom videoout.avi
+mplayerxp -vo xvidix -core.xp -video.bm -video.fs -video.aspect-ratio videoout.avi
 .SS
 .I Another ways to speedup playback:
 In general, there are two ways to increase performance of playback -
@@ -666,13 +666,13 @@
 
     -video.monitorpixelaspect=4:3  or 1.3333
 .TP
-.B \-video.x\ <x>
-scale image to x width (if driver supports)
+.B \-video.width\ <x>
+scale image to width (if driver supports)
 .TP
-.B \-video.y\ <y>
-scale image to y height (if driver supports)
+.B \-video.height\ <y>
+scale image to height (if driver supports)
 .TP
-.B \-video.xy\ <factor>
+.B \-video.zoom\ <factor>
 scale image by <factor>
 .TP
 
@@ -717,7 +717,7 @@
 try to change to a different video mode. dga2, x11 (XF86VidMode) and sdl
 output drivers support it.
 .TP
-.B \-video.zoom
+.B \-video.aspect-ratio
 Keeps aspect ratio on the screen
 .I [default]
 .TP
@@ -809,7 +809,7 @@
 .TP
 .B \-video.sws\ <software\ scaler\ type>
 this option sets the quality (and speed, respectively) of the software scaler,
-with the -zoom option. For example with x11 or other outputs which lack
+with the -video.aspect-ratio option. For example with x11 or other outputs which lack
 hardware scaler. Possible settings are:
 
     0 - fast bilinear (default)

Modified: mplayerxp/cfg-mplayer.h
===================================================================
--- mplayerxp/cfg-mplayer.h	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/cfg-mplayer.h	2010-01-18 15:55:23 UTC (rev 107)
@@ -132,12 +132,12 @@
  */
 
 static const config_t xpcore_config[]={
-        {"xp", &enable_xp, CONF_TYPE_INT, CONF_RANGE, 0, 4, NULL, "starts MPlayerXP in multi-thread and multi-buffer XP mode"},
-        {"noxp", &enable_xp, CONF_TYPE_FLAG, 0, 1, 0, NULL, "starts MPlayerXP in single-thread mode"},
+	{"xp", &enable_xp, CONF_TYPE_INT, CONF_RANGE, 0, 4, NULL, "starts MPlayerXP in multi-thread and multi-buffer XP mode"},
+	{"noxp", &enable_xp, CONF_TYPE_FLAG, 0, 1, 0, NULL, "starts MPlayerXP in single-thread mode"},
 	{"dump", &stream_dump, CONF_TYPE_STRING, 0, 0, 0, NULL, "specifies dump type and name for the dump of stream"},
-        {"gomp", &enable_gomp, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables usage of OpenMP extensions"},
-        {"nogomp", &enable_gomp, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables usage of OpenMP extensions"},
-        {"da_buffs", &vo_da_buffs, CONF_TYPE_INT, CONF_RANGE, 4, 1024, NULL, "specifies number of buffers for decoding-ahead in XP mode"},
+	{"gomp", &enable_gomp, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables usage of OpenMP extensions"},
+	{"nogomp", &enable_gomp, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables usage of OpenMP extensions"},
+	{"da_buffs", &vo_da_buffs, CONF_TYPE_INT, CONF_RANGE, 4, 1024, NULL, "specifies number of buffers for decoding-ahead in XP mode"},
 	{"double", &vo_doublebuffering, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables double-buffering for single-thread decoding"},
 	{"nodouble", &vo_doublebuffering, CONF_TYPE_FLAG, 0, 1, 0, NULL, "enables single-buffer for single-thread decoding"},
 	{"cache", &stream_cache_size, CONF_TYPE_INT, CONF_RANGE, 4, 65536, NULL,"specifies amount of memory for precaching a file/URL"},
@@ -168,32 +168,32 @@
 
 #if defined( ARCH_X86 ) || defined(ARCH_X86_64)
 static const config_t cpu_config[]={
-        {"mmx", &x86_mmx, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of MMX extensions of CPU"},
-        {"nommx", &x86_mmx, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of MMX extensions of CPU"},
-        {"mmx2", &x86_mmx2, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of MMX2 extensions of CPU"},
-        {"nommx2", &x86_mmx2, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of MMX2 extensions of CPU"},
-        {"3dnow", &x86_3dnow, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of 3DNow! extensions of CPU"},
-        {"no3dnow", &x86_3dnow, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of 3DNow! extensions of CPU"},
-        {"3dnow2", &x86_3dnow2, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of 3DNow-2! extensions of CPU"},
-        {"no3dnow2", &x86_3dnow2, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of 3DNow-2! extensions of CPU"},
-        {"sse", &x86_sse, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE extensions of CPU"},
-        {"nosse", &x86_sse, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE extensions of CPU"},
-        {"sse2", &x86_sse2, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE2 extensions of CPU"},
-        {"nosse2", &x86_sse2, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE2 extensions of CPU"},
-        {"sse3", &x86_sse3, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE3 extensions of CPU"},
-        {"nosse3", &x86_sse3, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE3 extensions of CPU"},
-        {"ssse3", &x86_ssse3, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSSE3 extensions of CPU"},
-        {"nossse3", &x86_ssse3, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSSE3 extensions of CPU"},
-        {"sse41", &x86_sse41, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE41 extensions of CPU"},
-        {"nosse41", &x86_sse41, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE41 extensions of CPU"},
-        {"sse42", &x86_sse42, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE42 extensions of CPU"},
-        {"nosse42", &x86_sse42, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE42 extensions of CPU"},
-        {"aes", &x86_aes, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of AES extensions of CPU"},
-        {"noaes", &x86_aes, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of AES extensions of CPU"},
-        {"avx", &x86_avx, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of AVX extensions of CPU"},
-        {"noavx", &x86_avx, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of AVX extensions of CPU"},
-        {"fma", &x86_fma, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of FMA extensions of CPU"},
-        {"nofma", &x86_fma, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of FMA extensions of CPU"},
+	{"mmx", &x86_mmx, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of MMX extensions of CPU"},
+	{"nommx", &x86_mmx, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of MMX extensions of CPU"},
+	{"mmx2", &x86_mmx2, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of MMX2 extensions of CPU"},
+	{"nommx2", &x86_mmx2, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of MMX2 extensions of CPU"},
+	{"3dnow", &x86_3dnow, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of 3DNow! extensions of CPU"},
+	{"no3dnow", &x86_3dnow, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of 3DNow! extensions of CPU"},
+	{"3dnow2", &x86_3dnow2, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of 3DNow-2! extensions of CPU"},
+	{"no3dnow2", &x86_3dnow2, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of 3DNow-2! extensions of CPU"},
+	{"sse", &x86_sse, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE extensions of CPU"},
+	{"nosse", &x86_sse, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE extensions of CPU"},
+	{"sse2", &x86_sse2, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE2 extensions of CPU"},
+	{"nosse2", &x86_sse2, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE2 extensions of CPU"},
+	{"sse3", &x86_sse3, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE3 extensions of CPU"},
+	{"nosse3", &x86_sse3, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE3 extensions of CPU"},
+	{"ssse3", &x86_ssse3, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSSE3 extensions of CPU"},
+	{"nossse3", &x86_ssse3, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSSE3 extensions of CPU"},
+	{"sse41", &x86_sse41, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE41 extensions of CPU"},
+	{"nosse41", &x86_sse41, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE41 extensions of CPU"},
+	{"sse42", &x86_sse42, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of SSE42 extensions of CPU"},
+	{"nosse42", &x86_sse42, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of SSE42 extensions of CPU"},
+	{"aes", &x86_aes, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of AES extensions of CPU"},
+	{"noaes", &x86_aes, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of AES extensions of CPU"},
+	{"avx", &x86_avx, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of AVX extensions of CPU"},
+	{"noavx", &x86_avx, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of AVX extensions of CPU"},
+	{"fma", &x86_fma, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of FMA extensions of CPU"},
+	{"nofma", &x86_fma, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of FMA extensions of CPU"},
 	{NULL, NULL, 0, 0, 0, 0, NULL,NULL},
 };
 #endif
@@ -227,7 +227,6 @@
 	{"framedrop", &frame_dropping, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables frame-dropping on slow systems: decodes all video frames, but skips displaying some ones"},
 /*UD*/	{"hardframedrop", &frame_dropping, CONF_TYPE_FLAG, 0, 0, 2, NULL, "enables hard frame-dropping on slow systems: skips displaying and decoding of some frames"},
 	{"noframedrop", &frame_dropping, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables frame dropping"},
-	
 	{"pts", &av_sync_pts, CONF_TYPE_FLAG, 0, 0, 1, NULL, "use PTS-based method of A/V synchronization"},
 	{"nopts", &av_sync_pts, CONF_TYPE_FLAG, 0, 1, 0, NULL, "use BPS-based method of A/V synchronization"},
 	{"dap", &dapsync, CONF_TYPE_FLAG, 0, 0, 1, NULL, "use alternative method of A/V synchronization"},
@@ -266,7 +265,7 @@
 	{"nounicode", &sub_unicode, CONF_TYPE_FLAG, 0, 1, 0, NULL, "tells MPlayerXP to handle the subtitle file as non-UNICODE"},
 	{"utf8", &sub_utf8, CONF_TYPE_FLAG, 0, 0, 1, NULL, "tells MPlayerXP to handle the subtitle file as UTF8"},
 	{"noutf8", &sub_utf8, CONF_TYPE_FLAG, 0, 1, 0, NULL, "tells MPlayerXP to handle the subtitle file as non-UTF8"},
- 	{"pos",&sub_pos,  CONF_TYPE_INT, CONF_RANGE, 0, 100, NULL, "specifies vertical shift of subtitles"},
+	{"pos",&sub_pos,  CONF_TYPE_INT, CONF_RANGE, 0, 100, NULL, "specifies vertical shift of subtitles"},
 #endif
 	{"cc", &subcc_enabled, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enable DVD Closed Caption (CC) subtitles"},
 	{"nocc", &subcc_enabled, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disable DVD Closed Caption (CC) subtitles"},
@@ -291,7 +290,7 @@
 static const config_t audio_config[]={
 	{"on", &has_audio, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables audio-steam playback"},
 	{"off", &has_audio, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables audio-stream playback"},
-        {"mixer", &oss_mixer_device, CONF_TYPE_STRING, 0, 0, 0, NULL, "select audio-mixer device"},
+	{"mixer", &oss_mixer_device, CONF_TYPE_STRING, 0, 0, 0, NULL, "select audio-mixer device"},
 	{"channels", &audio_output_channels, CONF_TYPE_INT, CONF_RANGE, 2, 8, NULL, "select number of audio output channels to be used"},
 	{"rate", &force_srate, CONF_TYPE_INT, CONF_RANGE, 1000, 8*48000, NULL, "specifies Hz for audio playback"},
 	{"lang", &audio_lang, CONF_TYPE_STRING, 0, 0, 0, NULL, "specifies language of DVD-audio stream as two-letter country code(s)"},
@@ -304,28 +303,28 @@
 };
 
 static const config_t video_config[]={
-	{"x", &opt_screen_size_x, CONF_TYPE_INT, CONF_RANGE, 0, 4096, NULL, "scale output image to x width (if driver supports)"},
-	{"y", &opt_screen_size_y, CONF_TYPE_INT, CONF_RANGE, 0, 4096, NULL, "scale output image to y height (if driver supports)"},
-	{"xy", &screen_size_xy, CONF_TYPE_FLOAT, CONF_RANGE, 0, 4096, NULL, "scale output image by given factor"},
+	{"width", &opt_screen_size_x, CONF_TYPE_INT, CONF_RANGE, 0, 4096, NULL, "scale output image to width (if driver supports)"},
+	{"height", &opt_screen_size_y, CONF_TYPE_INT, CONF_RANGE, 0, 4096, NULL, "scale output image to height (if driver supports)"},
+	{"zoom", &screen_size_xy, CONF_TYPE_FLOAT, CONF_RANGE, 0, 4096, NULL, "scale output image by given factor"},
 	{"screenw", &vo_screenwidth, CONF_TYPE_INT, CONF_RANGE, 0, 4096, NULL, "specifies the horizontal resolution of the screen (if supported)"},
 	{"screenh", &vo_screenheight, CONF_TYPE_INT, CONF_RANGE, 0, 4096, NULL, "specifies the vertical resolution of the screen (if supported)"},
 	{"speed", &playbackspeed_factor, CONF_TYPE_FLOAT, CONF_RANGE, 0.01, 100.0, NULL, "sets playback speed factor"},
 	{"aspect", &movie_aspect, CONF_TYPE_FLOAT, CONF_RANGE, 0.2, 3.0, NULL, "sets aspect-ratio of movies (autodetect)"},
 	{"noaspect", &movie_aspect, CONF_TYPE_FLAG, 0, 0, 0, NULL, "unsets aspect-ratio of movies"},
+	{"aspect-ratio", &softzoom, CONF_TYPE_FLAG, 0, 0, 1, NULL, "keeps aspect-ratio of the movie during window resize"},
+	{"noaspect-ratio", &softzoom, CONF_TYPE_FLAG, 0, 1, 0, NULL, "render movie to the user-defined window's geometry"},
 	{"monitorpixelaspect", &monitor_pixel_aspect, CONF_TYPE_FLOAT, CONF_RANGE, 0.2, 9.0, NULL, "sets the aspect-ratio of a single pixel of TV screen"},
-        {"vm", &vidmode, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables video-mode changing during playback"},
+	{"vm", &vidmode, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables video-mode changing during playback"},
 	{"novm", &vidmode, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables video-mode changing during playback"},
 	{"fs", &fullscreen, CONF_TYPE_FLAG, 0, 0, 1, NULL, "fullscreen playback"},
 	{"nofs", &fullscreen, CONF_TYPE_FLAG, 0, 1, 0, NULL, "windowed playback"},
 	{"fsmode", &vo_fsmode, CONF_TYPE_INT, CONF_RANGE, 0, 15, NULL, "enables workaround for some fullscreen related problems"},
-        {"zoom", &softzoom, CONF_TYPE_FLAG, 0, 0, 1, NULL, "keeps aspect-ratio of the movie during window resize"},
-        {"nozoom", &softzoom, CONF_TYPE_FLAG, 0, 1, 0, NULL, "render movie to the user-defined window's geometry"},
-        {"flip", &flip, CONF_TYPE_FLAG, 0, -1, 1, NULL, "flip output image upside-down"},
-        {"noflip", &flip, CONF_TYPE_FLAG, 0, -1, 0, NULL, "render output image as is"},
-        {"bpp", &vo_dbpp, CONF_TYPE_INT, CONF_RANGE, 0, 32, NULL, "use different color depth than autodetect"},
-        {"bm", &vo_use_bm, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of bus-mastering (if it available for given OS/videocard)"},
-        {"bm2", &vo_use_bm, CONF_TYPE_FLAG, 0, 0, 2, NULL, "enables using of bus-mastering to store all decoded-ahead frames in video-memory"},
-        {"nobm", &vo_use_bm, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of bus-mastering"},
+	{"flip", &flip, CONF_TYPE_FLAG, 0, -1, 1, NULL, "flip output image upside-down"},
+	{"noflip", &flip, CONF_TYPE_FLAG, 0, -1, 0, NULL, "render output image as is"},
+	{"bpp", &vo_dbpp, CONF_TYPE_INT, CONF_RANGE, 0, 32, NULL, "use different color depth than autodetect"},
+	{"bm", &vo_use_bm, CONF_TYPE_FLAG, 0, 0, 1, NULL, "enables using of bus-mastering (if it available for given OS/videocard)"},
+	{"bm2", &vo_use_bm, CONF_TYPE_FLAG, 0, 0, 2, NULL, "enables using of bus-mastering to store all decoded-ahead frames in video-memory"},
+	{"nobm", &vo_use_bm, CONF_TYPE_FLAG, 0, 1, 0, NULL, "disables using of bus-mastering"},
 	{"id", &video_id, CONF_TYPE_INT, CONF_RANGE, 0, 255, NULL, "selects video channel"},
 	{"pp", &npp_options, CONF_TYPE_STRING, 0, 0, 0, NULL, "specifies options of post-processing"},
 	{"sws", &sws_flags, CONF_TYPE_INT, 0, 0, 2, NULL, "specifies the quality of the software scaler"},
@@ -336,7 +335,7 @@
 	{"noxv", &sdl_noxv, CONF_TYPE_FLAG, 0, 0, 1, NULL, "disable XVideo hardware acceleration for SDL"},
 	{"forcexv", &sdl_forcexv, CONF_TYPE_FLAG, 0, 0, 1, NULL, "force XVideo hardware acceleration for SDL"},
 	{"forcegl", &sdl_forcegl, CONF_TYPE_FLAG, 0, 0, 1, NULL, "force OpenGL hardware acceleration for SDL"},
-#endif	
+#endif
 	{"eq",&veq_config, CONF_TYPE_SUBCONFIG, 0, 0, 0, NULL, "Video-equalizer specific options"},
 	{NULL, NULL, 0, 0, 0, 0, NULL,NULL},
 };
@@ -346,8 +345,8 @@
 	{"ss", &seek_to_sec, CONF_TYPE_STRING, CONF_MIN, 0, 0, NULL, "seek to given time position before playback"},
 	{"loop", &loop_times, CONF_TYPE_INT, CONF_RANGE, -1, 10000, NULL, "loops movie playback given number of times. 0 means forever"},
 	{"noloop", &loop_times, CONF_TYPE_FLAG, 0, 0, -1, NULL, "disable loop of playback"},
-        {"shuffle",&shuffle_playback, CONF_TYPE_FLAG, 0, 0, 1, NULL, "play files in random order"},
-        {"noshuffle",&shuffle_playback, CONF_TYPE_FLAG, 0, 1, 0, NULL, "play files in regular order"},
+	{"shuffle",&shuffle_playback, CONF_TYPE_FLAG, 0, 0, 1, NULL, "play files in random order"},
+	{"noshuffle",&shuffle_playback, CONF_TYPE_FLAG, 0, 1, 0, NULL, "play files in regular order"},
 	{"list", NULL, CONF_TYPE_STRING, 0, 0, 0, NULL, "specifies playlist (1 file/row or Winamp or ASX format)"},
 	{"frames", &play_n_frames, CONF_TYPE_INT, CONF_MIN, 0, 0, NULL, "play given number of frames and exit"},
 	{NULL, NULL, 0, 0, 0, 0, NULL,NULL},

Modified: mplayerxp/dec_ahead.h
===================================================================
--- mplayerxp/dec_ahead.h	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/dec_ahead.h	2010-01-18 15:55:23 UTC (rev 107)
@@ -60,6 +60,8 @@
 #define LOCK_VIDEO_DECODE() { MSG_D(DA_PREFIX"LOCK_VIDEO_DECODE\n"); pthread_mutex_lock(&video_decode_mutex); }
 #define UNLOCK_VIDEO_DECODE() { MSG_D(DA_PREFIX"UNLOCK_VIDEO_DECODE\n"); pthread_mutex_unlock(&video_decode_mutex); }
 
+#define __MP_ATOMIC(OP) { static pthread_mutex_t loc_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&loc_mutex); OP; pthread_mutex_unlock(&loc_mutex); } /* Runs OP under a lock. The mutex must be static and initialized: locking an automatic, uninitialized mutex is undefined and excludes nobody. NOTE(review): every expansion site still owns a separate lock, so OPs at different sites are not serialized against each other. */
+
 typedef struct sh_video_attr
 {
   int eof;			/* indicates last frame in stream */

Modified: mplayerxp/libmpcodecs/ad_mp3.c
===================================================================
--- mplayerxp/libmpcodecs/ad_mp3.c	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libmpcodecs/ad_mp3.c	2010-01-18 15:55:23 UTC (rev 107)
@@ -361,8 +361,10 @@
 	if(!((err==MPG123_OK)||(err==MPG123_NEED_MORE))) {
 	    MSG_ERR("mpg123_read = %s done = %u minlen = %u\n",mpg123_plain_strerror(err),done,minlen);
 	}
-	else
+	else {
+	    MSG_DBG2("ad_mp3.decode: copy %u bytes from %p\n",done,outdata);
 	    memcpy(buf,outdata,done);
+	}
 	if(err==MPG123_NEED_MORE) {
 	    indata_size=ds_get_packet_r(sh->ds,&indata,pts);
 	    if(indata_size<0) return 0;

Modified: mplayerxp/libmpcodecs/dec_video.c
===================================================================
--- mplayerxp/libmpcodecs/dec_video.c	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libmpcodecs/dec_video.c	2010-01-18 15:55:23 UTC (rev 107)
@@ -80,12 +80,10 @@
     sh_video->inited=0;
 }
 
-#ifdef _OPENMP
 #define MPDEC_THREAD_COND (VF_FLAGS_THREADS|VF_FLAGS_SLICES)
 static unsigned smp_num_cpus=1;
 static unsigned use_vf_threads=0;
 extern int enable_gomp;
-#endif
 
 extern char *video_codec;
 int init_video(sh_video_t *sh_video,const char* codecname,const char * vfm,int status){
@@ -207,12 +205,12 @@
 	    for(j=0;j<num_slices;j+=smp_num_cpus) {
 #pragma omp parallel for shared(vf) private(i)
 		for(i=j;i<smp_num_cpus;i++) {
-		    MSG_DBG2("Put slice[%u %u] in threads\n",ampi[i].y,ampi[i].h);
+		    MSG_DBG2("parallel: dec_video.put_slice[%ux%u] %i %i %i %i\n",ampi[i].width,ampi[i].height,ampi[i].x,ampi[i].y,ampi[i].w,ampi[i].h);
 		    vf->put_slice(vf,&ampi[i]);
 		}
 	    }
 	    for(;j<num_slices;j++) {
-		MSG_DBG2("Put slice[%u %u] in threads\n",ampi[j].y,h_step);
+		MSG_DBG2("par_tail: dec_video.put_slice[%ux%u] %i %i %i %i\n",ampi[j].width,ampi[j].height,ampi[j].x,ampi[j].y,ampi[j].w,ampi[j].h); /* log the slice submitted below (index j); i is stale after the parallel loop */
 		vf->put_slice(vf,&ampi[j]);
 	    }
 	}
@@ -221,12 +219,12 @@
 	{
 	    /* execute slices instead of whole frame make faster multiple filters */
 	    for(i=0;i<num_slices;i++) {
-		MSG_DBG2("vf(%s) Put slice[%u %u] in threads\n",vf->info->name,ampi[i].y,ampi[i].h);
+		MSG_DBG2("dec_video.put_slice[%ux%u] %i %i %i %i\n",ampi[i].width,ampi[i].height,ampi[i].x,ampi[i].y,ampi[i].w,ampi[i].h);
 		vf->put_slice(vf,&ampi[i]);
 	    }
 	}
     } else {
-	MSG_DBG2("Put whole frame\n");
+	MSG_DBG2("Put whole frame[%ux%u]\n",mpi->width,mpi->height);
 	vf->put_slice(vf,mpi);
     }
   }

Modified: mplayerxp/libmpcodecs/vd_ffmpeg.c
===================================================================
--- mplayerxp/libmpcodecs/vd_ffmpeg.c	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libmpcodecs/vd_ffmpeg.c	2010-01-18 15:55:23 UTC (rev 107)
@@ -5,6 +5,7 @@
 #include <dlfcn.h> /* GLIBC specific. Exists under cygwin too! */
 
 #include "mp_config.h"
+#include "../dec_ahead.h"
 #ifdef HAVE_GOMP
 #include <omp.h>
 #endif
@@ -578,17 +579,18 @@
     priv_t *vdff_ctx=sh->context;
     mp_image_t *mpi;
     if(vdff_ctx->use_dr1) { MSG_DBG2("Ignoring draw_slice due dr1\n"); return; } /* we may call vo_start_slice() here */
-    mpi=mpcodecs_get_image(sh,MP_IMGTYPE_EXPORT, MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_DRAW_CALLBACK|MP_IMGFLAG_DIRECT,s->width,height);
+    mpi=mpcodecs_get_image(sh,MP_IMGTYPE_EXPORT, MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_DRAW_CALLBACK|MP_IMGFLAG_DIRECT,s->width,s->height);
 
     mpi->stride[0]=src->linesize[0];
     mpi->stride[1]=src->linesize[1];
     mpi->stride[2]=src->linesize[2];
-    mpi->planes[0] = src->base[0]+offset[0];
-    mpi->planes[1] = src->base[1]+offset[1];
-    mpi->planes[2] = src->base[2]+offset[2];
+    mpi->planes[0] = src->data[0];
+    mpi->planes[1] = src->data[1];
+    mpi->planes[2] = src->data[2];
     mpi->w=s->width;
     mpi->y=y;
     mpi->h=height;
+    mpi->chroma_height = height >> mpi->chroma_y_shift;
     /* provide info for pp */
     mpi->qscale=(QP_STORE_T *)vdff_ctx->lavc_picture->qscale_table;
     mpi->qstride=vdff_ctx->lavc_picture->qstride;
@@ -607,14 +609,10 @@
 	mpi->stride[2]=mpi->stride[1];
 	mpi->stride[1]=ls;
     }
-    MSG_DBG2("ff_draw_callback %i %i %i %i\n",mpi->x,mpi->y,mpi->w,mpi->h);
-    pthread_mutex_lock(&sh->mutex);
-    sh->active_slices++;
-    pthread_mutex_unlock(&sh->mutex);
+    MSG_DBG2("ff_draw_callback[%ux%u] %i %i %i %i\n",mpi->width,mpi->height,mpi->x,mpi->y,mpi->w,mpi->h);
+    __MP_ATOMIC(sh->active_slices++);
     mpcodecs_draw_slice (sh, mpi);
-    pthread_mutex_lock(&sh->mutex);
-    sh->active_slices--;
-    pthread_mutex_unlock(&sh->mutex);
+    __MP_ATOMIC(sh->active_slices--);
 }
 
 /* copypaste from demux_real.c - it should match to get it working!*/

Modified: mplayerxp/libvo/aclib.c
===================================================================
--- mplayerxp/libvo/aclib.c	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libvo/aclib.c	2010-01-18 15:55:23 UTC (rev 107)
@@ -8,110 +8,81 @@
 #if defined(USE_FASTMEMCPY)
 #include "fastmemcpy.h"
 #include "../cpudetect.h"
-/*
-  aclib - advanced C library ;)
-  This file contains functions which improve and expand standard C-library
-  see aclib_template.c ... this file only contains runtime cpu detection and config options stuff
-  runtime cpu detection by michael niedermayer (mic...@gm...) is under GPL
-*/
-#if defined( CAN_COMPILE_MMX ) && defined (ARCH_X86)
 
 #define BLOCK_SIZE 4096
 #define CONFUSION_FACTOR 0
-//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
 
-//#define STATISTICS
+/* generic version */
+#undef OPTIMIZE_AVX
+#undef OPTIMIZE_SSE4
+#undef OPTIMIZE_SSSE3
+#undef OPTIMIZE_SSE3
+#undef OPTIMIZE_SSE2
+#undef OPTIMIZE_SSE
+#undef OPTIMIZE_MMX2
+#undef OPTIMIZE_MMX
 
-#if defined( ARCH_X86 )
-#define CAN_COMPILE_X86_ASM
-#endif
-
-//Note: we have MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
-//Plain C versions
-//#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
-//#define COMPILE_C
-//#endif
-
-#ifdef CAN_COMPILE_X86_ASM
-
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_3DNOW
-#undef HAVE_SSE
-
-//MMX versions
-#ifdef CAN_COMPILE_MMX
-#undef RENAME
+#ifndef __x86_64__
+#ifdef __MMX__
+#define OPTIMIZE_MMX
 #undef CL_SIZE
 #define CL_SIZE 32
-#define HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_3DNOW
+#undef RENAME
 #define RENAME(a) a ## _MMX
 #include "aclib_template.c"
 #endif
-
-//MMX2 versions 32-byte cache-line size
-#ifdef CAN_COMPILE_MMX2
-#undef RENAME
+#ifdef __MMX2__
+#define OPTIMIZE_MMX2
 #undef CL_SIZE
 #define CL_SIZE 32
-#define HAVE_MMX
-#define HAVE_MMX2
-#undef HAVE_3DNOW
+#undef RENAME
 #define RENAME(a) a ## _MMX2_CL32
 #include "aclib_template.c"
 #endif
-
-//MMX2 versions 64-byte cache-line size
-#ifdef CAN_COMPILE_MMX2
-#undef RENAME
+#ifdef __MMX2__
+#define OPTIMIZE_MMX2
 #undef CL_SIZE
 #define CL_SIZE 64
-#define HAVE_MMX
-#define HAVE_MMX2
-#undef HAVE_3DNOW
+#undef RENAME
 #define RENAME(a) a ## _MMX2_CL64
 #include "aclib_template.c"
 #endif
-
-//MMX2 versions 128-byte cache-line size
-#ifdef CAN_COMPILE_MMX2
-#undef RENAME
+#ifdef __MMX2__
+#define OPTIMIZE_MMX2
 #undef CL_SIZE
 #define CL_SIZE 128
-#define HAVE_MMX
-#define HAVE_MMX2
-#undef HAVE_3DNOW
+#undef RENAME
 #define RENAME(a) a ## _MMX2_CL128
 #include "aclib_template.c"
 #endif
-
-//3DNOW versions all K6 have 32-bit cache-line size
-#ifdef CAN_COMPILE_3DNOW
+#endif // __x86_64__
+#ifdef __SSE2__
+#define OPTIMIZE_SSE2
 #undef RENAME
 #undef CL_SIZE
-#define CL_SIZE 32
-#define HAVE_MMX
-#undef HAVE_MMX2
-#define HAVE_3DNOW
-#define RENAME(a) a ## _3DNow
+#define CL_SIZE 128
+#define RENAME(a) a ## _SSE2
 #include "aclib_template.c"
 #endif
-#endif // CAN_COMPILE_X86_ASM
+/*
+  aclib - advanced C library ;)
+  This file contains functions which improve and expand standard C-library
+  see aclib_template.c ... this file only contains runtime cpu detection and config options stuff
+  runtime cpu detection by michael niedermayer (mic...@gm...) is under GPL
+*/
 
-#elif defined( ARCH_X86_64 )
-#define RENAME(a) a ## _x86_64
-#include "aclib_x86_64.h"
-#endif
-
 static void * init_fast_memcpy(void * to, const void * from, size_t len)
 {
-#if defined( ARCH_X86_64 ) && defined( USE_FASTMEMCPY )
-	fast_memcpy_ptr = fast_memcpy_x86_64;
-#elif defined( CAN_COMPILE_X86_ASM )
-	// ordered per speed fasterst first
-#ifdef CAN_COMPILE_MMX2
+#ifdef __SSE2__
+	if(gCpuCaps.hasSSE2)
+	{
+		MSG_V("Using SSE2 optimized memcpy\n");
+		fast_memcpy_ptr = fast_memcpy_SSE2;
+	}
+	else
+#endif
+#ifndef __x86_64__
+#ifdef __MMX2__
 	if(gCpuCaps.hasMMX2)
 	{
 		MSG_V("Using MMX2 optimized memcpy\n");
@@ -123,14 +94,6 @@
 	}
 	else
 #endif
-#ifdef CAN_COMPILE_3DNOW
-	if(gCpuCaps.has3DNow)
-	{
-		MSG_V("Using 3DNow optimized memcpy\n");
-		fast_memcpy_ptr = fast_memcpy_3DNow;
-	}
-	else
-#endif
 #ifdef CAN_COMPILE_MMX
 	if(gCpuCaps.hasMMX)
 	{
@@ -139,60 +102,54 @@
 	}
 	else
 #endif
-#else
+#endif
 	{
 		MSG_V("Using generic memcpy\n");
 		fast_memcpy_ptr = memcpy; /* prior to mmx we use the standart memcpy */
 	}
-#endif
 	return (*fast_memcpy_ptr)(to,from,len);
 }
 
-static void * init_mem2agpcpy(void * to, const void * from, size_t len)
+static void * init_stream_copy(void * to, const void * from, size_t len)
 {
-#if defined( ARCH_X86_64 ) && defined( USE_FASTMEMCPY )
-	mem2agpcpy_ptr = mem2agpcpy_x86_64;
-#elif defined ( CAN_COMPILE_X86_ASM )
-	// ordered per speed fasterst first
-#ifdef CAN_COMPILE_MMX2
+#ifdef __SSE2__
+	if(gCpuCaps.hasSSE2)
+	{
+		MSG_V("Using SSE2 optimized agpcpy\n");
+		fast_stream_copy_ptr = fast_stream_copy_SSE2;
+	} else /* chain into the fallbacks below (as init_fast_memcpy does) so they cannot overwrite the SSE2 choice */
+#endif
+#ifndef __x86_64__
+#ifdef __MMX2__
 	if(gCpuCaps.hasMMX2)
 	{
 		MSG_V("Using MMX2 optimized agpcpy\n");
-		if(gCpuCaps.cl_size >= 128) mem2agpcpy_ptr = mem2agpcpy_MMX2_CL128;
+		if(gCpuCaps.cl_size >= 128) fast_stream_copy_ptr = fast_stream_copy_MMX2_CL128;
 		else
-		if(gCpuCaps.cl_size == 64) mem2agpcpy_ptr = mem2agpcpy_MMX2_CL64;
+		if(gCpuCaps.cl_size == 64) fast_stream_copy_ptr = fast_stream_copy_MMX2_CL64;
 		else
-		mem2agpcpy_ptr = mem2agpcpy_MMX2_CL32;
+		fast_stream_copy_ptr = fast_stream_copy_MMX2_CL32;
 	}
 	else
 #endif
-#ifdef CAN_COMPILE_3DNOW
-	if(gCpuCaps.has3DNow)
-	{
-		MSG_V("Using 3DNow optimized agpcpy\n");
-		mem2agpcpy_ptr = mem2agpcpy_3DNow;
-	}
-	else
-#endif
-#ifdef CAN_COMPILE_MMX
+#ifdef __MMX__
 	if(gCpuCaps.hasMMX)
 	{
 		MSG_V("Using MMX optimized agpcpy\n");
-		mem2agpcpy_ptr = mem2agpcpy_MMX;
+		fast_stream_copy_ptr = fast_stream_copy_MMX;
 	}
 	else
 #endif
-#else
+#endif
 	{
 		MSG_V("Using generic optimized agpcpy\n");
-		mem2agpcpy_ptr = memcpy; /* prior to mmx we use the standart memcpy */
+		fast_stream_copy_ptr = memcpy; /* prior to mmx we use the standart memcpy */
 	}
-#endif
-	return (*mem2agpcpy_ptr)(to,from,len);
+	return (*fast_stream_copy_ptr)(to,from,len);
 }
 
 void *(*fast_memcpy_ptr)(void * to, const void * from, size_t len) = init_fast_memcpy;
-void *(*mem2agpcpy_ptr)(void * to, const void * from, size_t len) = init_mem2agpcpy;
+void *(*fast_stream_copy_ptr)(void * to, const void * from, size_t len) = init_stream_copy;
 
 #endif /* use fastmemcpy */
 

Modified: mplayerxp/libvo/aclib_template.c
===================================================================
--- mplayerxp/libvo/aclib_template.c	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libvo/aclib_template.c	2010-01-18 15:55:23 UTC (rev 107)
@@ -2,102 +2,70 @@
   aclib - advanced C library ;)
   This file contains functions which improve and expand standard C-library
 */
+#include "pvector/pvector.h"
 
-#ifndef HAVE_SSE2
-/*
-   P3 processor has only one SSE decoder so can execute only 1 sse insn per
-   cpu clock, but it has 3 mmx decoders (include load/store unit)
-   and executes 3 mmx insns per cpu clock.
-   P4 processor has some chances, but after reading:
-   http://www.emulators.com/pentium4.htm
-   I have doubts. Anyway SSE2 version of this code can be written better.
-*/
-#undef HAVE_SSE
-#endif
-
-
-/*
- This part of code was taken by me from Linux-2.4.3 and slightly modified
-for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
-blocks but mplayer uses weakly ordered data and original sources can not
-speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
-
->From IA-32 Intel Architecture Software Developer's Manual Volume 1,
-
-Order Number 245470:
-"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
-
-Data referenced by a program can be temporal (data will be used again) or
-non-temporal (data will be referenced once and not reused in the immediate
-future). To make efficient use of the processor's caches, it is generally
-desirable to cache temporal data and not cache non-temporal data. Overloading
-the processor's caches with non-temporal data is sometimes referred to as
-"polluting the caches".
-The non-temporal data is written to memory with Write-Combining semantics.
-
-The PREFETCHh instructions permits a program to load data into the processor
-at a suggested cache level, so that it is closer to the processors load and
-store unit when it is needed. If the data is already present in a level of
-the cache hierarchy that is closer to the processor, the PREFETCHh instruction
-will not result in any data movement.
-But we should you PREFETCHNTA: Non-temporal data fetch data into location
-close to the processor, minimizing cache pollution.
-
-The MOVNTQ (store quadword using non-temporal hint) instruction stores
-packed integer data from an MMX register to memory, using a non-temporal hint.
-The MOVNTPS (store packed single-precision floating-point values using
-non-temporal hint) instruction stores packed floating-point data from an
-XMM register to memory, using a non-temporal hint.
-
-The SFENCE (Store Fence) instruction controls write ordering by creating a
-fence for memory store operations. This instruction guarantees that the results
-of every store instruction that precedes the store fence in program order is
-globally visible before any store instruction that follows the fence. The
-SFENCE instruction provides an efficient way of ensuring ordering between
-procedures that produce weakly-ordered data and procedures that consume that
-data.
-
-If you have questions please contact with me: Nickols_K <nic...@ma...>.
-*/
-
-/* 3dnow memcpy support from kernel 2.4.2
-   by Pontscho/fresh!mindworkz */
-
-
 /* for small memory blocks (<256 bytes) this version is faster */
+#ifdef __x86_64__
 #define small_memcpy(to,from,n)\
 {\
+register unsigned long int siz;\
 register unsigned long int dummy;\
+    siz=n&0x7;  n>>=3;\
+    if(siz)\
 __asm__ __volatile__(\
 	"rep; movsb"\
 	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
 /* It's most portable way to notify compiler */\
 /* that edi, esi and ecx are clobbered in asm block. */\
 /* Thanks to A'rpi for hint!!! */\
+        :"0" (to), "1" (from),"2" (siz)\
+	: "memory","cc");\
+    if(n)\
+__asm__ __volatile__(\
+	"rep; movsq"\
+	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
+/* It's most portable way to notify compiler */\
+/* that edi, esi and ecx are clobbered in asm block. */\
+/* Thanks to A'rpi for hint!!! */\
         :"0" (to), "1" (from),"2" (n)\
 	: "memory","cc");\
 }
+#else
+#define small_memcpy(to,from,n)\
+{\
+register unsigned long int dummy;\
+__asm__ __volatile__(\
+	"rep; movsb"\
+	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
+/* It's most portable way to notify compiler */\
+/* that edi, esi and ecx are clobbered in asm block. */\
+/* Thanks to A'rpi for hint!!! */\
+        :"0" (to), "1" (from),"2" (n)\
+	: "memory","cc");\
+}
+#endif
 
-#include "../mmx_defs.h"
 #undef MMREG_SIZE
-#ifdef HAVE_SSE
+#ifdef OPTIMIZE_SSE2
 #define MMREG_SIZE 16
 #else
 #define MMREG_SIZE 64 //8
 #endif
 #undef MIN_LEN
-#ifdef HAVE_MMX1
+#ifndef OPTIMIZE_MMX2
 #define MIN_LEN 0x800  /* 2K blocks */
 #else
 #define MIN_LEN 0x40  /* 64-byte blocks */
 #endif
 
 
-static inline void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
+static inline void * RENAME(fast_memory_copy)(void * to, const void * from, size_t len,int final)
 {
 	void *retval;
 	const unsigned char *cfrom=from;
 	unsigned char *tto=to;
+	const unsigned ivec_block_size = 8*__IVEC_SIZE;
+	__ivec iarr[8];
 	size_t i;
 	retval = to;
 	if(!len) return retval;
@@ -114,45 +82,41 @@
 				MSG_V("freq < %8d %4d\n", 1<<i, freq[i]);
 	}
 #endif
-#ifndef HAVE_MMX1
-        /* PREFETCH has effect even for MOVSB instruction ;) */
-	__asm__ __volatile__ (
-	        PREFETCH" (%0)\n"
+
+    _ivec_prefetch(cfrom);
 #if CL_SIZE == 32
-		PREFETCH" 32(%0)\n"
+    _ivec_prefetch(&cfrom[32]);
 #endif
 #if CL_SIZE < 128
-	        PREFETCH" 64(%0)\n"
+    _ivec_prefetch(&cfrom[64]);
 #endif
 #if CL_SIZE == 32
-		PREFETCH" 96(%0)\n"
+    _ivec_prefetch(&cfrom[96]);
 #endif
-	        PREFETCH" 128(%0)\n"
+    _ivec_prefetch(&cfrom[128]);
 #if CL_SIZE == 32
-		PREFETCH" 160(%0)\n"
+    _ivec_prefetch(&cfrom[160]);
 #endif
 #if CL_SIZE < 128
-        	PREFETCH" 192(%0)\n"
+    _ivec_prefetch(&cfrom[192]);
 #endif
 #if CL_SIZE == 32
-		PREFETCH" 224(%0)\n"
+    _ivec_prefetch(&cfrom[224]);
 #endif
-        	PREFETCH" 256(%0)\n"
-		: : "r" (cfrom) );
-#endif
-        if(len >= MIN_LEN)
-	{
-	  register unsigned long int delta;
-          /* Align destinition to cache-line size -boundary */
-          delta = ((unsigned long int)tto)&(CL_SIZE-1);
-          if(delta)
-	  {
+    _ivec_prefetch(&cfrom[256]);
+
+    if(len >= MIN_LEN)
+    {
+	register unsigned long int delta;
+        /* Align destination to a cache-line-size boundary */
+        delta = ((unsigned long int)tto)&(CL_SIZE-1);
+        if(delta) {
 	    delta=MMREG_SIZE-delta;
 	    len -= delta;
 	    small_memcpy(tto, cfrom, delta);
-	  }
-	  i = len >> 6; /* len/64 */
-	  len&=63;
+	}
+	i = len/ivec_block_size;
+	len&=(ivec_block_size-1);
         /*
            This algorithm is top effective when the code consequently
            reads and writes blocks which have size of cache line.
@@ -162,340 +126,73 @@
            perform reading and writing to be multiple to a number of
            processor's decoders, but it's not always possible.
         */
-#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
-	if(((unsigned long)cfrom) & 15)
-	/* if SRC is misaligned */
 	for(; i>0; i--)
 	{
-		__asm__ __volatile__ (
-		PREFETCH" 320(%0)\n"
+	    _ivec_prefetch(&cfrom[320]);
 #if CL_SIZE == 32
-		PREFETCH" 352(%0)\n"
+	    _ivec_prefetch(&cfrom[352]);
 #endif
-		"movups (%0), %%xmm0\n"
-		"movups 16(%0), %%xmm1\n"
-		"movups 32(%0), %%xmm2\n"
-		"movups 48(%0), %%xmm3\n"
-		"movntps %%xmm0, (%1)\n"
-		"movntps %%xmm1, 16(%1)\n"
-		"movntps %%xmm2, 32(%1)\n"
-		"movntps %%xmm3, 48(%1)\n"
-		:: "r" (cfrom), "r" (tto):
-		"memory"
-#ifdef SSE_CLOBBERED
-		,SSE_CLOBBERED
-#endif
-		);
-		cfrom+=64;
-		tto+=64;
+	    if(((unsigned long)cfrom) & 15) {
+		/* if SRC is misaligned */
+		iarr[0] = _ivec_loadu(&cfrom[__IVEC_SIZE*0]);
+		iarr[1] = _ivec_loadu(&cfrom[__IVEC_SIZE*1]);
+		iarr[2] = _ivec_loadu(&cfrom[__IVEC_SIZE*2]);
+		iarr[3] = _ivec_loadu(&cfrom[__IVEC_SIZE*3]);
+		iarr[4] = _ivec_loadu(&cfrom[__IVEC_SIZE*4]);
+		iarr[5] = _ivec_loadu(&cfrom[__IVEC_SIZE*5]);
+		iarr[6] = _ivec_loadu(&cfrom[__IVEC_SIZE*6]);
+		iarr[7] = _ivec_loadu(&cfrom[__IVEC_SIZE*7]);
+	    } else {
+		iarr[0] = _ivec_loada(&cfrom[__IVEC_SIZE*0]);
+		iarr[1] = _ivec_loada(&cfrom[__IVEC_SIZE*1]);
+		iarr[2] = _ivec_loada(&cfrom[__IVEC_SIZE*2]);
+		iarr[3] = _ivec_loada(&cfrom[__IVEC_SIZE*3]);
+		iarr[4] = _ivec_loada(&cfrom[__IVEC_SIZE*4]);
+		iarr[5] = _ivec_loada(&cfrom[__IVEC_SIZE*5]);
+		iarr[6] = _ivec_loada(&cfrom[__IVEC_SIZE*6]);
+		iarr[7] = _ivec_loada(&cfrom[__IVEC_SIZE*7]);
+	    }
+	    if(final) {
+		_ivec_stream(&tto[__IVEC_SIZE*0],iarr[0]);
+		_ivec_stream(&tto[__IVEC_SIZE*1],iarr[1]);
+		_ivec_stream(&tto[__IVEC_SIZE*2],iarr[2]);
+		_ivec_stream(&tto[__IVEC_SIZE*3],iarr[3]);
+		_ivec_stream(&tto[__IVEC_SIZE*4],iarr[4]);
+		_ivec_stream(&tto[__IVEC_SIZE*5],iarr[5]);
+		_ivec_stream(&tto[__IVEC_SIZE*6],iarr[6]);
+		_ivec_stream(&tto[__IVEC_SIZE*7],iarr[7]);
+	    } else {
+		_ivec_storea(&tto[__IVEC_SIZE*0],iarr[0]);
+		_ivec_storea(&tto[__IVEC_SIZE*1],iarr[1]);
+		_ivec_storea(&tto[__IVEC_SIZE*2],iarr[2]);
+		_ivec_storea(&tto[__IVEC_SIZE*3],iarr[3]);
+		_ivec_storea(&tto[__IVEC_SIZE*4],iarr[4]);
+		_ivec_storea(&tto[__IVEC_SIZE*5],iarr[5]);
+		_ivec_storea(&tto[__IVEC_SIZE*6],iarr[6]);
+		_ivec_storea(&tto[__IVEC_SIZE*7],iarr[7]);
+	    }
+	    cfrom+=ivec_block_size;
+	    tto+=ivec_block_size;
 	}
-	else
-	/*
-	   Only if SRC is aligned on 16-byte boundary.
-	   It allows to use movaps instead of movups, which required data
-	   to be aligned or a general-protection exception (#GP) is generated.
-	*/
-	for(; i>0; i--)
-	{
-		__asm__ __volatile__ (
-		PREFETCH" 320(%0)\n"
-#if CL_SIZE == 32
-		PREFETCH" 352(%0)\n"
-#endif
-		"movaps (%0), %%xmm0\n"
-		"movaps 16(%0), %%xmm1\n"
-		"movaps 32(%0), %%xmm2\n"
-		"movaps 48(%0), %%xmm3\n"
-		"movntps %%xmm0, (%1)\n"
-		"movntps %%xmm1, 16(%1)\n"
-		"movntps %%xmm2, 32(%1)\n"
-		"movntps %%xmm3, 48(%1)\n"
-		:: "r" (cfrom), "r" (tto)
-		:"memory"
-#ifdef SSE_CLOBBERED
-		,SSE_CLOBBERED
-#endif
-		);
-		cfrom+=64;
-		tto+=64;
-	}
-#else
-	// Align destination at BLOCK_SIZE boundary
-	for(; ((int)tto & (BLOCK_SIZE-1)) && i>0; i--)
-	{
-		__asm__ __volatile__ (
-#ifndef HAVE_MMX1
-        	PREFETCH" 320(%0)\n"
-#if CL_SIZE == 32
-		PREFETCH" 352(%0)\n"
-#endif
-#endif
-		"movq (%0), %%mm0\n"
-		"movq 8(%0), %%mm1\n"
-		"movq 16(%0), %%mm2\n"
-		"movq 24(%0), %%mm3\n"
-		"movq 32(%0), %%mm4\n"
-		"movq 40(%0), %%mm5\n"
-		"movq 48(%0), %%mm6\n"
-		"movq 56(%0), %%mm7\n"
-		MOVNTQ" %%mm0, (%1)\n"
-		MOVNTQ" %%mm1, 8(%1)\n"
-		MOVNTQ" %%mm2, 16(%1)\n"
-		MOVNTQ" %%mm3, 24(%1)\n"
-		MOVNTQ" %%mm4, 32(%1)\n"
-		MOVNTQ" %%mm5, 40(%1)\n"
-		MOVNTQ" %%mm6, 48(%1)\n"
-		MOVNTQ" %%mm7, 56(%1)\n"
-		:: "r" (cfrom), "r" (tto)
-		: "memory"
-#ifdef FPU_CLOBBERED
-		,FPU_CLOBBERED
-#endif
-#ifdef MMX_CLOBBERED
-		,MMX_CLOBBERED
-#endif
-		);
-		cfrom+=64;
-		tto+=64;
-	}
-
-	// Pure Assembly cuz gcc is a bit unpredictable ;)
-	if(i>=BLOCK_SIZE/64)
-		asm volatile(
-			"xorl %%"REG_a", %%"REG_a"	\n\t"
-			".balign 16		\n\t"
-			"1:			\n\t"
-				"movl (%0, %%"REG_a"), %%"REG_b" 	\n\t"
-#if CL_SIZE == 32
-				"movl 32(%0, %%"REG_a"), %%"REG_b" 	\n\t"
-#endif
-#if CL_SIZE < 128
-				"movl 64(%0, %%"REG_a"), %%"REG_b" 	\n\t"
-#endif
-#if CL_SIZE == 32
-				"movl 96(%0, %%"REG_a"), %%"REG_b" 	\n\t"
-#endif
-				"addl $128, %%"REG_a"		\n\t"
-				"cmpl %3, %%"REG_a"			\n\t"
-				" jb 1b				\n\t"
-
-			"xorl %%"REG_a", %%"REG_a"	\n\t"
-
-				".balign 16		\n\t"
-				"2:			\n\t"
-				"movq (%0, %%"REG_a"), %%mm0\n"
-				"movq 8(%0, %%"REG_a"), %%mm1\n"
-				"movq 16(%0, %%"REG_a"), %%mm2\n"
-				"movq 24(%0, %%"REG_a"), %%mm3\n"
-				"movq 32(%0, %%"REG_a"), %%mm4\n"
-				"movq 40(%0, %%"REG_a"), %%mm5\n"
-				"movq 48(%0, %%"REG_a"), %%mm6\n"
-				"movq 56(%0, %%"REG_a"), %%mm7\n"
-				MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
-				MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
-				MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
-				MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
-				MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
-				MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
-				MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
-				MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
-				"addl $64, %%"REG_a"		\n\t"
-				"cmpl %3, %%"REG_a"		\n\t"
-				"jb 2b				\n\t"
-
-#if CONFUSION_FACTOR > 0
-	// a few percent speedup on out of order executing CPUs
-			"movl %5, %%"REG_a"		\n\t"
-				"2:			\n\t"
-				"movl (%0), %%"REG_b"	\n\t"
-				"movl (%0), %%"REG_b"	\n\t"
-				"movl (%0), %%"REG_b"	\n\t"
-				"movl (%0), %%"REG_b"	\n\t"
-				"decl %%"REG_a"		\n\t"
-				" jnz 2b		\n\t"
-#endif
-
-			"xorl %%"REG_a", %%"REG_a"	\n\t"
-			"addl %3, %0		\n\t"
-			"addl %3, %1		\n\t"
-			"subl %4, %2		\n\t"
-			"cmpl %4, %2		\n\t"
-			" jae 1b		\n\t"
-				: "+r" (cfrom), "+r" (tto), "+r" (i)
-				: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
-				: "%"REG_a, "%"REG_b, "memory", "cc"
-#ifdef FPU_CLOBBERED
-				,FPU_CLOBBERED
-#endif
-#ifdef MMX_CLOBBERED
-				,MMX_CLOBBERED
-#endif
-		);
-
-	for(; i>0; i--)
-	{
-		__asm__ __volatile__ (
-#ifndef HAVE_MMX1
-        	PREFETCH" 320(%0)\n"
-#if CL_SIZE == 32
-		PREFETCH" 352(%0)\n"
-#endif
-#endif
-		"movq (%0), %%mm0\n"
-		"movq 8(%0), %%mm1\n"
-		"movq 16(%0), %%mm2\n"
-		"movq 24(%0), %%mm3\n"
-		"movq 32(%0), %%mm4\n"
-		"movq 40(%0), %%mm5\n"
-		"movq 48(%0), %%mm6\n"
-		"movq 56(%0), %%mm7\n"
-		MOVNTQ" %%mm0, (%1)\n"
-		MOVNTQ" %%mm1, 8(%1)\n"
-		MOVNTQ" %%mm2, 16(%1)\n"
-		MOVNTQ" %%mm3, 24(%1)\n"
-		MOVNTQ" %%mm4, 32(%1)\n"
-		MOVNTQ" %%mm5, 40(%1)\n"
-		MOVNTQ" %%mm6, 48(%1)\n"
-		MOVNTQ" %%mm7, 56(%1)\n"
-		:: "r" (cfrom), "r" (tto)
-		: "memory"
-#ifdef FPU_CLOBBERED
-		,FPU_CLOBBERED
-#endif
-#ifdef MMX_CLOBBERED
-		,MMX_CLOBBERED
-#endif
-		);
-		cfrom+=64;
-		tto+=64;
-	}
-
-#endif /* Have SSE */
-#ifdef HAVE_MMX2
-                /* since movntq is weakly-ordered, a "sfence"
-		 * is needed to become ordered again. */
-		__asm__ __volatile__ ("sfence":::"memory");
-#endif
-#ifndef HAVE_SSE
-		/* enables to use FPU */
-		__asm__ __volatile__ (EMMS::
-		:"memory"
-#ifdef FPU_CLOBBERED
-		,FPU_CLOBBERED
-#endif
-#ifdef MMX_CLOBBERED
-		,MMX_CLOBBERED
-#endif
-		);
-#endif
-	}
-	/*
-	 *	Now do the tail of the block
-	 */
-	if(len) small_memcpy(tto, cfrom, len);
-	return retval;
+	_ivec_sfence();
+	_ivec_empty();
+    }
+    /*
+     *	Now do the tail of the block
+     */
+    if(len) small_memcpy(tto, cfrom, len);
+    return retval;
 }
 
 /**
  * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
  */
-static inline void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
+static inline void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
 {
-	void *retval;
-	const unsigned char *cfrom=from;
-	unsigned char *tto=to;
-	size_t i;
-	retval = to;
-	if(!len) return retval;
-#ifdef STATISTICS
-	{
-		static int freq[33];
-		static int t=0;
-		int i;
-		for(i=0; len>(1<<i); i++);
-		freq[i]++;
-		t++;
-		if(1024*1024*1024 % t == 0)
-			for(i=0; i<32; i++)
-				MSG_V("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
-	}
-#endif
-        if(len >= MIN_LEN)
-	{
-	  register unsigned long int delta;
-          /* Align destinition to cache-line size -boundary */
-          delta = ((unsigned long int)tto)&(CL_SIZE-1);
-          if(delta)
-	  {
-	    delta=8-delta;
-	    len -= delta;
-	    small_memcpy(tto, cfrom, delta);
-	  }
-	  i = len >> 6; /* len/64 */
-	  len &= 63;
-        /*
-           This algorithm is top effective when the code consequently
-           reads and writes blocks which have size of cache line.
-           Size of cache line is processor-dependent.
-           It will, however, be a minimum of 32 bytes on any processors.
-           It would be better to have a number of instructions which
-           perform reading and writing to be multiple to a number of
-           processor's decoders, but it's not always possible.
-        */
-	for(; i>0; i--)
-	{
-		__asm__ __volatile__ (
-        	PREFETCH" 320(%0)\n"
-#if CL_SIZE == 32
-		PREFETCH" 352(%0)\n"
-#endif
-		"movq (%0), %%mm0\n"
-		"movq 8(%0), %%mm1\n"
-		"movq 16(%0), %%mm2\n"
-		"movq 24(%0), %%mm3\n"
-		"movq 32(%0), %%mm4\n"
-		"movq 40(%0), %%mm5\n"
-		"movq 48(%0), %%mm6\n"
-		"movq 56(%0), %%mm7\n"
-		MOVNTQ" %%mm0, (%1)\n"
-		MOVNTQ" %%mm1, 8(%1)\n"
-		MOVNTQ" %%mm2, 16(%1)\n"
-		MOVNTQ" %%mm3, 24(%1)\n"
-		MOVNTQ" %%mm4, 32(%1)\n"
-		MOVNTQ" %%mm5, 40(%1)\n"
-		MOVNTQ" %%mm6, 48(%1)\n"
-		MOVNTQ" %%mm7, 56(%1)\n"
-		:: "r" (cfrom), "r" (tto)
-		: "memory"
-#ifdef FPU_CLOBBERED
-		,FPU_CLOBBERED
-#endif
-#ifdef MMX_CLOBBERED
-		,MMX_CLOBBERED
-#endif
-		);
-		cfrom+=64;
-		tto+=64;
-	}
-#ifdef HAVE_MMX2
-                /* since movntq is weakly-ordered, a "sfence"
-		 * is needed to become ordered again. */
-		__asm__ __volatile__ ("sfence":::"memory");
-#endif
-		/* enables to use FPU */
-		__asm__ __volatile__ (EMMS::
-		:"memory"
-#ifdef FPU_CLOBBERED
-		,FPU_CLOBBERED
-#endif
-#ifdef MMX_CLOBBERED
-		,MMX_CLOBBERED
-#endif
-		);
-	}
-	/*
-	 *	Now do the tail of the block
-	 */
-	if(len) small_memcpy(tto, cfrom, len);
-	return retval;
+	return RENAME(fast_memory_copy)(to,from,len,0);
 }
+
+static inline void * RENAME(fast_stream_copy)(void * to, const void * from, size_t len)
+{
+	return RENAME(fast_memory_copy)(to,from,len,1);
+}

Deleted: mplayerxp/libvo/aclib_x86_64.h
===================================================================
--- mplayerxp/libvo/aclib_x86_64.h	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libvo/aclib_x86_64.h	2010-01-18 15:55:23 UTC (rev 107)
@@ -1,191 +0,0 @@
-/*
-  aclib - advanced C library ;)
-  This file contains functions which improve and expand standard C-library
-*/
-
-/* for small memory blocks (<256 bytes) this version is faster */
-#define small_memcpy(to,from,n)\
-{\
-register unsigned long int siz;\
-register unsigned long int dummy;\
-    siz=n&0x7;  n>>=3;\
-    if(siz)\
-__asm__ __volatile__(\
-	"rep; movsb"\
-	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
-/* It's most portable way to notify compiler */\
-/* that edi, esi and ecx are clobbered in asm block. */\
-/* Thanks to A'rpi for hint!!! */\
-        :"0" (to), "1" (from),"2" (siz)\
-	: "memory","cc");\
-    if(n)\
-__asm__ __volatile__(\
-	"rep; movsq"\
-	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
-/* It's most portable way to notify compiler */\
-/* that edi, esi and ecx are clobbered in asm block. */\
-/* Thanks to A'rpi for hint!!! */\
-        :"0" (to), "1" (from),"2" (n)\
-	: "memory","cc");\
-}
-
-
-#define MMREG_SIZE 16ULL
-#define MIN_LEN 257ULL
-#define CL_SIZE 256ULL /*always align on 256 byte boundary */
-
-static inline void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
-{
-	void *retval;
-	const unsigned char *cfrom=from;
-	unsigned char *tto=to;
-	size_t i=0;
-	retval = to;
-	if(!len) return retval;
-        /* PREFETCH has effect even for MOVSB instruction ;) */
-	__asm__ __volatile__ (
-		"prefetcht0 (%0)\n"
-		"prefetcht0 64(%0)\n"
-		"prefetcht0 128(%0)\n"
-		"prefetcht0 192(%0)\n"
-		:: "r" (cfrom));
-        if(len >= MIN_LEN)
-	{
-	  register unsigned long int delta;
-          /* Align destinition to cache-line size -boundary */
-          delta = ((unsigned long int)tto)&(CL_SIZE-1ULL);
-          if(delta)
-	  {
-	    delta=CL_SIZE-delta;
-	    len -=delta;
-	    small_memcpy(tto, cfrom, delta);
-	  }
-	  i = len>>8; /* len/256 */
-	  len=len-(i<<8);
-	}
-	if(i) {
-        /*
-           This algorithm is top effective when the code consequently
-           reads and writes blocks which have size of cache line.
-           Size of cache line is processor-dependent.
-           It will, however, be a minimum of 32 bytes on any processors.
-           It would be better to have a number of instructions which
-           perform reading and writing to be multiple to a number of
-           processor's decoders, but it's not always possible.
-        */
-	if(((unsigned long)cfrom) & 15)
-	/* if SRC is misaligned */
-	for(; i>0; i--)
-	{
-		__asm__ __volatile__ (
-		"prefetcht0 256(%0)\n"
-		"prefetcht0 320(%0)\n"
-		"movdqu (%0), %%xmm0\n"
-		"movdqu 16(%0), %%xmm1\n"
-		"movdqu 32(%0), %%xmm2\n"
-		"movdqu 48(%0), %%xmm3\n"
-		"movdqu 64(%0), %%xmm4\n"
-		"movdqu 80(%0), %%xmm5\n"
-		"movdqu 96(%0), %%xmm6\n"
-		"movdqu 112(%0), %%xmm7\n"
-		"prefetcht0 384(%0)\n"
-		"prefetcht0 448(%0)\n"
-		"movdqu 128(%0), %%xmm8\n"
-		"movdqu 144(%0), %%xmm9\n"
-		"movdqu 160(%0), %%xmm10\n"
-		"movdqu 176(%0), %%xmm11\n"
-		"movdqu 192(%0), %%xmm12\n"
-		"movdqu 208(%0), %%xmm13\n"
-		"movdqu 224(%0), %%xmm14\n"
-		"movdqu 240(%0), %%xmm15\n"
-		"movntdq %%xmm0, (%1)\n"
-		"movntdq %%xmm1, 16(%1)\n"
-		"movntdq %%xmm2, 32(%1)\n"
-		"movntdq %%xmm3, 48(%1)\n"
-		"movntdq %%xmm4, 64(%1)\n"
-		"movntdq %%xmm5, 80(%1)\n"
-		"movntdq %%xmm6, 96(%1)\n"
-		"movntdq %%xmm7, 112(%1)\n"
-		"movntdq %%xmm8, 128(%1)\n"
-		"movntdq %%xmm9, 144(%1)\n"
-		"movntdq %%xmm10, 160(%1)\n"
-		"movntdq %%xmm11, 176(%1)\n"
-		"movntdq %%xmm12, 192(%1)\n"
-		"movntdq %%xmm13, 208(%1)\n"
-		"movntdq %%xmm14, 224(%1)\n"
-		"movntdq %%xmm15, 240(%1)\n"
-		:: "r" (cfrom), "r" (tto):
-		"memory"
-		,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
-		);
-		cfrom+=256ULL;
-		tto+=256ULL;
-	}
-	else
-	/*
-	   Only if SRC is aligned on 16-byte boundary.
-	   It allows to use movdqa instead of movdqu, which required data
-	   to be aligned or a general-protection exception (#GP) is generated.
-	*/
-	for(; i>0; i--)
-	{
-		__asm__ __volatile__ (
-		"prefetcht0 256(%0)\n"
-		"prefetcht0 320(%0)\n"
-		"movdqa (%0), %%xmm0\n"
-		"movdqa 16(%0), %%xmm1\n"
-		"movdqa 32(%0), %%xmm2\n"
-		"movdqa 48(%0), %%xmm3\n"
-		"movdqa 64(%0), %%xmm4\n"
-		"movdqa 80(%0), %%xmm5\n"
-		"movdqa 96(%0), %%xmm6\n"
-		"movdqa 112(%0), %%xmm7\n"
-		"prefetcht0 384(%0)\n"
-		"prefetcht0 448(%0)\n"
-		"movdqa 128(%0), %%xmm8\n"
-		"movdqa 144(%0), %%xmm9\n"
-		"movdqa 160(%0), %%xmm10\n"
-		"movdqa 176(%0), %%xmm11\n"
-		"movdqa 192(%0), %%xmm12\n"
-		"movdqa 208(%0), %%xmm13\n"
-		"movdqa 224(%0), %%xmm14\n"
-		"movdqa 240(%0), %%xmm15\n"
-		"movntdq %%xmm0, (%1)\n"
-		"movntdq %%xmm1, 16(%1)\n"
-		"movntdq %%xmm2, 32(%1)\n"
-		"movntdq %%xmm3, 48(%1)\n"
-		"movntdq %%xmm4, 64(%1)\n"
-		"movntdq %%xmm5, 80(%1)\n"
-		"movntdq %%xmm6, 96(%1)\n"
-		"movntdq %%xmm7, 112(%1)\n"
-		"movntdq %%xmm8, 128(%1)\n"
-		"movntdq %%xmm9, 144(%1)\n"
-		"movntdq %%xmm10, 160(%1)\n"
-		"movntdq %%xmm11, 176(%1)\n"
-		"movntdq %%xmm12, 192(%1)\n"
-		"movntdq %%xmm13, 208(%1)\n"
-		"movntdq %%xmm14, 224(%1)\n"
-		"movntdq %%xmm15, 240(%1)\n"
-		:: "r" (cfrom), "r" (tto):
-		"memory"
-		,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
-		);
-		cfrom+=256ULL;
-		tto+=256ULL;
-	  }
-	__asm__ __volatile__ ("sfence":::"memory");
-	}
-	/*
-	 *	Now do the tail of the block
-	 */
-	if(len) small_memcpy(tto, cfrom, len);
-	return retval;
-}
-
-/**
- * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
- */
-static inline void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
-{
-    return memcpy(to,from,len);
-}

Modified: mplayerxp/libvo/dri_vo.h
===================================================================
--- mplayerxp/libvo/dri_vo.h	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libvo/dri_vo.h	2010-01-18 15:55:23 UTC (rev 107)
@@ -20,6 +20,7 @@
 #define DRI_CAP_HORZSCALER	0x00000040UL	/**< Driver supports horizontal scaling */
 #define DRI_CAP_VERTSCALER	0x00000080UL	/**< Driver supports vertical scaling */
 #define DRI_CAP_HWOSD		0x00000100UL	/**< Driver supports OSD painting */
+#define DRI_CAP_BUSMASTERING	0x80000000UL	/**< Means: final video buffer but allocated in RAM */
 
 typedef struct dri_surface_cap_s
 {

Modified: mplayerxp/libvo/fastmemcpy.h
===================================================================
--- mplayerxp/libvo/fastmemcpy.h	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libvo/fastmemcpy.h	2010-01-18 15:55:23 UTC (rev 107)
@@ -7,24 +7,24 @@
 #include <stddef.h>
 #include <string.h> /* memcpy prototypes */
 extern void * (*fast_memcpy_ptr)(void * to, const void * from, size_t len);
-extern void * (*mem2agpcpy_ptr)(void * to, const void * from, size_t len);
+extern void * (*fast_stream_copy_ptr)(void * to, const void * from, size_t len);
 #define memcpy(a,b,c) (*fast_memcpy_ptr)(a,b,c)
-#define mem2agpcpy(a,b,c) (*mem2agpcpy_ptr)(a,b,c)
+#define stream_copy(a,b,c) (*fast_stream_copy_ptr)(a,b,c)
 #else
-#define mem2agpcpy(a,b,c) memcpy(a,b,c)
+#define stream_copy(a,b,c) memcpy(a,b,c)
 #endif
 
-static inline void * mem2agpcpy_pic(void * dst, const void * src, int bytesPerLine, int height, int dstStride, int srcStride)
+static inline void * stream_copy_pic(void * dst, const void * src, int bytesPerLine, int height, int dstStride, int srcStride)
 {
 	int i;
 	void *retval=dst;
 
-	if(dstStride == srcStride) mem2agpcpy(dst, src, srcStride*height);
+	if(dstStride == srcStride) stream_copy(dst, src, srcStride*height);
 	else
 	{
 		for(i=0; i<height; i++)
 		{
-			mem2agpcpy(dst, src, bytesPerLine);
+			stream_copy(dst, src, bytesPerLine);
 			src+= srcStride;
 			dst+= dstStride;
 		}

Modified: mplayerxp/libvo/osd.c
===================================================================
--- mplayerxp/libvo/osd.c	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libvo/osd.c	2010-01-18 15:55:23 UTC (rev 107)
@@ -73,7 +73,7 @@
 static unsigned short fast_osd_16bpp_table[256];
 #endif
 
-static void __FASTCALL__ vo_draw_alpha_rgb15_C(int w,int h, const unsigned char* src, const unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static void __FASTCALL__ vo_draw_alpha_rgb15_C(int w,int h, const unsigned char* src, const unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride,int finalize){
     int y;
     for(y=0;y<h;y++){
         register unsigned short *dst = (unsigned short*) dstbase;
@@ -105,7 +105,7 @@
     return;
 }
 
-static void __FASTCALL__ vo_draw_alpha_rgb16_C(int w,int h, const unsigned char* src, const unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
+static void __FASTCALL__ vo_draw_alpha_rgb16_C(int w,int h, const unsigned char* src, const unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride,int finalize){
     int y;
     for(y=0;y<h;y++){
         register unsigned short *dst = (unsigned short*) dstbase;
@@ -136,8 +136,8 @@
     return;
 }
 
-static void __FASTCALL__ vo_draw_alpha_uyvy_C(int w,int h, const unsigned char* src, const unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
-    (*vo_draw_alpha_yuy2_ptr)(w,h,src,srca,srcstride,dstbase+1,dststride);
+static void __FASTCALL__ vo_draw_alpha_uyvy_C(int w,int h, const unsigned char* src, const unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride,int finalize){
+    (*vo_draw_alpha_yuy2_ptr)(w,h,src,srca,srcstride,dstbase+1,dststride,finalize);
 }
 
 draw_alpha_f vo_draw_alpha_yv12_ptr=NULL;

Modified: mplayerxp/libvo/osd.h
===================================================================
--- mplayerxp/libvo/osd.h	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libvo/osd.h	2010-01-18 15:55:23 UTC (rev 107)
@@ -6,7 +6,7 @@
 
 extern void vo_draw_alpha_init( void ); /* build tables */
 
-typedef void (* __FASTCALL__ draw_alpha_f)(int w,int h, const unsigned char* src, const unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride);
+typedef void (* __FASTCALL__ draw_alpha_f)(int w,int h, const unsigned char* src, const unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride,int finalize);
 
 extern draw_alpha_f vo_draw_alpha_yv12_ptr;
 extern draw_alpha_f vo_draw_alpha_yuy2_ptr;
@@ -15,11 +15,11 @@
 extern draw_alpha_f vo_draw_alpha_rgb32_ptr;
 extern draw_alpha_f vo_draw_alpha_rgb15_ptr;
 extern draw_alpha_f vo_draw_alpha_rgb16_ptr;
-#define vo_draw_alpha_yv12(w,h,src,srca,srcstride,dstbase,dstrstride) (*vo_draw_alpha_yv12_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride)
-#define vo_draw_alpha_yuy2(w,h,src,srca,srcstride,dstbase,dstrstride) (*vo_draw_alpha_yuy2_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride)
-#define vo_draw_alpha_uyvy(w,h,src,srca,srcstride,dstbase,dstrstride) (*vo_draw_alpha_uyvy_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride)
-#define vo_draw_alpha_rgb24(w,h,src,srca,srcstride,dstbase,dstrstride) (*vo_draw_alpha_rgb24_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride)
-#define vo_draw_alpha_rgb32(w,h,src,srca,srcstride,dstbase,dstrstride) (*vo_draw_alpha_rgb32_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride)
-#define vo_draw_alpha_rgb15(w,h,src,srca,srcstride,dstbase,dstrstride) (*vo_draw_alpha_rgb15_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride)
-#define vo_draw_alpha_rgb16(w,h,src,srca,srcstride,dstbase,dstrstride) (*vo_draw_alpha_rgb16_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride)
+#define vo_draw_alpha_yv12(w,h,src,srca,srcstride,dstbase,dstrstride,finalize) (*vo_draw_alpha_yv12_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride,finalize)
+#define vo_draw_alpha_yuy2(w,h,src,srca,srcstride,dstbase,dstrstride,finalize) (*vo_draw_alpha_yuy2_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride,finalize)
+#define vo_draw_alpha_uyvy(w,h,src,srca,srcstride,dstbase,dstrstride,finalize) (*vo_draw_alpha_uyvy_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride,finalize)
+#define vo_draw_alpha_rgb24(w,h,src,srca,srcstride,dstbase,dstrstride,finalize) (*vo_draw_alpha_rgb24_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride,finalize)
+#define vo_draw_alpha_rgb32(w,h,src,srca,srcstride,dstbase,dstrstride,finalize) (*vo_draw_alpha_rgb32_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride,finalize)
+#define vo_draw_alpha_rgb15(w,h,src,srca,srcstride,dstbase,dstrstride,finalize) (*vo_draw_alpha_rgb15_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride,finalize)
+#define vo_draw_alpha_rgb16(w,h,src,srca,srcstride,dstbase,dstrstride,finalize) (*vo_draw_alpha_rgb16_ptr)(w,h,src,srca,srcstride,dstbase,dstrstride,finalize)
 #endif

Modified: mplayerxp/libvo/osd_template.c
===================================================================
--- mplayerxp/libvo/osd_template.c	2010-01-17 18:46:44 UTC (rev 106)
+++ mplayerxp/libvo/osd_template.c	2010-01-18 15:55:23 UTC (rev 107)
@@ -50,7 +50,7 @@
 
 #endif
 
-static inline void RENAME(vo_draw_alpha_yv12)(int w,int h,const unsigned char* src,const unsig...
 
[truncated message content]