Re: [Mesa3d-dev] Texutil optimizations - an interesting story

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Gareth Hughes <ga...@va...> writes:

> The movzx instructions are very fast (they've been specially optimized
> for doing mixed 8/16 and 32 bit operations on PPro/PII/PIII processors),
> while the second listing has lots of partial register stalls which seem
> to be killing the performance.
> 
> I'm stunned and amazed.

It looks like gcc is trying to keep the values as ubytes for too long.
If you change r,g,b,a from GLubyte to GLuint, it seems to do a better
job.

With an old GCC 2.96 snapshot (20000529) and using uints instead of
ubytes, you get this:

# ----------------------------------------------------------------------
# Compiler output (GCC 2.96 snapshot), i386, AT&T/GAS syntax
# (operand order is "op src, dst").
#
# _mesa_convert_teximage_argb_4444
#   Walks a two-level loop: for each row, for each pixel, reads four
#   source bytes and packs the top nibble of each into one 16-bit word.
#   Always returns 1 in %eax.
#
# Stack layout after the prologue (4 pushes + 12 bytes of locals = 28
# bytes below the return address), assuming the usual i386 cdecl
# convention where all arguments are on the stack -- TODO confirm
# against the C prototype, which is not shown here:
#     (%esp)   local: row counter
#    4(%esp)   local: destination pointer for the current row
#    8(%esp)   local: per-row source advance in bytes (= 2 * 52(%esp))
#   36(%esp)   2nd argument: pixels per row (inner loop bound)
#   40(%esp)   3rd argument: number of rows (outer loop bound)
#   44(%esp)   4th argument: destination base pointer
#   48(%esp)   5th argument: per-row destination advance in bytes
#   52(%esp)   6th argument: doubled to form the source row advance
#   68(%esp)   10th argument: source base pointer
#
# Registers: %esi = source row pointer, %edi = byte offset in the source
# row, %ebp = pixel index in the row; %eax/%ebx/%ecx/%edx are scratch.
# %ebp is used as a general-purpose register (no frame pointer).
# ----------------------------------------------------------------------
	.file	"mesaprs.c"
	.version	"01.01"
gcc2_compiled.:
.text
	.align 4
.globl _mesa_convert_teximage_argb_4444
	.type	 _mesa_convert_teximage_argb_4444,@function
_mesa_convert_teximage_argb_4444:
	# Prologue: save the four callee-saved registers used below, then
	# reserve 12 bytes for the three locals.
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	subl	$12, %esp
	# Set up the locals: row counter = 0, source row advance =
	# 52(%esp) * 2, destination row pointer = 44(%esp); %esi = source
	# base. The compiler interleaved these with the first
	# "row counter vs. row count" compare: the flags set by the cmpl
	# are still valid at the jae because the movl instructions in
	# between do not modify flags.
	movl	52(%esp), %eax
	sall	$1, %eax
	movl	$0, (%esp)
	movl	40(%esp), %edx
	movl	%eax, 8(%esp)
	cmpl	%edx, (%esp)
	movl	44(%esp), %eax
	movl	68(%esp), %esi
	movl	%eax, 4(%esp)
	# Unsigned test: skip everything when the row count is 0.
	jae	.L13
	.p2align 2
.L6:
	# Start of a row: pixel index and source byte offset both restart
	# at 0. Unsigned test: skip the inner loop when pixels-per-row is 0.
	xorl	%ebp, %ebp
	xorl	%edi, %edi
	cmpl	36(%esp), %ebp
	jae	.L14
	.p2align 2
.L10:
	# Per-pixel packing. The four source bytes are zero-extended into
	# full 32-bit registers with movzbl, and all arithmetic is done on
	# the 32-bit registers (no 8/16-bit sub-register writes):
	#   %edx = byte 0, %ecx = byte 1, %ebx = byte 2, %eax = byte 3
	#   %eax = ((byte3 & 0xf0) << 8)     -> bits 15..12
	#        | ((byte0 & 0xf0) << 4)     -> bits 11..8
	#        |  (byte1 & 0xf0)           -> bits  7..4
	#        |  (byte2 >> 4)             -> bits  3..0
	# Presumably bytes 0..3 are R,G,B,A, giving A:R:G:B nibbles as the
	# function name suggests -- verify against the C source.
	movzbl	(%edi,%esi), %edx
	movzbl	3(%edi,%esi), %eax
	andl	$240, %eax
	andl	$240, %edx
	movzbl	1(%edi,%esi), %ecx
	sall	$4, %edx
	sall	$8, %eax
	movzbl	2(%edi,%esi), %ebx
	orl	%edx, %eax
	andl	$240, %ecx
	orl	%ecx, %eax
	shrl	$4, %ebx
	orl	%ebx, %eax
	# Store the low 16 bits at (destination row pointer + 2 * pixel
	# index); the row pointer is reloaded from its stack slot each time.
	movl	4(%esp), %edx
	movw	%ax, (%edx,%ebp,2)
	# Next pixel: index += 1, source offset += 4 bytes; loop while the
	# index is below pixels-per-row (unsigned).
	incl	%ebp
	addl	$4, %edi
	cmpl	36(%esp), %ebp
	jb	.L10
.L14:
	# End of row: bump the row counter, advance the source pointer by
	# the precomputed row advance and the destination row pointer by
	# 48(%esp), then loop while the row counter is below the row count
	# (unsigned).
	movl	48(%esp), %eax
	incl	(%esp)
	movl	40(%esp), %edx
	addl	8(%esp), %esi
	addl	%eax, 4(%esp)
	cmpl	%edx, (%esp)
	jb	.L6
.L13:
	# Epilogue: release the locals, restore the saved registers in
	# reverse push order, and return the constant 1.
	addl	$12, %esp
	popl	%ebx
	popl	%esi
	movl	$1, %eax
	popl	%edi
	popl	%ebp
	ret
.Lfe1:
	.size	 _mesa_convert_teximage_argb_4444,.Lfe1-_mesa_convert_teximage_argb_4444
	.ident	"GCC: (GNU) 2.96 20000529 (experimental)"