From: Josh V. <ho...@na...> - 2000-10-16 18:57:49
|
Gareth Hughes <ga...@va...> writes: > The movzx instructions are very fast (they've been specially optimized > for doing mixed 8/16 and 32 bit operations on PPro/PII/PIII processors), > while the second listing has lots of partial register stalls which seem > to be killing the performance. > > I'm stunned and amazed. It looks like gcc is trying to keep the values as ubytes for too long. If you change r,g,b,a from GLubyte to GLuint, it seems to do a better job. With an old GCC 2.96 snapshot (20000529) and using uints instead of ubytes, you get this: .file "mesaprs.c" .version "01.01" gcc2_compiled.: .text .align 4 .globl _mesa_convert_teximage_argb_4444 .type _mesa_convert_teximage_argb_4444,@function _mesa_convert_teximage_argb_4444: pushl %ebp pushl %edi pushl %esi pushl %ebx subl $12, %esp movl 52(%esp), %eax sall $1, %eax movl $0, (%esp) movl 40(%esp), %edx movl %eax, 8(%esp) cmpl %edx, (%esp) movl 44(%esp), %eax movl 68(%esp), %esi movl %eax, 4(%esp) jae .L13 .p2align 2 .L6: xorl %ebp, %ebp xorl %edi, %edi cmpl 36(%esp), %ebp jae .L14 .p2align 2 .L10: movzbl (%edi,%esi), %edx movzbl 3(%edi,%esi), %eax andl $240, %eax andl $240, %edx movzbl 1(%edi,%esi), %ecx sall $4, %edx sall $8, %eax movzbl 2(%edi,%esi), %ebx orl %edx, %eax andl $240, %ecx orl %ecx, %eax shrl $4, %ebx orl %ebx, %eax movl 4(%esp), %edx movw %ax, (%edx,%ebp,2) incl %ebp addl $4, %edi cmpl 36(%esp), %ebp jb .L10 .L14: movl 48(%esp), %eax incl (%esp) movl 40(%esp), %edx addl 8(%esp), %esi addl %eax, 4(%esp) cmpl %edx, (%esp) jb .L6 .L13: addl $12, %esp popl %ebx popl %esi movl $1, %eax popl %edi popl %ebp ret .Lfe1: .size _mesa_convert_teximage_argb_4444,.Lfe1-_mesa_convert_teximage_argb_4444 .ident "GCC: (GNU) 2.96 20000529 (experimental)" |