From: Gareth H. <ga...@va...> - 2000-12-29 22:39:12
|
Before I forget (again), here's a fix for the truncation of pixel values when reading back from a 16bpp colour buffer. In driver_span.c, change the following definition for RGB565 spans from this: #define READ_RGBA( rgba, _x, _y ) \ do { \ GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ rgba[0] = (p >> 8) & 0xf8; \ rgba[1] = (p >> 3) & 0xfc; \ rgba[2] = (p << 3) & 0xf8; \ rgba[3] = 0xff; \ } while (0) to this: #define READ_RGBA( rgba, _x, _y ) \ do { \ GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ rgba[0] = (p >> 8) & 0xf8; \ rgba[1] = (p >> 3) & 0xfc; \ rgba[2] = (p << 3) & 0xf8; \ rgba[3] = 0xff; \ if ( rgba[0] & 0x08 ) rgba[0] |= 0x07; \ if ( rgba[1] & 0x04 ) rgba[1] |= 0x03; \ if ( rgba[2] & 0x08 ) rgba[2] |= 0x07; \ } while (0) Rather busy or I'd do it myself. Fixes glean test problems on r128, Radeon. -- Gareth |
From: <ra...@ra...> - 2000-12-30 02:48:09
|
On 29 Dec, Gareth Hughes scribbled: -> Before I forget (again), here's a fix for the truncation of pixel values -> when reading back from a 16bpp colour buffer. -> -> In driver_span.c, change the following definition for RGB565 spans from -> this: -> -> #define READ_RGBA( rgba, _x, _y ) \ -> do { \ -> GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ -> rgba[0] = (p >> 8) & 0xf8; \ -> rgba[1] = (p >> 3) & 0xfc; \ -> rgba[2] = (p << 3) & 0xf8; \ -> rgba[3] = 0xff; \ -> } while (0) -> -> to this: -> -> #define READ_RGBA( rgba, _x, _y ) \ -> do { \ -> GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ -> rgba[0] = (p >> 8) & 0xf8; \ -> rgba[1] = (p >> 3) & 0xfc; \ -> rgba[2] = (p << 3) & 0xf8; \ -> rgba[3] = 0xff; \ -> if ( rgba[0] & 0x08 ) rgba[0] |= 0x07; \ -> if ( rgba[1] & 0x04 ) rgba[1] |= 0x03; \ -> if ( rgba[2] & 0x08 ) rgba[2] |= 0x07; \ -> } while (0) -> -> Rather busy or I'd do it myself. Fixes glean test problems on r128, -> Radeon. just thought i'd mention - you'll bypass the cmp's and jne/jeq' branches and get some better performance with: #define READ_RGBA( rgba, _x, _y ) \ do { \ GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ rgba[0] = ((p >> 8) & 0xf8) | ((p >> 13) & 0x7); \ rgba[1] = ((p >> 3) & 0xfc) | ((p >> 9) & 0x3); \ rgba[2] = ((p << 3) & 0xf8) | ((p >> 2) & 0x7); \ rgba[3] = 0xff; \ } while (0) now even smarter is do 2 pixels at once and just do alignment/single pixel cleanups either end of the span if it's not a multiple of 2 or not aligned to 2 pixel boundaries... 
#define READ_2_RGBA( rgba, _x, _y ) \ do { \ GLuint p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ GLuint r, g, b; \ r = ((p >> 8) & 0xf8) | ((p >> 13) & 0x7); \ g = ((p >> 3) & 0xfc) | ((p >> 9) & 0x3); \ b = ((p << 3) & 0xf8) | ((p >> 2) & 0x7); \ rgba[0] = r & 0xff; \ rgba[1] = g & 0xff; \ rgba[3] = b & 0xff; \ rgba[4] = 0xff; \ rgba[5] = (r >> 16) & 0xff; \ rgba[6] = (g >> 16) & 0xff; \ rgba[7] = (b >> 16) & 0xff; \ rgba[8] = 0xff; \ } while (0) :) -- --------------- Codito, ergo sum - "I code, therefore I am" -------------------- The Rasterman (Carsten Haitzler) ra...@ra... ra...@va... ra...@en... ra...@li... ra...@zi... |
From: Gareth H. <ga...@va...> - 2000-12-30 03:06:17
|
ra...@ra... wrote: > > just thoguht i'd mention - you'll bypass the cmp's and jne/jeq' branches > and get some better performance with: > > #define READ_RGBA( rgba, _x, _y ) \ > do { \ > GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ > rgba[0] = ((p >> 8) & 0xf8) | ((p >> 13) & 0x7); \ > rgba[1] = ((p >> 3) & 0xfc) | ((p >> 9) & 0x3); \ > rgba[2] = ((p << 3) & 0xf8) | ((p >> 2) & 0x7); \ > rgba[3] = 0xff; \ > } while (0) > > now even smarter is do 2 pixels at once and just do alignment/single > pixel cleanups either end of the span if tis not a multipel of 2 or not > aligned to 2 pixel boundaries... > #define READ_2_RGBA( rgba, _x, _y ) \ > do { \ > GLuint p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ > GLuint r, g, b; \ > r = ((p >> 8) & 0xf8) | ((p >> 13) & 0x7); \ > g = ((p >> 3) & 0xfc) | ((p >> 9) & 0x3); \ > b = ((p << 3) & 0xf8) | ((p >> 2) & 0x7); \ > rgba[0] = r & 0xff; \ > rgba[1] = g & 0xff; \ > rgba[3] = b & 0xff; \ > rgba[4] = 0xff; \ > rgba[5] = (r >> 16) & 0xff; \ > rgba[6] = (g >> 16) & 0xff; \ > rgba[7] = (b >> 16) & 0xff; \ > rgba[8] = 0xff; \ > } while (0) Software fallbacks are slow for so many other reasons that this just isn't worth it (well, maybe the first option). Hell, we could do the whole thing in assembly and see a 0% speedup... The whole point of that macro is that it reads individual pixels. Thus, you can't just go ahead and read two pixels. The software fallback mechanism in Mesa could be made a lot faster, but the core contributors certainly have better things to do. It would require significantly more work than changing a pixel reading macro. If your hardware can do it blitting scanlines or even rectangles would be much faster, but it's arguable whether this is worth it as the main point of software fallbacks is correctness not performance. -- Gareth |
From: <ra...@ra...> - 2001-01-02 23:42:57
|
On 30 Dec, Gareth Hughes scribbled: -> ra...@ra... wrote: ->> ->> just thoguht i'd mention - you'll bypass the cmp's and jne/jeq' branches ->> and get some better performance with: ->> ->> #define READ_RGBA( rgba, _x, _y ) \ ->> do { \ ->> GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ ->> rgba[0] = ((p >> 8) & 0xf8) | ((p >> 13) & 0x7); \ ->> rgba[1] = ((p >> 3) & 0xfc) | ((p >> 9) & 0x3); \ ->> rgba[2] = ((p << 3) & 0xf8) | ((p >> 2) & 0x7); \ ->> rgba[3] = 0xff; \ ->> } while (0) ->> ->> now even smarter is do 2 pixels at once and just do alignment/single ->> pixel cleanups either end of the span if tis not a multipel of 2 or not ->> aligned to 2 pixel boundaries... ->> #define READ_2_RGBA( rgba, _x, _y ) \ ->> do { \ ->> GLuint p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ ->> GLuint r, g, b; \ ->> r = ((p >> 8) & 0xf8) | ((p >> 13) & 0x7); \ ->> g = ((p >> 3) & 0xfc) | ((p >> 9) & 0x3); \ ->> b = ((p << 3) & 0xf8) | ((p >> 2) & 0x7); \ ->> rgba[0] = r & 0xff; \ ->> rgba[1] = g & 0xff; \ ->> rgba[3] = b & 0xff; \ ->> rgba[4] = 0xff; \ ->> rgba[5] = (r >> 16) & 0xff; \ ->> rgba[6] = (g >> 16) & 0xff; \ ->> rgba[7] = (b >> 16) & 0xff; \ ->> rgba[8] = 0xff; \ ->> } while (0) -> -> Software fallbacks are slow for so many other reasons that this just -> isn't worth it (well, maybe the first option). Hell, we could do the -> whole thing in assembly and see a 0% speedup... -> -> The whole point of that macro is that it reads individual pixels. Thus, -> you can't just go ahead and read two pixels. -> -> The software fallback mechanism in Mesa could be made a lot faster, but -> the core contributors certainly have better things to do. It would -> require significantly more work than changing a pixel reading macro. If -> your hardware can do it blitting scanlines or even rectangles would be -> much faster, but it's arguable whether this is worth it as the main -> point of software fallbacks is correctness not performance. I know.. 
but after having spent a good time with imlib2 to make it fast as a software image rendering lib - i actually can get a decent framerate out of it in software no problems - so software rendering is feasible.. if done well.. anyway - just thought i'd suggest :) -- --------------- Codito, ergo sum - "I code, therefore I am" -------------------- The Rasterman (Carsten Haitzler) ra...@ra... ra...@va... ra...@en... ra...@li... ra...@zi... |
From: Nathan H. <na...@ma...> - 2001-01-03 00:31:22
|
On Tue, Jan 02, 2001 at 03:50:21PM -0800, ra...@ra... wrote: > On 30 Dec, Gareth Hughes scribbled: > -> > -> Software fallbacks are slow for so many other reasons that this just > -> isn't worth it (well, maybe the first option). Hell, we could do the > -> whole thing in assembly and see a 0% speedup... > -> > -> The whole point of that macro is that it reads individual pixels. Thus, > -> you can't just go ahead and read two pixels. > -> > -> The software fallback mechanism in Mesa could be made a lot faster, but > -> the core contributors certainly have better things to do. It would > -> require significantly more work than changing a pixel reading macro. If > -> your hardware can do it blitting scanlines or even rectangles would be > -> much faster, but it's arguable whether this is worth it as the main > -> point of software fallbacks is correctness not performance. > > I know.. but after having spent a good time with imlib2 to make it fast > as a software image rendering lib - i actually can get a decent > framerate out of it in software no problems - so software renderign is > feasible.. if done well.. anyway - just thought i'd suggest :) Sure, imlib is a good example of fast software rendering. I think Gareth's main point is that the software renderer in Mesa is slow because of the inherent nature of the Mesa code, not because some code paths aren't written in assembly. Improving readpixel functions either by using assembly or through CPU-specific code re-arrangements is nibbling at the edges, where the only way to make Mesa software rendering fast is to break out the meataxe and perform some potentially lethal surgery. |
From: <ra...@ra...> - 2001-01-03 00:43:48
|
On 3 Jan, Nathan Hand scribbled: -> On Tue, Jan 02, 2001 at 03:50:21PM -0800, ra...@ra... wrote: ->> On 30 Dec, Gareth Hughes scribbled: ->> -> ->> -> Software fallbacks are slow for so many other reasons that this just ->> -> isn't worth it (well, maybe the first option). Hell, we could do the ->> -> whole thing in assembly and see a 0% speedup... ->> -> ->> -> The whole point of that macro is that it reads individual pixels. Thus, ->> -> you can't just go ahead and read two pixels. ->> -> ->> -> The software fallback mechanism in Mesa could be made a lot faster, but ->> -> the core contributors certainly have better things to do. It would ->> -> require significantly more work than changing a pixel reading macro. If ->> -> your hardware can do it blitting scanlines or even rectangles would be ->> -> much faster, but it's arguable whether this is worth it as the main ->> -> point of software fallbacks is correctness not performance. ->> ->> I know.. but after having spent a good time with imlib2 to make it fast ->> as a software image rendering lib - i actually can get a decent ->> framerate out of it in software no problems - so software renderign is ->> feasible.. if done well.. anyway - just thought i'd suggest :) -> -> Sure, imlib is a good example of fast software rendering. I think -> Gareth's main point is that the software renderer in Mesa is slow -> because of the inherent nature of the Mesa code, not because some -> code paths aren't written in assembly. 
well - yes and no - it could be improved markedly from the little i saw - but that's an issue all on its own :) there's a lot of fairly common cases that could be special-cased and accelerated - with use of mmx i'm sure you'd see a 10 times speedup - if not more - i wish i had the time to help with that :( -> Improving readpixel functions either by using assembly or through -> CPU-specific code re-arrangements is nibbling at the edges, where -> the only way to make Mesa software rendering fast is to break out -> the meataxe and perform some potentially lethal surgery. definitely - i got that impression from reading it - it's clean and correct - but speed wise would need a good butchering to make it ready for speed :) -- --------------- Codito, ergo sum - "I code, therefore I am" -------------------- The Rasterman (Carsten Haitzler) ra...@ra... ra...@va... ra...@en... ra...@li... ra...@zi... |
From: Brian P. <br...@va...> - 2001-01-03 15:02:34
|
ra...@ra... wrote: > > On 3 Jan, Nathan Hand scribbled: > -> On Tue, Jan 02, 2001 at 03:50:21PM -0800, ra...@ra... wrote: > ->> On 30 Dec, Gareth Hughes scribbled: > ->> -> > ->> -> Software fallbacks are slow for so many other reasons that this just > ->> -> isn't worth it (well, maybe the first option). Hell, we could do the > ->> -> whole thing in assembly and see a 0% speedup... > ->> -> > ->> -> The whole point of that macro is that it reads individual pixels. Thus, > ->> -> you can't just go ahead and read two pixels. > ->> -> > ->> -> The software fallback mechanism in Mesa could be made a lot faster, but > ->> -> the core contributors certainly have better things to do. It would > ->> -> require significantly more work than changing a pixel reading macro. If > ->> -> your hardware can do it blitting scanlines or even rectangles would be > ->> -> much faster, but it's arguable whether this is worth it as the main > ->> -> point of software fallbacks is correctness not performance. > ->> > ->> I know.. but after having spent a good time with imlib2 to make it fast > ->> as a software image rendering lib - i actually can get a decent > ->> framerate out of it in software no problems - so software renderign is > ->> feasible.. if done well.. anyway - just thought i'd suggest :) > -> > -> Sure, imlib is a good example of fast software rendering. I think > -> Gareth's main point is that the software renderer in Mesa is slow > -> because of the inherent nature of the Mesa code, not because some > -> code paths aren't written in assembly. 
> > well - yes and no - it could be improved markedly form the little i saw > - but thats an issue all on its own :) theres a lot of fariyl common > cases that coudl eb special-cased and accelerated - with use of mmx i'm > sure you'd see a 10 tines speedup - if not more - i wish i had the time > to help with that :( > > -> Improving readpixel functions either by using assembly or through > -> CPU-specific code re-arrangements is nibbling at the edges, where > -> the only way to make Mesa software rendering fast is to break out > -> the meataxe and perform some potentially lethal surgery. > > definitely - i got that impression from reading it - its clean and > correct - but speed wise would need a godo butchering to make ti ready > for speed :) I don't think so. As I wrote before, one could readily optimize the primitives that need special attention. There's no need for any sort of butchering. -Brian |
From: Brian P. <br...@va...> - 2001-01-03 14:59:52
|
Nathan Hand wrote: > > On Tue, Jan 02, 2001 at 03:50:21PM -0800, ra...@ra... wrote: > > On 30 Dec, Gareth Hughes scribbled: > > -> > > -> Software fallbacks are slow for so many other reasons that this just > > -> isn't worth it (well, maybe the first option). Hell, we could do the > > -> whole thing in assembly and see a 0% speedup... > > -> > > -> The whole point of that macro is that it reads individual pixels. Thus, > > -> you can't just go ahead and read two pixels. > > -> > > -> The software fallback mechanism in Mesa could be made a lot faster, but > > -> the core contributors certainly have better things to do. It would > > -> require significantly more work than changing a pixel reading macro. If > > -> your hardware can do it blitting scanlines or even rectangles would be > > -> much faster, but it's arguable whether this is worth it as the main > > -> point of software fallbacks is correctness not performance. > > > > I know.. but after having spent a good time with imlib2 to make it fast > > as a software image rendering lib - i actually can get a decent > > framerate out of it in software no problems - so software renderign is > > feasible.. if done well.. anyway - just thought i'd suggest :) > > Sure, imlib is a good example of fast software rendering. I think > Gareth's main point is that the software renderer in Mesa is slow > because of the inherent nature of the Mesa code, not because some > code paths aren't written in assembly. > > Improving readpixel functions either by using assembly or through > CPU-specific code re-arrangements is nibbling at the edges, where > the only way to make Mesa software rendering fast is to break out > the meataxe and perform some potentially lethal surgery. That's not really accurate. If there's a particular type of operation that you want to optimize for software rendering (such as smooth-shaded, texture modulated triangles) there's nothing stopping one from writing a specialized triangle function for that case. 
It's pretty simple to hook in optimized point, line, triangle, glReadPixels, glDrawPixels and glBitmap functions into any software-based Mesa driver. The issue is whether it's worthwhile to put a lot of time and effort into this sort of thing when h/w rendering is the norm. -Brian |
From: Nathan H. <na...@ma...> - 2000-12-30 06:18:47
|
On Fri, Dec 29, 2000 at 03:59:46PM -0800, ra...@ra... wrote: > On 29 Dec, Gareth Hughes scribbled: > -> > -> #define READ_RGBA( rgba, _x, _y ) \ > -> do { \ > -> GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ > -> rgba[0] = (p >> 8) & 0xf8; \ > -> rgba[1] = (p >> 3) & 0xfc; \ > -> rgba[2] = (p << 3) & 0xf8; \ > -> rgba[3] = 0xff; \ > -> if ( rgba[0] & 0x08 ) rgba[0] |= 0x07; \ > -> if ( rgba[1] & 0x04 ) rgba[1] |= 0x03; \ > -> if ( rgba[2] & 0x08 ) rgba[2] |= 0x07; \ > -> } while (0) > > just thoguht i'd mention - you'll bypass the cmp's and jne/jeq' branches > and get some better performance with: > > #define READ_RGBA( rgba, _x, _y ) \ > do { \ > GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ > rgba[0] = ((p >> 8) & 0xf8) | ((p >> 13) & 0x7); \ > rgba[1] = ((p >> 3) & 0xfc) | ((p >> 9) & 0x3); \ > rgba[2] = ((p << 3) & 0xf8) | ((p >> 2) & 0x7); \ > rgba[3] = 0xff; \ > } while (0) Performance is an exceptional non-issue with the software fallback paths. Incremental improvements like this are welcome for hardware accelerated paths but they are lost in the noise otherwise. |