From: Keith W. <ke...@va...> - 2001-03-08 16:35:35
Brian Paul wrote:
>
> Jeff Epler wrote:
> >
> > On Tue, Mar 06, 2001 at 08:02:46PM -0500, Brian Paul wrote:
> > > GL_NV_vertex_program is a really nice, exciting extension.  It would
> > > be great to have it in Mesa.  The issue is getting permission from
> > > NVIDIA to implement it in open-source.  "NVIDIA Proprietary" is pretty
> > > clearly plastered over the pdf file.
> >
> > You're right.  I had read a version with the "proprietary" watermark,
> > but when I saw the version in this other PDF file the watermark was gone
> > and I thought it must have been released without restriction.  However,
> > it's still marked as proprietary.  What does this mean for the status
> > of the code I've written based on the document?  Should I avoid sharing
> > it with anybody at this time?

You may want to avoid sharing the nv-specific stuff, but any progress on otf
codegen has lots of application beyond that extension -- I can think of a
dozen uses for something like this.

Keith
From: Josh V. <ho...@na...> - 2001-03-08 19:24:34
Keith Whitwell <ke...@va...> writes:

> You may want to avoid sharing the nv-specific stuff, but any progress on otf
> codegen has lots of application beyond that extension -- I can think of a
> dozen uses for something like this.

What would you guys think of putting stuff like this in Mesa?

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

void *
compile_function(const char *name, const char *source, void **r_handle)
{
    char name_c[256], name_so[256], cmdline[768];
    FILE *fp;
    size_t len = strlen(source);
    pid_t pid = getpid();
    void *handle, *sym;

    sprintf(name_c, "/tmp/%ld_%.200s.c", (long)pid, name);
    sprintf(name_so, "/tmp/%ld_%.200s.so", (long)pid, name);

    fp = fopen(name_c, "w");
    if (fp == NULL)
        return NULL;

    if (fwrite(source, len, 1, fp) != 1) {
        fclose(fp);
        remove(name_c);
        return NULL;
    }
    fclose(fp);

    sprintf(cmdline, "gcc -O2 -fomit-frame-pointer -shared %s -o %s",
            name_c, name_so);
    if (system(cmdline) != 0) {
        remove(name_c);
        return NULL;
    }
    remove(name_c);

    handle = dlopen(name_so, RTLD_NOW);
    remove(name_so);
    if (handle == NULL)
        return NULL;

    sym = dlsym(handle, name);
    if (sym == NULL) {
        dlclose(handle);
        return NULL;
    }

    *r_handle = handle;
    return sym;
}

void
free_function(void *handle)
{
    dlclose(handle);
}

/* Each line ends in \n so the #if/#else/#endif directives land on their
 * own lines in the generated source. */
static const char *convert_template =
"void NAME(void *vsrc, void *vdst, int count)\n"
"{\n"
"    SRC_TYPE *src = vsrc;\n"
"    DST_TYPE *dst = vdst;\n"
"    int i;\n"
"    unsigned int s, d, r, g, b, a;\n"
"    for (i = 0; i < count; i++) {\n"
"        s = src[i];\n"
"        r = s >> SRC_RPOS;\n"
"        r &= (1 << SRC_RSZ) - 1;\n"
"#if SRC_RSZ < DST_RSZ\n"
"        r = r * ((1 << DST_RSZ) - 1) / ((1 << SRC_RSZ) - 1);\n"
"#else\n"
"        r >>= SRC_RSZ - DST_RSZ;\n"
"#endif\n"
"        r <<= DST_RPOS;\n"
"        g = s >> SRC_GPOS;\n"
"        g &= (1 << SRC_GSZ) - 1;\n"
"#if SRC_GSZ < DST_GSZ\n"
"        g = g * ((1 << DST_GSZ) - 1) / ((1 << SRC_GSZ) - 1);\n"
"#else\n"
"        g >>= SRC_GSZ - DST_GSZ;\n"
"#endif\n"
"        g <<= DST_GPOS;\n"
"        b = s >> SRC_BPOS;\n"
"        b &= (1 << SRC_BSZ) - 1;\n"
"#if SRC_BSZ < DST_BSZ\n"
"        b = b * ((1 << DST_BSZ) - 1) / ((1 << SRC_BSZ) - 1);\n"
"#else\n"
"        b >>= SRC_BSZ - DST_BSZ;\n"
"#endif\n"
"        b <<= DST_BPOS;\n"
"        a = s >> SRC_APOS;\n"
"        a &= (1 << SRC_ASZ) - 1;\n"
"#if SRC_ASZ < DST_ASZ\n"
"        a = a * ((1 << DST_ASZ) - 1) / ((1 << SRC_ASZ) - 1);\n"
"#else\n"
"        a >>= SRC_ASZ - DST_ASZ;\n"
"#endif\n"
"        a <<= DST_APOS;\n"
"        d = r | g | b | a;\n"
"        dst[i] = d;\n"
"    }\n"
"}\n";

int
convert_pixels(void *src, int src_pixel_size,
               int src_rpos, int src_gpos, int src_bpos, int src_apos,
               int src_rsz, int src_gsz, int src_bsz, int src_asz,
               void *dst, int dst_pixel_size,
               int dst_rpos, int dst_gpos, int dst_bpos, int dst_apos,
               int dst_rsz, int dst_gsz, int dst_bsz, int dst_asz,
               int count)
{
    char buf[10000], name[200];
    char *p = buf;
    void (*fn)(void *src, void *dst, int count);
    void *handle;

    switch (dst_pixel_size) {
    case 1:
    case 2:
    case 4:
        break;
    default:
        return 0;
    }

    sprintf(name, "convert_%d%d%d%d_%d%d%d%d",
            src_rsz, src_gsz, src_bsz, src_asz,
            dst_rsz, dst_gsz, dst_bsz, dst_asz);

    p += sprintf(p, "#define NAME %s\n", name);

    p += sprintf(p, "#define SRC_TYPE ");
    switch (src_pixel_size) {
    case 1: p += sprintf(p, "unsigned char\n"); break;
    case 2: p += sprintf(p, "unsigned short\n"); break;
    case 4: p += sprintf(p, "unsigned int\n"); break;
    default: return 0;
    }

    p += sprintf(p, "#define SRC_RPOS %d\n", src_rpos);
    p += sprintf(p, "#define SRC_GPOS %d\n", src_gpos);
    p += sprintf(p, "#define SRC_BPOS %d\n", src_bpos);
    p += sprintf(p, "#define SRC_APOS %d\n", src_apos);
    p += sprintf(p, "#define SRC_RSZ %d\n", src_rsz);
    p += sprintf(p, "#define SRC_GSZ %d\n", src_gsz);
    p += sprintf(p, "#define SRC_BSZ %d\n", src_bsz);
    p += sprintf(p, "#define SRC_ASZ %d\n", src_asz);

    p += sprintf(p, "#define DST_TYPE ");
    switch (dst_pixel_size) {
    case 1: p += sprintf(p, "unsigned char\n"); break;
    case 2: p += sprintf(p, "unsigned short\n"); break;
    case 4: p += sprintf(p, "unsigned int\n"); break;
    default: return 0;
    }

    p += sprintf(p, "#define DST_RPOS %d\n", dst_rpos);
    p += sprintf(p, "#define DST_GPOS %d\n", dst_gpos);
    p += sprintf(p, "#define DST_BPOS %d\n", dst_bpos);
    p += sprintf(p, "#define DST_APOS %d\n", dst_apos);
    p += sprintf(p, "#define DST_RSZ %d\n", dst_rsz);
    p += sprintf(p, "#define DST_GSZ %d\n", dst_gsz);
    p += sprintf(p, "#define DST_BSZ %d\n", dst_bsz);
    p += sprintf(p, "#define DST_ASZ %d\n", dst_asz);

    strcpy(p, convert_template);

    fn = compile_function(name, buf, &handle);
    if (fn != NULL)
        fn(src, dst, count);
    else
        return 0;
    free_function(handle);
    return 1;
}

int
main(void)
{
    unsigned int src[8] = {
        0xff555555, 0x55ff5555, 0x5555ff55, 0x555555ff,
        0x00555555, 0x55005555, 0x55550055, 0x55555500
    };
    unsigned short dst[8];
    int i;

    i = convert_pixels(src, 4,          /* pixel size */
                       16, 8, 0, 24,    /* offsets for r, g, b, a */
                       8, 8, 8, 8,      /* sizes for r, g, b, a */
                       dst, 2,
                       8, 4, 0, 12,
                       4, 4, 4, 4,
                       8);              /* count */
    if (i == 0)
        return 1;

    for (i = 0; i < 8; i++)
        printf("%08x %04x\n", src[i], dst[i]);

    return 0;
}
From: Stephen J B. <sj...@li...> - 2001-03-08 19:40:39
On 8 Mar 2001, Josh Vanderhoof wrote:

> Keith Whitwell <ke...@va...> writes:
>
> > You may want to avoid sharing the nv-specific stuff, but any progress on otf
> > codegen has lots of application beyond that extension -- I can think of a
> > dozen uses for something like this.
>
> What would you guys think of putting stuff like this in Mesa?

<snip compile-on-the-fly stuff>

I think the problems are significant:

1) Whilst it'll (perhaps) improve frame rates once the code has been
   compiled, you could easily get a several-second pause when you first
   refer to something that triggers this action.  In some applications,
   that would be disastrous.

2) You are depending on picking a valid compiler correctly and that
   there is a compiler on the system at all.  There are many (and
   growing) platforms where software-only Mesa might make sense
   (eg http://www.agendacomputing.com - a Linux-based PDA) where no
   compiler exists on the target machine.

I believe that when SGI did this kind of thing in their Windoze
software OpenGL, they were generating x86 machine code directly into
memory without using compilers or even assemblers.  That's a viable
technique...at least for the most popular CPU types.

----
Steve Baker                      (817)619-2657 (Vox/Vox-Mail)
L3Com/Link Simulation & Training (817)619-2466 (Fax)
Work: sj...@li...       http://www.link.com
Home: sjb...@ai...      http://web2.airmail.net/sjbaker1
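A minimal sketch of the "emit machine code straight into memory" idea, assuming
Linux/x86 and an mmap()ed executable buffer.  The byte sequence is illustrative
only (it is not from SGI's implementation): it encodes "mov eax, imm32; ret",
i.e. a function that returns a constant chosen at run time.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

typedef int (*int_fn)(void);

static int_fn
emit_return_constant(int value)
{
    unsigned char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
        return NULL;

    buf[0] = 0xb8;                      /* mov eax, imm32 */
    memcpy(buf + 1, &value, 4);         /* the 32-bit immediate */
    buf[5] = 0xc3;                      /* ret */

    return (int_fn) buf;
}

int
main(void)
{
    int_fn fn = emit_return_constant(42);
    if (fn)
        printf("%d\n", fn());           /* prints 42 */
    return 0;
}

A real generator would of course emit whole rasterization or vertex paths this
way, which is exactly where the register-allocation questions below come from.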
From: Josh V. <ho...@na...> - 2001-03-08 21:46:45
"Stephen J Baker" <sj...@li...> writes:

> I think the problems are significant:
>
> 1) Whilst it'll (perhaps) improve frame rates once the code has been
>    compiled, you could easily get a several-second pause when you first
>    refer to something that triggers this action.  In some applications,
>    that would be disastrous.

This doesn't seem like a big problem to me.  In most cases, you could
compile the entire pipeline in a fraction of a second.  If you had to
compile something really big, I guess you could fork() the compiler at a
low priority level and use a general-purpose routine until the compile
finishes.

> 2) You are depending on picking a valid compiler correctly and that
>    there is a compiler on the system at all.  There are many (and
>    growing) platforms where software-only Mesa might make sense
>    (eg http://www.agendacomputing.com - a Linux-based PDA) where no
>    compiler exists on the target machine.

You're right about this.  I would never suggest that Mesa not function
without having a compiler available at run time.  Lots of people run
Linux without installing a compiler.

> I believe that when SGI did this kind of thing in their Windoze
> software OpenGL, they were generating x86 machine code directly into
> memory without using compilers or even assemblers.  That's a viable
> technique...at least for the most popular CPU types.

That's the obvious way to approach the problem, but there are problems
there too.  Say I have a routine that uses one more variable than there
are registers in the worst case, but usually fits in the registers.
Should the x86 generator have a register allocator?  How about if some
versions of the generated code have common subexpressions?  Should the
generator have a CSE pass?  The generator will eventually start to look
like a bad compiler that can only generate x86 code.

Josh
From: Allen A. <ak...@po...> - 2001-03-08 22:00:33
I'm a great fan of dynamic code generation.  Here are two of my favorite
online references on the subject:

    http://www.cs.washington.edu/research/projects/unisw/DynComp/www/
    http://www.cs.columbia.edu/~library/TR-repository/reports/reports-1992/cucs-039-92.ps.gz

Plenty more available from Google, of course.

Now back to the discussion...I tend to agree with Steve.

Allen
From: Gareth H. <ga...@va...> - 2001-03-09 00:07:05
Josh Vanderhoof wrote:
>
> > I believe that when SGI did this kind of thing in their Windoze
> > software OpenGL, they were generating x86 machine code directly into
> > memory without using compilers or even assemblers.  That's a viable
> > technique...at least for the most popular CPU types.
>
> That's the obvious way to approach the problem, but there are problems
> there too.  Say I have a routine that uses one more variable than there
> are registers in the worst case, but usually fits in the registers.
> Should the x86 generator have a register allocator?  How about if some
> versions of the generated code have common subexpressions?  Should the
> generator have a CSE pass?  The generator will eventually start to look
> like a bad compiler that can only generate x86 code.

My approach for this is perhaps slightly different to yours.  I was
thinking more along the lines of having the compiled functions stored as
strings, which can be copied and edited by the context as needed.  This
allows the context to insert hard-coded memory references and so on.
Similarly, I've been kicking around a design of a dynamic software
renderer, which is built from chunks of compiled code that can be
tweaked and chained together depending on the current GL state etc.  I
don't think actually "compiling" code is the answer -- it's more a
customization of pre-compiled code to suit the current context.

-- Gareth
From: Gareth H. <ga...@va...> - 2001-03-09 00:18:51
Gareth Hughes wrote:
>
> My approach for this is perhaps slightly different to yours.  I was
> thinking more along the lines of having the compiled functions stored as
> strings, which can be copied and edited by the context as needed.  This
> allows the context to insert hard-coded memory references and so on.
> Similarly, I've been kicking around a design of a dynamic software
> renderer, which is built from chunks of compiled code that can be
> tweaked and chained together depending on the current GL state etc.  I
> don't think actually "compiling" code is the answer -- it's more a
> customization of pre-compiled code to suit the current context.

I should also add that functions can be built up from basic blocks, and
these blocks are stored as strings and are edited/chained together to
form the function as required.

-- Gareth
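A rough sketch of this "blocks as strings" idea, under the assumption that
each basic block has been pre-compiled (or hand-assembled) into a byte string
that may contain a 32-bit placeholder to be patched with a context-specific
address.  The names (code_block, splice_blocks, PATCH_MAGIC) and the 32-bit
pointer assumption are invented for illustration, not Gareth's actual design.

#include <string.h>
#include <sys/mman.h>

#define PATCH_MAGIC 0xdeadbeef   /* placeholder immediate inside a block */

struct code_block {
    const unsigned char *code;   /* pre-compiled machine code bytes */
    int len;                     /* length in bytes */
    int patch_offset;            /* offset of placeholder, or -1 if none */
};

/* Concatenate blocks into one executable buffer, patching placeholders
 * with the address of the current context (assumes 32-bit pointers). */
static void *
splice_blocks(const struct code_block *blocks, int n, void *context)
{
    unsigned char *buf, *p;
    int i, total = 0;

    for (i = 0; i < n; i++)
        total += blocks[i].len;

    buf = mmap(NULL, (total + 4095) & ~4095,
               PROT_READ | PROT_WRITE | PROT_EXEC,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
        return NULL;

    p = buf;
    for (i = 0; i < n; i++) {
        memcpy(p, blocks[i].code, blocks[i].len);
        if (blocks[i].patch_offset >= 0)
            memcpy(p + blocks[i].patch_offset, &context, 4);
        p += blocks[i].len;
    }
    return buf;
}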
From: Josh V. <ho...@na...> - 2001-03-09 20:41:14
Gareth Hughes <ga...@va...> writes:

> My approach for this is perhaps slightly different to yours.  I was
> thinking more along the lines of having the compiled functions stored as
> strings, which can be copied and edited by the context as needed.  This
> allows the context to insert hard-coded memory references and so on.
> Similarly, I've been kicking around a design of a dynamic software
> renderer, which is built from chunks of compiled code that can be
> tweaked and chained together depending on the current GL state etc.  I
> don't think actually "compiling" code is the answer -- it's more a
> customization of pre-compiled code to suit the current context.

In that situation, compiling the code would help greatly.  Here is what
you give up by pre-compiling the code:

1. Global optimizations.  In the pre-compiled version, there will be an
   artificial boundary at each chunk.  Where the dynamically compiled
   version would be free to optimize across chunks, you would be stuck
   forcing the cpu into a known state at each boundary.

2. Easy processor-specific optimizations.  If you're compiling at run
   time, you can get processor-specific optimizations by just changing
   the compiler flags.

3. Portability.  The dynamically compiled version would splice the
   chunks together automatically.  I can't think of a portable way to
   concatenate pre-compiled code correctly.  (Does gcc have an attribute
   for it?)

4. Flexibility.  The pre-compiled code would have to follow a rigid
   template.  If you compile at run time, you have the flexibility to
   change variable types and structure layouts at run time.

Of course, you do have to take a start-up penalty with run-time compiled
code.  Considering that the average system is around 700MHz (just a
guess) and getting faster every day, I think people may be
overestimating how expensive using a real compiler would be.

Josh
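As a small illustration of point 2, the flags handed to the compile_function()
helper from the earlier post could be chosen per host CPU at run time.  The
flag strings and the feature parameters here are only examples:

/* Choose compiler flags for the host CPU at run time.  A real version
 * might parse /proc/cpuinfo; the feature flags are passed in here just
 * to keep the sketch short. */
const char *
codegen_cflags(int is_686, int is_k6)
{
    if (is_686)
        return "-O2 -fomit-frame-pointer -march=i686";
    if (is_k6)
        return "-O2 -fomit-frame-pointer -march=k6";
    return "-O2 -fomit-frame-pointer -march=i586";
}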
From: Stephen J B. <sj...@li...> - 2001-03-12 14:45:30
On 9 Mar 2001, Josh Vanderhoof wrote:

> Of course, you do have to take a start-up penalty with run-time
> compiled code.  Considering that the average system is around 700MHz
> (just a guess) and getting faster every day, I think people may be
> overestimating how expensive using a real compiler would be.

The "average" system with a 700MHz CPU also has a kick-ass graphics
card that makes this discussion largely irrelevant.  If software-only
rendering has any kind of a future at all, it's in PDAs, phones and
internet-capable toasters...where the overhead of having a compiler
on board at all (let alone actually running it on anything) tends to
be unacceptable.

----
Steve Baker                      (817)619-2657 (Vox/Vox-Mail)
L3Com/Link Simulation & Training (817)619-2466 (Fax)
Work: sj...@li...       http://www.link.com
Home: sjb...@ai...      http://web2.airmail.net/sjbaker1
From: Gareth H. <ga...@va...> - 2001-03-12 15:07:47
Stephen J Baker wrote:
>
> On 9 Mar 2001, Josh Vanderhoof wrote:
>
> > Of course, you do have to take a start-up penalty with run-time
> > compiled code.  Considering that the average system is around 700MHz
> > (just a guess) and getting faster every day, I think people may be
> > overestimating how expensive using a real compiler would be.
>
> The "average" system with a 700MHz CPU also has a kick-ass graphics
> card that makes this discussion largely irrelevant.  If software-only
> rendering has any kind of a future at all, it's in PDAs, phones and
> internet-capable toasters...where the overhead of having a compiler
> on board at all (let alone actually running it on anything) tends to
> be unacceptable.

Which is why I've been focusing on code generation for hardware drivers,
particularly the begin/end functions used in immediate-mode rendering.
With hardware T&L, you basically want the non-glVertex* functions to
write directly to the "current" hardware-format vertex, with glVertex
flushing this to a DMA buffer.

There isn't really much a compiler can do with this:

struct foo_vertex_o3n3tc2 {
   GLfloat obj[3];
   GLfloat normal[3];
   GLfloat tc[2];
};

void foo_Normal3fv( const GLfloat *v )
{
   GET_FOO_CONTEXT(fmesa);
   COPY_3V( fmesa->current.o3n3tc2.normal, v );
}

void foo_Vertex3fv( const GLfloat *v )
{
   GET_FOO_CONTEXT(fmesa);
   COPY_3V( fmesa->current.o3n3tc2.obj, v );
   if ( fmesa->dma.space >= 8 ) {
      COPY_DWORDS( fmesa->dma.head, fmesa->current.o3n3tc2, 8 );
      fmesa->dma.head += 8;
      fmesa->dma.space -= 8;
   } else {
      fmesa->get_dma( fmesa, fmesa->current.o3n3tc2, 8 );
   }
}

(The above is based on code by Keith Whitwell.)

You can, however, substitute most of that with hard-coded addresses for
the current context and make it as streamlined as possible.  If you
want to call these functions 10, 30, 100 million times a second, you
want them to be *fast*...

-- Gareth
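For comparison, a sketch of what a generated, context-specialized version of
foo_Vertex3fv might look like.  The absolute addresses and the
generated_flush_and_emit() slow path are placeholders invented for
illustration, not real driver values:

#include <string.h>
#include <GL/gl.h>

/* Hypothetical output of the code generator for one specific context:
 * the current-vertex storage and the DMA head/space counters live at
 * addresses known at generation time, so there is no GET_FOO_CONTEXT()
 * lookup and no pointer chasing per call. */
#define CURRENT_VERTEX  ((GLfloat *) 0x08100020)   /* placeholder */
#define DMA_HEAD        ((GLuint **) 0x08100060)   /* placeholder */
#define DMA_SPACE       ((GLint *)   0x08100068)   /* placeholder */

extern void generated_flush_and_emit( const GLfloat *v );  /* slow path */

void generated_Vertex3fv( const GLfloat *v )
{
   GLfloat *current = CURRENT_VERTEX;

   current[0] = v[0];
   current[1] = v[1];
   current[2] = v[2];

   if ( *DMA_SPACE >= 8 ) {
      memcpy( *DMA_HEAD, current, 8 * sizeof(GLuint) );
      *DMA_HEAD += 8;
      *DMA_SPACE -= 8;
   } else {
      generated_flush_and_emit( current );
   }
}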
From: Keith W. <ke...@va...> - 2001-03-12 21:47:11
Gareth Hughes wrote:
>
> Stephen J Baker wrote:
> >
> > On 9 Mar 2001, Josh Vanderhoof wrote:
> >
> > > Of course, you do have to take a start-up penalty with run-time
> > > compiled code.  Considering that the average system is around 700MHz
> > > (just a guess) and getting faster every day, I think people may be
> > > overestimating how expensive using a real compiler would be.
> >
> > The "average" system with a 700MHz CPU also has a kick-ass graphics
> > card that makes this discussion largely irrelevant.  If software-only
> > rendering has any kind of a future at all, it's in PDAs, phones and
> > internet-capable toasters...where the overhead of having a compiler
> > on board at all (let alone actually running it on anything) tends to
> > be unacceptable.
>
> Which is why I've been focusing on code generation for hardware drivers,
> particularly the begin/end functions used in immediate-mode rendering.
> With hardware T&L, you basically want the non-glVertex* functions to
> write directly to the "current" hardware-format vertex, with glVertex
> flushing this to a DMA buffer.
>
> There isn't really much a compiler can do with this:
>
> struct foo_vertex_o3n3tc2 {
>    GLfloat obj[3];
>    GLfloat normal[3];
>    GLfloat tc[2];
> };
>
> void foo_Normal3fv( const GLfloat *v )
> {
>    GET_FOO_CONTEXT(fmesa);
>    COPY_3V( fmesa->current.o3n3tc2.normal, v );
> }
>
> void foo_Vertex3fv( const GLfloat *v )
> {
>    GET_FOO_CONTEXT(fmesa);
>    COPY_3V( fmesa->current.o3n3tc2.obj, v );
>    if ( fmesa->dma.space >= 8 ) {
>       COPY_DWORDS( fmesa->dma.head, fmesa->current.o3n3tc2, 8 );
>       fmesa->dma.head += 8;
>       fmesa->dma.space -= 8;
>    } else {
>       fmesa->get_dma( fmesa, fmesa->current.o3n3tc2, 8 );
>    }
> }
>
> (The above is based on code by Keith Whitwell.)
>
> You can, however, substitute most of that with hard-coded addresses for
> the current context and make it as streamlined as possible.  If you
> want to call these functions 10, 30, 100 million times a second, you
> want them to be *fast*...

I agree with this, but I'm inclined to pursue gcc-based codegen, at
least as a prototype for a more hard-wired system to follow it.  I think
we need to make some progress in this area, and gcc looks like it's got
a real low entry level.

It might be possible to use a tokenized generation language that can
either be expanded by the C preprocessor, or understood explicitly by a
follow-on bespoke codegen module.

Some of the optimizations for the tnl functions like you've got above,
such as hardwiring addresses and using the right (i.e. non -fPIC)
compiler options, can be achieved using gcc.

So in short, I don't know whether the overhead of gcc will be a problem
at runtime, but the low overhead for us right now makes it look like a
real attractive way to get started.  If it works out ok at runtime,
we've finished unexpectedly early.

Keith
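A rough sketch of what such a tokenized template could look like, assuming the
tokens are ordinary preprocessor macros; TAG, CTX_ADDR, VERT_SIZE and struct
foo_context are invented names.  The generator would prepend the #defines and
feed the result to gcc, or a later bespoke back end could parse the same
tokens directly:

/* vertex_tmpl.c -- the generator emits a prologue such as:
 *
 *   #define TAG(x)    x##_o3n3tc2
 *   #define CTX_ADDR  0x08100000
 *   #define VERT_SIZE 8
 *
 * then appends this template and compiles it with something like
 * "gcc -O2 -fomit-frame-pointer -shared" (no -fPIC).  COPY_3V and
 * COPY_DWORDS are the driver macros used earlier in the thread. */

void TAG(Vertex3fv)( const GLfloat *v )
{
   struct foo_context *fmesa = (struct foo_context *) CTX_ADDR;

   COPY_3V( fmesa->current.o3n3tc2.obj, v );

   if ( fmesa->dma.space >= VERT_SIZE ) {
      COPY_DWORDS( fmesa->dma.head, fmesa->current.o3n3tc2, VERT_SIZE );
      fmesa->dma.head += VERT_SIZE;
      fmesa->dma.space -= VERT_SIZE;
   } else {
      fmesa->get_dma( fmesa, fmesa->current.o3n3tc2, VERT_SIZE );
   }
}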
From: Gareth H. <ga...@va...> - 2001-03-13 00:26:10
Keith Whitwell wrote:
>
> I agree with this, but I'm inclined to pursue gcc-based codegen, at
> least as a prototype for a more hard-wired system to follow it.  I
> think we need to make some progress in this area, and gcc looks like
> it's got a real low entry level.

Fair enough.  I just hadn't thought it would be worth going to that
amount of trouble, but there are obvious advantages in doing so.

> It might be possible to use a tokenized generation language that can
> either be expanded by the C preprocessor, or understood explicitly by a
> follow-on bespoke codegen module.
>
> Some of the optimizations for the tnl functions like you've got above,
> such as hardwiring addresses and using the right (i.e. non -fPIC)
> compiler options, can be achieved using gcc.
>
> So in short, I don't know whether the overhead of gcc will be a problem
> at runtime, but the low overhead for us right now makes it look like a
> real attractive way to get started.  If it works out ok at runtime,
> we've finished unexpectedly early.

I'm putting the finishing touches on the driver tnl module code that
goes along with the core Mesa stuff I committed yesterday, so once
that's done I might play with this a little (at least get some basic
generation happening).

-- Gareth
From: Josh V. <ho...@na...> - 2001-03-12 23:30:41
Gareth Hughes <ga...@va...> writes:

> struct foo_vertex_o3n3tc2 {
>    GLfloat obj[3];
>    GLfloat normal[3];
>    GLfloat tc[2];
> };
>
> void foo_Normal3fv( const GLfloat *v )
> {
>    GET_FOO_CONTEXT(fmesa);
>    COPY_3V( fmesa->current.o3n3tc2.normal, v );
> }
>
> void foo_Vertex3fv( const GLfloat *v )
> {
>    GET_FOO_CONTEXT(fmesa);
>    COPY_3V( fmesa->current.o3n3tc2.obj, v );
>    if ( fmesa->dma.space >= 8 ) {
>       COPY_DWORDS( fmesa->dma.head, fmesa->current.o3n3tc2, 8 );
>       fmesa->dma.head += 8;
>       fmesa->dma.space -= 8;
>    } else {
>       fmesa->get_dma( fmesa, fmesa->current.o3n3tc2, 8 );
>    }
> }
>
> (The above is based on code by Keith Whitwell.)
>
> You can, however, substitute most of that with hard-coded addresses for
> the current context and make it as streamlined as possible.  If you
> want to call these functions 10, 30, 100 million times a second, you
> want them to be *fast*...

I'm getting sidetracked here but:

1. Before you start doing crazy optimizations, why wouldn't you rewrite
   it like this (get rid of the "dma.space" variable):

   void foo_Vertex3fv( const GLfloat *v )
   {
      GET_FOO_CONTEXT(fmesa);
      COPY_3V( fmesa->current.o3n3tc2.obj, v );
      if ( fmesa->dma.head + 8 <= fmesa->dma.end_of_space ) {
         COPY_DWORDS( fmesa->dma.head, fmesa->current.o3n3tc2, 8 );
         fmesa->dma.head += 8;
      } else {
         fmesa->get_dma( fmesa, fmesa->current.o3n3tc2, 8 );
      }
   }

2. Hard-coding the address would help you, but not by very much.  I
   would expect it to save you one MOV instruction.  On Intel cpu's,
   the "reg+offset" addressing mode is "free".  (You probably knew
   that already.)

3. If you want to go all out on this code, you could probably use
   mprotect() to avoid the buffer overflow test entirely.  That would
   only be a good idea if buffer overflows are rare though.

Josh
From: Gareth H. <ga...@va...> - 2001-03-13 00:33:03
Josh Vanderhoof wrote:
>
> I'm getting sidetracked here but:
>
> 1. Before you start doing crazy optimizations, why wouldn't you rewrite
>    it like this (get rid of the "dma.space" variable):
>
>    void foo_Vertex3fv( const GLfloat *v )
>    {
>       GET_FOO_CONTEXT(fmesa);
>       COPY_3V( fmesa->current.o3n3tc2.obj, v );
>       if ( fmesa->dma.head + 8 <= fmesa->dma.end_of_space ) {
>          COPY_DWORDS( fmesa->dma.head, fmesa->current.o3n3tc2, 8 );
>          fmesa->dma.head += 8;
>       } else {
>          fmesa->get_dma( fmesa, fmesa->current.o3n3tc2, 8 );
>       }
>    }

Sure -- it was just a cut and paste of some old code Keith sent me.
Minor point.

> 2. Hard-coding the address would help you, but not by very much.  I
>    would expect it to save you one MOV instruction.  On Intel cpu's,
>    the "reg+offset" addressing mode is "free".  (You probably knew
>    that already.)

Yep :-)

GET_FOO_CONTEXT() is the big one, and perhaps I should have stressed
that a little more.  This may involve a function call to determine the
current context, due to thread-safety issues.

> 3. If you want to go all out on this code, you could probably use
>    mprotect() to avoid the buffer overflow test entirely.  That would
>    only be a good idea if buffer overflows are rare though.

You need buffer overflows, as they end up flushing the DMA buffer.  In
this case, get_dma() would flush the current buffer and acquire a new
one.

-- Gareth
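A sketch of why GET_FOO_CONTEXT() can be expensive, assuming a pthreads build;
the names are illustrative rather than the actual Mesa macros.  In a
thread-safe build every glVertex*() call pays for a pthread_getspecific()
lookup just to find its context, which is exactly what hard-coding the context
address into generated code avoids.

#include <pthread.h>

extern pthread_key_t foo_context_key;
extern struct foo_context *foo_current_context;  /* single-threaded case */

#ifdef THREADS
#define GET_FOO_CONTEXT(fmesa) \
   struct foo_context *fmesa = \
      (struct foo_context *) pthread_getspecific( foo_context_key )
#else
#define GET_FOO_CONTEXT(fmesa) \
   struct foo_context *fmesa = foo_current_context
#endif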
From: Josh V. <ho...@na...> - 2001-03-13 01:35:53
Gareth Hughes <ga...@va...> writes:

> > 3. If you want to go all out on this code, you could probably use
> >    mprotect() to avoid the buffer overflow test entirely.  That would
> >    only be a good idea if buffer overflows are rare though.
>
> You need buffer overflows, as they end up flushing the DMA buffer.  In
> this case, get_dma() would flush the current buffer and acquire a new
> one.

I was too vague.  You could mprotect() the page after the DMA buffer to
PROT_NONE and install a SIGSEGV handler that flushes the buffer if the
SIGSEGV was on the end of the DMA buffer.  Then you can just let
Vertex3f segfault when it uses the buffer up.  You would save yourself a
test and a predicted jump per call.  Not much, but it sounds like you
really want to optimize this.

Josh
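A minimal sketch of the guard-page trick, assuming Linux, a 4 KB page size and
a DMA buffer that is a whole number of pages; dma_buffer, dma_buffer_size and
flush_dma_buffer() are hypothetical names.  SA_SIGINFO gives the handler the
faulting address so it can tell "ran off the end of the DMA buffer" apart from
a genuine crash.

#include <signal.h>
#include <string.h>
#include <sys/mman.h>

extern unsigned char *dma_buffer;
extern size_t dma_buffer_size;          /* multiple of the page size */
extern void flush_dma_buffer(void);     /* flush and rewind dma.head */

static void
dma_segv_handler(int sig, siginfo_t *info, void *ctx)
{
    unsigned char *addr = (unsigned char *) info->si_addr;

    (void) sig;
    (void) ctx;

    if (addr >= dma_buffer + dma_buffer_size &&
        addr <  dma_buffer + dma_buffer_size + 4096) {
        /* Our guard page: flush and recover.  A real implementation
         * would also have to redirect the faulting store into the
         * fresh buffer, e.g. by fixing up the registers in ctx. */
        flush_dma_buffer();
    } else {
        signal(SIGSEGV, SIG_DFL);       /* not ours: die normally */
        raise(SIGSEGV);
    }
}

static void
install_dma_guard(void)
{
    struct sigaction sa;

    /* Make the page just past the buffer inaccessible. */
    mprotect(dma_buffer + dma_buffer_size, 4096, PROT_NONE);

    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = dma_segv_handler;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);
}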
From: Gareth H. <ga...@va...> - 2001-03-13 01:47:01
Josh Vanderhoof wrote:
>
> I was too vague.  You could mprotect() the page after the DMA buffer
> to PROT_NONE and install a SIGSEGV handler that flushes the buffer if
> the SIGSEGV was on the end of the DMA buffer.  Then you can just let
> Vertex3f segfault when it uses the buffer up.  You would save yourself
> a test and a predicted jump per call.  Not much, but it sounds like
> you really want to optimize this.

Sorry, I get you.  You might even be able to do fancy things and catch
this in the kernel module -- that would be neat :-)  Save us an ioctl...

However, the way we manage DMA space will change soon and this will no
longer be possible.  The drivers will be using a dynamic allocation
scheme to get better utilization of the DMA space.  Fixed-size buffers
suck.

I'll play around with similar ideas -- mprotect()ing the DMA buffer and
catching the segfault in the kernel sure would be neat...  Don't know if
we can do that, but it's worth looking at.

-- Gareth
From: Keith W. <ke...@va...> - 2001-03-13 17:12:47
Gareth Hughes wrote:
>
> Josh Vanderhoof wrote:
> >
> > I was too vague.  You could mprotect() the page after the DMA buffer
> > to PROT_NONE and install a SIGSEGV handler that flushes the buffer if
> > the SIGSEGV was on the end of the DMA buffer.  Then you can just let
> > Vertex3f segfault when it uses the buffer up.  You would save yourself
> > a test and a predicted jump per call.  Not much, but it sounds like
> > you really want to optimize this.
>
> Sorry, I get you.  You might even be able to do fancy things and catch
> this in the kernel module -- that would be neat :-)  Save us an ioctl...

Now that is kind of neat...

Keith
From: Allen A. <ak...@po...> - 2001-03-13 01:48:01
On Mon, Mar 12, 2001 at 08:39:30PM -0500, Josh Vanderhoof wrote:
| ... You could mprotect() the page after the DMA buffer
| to PROT_NONE and install a SIGSEGV handler that flushes the buffer if
| the SIGSEGV was on the end of the DMA buffer.  Then you can just let
| Vertex3f segfault when it uses the buffer up.

I haven't used the POSIX signalling system, so please pardon an
uninformed question.  We couldn't afford to install a short-term signal
handler each time Vertex3f is called; it would be cheaper just to test
for overflow.  However, any handler installed long-term by the driver
could interfere with a handler installed by the application.  Is there a
good way to work around that problem?

Allen
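One common answer, sketched under the assumption that the driver installs its
handler once at context creation: sigaction() hands back the application's
previous disposition, and the driver forwards any SIGSEGV that is not on its
own guard page.  fault_is_on_dma_guard_page() and flush_dma_buffer() are
hypothetical names.

#include <signal.h>
#include <string.h>

static struct sigaction prev_segv_action;
extern int fault_is_on_dma_guard_page(void *addr);
extern void flush_dma_buffer(void);

static void
chained_segv_handler(int sig, siginfo_t *info, void *ctx)
{
    if (fault_is_on_dma_guard_page(info->si_addr)) {
        flush_dma_buffer();
        return;
    }

    /* Not our fault: hand it to whatever the application installed. */
    if (prev_segv_action.sa_flags & SA_SIGINFO)
        prev_segv_action.sa_sigaction(sig, info, ctx);
    else if (prev_segv_action.sa_handler != SIG_DFL &&
             prev_segv_action.sa_handler != SIG_IGN)
        prev_segv_action.sa_handler(sig);
    else
        /* Previous disposition was default/ignore: restore it and let
         * the fault happen again under that disposition. */
        sigaction(SIGSEGV, &prev_segv_action, NULL);
}

static void
install_chained_handler(void)
{
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = chained_segv_handler;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, &prev_segv_action);   /* save the old one */
}

This still breaks if the application installs its own handler after the driver
does, which is presumably part of the concern raised above.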
From: Gareth H. <ga...@va...> - 2001-03-13 01:55:00
Allen Akin wrote:
>
> On Mon, Mar 12, 2001 at 08:39:30PM -0500, Josh Vanderhoof wrote:
> | ... You could mprotect() the page after the DMA buffer
> | to PROT_NONE and install a SIGSEGV handler that flushes the buffer if
> | the SIGSEGV was on the end of the DMA buffer.  Then you can just let
> | Vertex3f segfault when it uses the buffer up.
>
> I haven't used the POSIX signalling system, so please pardon an
> uninformed question.  We couldn't afford to install a short-term signal
> handler each time Vertex3f is called; it would be cheaper just to test
> for overflow.  However, any handler installed long-term by the driver
> could interfere with a handler installed by the application.  Is there
> a good way to work around that problem?

I agree.  And yes, just test for buffer overflow :-)

Still, it's a neat idea.

-- Gareth
From: Gareth H. <ga...@va...> - 2001-03-13 02:02:53
Compiling this with the code Josh sent takes around 70 msec on my 700MHz
PIII laptop:

void codegen_test( void )
{
   printf( "hello, world!\n" );
}

That's a long time at 60fps...

-- Gareth
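A sketch of how one might take this kind of measurement, assuming the
compile_function() and free_function() helpers from Josh's earlier post are
linked in:

#include <stdio.h>
#include <sys/time.h>

extern void *compile_function(const char *name, const char *source,
                              void **r_handle);
extern void free_function(void *handle);

int
main(void)
{
    static const char *source =
        "#include <stdio.h>\n"
        "void codegen_test(void) { printf(\"hello, world!\\n\"); }\n";
    struct timeval t0, t1;
    void *handle;
    void (*fn)(void);

    gettimeofday(&t0, NULL);
    fn = (void (*)(void)) compile_function("codegen_test", source, &handle);
    gettimeofday(&t1, NULL);

    if (fn == NULL)
        return 1;
    fn();

    printf("compile took %ld usec\n",
           (t1.tv_sec - t0.tv_sec) * 1000000L + (t1.tv_usec - t0.tv_usec));
    free_function(handle);
    return 0;
}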
From: Josh V. <ho...@na...> - 2001-03-13 03:32:37
Gareth Hughes <ga...@va...> writes:

> Compiling this with the code Josh sent takes around 70 msec on my
> 700MHz PIII laptop:
>
> void codegen_test( void )
> {
>    printf( "hello, world!\n" );
> }
>
> That's a long time at 60fps...

I would think you would keep a cache of the stuff that you've compiled
to avoid re-compiling every time you want to call the routine.  So
you're really looking at a one-time start-up cost.  Is there something
where the code would change on every single frame?

Josh
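A sketch of the kind of cache described here: compiled functions are looked up
by the name that encodes their specialization (e.g. "convert_8888_4444")
before any compiler is invoked.  A real version would want a hash table and
locking; the linear list keeps the idea short.  compile_function() is the
helper from the earlier post.

#include <stdlib.h>
#include <string.h>

extern void *compile_function(const char *name, const char *source,
                              void **r_handle);

struct codegen_entry {
    char *name;
    void *fn;
    void *handle;
    struct codegen_entry *next;
};

static struct codegen_entry *codegen_cache;

void *
codegen_lookup_or_compile(const char *name, const char *source)
{
    struct codegen_entry *e;

    for (e = codegen_cache; e; e = e->next)
        if (strcmp(e->name, name) == 0)
            return e->fn;                 /* cache hit: no compile */

    e = malloc(sizeof(*e));
    if (e == NULL)
        return NULL;
    e->name = strdup(name);
    e->fn = compile_function(name, source, &e->handle);
    if (e->fn == NULL) {
        free(e->name);
        free(e);
        return NULL;
    }
    e->next = codegen_cache;
    codegen_cache = e;
    return e->fn;
}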
From: Gareth H. <ga...@va...> - 2001-03-13 05:15:37
Josh Vanderhoof wrote:
>
> I would think you would keep a cache of the stuff that you've compiled
> to avoid re-compiling every time you want to call the routine.  So
> you're really looking at a one-time start-up cost.  Is there something
> where the code would change on every single frame?

Absolutely.  I'm not sure how much of it needs to be truly dynamic, but
it's useful to know how expensive compilation can be.  It might be worth
compiling it as a .o file and using the BFD library to extract the
function, for instance.

-- Gareth
From: Stephen J B. <sj...@li...> - 2001-03-13 17:14:54
On 12 Mar 2001, Josh Vanderhoof wrote:

> Gareth Hughes <ga...@va...> writes:
>
> > Compiling this with the code Josh sent takes around 70 msec on my
> > 700MHz PIII laptop:
> >
> > void codegen_test( void )
> > {
> >    printf( "hello, world!\n" );
> > }
> >
> > That's a long time at 60fps...
>
> I would think you would keep a cache of the stuff that you've compiled
> to avoid re-compiling every time you want to call the routine.  So
> you're really looking at a one-time start-up cost.  Is there something
> where the code would change on every single frame?

It depends what you use this code for.

If it's for something like varying the Z buffer depth - then a large
overhead might be acceptable (although I still maintain that creating a
dependence on a C compiler is a disastrously bad decision).

If it's something like catering for a particular combination of (say)
blend mode, texture type, dither and fogging (the kind of thing that the
SGI OpenGL-for-Windoze appears to do) - then taking even a couple of
milliseconds at runtime when you first happen to see a polygon with that
combination of states would be completely unacceptable - even with
caching.

----
Steve Baker                      (817)619-2657 (Vox/Vox-Mail)
L3Com/Link Simulation & Training (817)619-2466 (Fax)
Work: sj...@li...       http://www.link.com
Home: sjb...@ai...      http://web2.airmail.net/sjbaker1
From: Josh V. <ho...@na...> - 2001-03-13 23:44:43
"Stephen J Baker" <sj...@li...> writes:

> If it's for something like varying the Z buffer depth - then a large
> overhead might be acceptable (although I still maintain that creating
> a dependence on a C compiler is a disastrously bad decision).

I just want to make sure there isn't a misunderstanding here - the C
compiler would only be used to compile special-case functions.  There
would always be a generic fallback, so Mesa would work (although more
slowly) even without a compiler installed.

> If it's something like catering for a particular combination of (say)
> blend mode, texture type, dither and fogging (the kind of thing that
> the SGI OpenGL-for-Windoze appears to do) - then taking even a couple
> of milliseconds at runtime when you first happen to see a polygon with
> that combination of states would be completely unacceptable - even
> with caching.

If it was just 20 milliseconds or so, I doubt most users would notice.
The problem is, I can easily see the delay being 500 ms for a big
function.

I can think of two workarounds:

1. Make the cache persistent; then the delay would only happen the
   first time you run your application.

2. Run the compiler in the background at low priority (say 10% cpu).
   You would have to make do with the generic fallback until the
   compile completes, but there wouldn't be a 'hitch' when the
   application exposes a different path.

Do you think that would be acceptable?

Josh
From: Keith W. <ke...@va...> - 2001-03-14 06:59:38
> If it was just 20 milliseconds or so, I doubt most users would notice.
> The problem is, I can easily see the delay being 500 ms for a big
> function.
>
> I can think of two workarounds:
>
> 1. Make the cache persistent; then the delay would only happen the
>    first time you run your application.
>
> 2. Run the compiler in the background at low priority (say 10% cpu).
>    You would have to make do with the generic fallback until the
>    compile completes, but there wouldn't be a 'hitch' when the
>    application exposes a different path.
>
> Do you think that would be acceptable?

They sound like reasonable strategies, but first I think I'd just like
to see something working.

Keith
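A sketch of the persistent cache proposed as workaround 1 above, assuming the
compiled .so files are kept under a per-user directory and keyed by the
specialization name; the directory name and helper names are invented for
illustration.  If the file is already there from a previous run, dlopen()
replaces the gcc invocation entirely.

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>

extern void *compile_function(const char *name, const char *source,
                              void **r_handle);

void *
persistent_lookup_or_compile(const char *name, const char *source,
                             void **r_handle)
{
    char path[1024];
    void *handle, *sym;

    snprintf(path, sizeof(path), "%s/.mesa-codegen/%s.so",
             getenv("HOME") ? getenv("HOME") : "/tmp", name);

    handle = dlopen(path, RTLD_NOW);
    if (handle != NULL) {
        sym = dlsym(handle, name);
        if (sym != NULL) {
            *r_handle = handle;
            return sym;               /* hit: no compile this run */
        }
        dlclose(handle);
    }

    /* Miss: compile as before.  A fuller version would also copy the
     * freshly built .so into the cache directory for next time. */
    return compile_function(name, source, r_handle);
}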