From: quzar <qu...@us...> - 2024-09-28 18:16:37
|
This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "A pseudo Operating System for the Dreamcast.". The branch, master has been updated via f9786ba4a0532de4ccf53635319993c9d643e97a (commit) via e63b693fd53647cba334b1aba1acc593e5439dd0 (commit) from 9726ac31349d72eb0af754f473799a398e1ab69a (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit f9786ba4a0532de4ccf53635319993c9d643e97a Author: Falco Girgis <gyr...@gm...> Date: Sat Sep 28 13:14:47 2024 -0500 Cleaned up + optimized pvrmark_strips_direct bench (#707) * Cleaned up + optimized pvrmark_strips_direct bench - The pvrmark_strips_direct benchmark is an EXTREMELY important example, both for our ability to benchmark the PVR API's absolute fastest render path, AND as the only example we have given to users for how to use it. - The benchmark happens to do a bunch of dumb things that are artificially limiting performance here and are making runs inconsistent. - Removed branch from critical path - Stopped clearing out unused vertex attributes like u, v, and oargb - Stopped performing printf logic within the timed sections - Cleaned up benchmark - Increased the rate at which the INCREMENT phase increased poly count (because it takes forever to normalize, atm) - Added brief description * Addressed code review feedback. commit e63b693fd53647cba334b1aba1acc593e5439dd0 Author: Falco Girgis <gyr...@gm...> Date: Sat Sep 28 13:03:30 2024 -0500 pvr_prim() Performance Improvement + PVR Hybrid Submission Model (#740) * Initial implementation of a "hybrid" vertex submission strategy using the existing PVR APIs. 
To use this method: 1) Enable DMA mode when initializing the PVR 2) Set the in-RAM vertex buffers for each list type to be deferred with the DMA (pvr_set_vertbuf()). 3) For the list type to be immediately submitted: - pvr_list_begin(LIST_TYPE); - pvr_dr_begin(dr_state); - /* COMMIT POLYGONS */ - pvr_list_end(); //!!!! NOTICE YOU DO NOT CALL PVR_DR_END()!!! * pvr_prim()'s performance was destroyed previously when making it have to lock/unlock a mutex for SQ ownership every call. Fixed the regression and then optimized further, by forwarding directly to sq_fast_cpy() (when DMA isn't active), which is now okay, since every vertex and header has a 32-byte alignment. ----------------------------------------------------------------------- Summary of changes: doc/CHANGELOG.md | 1 + examples/dreamcast/pvr/pvrmark/pvrmark.c | 38 +++++-- .../pvrmark_strips_direct/pvrmark_strips_direct.c | 115 ++++++++++++--------- kernel/arch/dreamcast/hardware/pvr/pvr_internal.h | 1 + kernel/arch/dreamcast/hardware/pvr/pvr_irq.c | 10 ++ kernel/arch/dreamcast/hardware/pvr/pvr_scene.c | 63 ++++++++--- kernel/arch/dreamcast/include/dc/pvr.h | 3 + 7 files changed, 158 insertions(+), 73 deletions(-) diff --git a/doc/CHANGELOG.md b/doc/CHANGELOG.md index 107191f2..fad4dd20 100644 --- a/doc/CHANGELOG.md +++ b/doc/CHANGELOG.md @@ -6,6 +6,7 @@ Platform-specific changes are prefixed with the platform name, otherwise the cha - Added pvrtex utility by TapamN to utils [DF == Daniel Fairchild] - Added . & .. 
directories to filesystems that lack it [AB] - Replaced previous implementation of realpath() to remove license from AUTHORS [AB] +- Enabled hybrid PVR DR/DMA vertex submission in driver + sped up pvr_prim() [FG] ## KallistiOS version 2.1.0 - Cleaned up generated stubs files on a make clean [Lawrence Sebald == LS] diff --git a/examples/dreamcast/pvr/pvrmark/pvrmark.c b/examples/dreamcast/pvr/pvrmark/pvrmark.c index c2edb7a1..8d0eccd6 100644 --- a/examples/dreamcast/pvr/pvrmark/pvrmark.c +++ b/examples/dreamcast/pvr/pvrmark/pvrmark.c @@ -10,7 +10,7 @@ pvr_init_params_t pvr_params = { { PVR_BINSIZE_16, PVR_BINSIZE_0, PVR_BINSIZE_0, PVR_BINSIZE_0, PVR_BINSIZE_0 }, - 512 * 1024 + 1024 * 1024 }; enum { PHASE_HALVE, PHASE_INCR, PHASE_DECR, PHASE_FINAL }; @@ -69,11 +69,16 @@ void setup(void) { pvr_poly_compile(&hdr, &cxt); } +int oldseed = 0xdeadbeef; void do_frame(void) { pvr_vertex_t vert; int x, y, z; int size; int i, col; + int seed = oldseed; + +#define nextnum() seed = seed * 1164525 + 1013904223; +#define getnum(mn) (seed & ((mn) - 1)) vid_border_color(0, 0, 0); pvr_wait_ready(); @@ -82,20 +87,32 @@ void do_frame(void) { pvr_list_begin(PVR_LIST_OP_POLY); pvr_prim(&hdr, sizeof(hdr)); + x = getnum(1024); + nextnum(); + y = getnum(512); + nextnum(); + z = getnum(128) + 1; + nextnum(); + size = getnum(64) + 1; + nextnum(); + col = getnum(256); + nextnum(); + for(i = 0; i < polycnt; i++) { - x = rand() % 640; - y = rand() % 480; - z = rand() % 100 + 1; - size = rand() % 50; - col = rand() % 256; + x = (x + ((getnum(128)) - 64)) & 1023; + nextnum(); + y = (y + ((getnum(128)) - 64)) % 511; + nextnum(); + size = getnum(64) + 1; + nextnum(); + col = getnum(256); + nextnum(); vert.flags = PVR_CMD_VERTEX; vert.x = x - size; vert.y = y + size; vert.z = z; - vert.u = vert.v = 0.0f; vert.argb = col | (col << 8) | (col << 16) | 0xff000000; - vert.oargb = 0; pvr_prim(&vert, sizeof(vert)); vert.y = y - size; @@ -110,6 +127,7 @@ void do_frame(void) { pvr_list_finish(); 
pvr_scene_finish(); vid_border_color(0, 255, 0); + oldseed = seed; } time_t begin; @@ -126,9 +144,9 @@ void check_switch(void) { now = time(NULL); if(now >= (begin + 5)) { - begin = time(NULL); printf(" Average Frame Rate: ~%f fps (%d pps)\n", (double)avgfps, (int)(polycnt * avgfps)); - + begin = time(NULL); + switch(phase) { case PHASE_HALVE: diff --git a/examples/dreamcast/pvr/pvrmark_strips_direct/pvrmark_strips_direct.c b/examples/dreamcast/pvr/pvrmark_strips_direct/pvrmark_strips_direct.c index df3e1d73..6f0bd3ad 100644 --- a/examples/dreamcast/pvr/pvrmark_strips_direct/pvrmark_strips_direct.c +++ b/examples/dreamcast/pvr/pvrmark_strips_direct/pvrmark_strips_direct.c @@ -1,25 +1,34 @@ /* KallistiOS ##version## pvrmark_strips_direct.c - (c)2002 Megan Potter + Copyright (C) 2002 Megan Potter + Copyright (C) 2024 Falco Girgis +*/ + +/* + This file serves as both an example of and benchmark for KOS's + rendering fast path: the PVR direct rendering API. */ #include <kos.h> #include <stdlib.h> #include <time.h> +#include <limits.h> + +enum { PHASE_HALVE, PHASE_INCR, PHASE_DECR, PHASE_FINAL }; -pvr_init_params_t pvr_params = { +static pvr_init_params_t pvr_params = { { PVR_BINSIZE_16, PVR_BINSIZE_0, PVR_BINSIZE_0, PVR_BINSIZE_0, PVR_BINSIZE_0 }, 512 * 1024 }; -enum { PHASE_HALVE, PHASE_INCR, PHASE_DECR, PHASE_FINAL }; - -int polycnt; -int phase = PHASE_HALVE; -float avgfps = -1; +static int polycnt; +static int phase = PHASE_HALVE; +static float avgfps = -1; +static pvr_poly_hdr_t hdr; +static time_t begin; -void running_stats(void) { +static void running_stats(void) { pvr_stats_t stats; pvr_get_stats(&stats); @@ -29,7 +38,7 @@ void running_stats(void) { avgfps = (avgfps + stats.frame_rate) / 2.0f; } -void stats(void) { +static void stats(void) { pvr_stats_t stats; pvr_get_stats(&stats); @@ -38,7 +47,7 @@ void stats(void) { } -int check_start(void) { +static int check_start(void) { maple_device_t *cont; cont_state_t *state; @@ -56,9 +65,7 @@ int check_start(void) { 
return 0; } -pvr_poly_hdr_t hdr; - -void setup(void) { +static void setup(void) { pvr_poly_cxt_t cxt; pvr_init(&pvr_params); @@ -69,17 +76,26 @@ void setup(void) { pvr_poly_compile(&hdr, &cxt); } -int oldseed = 0xdeadbeef; -void do_frame(void) { - pvr_vertex_t * vert; - int x, y, z; - int i, col; +inline static int getnum(int *seed, int mn) { + int num = (*seed & ((mn) - 1)); + *seed = *seed * 1164525 + 1013904223; + return num; +} + +inline static void get_vert(int *seed, int *x, int *y, int *col) { + *x = (*x + ((getnum(seed, 64)) - 32)) & 1023; + *y = (*y + ((getnum(seed, 64)) - 32)) & 511; + *col = getnum(seed, INT32_MAX); +} + +static void do_frame(void) { + pvr_vertex_t *vert; + int x=0, y=0, z=0, col=0; + int i; + static int oldseed = 0xdeadbeef; int seed = oldseed; pvr_dr_state_t dr_state; -#define nextnum() seed = seed * 1164525 + 1013904223; -#define getnum(mn) (seed & ((mn) - 1)) - vid_border_color(0, 0, 0); pvr_wait_ready(); vid_border_color(255, 0, 0); @@ -89,54 +105,45 @@ void do_frame(void) { pvr_dr_init(&dr_state); - x = getnum(1024); - nextnum(); - y = getnum(512); - nextnum(); - z = getnum(128) + 1; - nextnum(); - col = getnum(256); - nextnum(); + get_vert(&seed, &x, &y, &col); + z = getnum(&seed, 128) + 1; vert = pvr_dr_target(dr_state); vert->flags = PVR_CMD_VERTEX; vert->x = x; vert->y = y; vert->z = z; - vert->u = vert->v = 0.0f; - vert->argb = col | (col << 8) | (col << 16) | 0xff000000; - vert->oargb = 0; + vert->argb = 0xff000000 | col; pvr_dr_commit(vert); for(i = 0; i < polycnt; i++) { - x = (x + ((getnum(64)) - 32)) & 1023; - nextnum(); - y = (y + ((getnum(64)) - 32)) % 511; - nextnum(); - col = getnum(256); - nextnum(); + get_vert(&seed, &x, &y, &col); + vert = pvr_dr_target(dr_state); vert->flags = PVR_CMD_VERTEX; vert->x = x; vert->y = y; vert->z = z; - vert->u = vert->v = 0.0f; - vert->argb = col | (col << 8) | (col << 16) | 0xff000000; - vert->oargb = 0; - - if(i == (polycnt - 1)) - vert->flags = PVR_CMD_VERTEX_EOL; - + vert->argb = 
0xff000000 | col; pvr_dr_commit(vert); } + get_vert(&seed, &x, &y, &col); + + vert = pvr_dr_target(dr_state); + vert->flags = PVR_CMD_VERTEX_EOL; + vert->x = x; + vert->y = y; + vert->z = z; + vert->argb = 0xff000000 | col; + pvr_dr_commit(vert); + pvr_list_finish(); pvr_scene_finish(); vid_border_color(0, 255, 0); oldseed = seed; } -time_t begin; void switch_tests(int ppf) { printf("Beginning new test: %d polys per frame (%d per second at 60fps)\n", ppf, ppf * 60); @@ -146,50 +153,58 @@ void switch_tests(int ppf) { void check_switch(void) { time_t now; + int new_polycnt = polycnt; now = time(NULL); if(now >= (begin + 5)) { - begin = time(NULL); printf(" Average Frame Rate: ~%f fps (%d pps)\n", (double)avgfps, (int)(polycnt * avgfps)); switch(phase) { case PHASE_HALVE: if(avgfps < 55) { - switch_tests(polycnt / 2); + new_polycnt = polycnt / 2; } else { printf(" Entering PHASE_INCR\n"); phase = PHASE_INCR; + break; } break; case PHASE_INCR: if(avgfps >= 55) { - switch_tests(polycnt + 500); + new_polycnt = polycnt + 2500; } else { printf(" Entering PHASE_DECR\n"); phase = PHASE_DECR; + break; } break; case PHASE_DECR: if(avgfps < 55) { - switch_tests(polycnt - 200); + new_polycnt = polycnt - 200; } else { printf(" Entering PHASE_FINAL\n"); phase = PHASE_FINAL; + break; } break; case PHASE_FINAL: break; } + + begin = time(NULL); + + if(new_polycnt != polycnt) + switch_tests(new_polycnt); } } diff --git a/kernel/arch/dreamcast/hardware/pvr/pvr_internal.h b/kernel/arch/dreamcast/hardware/pvr/pvr_internal.h index f7846232..3db2278d 100644 --- a/kernel/arch/dreamcast/hardware/pvr/pvr_internal.h +++ b/kernel/arch/dreamcast/hardware/pvr/pvr_internal.h @@ -223,6 +223,7 @@ typedef struct { // Output address for to-texture mode uint32 to_txr_addr[2]; + // Whether direct rendering is active or not uint32 dr_used; } pvr_state_t; diff --git a/kernel/arch/dreamcast/hardware/pvr/pvr_irq.c b/kernel/arch/dreamcast/hardware/pvr/pvr_irq.c index 11cb2045..8e72fb94 100644 --- 
a/kernel/arch/dreamcast/hardware/pvr/pvr_irq.c +++ b/kernel/arch/dreamcast/hardware/pvr/pvr_irq.c @@ -40,6 +40,16 @@ static void dma_next_list(void *data) { // Get the buffers for this frame. b = pvr_state.dma_buffers + (pvr_state.ram_target ^ 1); + /* If we are in PVR DMA mode, yet we haven't associated a + RAM-residing vertex buffer with the current list + (because we submitted it directly, for example), + mark it as complete, so we skip trying to DMA it. */ + if(!b->base[i]) { + pvr_state.lists_dmaed |= 1 << i; + pvr_state.lists_transferred |= 1 << i; + continue; + } + // Flush the last 32 bytes out of dcache, just in case. // dcache_flush_range((ptr_t)(b->base[i] + b->ptr[i] - 32), 32); dcache_flush_range((ptr_t)(b->base[i]), b->ptr[i] + 32); diff --git a/kernel/arch/dreamcast/hardware/pvr/pvr_scene.c b/kernel/arch/dreamcast/hardware/pvr/pvr_scene.c index 0b897bae..922c6500 100644 --- a/kernel/arch/dreamcast/hardware/pvr/pvr_scene.c +++ b/kernel/arch/dreamcast/hardware/pvr/pvr_scene.c @@ -1,13 +1,15 @@ /* KallistiOS ##version## pvr_scene.c - Copyright (C)2002,2004 Megan Potter + Copyright (C) 2002,2004 Megan Potter + Copyright (C) 2024 Falco Girgis */ #include <assert.h> #include <stdio.h> #include <string.h> +#include <kos/string.h> #include <kos/thread.h> #include <dc/pvr.h> #include <dc/sq.h> @@ -135,6 +137,13 @@ void pvr_scene_begin_txr(pvr_ptr_t txr, uint32 *rx, uint32 *ry) { pvr_scene_begin(); } +static bool pvr_list_dma; + +inline static bool pvr_list_uses_dma(pvr_list_t list) { + return pvr_state.dma_mode && + pvr_state.dma_buffers[pvr_state.ram_target].base[list]; +} + /* Begin collecting data for the given list type. Lists do not have to be submitted in any particular order, but all types of a list must be submitted at once. 
If the given list has already been closed, then an @@ -153,6 +162,11 @@ int pvr_list_begin(pvr_list_t list) { if(pvr_state.list_reg_open != -1 && pvr_state.list_reg_open != (int)list) pvr_list_finish(); + pvr_list_dma = pvr_list_uses_dma(list); + + if(!pvr_list_dma) + sq_lock((void *)PVR_TA_INPUT); + /* Ok, set the flag */ pvr_state.list_reg_open = list; @@ -175,7 +189,13 @@ int pvr_list_finish(void) { #endif /* !NDEBUG */ - if(!pvr_state.dma_mode) { + /* Check for immediate submission: + A. If we are not in DMA mode, we must be submitting polygons + immediately. + B. If we are in DMA mode, yet there's no vertex buffer associated + with the list type, assume we're doing hybrid drawing and + are directly submitting this list type. */ + if(!pvr_list_dma) { /* Release Store Queues if they are used */ if(pvr_state.dr_used) { pvr_dr_finish(); @@ -184,6 +204,8 @@ int pvr_list_finish(void) { /* In case we haven't sent anything in this list, send a dummy */ pvr_blank_polyhdr(pvr_state.list_reg_open); + sq_unlock(); + /* Set the flags */ pvr_state.lists_closed |= (1 << pvr_state.list_reg_open); @@ -203,16 +225,22 @@ int pvr_prim(void * data, int size) { dbglog(DBG_WARNING, "pvr_prim: attempt to submit to unopened list\n"); return -1; } +#endif /* !NDEBUG */ + if(!pvr_list_dma) { +#ifndef NDEBUG + if((uintptr_t)data & 0x7) { + dbglog(DBG_WARNING, "pvr_prim: attempt to submit data unaligned " + "to 8 bytes.\n"); + return -1; + } #endif /* !NDEBUG */ - if(!pvr_state.dma_mode) { - /* Send the data */ - pvr_sq_load((void *)0, data, size, PVR_DMA_TA); - } - else { - return pvr_list_prim(pvr_state.list_reg_open, data, size); + /* Immediately send data via SQs. */ + sq_fast_cpy(SQ_MASK_DEST(PVR_TA_INPUT), data, size >> 5); } + /* Defer data to RAM buffer for DMA-ing later. 
*/ + else return pvr_list_prim(pvr_state.list_reg_open, data, size); return 0; } @@ -221,12 +249,19 @@ int pvr_list_prim(pvr_list_t list, void * data, int size) { volatile pvr_dma_buffers_t * b; b = pvr_state.dma_buffers + pvr_state.ram_target; + + /* Ensure we associated a DMA vertex buffer with this list type. */ assert(b->base[list]); + /* Ensure data size is multiple of 32-bytes. */ assert(!(size & 31)); + /* Ensure at least 4-byte alignment. */ + assert(!((uintptr_t)data & 0x3)); memcpy(b->base[list] + b->ptr[list], data, size); b->ptr[list] += size; + + /* Ensure we didn't overflow the vertex buffer. */ assert(b->ptr[list] <= b->size[list]); return 0; @@ -234,13 +269,11 @@ int pvr_list_prim(pvr_list_t list, void * data, int size) { void pvr_dr_init(pvr_dr_state_t *vtx_buf_ptr) { ...<truncated>... hooks/post-receive -- A pseudo Operating System for the Dreamcast. |