From: ljsebald <ljs...@us...> - 2023-10-24 01:49:32
|
This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "A pseudo Operating System for the Dreamcast.". The branch, master has been updated via 8b380271151dea406c82ff4dd9179621ef1247cd (commit) from 1526b24db29723634ae096a139f70a918eb428fd (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 8b380271151dea406c82ff4dd9179621ef1247cd Author: Andress Barajas <and...@gm...> Date: Mon Oct 23 18:48:59 2023 -0700 Store Queue Refresh (#320) * Refresh store queue code with some potential (big) speed improvements under certain circumstances. * Use __builtin_prefetch where appropriate in SQ code. * Add specific SQ function for accessing PVR RAM with 64-bit access. * Improve doxygen comments for SQ functions. ----------------------------------------------------------------------- Summary of changes: doc/CHANGELOG | 1 + kernel/arch/dreamcast/hardware/sq.c | 223 ++++++++++++++++++---------------- kernel/arch/dreamcast/include/dc/sq.h | 176 ++++++++++++++++++++++----- 3 files changed, 265 insertions(+), 135 deletions(-) diff --git a/doc/CHANGELOG b/doc/CHANGELOG index b52310c..19d1e75 100644 --- a/doc/CHANGELOG +++ b/doc/CHANGELOG @@ -192,6 +192,7 @@ KallistiOS version 2.1.0 ----------------------------------------------- - *** Added toolchain and KOS support for C/C++ compiler-level TLS [CP && FG] - DC Added vmu functions to check/enable/disable the extra 41 blocks [AB] - *** Added driver for the SH4's Watchdog Timer peripheral [FG] +- DC Added Moop powered fast path to sq_cpy, added TapamN pvr related sq functions [AB] KallistiOS version 2.0.0 ----------------------------------------------- - DC Broadband Adapter driver fixes [Megan Potter == MP] diff --git a/kernel/arch/dreamcast/hardware/sq.c b/kernel/arch/dreamcast/hardware/sq.c index 263074a..8fa9947 100644 --- a/kernel/arch/dreamcast/hardware/sq.c +++ b/kernel/arch/dreamcast/hardware/sq.c @@ -2,165 +2,176 @@ kernel/arch/dreamcast/hardware/sq.c Copyright (C) 2001 Andrew Kieschnick + Copyright (C) 2023 Falco Girgis + Copyright (C) 2023 Andy Barajas */ -#include <arch/memory.h> #include <dc/sq.h> +#include <kos/dbglog.h> /* Functions to clear, copy, and set memory using the sh4 store queues - Based on code by Marcus Comstedt (store_q_clear from tatest) + Based on code by Marcus Comstedt, TapamN, and Moop */ -/* clears n bytes at dest, dest must be 32-byte aligned */ -void sq_clr(void *dest, int n) { - unsigned int *d = (unsigned int *)(void *) - (MEM_AREA_SQ_BASE | (((unsigned long)dest) & 0x03ffffe0)); - - /* Set store queue memory area as desired */ - QACR0 = ((((unsigned int)dest) >> 26) << 2) & 0x1c; - QACR1 = ((((unsigned int)dest) >> 26) << 2) & 0x1c; - - /* Fill both store queues with zeroes */ - d[0] = d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = d[7] = - d[8] = d[9] = d[10] = d[11] = d[12] = d[13] = d[14] = d[15] = 0; - - /* Write them as many times necessary */ - n >>= 5; - - while(n--) { - __asm__("pref @%0" : : "r"(d)); - d += 8; - } - - /* Wait for both store queues to complete */ - d = (unsigned int *)MEM_AREA_SQ_BASE; - d[0] = d[8] = 0; -} - -/* copies n bytes from src to dest, dest must be 32-byte aligned */ +/* Copies n bytes from src to dest, dest must be 32-byte aligned */ void * sq_cpy(void *dest, const void *src, int n) { - unsigned int *d = (unsigned int *)(void *) - (MEM_AREA_SQ_BASE | (((unsigned long)dest) & 0x03ffffe0)); - const unsigned int *s = src; + uint32_t *d = SQ_MASK_DEST(dest); + const uint32_t *s = src; + + _Complex float ds; + _Complex float ds2; + _Complex float ds3; + _Complex float ds4; /* Set store queue memory area as desired */ - QACR0 = ((((unsigned int)dest) >> 26) << 2) & 0x1c; - QACR1 = ((((unsigned int)dest) >> 26) << 2) & 0x1c; + SET_QACR_REGS(dest); - /* fill/write queues as many times necessary */ + /* Fill/write queues as many times necessary */ n >>= 5; - while(n--) { - __asm__("pref @%0" : : "r"(s + 8)); /* prefetch 32 bytes */ - d[0] = *(s++); - d[1] = *(s++); - d[2] = *(s++); - d[3] = *(s++); - d[4] = *(s++); - d[5] = *(s++); - d[6] = *(s++); - d[7] = *(s++); - __asm__("pref @%0" : : "r"(d)); - d += 8; + /* If src is not 8-byte aligned, slow path */ + if ((uintptr_t)src & 7) { + while(n--) { + __builtin_prefetch(s + 8); /* Prefetch 32 bytes for next loop */ + d[0] = *(s++); + d[1] = *(s++); + d[2] = *(s++); + d[3] = *(s++); + d[4] = *(s++); + d[5] = *(s++); + d[6] = *(s++); + d[7] = *(s++); + + /* Fire off store queue. __builtin would move it to the top so + use __asm__ instead */ + __asm__("pref @%0" : : "r"(d)); + d += 8; + } + } else { /* If src is 8-byte aligned, fast path */ + /* Moop algorithm; Using the fpu we can fill the queue faster before + firing it out off */ + __asm__ __volatile__ ( + "fschg\n\t" + "clrs\n" + ".align 2\n" + "1:\n\t" + /* *d++ = *s++ */ + "fmov.d @%[in]+, %[scratch]\n\t" + "fmov.d @%[in]+, %[scratch2]\n\t" + "fmov.d @%[in]+, %[scratch3]\n\t" + "fmov.d @%[in]+, %[scratch4]\n\t" + "add #32, %[out]\n\t" + "pref @%[in]\n\t" /* Prefetch 32 bytes for next loop */ + "dt %[size]\n\t" /* while(n--) */ + "fmov.d %[scratch4], @-%[out]\n\t" + "fmov.d %[scratch3], @-%[out]\n\t" + "fmov.d %[scratch2], @-%[out]\n\t" + "fmov.d %[scratch], @-%[out]\n\t" + "pref @%[out]\n\t" /* Fire off store queue */ + "bf.s 1b\n\t" + "add #32, %[out]\n\t" + "fschg\n" + : [in] "+&r" ((uint32_t)s), [out] "+&r" ((uint32_t)d), + [size] "+&r" (n), [scratch] "=&d" (ds), [scratch2] "=&d" (ds2), + [scratch3] "=&d" (ds3), [scratch4] "=&d" (ds4) /* outputs */ + : /* inputs */ + : "t", "memory" /* clobbers */ + ); } /* Wait for both store queues to complete */ - d = (unsigned int *)MEM_AREA_SQ_BASE; + d = (uint32_t *)MEM_AREA_SQ_BASE; d[0] = d[8] = 0; return dest; } -/* fills n bytes at s with byte c, s must be 32-byte aligned */ -void * sq_set(void *s, uint32 c, int n) { - unsigned int *d = (unsigned int *)(void *) - (MEM_AREA_SQ_BASE | (((unsigned long)s) & 0x03ffffe0)); - - /* Set store queue memory area as desired */ - QACR0 = ((((unsigned int)s) >> 26) << 2) & 0x1c; - QACR1 = ((((unsigned int)s) >> 26) << 2) & 0x1c; - - /* duplicate low 8-bits of c into high 24-bits */ +/* Fills n bytes at dest with byte c, dest must be 32-byte aligned */ +void * sq_set(void *dest, uint32_t c, int n) { + /* Duplicate low 8-bits of c into high 24-bits */ c = c & 0xff; c = (c << 24) | (c << 16) | (c << 8) | c; - /* Fill both store queues with c */ - d[0] = d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = d[7] = - d[8] = d[9] = d[10] = d[11] = d[12] = d[13] = d[14] = d[15] = c; - - /* Write them as many times necessary */ - n >>= 5; - - while(n--) { - __asm__("pref @%0" : : "r"(d)); - d += 8; - } + return sq_set32(dest, c, n); +} - /* Wait for both store queues to complete */ - d = (unsigned int *)MEM_AREA_SQ_BASE; - d[0] = d[8] = 0; +/* Fills n bytes at dest with short c, dest must be 32-byte aligned */ +void * sq_set16(void *dest, uint32_t c, int n) { + /* Duplicate low 16-bits of c into high 16-bits */ + c = c & 0xffff; + c = (c << 16) | c; - return s; + return sq_set32(dest, c, n); } -/* fills n bytes at s with short c, s must be 32-byte aligned */ -void * sq_set16(void *s, uint32 c, int n) { - unsigned int *d = (unsigned int *)(void *) - (MEM_AREA_SQ_BASE | (((unsigned long)s) & 0x03ffffe0)); +/* Fills n bytes at dest with int c, dest must be 32-byte aligned */ +void * sq_set32(void *dest, uint32_t c, int n) { + uint32_t *d = SQ_MASK_DEST(dest); /* Set store queue memory area as desired */ - QACR0 = ((((unsigned int)s) >> 26) << 2) & 0x1c; - QACR1 = ((((unsigned int)s) >> 26) << 2) & 0x1c; - - /* duplicate low 16-bits of c into high 16-bits */ - c = c & 0xffff; - c = (c << 16) | c; + SET_QACR_REGS(dest); /* Fill both store queues with c */ d[0] = d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = d[7] = - d[8] = d[9] = d[10] = d[11] = d[12] = d[13] = d[14] = d[15] = c; + d[8] = d[9] = d[10] = d[11] = d[12] = d[13] = d[14] = d[15] = c; /* Write them as many times necessary */ n >>= 5; while(n--) { - __asm__("pref @%0" : : "r"(d)); + __builtin_prefetch(d); d += 8; } /* Wait for both store queues to complete */ - d = (unsigned int *)MEM_AREA_SQ_BASE; + d = (uint32_t *)MEM_AREA_SQ_BASE; d[0] = d[8] = 0; - return s; + return dest; } -/* fills n bytes at s with int c, s must be 32-byte aligned */ -void * sq_set32(void *s, uint32 c, int n) { - unsigned int *d = (unsigned int *)(void *) - (MEM_AREA_SQ_BASE | (((unsigned long)s) & 0x03ffffe0)); +/* Clears n bytes at dest, dest must be 32-byte aligned */ +void sq_clr(void *dest, int n) { + sq_set32(dest, 0, n); +} - /* Set store queue memory area as desired */ - QACR0 = ((((unsigned int)s) >> 26) << 2) & 0x1c; - QACR1 = ((((unsigned int)s) >> 26) << 2) & 0x1c; +#define PVR_LMMODE (*(volatile uint32_t *)(void *)0xa05f6884) +#define PVR_DMA_DEST (*(volatile uint32_t *)(void *)0xa05f6808) - /* Fill both store queues with c */ - d[0] = d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = d[7] = - d[8] = d[9] = d[10] = d[11] = d[12] = d[13] = d[14] = d[15] = c; +/* Copies n bytes from src to dest (in VRAM), dest must be 32-byte aligned */ +void * sq_cpy_pvr(void *dest, const void *src, int n) { + if(PVR_DMA_DEST != 0) { + dbglog(DBG_ERROR, "sq_cpy_pvr: Previous DMA has not finished\n"); + return NULL; + } - /* Write them as many times necessary */ - n >>= 5; + /* Set PVR LMMODE register */ + PVR_LMMODE = 0; - while(n--) { - __asm__("pref @%0" : : "r"(d)); - d += 8; + /* Convert read/write area pointer to DMA write only area pointer */ + uint32_t dma_area_ptr = (((uintptr_t)dest & 0xffffff) | 0x11000000); + + sq_cpy((void *)dma_area_ptr, src, n); + + return dest; +} + +/* Fills n bytes at PVR dest with short c, dest must be 32-byte aligned */ +void * sq_set_pvr(void *dest, uint32_t c, int n) { + if(PVR_DMA_DEST != 0) { + dbglog(DBG_ERROR, "sq_set_pvr: Previous DMA has not finished\n"); + return NULL; } - /* Wait for both store queues to complete */ - d = (unsigned int *)MEM_AREA_SQ_BASE; - d[0] = d[8] = 0; + /* Set PVR LMMODE register */ + PVR_LMMODE = 0; - return s; + /* Convert read/write area pointer to DMA write only area pointer */ + uint32_t dma_area_ptr = (((uintptr_t)dest & 0xffffff) | 0x11000000); + + sq_set16((void *)dma_area_ptr, c, n); + + return dest; } diff --git a/kernel/arch/dreamcast/include/dc/sq.h b/kernel/arch/dreamcast/include/dc/sq.h index 011f557..c570a0e 100644 --- a/kernel/arch/dreamcast/include/dc/sq.h +++ b/kernel/arch/dreamcast/include/dc/sq.h @@ -1,19 +1,33 @@ /* KallistiOS ##version## kernel/arch/dreamcast/include/dc/sq.h - (C)2000-2001 Andrew Kieschnick + Copyright (C) 2000-2001 Andrew Kieschnick + Copyright (C) 2023 Falco Girgis + Copyright (C) 2023 Andy Barajas +*/ + +/** \file dc/sq.h + \ingroup store_queues + \brief Functions to access the SH4 Store Queues. + \author Andrew Kieschnick */ -/** \file dc/sq.h - \brief Functions to access the SH4 Store Queues. +/** \defgroup store_queues Store Queues + \brief SH4 CPU Peripheral for burst memory transactions. The store queues are a way to do efficient burst transfers from the CPU to external memory. They can be used in a variety of ways, such as to transfer a texture to PVR memory. The transfers are in units of 32-bytes, and the destinations must be 32-byte aligned. - \author Andrew Kieschnick + \note + Mastery over knowing when and how to utilize the store queues is + important when trying to push the limits of the Dreamcast, specifically + when transferring chunks of data between regions of memory. It is often + the case that the DMA is faster for transactions which are consistently + large; however, the store queues tend to have better performance and + have less configuration overhead when bursting smaller chunks of data. */ #ifndef __DC_SQ_H @@ -22,73 +36,177 @@ #include <sys/cdefs.h> __BEGIN_DECLS +#include <stdint.h> #include <arch/types.h> +#include <arch/memory.h> -/** \brief Store Queue 0 access register */ -#define QACR0 (*(volatile unsigned int *)(void *)0xff000038) - -/** \brief Store Queue 1 access register */ -#define QACR1 (*(volatile unsigned int *)(void *)0xff00003c) - -/** \brief Clear a block of memory. +/** \brief Store Queue 0 access register + \ingroup store_queues +*/ +#define QACR0 (*(volatile uint32_t *)(void *)0xff000038) - This function is similar to calling memset() with a value to set of 0, but - uses the store queues to do its work. +/** \brief Store Queue 1 access register + \ingroup store_queues +*/ +#define QACR1 (*(volatile uint32_t *)(void *)0xff00003c) - \param dest The address to begin clearing at (32-byte aligned). - \param n The number of bytes to clear (multiple of 32). +/** \brief Set Store Queue QACR* registers + \ingroup store_queues */ -void sq_clr(void *dest, int n); +#define SET_QACR_REGS(dest) \ + do { \ + uint32_t val = ((uint32_t)(dest)) >> 24; \ + QACR0 = val; \ + QACR1 = val; \ + } while(0) + +/** \brief Mask dest to Store Queue area + \ingroup store_queues +*/ +#define SQ_MASK_DEST(dest) \ + ((uint32_t *)(void *) \ + (MEM_AREA_SQ_BASE | \ + (((uint32_t)(dest)) & 0x03ffffe0))) -/** \brief Copy a block of memory. +/** \brief Copy a block of memory. + \ingroup store_queues This function is similar to memcpy4(), but uses the store queues to do its work. + \warning + The dest pointer must be at least 32-byte aligned, the src pointer + must be at least 4-byte aligned (8-byte aligned uses fast path), + and n must be a multiple of 32! + \param dest The address to copy to (32-byte aligned). - \param src The address to copy from (32-bit (4-byte) aligned). + \param src The address to copy from (32-bit (4/8-byte) aligned). \param n The number of bytes to copy (multiple of 32). \return The original value of dest. + + \sa sq_cpy_pvr() */ void * sq_cpy(void *dest, const void *src, int n); -/** \brief Set a block of memory to an 8-bit value. +/** \brief Set a block of memory to an 8-bit value. + \ingroup store_queues This function is similar to calling memset(), but uses the store queues to do its work. - \param s The address to begin setting at (32-byte aligned). - \param c The value to set (in the low 8-bits). + \warning + The dest pointer must be a 32-byte aligned with n being a multiple of 32, + and only the low 8-bits are used from c. + + \param dest The address to begin setting at (32-byte aligned). + \param src The value to set (in the low 8-bits). \param n The number of bytes to set (multiple of 32). \return The original value of dest. + + \sa sq_set16(), sq_set32(), sq_set_pvr() */ -void * sq_set(void *s, uint32 c, int n); +void * sq_set(void *dest, uint32 c, int n); -/** \brief Set a block of memory to a 16-bit value. +/** \brief Set a block of memory to a 16-bit value. + \ingroup store_queues This function is similar to calling memset2(), but uses the store queues to do its work. - \param s The address to begin setting at (32-byte aligned). + \warning + The dest pointer must be a 32-byte aligned with n being a multiple of 32, + and only the low 16-bits are used from c. + + \param dest The address to begin setting at (32-byte aligned). \param c The value to set (in the low 16-bits). \param n The number of bytes to set (multiple of 32). \return The original value of dest. + + \sa sq_set(), sq_set32(), sq_set_pvr() */ -void * sq_set16(void *s, uint32 c, int n); +void * sq_set16(void *dest, uint32 c, int n); -/** \brief Set a block of memory to a 32-bit value. +/** \brief Set a block of memory to a 32-bit value. + \ingroup store_queues This function is similar to calling memset4(), but uses the store queues to do its work. - \param s The address to begin setting at (32-byte aligned). + \warning + The dest pointer must be a 32-byte aligned with n being a multiple of 32! + + \param dest The address to begin setting at (32-byte aligned). \param c The value to set (all 32-bits). \param n The number of bytes to set (multiple of 32). \return The original value of dest. + + \sa sq_set(), sq_set16(), sq_set_pvr() */ -void * sq_set32(void *s, uint32 c, int n); +void * sq_set32(void *dest, uint32 c, int n); + +/** \brief Clear a block of memory. + \ingroup store_queues + + This function is similar to calling memset() with a value to set of 0, but + uses the store queues to do its work. + + \warning + The dest pointer must be a 32-byte aligned with n being a multiple of 32! + + \param dest The address to begin clearing at (32-byte aligned). + \param n The number of bytes to clear (multiple of 32). +*/ +void sq_clr(void *dest, int n); + +/** \brief Copy a block of memory to VRAM + \ingroup store_queues + \author TapamN + + This function is similar to sq_cpy(), but it has been + optimized for writing to a destination residing within VRAM. + + \note + TapamN has reported over a 2x speedup versus the regular + sq_cpy() when using this function to write to VRAM. + + \warning + This function cannot be used at the same time as a PVR DMA transfer. + + The dest pointer must be at least 32-byte aligned and reside + in video memory, the src pointer must be at least 8-byte aligned, ...<truncated>... hooks/post-receive -- A pseudo Operating System for the Dreamcast. |