From: ljsebald <ljs...@us...> - 2023-11-08 03:23:14
|
This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "A pseudo Operating System for the Dreamcast.". The branch, master has been updated via a6fbf57e281b4cb231fe3a3f4bb68e57df58ecf8 (commit) from da29948e4c088cbbb2e475a59e0e0ce2e18af970 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit a6fbf57e281b4cb231fe3a3f4bb68e57df58ecf8 Author: Andress Barajas <and...@gm...> Date: Tue Nov 7 19:21:14 2023 -0800 Cache Refresh (#342) * Add two new cache manipulation functions. Speed up the old ones for ranges larger than the cache * Edited algorithm checks to get the fastest speed. Added warnings to let user know that range flush and purges can still be faster under a certain threshold of bytes ----------------------------------------------------------------------- Summary of changes: kernel/arch/dreamcast/include/arch/cache.h | 80 ++++-- kernel/arch/dreamcast/kernel/cache.s | 392 ++++++++++++++++++----------- 2 files changed, 308 insertions(+), 164 deletions(-) diff --git a/kernel/arch/dreamcast/include/arch/cache.h b/kernel/arch/dreamcast/include/arch/cache.h index 602f2b3..d5df496 100644 --- a/kernel/arch/dreamcast/include/arch/cache.h +++ b/kernel/arch/dreamcast/include/arch/cache.h @@ -3,7 +3,7 @@ arch/dreamcast/include/cache.h Copyright (C) 2001 Megan Potter Copyright (C) 2014, 2016, 2023 Ruslan Rostovtsev - + Copyright (C) 2023 Andy Barajas */ /** \file arch/cache.h @@ -15,6 +15,7 @@ \author Megan Potter \author Ruslan Rostovtsev + \author Andy Barajas */ #ifndef __ARCH_CACHE_H @@ -23,12 +24,12 @@ #include <sys/cdefs.h> __BEGIN_DECLS +#include <stdint.h> #include <arch/types.h> /** \brief SH4 cache block size. 
- The physical address will be aligned to this size in all - functions except dcache_alloc_write. + The size of a cache block. */ #define CPU_CACHE_BLOCK_SIZE 32 @@ -39,7 +40,7 @@ __BEGIN_DECLS \param start The physical address to begin flushing at. \param count The number of bytes to flush. */ -void icache_flush_range(uint32 start, uint32 count); +void icache_flush_range(uintptr_t start, size_t count); /** \brief Invalidate the data/operand cache. @@ -50,42 +51,77 @@ void icache_flush_range(uint32 start, uint32 count); \param start The physical address to begin invalidating at. \param count The number of bytes to invalidate. */ -void dcache_inval_range(uint32 start, uint32 count); +void dcache_inval_range(uintptr_t start, size_t count); /** \brief Flush the data/operand cache. This function flushes a range of the data/operand cache, forcing a write- - back on all of the data in the specified range. This does not invalidate the - cache in the process (meaning the blocks will still be in the cache, just - not marked as dirty after this has completed). If you wish to invalidate the - cache as well, call dcache_inval_range() after calling this function or - use dcache_purge_range() instead of dcache_flush_range(). + back on all of the data in the specified range. This does not invalidate + the cache in the process (meaning the blocks will still be in the cache, + just not marked as dirty after this has completed). If you wish to + invalidate the cache as well, call dcache_inval_range() after calling this + function or use dcache_purge_range() instead of dcache_flush_range(). \param start The physical address to begin flushing at. \param count The number of bytes to flush. */ -void dcache_flush_range(uint32 start, uint32 count); +void dcache_flush_range(uintptr_t start, size_t count); + +/** \brief Flush all the data/operand cache. + + This function flushes all the data/operand cache, forcing a write- + back on all of the cache blocks that are marked as dirty. 
+ + \note + dcache_flush_range() is faster than dcache_flush_all() if the count + param is 66560 or less. +*/ +void dcache_flush_all(void); /** \brief Purge the data/operand cache. This function flushes a range of the data/operand cache, forcing a write- - back and invalidate on all of the data in the specified range. + back and then invalidates all of the data in the specified range. \param start The physical address to begin purging at. \param count The number of bytes to purge. */ -void dcache_purge_range(uint32 start, uint32 count); +void dcache_purge_range(uintptr_t start, size_t count); /** \brief Purge all the data/operand cache. - This function flushes all the data/operand cache, forcing a write- - back and invalidate on all of the cache blocks. + This function flushes the entire data/operand cache, ensuring that all + cache blocks marked as dirty are written back to memory and all cache + entries are invalidated. It does not require an additional buffer and is + preferred when memory resources are constrained. - \param start The physical address for temporary buffer (32-byte aligned) - \param count The number of bytes of temporary buffer (8 KB or 16 KB) + \note + dcache_purge_range() is faster than dcache_purge_all() if the count + param is 39936 or less. +*/ +void dcache_purge_all(void); + +/** \brief Purge all the data/operand cache with buffer. + + This function performs a purge of all data/operand cache blocks by + utilizing an external buffer to speed up the write-back and invalidation + process. It is always faster than dcache_purge_all() and is recommended + where maximum speed is required. + + \note While this function offers a superior purge rate, it does require + the use of a temporary buffer. So use this function if you have an extra + 8/16 KB of memory lying around that you can utilize for no other purpose + than for this function.
+ + \param start The physical address for temporary buffer (32-byte + aligned) + \param count The size of the temporary buffer, which can be + either 8 KB or 16 KB, depending on cache + configuration - 8 KB buffer with OCRAM enabled, + otherwise 16 KB. */ -void dcache_purge_all(uint32 start, uint32 count); +void dcache_purge_all_with_buffer(uintptr_t start, size_t count); /** \brief Prefetch one block to the data/operand cache. @@ -108,15 +144,15 @@ static __always_inline void dcache_pref_block(const void *src) { \param src The physical address to allocate. \param value The value written to first 4-byte. */ -static __always_inline void dcache_alloc_block(const void *src, uint32 value) { - register int __value __asm__("r0") = value; - __asm__ __volatile__ ("movca.l r0,@%0\n\t" +static __always_inline void dcache_alloc_block(const void *src, uint32_t value) { + __asm__ __volatile__ ("movca.l r0, @%0\n\t" : - : "r" (src), "r" (__value) + : "r" (src), "z" (value) : "memory" ); } + __END_DECLS #endif /* __ARCH_CACHE_H */ diff --git a/kernel/arch/dreamcast/kernel/cache.s b/kernel/arch/dreamcast/kernel/cache.s index c2b8bdc..cdbceb7 100644 --- a/kernel/arch/dreamcast/kernel/cache.s +++ b/kernel/arch/dreamcast/kernel/cache.s @@ -1,172 +1,280 @@ -! This routine was such a PIA to get working in inside the C program -! that I finally gave up and moved it out to an assembler file. -! Routine to flush parts of cache.. thanks to the Linux-SH guys -! for the algorithm. The original version of this routine was -! taken from sh-stub.c. +! KallistiOS ##version## +! +! arch/dreamcast/kernel/cache.s +! +! Copyright (C) 2001 Megan Potter +! Copyright (C) 2014, 2016, 2023 Ruslan Rostovtsev +! Copyright (C) 2023 Andy Barajas +! +! Optimized assembler code for managing the cache. +! 
- .text - .globl _icache_flush_range - .globl _dcache_inval_range - .globl _dcache_flush_range - .globl _dcache_purge_range - .globl _dcache_purge_all + .text + .globl _icache_flush_range + .globl _dcache_inval_range + .globl _dcache_flush_range + .globl _dcache_flush_all + .globl _dcache_purge_range + .globl _dcache_purge_all + .globl _dcache_purge_all_with_buffer +! Routine to flush parts of cache.. Thanks to the Linux-SH guys +! for the algorithm. The original version of this routine was +! taken from sh-stub.c. +! ! r4 is starting address ! r5 is count + .align 2 _icache_flush_range: - mov.l fraddr,r0 - mov.l p2mask,r1 - or r1,r0 - jmp @r0 - nop - - .align 2 -fraddr: .long flush_real -p2mask: .long 0x20000000 - - -flush_real: - ! Save old SR and disable interrupts - stc sr,r0 - mov.l r0,@-r15 - mov.l ormask,r1 - or r1,r0 - ldc r0,sr - - ! Get ending address from count and align start address - add r4,r5 - mov.l l1align,r0 - and r0,r4 - mov.l addrarray,r1 - mov.l entrymask,r2 - mov.l validmask,r3 - -flush_loop: - ! Write back O cache - ocbwb @r4 - - ! Invalidate I cache - mov r4,r6 ! v & CACHE_IC_ENTRY_MASK - and r2,r6 - or r1,r6 ! CACHE_IC_ADDRESS_ARRAY | ^ - - mov r4,r7 ! v & 0xfffffc00 - and r3,r7 - - add #32,r4 ! += CPU_CACHE_BLOCK_SIZE - cmp/hs r4,r5 - bt/s flush_loop - mov.l r7,@r6 ! *addr = data - - ! Restore old SR - mov.l @r15+,r0 - ldc r0,sr - - ! make sure we have enough instrs before returning to P1 - nop - nop - nop - nop - nop - nop - nop - rts - nop - - .align 2 -ormask: - .long 0x100000f0 -addrarray: - .long 0xf0000000 ! CACHE_IC_ADDRESS_ARRAY -entrymask: - .long 0x1fe0 ! CACHE_IC_ENTRY_MASK -validmask: - .long 0xfffffc00 - - -! Goes through and invalidates the O-cache for a given block of -! RAM. Make sure that you've called dcache_flush_range first if -! you care about the contents. + mov.l ifr_addr, r0 + mov.l p2_mask, r1 + or r1, r0 + jmp @r0 + nop + +.iflush_real: + ! 
Save old SR and disable interrupts + stc sr, r0 + mov.l r0, @-r15 + mov.l ormask, r1 + or r1, r0 + ldc r0, sr + + ! Get ending address from count and align start address + add r4, r5 + mov.l align_mask, r0 + and r0, r4 + mov.l ica_addr, r1 + mov.l ic_entry_mask, r2 + mov.l ic_valid_mask, r3 + +.flush_loop: + ! Write back D cache + ocbwb @r4 + + ! Invalidate I cache + mov r4, r6 ! v & CACHE_IC_ENTRY_MASK + and r2, r6 + or r1, r6 ! CACHE_IC_ADDRESS_ARRAY | ^ + + mov r4, r7 ! v & 0xfffffc00 + and r3, r7 + + add #32, r4 ! Move on to next cache block + cmp/hs r4, r5 + bt/s .flush_loop + mov.l r7, @r6 ! *addr = data + + ! Restore old SR + mov.l @r15+, r0 + ldc r0, sr + + ! make sure we have enough instrs before returning to P1 + nop + nop + nop + nop + nop + nop + nop + rts + nop + + +! This routine goes through and invalidates the dcache for a given +! range of RAM. Make sure that you've called dcache_flush_range first +! if you care about the contents. +! ! r4 is starting address ! r5 is count + .align 2 _dcache_inval_range: - ! Get ending address from count and align start address - add r4,r5 - mov.l l1align,r0 - and r0,r4 + ! Get ending address from count and align start address + add r4, r5 + mov.l align_mask, r0 + and r0, r4 -dinval_loop: - ! Invalidate the O cache - ocbi @r4 - cmp/hs r4,r5 - bt/s dinval_loop - add #32,r4 ! += CPU_CACHE_BLOCK_SIZE +.dinval_loop: + ! Invalidate the dcache + ocbi @r4 + cmp/hs r4, r5 + bt/s .dinval_loop + add #32, r4 ! Move on to next cache block - rts - nop + rts + nop -! This routine just goes through and forces a write-back on the +! This routine goes through and forces a write-back on the ! specified data range. Use prior to dcache_inval_range if you -! care about the contents. +! care about the contents. If the range is bigger than the dcache, +! we flush the whole cache instead. +! ! r4 is starting address ! r5 is count + .align 2 _dcache_flush_range: - ! 
Get ending address from count and align start address - add r4,r5 - mov.l l1align,r0 - and r0,r4 + ! Divide byte count by 32 + mov #-5, r1 + shad r1, r5 + + ! Compare with flush_check + mov.w flush_check, r2 + cmp/hi r2, r5 + bt _dcache_flush_all ! If lines > flush_check, jump to _dcache_flush_all + + ! Align start address + mov.l align_mask, r0 + and r0, r4 + +.dflush_loop: + ! Write back the dcache + ocbwb @r4 + dt r5 + bf/s .dflush_loop + add #32, r4 ! Move on to next cache block + + rts + nop + -dflush_loop: - ! Write back the O cache - ocbwb @r4 - cmp/hs r4,r5 - bt/s dflush_loop - add #32,r4 ! += CPU_CACHE_BLOCK_SIZE +! This routine uses the OC address array to have direct access to the +! dcache entries. It forces a write-back on all dcache entries where +! the U bit and V bit are set to 1. Then updates the entry with +! U bit cleared. + .align 2 +_dcache_flush_all: + mov.l dca_addr, r1 + mov.w cache_lines, r2 + mov.l dc_ubit_mask, r3 - rts - nop +.dflush_all_loop: + mov.l @r1, r0 ! Get dcache array entry value + and r3, r0 ! Zero out U bit + dt r2 + mov.l r0, @r1 ! Update dcache entry + bf/s .dflush_all_loop + add #32, r1 ! Move on to next entry -! This routine just goes through and forces a write-back and invalidate -! on the specified data range. + rts + nop + + +! This routine goes through and forces a write-back and invalidate +! on the specified data range. If the range is bigger than the dcache, +! we purge the whole cache instead. +! ! r4 is starting address ! r5 is count + .align 2 _dcache_purge_range: - ! Get ending address from count and align start address - add r4,r5 - mov.l l1align,r0 - and r0,r4 + ! Divide byte count by 32 + mov #-5, r1 + shad r1, r5 + + ! Compare with purge_check + mov.w purge_check, r2 + cmp/hi r2, r5 + bt _dcache_purge_all ! If lines > purge_check, jump to _dcache_purge_all + + ! Align start address + mov.l align_mask, r0 + and r0, r4 + +.dpurge_loop: + ! 
Write back and invalidate the D cache + ocbp @r4 + dt r5 + bf/s .dpurge_loop + add #32, r4 ! Move on to next cache block -dpurge_loop: - ! Write back and invalidate the O cache - ocbp @r4 - cmp/hs r4,r5 - bt/s dpurge_loop - add #32,r4 ! += CPU_CACHE_BLOCK_SIZE + rts + nop - rts - nop + +! This routine uses the OC address array to have direct access to the +! dcache entries. It goes through and forces a write-back and invalidate +! on all of the dcache. + .align 2 +_dcache_purge_all: + mov.l dca_addr, r1 + mov.w cache_lines, r2 + mov #0, r3 + +.dpurge_all_loop: + mov.l r3, @r1 ! Update dcache entry + dt r2 + bf/s .dpurge_all_loop + add #32, r1 ! Move on to next entry + + rts + nop -! This routine just forces a write-back and invalidate all O cache. +! This routine forces a write-back and invalidate all dcache +! using a 8kb or 16kb 32-byte aligned buffer. +! ! r4 is address for temporary buffer 32-byte aligned ! r5 is size of temporary buffer (8 KB or 16 KB) -_dcache_purge_all: ...<truncated>... hooks/post-receive -- A pseudo Operating System for the Dreamcast. |