From: SUGIOKA T. <su...@it...> - 2002-04-24 12:05:04
|
Cache handling improvement and such. (1) If we use a P1 address while accessing the Cache Address Array, we can use the ASSOC bit without a UTLB entry. (2) To flush an O-cache entry, we can use a movca.l/ocbi pair while the BL bit is set. This sequence can be executed in the P1 area and does not cause any unnecessary bus cycles. (3) Delete some local labels from System.map in order to clean up the profiling output. I checked the kernel profile on my SH-4 (SH7750) board after running the following command 3 times. $ time for x in 0 1 2 3 4 5 6 7 8 9; do for y in 0 1 2 3 4 5 6 7 8 9; \ do sh -c "echo >/dev/null"; done; done The results are: Kernel profile with old cache-sh4.c/clear_page.S: 18 copy_page_range 0.0352 18 handle_mm_fault 0.0563 19 call_dpf 0.3167 19 do_page_fault 0.0205 19 filemap_nopage 0.0270 19 flush_tlb_range 0.0540 22 zap_page_range 0.0382 23 flush_cache_range 0.0513 56 __copy_user_page 0.6667 110 __flush_cache_4096_all 0.2022 2543 cpu_idle 15.8938 3161 total 0.0028 Kernel profile with new cache-sh4.c/clear_page.S: 11 system_call 0.1250 12 update_mmu_cache 0.0417 13 flush_tlb_range 0.0369 15 do_page_fault 0.0162 18 zap_page_range 0.0312 19 call_dpf 0.3167 19 copy_page_range 0.0371 20 flush_cache_4096_all 0.3125 23 filemap_nopage 0.0327 24 copy_page 0.3333 72 __copy_user_page 0.8571 2092 cpu_idle 13.0750 2610 total 0.0023 I will commit these changes if there are no objections. * arch/sh/mm/cache-sh4.c (CACHE_ASSOC): Defined. (flush_cache_4096_all): Moved to clear_page.S. (flush_cache_4096): Rewritten. Use __flush_cache_4096. (flush_dcache_page, __flush_cache_page): Don't protect from interrupts. (flush_cache_all): Rewritten. Use __flush_dcache_all. (flush_cache_range): Follow change of flush_cache_4096_all. * arch/sh/mm/clear_page.S (__flush_cache_4096_all): Removed. (flush_cache_4096_all): New implementation. (__flush_cache_4096): Optimized. Use ASSOC bit. (__flush_dcache_all): New function. (__clear_user): Clean up local labels. * arch/sh/mm/copy_page.S (__copy_user): Clean up local labels. 
* arch/sh/kernel/time.c (sh_do_profile): Add profile even if PC is in P2 area. Index: arch/sh/kernel/time.c =================================================================== RCS file: /cvsroot/linuxsh/linux/arch/sh/kernel/time.c,v retrieving revision 1.1.1.1.2.1 diff -u -r1.1.1.1.2.1 time.c --- arch/sh/kernel/time.c 29 Mar 2002 00:01:07 -0000 1.1.1.1.2.1 +++ arch/sh/kernel/time.c 24 Apr 2002 10:11:16 -0000 @@ -183,6 +183,8 @@ if (!prof_buffer) return; + if(pc >= 0xa0000000UL && pc < 0xc0000000UL) + pc -= 0x20000000; pc -= (unsigned long) &_stext; pc >>= prof_shift; /* Index: arch/sh/mm/cache-sh4.c =================================================================== RCS file: /cvsroot/linuxsh/linux/arch/sh/mm/cache-sh4.c,v retrieving revision 1.1.1.1.2.5 diff -u -r1.1.1.1.2.5 cache-sh4.c --- arch/sh/mm/cache-sh4.c 3 Apr 2002 02:33:16 -0000 1.1.1.1.2.5 +++ arch/sh/mm/cache-sh4.c 24 Apr 2002 10:11:16 -0000 @@ -41,6 +41,7 @@ #define CACHE_OC_ADDRESS_ARRAY 0xf4000000 #define CACHE_VALID 1 #define CACHE_UPDATED 2 +#define CACHE_ASSOC 8 #define CACHE_OC_WAY_SHIFT 13 #define CACHE_IC_WAY_SHIFT 13 @@ -207,63 +208,25 @@ restore_flags(flags); } -static void flush_cache_4096_all(unsigned long start) -{ -#if defined(CONFIG_CPU_SUBTYPE_SH7751) || defined(CONFIG_CPU_SUBTYPE_ST40STB1) - /* - * SH7751 and ST40 have no restriction to handle cache. - * (While SH7750 must do that at P2 area.) 
- */ - unsigned long addr; - for (addr = start; addr < start + 4096; addr += 32) - ctrl_outl(0, addr); -#else - register unsigned long __r0 __asm__ ("r0") = 0; - register unsigned long __r1 __asm__ ("r1") = 128; - register unsigned long __r4 __asm__ ("r4"); - register unsigned long __r5 __asm__ ("r5"); - register unsigned long __r6 __asm__ ("r6"); - register unsigned long __r7 __asm__ ("r7"); - extern void __flush_cache_4096_all(unsigned long); - - asm volatile("jsr @%7; nop" - : "=&r" (__r4), "=&r" (__r5), "=&r" (__r6), "=&r" (__r7) - : "0" (start), "r" (__r0), "r" (__r1), - "r" (__flush_cache_4096_all + 0x20000000) - : "pr"); -#endif -} - static inline void flush_cache_4096(unsigned long start, unsigned long phys) { + unsigned long flags; + extern void __flush_cache_4096(unsigned long addr, unsigned long phys, unsigned long exec_offset); + #if defined(CONFIG_CPU_SUBTYPE_SH7751) || defined(CONFIG_CPU_SUBTYPE_ST40STB1) if (start >= CACHE_OC_ADDRESS_ARRAY) { /* * SH7751 and ST40 have no restriction to handle cache. * (While SH7750 must do that at P2 area.) 
*/ - unsigned long addr, data; - for (addr = start; addr < start + 4096; addr += 32) { - data = ctrl_inl(addr)&(0x1ffff000|CACHE_VALID); - if (data == phys) - ctrl_outl(0, addr); - } + __flush_cache_4096(start | CACHE_ASSOC, phys | 0x80000000, 0); } else #endif { - register unsigned long addr __asm__ ("r4"); - register unsigned long data __asm__ ("r0"); - register unsigned long __r5 __asm__ ("r5") = phys; - register unsigned long __r6 __asm__ ("r6") = (0x1ffff000|CACHE_VALID); - register unsigned long __r7 __asm__ ("r7") = 0; - extern void __flush_cache_4096(unsigned long, unsigned long); - - asm volatile("jsr @%1; nop" - : "=r" (addr), "=r" (data) - : "0" (start), "1" (__flush_cache_4096 + 0x20000000), - "r" (__r5), "r" (__r6), "r" (__r7) - : "pr"); + save_and_cli(flags); + __flush_cache_4096(start | CACHE_ASSOC, phys | 0x80000000, 0x20000000); + restore_flags(flags); } } @@ -275,19 +238,12 @@ { if (test_bit(PG_mapped, &page->flags)) { unsigned long phys = PHYSADDR(page_address(page)); - unsigned long flags; - - phys |= CACHE_VALID; - - save_and_cli(flags); /* Loop all the D-cache */ flush_cache_4096(CACHE_OC_ADDRESS_ARRAY, phys); flush_cache_4096(CACHE_OC_ADDRESS_ARRAY | 0x1000, phys); flush_cache_4096(CACHE_OC_ADDRESS_ARRAY | 0x2000, phys); flush_cache_4096(CACHE_OC_ADDRESS_ARRAY | 0x3000, phys); - - restore_flags(flags); } } @@ -303,42 +259,11 @@ restore_flags(flags); } -#undef C_IMPLEMENTATION_OF_CACHE_ALL - void flush_cache_all(void) { - extern unsigned long empty_zero_page[1024]; + extern void __flush_dcache_all(void); - /* Prefetch the data to write back D-cache */ - -#ifdef C_IMPLEMENTATION_OF_CACHE_ALL - unsigned long addr; - - for (addr = (unsigned long)empty_zero_page; - addr < (unsigned long)empty_zero_page + 1024*16; - addr += L1_CACHE_BYTES) - asm volatile("pref @%0"::"r" (addr)); -#else - unsigned long a0, a1, a2, a3, cnt; - asm volatile( - "mov %0, %1; add #32, %1\n\t" - "mov %0, %2; add #64, %2\n\t" - "mov %1, %3; add #64, %3\n\t" - "1:\n\t" - 
"pref @%0\n\t" - "dt %4\n\t" - "pref @%1\n\t" - "add %5, %0\n\t" - "pref @%2\n\t" - "add %5, %1\n\t" - "pref @%3\n\t" - "add %5, %2\n\t" - "bf/s 1b\n\t" - " add %5, %3" - : "=&r" (a0), "=&r" (a1), "=&r" (a2), "=&r" (a3), "=&r" (cnt) - : "r" (32*4), "0" (empty_zero_page), "4" (1024*16/32/4) - : "t"); -#endif + __flush_dcache_all(); flush_icache_all(); } @@ -363,11 +288,6 @@ unsigned long address, unsigned long phys) { - unsigned long flags; - - phys |= CACHE_VALID; - save_and_cli(flags); - /* We only need to flush D-cache when we have alias */ if ((address^phys) & CACHE_ALIAS) { /* Loop 4K of the D-cache */ @@ -385,8 +305,6 @@ flush_cache_4096( CACHE_IC_ADDRESS_ARRAY | (address & 0x1000), phys); - - restore_flags(flags); } /* @@ -401,6 +319,8 @@ void flush_cache_range(struct mm_struct *mm, unsigned long start, unsigned long end) { + extern void flush_cache_4096_all(unsigned long start); + unsigned long p = start & PAGE_MASK; pgd_t *dir; pmd_t *pmd; @@ -438,13 +358,13 @@ } while (p < end); loop_exit: if (d & 1) - flush_cache_4096_all(CACHE_OC_ADDRESS_ARRAY); + flush_cache_4096_all(0); if (d & 2) - flush_cache_4096_all(CACHE_OC_ADDRESS_ARRAY | 0x1000); + flush_cache_4096_all(0x1000); if (d & 4) - flush_cache_4096_all(CACHE_OC_ADDRESS_ARRAY | 0x2000); + flush_cache_4096_all(0x2000); if (d & 8) - flush_cache_4096_all(CACHE_OC_ADDRESS_ARRAY | 0x3000); + flush_cache_4096_all(0x3000); flush_icache_all(); } Index: arch/sh/mm/clear_page.S =================================================================== RCS file: /cvsroot/linuxsh/linux/arch/sh/mm/clear_page.S,v retrieving revision 1.1.1.1.2.3 diff -u -r1.1.1.1.2.3 clear_page.S --- arch/sh/mm/clear_page.S 13 Apr 2002 02:35:20 -0000 1.1.1.1.2.3 +++ arch/sh/mm/clear_page.S 24 Apr 2002 10:11:16 -0000 @@ -67,25 +67,25 @@ add #31, r2 and r1, r2 cmp/eq r4, r2 - bt area1 + bt .Larea1 mov r2, r3 sub r4, r3 mov r3, r7 mov r4, r2 ! -l0: dt r3 +.L0: dt r3 0: mov.b r0, @r2 - bf/s l0 + bf/s .L0 add #1, r2 ! 
sub r7, r5 mov r2, r4 -area1: +.Larea1: mov r4, r3 add r5, r3 and r1, r3 cmp/hi r2, r3 - bf area2 + bf .Larea2 ! ! Clear area 1 #if defined(__SH4__) @@ -113,22 +113,22 @@ nop ! ! Clear area 2 -area2: +.Larea2: mov r4, r3 add r5, r3 cmp/hs r2, r3 - bt/s done + bt/s .Ldone sub r2, r3 -l2: dt r3 +.L2: dt r3 9: mov.b r0, @r2 - bf/s l2 + bf/s .L2 add #1, r2 ! -done: rts +.Ldone: rts mov #0, r0 ! return 0 as normal return ! return the number of bytes remained -bad_clear_user: +.Lbad_clear_user: mov r4, r0 add r5, r0 rts @@ -136,16 +136,16 @@ .section __ex_table,"a" .align 2 - .long 0b, bad_clear_user - .long 1b, bad_clear_user - .long 2b, bad_clear_user - .long 3b, bad_clear_user - .long 4b, bad_clear_user - .long 5b, bad_clear_user - .long 6b, bad_clear_user - .long 7b, bad_clear_user - .long 8b, bad_clear_user - .long 9b, bad_clear_user + .long 0b, .Lbad_clear_user + .long 1b, .Lbad_clear_user + .long 2b, .Lbad_clear_user + .long 3b, .Lbad_clear_user + .long 4b, .Lbad_clear_user + .long 5b, .Lbad_clear_user + .long 6b, .Lbad_clear_user + .long 7b, .Lbad_clear_user + .long 8b, .Lbad_clear_user + .long 9b, .Lbad_clear_user .previous #if defined(__SH4__) @@ -191,13 +191,27 @@ .L4096: .word 4096 ENTRY(__flush_cache_4096) - .rept 128 - mov.l @r4,r0 - and r6,r0 - cmp/eq r5,r0 - bf 1f - mov.l r7,@r4 -1: add #32,r4 + mov.l 1f,r3 + add r6,r3 + mov r4,r0 + mov #64,r2 + shll r2 + mov #64,r6 + jmp @r3 + mov #96,r7 + .align 2 +1: .long 2f +2: + .rept 32 + mov.l r5,@r0 + add #32,r5 + mov.l r5,@(32,r0) + add #32,r5 + mov.l r5,@(r0,r6) + add #32,r5 + mov.l r5,@(r0,r7) + add #32,r5 + add r2,r0 .endr nop nop @@ -209,32 +223,72 @@ rts nop -#if defined(CONFIG_CPU_SUBTYPE_SH7750) -ENTRY(__flush_cache_4096_all) - mov r4,r5 - mov r4,r6 - mov r4,r7 - add #32,r5 - add #-64,r6 - add #-32,r7 - .rept 32 - mov.l r0,@r4 - add r1,r6 - mov.l r0,@r5 - add r1,r7 - mov.l r0,@r6 - add r1,r4 - mov.l r0,@r7 - add r1,r5 - .endr - nop - nop - nop - nop - nop - nop - nop +ENTRY(__flush_dcache_all) + mov.l 
2f,r0 + mov.l 3f,r4 + and r0,r4 ! r4 = (unsigned long)&empty_zero_page[0] & ~0xffffc000 + stc sr,r1 ! save SR + mov.l 4f,r2 + or r1,r2 + mov #32,r3 + shll2 r3 +1: + ldc r2,sr ! set BL bit + movca.l r0,@r4 + ocbi @r4 + add #32,r4 + movca.l r0,@r4 + ocbi @r4 + add #32,r4 + movca.l r0,@r4 + ocbi @r4 + add #32,r4 + movca.l r0,@r4 + ocbi @r4 + ldc r1,sr ! restore SR + dt r3 + bf/s 1b + add #32,r4 + rts nop -#endif + .align 2 +2: .long 0xffffc000 +3: .long SYMBOL_NAME(empty_zero_page) +4: .long 0x10000000 ! BL bit + +/* flush_cache_4096_all(unsigned long addr) */ +ENTRY(flush_cache_4096_all) + mov.l 2f,r0 + mov.l 3f,r2 + and r0,r2 + or r2,r4 ! r4 = addr | (unsigned long)&empty_zero_page[0] & ~0x3fff + stc sr,r1 ! save SR + mov.l 4f,r2 + or r1,r2 + mov #32,r3 +1: + ldc r2,sr ! set BL bit + movca.l r0,@r4 + ocbi @r4 + add #32,r4 + movca.l r0,@r4 + ocbi @r4 + add #32,r4 + movca.l r0,@r4 + ocbi @r4 + add #32,r4 + movca.l r0,@r4 + ocbi @r4 + ldc r1,sr ! restore SR + dt r3 + bf/s 1b + add #32,r4 + + rts + nop + .align 2 +2: .long 0xffffc000 +3: .long SYMBOL_NAME(empty_zero_page) +4: .long 0x10000000 ! BL bit #endif Index: arch/sh/mm/copy_page.S =================================================================== RCS file: /cvsroot/linuxsh/linux/arch/sh/mm/copy_page.S,v retrieving revision 1.1.1.1.2.4 diff -u -r1.1.1.1.2.4 copy_page.S --- arch/sh/mm/copy_page.S 17 Apr 2002 04:14:03 -0000 1.1.1.1.2.4 +++ arch/sh/mm/copy_page.S 24 Apr 2002 10:11:16 -0000 @@ -153,50 +153,50 @@ mov #12,r0 ! Check if small number of bytes cmp/gt r0,r6 bt 2f - bra L_cleanup_loop + bra .L_cleanup_loop nop 2: neg r5,r0 ! Calculate bytes needed to align source add #4,r0 and #3,r0 tst r0,r0 - bt L_jump + bt .L_jump mov r0,r1 -L_loop1: +.L_loop1: ! Copy bytes to align source EX( mov.b @r5+,r0 ) dt r1 EX( mov.b r0,@r4 ) add #-1,r6 - bf/s L_loop1 + bf/s .L_loop1 add #1,r4 -L_jump: +.L_jump: mov r6,r2 ! Calculate number of longwords to copy shlr2 r2 tst r2,r2 - bt L_cleanup + bt .L_cleanup mov r4,r0 ! 
Jump to appropriate routine and #3,r0 mov r0,r1 shll2 r1 - mova L_jump_tbl,r0 + mova .L_jump_tbl,r0 mov.l @(r0,r1),r1 jmp @r1 nop .align 2 -L_jump_tbl: - .long L_dest00 - .long L_dest01 - .long L_dest10 - .long L_dest11 +.L_jump_tbl: + .long .L_dest00 + .long .L_dest01 + .long .L_dest10 + .long .L_dest11 ! Destination = 00 -L_dest00: +.L_dest00: mov r2,r7 shlr2 r7 shlr r7 @@ -226,7 +226,7 @@ bf/s 2b add #32,r4 tst r2,r2 - bt L_cleanup + bt .L_cleanup 1: mov.l @r5+,r0 dt r2 @@ -234,12 +234,12 @@ bf/s 1b add #4,r4 - bra L_cleanup + bra .L_cleanup nop ! Destination = 10 -L_dest10: +.L_dest10: mov r2,r7 shlr2 r7 shlr r7 @@ -314,7 +314,7 @@ add #34,r4 #endif tst r2,r2 - bt L_cleanup + bt .L_cleanup 1: ! Read longword, write two words per iteration EX( mov.l @r5+,r0 ) @@ -331,13 +331,13 @@ bf/s 1b add #4,r4 - bra L_cleanup + bra .L_cleanup nop ! Destination = 01 or 11 -L_dest01: -L_dest11: +.L_dest01: +.L_dest11: ! Read longword, write byte, word, byte per iteration EX( mov.l @r5+,r0 ) dt r2 @@ -348,7 +348,7 @@ EX( mov.w r0,@r4 ) shlr16 r0 EX( mov.b r0,@(2,r4) ) - bf/s L_dest01 + bf/s .L_dest01 add #3,r4 #else EX( mov.b r0,@(3,r4) ) @@ -357,26 +357,26 @@ EX( mov.b r7,@r4 ) add #1,r4 EX( mov.w r0,@r4 ) - bf/s L_dest01 + bf/s .L_dest01 add #3,r4 #endif ! Cleanup last few bytes -L_cleanup: +.L_cleanup: mov r6,r0 and #3,r0 tst r0,r0 - bt L_exit + bt .L_exit mov r0,r6 -L_cleanup_loop: +.L_cleanup_loop: EX( mov.b @r5+,r0 ) dt r6 EX( mov.b r0,@r4 ) - bf/s L_cleanup_loop + bf/s .L_cleanup_loop add #1,r4 -L_exit: +.L_exit: mov #0,r0 ! normal return 5000: ---- SUGIOKA Toshinobu |