From: NIIBE Y. <gn...@m1...> - 2002-03-27 05:56:00
|
Optimized the flushing. There is a requirement in SH4 to control cache from P2 area. P2 area is the area where cache is not used at all. This means, the cycles for instruction could be quite large. In other word, if we could save number of instructions, we see performance improvement. I've done: Four --> Three instruction fetch per single cache line flush. 2002-03-27 NIIBE Yutaka <gn...@m1...> * arch/sh/mm/copy_page.S: File merged with __copy_user_page-sh4.S. * arch/sh/mm/clear_page.S: File merged with __clear_user.S. (__flush_cache_4096): New function. * arch/sh/mm/Makefile: Remove __copy_user_page-sh4.S and __clear_user.S. * arch/sh/mm/cache-sh4.c (__flush_icache_page): Removed. (flush_cache_4096): New function. 2002-03-27 NIIBE Yutaka <gn...@m1...> * arch/sh/mm/cache-sh4.c (__flush_cache_page): New function. (flush_cache_range, flush_cache_page, flush_icache_user_range): Use __flush_cache_page. Index: arch/sh/mm/Makefile =================================================================== RCS file: /cvsroot/linuxsh/linux/arch/sh/mm/Makefile,v retrieving revision 1.2 diff -u -3 -p -r1.2 Makefile --- arch/sh/mm/Makefile 26 Mar 2002 01:56:39 -0000 1.2 +++ arch/sh/mm/Makefile 27 Mar 2002 05:15:07 -0000 @@ -10,8 +10,8 @@ O_TARGET := mm.o obj-y := init.o fault.o extable.o clear_page.o copy_page.o -obj-$(CONFIG_CPU_SH3) += cache-sh3.o __clear_user.o -obj-$(CONFIG_CPU_SH4) += cache-sh4.o __clear_user.o __copy_user_page-sh4.o ioremap.o +obj-$(CONFIG_CPU_SH3) += cache-sh3.o +obj-$(CONFIG_CPU_SH4) += cache-sh4.o ioremap.o USE_STANDARD_AS_RULE := true Index: arch/sh/mm/__clear_user.S =================================================================== RCS file: arch/sh/mm/__clear_user.S diff -N arch/sh/mm/__clear_user.S --- arch/sh/mm/__clear_user.S 26 Mar 2002 01:56:39 -0000 1.1 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,138 +0,0 @@ -/* $Id: __clear_user.S,v 1.1 2002/03/26 01:56:39 gniibe Exp $ - * - * __clear_user_page implementation of SuperH - * - * Copyright (C) 2001, 2002 Niibe Yutaka & Kaz Kojima - * - */ - -#if defined(__SH4__) -/* - * __clear_user_page - * @to: P1 address (with same color) - * @orig_to: P1 address - * - * void __clear_user_page(void *to, void *orig_to) - */ - -/* - * r0 --- scratch - * r4 --- to - * r5 --- orig_to - * r6 --- to + 4096 - */ -#include <linux/linkage.h> -ENTRY(__clear_user_page) - mov.w .L4096,r0 - mov r4,r6 - add r0,r6 - mov #0,r0 - ! -1: ocbi @r5 - add #32,r5 - movca.l r0,@r4 - mov r4,r1 - add #32,r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - mov.l r0,@-r4 - add #28,r4 - cmp/eq r6,r4 - bf/s 1b - ocbwb @r1 - ! - rts - nop -.L4096: .word 4096 -#endif - -ENTRY(__clear_user) - ! - mov #0, r0 - mov #0xe0, r1 ! 0xffffffe0 - ! - ! r4..r4&~32 -------- not aligned [ Area 0 ] - ! r4&~32..(r4+r5)&~32 -------- aligned [ Area 1 ] - ! (r4+r5)&~32..r4+r5 -------- not aligned [ Area 2 ] - ! - ! Clear area 0 - mov r4, r2 - and r1, r2 - cmp/eq r4, r2 - bt/s area1 - mov r4, r3 - sub r2, r3 - mov r4, r2 - ! -l0: dt r3 -0: mov.b r0, @r2 - bf/s l0 - add #1, r2 - ! - mov r4, r3 - add r5, r3 - and r1, r3 - ! - ! Clear area 1 -area1: -#if defined(__SH4__) -1: movca.l r0, @r2 -#else -1: mov.l r0, @r2 -#endif - add #4, r2 -2: mov.l r0, @r2 - add #4, r2 -3: mov.l r0, @r2 - add #4, r2 -4: mov.l r0, @r2 - add #4, r2 -5: mov.l r0, @r2 - add #4, r2 -6: mov.l r0, @r2 - add #4, r2 -7: mov.l r0, @r2 - add #4, r2 -8: mov.l r0, @r2 - add #4, r2 - cmp/hi r2, r3 - bt/s 1b - nop - ! - ! Clear area 2 - add r5, r4 - cmp/eq r4, r2 - bt/s done - sub r2, r4 -l2: dt r4 -9: mov.b r0, @r2 - bf/s l2 - add #1, r2 - ! -done: rts - nop ! return 0 as normal return - - ! return the number of bytes remained -bad_clear_user: - mov r4, r0 - mov r5, r0 - rts - sub r2, r0 - -.section __ex_table,"a" - .align 2 - .long 0b, bad_clear_user - .long 1b, bad_clear_user - .long 2b, bad_clear_user - .long 3b, bad_clear_user - .long 4b, bad_clear_user - .long 5b, bad_clear_user - .long 6b, bad_clear_user - .long 7b, bad_clear_user - .long 8b, bad_clear_user - .long 9b, bad_clear_user -.previous Index: arch/sh/mm/__copy_user_page-sh4.S =================================================================== RCS file: arch/sh/mm/__copy_user_page-sh4.S diff -N arch/sh/mm/__copy_user_page-sh4.S --- arch/sh/mm/__copy_user_page-sh4.S 15 Oct 2001 20:44:53 -0000 1.1.1.1 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,69 +0,0 @@ -/* $Id: __copy_user_page-sh4.S,v 1.1.1.1 2001/10/15 20:44:53 mrbrown Exp $ - * - * __copy_user_page implementation of SuperH - * - * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima - * - */ - -/* - * __copy_user_page - * @to: P1 address (with same color) - * @from: P1 address - * @orig_to: P1 address - * - * void __copy_user_page(void *to, void *from, void *orig_to) - */ - -/* - * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch - * r8 --- from + 4096 - * r9 --- orig_to - * r10 --- to - * r11 --- from - */ -#include <linux/linkage.h> -ENTRY(__copy_user_page) - mov.l r8,@-r15 - mov.l r9,@-r15 - mov.l r10,@-r15 - mov.l r11,@-r15 - mov r4,r10 - mov r5,r11 - mov r6,r9 - mov r5,r8 - mov.w .L4096,r0 - add r0,r8 - ! -1: ocbi @r9 - add #32,r9 - mov.l @r11+,r0 - mov.l @r11+,r1 - mov.l @r11+,r2 - mov.l @r11+,r3 - mov.l @r11+,r4 - mov.l @r11+,r5 - mov.l @r11+,r6 - mov.l @r11+,r7 - movca.l r0,@r10 - mov r10,r0 - add #32,r10 - mov.l r7,@-r10 - mov.l r6,@-r10 - mov.l r5,@-r10 - mov.l r4,@-r10 - mov.l r3,@-r10 - mov.l r2,@-r10 - mov.l r1,@-r10 - ocbwb @r0 - cmp/eq r11,r8 - bf/s 1b - add #28,r10 - ! - mov.l @r15+,r11 - mov.l @r15+,r10 - mov.l @r15+,r9 - mov.l @r15+,r8 - rts - nop -.L4096: .word 4096 Index: arch/sh/mm/cache-sh4.c =================================================================== RCS file: /cvsroot/linuxsh/linux/arch/sh/mm/cache-sh4.c,v retrieving revision 1.7 diff -u -3 -p -r1.7 cache-sh4.c --- arch/sh/mm/cache-sh4.c 27 Mar 2002 00:07:19 -0000 1.7 +++ arch/sh/mm/cache-sh4.c 27 Mar 2002 05:15:07 -0000 @@ -208,54 +208,40 @@ void flush_cache_sigtramp(unsigned long restore_flags(flags); } +static inline void flush_cache_4096(unsigned long start, + unsigned long phys) +{ + register unsigned long addr __asm__ ("r4"); + register unsigned long data __asm__ ("r0"); + register unsigned long __r5 __asm__ ("r5") = phys; + register unsigned long __r6 __asm__ ("r6") = (0x1ffff000|CACHE_VALID); + register unsigned long __r7 __asm__ ("r7") = 0; + extern void __flush_cache_4096(unsigned long, unsigned long); + + asm volatile("jsr @%1; nop" + : "=r" (addr), "=r" (data) + : "0" (start), "1" (__flush_cache_4096 + 0x20000000), + "r" (__r5), "r" (__r6), "r" (__r7) + : "pr"); +} + /* * Writeback&Invalidate the D-cache of the page */ static void __flush_dcache_page(unsigned long phys) { - unsigned long addr, data; unsigned long flags; phys |= CACHE_VALID; save_and_cli(flags); - jump_to_P2(); /* Loop all the D-cache */ - for (addr = CACHE_OC_ADDRESS_ARRAY; - addr < (CACHE_OC_ADDRESS_ARRAY - +(CACHE_OC_NUM_ENTRIES<< CACHE_OC_ENTRY_SHIFT)); - addr += (1<<CACHE_OC_ENTRY_SHIFT)) { - data = ctrl_inl(addr)&(0x1ffff000|CACHE_VALID); - if (data == phys) - ctrl_outl(0, addr); - } - - back_to_P1(); - restore_flags(flags); -} - -static void __flush_icache_page(unsigned long phys) -{ - unsigned long addr, data; - unsigned long flags; - - phys |= CACHE_VALID; - - save_and_cli(flags); - jump_to_P2(); + flush_cache_4096(CACHE_OC_ADDRESS_ARRAY, phys); + flush_cache_4096(CACHE_OC_ADDRESS_ARRAY | 0x1000, phys); + flush_cache_4096(CACHE_OC_ADDRESS_ARRAY | 0x2000, phys); + flush_cache_4096(CACHE_OC_ADDRESS_ARRAY | 0x3000, phys); - /* Loop all the I-cache */ - for (addr = CACHE_IC_ADDRESS_ARRAY; - addr < (CACHE_IC_ADDRESS_ARRAY - +(CACHE_IC_NUM_ENTRIES<< CACHE_IC_ENTRY_SHIFT)); - addr += (1<<CACHE_IC_ENTRY_SHIFT)) { - data = ctrl_inl(addr)&(0x1ffff000|CACHE_VALID); - if (data == phys) - ctrl_outl(0, addr); - } - - back_to_P1(); restore_flags(flags); } @@ -307,6 +293,36 @@ void flush_cache_mm(struct mm_struct *mm flush_cache_all(); } +static void __flush_cache_page(struct vm_area_struct *vma, + unsigned long address, + unsigned long phys) +{ + unsigned long flags; + + phys |= CACHE_VALID; + save_and_cli(flags); + + /* We only need to flush D-cache when we have alias */ + if ((address^phys) & CACHE_ALIAS) { + /* Loop 4K of the D-cache */ + flush_cache_4096( + CACHE_OC_ADDRESS_ARRAY | (address & CACHE_ALIAS), + phys); + /* Loop another 4K of the D-cache */ + flush_cache_4096( + CACHE_OC_ADDRESS_ARRAY | (phys & CACHE_ALIAS), + phys); + } + + if (vma->vm_flags & VM_EXEC) + /* Loop 4K (half) of the I-cache */ + flush_cache_4096( + CACHE_IC_ADDRESS_ARRAY | (address & 0x1000), + phys); + + restore_flags(flags); +} + /* * Write back and invalidate D-caches. * @@ -319,31 +335,35 @@ void flush_cache_mm(struct mm_struct *mm void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - unsigned long flags; - struct mm_struct *mm = vma->vm_mm; + unsigned long p = start & PAGE_MASK; + pgd_t *dir; + pmd_t *pmd; + pte_t *pte; + pte_t entry; + unsigned long phys; - start &= PAGE_MASK; - - save_and_cli(flags); - for (; start < end; start += PAGE_SIZE) { - pgd_t *pgd = pgd_offset(mm, start); - pmd_t *pmd = pmd_offset(pgd, start); - pte_t *pte; - unsigned long phys; + dir = pgd_offset(vma->vm_mm, p); + pmd = pmd_offset(dir, p); + do { if (pmd_none(*pmd) || pmd_bad(*pmd)) { - start &= ~((1 << PMD_SHIFT) -1); - start += (1 << PMD_SHIFT); + p &= ~((1 << PMD_SHIFT) -1); + p += (1 << PMD_SHIFT); + pmd++; continue; } - pte = pte_offset_kernel(pmd, start); - phys = pte_val(*pte)&PTE_PHYS_MASK; - if (pte_val(*pte) & _PAGE_PRESENT) { - __flush_icache_page(phys); - __flush_dcache_page(phys); - } - } - restore_flags(flags); + pte = pte_offset_kernel(pmd, p); + do { + entry = *pte; + if ((pte_val(entry) & _PAGE_PRESENT)) { + phys = pte_val(entry)&PTE_PHYS_MASK; + __flush_cache_page(vma, p, phys); + } + pte++; + p += PAGE_SIZE; + } while (p < end && (unsigned long)pte & PAGE_MASK); + pmd++; + } while (p < end); } /* @@ -357,8 +377,7 @@ void flush_cache_page(struct vm_area_str pmd_t *pmd; pte_t *pte; pte_t entry; - unsigned long phys, addr, data; - unsigned long flags; + unsigned long phys; dir = pgd_offset(vma->vm_mm, address); pmd = pmd_offset(dir, address); @@ -366,49 +385,11 @@ void flush_cache_page(struct vm_area_str return; pte = pte_offset_kernel(pmd, address); entry = *pte; - if (pte_none(entry) || !pte_present(entry)) + if (!(pte_val(entry) & _PAGE_PRESENT)) return; phys = pte_val(entry)&PTE_PHYS_MASK; - - phys |= CACHE_VALID; - save_and_cli(flags); - jump_to_P2(); - - /* We only need to flush D-cache when we have alias */ - if ((address^phys) & CACHE_ALIAS) { - /* Loop 4K of the D-cache */ - for (addr = CACHE_OC_ADDRESS_ARRAY | (address & CACHE_ALIAS); - addr < (CACHE_OC_ADDRESS_ARRAY + (address & CACHE_ALIAS) - +(CACHE_OC_NUM_ENTRIES/4<<CACHE_OC_ENTRY_SHIFT)); - addr += (1<<CACHE_OC_ENTRY_SHIFT)) { - data = ctrl_inl(addr)&(0x1ffff000|CACHE_VALID); - if (data == phys) - ctrl_outl(0, addr); - } - /* Loop another 4K of the D-cache */ - for (addr = CACHE_OC_ADDRESS_ARRAY | (phys & CACHE_ALIAS); - addr < (CACHE_OC_ADDRESS_ARRAY + (phys & CACHE_ALIAS) - +(CACHE_OC_NUM_ENTRIES/4<<CACHE_OC_ENTRY_SHIFT)); - addr += (1<<CACHE_OC_ENTRY_SHIFT)) { - data = ctrl_inl(addr)&(0x1ffff000|CACHE_VALID); - if (data == phys) - ctrl_outl(0, addr); - } - } - - if (vma->vm_flags & VM_EXEC) - /* Loop 4K of the I-cache */ - for (addr = CACHE_IC_ADDRESS_ARRAY|(address&0x1000); - addr < ((CACHE_IC_ADDRESS_ARRAY|(address&0x1000)) - +(CACHE_IC_NUM_ENTRIES/2<<CACHE_IC_ENTRY_SHIFT)); - addr += (1<<CACHE_IC_ENTRY_SHIFT)) { - data = ctrl_inl(addr)&(0x1ffff000|CACHE_VALID); - if (data == phys) - ctrl_outl(0, addr); - } - back_to_P1(); - restore_flags(flags); + __flush_cache_page(vma, address, phys); } /* @@ -421,10 +402,7 @@ void flush_cache_page(struct vm_area_str void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { - if (test_bit(PG_mapped, &page->flags)) { - __flush_icache_page(PHYSADDR(page_address(page))); - __flush_dcache_page(PHYSADDR(page_address(page))); - } + __flush_cache_page(vma, addr, PHYSADDR(page_address(page))); } /* Index: arch/sh/mm/clear_page.S =================================================================== RCS file: /cvsroot/linuxsh/linux/arch/sh/mm/clear_page.S,v retrieving revision 1.1.1.1 diff -u -3 -p -r1.1.1.1 clear_page.S --- arch/sh/mm/clear_page.S 15 Oct 2001 20:44:53 -0000 1.1.1.1 +++ arch/sh/mm/clear_page.S 27 Mar 2002 05:15:07 -0000 @@ -1,10 +1,11 @@ /* $Id: clear_page.S,v 1.1.1.1 2001/10/15 20:44:53 mrbrown Exp $ * - * clear_page implementation of SuperH + * __clear_user_page, __clear_user, clear_page implementation of SuperH * - * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima + * Copyright (C) 2001, 2002 Niibe Yutaka & Kaz Kojima * */ +#include <linux/linkage.h> /* * clear_page @@ -18,7 +19,6 @@ * r4 --- to * r5 --- to + 4096 */ -#include <linux/linkage.h> ENTRY(clear_page) mov r4,r5 mov.w .Llimit,r0 @@ -50,3 +50,160 @@ ENTRY(clear_page) rts nop .Llimit: .word (4096-28) + +ENTRY(__clear_user) + ! + mov #0, r0 + mov #0xe0, r1 ! 0xffffffe0 + ! + ! r4..r4&~32 -------- not aligned [ Area 0 ] + ! r4&~32..(r4+r5)&~32 -------- aligned [ Area 1 ] + ! (r4+r5)&~32..r4+r5 -------- not aligned [ Area 2 ] + ! + ! Clear area 0 + mov r4, r2 + and r1, r2 + cmp/eq r4, r2 + bt/s area1 + mov r4, r3 + sub r2, r3 + mov r4, r2 + ! +l0: dt r3 +0: mov.b r0, @r2 + bf/s l0 + add #1, r2 + ! + mov r4, r3 + add r5, r3 + and r1, r3 + ! + ! Clear area 1 +area1: +#if defined(__SH4__) +1: movca.l r0, @r2 +#else +1: mov.l r0, @r2 +#endif + add #4, r2 +2: mov.l r0, @r2 + add #4, r2 +3: mov.l r0, @r2 + add #4, r2 +4: mov.l r0, @r2 + add #4, r2 +5: mov.l r0, @r2 + add #4, r2 +6: mov.l r0, @r2 + add #4, r2 +7: mov.l r0, @r2 + add #4, r2 +8: mov.l r0, @r2 + add #4, r2 + cmp/hi r2, r3 + bt/s 1b + nop + ! + ! Clear area 2 + add r5, r4 + cmp/eq r4, r2 + bt/s done + sub r2, r4 +l2: dt r4 +9: mov.b r0, @r2 + bf/s l2 + add #1, r2 + ! +done: rts + nop ! return 0 as normal return + + ! return the number of bytes remained +bad_clear_user: + mov r4, r0 + mov r5, r0 + rts + sub r2, r0 + +.section __ex_table,"a" + .align 2 + .long 0b, bad_clear_user + .long 1b, bad_clear_user + .long 2b, bad_clear_user + .long 3b, bad_clear_user + .long 4b, bad_clear_user + .long 5b, bad_clear_user + .long 6b, bad_clear_user + .long 7b, bad_clear_user + .long 8b, bad_clear_user + .long 9b, bad_clear_user +.previous + +#if defined(__SH4__) +/* + * __clear_user_page + * @to: P1 address (with same color) + * @orig_to: P1 address + * + * void __clear_user_page(void *to, void *orig_to) + */ + +/* + * r0 --- scratch + * r4 --- to + * r5 --- orig_to + * r6 --- to + 4096 + */ +ENTRY(__clear_user_page) + mov.w .L4096,r0 + mov r4,r6 + add r0,r6 + mov #0,r0 + ! +1: ocbi @r5 + add #32,r5 + movca.l r0,@r4 + mov r4,r1 + add #32,r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + mov.l r0,@-r4 + add #28,r4 + cmp/eq r6,r4 + bf/s 1b + ocbwb @r1 + ! + rts + nop +.L4096: .word 4096 + +/************* + unsigned long addr, data; + for (addr = start; addr < start + 4096; addr += 32) { + data = ctrl_inl(addr)&(0x1ffff000|CACHE_VALID); + if (data == phys) + ctrl_outl(0, addr); + } +*************/ +ENTRY(__flush_cache_4096) + .rept 128 + mov.l @r4,r0 + and r6,r0 + cmp/eq r5,r0 + bf 1f + mov.l r7,@r4 +1: add #32,r4 + .endr + nop + nop + nop + nop + nop + nop + nop + rts + nop +#endif Index: arch/sh/mm/copy_page.S =================================================================== RCS file: /cvsroot/linuxsh/linux/arch/sh/mm/copy_page.S,v retrieving revision 1.1.1.1 diff -u -3 -p -r1.1.1.1 copy_page.S --- arch/sh/mm/copy_page.S 15 Oct 2001 20:44:53 -0000 1.1.1.1 +++ arch/sh/mm/copy_page.S 27 Mar 2002 05:15:07 -0000 @@ -1,10 +1,11 @@ /* $Id: copy_page.S,v 1.1.1.1 2001/10/15 20:44:53 mrbrown Exp $ * - * copy_page implementation of SuperH + * copy_page, __copy_user_page implementation of SuperH * * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima * */ +#include <linux/linkage.h> /* * copy_page @@ -21,7 +22,6 @@ * r10 --- to * r11 --- from */ -#include <linux/linkage.h> ENTRY(copy_page) mov.l r8,@-r15 mov.l r10,@-r15 @@ -66,4 +66,67 @@ ENTRY(copy_page) mov.l @r15+,r8 rts nop + +#if defined(__SH4__) +/* + * __copy_user_page + * @to: P1 address (with same color) + * @from: P1 address + * @orig_to: P1 address + * + * void __copy_user_page(void *to, void *from, void *orig_to) + */ + +/* + * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch + * r8 --- from + 4096 + * r9 --- orig_to + * r10 --- to + * r11 --- from + */ +#include <linux/linkage.h> +ENTRY(__copy_user_page) + mov.l r8,@-r15 + mov.l r9,@-r15 + mov.l r10,@-r15 + mov.l r11,@-r15 + mov r4,r10 + mov r5,r11 + mov r6,r9 + mov r5,r8 + mov.w .L4096,r0 + add r0,r8 + ! +1: ocbi @r9 + add #32,r9 + mov.l @r11+,r0 + mov.l @r11+,r1 + mov.l @r11+,r2 + mov.l @r11+,r3 + mov.l @r11+,r4 + mov.l @r11+,r5 + mov.l @r11+,r6 + mov.l @r11+,r7 + movca.l r0,@r10 + mov r10,r0 + add #32,r10 + mov.l r7,@-r10 + mov.l r6,@-r10 + mov.l r5,@-r10 + mov.l r4,@-r10 + mov.l r3,@-r10 + mov.l r2,@-r10 + mov.l r1,@-r10 + ocbwb @r0 + cmp/eq r11,r8 + bf/s 1b + add #28,r10 + ! + mov.l @r15+,r11 + mov.l @r15+,r10 + mov.l @r15+,r9 + mov.l @r15+,r8 + rts + nop +#endif .L4096: .word 4096 |