From: Andy P. <at...@us...> - 2002-04-09 17:08:20
Update of /cvsroot/linux-vax/kernel-2.4/arch/s390x/mm
In directory usw-pr-cvs1:/tmp/cvs-serv13825/s390x/mm

Modified Files:
	extable.c fault.c init.c ioremap.c
Log Message:
synch 2.4.15 commit 29

Index: extable.c
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390x/mm/extable.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- extable.c	25 Feb 2001 23:15:23 -0000	1.1.1.1
+++ extable.c	9 Apr 2002 17:03:18 -0000	1.2
@@ -10,6 +10,7 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/spinlock.h>
 #include <asm/uaccess.h>
 
 extern const struct exception_table_entry __start___ex_table[];
@@ -36,26 +37,32 @@
 	return 0;
 }
 
+extern spinlock_t modlist_lock;
+
 unsigned long
 search_exception_table(unsigned long addr)
 {
-	unsigned long ret;
+	unsigned long ret = 0;
+	unsigned long flags;
 
 #ifndef CONFIG_MODULES
 	/* There is only the kernel to search.  */
 	ret = search_one_table(__start___ex_table,
 			       __stop___ex_table-1, addr);
-	if (ret) return FIX_PSW(ret);
+	return ret;
 #else
 	/* The kernel is the last "module" -- no need to treat it special. */
 	struct module *mp;
+
+	spin_lock_irqsave(&modlist_lock, flags);
 	for (mp = module_list; mp != NULL; mp = mp->next) {
-		if (mp->ex_table_start == NULL)
+		if (mp->ex_table_start == NULL || !(mp->flags&(MOD_RUNNING|MOD_INITIALIZING)))
			continue;
 		ret = search_one_table(mp->ex_table_start,
 				       mp->ex_table_end - 1, addr);
-		if (ret) return FIX_PSW(ret);
+		if (ret)
+			break;
 	}
+	spin_unlock_irqrestore(&modlist_lock, flags);
+	return ret;
 #endif
-
-	return 0;
 }
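
A note on the extable.c hunk: taking modlist_lock with interrupts disabled matters
because a fixup lookup for a fault in module code can race with module unload, and
the walk now also skips modules that are neither MOD_RUNNING nor MOD_INITIALIZING.
search_one_table() itself (unchanged here) is a plain binary search over the
address-sorted __ex_table section. For readers following along, here is a minimal
sketch of that search; the two-field entry layout is the usual 2.4 shape, but treat
the details as illustrative rather than a quote of the s390x headers:

/* Illustrative sketch of search_one_table(): binary search over a
 * table sorted by faulting-instruction address. */
struct exception_table_entry {
	unsigned long insn;	/* address of the faulting instruction */
	unsigned long fixup;	/* address of the fixup code to resume at */
};

static unsigned long
search_one_table(const struct exception_table_entry *first,
		 const struct exception_table_entry *last,
		 unsigned long value)
{
	while (first <= last) {
		const struct exception_table_entry *mid;

		mid = first + ((last - first) >> 1);
		if (mid->insn == value)
			return mid->fixup;	/* resume at the fixup */
		if (mid->insn < value)
			first = mid + 1;
		else
			last = mid - 1;
	}
	return 0;				/* no fixup: a real oops */
}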
Index: fault.c
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390x/mm/fault.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- fault.c	25 Feb 2001 23:15:23 -0000	1.1.1.1
+++ fault.c	9 Apr 2002 17:03:18 -0000	1.2
@@ -4,6 +4,7 @@
  *  S390 version
  *    Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation
  *    Author(s): Hartmut Penner (hp...@de...)
+ *               Ulrich Weigand (uwe...@de...)
  *
  *  Derived from "arch/i386/mm/fault.c"
  *    Copyright (C) 1995  Linus Torvalds
@@ -21,6 +22,8 @@
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/console.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -32,6 +35,34 @@
 #endif
 
 extern void die(const char *,struct pt_regs *,long);
+static void force_sigsegv(struct task_struct *tsk, int code, void *address);
+
+extern spinlock_t timerlist_lock;
+
+/*
+ * Unlock any spinlocks which will prevent us from getting the
+ * message out (timerlist_lock is acquired through the
+ * console unblank code)
+ */
+void bust_spinlocks(int yes)
+{
+	spin_lock_init(&timerlist_lock);
+	if (yes) {
+		oops_in_progress = 1;
+	} else {
+		int loglevel_save = console_loglevel;
+		oops_in_progress = 0;
+		console_unblank();
+		/*
+		 * OK, the message is on the console.  Now we call printk()
+		 * without oops_in_progress set so that printk will give klogd
+		 * a poke.  Hold onto your hats...
+		 */
+		console_loglevel = 15;
+		printk(" ");
+		console_loglevel = loglevel_save;
+	}
+}
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -52,18 +83,31 @@
 	unsigned long address;
 	unsigned long fixup;
 	int write;
-	unsigned long psw_mask;
-	unsigned long psw_addr;
 	int si_code = SEGV_MAPERR;
 	int kernel_address = 0;
 
-	/*
-	 * get psw mask of Program old psw to find out,
-	 * if user or kernel mode
-	 */
+	tsk = current;
+	mm = tsk->mm;
+
+	/*
+	 * Check for low-address protection.  This needs to be treated
+	 * as a special case because the translation exception code
+	 * field is not guaranteed to contain valid data in this case.
+	 */
+	if ((error_code & 0xff) == 4 && !(S390_lowcore.trans_exc_code & 4)) {
 
-	psw_mask = S390_lowcore.program_old_psw.mask;
-	psw_addr = S390_lowcore.program_old_psw.addr;
+		/* Low-address protection hit in kernel mode means
+		   NULL pointer write access in kernel mode.  */
+		if (!(regs->psw.mask & PSW_PROBLEM_STATE)) {
+			address = 0;
+			kernel_address = 1;
+			goto no_context;
+		}
+
+		/* Low-address protection hit in user mode 'cannot happen'.  */
+		die ("Low-address protection", regs, error_code);
+		do_exit(SIGKILL);
+	}
 
 	/*
 	 * get the failing address
@@ -73,11 +117,6 @@
 
 	address = S390_lowcore.trans_exc_code&-4096L;
 
-	tsk = current;
-	mm = tsk->mm;
-
-	if (in_interrupt() || !mm)
-		goto no_context;
 
 	/*
 	 * Check which address space the address belongs to
@@ -108,6 +147,7 @@
 			}
 		}
 		die("page fault via unknown access register", regs, error_code);
+		do_exit(SIGKILL);
 		break;
 
 	case 2: /* Secondary Segment Table Descriptor */
@@ -116,19 +156,25 @@
 		break;
 	}
 
+	/*
+	 * Check whether we have a user MM in the first place.
+	 */
+	if (in_interrupt() || !mm || !(regs->psw.mask & _PSW_IO_MASK_BIT))
+		goto no_context;
 
 	/*
 	 * When we get here, the fault happened in the current
-	 * task's user address space, so we search the VMAs
+	 * task's user address space, so we can switch on the
+	 * interrupts again and then search the VMAs
 	 */
 
-	down(&mm->mmap_sem);
+	__sti();
+
+	down_read(&mm->mmap_sem);
 
 	vma = find_vma(mm, address);
-	if (!vma) {
-		printk("no vma for address %lX\n",address);
+	if (!vma)
 		goto bad_area;
-	}
 	if (vma->vm_start <= address)
 		goto good_area;
 	if (!(vma->vm_flags & VM_GROWSDOWN))
@@ -158,6 +204,7 @@
 		goto bad_area;
 	}
 
+survive:
 	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
@@ -176,7 +223,7 @@
 		goto out_of_memory;
 	}
 
-	up(&mm->mmap_sem);
+	up_read(&mm->mmap_sem);
 	return;
 
 /*
@@ -184,11 +231,10 @@
  * Fix it, but check if it's kernel or user first..
  */
 bad_area:
-	up(&mm->mmap_sem);
+	up_read(&mm->mmap_sem);
 
 	/* User mode accesses just cause a SIGSEGV */
-	if (psw_mask & PSW_PROBLEM_STATE) {
-		struct siginfo si;
+	if (regs->psw.mask & PSW_PROBLEM_STATE) {
 		tsk->thread.prot_addr = address;
 		tsk->thread.trap_no = error_code;
 #ifndef CONFIG_SYSCTL
@@ -205,10 +251,8 @@
 			show_regs(regs);
 		}
 #endif
-		si.si_signo = SIGSEGV;
-		si.si_code = si_code;
-		si.si_addr = (void*) address;
-		force_sig_info(SIGSEGV, &si, tsk);
+
+		force_sigsegv(tsk, si_code, (void *)address);
 		return;
 	}
 
@@ -223,6 +267,7 @@
  * Oops. The kernel tried to access some bad page. We'll have to
  * terminate things with extreme prejudice.
  */
+
 	if (kernel_address)
 		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
 		       " at virtual kernel address %016lx\n", address);
@@ -230,10 +275,6 @@
 		printk(KERN_ALERT "Unable to handle kernel paging request"
 		       " at virtual user address %016lx\n", address);
 
-/*
- * need to define, which information is useful here
- */
-
 	die("Oops", regs, error_code);
 	do_exit(SIGKILL);
 
@@ -243,14 +284,20 @@
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
-	up(&mm->mmap_sem);
+	up_read(&mm->mmap_sem);
+	if (tsk->pid == 1) {
+		tsk->policy |= SCHED_YIELD;
+		schedule();
+		down_read(&mm->mmap_sem);
+		goto survive;
+	}
 	printk("VM: killing process %s\n", tsk->comm);
-	if (psw_mask & PSW_PROBLEM_STATE)
+	if (regs->psw.mask & PSW_PROBLEM_STATE)
 		do_exit(SIGKILL);
 	goto no_context;
 
 do_sigbus:
-	up(&mm->mmap_sem);
+	up_read(&mm->mmap_sem);
 
 	/*
 	 * Send a sigbus, regardless of whether we were in kernel
@@ -261,6 +308,158 @@
 	force_sig(SIGBUS, tsk);
 
 	/* Kernel mode? Handle exceptions or die */
-	if (!(psw_mask & PSW_PROBLEM_STATE))
+	if (!(regs->psw.mask & PSW_PROBLEM_STATE))
 		goto no_context;
 }
+
+/*
+ * Send SIGSEGV to task.  This is an external routine
+ * to keep the stack usage of do_page_fault small.
+ */
+static void force_sigsegv(struct task_struct *tsk, int code, void *address)
+{
+	struct siginfo si;
+	si.si_signo = SIGSEGV;
+	si.si_code = code;
+	si.si_addr = address;
+	force_sig_info(SIGSEGV, &si, tsk);
+}
+
+
+#ifdef CONFIG_PFAULT
+/*
+ * 'pfault' pseudo page faults routines.
+ */
+static int pfault_disable = 0;
+
+static int __init nopfault(char *str)
+{
+	pfault_disable = 1;
+	return 1;
+}
+
+__setup("nopfault", nopfault);
+
+typedef struct {
+	__u16 refdiagc;
+	__u16 reffcode;
+	__u16 refdwlen;
+	__u16 refversn;
+	__u64 refgaddr;
+	__u64 refselmk;
+	__u64 refcmpmk;
+	__u64 reserved;
+} __attribute__ ((packed)) pfault_refbk_t;
+
+typedef struct _pseudo_wait_t {
+	struct _pseudo_wait_t *next;
+	wait_queue_head_t queue;
+	unsigned long address;
+	int resolved;
+} pseudo_wait_t;
+
+int pfault_init(void)
+{
+	pfault_refbk_t refbk =
+		{ 0x258, 0, 5, 2, __LC_KERNEL_STACK, 1ULL << 48, 1ULL << 48,
+		  0x8000000000000000ULL };
+	int rc;
+
+	if (pfault_disable)
+		return -1;
+	__asm__ __volatile__(
+		"    diag  %1,%0,0x258\n"
+		"0:  j     2f\n"
+		"1:  la    %0,8\n"
+		"2:\n"
+		".section __ex_table,\"a\"\n"
+		"   .align 4\n"
+		"   .quad  0b,1b\n"
+		".previous"
+		: "=d" (rc) : "a" (&refbk) : "cc" );
+	__ctl_set_bit(0, 9);
+	return rc;
+}
+
+void pfault_fini(void)
+{
+	pfault_refbk_t refbk =
		{ 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL };
+
+	if (pfault_disable)
+		return;
+	__ctl_clear_bit(0, 9);
+	__asm__ __volatile__(
+		"    diag  %0,0,0x258\n"
+		"0:\n"
+		".section __ex_table,\"a\"\n"
+		"   .align 4\n"
+		"   .quad  0b,0b\n"
+		".previous"
+		: : "a" (&refbk) : "cc" );
+}
+
+asmlinkage void
+pfault_interrupt(struct pt_regs *regs, __u16 error_code)
+{
+	struct task_struct *tsk;
+	wait_queue_head_t queue;
+	wait_queue_head_t *qp;
+	__u16 subcode;
+
+	/*
+	 * Get the external interruption subcode & pfault
+	 * initial/completion signal bit. VM stores this
+	 * in the 'cpu address' field associated with the
+	 * external interrupt.
+	 */
+	subcode = S390_lowcore.cpu_addr;
+	if ((subcode & 0xff00) != 0x0600)
+		return;
+
+	/*
+	 * Get the token (= address of kernel stack of affected task).
+	 */
+	tsk = (struct task_struct *)
+		(*((unsigned long *) __LC_PFAULT_INTPARM) - THREAD_SIZE);
+
+	/*
+	 * We got all needed information from the lowcore and can
+	 * now safely switch on interrupts.
+	 */
+	if (regs->psw.mask & PSW_PROBLEM_STATE)
+		__sti();
+
+	if (subcode & 0x0080) {
+		/* signal bit is set -> a page has been swapped in by VM */
+		qp = (wait_queue_head_t *)
+			xchg(&tsk->thread.pfault_wait, -1);
+		if (qp != NULL) {
+			/* Initial interrupt was faster than the completion
+			 * interrupt. pfault_wait is valid. Set pfault_wait
+			 * back to zero and wake up the process. This can
+			 * safely be done because the task is still sleeping
+			 * and can't produce new pfaults. */
+			tsk->thread.pfault_wait = 0ULL;
+			wake_up(qp);
+		}
+	} else {
+		/* signal bit not set -> a real page is missing. */
+		init_waitqueue_head (&queue);
+		qp = (wait_queue_head_t *)
+			xchg(&tsk->thread.pfault_wait, (addr_t) &queue);
+		if (qp != NULL) {
+			/* Completion interrupt was faster than the initial
			 * interrupt (swapped in a -1 for pfault_wait). Set
			 * pfault_wait back to zero and exit. This can be
			 * done safely because tsk is running in kernel
			 * mode and can't produce new pfaults. */
+			tsk->thread.pfault_wait = 0ULL;
+		}
+
+		/* go to sleep */
+		wait_event(queue, tsk->thread.pfault_wait == 0ULL);
+	}
+}
+#endif
+
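A note on the pfault code above: the synchronisation between the 'initial'
external interrupt (a real page is missing, the task must sleep) and the
'completion' interrupt (VM swapped the page in, the task must be woken) hangs
entirely on a single xchg() of thread.pfault_wait, so neither order of arrival
can lose a wake-up. The standalone userspace program below demonstrates the
same handshake with C11 atomics; all names are ours, and the kernel's
queue-pointer/-1 encoding is simplified to two flag values, so read it as a
sketch of the pattern rather than the kernel code:

/* Demonstration of the xchg() handshake used by pfault_interrupt(). */
#include <stdatomic.h>
#include <stdio.h>

#define WAITING  1	/* initial interrupt published its wait queue */
#define DONE    -1	/* completion interrupt ran first */

static atomic_long pfault_wait;

static void initial_interrupt(void)	/* a real page is missing */
{
	long old = atomic_exchange(&pfault_wait, WAITING);

	if (old == DONE) {
		/* Completion beat us: reset the word, skip the sleep. */
		atomic_store(&pfault_wait, 0);
		printf("initial: already completed, not sleeping\n");
	} else {
		printf("initial: would sleep until woken\n");
	}
}

static void completion_interrupt(void)	/* VM swapped the page in */
{
	long old = atomic_exchange(&pfault_wait, DONE);

	if (old == WAITING) {
		/* Initial side is (or will be) sleeping: wake it. */
		atomic_store(&pfault_wait, 0);
		printf("completion: waking sleeper\n");
	}
}

int main(void)
{
	/* Common order: initial first, then completion. */
	initial_interrupt();
	completion_interrupt();
	/* Racy order: completion arrives before the initial interrupt. */
	completion_interrupt();
	initial_interrupt();
	return 0;
}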
Index: init.c
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390x/mm/init.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- init.c	25 Feb 2001 23:15:23 -0000	1.1.1.1
+++ init.c	9 Apr 2002 17:03:18 -0000	1.2
@@ -35,143 +35,32 @@
 #include <asm/pgalloc.h>
 #include <asm/dma.h>
 #include <asm/lowcore.h>
+#include <asm/tlb.h>
 
-static unsigned long totalram_pages;
+mmu_gather_t mmu_gathers[NR_CPUS];
 
-/*
- * empty_bad_page is the page that is used for page faults when linux
- * is out-of-memory. Older versions of linux just did a
- * do_exit(), but using this instead means there is less risk
- * for a process dying in kernel mode, possibly leaving an inode
- * unused etc..
- *
- * empty_bad_pte_table is the accompanying page-table: it is initialized
- * to point to BAD_PAGE entries.
- *
- * empty_bad_pmd_table is the accompanying segment table: it is initialized
- * to point to empty_bad_pte_table page tables.
- *
- * ZERO_PAGE is a special page that is used for zero-initialized
- * data and COW.
- */
+static unsigned long totalram_pages;
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
-char  empty_bad_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
 char  empty_zero_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
-pmd_t empty_bad_pmd_table[PTRS_PER_PMD] __attribute__((__aligned__(PAGE_SIZE)));
-pte_t empty_bad_pte_table[PTRS_PER_PTE] __attribute__((__aligned__(PAGE_SIZE)));
-
-static int test_access(unsigned long loc)
-{
-	static const int ssm_mask = 0x07000000L;
-	int rc, i;
-
-	rc = 0;
-	for (i=0; i<2; i++) {
-		__asm__ __volatile__(
-			"    slgr  %0,%0\n"
-			"    ssm   %1\n"
-			"    tprot 0(%2),0\n"
-			"0:  jne   1f\n"
-			"    lghi  %0,1\n"
-			"1:  ssm   %3\n"
-			".section __ex_table,\"a\"\n"
-			"   .align 8\n"
-			"   .quad  0b,1b\n"
-			".previous"
-			: "+&d" (rc) : "i" (0), "a" (loc), "m" (ssm_mask)
-			: "cc");
-		if (rc == 0)
-			break;
-		loc += 0x100000;
-	}
-	return rc;
-}
-
-static pmd_t *get_bad_pmd_table(void)
-{
-	pmd_t v;
-	int i;
-
-	pmd_set(&v, empty_bad_pte_table);
-
-	for (i = 0; i < PTRS_PER_PMD; i++)
-		empty_bad_pmd_table[i] = v;
-
-	return empty_bad_pmd_table;
-}
-
-static pte_t *get_bad_pte_table(void)
-{
-	pte_t v;
-	int i;
-
-	v = pte_mkdirty(mk_pte_phys(__pa(empty_bad_page), PAGE_SHARED));
-
-	for (i = 0; i < PAGE_SIZE/sizeof(pte_t); i++)
-		empty_bad_pte_table[i] = v;
-
-	return empty_bad_pte_table;
-}
-
-pmd_t *
-get_pmd_slow(pgd_t *pgd, unsigned long offset)
-{
-	pmd_t *pmd;
-	int i;
-
-	pmd = (pmd_t *) __get_free_pages(GFP_KERNEL,2);
-	if (pgd_none(*pgd)) {
-		if (pmd) {
-			for (i = 0; i < PTRS_PER_PMD; i++)
-				pmd_clear(pmd+i);
-			pgd_set(pgd, pmd);
-			return pmd + offset;
-		}
-		pmd = (pmd_t *) get_bad_pmd_table();
-		pgd_set(pgd, pmd);
-		return NULL;
-	}
-	free_pages((unsigned long)pmd,2);
-	if (pgd_bad(*pgd))
-		BUG();
-	return (pmd_t *) pgd_page(*pgd) + offset;
-}
-
-pte_t *get_pte_slow(pmd_t *pmd, unsigned long offset)
-{
-	pte_t *pte;
-	int i;
-
-	pte = (pte_t*) __get_free_page(GFP_KERNEL);
-	if (pmd_none(*pmd)) {
-		if (pte) {
-			for (i=0;i<PTRS_PER_PTE;i++)
-				pte_clear(pte+i);
-			pmd_set(pmd,pte);
-			return pte + offset;
-		}
-		pte = (pte_t*) get_bad_pte_table();
-		pmd_set(pmd,pte);
-		return NULL;
-	}
-	free_page(__pa(pte));
-	if (pmd_bad(*pmd))
-		BUG();
-	return (pte_t *) pmd_page(*pmd) + offset;
-}
 
 int do_check_pgt_cache(int low, int high)
 {
 	int freed = 0;
 	if(pgtable_cache_size > high) {
 		do {
-			if(pgd_quicklist)
-				free_pgd_slow(get_pgd_fast()), freed += 4;
-			if(pmd_quicklist)
-				free_pmd_slow(get_pmd_fast()), freed += 4;
-			if(pte_quicklist)
-				free_pte_slow(get_pte_fast()), freed++;
+			if(pgd_quicklist) {
+				free_pgd_slow(get_pgd_fast());
+				freed += 4;
+			}
+			if(pmd_quicklist) {
+				pmd_free_slow(pmd_alloc_one_fast(NULL, 0));
+				freed += 4;
+			}
+			if(pte_quicklist) {
+				pte_free_slow(pte_alloc_one_fast(NULL, 0));
+				freed += 1;
+			}
 		} while(pgtable_cache_size > low);
 	}
 	return freed;
@@ -229,7 +118,7 @@
 	int i,j,k;
 	unsigned long address=0;
 	unsigned long pgdir_k = (__pa(swapper_pg_dir) & PAGE_MASK) |
-	  _REGION_TABLE;
+	  _KERN_REGION_TABLE;
 	unsigned long end_mem = (unsigned long) __va(max_low_pfn*PAGE_SIZE);
 	static const int ssm_mask = 0x04000000L;
 
@@ -259,34 +148,34 @@
 
 	for (i = 0 ; i < PTRS_PER_PGD ; i++,pg_dir++) {
 		if (address >= end_mem) {
-		        pgd_clear(pg_dir);
-                        continue;
+			pgd_clear(pg_dir);
+			continue;
 		}
 
 		pm_dir = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE*4);
-		pgd_set(pg_dir,pm_dir);
+		pgd_populate(&init_mm, pg_dir, pm_dir);
 
 		for (j = 0 ; j < PTRS_PER_PMD ; j++,pm_dir++) {
-		        if (address >= end_mem) {
-			        pmd_clear(pm_dir);
-				continue;
-			}
-
+			if (address >= end_mem) {
+				pmd_clear(pm_dir);
+				continue;
+			}
+
 			pt_dir = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-			pmd_set(pm_dir,pt_dir);
+			pmd_populate(&init_mm, pm_dir, pt_dir);
 
 			for (k = 0 ; k < PTRS_PER_PTE ; k++,pt_dir++) {
-			        pte = mk_pte_phys(address, PAGE_KERNEL);
-				if (address >= end_mem) {
-				        pte_clear(&pte);
-                                        continue;
-				}
-				set_pte(pt_dir, pte);
-				address += PAGE_SIZE;
+				pte = mk_pte_phys(address, PAGE_KERNEL);
+				if (address >= end_mem) {
+					pte_clear(&pte);
+					continue;
+				}
+				set_pte(pt_dir, pte);
+				address += PAGE_SIZE;
 			}
 		}
 	}
-
+
         /* enable virtual mapping in kernel mode */
 	__asm__ __volatile__("lctlg 1,1,%0\n\t"
			     "lctlg 7,7,%0\n\t"
@@ -302,7 +191,6 @@
 void __init mem_init(void)
 {
 	unsigned long codesize, reservedpages, datasize, initsize;
-        unsigned long tmp;
 
         max_mapnr = num_physpages = max_low_pfn;
         high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
@@ -313,25 +201,7 @@
         /* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 
-	/* mark usable pages in the mem_map[] and count reserved pages */
 	reservedpages = 0;
-	tmp = 0;
-	do {
-		if (tmp && (tmp & 0x1ff) == 0 &&
-		    test_access(tmp * PAGE_SIZE) == 0) {
-			printk("2M Segment 0x%016lX not available\n",
-			       tmp * PAGE_SIZE);
-			do {
-				set_bit(PG_reserved, &mem_map[tmp].flags);
-				reservedpages++;
-				tmp++;
-			} while (tmp < max_low_pfn && (tmp & 0x1ff));
-		} else {
-			if (PageReserved(mem_map+tmp))
-				reservedpages++;
-			tmp++;
-		}
-	} while (tmp < max_low_pfn);
 
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
Index: ioremap.c
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390x/mm/ioremap.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- ioremap.c	25 Feb 2001 23:15:23 -0000	1.1.1.1
+++ ioremap.c	9 Apr 2002 17:03:18 -0000	1.2
@@ -54,7 +54,7 @@
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(pmd, address);
+		pte_t * pte = pte_alloc(&init_mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -67,6 +67,7 @@
 static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 			    unsigned long size, unsigned long flags)
 {
+	int error;
 	pgd_t * dir;
 	unsigned long end = address + size;
 
@@ -75,17 +76,21 @@
 	flush_cache_all();
 	if (address >= end)
 		BUG();
+	spin_lock(&init_mm.page_table_lock);
 	do {
-		pmd_t *pmd = pmd_alloc_kernel(dir, address);
+		pmd_t *pmd;
+		pmd = pmd_alloc(&init_mm, dir, address);
+		error = -ENOMEM;
 		if (!pmd)
-			return -ENOMEM;
+			break;
 		if (remap_area_pmd(pmd, address, end - address,
 					 phys_addr + address, flags))
-			return -ENOMEM;
-		set_pgdir(address, *dir);
+			break;
+		error = 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
+	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return 0;
 }
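
One thing worth flagging in the remap_area_pages() hunk: the rewritten loop
carefully sets 'error' and breaks out so the new init_mm.page_table_lock is
always released, but the function still ends in 'return 0', so a failed
pmd_alloc() is reported to the caller as success. Later kernels resolve this by
returning 'error' instead. The self-contained sketch below shows the intended
set-error/break/return-error shape; all names in it are ours:

/* Sketch of the error-propagation pattern remap_area_pages() is
 * reaching for: assume failure before each fallible step, break out
 * on failure, and return 'error' -- not 0 -- after the cleanup. */
#include <errno.h>
#include <stddef.h>
#include <stdio.h>

static void *alloc_step(int fail)	/* stand-in for pmd_alloc() */
{
	return fail ? NULL : (void *) 1;
}

static int remap_range(int steps, int fail_at)
{
	int error = 0;
	int i;

	/* lock would be taken here */
	for (i = 0; i < steps; i++) {
		error = -ENOMEM;	/* assume this step fails... */
		if (alloc_step(i == fail_at) == NULL)
			break;		/* ...and bail out, lock still held */
		error = 0;		/* step succeeded */
	}
	/* unlock would go here */
	return error;	/* the diff above returns 0 here instead */
}

int main(void)
{
	printf("all steps ok:  %d\n", remap_range(4, -1));
	printf("step 2 failed: %d\n", remap_range(4, 2));
	return 0;
}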