From: Andy P. <at...@us...> - 2002-04-09 17:08:06
Update of /cvsroot/linux-vax/kernel-2.4/arch/s390/mm
In directory usw-pr-cvs1:/tmp/cvs-serv13825/s390/mm

Modified Files:
        Makefile extable.c fault.c init.c ioremap.c
Log Message:
synch 2.4.15 commit 29

Index: Makefile
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390/mm/Makefile,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -u -r1.1.1.2 -r1.2
--- Makefile    25 Feb 2001 23:15:22 -0000    1.1.1.2
+++ Makefile    9 Apr 2002 17:03:17 -0000    1.2
@@ -1,5 +1,5 @@
 #
-# Makefile for the linux i386-specific parts of the memory manager.
+# Makefile for the linux s390-specific parts of the memory manager.
 #
 # Note! Dependencies are done automagically by 'make dep', which also
 # removes any old dependencies. DON'T put your own dependencies here
Index: extable.c
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390/mm/extable.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- extable.c    14 Jan 2001 19:55:45 -0000    1.1.1.1
+++ extable.c    9 Apr 2002 17:03:17 -0000    1.2
@@ -10,6 +10,7 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/spinlock.h>
 #include <asm/uaccess.h>
 
 extern const struct exception_table_entry __start___ex_table[];
@@ -36,28 +37,37 @@
         return 0;
 }
 
+extern spinlock_t modlist_lock;
+
 unsigned long
 search_exception_table(unsigned long addr)
 {
-        unsigned long ret;
+        unsigned long ret = 0;
+        unsigned long flags;
 
 #ifndef CONFIG_MODULES
         addr &= 0x7fffffff;  /* remove amode bit from address */
         /* There is only the kernel to search.  */
         ret = search_one_table(__start___ex_table, __stop___ex_table-1, addr);
-        if (ret) return FIX_PSW(ret);
+        if (ret) ret = FIX_PSW(ret);
+        return ret;
 #else
         /* The kernel is the last "module" -- no need to treat it special. */
         struct module *mp;
         addr &= 0x7fffffff;  /* remove amode bit from address */
+
+        spin_lock_irqsave(&modlist_lock, flags);
         for (mp = module_list; mp != NULL; mp = mp->next) {
-                if (mp->ex_table_start == NULL)
+                if (mp->ex_table_start == NULL || !(mp->flags&(MOD_RUNNING|MOD_INITIALIZING)))
                         continue;
                 ret = search_one_table(mp->ex_table_start,
                                        mp->ex_table_end - 1, addr);
-                if (ret) return FIX_PSW(ret);
+                if (ret) {
+                        ret = FIX_PSW(ret);
+                        break;
+                }
         }
+        spin_unlock_irqrestore(&modlist_lock, flags);
+        return ret;
 #endif
-
-        return 0;
 }
Index: fault.c
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390/mm/fault.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -u -r1.1.1.2 -r1.2
--- fault.c    25 Feb 2001 23:15:22 -0000    1.1.1.2
+++ fault.c    9 Apr 2002 17:03:17 -0000    1.2
@@ -4,6 +4,7 @@
  *  S390 version
  *    Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation
  *    Author(s): Hartmut Penner (hp...@de...)
+ *               Ulrich Weigand (uwe...@de...)
  *
  *  Derived from "arch/i386/mm/fault.c"
  *    Copyright (C) 1995  Linus Torvalds
@@ -21,6 +22,9 @@
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
+#include <linux/compatmac.h>
+#include <linux/init.h>
+#include <linux/console.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -32,6 +36,34 @@
 #endif
 
 extern void die(const char *,struct pt_regs *,long);
+static void force_sigsegv(struct task_struct *tsk, int code, void *address);
+
+extern spinlock_t timerlist_lock;
+
+/*
+ * Unlock any spinlocks which will prevent us from getting the
+ * message out (timerlist_lock is acquired through the
+ * console unblank code)
+ */
+void bust_spinlocks(int yes)
+{
+        spin_lock_init(&timerlist_lock);
+        if (yes) {
+                oops_in_progress = 1;
+        } else {
+                int loglevel_save = console_loglevel;
+                oops_in_progress = 0;
+                console_unblank();
+                /*
+                 * OK, the message is on the console.  Now we call printk()
+                 * without oops_in_progress set so that printk will give klogd
+                 * a poke.  Hold onto your hats...
+                 */
+                console_loglevel = 15;
+                printk(" ");
+                console_loglevel = loglevel_save;
+        }
+}
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -51,18 +83,31 @@
         unsigned long address;
         unsigned long fixup;
         int write;
-        unsigned long psw_mask;
-        unsigned long psw_addr;
         int si_code = SEGV_MAPERR;
         int kernel_address = 0;
 
-        /*
-         * get psw mask of Program old psw to find out,
-         * if user or kernel mode
-         */
+        tsk = current;
+        mm = tsk->mm;
+
+        /*
+         * Check for low-address protection.  This needs to be treated
+         * as a special case because the translation exception code
+         * field is not guaranteed to contain valid data in this case.
+         */
+        if ((error_code & 0xff) == 4 && !(S390_lowcore.trans_exc_code & 4)) {
+
+                /* Low-address protection hit in kernel mode means
+                   NULL pointer write access in kernel mode. */
+                if (!(regs->psw.mask & PSW_PROBLEM_STATE)) {
+                        address = 0;
+                        kernel_address = 1;
+                        goto no_context;
+                }
 
-        psw_mask = S390_lowcore.program_old_psw.mask;
-        psw_addr = S390_lowcore.program_old_psw.addr;
+                /* Low-address protection hit in user mode 'cannot happen'. */
+                die ("Low-address protection", regs, error_code);
+                do_exit(SIGKILL);
+        }
 
         /*
          * get the failing address
@@ -72,12 +117,6 @@
 
         address = S390_lowcore.trans_exc_code&0x7ffff000;
 
-        tsk = current;
-        mm = tsk->mm;
-
-        if (in_interrupt() || !mm)
-                goto no_context;
-
         /*
          * Check which address space the address belongs to
@@ -108,6 +147,7 @@
                         }
                 }
                 die("page fault via unknown access register", regs, error_code);
+                do_exit(SIGKILL);
                 break;
 
         case 2: /* Secondary Segment Table Descriptor */
@@ -116,13 +156,21 @@
                 break;
         }
 
+        /*
+         * Check whether we have a user MM in the first place.
+         */
+        if (in_interrupt() || !mm || !(regs->psw.mask & _PSW_IO_MASK_BIT))
+                goto no_context;
         /*
          * When we get here, the fault happened in the current
-         * task's user address space, so we search the VMAs
+         * task's user address space, so we can switch on the
+         * interrupts again and then search the VMAs
          */
-        down(&mm->mmap_sem);
+        __sti();
+
+        down_read(&mm->mmap_sem);
 
         vma = find_vma(mm, address);
         if (!vma)
@@ -155,6 +203,7 @@
                 goto bad_area;
         }
 
+ survive:
         /*
          * If for any reason at all we couldn't handle the fault,
          * make sure we exit gracefully rather than endlessly redo
@@ -173,7 +222,7 @@
                 goto out_of_memory;
         }
 
-        up(&mm->mmap_sem);
+        up_read(&mm->mmap_sem);
         return;
 
 /*
@@ -181,11 +230,10 @@
  * Fix it, but check if it's kernel or user first..
  */
 bad_area:
-        up(&mm->mmap_sem);
+        up_read(&mm->mmap_sem);
 
         /* User mode accesses just cause a SIGSEGV */
-        if (psw_mask & PSW_PROBLEM_STATE) {
-                struct siginfo si;
+        if (regs->psw.mask & PSW_PROBLEM_STATE) {
                 tsk->thread.prot_addr = address;
                 tsk->thread.trap_no = error_code;
 #ifndef CONFIG_SYSCTL
@@ -202,10 +250,8 @@
                         show_regs(regs);
                 }
 #endif
-                si.si_signo = SIGSEGV;
-                si.si_code = si_code;
-                si.si_addr = (void*) address;
-                force_sig_info(SIGSEGV, &si, tsk);
+
+                force_sigsegv(tsk, si_code, (void *)address);
                 return;
         }
 
@@ -227,9 +273,6 @@
         else
                 printk(KERN_ALERT "Unable to handle kernel paging request"
                        " at virtual user address %08lx\n", address);
-/*
- * need to define, which information is useful here
- */
 
         die("Oops", regs, error_code);
         do_exit(SIGKILL);
@@ -240,14 +283,20 @@
  * us unable to handle the page fault gracefully.
 */
 out_of_memory:
-        up(&mm->mmap_sem);
+        up_read(&mm->mmap_sem);
+        if (tsk->pid == 1) {
+                tsk->policy |= SCHED_YIELD;
+                schedule();
+                down_read(&mm->mmap_sem);
+                goto survive;
+        }
         printk("VM: killing process %s\n", tsk->comm);
-        if (psw_mask & PSW_PROBLEM_STATE)
+        if (regs->psw.mask & PSW_PROBLEM_STATE)
                 do_exit(SIGKILL);
         goto no_context;
 
 do_sigbus:
-        up(&mm->mmap_sem);
+        up_read(&mm->mmap_sem);
 
         /*
          * Send a sigbus, regardless of whether we were in kernel
@@ -258,8 +307,265 @@
         force_sig(SIGBUS, tsk);
 
         /* Kernel mode? Handle exceptions or die */
-        if (!(psw_mask & PSW_PROBLEM_STATE))
+        if (!(regs->psw.mask & PSW_PROBLEM_STATE))
                 goto no_context;
 }
 
+/*
+ * Send SIGSEGV to task.  This is an external routine
+ * to keep the stack usage of do_page_fault small.
+ */
+static void force_sigsegv(struct task_struct *tsk, int code, void *address)
+{
+        struct siginfo si;
+        si.si_signo = SIGSEGV;
+        si.si_code = code;
+        si.si_addr = address;
+        force_sig_info(SIGSEGV, &si, tsk);
+}
+
+typedef struct _pseudo_wait_t {
+        struct _pseudo_wait_t *next;
+        wait_queue_head_t queue;
+        unsigned long address;
+        int resolved;
+} pseudo_wait_t;
+
+static pseudo_wait_t *pseudo_lock_queue = NULL;
+static spinlock_t pseudo_wait_spinlock; /* spinlock to protect lock queue */
+
+/*
+ * This routine handles 'pagex' pseudo page faults.
+ */
+asmlinkage void
+do_pseudo_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+        pseudo_wait_t wait_struct;
+        pseudo_wait_t *ptr, *last, *next;
+        unsigned long address;
+        int kernel_address;
+
+        /*
+         * get the failing address
+         * more specific the segment and page table portion of
+         * the address
+         */
+        address = S390_lowcore.trans_exc_code & 0xfffff000;
+
+        if (address & 0x80000000) {
+                /* high bit set -> a page has been swapped in by VM */
+                address &= 0x7fffffff;
+                spin_lock(&pseudo_wait_spinlock);
+                last = NULL;
+                ptr = pseudo_lock_queue;
+                while (ptr != NULL) {
+                        next = ptr->next;
+                        if (address == ptr->address) {
+                                /*
+                                 * This is one of the processes waiting
+                                 * for the page. Unchain from the queue.
+                                 * There can be more than one process
+                                 * waiting for the same page. VM presents
+                                 * an initial and a completion interrupt for
+                                 * every process that tries to access a
+                                 * page swapped out by VM.
+                                 */
+                                if (last == NULL)
+                                        pseudo_lock_queue = next;
+                                else
+                                        last->next = next;
+                                /* now wake up the process */
+                                ptr->resolved = 1;
+                                wake_up(&ptr->queue);
+                        } else
+                                last = ptr;
+                        ptr = next;
+                }
+                spin_unlock(&pseudo_wait_spinlock);
+        } else {
+                /* Pseudo page faults in kernel mode is a bad idea */
+                if (!(regs->psw.mask & PSW_PROBLEM_STATE)) {
+                        /*
+                         * VM presents pseudo page faults if the interrupted
+                         * state was not disabled for interrupts. So we can
+                         * get pseudo page fault interrupts while running
+                         * in kernel mode. We simply access the page here
+                         * while we are running disabled. VM will then swap
+                         * in the page synchronously.
+                         */
+                        kernel_address = 0;
+                        switch (S390_lowcore.trans_exc_code & 3) {
+                        case 0: /* Primary Segment Table Descriptor */
+                                kernel_address = 1;
+                                break;
+                        case 1: /* STD determined via access register */
+                                if (S390_lowcore.exc_access_id == 0 ||
+                                    regs->acrs[S390_lowcore.exc_access_id]==0)
+                                        kernel_address = 1;
+                                break;
+                        case 2: /* Secondary Segment Table Descriptor */
+                        case 3: /* Home Segment Table Descriptor */
+                                break;
+                        }
+                        if (kernel_address)
+                                /* dereference a virtual kernel address */
+                                __asm__ __volatile__ (
+                                        "  ic 0,0(%0)"
+                                        : : "a" (address) : "0");
+                        else
+                                /* dereference a virtual user address */
+                                __asm__ __volatile__ (
+                                        "  la 2,0(%0)\n"
+                                        "  sacf 512\n"
+                                        "  ic 2,0(2)\n"
+                                        "0:sacf 0\n"
+                                        ".section __ex_table,\"a\"\n"
+                                        "  .align 4\n"
+                                        "  .long 0b,0b\n"
+                                        ".previous"
+                                        : : "a" (address) : "2" );
+
+                        return;
+                }
+                /* initialize and add element to pseudo_lock_queue */
+                init_waitqueue_head (&wait_struct.queue);
+                wait_struct.address = address;
+                wait_struct.resolved = 0;
+                spin_lock(&pseudo_wait_spinlock);
+                wait_struct.next = pseudo_lock_queue;
+                pseudo_lock_queue = &wait_struct;
+                spin_unlock(&pseudo_wait_spinlock);
+                /* go to sleep */
+                wait_event(wait_struct.queue, wait_struct.resolved);
+        }
+}
+
+#ifdef CONFIG_PFAULT
+/*
+ * 'pfault' pseudo page faults routines.
+ */
+static int pfault_disable = 0;
+
+static int __init nopfault(char *str)
+{
+        pfault_disable = 1;
+        return 1;
+}
+
+__setup("nopfault", nopfault);
+
+typedef struct {
+        __u16 refdiagc;
+        __u16 reffcode;
+        __u16 refdwlen;
+        __u16 refversn;
+        __u64 refgaddr;
+        __u64 refselmk;
+        __u64 refcmpmk;
+        __u64 reserved;
+} __attribute__ ((packed)) pfault_refbk_t;
+
+int pfault_init(void)
+{
+        pfault_refbk_t refbk =
+                { 0x258, 0, 5, 2, __LC_KERNEL_STACK, 1ULL << 48, 1ULL << 48, 0ULL };
+        int rc;
+
+        if (pfault_disable)
+                return -1;
+        __asm__ __volatile__(
+                "  diag %1,%0,0x258\n"
+                "0: j 2f\n"
+                "1: la %0,8\n"
+                "2:\n"
+                ".section __ex_table,\"a\"\n"
+                "  .align 4\n"
+                "  .long 0b,1b\n"
+                ".previous"
+                : "=d" (rc) : "a" (&refbk) : "cc" );
+        __ctl_set_bit(0, 9);
+        return rc;
+}
+
+void pfault_fini(void)
+{
+        pfault_refbk_t refbk =
+                { 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL };
+
+        if (pfault_disable)
+                return;
+        __ctl_clear_bit(0,9);
+        __asm__ __volatile__(
+                "  diag %0,0,0x258\n"
+                "0:\n"
+                ".section __ex_table,\"a\"\n"
+                "  .align 4\n"
+                "  .long 0b,0b\n"
+                ".previous"
+                : : "a" (&refbk) : "cc" );
+}
+
+asmlinkage void
+pfault_interrupt(struct pt_regs *regs, __u16 error_code)
+{
+        struct task_struct *tsk;
+        wait_queue_head_t queue;
+        wait_queue_head_t *qp;
+        __u16 subcode;
+
+        /*
+         * Get the external interruption subcode & pfault
+         * initial/completion signal bit. VM stores this
+         * in the 'cpu address' field associated with the
+         * external interrupt.
+         */
+        subcode = S390_lowcore.cpu_addr;
+        if ((subcode & 0xff00) != 0x0200)
+                return;
+
+        /*
+         * Get the token (= address of kernel stack of affected task).
+         */
+        tsk = (struct task_struct *)
+                (*((unsigned long *) __LC_PFAULT_INTPARM) - THREAD_SIZE);
+
+        /*
+         * We got all needed information from the lowcore and can
+         * now safely switch on interrupts.
+         */
+        if (regs->psw.mask & PSW_PROBLEM_STATE)
+                __sti();
+
+        if (subcode & 0x0080) {
+                /* signal bit is set -> a page has been swapped in by VM */
+                qp = (wait_queue_head_t *)
+                        xchg(&tsk->thread.pfault_wait, -1);
+                if (qp != NULL) {
+                        /* Initial interrupt was faster than the completion
+                         * interrupt. pfault_wait is valid. Set pfault_wait
+                         * back to zero and wake up the process. This can
+                         * safely be done because the task is still sleeping
+                         * and can't procude new pfaults. */
+                        tsk->thread.pfault_wait = 0ULL;
+                        wake_up(qp);
+                }
+        } else {
+                /* signal bit not set -> a real page is missing. */
+                init_waitqueue_head (&queue);
+                qp = (wait_queue_head_t *)
+                        xchg(&tsk->thread.pfault_wait, (addr_t) &queue);
+                if (qp != NULL) {
+                        /* Completion interrupt was faster than the initial
+                         * interrupt (swapped in a -1 for pfault_wait). Set
+                         * pfault_wait back to zero and exit. This can be
+                         * done safely because tsk is running in kernel
+                         * mode and can't produce new pfaults. */
+                        tsk->thread.pfault_wait = 0ULL;
+                }
+
+                /* go to sleep */
+                wait_event(queue, tsk->thread.pfault_wait == 0ULL);
+        }
+}
+#endif
Index: init.c
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390/mm/init.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -u -r1.1.1.2 -r1.2
--- init.c    25 Feb 2001 23:15:22 -0000    1.1.1.2
+++ init.c    9 Apr 2002 17:03:17 -0000    1.2
@@ -35,113 +35,32 @@
 #include <asm/pgalloc.h>
 #include <asm/dma.h>
 #include <asm/lowcore.h>
+#include <asm/tlb.h>
 
-static unsigned long totalram_pages;
+mmu_gather_t mmu_gathers[NR_CPUS];
 
-/*
- * BAD_PAGE is the page that is used for page faults when linux
- * is out-of-memory. Older versions of linux just did a
- * do_exit(), but using this instead means there is less risk
- * for a process dying in kernel mode, possibly leaving an inode
- * unused etc..
- *
- * BAD_PAGETABLE is the accompanying page-table: it is initialized
- * to point to BAD_PAGE entries.
- *
- * ZERO_PAGE is a special page that is used for zero-initialized
- * data and COW.
- */
+static unsigned long totalram_pages;
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
-char  empty_bad_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
 char  empty_zero_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
-pte_t empty_bad_pte_table[PTRS_PER_PTE] __attribute__((__aligned__(PAGE_SIZE)));
-
-static int test_access(unsigned long loc)
-{
-        static const int ssm_mask = 0x07000000L;
-        int rc, i;
-
-        rc = 0;
-        for (i=0; i<4; i++) {
-                __asm__ __volatile__(
-                        "  slr %0,%0\n"
-                        "  ssm %1\n"
-                        "  tprot 0(%2),0\n"
-                        "0: jne 1f\n"
-                        "  lhi %0,1\n"
-                        "1: ssm %3\n"
-                        ".section __ex_table,\"a\"\n"
-                        "  .align 4\n"
-                        "  .long 0b,1b\n"
-                        ".previous"
-                        : "+&d" (rc) : "i" (0), "a" (loc), "m" (ssm_mask)
-                        : "cc");
-                if (rc == 0)
-                        break;
-                loc += 0x100000;
-        }
-        return rc;
-}
-
-static pte_t * get_bad_pte_table(void)
-{
-        pte_t v;
-        int i;
-
-        v = pte_mkdirty(mk_pte_phys(__pa(empty_bad_page), PAGE_SHARED));
-
-        for (i = 0; i < PAGE_SIZE/sizeof(pte_t); i++)
-                empty_bad_pte_table[i] = v;
-
-        return empty_bad_pte_table;
-}
-
-static inline void invalidate_page(pte_t *pte)
-{
-        int i;
-        for (i=0;i<PTRS_PER_PTE;i++)
-                pte_clear(pte++);
-}
-
-pte_t *get_pte_slow(pmd_t *pmd, unsigned long offset)
-{
-        unsigned long pte;
-
-        pte = (unsigned long) __get_free_page(GFP_KERNEL);
-        if (pmd_none(*pmd)) {
-                if (pte) {
-                        invalidate_page((pte_t*) pte);
-                        pmd_val(pmd[0]) = _PAGE_TABLE + __pa(pte);
-                        pmd_val(pmd[1]) = _PAGE_TABLE + __pa(pte)+1024;
-                        pmd_val(pmd[2]) = _PAGE_TABLE + __pa(pte)+2048;
-                        pmd_val(pmd[3]) = _PAGE_TABLE + __pa(pte)+3072;
-                        return (pte_t *) pte + offset;
-                }
-                pte = (unsigned long) get_bad_pte_table();
-                pmd_val(pmd[0]) = _PAGE_TABLE + __pa(pte);
-                pmd_val(pmd[1]) = _PAGE_TABLE + __pa(pte)+1024;
-                pmd_val(pmd[2]) = _PAGE_TABLE + __pa(pte)+2048;
-                pmd_val(pmd[3]) = _PAGE_TABLE + __pa(pte)+3072;
-                return NULL;
-        }
-        free_page(pte);
-        if (pmd_bad(*pmd))
-                BUG();
-        return (pte_t *) pmd_page(*pmd) + offset;
-}
 
 int do_check_pgt_cache(int low, int high)
 {
         int freed = 0;
         if(pgtable_cache_size > high) {
                 do {
-                        if(pgd_quicklist)
-                                free_pgd_slow(get_pgd_fast()), freed += 2;
-                        if(pmd_quicklist)
-                                free_pmd_slow(get_pmd_fast()), freed++;
-                        if(pte_quicklist)
-                                free_pte_slow(get_pte_fast()), freed++;
+                        if(pgd_quicklist) {
+                                free_pgd_slow(get_pgd_fast());
+                                freed += 2;
+                        }
+                        if(pmd_quicklist) {
+                                pmd_free_slow(pmd_alloc_one_fast(NULL, 0));
+                                freed++;
+                        }
+                        if(pte_quicklist) {
+                                pte_free_slow(pte_alloc_one_fast(NULL, 0));
+                                freed++;
+                        }
                 } while(pgtable_cache_size > low);
         }
         return freed;
@@ -260,7 +179,6 @@
 void __init mem_init(void)
 {
         int codesize, reservedpages, datasize, initsize;
-        int tmp;
 
         max_mapnr = num_physpages = max_low_pfn;
         high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
@@ -271,24 +189,7 @@
         /* this will put all low memory onto the freelists */
         totalram_pages += free_all_bootmem();
 
-        /* mark usable pages in the mem_map[] and count reserved pages */
         reservedpages = 0;
-        tmp = 0;
-        do {
-                if (tmp && (tmp & 0x3ff) == 0 &&
-                    test_access(tmp * PAGE_SIZE) == 0) {
-                        printk("4M Segment %lX not available\n",tmp*PAGE_SIZE);
-                        do {
-                                set_bit(PG_reserved, &mem_map[tmp].flags);
-                                reservedpages++;
-                                tmp++;
-                        } while (tmp < max_low_pfn && (tmp & 0x3ff));
-                } else {
-                        if (PageReserved(mem_map+tmp))
-                                reservedpages++;
-                        tmp++;
-                }
-        } while (tmp < max_low_pfn);
 
         codesize = (unsigned long) &_etext - (unsigned long) &_text;
         datasize = (unsigned long) &_edata - (unsigned long) &_etext;
Index: ioremap.c
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/arch/s390/mm/ioremap.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -u -r1.1.1.2 -r1.2
--- ioremap.c    25 Feb 2001 23:15:22 -0000    1.1.1.2
+++ ioremap.c    9 Apr 2002 17:03:17 -0000    1.2
@@ -54,7 +54,7 @@
         if (address >= end)
                 BUG();
         do {
-                pte_t * pte = pte_alloc_kernel(pmd, address);
+                pte_t * pte = pte_alloc(&init_mm, pmd, address);
                 if (!pte)
                         return -ENOMEM;
                 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -67,6 +67,7 @@
 static int remap_area_pages(unsigned long address, unsigned long phys_addr,
                                  unsigned long size, unsigned long flags)
 {
+        int error;
         pgd_t * dir;
         unsigned long end = address + size;
 
@@ -75,17 +76,21 @@
         flush_cache_all();
         if (address >= end)
                 BUG();
+        spin_lock(&init_mm.page_table_lock);
         do {
-                pmd_t *pmd = pmd_alloc_kernel(dir, address);
+                pmd_t *pmd;
+                pmd = pmd_alloc(&init_mm, dir, address);
+                error = -ENOMEM;
                 if (!pmd)
-                        return -ENOMEM;
+                        break;
                 if (remap_area_pmd(pmd, address, end - address,
                                          phys_addr + address, flags))
-                        return -ENOMEM;
-                set_pgdir(address, *dir);
+                        break;
+                error = 0;
                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
                 dir++;
         } while (address && (address < end));
+        spin_unlock(&init_mm.page_table_lock);
         flush_tlb_all();
         return 0;
 }
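
The extable.c hunk above changes search_exception_table() so that the module walk no longer returns from inside the loop: the result is recorded in ret, the loop breaks, and modlist_lock is released on a single exit path. The sketch below reproduces that shape in ordinary userspace C, as an illustration only: mod_entry, search_one(), list_lock and the pthread mutex standing in for spin_lock_irqsave()/spin_unlock_irqrestore() are invented for the example and are not the kernel's API.

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's module list and fixup tables. */
struct mod_entry {
        struct mod_entry *next;
        const unsigned long *tbl_start;   /* faulting addresses for this "module" */
        const unsigned long *tbl_end;
        unsigned long fixup;              /* where to resume when a match is found */
};

static struct mod_entry *mod_list;                     /* guarded by list_lock */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned long search_one(const unsigned long *s, const unsigned long *e,
                                unsigned long addr, unsigned long fixup)
{
        for (; s < e; s++)                /* linear scan keeps the sketch short */
                if (*s == addr)
                        return fixup;
        return 0;
}

/* Single exit point: the lock is taken and dropped exactly once,
 * mirroring the "set ret, break, unlock" structure of the patch. */
unsigned long search_exception_table(unsigned long addr)
{
        unsigned long ret = 0;
        struct mod_entry *m;

        pthread_mutex_lock(&list_lock);
        for (m = mod_list; m != NULL; m = m->next) {
                if (m->tbl_start == NULL)
                        continue;
                ret = search_one(m->tbl_start, m->tbl_end, addr, m->fixup);
                if (ret)
                        break;            /* keep ret, fall through to unlock */
        }
        pthread_mutex_unlock(&list_lock);
        return ret;
}

int main(void)
{
        static const unsigned long faults[] = { 0x1000, 0x2000 };
        static struct mod_entry m = { NULL, faults, faults + 2, 0xdead };

        mod_list = &m;
        printf("fixup for 0x2000: %#lx\n", search_exception_table(0x2000));
        return 0;
}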
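
do_pseudo_page_fault() above keeps 'pagex' waiters on a singly linked list of on-stack records guarded by pseudo_wait_spinlock, and the completion interrupt unchains and wakes every record whose address matches, since several tasks may fault on the same page. A minimal userspace analogue of that queue follows; the names wait_for_page() and page_arrived(), the plain resolved flag in place of the kernel wait queue, and the pthread mutex in place of the spinlock are assumptions made for the sketch.

#include <pthread.h>
#include <stdio.h>

/* On-stack wait record, one per faulting task (mirrors pseudo_wait_t). */
struct pseudo_wait {
        struct pseudo_wait *next;
        unsigned long address;            /* page the task is waiting for */
        int resolved;
};

static struct pseudo_wait *lock_queue;                 /* guarded by queue_lock */
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/* Initial notification: the page is gone, enqueue ourselves. */
static void wait_for_page(struct pseudo_wait *w, unsigned long address)
{
        w->address = address;
        w->resolved = 0;
        pthread_mutex_lock(&queue_lock);
        w->next = lock_queue;
        lock_queue = w;
        pthread_mutex_unlock(&queue_lock);
        /* a real caller would now sleep until w->resolved becomes non-zero */
}

/* Completion notification: unchain and mark every waiter for this page.
 * More than one task can be queued on the same address. */
static void page_arrived(unsigned long address)
{
        struct pseudo_wait *p, *last = NULL, *next;

        pthread_mutex_lock(&queue_lock);
        for (p = lock_queue; p != NULL; p = next) {
                next = p->next;
                if (p->address == address) {
                        if (last == NULL)
                                lock_queue = next;
                        else
                                last->next = next;
                        p->resolved = 1;  /* wake_up() in the kernel version */
                } else {
                        last = p;
                }
        }
        pthread_mutex_unlock(&queue_lock);
}

int main(void)
{
        struct pseudo_wait a, b;

        wait_for_page(&a, 0x5000);
        wait_for_page(&b, 0x5000);        /* second waiter on the same page */
        page_arrived(0x5000);
        printf("waiters resolved: %d %d\n", a.resolved, b.resolved);
        return 0;
}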
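
pfault_interrupt() above resolves the race between VM's initial and completion interrupts with a single xchg() on tsk->thread.pfault_wait: whichever interrupt arrives second finds the other side's marker (the wait-queue address or -1) and finishes the handshake. The following sketch walks the same state machine with C11 atomics; the function names, the int flag that stands in for the wait queue, and the sequential driving of both orderings from main() are illustrative assumptions rather than the kernel code.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* 0: idle, (uintptr_t)-1: completion arrived first, other: waiter's flag address. */
static _Atomic uintptr_t pfault_wait;

/* Initial interrupt: the page is missing, publish a wait token. */
static void initial_interrupt(int *wait_flag)
{
        uintptr_t old = atomic_exchange(&pfault_wait, (uintptr_t)wait_flag);
        if (old == (uintptr_t)-1) {
                /* Completion beat us: nothing to wait for, reset and continue. */
                atomic_store(&pfault_wait, 0);
                *wait_flag = 0;
        }
        /* otherwise the task would now sleep until *wait_flag becomes 0 */
}

/* Completion interrupt: the page has been brought back in. */
static void completion_interrupt(void)
{
        uintptr_t old = atomic_exchange(&pfault_wait, (uintptr_t)-1);
        if (old != 0) {
                /* Initial interrupt ran first: clear its token and "wake" it. */
                int *wait_flag = (int *)old;
                *wait_flag = 0;
                atomic_store(&pfault_wait, 0);
        }
        /* otherwise leave the -1 marker for the initial interrupt to find */
}

int main(void)
{
        int flag;

        /* Ordering A: initial then completion (the common case). */
        flag = 1;
        initial_interrupt(&flag);
        completion_interrupt();
        printf("ordering A resolved: %d\n", flag == 0);

        /* Ordering B: completion overtakes the initial interrupt. */
        flag = 1;
        completion_interrupt();
        initial_interrupt(&flag);
        printf("ordering B resolved: %d\n", flag == 0);

        return 0;
}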