From: Andrea A. <an...@qu...> - 2008-04-08 15:57:18
|
The difference with #v11 is a different implementation of mm_lock that guarantees handling signals in O(N). It's also more low-latency friendly. Note that mmu_notifier_unregister may also fail with -EINTR if there is a signal pending, or if the system runs out of vmalloc space or physical memory; only exit_mmap guarantees that any kernel module can be unloaded in the presence of an OOM condition. Either #v11 or the first three #v12 patches (1, 2, 3) are suitable for inclusion in -mm; pick whichever you prefer after looking at the difference between the mmu_notifier_register retval and the mm_lock retval, I implemented and lightly tested both. GRU and KVM only need 1, 2 and 3; XPMEM needs the rest of the patchset too (4, ...), but all patches from 4 to the end can be deferred to a second merge window. |
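To make the retval discussion above concrete, here is a minimal sketch of how a secondary-MMU driver might consume those return values, assuming the #v12 variant where mmu_notifier_register/unregister go through mm_lock and can therefore fail. struct my_ctx, my_notifier_ops and the deferral flag are invented driver-side names, not part of the patchset; only the mmu_notifier calls are the real API.

#include <linux/mmu_notifier.h>
#include <linux/err.h>

struct my_ctx {
	struct mmu_notifier mn;
	int defer_teardown_to_release;	/* hypothetical driver flag */
};

extern const struct mmu_notifier_ops my_notifier_ops;	/* assumed driver ops */

static int my_ctx_attach(struct my_ctx *ctx, struct mm_struct *mm)
{
	ctx->mn.ops = &my_notifier_ops;
	/*
	 * With the #v12 API register can fail with -EINTR (signal pending
	 * while mm_lock scans the vmas) or -ENOMEM (vmalloc or physical
	 * memory exhausted); the caller has to handle that.
	 */
	return mmu_notifier_register(&ctx->mn, mm);
}

static void my_ctx_detach(struct my_ctx *ctx, struct mm_struct *mm)
{
	/*
	 * Unregister can fail the same way; if it does, leave the notifier
	 * armed and finish the teardown from ->release, which exit_mmap
	 * invokes even under OOM.
	 */
	if (mmu_notifier_unregister(&ctx->mn, mm))
		ctx->defer_teardown_to_release = 1;
}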
From: Andrea A. <an...@qu...> - 2008-04-08 15:56:56
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666463 -7200 # Node ID 33de2e17d0f5670515833bf8d3d2ea19e2a85b09 # Parent baceb322b45ed43280654dac6c964c9d3d8a936f Moves all mmu notifier methods outside the PT lock (first and not last step to make them sleep capable). Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -117,27 +117,6 @@ INIT_HLIST_HEAD(&mm->mmu_notifier_list); } -#define ptep_clear_flush_notify(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __pte = ptep_clear_flush(___vma, ___address, __ptep); \ - mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \ - __pte; \ -}) - -#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address); \ - __young; \ -}) - #else /* CONFIG_MMU_NOTIFIER */ static inline void mmu_notifier_release(struct mm_struct *mm) @@ -169,9 +148,6 @@ { } -#define ptep_clear_flush_young_notify ptep_clear_flush_young -#define ptep_clear_flush_notify ptep_clear_flush - #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -194,11 +194,13 @@ if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); + /* must invalidate_page _before_ freeing the page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(page); } } diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -1626,9 +1626,10 @@ */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - page_cache_release(old_page); + new_page = NULL; if (!pte_same(*page_table, orig_pte)) goto unlock; + page_cache_release(old_page); page_mkwrite = 1; } @@ -1644,6 +1645,7 @@ if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; + old_page = new_page = NULL; goto unlock; } @@ -1688,7 +1690,7 @@ * seen in the presence of one thread doing SMC and another * thread doing COW. 
*/ - ptep_clear_flush_notify(vma, address, page_table); + ptep_clear_flush(vma, address, page_table); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); @@ -1700,12 +1702,18 @@ } else mem_cgroup_uncharge_page(new_page); - if (new_page) +unlock: + pte_unmap_unlock(page_table, ptl); + + if (new_page) { + if (new_page == old_page) + /* cow happened, notify before releasing old_page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(new_page); + } if (old_page) page_cache_release(old_page); -unlock: - pte_unmap_unlock(page_table, ptl); + if (dirty_page) { if (vma->vm_file) file_update_time(vma->vm_file); diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c @@ -275,7 +275,7 @@ unsigned long address; pte_t *pte; spinlock_t *ptl; - int referenced = 0; + int referenced = 0, clear_flush_young = 0; address = vma_address(page, vma); if (address == -EFAULT) @@ -288,8 +288,11 @@ if (vma->vm_flags & VM_LOCKED) { referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young_notify(vma, address, pte)) - referenced++; + } else { + clear_flush_young = 1; + if (ptep_clear_flush_young(vma, address, pte)) + referenced++; + } /* Pretend the page is referenced if the task has the swap token and is in the middle of a page fault. */ @@ -299,6 +302,10 @@ (*mapcount)--; pte_unmap_unlock(pte, ptl); + + if (clear_flush_young) + referenced += mmu_notifier_clear_flush_young(mm, address); + out: return referenced; } @@ -457,7 +464,7 @@ pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush_notify(vma, address, pte); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -465,6 +472,10 @@ } pte_unmap_unlock(pte, ptl); + + if (ret) + mmu_notifier_invalidate_page(mm, address); + out: return ret; } @@ -717,15 +728,14 @@ * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young_notify(vma, address, pte)))) { + if (!migration && (vma->vm_flags & VM_LOCKED)) { ret = SWAP_FAIL; goto out_unmap; } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -780,6 +790,8 @@ out_unmap: pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL) + mmu_notifier_invalidate_page(mm, address); out: return ret; } @@ -818,7 +830,7 @@ spinlock_t *ptl; struct page *page; unsigned long address; - unsigned long end; + unsigned long start, end; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -839,6 +851,8 @@ if (!pmd_present(*pmd)) return; + start = address; + mmu_notifier_invalidate_range_start(mm, start, end); pte = pte_offset_map_lock(mm, pmd, address, &ptl); /* Update high watermark before we lower rss */ @@ -850,12 +864,12 @@ page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); - if (ptep_clear_flush_young_notify(vma, address, pte)) + if (ptep_clear_flush_young(vma, address, pte)) continue; /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) @@ -871,6 +885,7 @@ (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + mmu_notifier_invalidate_range_end(mm, start, end); } static int try_to_unmap_anon(struct page *page, int migration) |
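The net effect of the hunks above is easier to see stripped of the surrounding rmap details: the Linux pte is torn down while the page-table lock is held, and the notifier, which this series is making sleepable, only runs after the lock is dropped but before the page itself can be freed. A minimal sketch of that ordering (the helper name is invented; dirty-bit and error handling are omitted):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void zap_one_pte_sketch(struct vm_area_struct *vma,
			       unsigned long address, pte_t *pte,
			       spinlock_t *ptl)
{
	struct mm_struct *mm = vma->vm_mm;

	flush_cache_page(vma, address, pte_pfn(*pte));
	ptep_clear_flush(vma, address, pte);	/* still under the PT lock */
	pte_unmap_unlock(pte, ptl);		/* drop the PT lock */

	/*
	 * Only now notify the secondary MMU; this must still happen
	 * before any page_cache_release() of the page the pte mapped.
	 */
	mmu_notifier_invalidate_page(mm, address);
}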
From: Christoph L. <cla...@sg...> - 2008-04-14 19:57:50
|
Not sure why this patch is not merged into patch 2 of 9. Same comment as in the last round. |
From: Andrea A. <an...@qu...> - 2008-04-08 15:56:59
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666462 -7200 # Node ID baceb322b45ed43280654dac6c964c9d3d8a936f # Parent ec6d8f91b299cf26cce5c3d49bb25d35ee33c137 Core of mmu notifiers. Signed-off-by: Andrea Arcangeli <an...@qu...> Signed-off-by: Nick Piggin <np...@su...> Signed-off-by: Christoph Lameter <cla...@sg...> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -225,6 +225,9 @@ #ifdef CONFIG_CGROUP_MEM_RES_CTLR struct mem_cgroup *mem_cgroup; #endif +#ifdef CONFIG_MMU_NOTIFIER + struct hlist_head mmu_notifier_list; +#endif }; #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h new file mode 100644 --- /dev/null +++ b/include/linux/mmu_notifier.h @@ -0,0 +1,177 @@ +#ifndef _LINUX_MMU_NOTIFIER_H +#define _LINUX_MMU_NOTIFIER_H + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/mm_types.h> + +struct mmu_notifier; +struct mmu_notifier_ops; + +#ifdef CONFIG_MMU_NOTIFIER + +struct mmu_notifier_ops { + /* + * Called when nobody can register any more notifier in the mm + * and after the "mn" notifier has been disarmed already. + */ + void (*release)(struct mmu_notifier *mn, + struct mm_struct *mm); + + /* + * clear_flush_young is called after the VM is + * test-and-clearing the young/accessed bitflag in the + * pte. This way the VM will provide proper aging to the + * accesses to the page through the secondary MMUs and not + * only to the ones through the Linux pte. + */ + int (*clear_flush_young)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + + /* + * Before this is invoked any secondary MMU is still ok to + * read/write to the page previously pointed by the Linux pte + * because the old page hasn't been freed yet. If required + * set_page_dirty has to be called internally to this method. + */ + void (*invalidate_page)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + + /* + * invalidate_range_start() and invalidate_range_end() must be + * paired. Multiple invalidate_range_start/ends may be nested + * or called concurrently. 
+ */ + void (*invalidate_range_start)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end); + void (*invalidate_range_end)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end); +}; + +struct mmu_notifier { + struct hlist_node hlist; + const struct mmu_notifier_ops *ops; +}; + +static inline int mm_has_notifiers(struct mm_struct *mm) +{ + return unlikely(!hlist_empty(&mm->mmu_notifier_list)); +} + +extern int mmu_notifier_register(struct mmu_notifier *mn, + struct mm_struct *mm); +extern int mmu_notifier_unregister(struct mmu_notifier *mn, + struct mm_struct *mm); +extern void __mmu_notifier_release(struct mm_struct *mm); +extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long address); +extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, + unsigned long address); +extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end); +extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, + unsigned long start, unsigned long end); + + +static inline void mmu_notifier_release(struct mm_struct *mm) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_release(mm); +} + +static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_clear_flush_young(mm, address); + return 0; +} + +static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_invalidate_page(mm, address); +} + +static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_invalidate_range_start(mm, start, end); +} + +static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_invalidate_range_end(mm, start, end); +} + +static inline void mmu_notifier_mm_init(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->mmu_notifier_list); +} + +#define ptep_clear_flush_notify(__vma, __address, __ptep) \ +({ \ + pte_t __pte; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __pte = ptep_clear_flush(___vma, ___address, __ptep); \ + mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \ + __pte; \ +}) + +#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address); \ + __young; \ +}) + +#else /* CONFIG_MMU_NOTIFIER */ + +static inline void mmu_notifier_release(struct mm_struct *mm) +{ +} + +static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long address) +{ + return 0; +} + +static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, + unsigned long address) +{ +} + +static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + +static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + +static inline void mmu_notifier_mm_init(struct mm_struct *mm) +{ +} + +#define ptep_clear_flush_young_notify ptep_clear_flush_young +#define 
ptep_clear_flush_notify ptep_clear_flush + +#endif /* CONFIG_MMU_NOTIFIER */ + +#endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/kernel/fork.c b/kernel/fork.c --- a/kernel/fork.c +++ b/kernel/fork.c @@ -53,6 +53,7 @@ #include <linux/tty.h> #include <linux/proc_fs.h> #include <linux/blkdev.h> +#include <linux/mmu_notifier.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -362,6 +363,7 @@ if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; + mmu_notifier_mm_init(mm); return mm; } diff --git a/mm/Kconfig b/mm/Kconfig --- a/mm/Kconfig +++ b/mm/Kconfig @@ -193,3 +193,7 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS + +config MMU_NOTIFIER + def_bool y + bool "MMU notifier, for paging KVM/RDMA" diff --git a/mm/Makefile b/mm/Makefile --- a/mm/Makefile +++ b/mm/Makefile @@ -33,4 +33,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -194,7 +194,7 @@ if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); diff --git a/mm/fremap.c b/mm/fremap.c --- a/mm/fremap.c +++ b/mm/fremap.c @@ -15,6 +15,7 @@ #include <linux/rmap.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/mmu_notifier.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> @@ -214,7 +215,9 @@ spin_unlock(&mapping->i_mmap_lock); } + mmu_notifier_invalidate_range_start(mm, start, start + size); err = populate_range(mm, vma, start, size, pgoff); + mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { if (unlikely(has_write_lock)) { downgrade_write(&mm->mmap_sem); diff --git a/mm/hugetlb.c b/mm/hugetlb.c --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,6 +14,7 @@ #include <linux/mempolicy.h> #include <linux/cpuset.h> #include <linux/mutex.h> +#include <linux/mmu_notifier.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -799,6 +800,7 @@ BUG_ON(start & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK); + mmu_notifier_invalidate_range_start(mm, start, end); spin_lock(&mm->page_table_lock); for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); @@ -819,6 +821,7 @@ } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { list_del(&page->lru); put_page(page); diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -51,6 +51,7 @@ #include <linux/init.h> #include <linux/writeback.h> #include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> @@ -611,6 +612,9 @@ if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_start(src_mm, addr, end); + dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { @@ -621,6 +625,11 @@ vma, addr, next)) return -ENOMEM; } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_end(src_mm, + vma->vm_start, end); + return 0; } @@ -897,7 +906,9 @@ lru_add_drain(); tlb = 
tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); + mmu_notifier_invalidate_range_start(mm, address, end); end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + mmu_notifier_invalidate_range_end(mm, address, end); if (tlb) tlb_finish_mmu(tlb, address, end); return end; @@ -1463,10 +1474,11 @@ { pgd_t *pgd; unsigned long next; - unsigned long end = addr + size; + unsigned long start = addr, end = addr + size; int err; BUG_ON(addr >= end); + mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1474,6 +1486,7 @@ if (err) break; } while (pgd++, addr = next, addr != end); + mmu_notifier_invalidate_range_end(mm, start, end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -1675,7 +1688,7 @@ * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush(vma, address, page_table); + ptep_clear_flush_notify(vma, address, page_table); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -27,6 +27,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/vmalloc.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -1748,11 +1749,13 @@ lru_add_drain(); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); + mmu_notifier_invalidate_range_start(mm, start, end); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); tlb_finish_mmu(tlb, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); } /* @@ -2038,6 +2041,7 @@ unsigned long end; /* mm's last user has gone, and its about to be pulled down */ + mmu_notifier_release(mm); arch_exit_mmap(mm); lru_add_drain(); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 --- /dev/null +++ b/mm/mmu_notifier.c @@ -0,0 +1,126 @@ +/* + * linux/mm/mmu_notifier.c + * + * Copyright (C) 2008 Qumranet, Inc. + * Copyright (C) 2008 SGI + * Christoph Lameter <cla...@sg...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/mmu_notifier.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/err.h> + +/* + * No synchronization. This function can only be called when only a single + * process remains that performs teardown. + */ +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + + while (unlikely(!hlist_empty(&mm->mmu_notifier_list))) { + mn = hlist_entry(mm->mmu_notifier_list.first, + struct mmu_notifier, + hlist); + hlist_del(&mn->hlist); + if (mn->ops->release) + mn->ops->release(mn, mm); + } +} + +/* + * If no young bitflag is supported by the hardware, ->clear_flush_young can + * unmap the address and return 1 or 0 depending if the mapping previously + * existed or not. 
+ */ +int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) { + if (mn->ops->clear_flush_young) + young |= mn->ops->clear_flush_young(mn, mm, address); + } + + return young; +} + +void __mmu_notifier_invalidate_page(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) { + if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } +} + +void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) { + if (mn->ops->invalidate_range_start) + mn->ops->invalidate_range_start(mn, mm, start, end); + } +} + +void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) { + if (mn->ops->invalidate_range_end) + mn->ops->invalidate_range_end(mn, mm, start, end); + } +} + +/* + * Must not hold mmap_sem nor any other VM related lock when calling + * this registration function. + */ +int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct mm_lock_data *data; + + data = mm_lock(mm); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + hlist_add_head(&mn->hlist, &mm->mmu_notifier_list); + mm_unlock(mm, data); + return 0; +} +EXPORT_SYMBOL_GPL(mmu_notifier_register); + +/* + * mm_users can't go down to zero while mmu_notifier_unregister() + * runs or it can race with ->release. So a mm_users pin must + * be taken by the caller (if mm can be different from current->mm). 
+ */ +int mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct mm_lock_data *data; + + BUG_ON(!atomic_read(&mm->mm_users)); + + data = mm_lock(mm); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + hlist_del(&mn->hlist); + mm_unlock(mm, data); + return 0; +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister); diff --git a/mm/mprotect.c b/mm/mprotect.c --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -21,6 +21,7 @@ #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -198,10 +199,12 @@ dirty_accountable = 1; } + mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; diff --git a/mm/mremap.c b/mm/mremap.c --- a/mm/mremap.c +++ b/mm/mremap.c @@ -18,6 +18,7 @@ #include <linux/highmem.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -74,7 +75,11 @@ struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + unsigned long old_start; + old_start = old_addr; + mmu_notifier_invalidate_range_start(vma->vm_mm, + old_start, old_end); if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before @@ -116,6 +121,7 @@ pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); + mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } #define LATENCY_LIMIT (64 * PAGE_SIZE) diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c @@ -49,6 +49,7 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> @@ -287,7 +288,7 @@ if (vma->vm_flags & VM_LOCKED) { referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young(vma, address, pte)) + } else if (ptep_clear_flush_young_notify(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -456,7 +457,7 @@ pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, address, pte); + entry = ptep_clear_flush_notify(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -717,14 +718,14 @@ * skipped over this mm) then we should reactivate it. */ if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young(vma, address, pte)))) { + (ptep_clear_flush_young_notify(vma, address, pte)))) { ret = SWAP_FAIL; goto out_unmap; } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -849,12 +850,12 @@ page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); - if (ptep_clear_flush_young(vma, address, pte)) + if (ptep_clear_flush_young_notify(vma, address, pte)) continue; /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) |
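For readers wondering what a user of this API looks like, below is a hedged skeleton of an mmu_notifier_ops implementation wired to the five callbacks this patch defines. Everything prefixed my_ is an invented driver-side name; the callback signatures and the ops field names are the ones from include/linux/mmu_notifier.h above. Registration then boils down to filling mn.ops with &my_notifier_ops and calling mmu_notifier_register(), as in the cover-letter sketch.

#include <linux/mmu_notifier.h>

struct my_mmu {
	struct mmu_notifier mn;
	/* driver-private shadow page table state would live here */
};

static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* mm is going away and "mn" is already disarmed: drop everything */
}

static int my_clear_flush_young(struct mmu_notifier *mn,
				struct mm_struct *mm, unsigned long address)
{
	/* return 1 if the secondary mapping was recently accessed */
	return 0;
}

static void my_invalidate_page(struct mmu_notifier *mn,
			       struct mm_struct *mm, unsigned long address)
{
	/* the old page is still valid here; flush the secondary mapping */
}

static void my_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start, unsigned long end)
{
	/* stop establishing new secondary mappings inside [start, end) */
}

static void my_invalidate_range_end(struct mmu_notifier *mn,
				    struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	/* ...and flush whatever was mapped there before _start ran */
}

static const struct mmu_notifier_ops my_notifier_ops = {
	.release		= my_release,
	.clear_flush_young	= my_clear_flush_young,
	.invalidate_page	= my_invalidate_page,
	.invalidate_range_start	= my_invalidate_range_start,
	.invalidate_range_end	= my_invalidate_range_end,
};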
From: Christoph L. <cla...@sg...> - 2008-04-14 19:56:57
|
On Tue, 8 Apr 2008, Andrea Arcangeli wrote: > + /* > + * Called when nobody can register any more notifier in the mm > + * and after the "mn" notifier has been disarmed already. > + */ > + void (*release)(struct mmu_notifier *mn, > + struct mm_struct *mm); Hmmm... The unregister function does not call this. Guess driver calls unregister function and does release like stuff on its own. > + /* > + * invalidate_range_start() and invalidate_range_end() must be > + * paired. Multiple invalidate_range_start/ends may be nested > + * or called concurrently. > + */ How could they be nested or called concurrently? > +/* > + * mm_users can't go down to zero while mmu_notifier_unregister() > + * runs or it can race with ->release. So a mm_users pin must > + * be taken by the caller (if mm can be different from current->mm). > + */ > +int mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) > +{ > + struct mm_lock_data *data; > + > + BUG_ON(!atomic_read(&mm->mm_users)); > + > + data = mm_lock(mm); > + if (unlikely(IS_ERR(data))) > + return PTR_ERR(data); > + hlist_del(&mn->hlist); > + mm_unlock(mm, data); > + return 0; Hmmm.. Ok, the user of the notifier does not get notified that it was unregistered. |
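Independently of why the core VM may nest them, a notifier user has to be able to tolerate overlapping start/end pairs, since the callbacks carry no nesting information. One purely illustrative way to do that, not something this patch mandates and with an invented struct and names, is to count ranges in flight and refuse to establish new secondary mappings while the count is non-zero:

#include <linux/spinlock.h>

struct my_mmu_state {
	spinlock_t lock;
	int ranges_in_flight;	/* _start calls without a matching _end yet */
};

static void my_mmu_state_init(struct my_mmu_state *s)
{
	spin_lock_init(&s->lock);
	s->ranges_in_flight = 0;
}

static void my_range_start(struct my_mmu_state *s)
{
	spin_lock(&s->lock);
	s->ranges_in_flight++;
	spin_unlock(&s->lock);
}

static void my_range_end(struct my_mmu_state *s)
{
	spin_lock(&s->lock);
	s->ranges_in_flight--;
	spin_unlock(&s->lock);
}

static int my_may_establish_mapping(struct my_mmu_state *s)
{
	int ok;

	spin_lock(&s->lock);
	ok = (s->ranges_in_flight == 0);
	spin_unlock(&s->lock);
	return ok;
}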
From: Christoph L. <cla...@sg...> - 2008-04-14 19:59:50
|
Where is the documentation on locking that you wanted to provide? |
From: Andrea A. <an...@qu...> - 2008-04-08 15:57:04
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666893 -7200 # Node ID b0cb674314534b9cc4759603f123474d38427b2d # Parent 20e829e35dfeceeb55a816ef495afda10cd50b98 We no longer abort unmapping in unmap vmas because we can reschedule while unmapping since we are holding a semaphore. This would allow moving more of the tlb flusing into unmap_vmas reducing code in various places. Signed-off-by: Christoph Lameter <cla...@sg...> diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -723,8 +723,7 @@ struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, - struct vm_area_struct *start_vma, unsigned long start_addr, +unsigned long unmap_vmas(struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -805,7 +805,6 @@ /** * unmap_vmas - unmap a range of memory covered by a list of vma's - * @tlbp: address of the caller's struct mmu_gather * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -817,20 +816,13 @@ * Unmap all pages in the vma list. * * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. + * So zap pages in ZAP_BLOCK_SIZE bytecounts. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. - * - * unmap_vmas() assumes that the caller will flush the whole unmapped address - * range after unmap_vmas() returns. So the only responsibility here is to - * ensure that any thus-far unmapped pages are flushed before unmap_vmas() - * drops the lock and schedules. 
*/ -unsigned long unmap_vmas(struct mmu_gather **tlbp, - struct vm_area_struct *vma, unsigned long start_addr, +unsigned long unmap_vmas(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { @@ -838,7 +830,15 @@ unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; - int fullmm = (*tlbp)->fullmm; + int fullmm; + struct mmu_gather *tlb; + struct mm_struct *mm = vma->vm_mm; + + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); + fullmm = tlb->fullmm; for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -865,7 +865,7 @@ (HPAGE_SIZE / PAGE_SIZE); start = end; } else - start = unmap_page_range(*tlbp, vma, + start = unmap_page_range(tlb, vma, start, end, &zap_work, details); if (zap_work > 0) { @@ -873,13 +873,15 @@ break; } - tlb_finish_mmu(*tlbp, tlb_start, start); + tlb_finish_mmu(tlb, tlb_start, start); cond_resched(); - *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); + tlb = tlb_gather_mmu(vma->vm_mm, fullmm); tlb_start_valid = 0; zap_work = ZAP_BLOCK_SIZE; } } + tlb_finish_mmu(tlb, start_addr, end_addr); + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ } @@ -893,20 +895,10 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; unsigned long end = address + size; unsigned long nr_accounted = 0; - lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); - update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, address, end); - end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); - mmu_notifier_invalidate_range_end(mm, address, end); - if (tlb) - tlb_finish_mmu(tlb, address, end); - return end; + return unmap_vmas(vma, address, end, &nr_accounted, details); } /* diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1743,19 +1743,12 @@ unsigned long start, unsigned long end) { struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; - struct mmu_gather *tlb; unsigned long nr_accounted = 0; - lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); - update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, start, end); - unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); + unmap_vmas(vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - tlb_finish_mmu(tlb, start, end); free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); - mmu_notifier_invalidate_range_end(mm, start, end); } /* @@ -2035,7 +2028,6 @@ /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { - struct mmu_gather *tlb; struct vm_area_struct *vma = mm->mmap; unsigned long nr_accounted = 0; unsigned long end; @@ -2046,12 +2038,9 @@ lru_add_drain(); flush_cache_mm(mm); - tlb = tlb_gather_mmu(mm, 1); - /* Don't update_hiwater_rss(mm) here, do_exit already did */ - /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + + end = unmap_vmas(vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - tlb_finish_mmu(tlb, 0, end); free_pgtables(vma, FIRST_USER_ADDRESS, 0); /* |
From: Andrea A. <an...@qu...> - 2008-04-08 15:57:04
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666972 -7200 # Node ID 3b14e26a4e0491f00bb989be04d8b7e0755ed2d7 # Parent a0c52e4b9b71e2627238b69c0a58905097973279 XPMEM would have used sys_madvise() except that madvise_dontneed() returns an -EINVAL if VM_PFNMAP is set, which is always true for the pages XPMEM imports from other partitions and is also true for uncached pages allocated locally via the mspec allocator. XPMEM needs zap_page_range() functionality for these types of pages as well as 'normal' pages. Signed-off-by: Dean Nelson <dc...@sg...> diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -900,6 +900,7 @@ return unmap_vmas(vma, address, end, &nr_accounted, details); } +EXPORT_SYMBOL_GPL(zap_page_range); /* * Do a quick page-table lookup for a single page. |
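For context on what the export enables, here is a hedged sketch of a module zapping part of a user mapping through this interface. my_zap_user_range is an invented name and XPMEM's real call site is not shown; the zap_page_range() signature and the NULL details argument are the existing ones, and the mmap_sem read lock follows the usual madvise-style rules.

#include <linux/mm.h>

static void my_zap_user_range(struct mm_struct *mm,
			      unsigned long start, unsigned long len)
{
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (vma && vma->vm_start <= start)
		zap_page_range(vma, start, len, NULL);	/* works on VM_PFNMAP too */
	up_read(&mm->mmap_sem);
}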
From: Andrea A. <an...@qu...> - 2008-04-08 15:57:04
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666463 -7200 # Node ID 2c2ed514f294dbbfc66157f771bc900789ac6005 # Parent 33de2e17d0f5670515833bf8d3d2ea19e2a85b09 Move the tlb flushing into free_pgtables. The conversion of the locks taken for reverse map scanning would require taking sleeping locks in free_pgtables(). Moving the tlb flushing into free_pgtables allows sleeping in parts of free_pgtables(). This means that we do a tlb_finish_mmu() before freeing the page tables. Strictly speaking there may not be the need to do another tlb flush after freeing the tables. But its the only way to free a series of page table pages from the tlb list. And we do not want to call into the page allocator for performance reasons. Aim9 numbers look okay after this patch. Signed-off-by: Christoph Lameter <cla...@sg...> diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -751,8 +751,8 @@ void *private); void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); +void free_pgtables(struct vm_area_struct *start_vma, unsigned long floor, + unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -272,9 +272,11 @@ } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, - unsigned long floor, unsigned long ceiling) +void free_pgtables(struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling) { + struct mmu_gather *tlb; + while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; @@ -286,8 +288,10 @@ unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { - hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + tlb = tlb_gather_mmu(vma->vm_mm, 0); + hugetlb_free_pgd_range(&tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); + tlb_finish_mmu(tlb, addr, vma->vm_end); } else { /* * Optimization: gather nearby vmas into one call down @@ -299,8 +303,10 @@ anon_vma_unlink(vma); unlink_file_vma(vma); } - free_pgd_range(tlb, addr, vma->vm_end, + tlb = tlb_gather_mmu(vma->vm_mm, 0); + free_pgd_range(&tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); + tlb_finish_mmu(tlb, addr, vma->vm_end); } vma = next; } diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1752,9 +1752,9 @@ mmu_notifier_invalidate_range_start(mm, start, end); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + tlb_finish_mmu(tlb, start, end); + free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); - tlb_finish_mmu(tlb, start, end); mmu_notifier_invalidate_range_end(mm, start, end); } @@ -2051,8 +2051,8 @@ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); + free_pgtables(vma, FIRST_USER_ADDRESS, 0); /* * Walk the list again, actually closing and freeing it, |
From: Andrea A. <an...@qu...> - 2008-04-08 15:57:05
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666968 -7200 # Node ID a0c52e4b9b71e2627238b69c0a58905097973279 # Parent b0cb674314534b9cc4759603f123474d38427b2d Convert the anon_vma spinlock to a rw semaphore. This allows concurrent traversal of reverse maps for try_to_unmap and page_mkclean. It also allows the calling of sleeping functions from reverse map traversal. An additional complication is that rcu is used in some context to guarantee the presence of the anon_vma while we acquire the lock. We cannot take a semaphore within an rcu critical section. Add a refcount to the anon_vma structure which allow us to give an existence guarantee for the anon_vma structure independent of the spinlock or the list contents. The refcount can then be taken within the RCU section. If it has been taken successfully then the refcount guarantees the existence of the anon_vma. The refcount in anon_vma also allows us to fix a nasty issue in page migration where we fudged by using rcu for a long code path to guarantee the existence of the anon_vma. The refcount in general allows a shortening of RCU critical sections since we can do a rcu_unlock after taking the refcount. This is particularly relevant if the anon_vma chains contain hundreds of entries. Issues: - Atomic overhead increases in situations where a new reference to the anon_vma has to be established or removed. Overhead also increases when a speculative reference is used (try_to_unmap, page_mkclean, page migration). There is also the more frequent processor change due to up_xxx letting waiting tasks run first. This results in f.e. the Aim9 brk performance test to got down by 10-15%. Signed-off-by: Christoph Lameter <cla...@sg...> diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1051,9 +1051,9 @@ struct mm_lock_data { struct rw_semaphore **i_mmap_sems; - spinlock_t **anon_vma_locks; + struct rw_semaphore **anon_vma_sems; unsigned long nr_i_mmap_sems; - unsigned long nr_anon_vma_locks; + unsigned long nr_anon_vma_sems; }; extern struct mm_lock_data *mm_lock(struct mm_struct * mm); extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data); diff --git a/include/linux/rmap.h b/include/linux/rmap.h --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -25,7 +25,8 @@ * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { - spinlock_t lock; /* Serialize access to vma list */ + atomic_t refcount; /* vmas on the list */ + struct rw_semaphore sem;/* Serialize access to vma list */ struct list_head head; /* List of private "related" vmas */ }; @@ -43,18 +44,31 @@ kmem_cache_free(anon_vma_cachep, anon_vma); } +struct anon_vma *grab_anon_vma(struct page *page); + +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->refcount); +} + +static inline void put_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_test(&anon_vma->refcount)) + anon_vma_free(anon_vma); +} + static inline void anon_vma_lock(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); } static inline void anon_vma_unlock(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); } /* diff --git a/mm/migrate.c b/mm/migrate.c --- a/mm/migrate.c +++ b/mm/migrate.c @@ -235,15 +235,16 @@ return; /* - * We hold the mmap_sem lock. 
So no need to call page_lock_anon_vma. + * We hold either the mmap_sem lock or a reference on the + * anon_vma. So no need to call page_lock_anon_vma. */ anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); + down_read(&anon_vma->sem); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) remove_migration_pte(vma, old, new); - spin_unlock(&anon_vma->lock); + up_read(&anon_vma->sem); } /* @@ -623,7 +624,7 @@ int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); - int rcu_locked = 0; + struct anon_vma *anon_vma = NULL; int charge = 0; if (!newpage) @@ -647,16 +648,14 @@ } /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, - * we cannot notice that anon_vma is freed while we migrates a page. + * we cannot notice that anon_vma is freed while we migrate a page. * This rcu_read_lock() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ - if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - } + if (PageAnon(page)) + anon_vma = grab_anon_vma(page); /* * Corner case handling: @@ -674,10 +673,7 @@ if (!PageAnon(page) && PagePrivate(page)) { /* * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. + * that's what try_to_release_page() would do anyway */ try_to_free_buffers(page); } @@ -698,8 +694,8 @@ } else if (charge) mem_cgroup_end_migration(newpage); rcu_unlock: - if (rcu_locked) - rcu_read_unlock(); + if (anon_vma) + put_anon_vma(anon_vma); unlock: diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -566,7 +566,7 @@ if (vma->anon_vma) anon_vma = vma->anon_vma; if (anon_vma) { - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); /* * Easily overlooked: when mprotect shifts the boundary, * make sure the expanding vma has anon_vma set if the @@ -620,7 +620,7 @@ } if (anon_vma) - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); if (mapping) up_write(&mapping->i_mmap_sem); @@ -2247,16 +2247,15 @@ struct mm_lock_data *mm_lock(struct mm_struct * mm) { struct vm_area_struct *vma; - struct rw_semaphore *i_mmap_sem_last; - spinlock_t *anon_vma_lock_last; - unsigned long nr_i_mmap_sems, nr_anon_vma_locks, i; + struct rw_semaphore *i_mmap_sem_last, *anon_vma_sem_last; + unsigned long nr_i_mmap_sems, nr_anon_vma_sems, i; struct mm_lock_data *data; int err; down_write(&mm->mmap_sem); err = -EINTR; - nr_i_mmap_sems = nr_anon_vma_locks = 0; + nr_i_mmap_sems = nr_anon_vma_sems = 0; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); if (unlikely(signal_pending(current))) @@ -2265,7 +2264,7 @@ if (vma->vm_file && vma->vm_file->f_mapping) nr_i_mmap_sems++; if (vma->anon_vma) - nr_anon_vma_locks++; + nr_anon_vma_sems++; } err = -ENOMEM; @@ -2281,13 +2280,13 @@ } else data->i_mmap_sems = NULL; - if (nr_anon_vma_locks) { - data->anon_vma_locks = vmalloc(nr_anon_vma_locks * - sizeof(spinlock_t)); - if (!data->anon_vma_locks) + if (nr_anon_vma_sems) { + data->anon_vma_sems = vmalloc(nr_anon_vma_sems * + sizeof(struct rw_semaphore)); + if (!data->anon_vma_sems) goto out_vfree; } else - data->anon_vma_locks = NULL; + data->anon_vma_sems = NULL; err = -EINTR; i_mmap_sem_last = NULL; @@ -2318,10 +2317,11 @@ } 
data->nr_i_mmap_sems = nr_i_mmap_sems; - anon_vma_lock_last = NULL; - nr_anon_vma_locks = 0; + anon_vma_sem_last = NULL; + nr_anon_vma_sems = 0; for (;;) { - spinlock_t *anon_vma_lock = (spinlock_t *) -1UL; + struct rw_semaphore *anon_vma_sem; + anon_vma_sem = (struct rw_semaphore *) -1UL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); if (unlikely(signal_pending(current))) @@ -2329,28 +2329,28 @@ if (!vma->anon_vma) continue; - if ((unsigned long) anon_vma_lock > - (unsigned long) &vma->anon_vma->lock && - (unsigned long) &vma->anon_vma->lock > - (unsigned long) anon_vma_lock_last) - anon_vma_lock = &vma->anon_vma->lock; + if ((unsigned long) anon_vma_sem > + (unsigned long) &vma->anon_vma->sem && + (unsigned long) &vma->anon_vma->sem > + (unsigned long) anon_vma_sem_last) + anon_vma_sem = &vma->anon_vma->sem; } - if (anon_vma_lock == (spinlock_t *) -1UL) + if (anon_vma_sem == (struct rw_semaphore *) -1UL) break; - anon_vma_lock_last = anon_vma_lock; - data->anon_vma_locks[nr_anon_vma_locks++] = anon_vma_lock; + anon_vma_sem_last = anon_vma_sem; + data->anon_vma_sems[nr_anon_vma_sems++] = anon_vma_sem; } - data->nr_anon_vma_locks = nr_anon_vma_locks; + data->nr_anon_vma_sems = nr_anon_vma_sems; for (i = 0; i < nr_i_mmap_sems; i++) down_write(data->i_mmap_sems[i]); - for (i = 0; i < nr_anon_vma_locks; i++) - spin_lock(data->anon_vma_locks[i]); + for (i = 0; i < nr_anon_vma_sems; i++) + down_write(data->anon_vma_sems[i]); return data; out_vfree_both: - vfree(data->anon_vma_locks); + vfree(data->anon_vma_sems); out_vfree: vfree(data->i_mmap_sems); out_kfree: @@ -2366,12 +2366,12 @@ for (i = 0; i < data->nr_i_mmap_sems; i++) up_write(data->i_mmap_sems[i]); - for (i = 0; i < data->nr_anon_vma_locks; i++) - spin_unlock(data->anon_vma_locks[i]); + for (i = 0; i < data->nr_anon_vma_sems; i++) + up_write(data->anon_vma_sems[i]); up_write(&mm->mmap_sem); vfree(data->i_mmap_sems); - vfree(data->anon_vma_locks); + vfree(data->anon_vma_sems); kfree(data); } diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c @@ -69,7 +69,7 @@ if (anon_vma) { allocated = NULL; locked = anon_vma; - spin_lock(&locked->lock); + down_write(&locked->sem); } else { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) @@ -81,6 +81,7 @@ /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { + get_anon_vma(anon_vma); vma->anon_vma = anon_vma; list_add_tail(&vma->anon_vma_node, &anon_vma->head); allocated = NULL; @@ -88,7 +89,7 @@ spin_unlock(&mm->page_table_lock); if (locked) - spin_unlock(&locked->lock); + up_write(&locked->sem); if (unlikely(allocated)) anon_vma_free(allocated); } @@ -99,14 +100,17 @@ { BUG_ON(vma->anon_vma != next->anon_vma); list_del(&next->anon_vma_node); + put_anon_vma(vma->anon_vma); } void __anon_vma_link(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) + if (anon_vma) { + get_anon_vma(anon_vma); list_add_tail(&vma->anon_vma_node, &anon_vma->head); + } } void anon_vma_link(struct vm_area_struct *vma) @@ -114,36 +118,32 @@ struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) { - spin_lock(&anon_vma->lock); + get_anon_vma(anon_vma); + down_write(&anon_vma->sem); list_add_tail(&vma->anon_vma_node, &anon_vma->head); - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); } } void anon_vma_unlink(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; - int empty; if (!anon_vma) return; - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); 
list_del(&vma->anon_vma_node); - - /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - - if (empty) - anon_vma_free(anon_vma); + up_write(&anon_vma->sem); + put_anon_vma(anon_vma); } static void anon_vma_ctor(struct kmem_cache *cachep, void *data) { struct anon_vma *anon_vma = data; - spin_lock_init(&anon_vma->lock); + init_rwsem(&anon_vma->sem); + atomic_set(&anon_vma->refcount, 0); INIT_LIST_HEAD(&anon_vma->head); } @@ -157,9 +157,9 @@ * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *grab_anon_vma(struct page *page) { - struct anon_vma *anon_vma; + struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); @@ -170,17 +170,26 @@ goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - return anon_vma; + if (!atomic_inc_not_zero(&anon_vma->refcount)) + anon_vma = NULL; out: rcu_read_unlock(); - return NULL; + return anon_vma; +} + +static struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma = grab_anon_vma(page); + + if (anon_vma) + down_read(&anon_vma->sem); + return anon_vma; } static void page_unlock_anon_vma(struct anon_vma *anon_vma) { - spin_unlock(&anon_vma->lock); - rcu_read_unlock(); + up_read(&anon_vma->sem); + put_anon_vma(anon_vma); } /* |
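The new refcount is what lets rmap walkers sleep: instead of staying inside rcu_read_lock() for the whole traversal, they pin the anon_vma and then take the semaphore. The pattern from the page_lock_anon_vma()/migration hunks above, distilled into one sketch (my_walk_anon_rmap and my_walk_one_vma are invented names; grab_anon_vma, put_anon_vma and anon_vma->sem are the ones added by this patch):

#include <linux/mm.h>
#include <linux/rmap.h>

static void my_walk_one_vma(struct vm_area_struct *vma, struct page *page)
{
	/* per-vma work would go here */
}

static void my_walk_anon_rmap(struct page *page)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;

	anon_vma = grab_anon_vma(page);	/* atomic_inc_not_zero under RCU */
	if (!anon_vma)
		return;

	down_read(&anon_vma->sem);	/* may sleep: no RCU read lock held */
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
		my_walk_one_vma(vma, page);
	up_read(&anon_vma->sem);

	put_anon_vma(anon_vma);		/* last ref frees the anon_vma */
}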
From: Andrea A. <an...@qu...> - 2008-04-08 15:57:06
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666972 -7200 # Node ID bd55023b22769ecb14b26c2347947f7d6d63bcea # Parent 3b14e26a4e0491f00bb989be04d8b7e0755ed2d7 This patch adds a lock ordering rule to avoid a potential deadlock when multiple mmap_sems need to be locked. Signed-off-by: Dean Nelson <dc...@sg...> diff --git a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c +++ b/mm/filemap.c @@ -79,6 +79,9 @@ * * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) + * + * When taking multiple mmap_sems, one should lock the lowest-addressed + * one first proceeding on up to the highest-addressed one. * * ->i_mutex * ->i_alloc_sem (various) |
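A minimal sketch of the rule being documented, assuming exactly two address spaces are involved; the helper is invented, and the lockdep nesting annotation is only one plausible way to keep the validator quiet about taking two locks of the same class.

#include <linux/mm_types.h>
#include <linux/rwsem.h>
#include <linux/lockdep.h>

/* Invented helper: take both mmap_sems, lowest-addressed one first. */
static void my_lock_two_mmap_sems(struct mm_struct *a, struct mm_struct *b)
{
	if (a == b) {
		down_write(&a->mmap_sem);
		return;
	}
	if (a > b) {
		struct mm_struct *tmp = a;	/* enforce lower address first */
		a = b;
		b = tmp;
	}
	down_write(&a->mmap_sem);
	down_write_nested(&b->mmap_sem, SINGLE_DEPTH_NESTING);
}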
From: Andrea A. <an...@qu...> - 2008-04-08 15:57:07
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666463 -7200 # Node ID 20e829e35dfeceeb55a816ef495afda10cd50b98 # Parent 2c2ed514f294dbbfc66157f771bc900789ac6005 The conversion to a rwsem allows callbacks during rmap traversal for files in a non atomic context. A rw style lock also allows concurrent walking of the reverse map. This is fairly straightforward if one removes pieces of the resched checking. [Restarting unmapping is an issue to be discussed]. This slightly increases Aim9 performance results on an 8p. Signed-off-by: Andrea Arcangeli <an...@qu...> Signed-off-by: Christoph Lameter <cla...@sg...> diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -69,7 +69,7 @@ if (!vma_shareable(vma, addr)) return; - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -94,7 +94,7 @@ put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); } /* diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -454,10 +454,10 @@ pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); truncate_hugepages(inode, offset); return 0; } diff --git a/fs/inode.c b/fs/inode.c --- a/fs/inode.c +++ b/fs/inode.c @@ -210,7 +210,7 @@ INIT_LIST_HEAD(&inode->i_devices); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); rwlock_init(&inode->i_data.tree_lock); - spin_lock_init(&inode->i_data.i_mmap_lock); + init_rwsem(&inode->i_data.i_mmap_sem); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); diff --git a/include/linux/fs.h b/include/linux/fs.h --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -503,7 +503,7 @@ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ - spinlock_t i_mmap_lock; /* protect tree, count, list */ + struct rw_semaphore i_mmap_sem; /* protect tree, count, list */ unsigned int truncate_count; /* Cover race condition with truncate */ unsigned long nrpages; /* number of total pages */ pgoff_t writeback_index;/* writeback starts here */ diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -716,7 +716,7 @@ struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */ + struct rw_semaphore *i_mmap_sem; /* For unmap_mapping_range: */ unsigned long truncate_count; /* Compare vm_truncate_count */ }; @@ -1051,9 +1051,9 @@ unsigned long flags, struct page **pages); struct mm_lock_data { - spinlock_t **i_mmap_locks; + struct rw_semaphore **i_mmap_sems; spinlock_t **anon_vma_locks; - unsigned long nr_i_mmap_locks; + unsigned long nr_i_mmap_sems; unsigned long nr_anon_vma_locks; }; extern struct mm_lock_data *mm_lock(struct mm_struct * mm); diff --git a/kernel/fork.c 
b/kernel/fork.c --- a/kernel/fork.c +++ b/kernel/fork.c @@ -274,12 +274,12 @@ atomic_dec(&inode->i_writecount); /* insert tmp into the share list, just after mpnt */ - spin_lock(&file->f_mapping->i_mmap_lock); + down_write(&file->f_mapping->i_mmap_sem); tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(file->f_mapping); vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(file->f_mapping); - spin_unlock(&file->f_mapping->i_mmap_lock); + up_write(&file->f_mapping->i_mmap_sem); } /* diff --git a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c +++ b/mm/filemap.c @@ -61,16 +61,16 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_sem (vmtruncate) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex - * ->i_mmap_lock (truncate->unmap_mapping_range) + * ->i_mmap_sem (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_mmap_lock + * ->i_mmap_sem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * @@ -87,7 +87,7 @@ * ->sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->i_mmap_lock + * ->i_mmap_sem * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -184,7 +184,7 @@ if (!page) return; - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + @@ -204,7 +204,7 @@ page_cache_release(page); } } - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); } /* diff --git a/mm/fremap.c b/mm/fremap.c --- a/mm/fremap.c +++ b/mm/fremap.c @@ -206,13 +206,13 @@ } goto out; } - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; vma_prio_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); } mmu_notifier_invalidate_range_start(mm, start, start + size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -790,7 +790,7 @@ struct page *page; struct page *tmp; /* - * A page gathering list, protected by per file i_mmap_lock. The + * A page gathering list, protected by per file i_mmap_sem. The * lock is used to avoid list corruption from multiple unmapping * of the same page since we are using page->lru. */ @@ -840,9 +840,9 @@ * do nothing in this case. 
*/ if (vma->vm_file) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + down_write(&vma->vm_file->f_mapping->i_mmap_sem); __unmap_hugepage_range(vma, start, end); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + up_write(&vma->vm_file->f_mapping->i_mmap_sem); } } @@ -1085,7 +1085,7 @@ BUG_ON(address >= end); flush_cache_range(vma, address, end); - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + down_write(&vma->vm_file->f_mapping->i_mmap_sem); spin_lock(&mm->page_table_lock); for (; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); @@ -1100,7 +1100,7 @@ } } spin_unlock(&mm->page_table_lock); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + up_write(&vma->vm_file->f_mapping->i_mmap_sem); flush_tlb_range(vma, start, end); } diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -838,7 +838,6 @@ unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; - spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; int fullmm = (*tlbp)->fullmm; for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { @@ -875,22 +874,12 @@ } tlb_finish_mmu(*tlbp, tlb_start, start); - - if (need_resched() || - (i_mmap_lock && spin_needbreak(i_mmap_lock))) { - if (i_mmap_lock) { - *tlbp = NULL; - goto out; - } - cond_resched(); - } - + cond_resched(); *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); tlb_start_valid = 0; zap_work = ZAP_BLOCK_SIZE; } } -out: return start; /* which is now the end (or restart) address */ } @@ -1752,7 +1741,7 @@ /* * Helper functions for unmap_mapping_range(). * - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ + * __ Notes on dropping i_mmap_sem to reduce latency while unmapping __ * * We have to restart searching the prio_tree whenever we drop the lock, * since the iterator is only valid while the lock is held, and anyway @@ -1771,7 +1760,7 @@ * can't efficiently keep all vmas in step with mapping->truncate_count: * so instead reset them all whenever it wraps back to 0 (then go to 1). * mapping->truncate_count and vma->vm_truncate_count are protected by - * i_mmap_lock. + * i_mmap_sem. 
* * In order to make forward progress despite repeatedly restarting some * large vma, note the restart_addr from unmap_vmas when it breaks out: @@ -1821,7 +1810,7 @@ restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - need_break = need_resched() || spin_needbreak(details->i_mmap_lock); + need_break = need_resched(); if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ @@ -1835,9 +1824,9 @@ goto again; } - spin_unlock(details->i_mmap_lock); + up_write(details->i_mmap_sem); cond_resched(); - spin_lock(details->i_mmap_lock); + down_write(details->i_mmap_sem); return -EINTR; } @@ -1931,9 +1920,9 @@ details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - details.i_mmap_lock = &mapping->i_mmap_lock; + details.i_mmap_sem = &mapping->i_mmap_sem; - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); /* Protect against endless unmapping loops */ mapping->truncate_count++; @@ -1948,7 +1937,7 @@ unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); } EXPORT_SYMBOL(unmap_mapping_range); diff --git a/mm/migrate.c b/mm/migrate.c --- a/mm/migrate.c +++ b/mm/migrate.c @@ -211,12 +211,12 @@ if (!mapping) return; - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) remove_migration_pte(vma, old, new); - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); } /* diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -188,7 +188,7 @@ } /* - * Requires inode->i_mapping->i_mmap_lock + * Requires inode->i_mapping->i_mmap_sem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) @@ -216,9 +216,9 @@ if (file) { struct address_space *mapping = file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); __remove_shared_vm_struct(vma, file, mapping); - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); } } @@ -441,7 +441,7 @@ mapping = vma->vm_file->f_mapping; if (mapping) { - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); vma->vm_truncate_count = mapping->truncate_count; } anon_vma_lock(vma); @@ -451,7 +451,7 @@ anon_vma_unlock(vma); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); mm->map_count++; validate_mm(mm); @@ -538,7 +538,7 @@ mapping = file->f_mapping; if (!(vma->vm_flags & VM_NONLINEAR)) root = &mapping->i_mmap; - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); if (importer && vma->vm_truncate_count != next->vm_truncate_count) { /* @@ -622,7 +622,7 @@ if (anon_vma) spin_unlock(&anon_vma->lock); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); if (remove_next) { if (file) @@ -2066,7 +2066,7 @@ /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_lock is taken here. + * then i_mmap_sem is taken here. 
*/ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) { @@ -2258,22 +2258,23 @@ struct mm_lock_data *mm_lock(struct mm_struct * mm) { struct vm_area_struct *vma; - spinlock_t *i_mmap_lock_last, *anon_vma_lock_last; - unsigned long nr_i_mmap_locks, nr_anon_vma_locks, i; + struct rw_semaphore *i_mmap_sem_last; + spinlock_t *anon_vma_lock_last; + unsigned long nr_i_mmap_sems, nr_anon_vma_locks, i; struct mm_lock_data *data; int err; down_write(&mm->mmap_sem); err = -EINTR; - nr_i_mmap_locks = nr_anon_vma_locks = 0; + nr_i_mmap_sems = nr_anon_vma_locks = 0; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); if (unlikely(signal_pending(current))) goto out; if (vma->vm_file && vma->vm_file->f_mapping) - nr_i_mmap_locks++; + nr_i_mmap_sems++; if (vma->anon_vma) nr_anon_vma_locks++; } @@ -2283,13 +2284,13 @@ if (!data) goto out; - if (nr_i_mmap_locks) { - data->i_mmap_locks = vmalloc(nr_i_mmap_locks * - sizeof(spinlock_t)); - if (!data->i_mmap_locks) + if (nr_i_mmap_sems) { + data->i_mmap_sems = vmalloc(nr_i_mmap_sems * + sizeof(struct rw_semaphore)); + if (!data->i_mmap_sems) goto out_kfree; } else - data->i_mmap_locks = NULL; + data->i_mmap_sems = NULL; if (nr_anon_vma_locks) { data->anon_vma_locks = vmalloc(nr_anon_vma_locks * @@ -2300,10 +2301,11 @@ data->anon_vma_locks = NULL; err = -EINTR; - i_mmap_lock_last = NULL; - nr_i_mmap_locks = 0; + i_mmap_sem_last = NULL; + nr_i_mmap_sems = 0; for (;;) { - spinlock_t *i_mmap_lock = (spinlock_t *) -1UL; + struct rw_semaphore *i_mmap_sem; + i_mmap_sem = (struct rw_semaphore *) -1UL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); if (unlikely(signal_pending(current))) @@ -2311,21 +2313,21 @@ if (!vma->vm_file || !vma->vm_file->f_mapping) continue; - if ((unsigned long) i_mmap_lock > + if ((unsigned long) i_mmap_sem > (unsigned long) - &vma->vm_file->f_mapping->i_mmap_lock && + &vma->vm_file->f_mapping->i_mmap_sem && (unsigned long) - &vma->vm_file->f_mapping->i_mmap_lock > - (unsigned long) i_mmap_lock_last) - i_mmap_lock = - &vma->vm_file->f_mapping->i_mmap_lock; + &vma->vm_file->f_mapping->i_mmap_sem > + (unsigned long) i_mmap_sem_last) + i_mmap_sem = + &vma->vm_file->f_mapping->i_mmap_sem; } - if (i_mmap_lock == (spinlock_t *) -1UL) + if (i_mmap_sem == (struct rw_semaphore *) -1UL) break; - i_mmap_lock_last = i_mmap_lock; - data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock; + i_mmap_sem_last = i_mmap_sem; + data->i_mmap_sems[nr_i_mmap_sems++] = i_mmap_sem; } - data->nr_i_mmap_locks = nr_i_mmap_locks; + data->nr_i_mmap_sems = nr_i_mmap_sems; anon_vma_lock_last = NULL; nr_anon_vma_locks = 0; @@ -2351,8 +2353,8 @@ } data->nr_anon_vma_locks = nr_anon_vma_locks; - for (i = 0; i < nr_i_mmap_locks; i++) - spin_lock(data->i_mmap_locks[i]); + for (i = 0; i < nr_i_mmap_sems; i++) + down_write(data->i_mmap_sems[i]); for (i = 0; i < nr_anon_vma_locks; i++) spin_lock(data->anon_vma_locks[i]); @@ -2361,7 +2363,7 @@ out_vfree_both: vfree(data->anon_vma_locks); out_vfree: - vfree(data->i_mmap_locks); + vfree(data->i_mmap_sems); out_kfree: kfree(data); out: @@ -2373,14 +2375,14 @@ { unsigned long i; - for (i = 0; i < data->nr_i_mmap_locks; i++) - spin_unlock(data->i_mmap_locks[i]); + for (i = 0; i < data->nr_i_mmap_sems; i++) + up_write(data->i_mmap_sems[i]); for (i = 0; i < data->nr_anon_vma_locks; i++) spin_unlock(data->anon_vma_locks[i]); up_write(&mm->mmap_sem); - vfree(data->i_mmap_locks); + vfree(data->i_mmap_sems); vfree(data->anon_vma_locks); kfree(data); } diff --git a/mm/mremap.c b/mm/mremap.c --- 
a/mm/mremap.c +++ b/mm/mremap.c @@ -88,7 +88,7 @@ * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); if (new_vma->vm_truncate_count && new_vma->vm_truncate_count != vma->vm_truncate_count) new_vma->vm_truncate_count = 0; @@ -120,7 +120,7 @@ pte_unmap_nested(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_lock + * mapping->i_mmap_sem * anon_vma->lock * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) @@ -373,14 +373,14 @@ * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_lock. + * so we can safely take mapping->i_mmap_sem. */ BUG_ON(!PageLocked(page)); - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); /* - * i_mmap_lock does not stabilize mapcount at all, but mapcount + * i_mmap_sem does not stabilize mapcount at all, but mapcount * is more likely to be accurate if we note it after spinning. */ mapcount = page_mapcount(page); @@ -403,7 +403,7 @@ break; } - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); return referenced; } @@ -489,12 +489,12 @@ BUG_ON(PageAnon(page)); - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) ret += page_mkclean_one(page, vma); } - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); return ret; } @@ -930,7 +930,7 @@ unsigned long max_nl_size = 0; unsigned int mapcount; - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { ret = try_to_unmap_one(page, vma, migration); if (ret == SWAP_FAIL || !page_mapped(page)) @@ -967,7 +967,6 @@ mapcount = page_mapcount(page); if (!mapcount) goto out; - cond_resched_lock(&mapping->i_mmap_lock); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; if (max_nl_cursor == 0) @@ -989,7 +988,6 @@ } vma->vm_private_data = (void *) max_nl_cursor; } - cond_resched_lock(&mapping->i_mmap_lock); max_nl_cursor += CLUSTER_SIZE; } while (max_nl_cursor <= max_nl_size); @@ -1001,7 +999,7 @@ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) vma->vm_private_data = NULL; out: - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); return ret; } |
From: Andrea A. <an...@qu...> - 2008-04-08 15:57:33
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1207666462 -7200 # Node ID ec6d8f91b299cf26cce5c3d49bb25d35ee33c137 # Parent d4c25404de6376297ed34fada14cd6b894410eb0 Lock the entire mm to prevent any mmu related operation to happen. Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1050,6 +1050,15 @@ unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); +struct mm_lock_data { + spinlock_t **i_mmap_locks; + spinlock_t **anon_vma_locks; + unsigned long nr_i_mmap_locks; + unsigned long nr_anon_vma_locks; +}; +extern struct mm_lock_data *mm_lock(struct mm_struct * mm); +extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data); + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -26,6 +26,7 @@ #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -2242,3 +2243,140 @@ return 0; } + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. The holder + * must not hold any mm related lock. A single task can't take more + * than one mm lock in a row or it would deadlock. + */ +struct mm_lock_data *mm_lock(struct mm_struct * mm) +{ + struct vm_area_struct *vma; + spinlock_t *i_mmap_lock_last, *anon_vma_lock_last; + unsigned long nr_i_mmap_locks, nr_anon_vma_locks, i; + struct mm_lock_data *data; + int err; + + down_write(&mm->mmap_sem); + + err = -EINTR; + nr_i_mmap_locks = nr_anon_vma_locks = 0; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + cond_resched(); + if (unlikely(signal_pending(current))) + goto out; + + if (vma->vm_file && vma->vm_file->f_mapping) + nr_i_mmap_locks++; + if (vma->anon_vma) + nr_anon_vma_locks++; + } + + err = -ENOMEM; + data = kmalloc(sizeof(struct mm_lock_data), GFP_KERNEL); + if (!data) + goto out; + + if (nr_i_mmap_locks) { + data->i_mmap_locks = vmalloc(nr_i_mmap_locks * + sizeof(spinlock_t)); + if (!data->i_mmap_locks) + goto out_kfree; + } else + data->i_mmap_locks = NULL; + + if (nr_anon_vma_locks) { + data->anon_vma_locks = vmalloc(nr_anon_vma_locks * + sizeof(spinlock_t)); + if (!data->anon_vma_locks) + goto out_vfree; + } else + data->anon_vma_locks = NULL; + + err = -EINTR; + i_mmap_lock_last = NULL; + nr_i_mmap_locks = 0; + for (;;) { + spinlock_t *i_mmap_lock = (spinlock_t *) -1UL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + cond_resched(); + if (unlikely(signal_pending(current))) + goto out_vfree_both; + + if (!vma->vm_file || !vma->vm_file->f_mapping) + continue; + if ((unsigned long) i_mmap_lock > + (unsigned long) + &vma->vm_file->f_mapping->i_mmap_lock && + (unsigned long) + &vma->vm_file->f_mapping->i_mmap_lock > + (unsigned long) i_mmap_lock_last) + i_mmap_lock = + &vma->vm_file->f_mapping->i_mmap_lock; + } + if (i_mmap_lock == (spinlock_t *) -1UL) + break; + i_mmap_lock_last = i_mmap_lock; + data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock; + } + data->nr_i_mmap_locks = nr_i_mmap_locks; + + anon_vma_lock_last = NULL; + nr_anon_vma_locks = 0; + for (;;) { + spinlock_t *anon_vma_lock = (spinlock_t *) -1UL; + for (vma = mm->mmap; vma; vma 
= vma->vm_next) { + cond_resched(); + if (unlikely(signal_pending(current))) + goto out_vfree_both; + + if (!vma->anon_vma) + continue; + if ((unsigned long) anon_vma_lock > + (unsigned long) &vma->anon_vma->lock && + (unsigned long) &vma->anon_vma->lock > + (unsigned long) anon_vma_lock_last) + anon_vma_lock = &vma->anon_vma->lock; + } + if (anon_vma_lock == (spinlock_t *) -1UL) + break; + anon_vma_lock_last = anon_vma_lock; + data->anon_vma_locks[nr_anon_vma_locks++] = anon_vma_lock; + } + data->nr_anon_vma_locks = nr_anon_vma_locks; + + for (i = 0; i < nr_i_mmap_locks; i++) + spin_lock(data->i_mmap_locks[i]); + for (i = 0; i < nr_anon_vma_locks; i++) + spin_lock(data->anon_vma_locks[i]); + + return data; + +out_vfree_both: + vfree(data->anon_vma_locks); +out_vfree: + vfree(data->i_mmap_locks); +out_kfree: + kfree(data); +out: + up_write(&mm->mmap_sem); + return ERR_PTR(err); +} + +void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data) +{ + unsigned long i; + + for (i = 0; i < data->nr_i_mmap_locks; i++) + spin_unlock(data->i_mmap_locks[i]); + for (i = 0; i < data->nr_anon_vma_locks; i++) + spin_unlock(data->anon_vma_locks[i]); + + up_write(&mm->mmap_sem); + + vfree(data->i_mmap_locks); + vfree(data->anon_vma_locks); + kfree(data); +} |
From: Rusty R. <ru...@ru...> - 2008-04-22 05:06:27
|
On Wednesday 09 April 2008 01:44:04 Andrea Arcangeli wrote: > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1050,6 +1050,15 @@ > unsigned long addr, unsigned long len, > unsigned long flags, struct page **pages); > > +struct mm_lock_data { > + spinlock_t **i_mmap_locks; > + spinlock_t **anon_vma_locks; > + unsigned long nr_i_mmap_locks; > + unsigned long nr_anon_vma_locks; > +}; > +extern struct mm_lock_data *mm_lock(struct mm_struct * mm); > +extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data); As far as I can tell you don't actually need to expose this struct at all? > + data->i_mmap_locks = vmalloc(nr_i_mmap_locks * > + sizeof(spinlock_t)); This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)' here. > + data->anon_vma_locks = vmalloc(nr_anon_vma_locks * > + sizeof(spinlock_t)); and here. > + err = -EINTR; > + i_mmap_lock_last = NULL; > + nr_i_mmap_locks = 0; > + for (;;) { > + spinlock_t *i_mmap_lock = (spinlock_t *) -1UL; > + for (vma = mm->mmap; vma; vma = vma->vm_next) { ... > + data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock; > + } > + data->nr_i_mmap_locks = nr_i_mmap_locks; How about you track your running counter in data->nr_i_mmap_locks, leave nr_i_mmap_locks alone, and BUG_ON(data->nr_i_mmap_locks != nr_i_mmap_locks)? Even nicer would be to wrap this in a "get_sorted_mmap_locks()" function. Similarly for anon_vma locks. Unfortunately, I just don't think we can fail locking like this. In your next patch unregistering a notifier can fail because of it: that not usable. I think it means you need to add a linked list element to the vma for the CONFIG_MMU_NOTIFIER case. Or track the max number of vmas for any mm, and keep a pool to handle mm_lock for this number (ie. if you can't enlarge the pool, fail the vma allocation). Both have their problems though... Rusty. |
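For illustration, here is a rough sketch of the get_sorted_mmap_locks() wrapper Rusty suggests, lifted from the selection loop in the mm_lock patch above; the helper name and the count-or--EINTR return convention are assumptions, not code from the patchset (and the array it fills must of course be sized with sizeof(spinlock_t *), as he points out):

/*
 * Sketch only: repeatedly walk the vma list, each pass picking the
 * lowest i_mmap_lock address that is still above the last one taken,
 * so the array ends up sorted by address.  Returns the number of
 * locks collected, or -EINTR if a signal is pending.
 */
static long get_sorted_mmap_locks(struct mm_struct *mm, spinlock_t **locks)
{
        struct vm_area_struct *vma;
        spinlock_t *last = NULL;
        long nr = 0;

        for (;;) {
                spinlock_t *cur = (spinlock_t *) -1UL;

                for (vma = mm->mmap; vma; vma = vma->vm_next) {
                        spinlock_t *l;

                        cond_resched();
                        if (unlikely(signal_pending(current)))
                                return -EINTR;
                        if (!vma->vm_file || !vma->vm_file->f_mapping)
                                continue;
                        l = &vma->vm_file->f_mapping->i_mmap_lock;
                        if ((unsigned long) cur > (unsigned long) l &&
                            (unsigned long) l > (unsigned long) last)
                                cur = l;
                }
                if (cur == (spinlock_t *) -1UL)
                        break;
                last = cur;
                locks[nr++] = cur;
        }
        return nr;
}

Each outer pass picks the smallest lock address above the previous one, so the locks are later acquired in ascending address order, the usual way to avoid ABBA deadlocks between tasks locking overlapping sets; duplicates are skipped because an address equal to the last one chosen never satisfies the comparison.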
From: Andrea A. <an...@qu...> - 2008-04-25 16:56:47
|
I somehow lost missed this email in my inbox, found it now because it was strangely still unread... Sorry for the late reply! On Tue, Apr 22, 2008 at 03:06:24PM +1000, Rusty Russell wrote: > On Wednesday 09 April 2008 01:44:04 Andrea Arcangeli wrote: > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -1050,6 +1050,15 @@ > > unsigned long addr, unsigned long len, > > unsigned long flags, struct page **pages); > > > > +struct mm_lock_data { > > + spinlock_t **i_mmap_locks; > > + spinlock_t **anon_vma_locks; > > + unsigned long nr_i_mmap_locks; > > + unsigned long nr_anon_vma_locks; > > +}; > > +extern struct mm_lock_data *mm_lock(struct mm_struct * mm); > > +extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data); > > As far as I can tell you don't actually need to expose this struct at all? Yes, it should be possible to only expose 'struct mm_lock_data;'. > > + data->i_mmap_locks = vmalloc(nr_i_mmap_locks * > > + sizeof(spinlock_t)); > > This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)' > here. > > > + data->anon_vma_locks = vmalloc(nr_anon_vma_locks * > > + sizeof(spinlock_t)); > > and here. Great catch! (it was temporarily wasting some ram which isn't nice at all) > > + err = -EINTR; > > + i_mmap_lock_last = NULL; > > + nr_i_mmap_locks = 0; > > + for (;;) { > > + spinlock_t *i_mmap_lock = (spinlock_t *) -1UL; > > + for (vma = mm->mmap; vma; vma = vma->vm_next) { > ... > > + data->i_mmap_locks[nr_i_mmap_locks++] = i_mmap_lock; > > + } > > + data->nr_i_mmap_locks = nr_i_mmap_locks; > > How about you track your running counter in data->nr_i_mmap_locks, leave > nr_i_mmap_locks alone, and BUG_ON(data->nr_i_mmap_locks != nr_i_mmap_locks)? > > Even nicer would be to wrap this in a "get_sorted_mmap_locks()" function. I'll try to clean this up further and I'll make a further update for review. > Unfortunately, I just don't think we can fail locking like this. In your next > patch unregistering a notifier can fail because of it: that not usable. Fortunately I figured out we don't really need mm_lock in unregister because it's ok to unregister in the middle of the range_begin/end critical section (that's definitely not ok for register that's why register needs mm_lock). And it's perfectly ok to fail in register(). Also it wasn't ok to unpin the module count in ->release as ->release needs to 'ret' to get back to the mmu notifier code. And without any unregister at all, the module can't be unloaded at all which is quite unacceptable... The logic is to prevent mmu_notifier_register to race with mmu_notifier_release because it takes the mm_users pin (implicit or explicit, and then mmput just after mmu_notifier_register returns). Then _register serializes against all the mmu notifier methods (except ->release) with srcu (->release can't run thanks to the mm_users pin). The mmu_notifier_mm->lock then serializes the modification on the list (register vs unregister) and it ensures one and only one between _unregister and _releases calls ->release before _unregister returns. All other methods runs freely with srcu. Having the guarante that ->release is called just before all pages are freed or inside _unregister, allows the module to zap and freeze its secondary mmu inside ->release with the race condition of exit() against mmu_notifier_unregister internally by the mmu notifier code and without dependency on exit_files/exit_mm ordering depending if the fd of the driver is open the filetables or in the vma only. 
The mmu_notifier_mm can be reset to 0 only after the last mmdrop. About the mm_count refcounting for _release and _unregister: no mmu notifier, not even mmu_notifier_unregister and _release, can cope with the mmu_notifier_mm list and srcu structures going away out of order. exit_mmap is safe as it holds an mm_count implicitly because mmdrop is run after exit_mmap returns. mmu_notifier_unregister is safe too as _register takes the mm_count pin. We can't let mmu_notifier_mm go away with mm_users because that would screw up the vma file descriptor closure that only happens inside exit_mmap (a pinned mm_users prevents exit_mmap from running, and it can only be taken temporarily until _register returns). |
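A rough sketch of the unregister side this describes (field names such as mmu_notifier_mm->lock and a per-mm srcu follow the wording above; this is not the #v14 code): whichever of _release and _unregister unhashes the notifier first is the one that calls ->release, and srcu lets concurrent notifier methods keep walking the list safely.

/*
 * Sketch only.  _release (called from exit_mmap) unhashes notifiers
 * with hlist_del_init, so hlist_unhashed() below tells us whether it
 * already invoked ->release for this notifier.
 */
void mmu_notifier_unregister_sketch(struct mmu_notifier *mn,
                                    struct mm_struct *mm)
{
        int do_release = 0;

        spin_lock(&mm->mmu_notifier_mm->lock);
        if (!hlist_unhashed(&mn->hlist)) {
                /* _release didn't get here first: we unhash and call it */
                hlist_del_rcu(&mn->hlist);
                do_release = 1;
        }
        spin_unlock(&mm->mmu_notifier_mm->lock);

        if (do_release && mn->ops->release)
                mn->ops->release(mn, mm);

        /*
         * Wait for srcu readers still walking the list before the
         * caller is allowed to free mn and unload the module.
         */
        synchronize_srcu(&mm->mmu_notifier_mm->srcu);
}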
From: Andrea A. <an...@qu...> - 2008-04-25 17:04:33
|
On Fri, Apr 25, 2008 at 06:56:39PM +0200, Andrea Arcangeli wrote: > > > + data->i_mmap_locks = vmalloc(nr_i_mmap_locks * > > > + sizeof(spinlock_t)); > > > > This is why non-typesafe allocators suck. You want 'sizeof(spinlock_t *)' > > here. > > > > > + data->anon_vma_locks = vmalloc(nr_anon_vma_locks * > > > + sizeof(spinlock_t)); > > > > and here. > > Great catch! (it was temporarily wasting some ram which isn't nice at all) When I went into the editor I found the above already fixed in #v14-pre3. And I can't move the structure into the .c file anymore without kmallocing it. Exposing that structure avoids the ERR_PTR/PTR_ERR on the retvals and one kmalloc, so I think it makes the code simpler in the end to keep it as it is now. I'd rather avoid further changes to the 1/N patch as long as they make no difference at runtime and would involve more than cut-and-pasting a structure from the .h to the .c file. |
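For context, the caller-side shape this implies in #v14 (the two-argument mm_lock() signature is an assumption; only the mm_unlock(mm, &data) call is visible in the cleanup patch later in the thread): with the structure exposed, the caller owns the mm_lock_data, so mm_lock() can return a plain int and no extra kmalloc or ERR_PTR/PTR_ERR juggling is needed.

/* Illustrative sketch only; mm_lock() filling a caller-owned struct is assumed. */
static int register_with_mm_lock(struct mmu_notifier *mn, struct mm_struct *mm)
{
        struct mm_lock_data data;       /* caller-owned, typically on the stack */
        int err;

        err = mm_lock(mm, &data);
        if (err)
                return err;             /* plain int, no ERR_PTR/PTR_ERR */

        spin_lock(&mm->mmu_notifier_mm->lock);
        hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
        spin_unlock(&mm->mmu_notifier_mm->lock);

        mm_unlock(mm, &data);
        return 0;
}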
From: Robin H. <ho...@sg...> - 2008-04-25 19:25:43
|
On Fri, Apr 25, 2008 at 06:56:40PM +0200, Andrea Arcangeli wrote: > Fortunately I figured out we don't really need mm_lock in unregister > because it's ok to unregister in the middle of the range_begin/end > critical section (that's definitely not ok for register that's why > register needs mm_lock). And it's perfectly ok to fail in register(). I think you still need mm_lock (unless I miss something). What happens when one callout is scanning the list in mmu_notifier_invalidate_range_start() and you unlink? That sets the list next pointer to LIST_POISON1, which is a really bad address for the processor to follow. Maybe I misunderstood your description. Thanks, Robin |
From: Andrea A. <an...@qu...> - 2008-04-26 00:57:23
|
On Fri, Apr 25, 2008 at 02:25:32PM -0500, Robin Holt wrote: > I think you still need mm_lock (unless I miss something). What happens > when one callout is scanning mmu_notifier_invalidate_range_start() and > you unlink. That list next pointer with LIST_POISON1 which is a really > bad address for the processor to track. Ok, the _release list_del_init can't race with that because it happens in exit_mmap when no other mmu notifier can trigger anymore. _unregister can run concurrently but it does list_del_rcu, which only overwrites the pprev pointer with LIST_POISON2. The mmu_notifier_invalidate_range_start won't crash on LIST_POISON1 thanks to srcu. Actually I did more changes than necessary, for example I noticed that mmu_notifier_register can use a plain hlist_add_head instead of hlist_add_head_rcu. _register can't race against _release thanks to the mm_users temporary or implicit pin. _register can't race against _unregister thanks to the mmu_notifier_mm->lock. And register can't race against all other mmu notifiers thanks to the mm_lock. At this time I've no other pending patches on top of v14-pre3 other than the below micro-optimizing cleanup. It'd be great to have confirmation that v14-pre3 passes the GRU/XPMEM regression tests, as my KVM testing already passed successfully on it. I'll forward v14-pre3 mmu-notifier-core plus the below to Andrew tomorrow, I'm trying to be optimistic here! ;) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -187,7 +187,7 @@ int mmu_notifier_register(struct mmu_not * current->mm or explicitly with get_task_mm() or similar). */ spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); + hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); out_unlock: mm_unlock(mm, &data); |
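For completeness, a sketch of the read side this relies on (a per-mm srcu and the field names are assumed from the snippets above, using the 2.6.25-era hlist_for_each_entry_rcu() signature): the walk runs inside an srcu read-side critical section, so a concurrent hlist_del_rcu() from _unregister only poisons ->pprev and the reader never follows a poisoned ->next.

/* Sketch only, not the #v14 implementation. */
void __mmu_notifier_invalidate_range_start_sketch(struct mm_struct *mm,
                                                  unsigned long start,
                                                  unsigned long end)
{
        struct mmu_notifier *mn;
        struct hlist_node *n;
        int idx;

        idx = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
                if (mn->ops->invalidate_range_start)
                        mn->ops->invalidate_range_start(mn, mm, start, end);
        srcu_read_unlock(&mm->mmu_notifier_mm->srcu, idx);
}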
From: Robin H. <ho...@sg...> - 2008-04-08 16:26:36
|
This one does not build on ia64. I get the following: [holt@attica mmu_v12_xpmem_v003_v1]$ make compressed CHK include/linux/version.h CHK include/linux/utsrelease.h CALL scripts/checksyscalls.sh CHK include/linux/compile.h CC mm/mmu_notifier.o In file included from include/linux/mmu_notifier.h:6, from mm/mmu_notifier.c:12: include/linux/mm_types.h:200: error: expected specifier-qualifier-list before ‘cpumask_t’ In file included from mm/mmu_notifier.c:12: include/linux/mmu_notifier.h: In function ‘mm_has_notifiers’: include/linux/mmu_notifier.h:62: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ include/linux/mmu_notifier.h: In function ‘mmu_notifier_mm_init’: include/linux/mmu_notifier.h:117: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ In file included from include/asm/pgtable.h:155, from include/linux/mm.h:39, from mm/mmu_notifier.c:14: include/asm/mmu_context.h: In function ‘get_mmu_context’: include/asm/mmu_context.h:81: error: ‘struct mm_struct’ has no member named ‘context’ include/asm/mmu_context.h:88: error: ‘struct mm_struct’ has no member named ‘context’ include/asm/mmu_context.h:90: error: ‘struct mm_struct’ has no member named ‘cpu_vm_mask’ include/asm/mmu_context.h:99: error: ‘struct mm_struct’ has no member named ‘context’ include/asm/mmu_context.h: In function ‘init_new_context’: include/asm/mmu_context.h:120: error: ‘struct mm_struct’ has no member named ‘context’ include/asm/mmu_context.h: In function ‘activate_context’: include/asm/mmu_context.h:173: error: ‘struct mm_struct’ has no member named ‘cpu_vm_mask’ include/asm/mmu_context.h:174: error: ‘struct mm_struct’ has no member named ‘cpu_vm_mask’ include/asm/mmu_context.h:180: error: ‘struct mm_struct’ has no member named ‘context’ mm/mmu_notifier.c: In function ‘__mmu_notifier_release’: mm/mmu_notifier.c:25: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ mm/mmu_notifier.c:26: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ mm/mmu_notifier.c: In function ‘__mmu_notifier_clear_flush_young’: mm/mmu_notifier.c:47: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ mm/mmu_notifier.c: In function ‘__mmu_notifier_invalidate_page’: mm/mmu_notifier.c:61: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ mm/mmu_notifier.c: In function ‘__mmu_notifier_invalidate_range_start’: mm/mmu_notifier.c:73: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ mm/mmu_notifier.c: In function ‘__mmu_notifier_invalidate_range_end’: mm/mmu_notifier.c:85: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ mm/mmu_notifier.c: In function ‘mmu_notifier_register’: mm/mmu_notifier.c:102: error: ‘struct mm_struct’ has no member named ‘mmu_notifier_list’ make[1]: *** [mm/mmu_notifier.o] Error 1 make: *** [mm] Error 2 |
From: Andrea A. <an...@qu...> - 2008-04-08 17:33:50
|
On Tue, Apr 08, 2008 at 11:26:19AM -0500, Robin Holt wrote: > This one does not build on ia64. I get the following: I think it's a common code compilation bug not related to my patch. Can you test this? diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -10,6 +10,7 @@ #include <linux/rbtree.h> #include <linux/rwsem.h> #include <linux/completion.h> +#include <linux/cpumask.h> #include <asm/page.h> #include <asm/mmu.h> |
From: Avi K. <av...@qu...> - 2008-04-08 21:47:47
|
Andrea Arcangeli wrote: > Note that mmu_notifier_unregister may also fail with -EINTR if there are > signal pending or the system runs out of vmalloc space or physical memory, > only exit_mmap guarantees that any kernel module can be unloaded in presence > of an oom condition. > > That's unusual. What happens to the notifier? Suppose I destroy a vm without exiting the process, what happens if it fires? -- Any sufficiently difficult bug is indistinguishable from a feature. |
From: Andrea A. <an...@qu...> - 2008-04-08 22:06:34
|
On Wed, Apr 09, 2008 at 12:46:49AM +0300, Avi Kivity wrote: > That's unusual. What happens to the notifier? Suppose I destroy a vm Yes, it's quite unusual. > without exiting the process, what happens if it fires? The mmu notifier ops should stop doing work (if no memslots are left they become noops), or the ops can be replaced atomically with null pointers. The important thing is that the module can't go away until ->release is invoked or until mmu_notifier_unregister has returned 0. Previously there was no mmu_notifier_unregister, so adding it can't be a regression compared to #v11, even if it can fail and you may have to retry later after returning to userland. Retrying from userland is always safe in oom-kill terms; only looping inside the kernel isn't safe, as do_exit would get no chance to run. |
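As an illustration of the "ops become noops" option, a hypothetical KVM-side handler (not part of this patchset; kvm_unmap_hva() is an assumed name for the shadow-pte zapping helper, and shadow-MMU locking is omitted):

/*
 * Hypothetical sketch: the helper walks the memslots looking for the
 * faulting hva, so once the VM has been destroyed and no memslots
 * remain the callback degenerates to a noop, even though the notifier
 * is still registered until ->release or mmu_notifier_unregister().
 */
static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
                                             struct mm_struct *mm,
                                             unsigned long address)
{
        struct kvm *kvm = container_of(mn, struct kvm, mmu_notifier);

        kvm_unmap_hva(kvm, address);
}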
From: Robin H. <ho...@sg...> - 2008-04-09 13:17:19
|
I applied this patch set with the xpmem version I am working up for submission and the basic level-1 and level-2 tests passed. The full mpi regression test still tends to hang, but that appears to be a common problem failure affecting either emm or mmu notifiers and therefore, I am certain is a problem in my code. Please note this is not an endorsement of one method over the other, merely that under conditions where we would expect xpmem to pass the regression tests, it does pass those tests. Thanks, Robin On Tue, Apr 08, 2008 at 05:44:03PM +0200, Andrea Arcangeli wrote: > The difference with #v11 is a different implementation of mm_lock that > guarantees handling signals in O(N). It's also more lowlatency friendly. > > Note that mmu_notifier_unregister may also fail with -EINTR if there are > signal pending or the system runs out of vmalloc space or physical memory, > only exit_mmap guarantees that any kernel module can be unloaded in presence > of an oom condition. > > Either #v11 or the first three #v12 1,2,3 patches are suitable for inclusion > in -mm, pick what you prefer looking at the mmu_notifier_register retval and > mm_lock retval difference, I implemented and slighty tested both. GRU and KVM > only needs 1,2,3, XPMEM needs the rest of the patchset too (4, ...) but all > patches from 4 to the end can be deffered to a second merge window. |
From: Andrea A. <an...@qu...> - 2008-04-09 14:45:53
|
On Wed, Apr 09, 2008 at 08:17:09AM -0500, Robin Holt wrote: > I applied this patch set with the xpmem version I am working up for > submission and the basic level-1 and level-2 tests passed. The full mpi > regression test still tends to hang, but that appears to be a common > problem failure affecting either emm or mmu notifiers and therefore, I > am certain is a problem in my code. > > Please note this is not an endorsement of one method over the other, > merely that under conditions where we would expect xpmem to pass the > regression tests, it does pass those tests. Thanks a lot for testing! #v12 works great with KVM too. (I'm now in the process of chagning the KVM patch to drop the page pinning) BTW, how did you implement invalidate_page? As this? invalidate_page() { invalidate_range_begin() invalidate_range_end() } If yes, I prefer to remind you that normally invalidate_range_begin is always called before zapping the pte. In the invalidate_page case instead, invalidate_range_begin is called _after_ the pte has been zapped already. Now there's no problem if the pte is established and the spte isn't established. But it must never happen that the spte is established and the pte isn't established (with page-pinning that means unswappable memlock leak, without page-pinning it would mean memory corruption). So the range_begin must serialize against the secondary mmu page fault so that it can't establish the spte on a pte that was zapped by the rmap code after get_user_pages/follow_page returned. I think your range_begin already does that so you should be ok but I wanted to remind about this slight difference in implementing invalidate_page as I suggested above in previous email just to be sure ;). This is the race you must guard against in invalidate_page: CPU0 CPU1 try_to_unmap on page secondary mmu page fault get_user_pages()/follow_page found a page ptep_clear_flush invalidate_page() invalidate_range_begin() invalidate_range_end() return from invalidate_page establish spte on page return from secodnary mmu page fault If your range_begin already serializes in a hard way against the secondary mmu page fault, my previously "trivial" suggested implementation for invalidate_page should work just fine and this this saves 1 branch for each try_to_unmap_one if compared to the emm implementation. The branch check is inlined and it checks against the mmu_notifier_head that is the hot cacheline, no new cachline is checked just one branch is saved and so it worth it IMHO even if it doesn't provide any other advantage if you implement it the way above. |
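Spelled out, the "trivial" invalidate_page implementation being asked about looks roughly like this on the driver side (the xpmem_* names are placeholders, not Robin's actual code); it is only safe if ->invalidate_range_begin itself serializes against the secondary mmu page fault, precisely because here it runs after the pte has already been zapped:

/* Placeholder names; sketch of the begin/end bracketing described above. */
static void xpmem_mmu_invalidate_page(struct mmu_notifier *mn,
                                      struct mm_struct *mm,
                                      unsigned long address)
{
        xpmem_mmu_invalidate_range_begin(mn, mm, address, address + PAGE_SIZE);
        xpmem_mmu_invalidate_range_end(mn, mm, address, address + PAGE_SIZE);
}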
From: Robin H. <ho...@sg...> - 2008-04-09 18:55:05
|
On Wed, Apr 09, 2008 at 04:44:01PM +0200, Andrea Arcangeli wrote: > BTW, how did you implement invalidate_page? As this? > > invalidate_page() { > invalidate_range_begin() > invalidate_range_end() > } Essentially, I did the work of each step without releasing and reacquiring locks. > If yes, I prefer to remind you that normally invalidate_range_begin is > always called before zapping the pte. In the invalidate_page case > instead, invalidate_range_begin is called _after_ the pte has been > zapped already. > > Now there's no problem if the pte is established and the spte isn't > established. But it must never happen that the spte is established and > the pte isn't established (with page-pinning that means unswappable > memlock leak, without page-pinning it would mean memory corruption). I am not sure I follow what you are saying. Here is a very terse breakdown of how PFNs flow through xpmem's structures. We have a PFN table associated with our structure describing a grant. We use get_user_pages() to acquire information for that table and we fill the table in under a mutex. Remote hosts (on the same numa network so they have direct access to the users memory) have a PROXY version of that structure. It is filled out in a similar fashion to the local table. PTEs are created for the other processes while holding the mutex for this table (either local or remote). During the process of faulting, we have a simple linked list of ongoing faults that is maintained whenever the mutex is going to be released. Our version of a zap_page_range is called recall_PFNs. The recall process grabs the mutex, scans the faulters list for any that cover the range and mark them as needing a retry. It then calls zap_page_range for any processes that have attached the granted memory to clear out their page tables. Finally, we release the mutex and proceed. The locking is more complex than this, but that is the essential idea. What that means for mmu_notifiers is we have a single reference on the page for all the remote processes using it. When the callout to invalidate_page() is made, we will still have processes with that PTE in their page tables and potentially TLB entries. When we return from the invalidate_page() callout, we will have removed all those page table entries, we will have no in-progress page table or tlb insertions that will complete, and we will have released all our references to the page. Does that meet your expectations? Thanks, Robin > > So the range_begin must serialize against the secondary mmu page fault > so that it can't establish the spte on a pte that was zapped by the > rmap code after get_user_pages/follow_page returned. I think your > range_begin already does that so you should be ok but I wanted to > remind about this slight difference in implementing invalidate_page as > I suggested above in previous email just to be sure ;). > > This is the race you must guard against in invalidate_page: > > > CPU0 CPU1 > try_to_unmap on page > secondary mmu page fault > get_user_pages()/follow_page found a page > ptep_clear_flush > invalidate_page() > invalidate_range_begin() > invalidate_range_end() > return from invalidate_page > establish spte on page > return from secodnary mmu page fault > > If your range_begin already serializes in a hard way against the > secondary mmu page fault, my previously "trivial" suggested > implementation for invalidate_page should work just fine and this this > saves 1 branch for each try_to_unmap_one if compared to the emm > implementation. 
The branch check is inlined and it checks against the > mmu_notifier_head that is the hot cacheline, no new cachline is > checked just one branch is saved and so it worth it IMHO even if it > doesn't provide any other advantage if you implement it the way above. |