From: Andrea A. <an...@qu...> - 2008-05-07 15:23:29
|
Hello, this is the last update of the mmu notifier patch. Jack asked for a __mmu_notifier_register that can be called with mmap_sem held in write mode. Here is an update with that change, plus allowing ->release not to be implemented (a two-line change to mmu_notifier.c). The entire diff between v15 and v16 of mmu-notifier-core was posted in a separate email. |
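A minimal sketch of how a driver might hook into the interface being discussed, assuming the mmu_notifier_register()/__mmu_notifier_register() split and the ops names used in this series; the my_* names are made up and exact signatures may differ from the posted patch:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* tear down any secondary mappings; optional as of this update */
}

static void my_invalidate_page(struct mmu_notifier *mn,
			       struct mm_struct *mm, unsigned long address)
{
	/* drop the secondary TLB / page-table entry for this address */
}

static const struct mmu_notifier_ops my_ops = {
	.release	 = my_release,		/* may now be left NULL */
	.invalidate_page = my_invalidate_page,
};

static struct mmu_notifier my_mn = { .ops = &my_ops };

static int my_attach(struct mm_struct *mm)
{
	int err;

	/*
	 * mmu_notifier_register() takes mmap_sem in write mode itself;
	 * __mmu_notifier_register() is for callers that already hold it,
	 * which is the variant Jack asked for.
	 */
	down_write(&mm->mmap_sem);
	err = __mmu_notifier_register(&my_mn, mm);
	up_write(&mm->mmap_sem);
	return err;
}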
From: Andrea A. <an...@qu...> - 2008-05-07 14:37:47
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115129 -7200 # Node ID d60d200565abde6a8ed45271e53cde9c5c75b426 # Parent c5badbefeee07518d9d1acca13e94c981420317c invalidate_page outside PT lock Moves all mmu notifier methods outside the PT lock (first and not last step to make them sleep capable). Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -210,35 +210,6 @@ static inline void mmu_notifier_mm_destr __mmu_notifier_mm_destroy(mm); } -/* - * These two macros will sometime replace ptep_clear_flush. - * ptep_clear_flush is impleemnted as macro itself, so this also is - * implemented as a macro until ptep_clear_flush will converted to an - * inline function, to diminish the risk of compilation failure. The - * invalidate_page method over time can be moved outside the PT lock - * and these two macros can be later removed. - */ -#define ptep_clear_flush_notify(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __pte = ptep_clear_flush(___vma, ___address, __ptep); \ - mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \ - __pte; \ -}) - -#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address); \ - __young; \ -}) - #else /* CONFIG_MMU_NOTIFIER */ static inline void mmu_notifier_release(struct mm_struct *mm) @@ -274,9 +245,6 @@ static inline void mmu_notifier_mm_destr { } -#define ptep_clear_flush_young_notify ptep_clear_flush_young -#define ptep_clear_flush_notify ptep_clear_flush - #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -188,11 +188,13 @@ __xip_unmap (struct address_space * mapp if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); + /* must invalidate_page _before_ freeing the page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(page); } } diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -1714,9 +1714,10 @@ static int do_wp_page(struct mm_struct * */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - page_cache_release(old_page); + new_page = NULL; if (!pte_same(*page_table, orig_pte)) goto unlock; + page_cache_release(old_page); page_mkwrite = 1; } @@ -1732,6 +1733,7 @@ static int do_wp_page(struct mm_struct * if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; + old_page = new_page = NULL; goto unlock; } @@ -1776,7 +1778,7 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. 
*/ - ptep_clear_flush_notify(vma, address, page_table); + ptep_clear_flush(vma, address, page_table); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); @@ -1788,12 +1790,18 @@ gotten: } else mem_cgroup_uncharge_page(new_page); - if (new_page) +unlock: + pte_unmap_unlock(page_table, ptl); + + if (new_page) { + if (new_page == old_page) + /* cow happened, notify before releasing old_page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(new_page); + } if (old_page) page_cache_release(old_page); -unlock: - pte_unmap_unlock(page_table, ptl); + if (dirty_page) { if (vma->vm_file) file_update_time(vma->vm_file); diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c @@ -275,7 +275,7 @@ static int page_referenced_one(struct pa unsigned long address; pte_t *pte; spinlock_t *ptl; - int referenced = 0; + int referenced = 0, clear_flush_young = 0; address = vma_address(page, vma); if (address == -EFAULT) @@ -288,8 +288,11 @@ static int page_referenced_one(struct pa if (vma->vm_flags & VM_LOCKED) { referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young_notify(vma, address, pte)) - referenced++; + } else { + clear_flush_young = 1; + if (ptep_clear_flush_young(vma, address, pte)) + referenced++; + } /* Pretend the page is referenced if the task has the swap token and is in the middle of a page fault. */ @@ -299,6 +302,10 @@ static int page_referenced_one(struct pa (*mapcount)--; pte_unmap_unlock(pte, ptl); + + if (clear_flush_young) + referenced += mmu_notifier_clear_flush_young(mm, address); + out: return referenced; } @@ -458,7 +465,7 @@ static int page_mkclean_one(struct page pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush_notify(vma, address, pte); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -466,6 +473,10 @@ static int page_mkclean_one(struct page } pte_unmap_unlock(pte, ptl); + + if (ret) + mmu_notifier_invalidate_page(mm, address); + out: return ret; } @@ -717,15 +728,14 @@ static int try_to_unmap_one(struct page * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young_notify(vma, address, pte)))) { + if (!migration && (vma->vm_flags & VM_LOCKED)) { ret = SWAP_FAIL; goto out_unmap; } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. 
*/ if (pte_dirty(pteval)) @@ -780,6 +790,8 @@ static int try_to_unmap_one(struct page out_unmap: pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL) + mmu_notifier_invalidate_page(mm, address); out: return ret; } @@ -818,7 +830,7 @@ static void try_to_unmap_cluster(unsigne spinlock_t *ptl; struct page *page; unsigned long address; - unsigned long end; + unsigned long start, end; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -839,6 +851,8 @@ static void try_to_unmap_cluster(unsigne if (!pmd_present(*pmd)) return; + start = address; + mmu_notifier_invalidate_range_start(mm, start, end); pte = pte_offset_map_lock(mm, pmd, address, &ptl); /* Update high watermark before we lower rss */ @@ -850,12 +864,12 @@ static void try_to_unmap_cluster(unsigne page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); - if (ptep_clear_flush_young_notify(vma, address, pte)) + if (ptep_clear_flush_young(vma, address, pte)) continue; /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) @@ -871,6 +885,7 @@ static void try_to_unmap_cluster(unsigne (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + mmu_notifier_invalidate_range_end(mm, start, end); } static int try_to_unmap_anon(struct page *page, int migration) |
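The conversion above follows one pattern at every call site: clear and flush the primary pte under the page-table lock, drop the lock, and only then call the (eventually sleep-capable) notifier, always before the page refcount is released. A condensed sketch of that shape follows; example_unmap_one() is a made-up name, and the real call sites are __xip_unmap(), do_wp_page() and the rmap walkers:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

static void example_unmap_one(struct mm_struct *mm,
			      struct vm_area_struct *vma,
			      unsigned long address, pte_t *pte,
			      spinlock_t *ptl, struct page *page)
{
	pte_t pteval;

	flush_cache_page(vma, address, pte_pfn(*pte));
	pteval = ptep_clear_flush(vma, address, pte);	/* no _notify variant */
	if (pte_dirty(pteval))
		set_page_dirty(page);
	page_remove_rmap(page, vma);
	pte_unmap_unlock(pte, ptl);

	/* secondary MMUs must be shot down before the page can be freed */
	mmu_notifier_invalidate_page(mm, address);
	page_cache_release(page);
}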
From: Rik v. R. <ri...@re...> - 2008-05-07 17:40:44
|
On Wed, 07 May 2008 16:35:53 +0200 Andrea Arcangeli <an...@qu...> wrote: > # HG changeset patch > # User Andrea Arcangeli <an...@qu...> > # Date 1210115129 -7200 > # Node ID d60d200565abde6a8ed45271e53cde9c5c75b426 > # Parent c5badbefeee07518d9d1acca13e94c981420317c > invalidate_page outside PT lock > > Moves all mmu notifier methods outside the PT lock (first and not last > step to make them sleep capable). This patch appears to undo some of the changes made by patch 01/11. Would it be an idea to merge them into one, so the first patch introduces the right conventions directly? -- All rights reversed. |
From: Andrea A. <an...@qu...> - 2008-05-07 17:57:15
|
On Wed, May 07, 2008 at 01:39:43PM -0400, Rik van Riel wrote: > Would it be an idea to merge them into one, so the first patch > introduces the right conventions directly? The only reason this isn't merged into one is that it requires non-obvious (though not difficult) changes to the core VM code. I wanted to keep an obviously safe approach for 2.6.26. The other conventions are only needed by XPMEM, and XPMEM can't work without all the other patches anyway. |
From: Andrea A. <an...@qu...> - 2008-05-07 14:37:47
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115130 -7200 # Node ID 34f6a4bf67ce66714ba2d5c13a5fed241d34fb09 # Parent d60d200565abde6a8ed45271e53cde9c5c75b426 free-pgtables Move the tlb flushing into free_pgtables. The conversion of the locks taken for reverse map scanning would require taking sleeping locks in free_pgtables() and we cannot sleep while gathering pages for a tlb flush. Move the tlb_gather/tlb_finish call to free_pgtables() to be done for each vma. This may add a number of tlb flushes depending on the number of vmas that cannot be coalesced into one. The first pointer argument to free_pgtables() can then be dropped. Signed-off-by: Christoph Lameter <cla...@sg...> Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -772,8 +772,8 @@ int walk_page_range(const struct mm_stru void *private); void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); +void free_pgtables(struct vm_area_struct *start_vma, unsigned long floor, + unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -272,9 +272,11 @@ void free_pgd_range(struct mmu_gather ** } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, - unsigned long floor, unsigned long ceiling) +void free_pgtables(struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling) { + struct mmu_gather *tlb; + while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; @@ -286,7 +288,8 @@ void free_pgtables(struct mmu_gather **t unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { - hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + tlb = tlb_gather_mmu(vma->vm_mm, 0); + hugetlb_free_pgd_range(&tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { /* @@ -299,9 +302,11 @@ void free_pgtables(struct mmu_gather **t anon_vma_unlink(vma); unlink_file_vma(vma); } - free_pgd_range(tlb, addr, vma->vm_end, + tlb = tlb_gather_mmu(vma->vm_mm, 0); + free_pgd_range(&tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } + tlb_finish_mmu(tlb, addr, vma->vm_end); vma = next; } } diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1759,9 +1759,9 @@ static void unmap_region(struct mm_struc update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + tlb_finish_mmu(tlb, start, end); + free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); - tlb_finish_mmu(tlb, start, end); } /* @@ -2060,8 +2060,8 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); + free_pgtables(vma, FIRST_USER_ADDRESS, 0); /* * Walk the list again, actually closing and freeing it, |
From: Rik v. R. <ri...@re...> - 2008-05-07 18:12:50
|
On Wed, 07 May 2008 16:35:54 +0200 Andrea Arcangeli <an...@qu...> wrote: > Signed-off-by: Christoph Lameter <cla...@sg...> > Signed-off-by: Andrea Arcangeli <an...@qu...> Acked-by: Rik van Riel <ri...@re...> -- All rights reversed. |
From: Andrea A. <an...@qu...> - 2008-05-07 14:37:48
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115127 -7200 # Node ID c5badbefeee07518d9d1acca13e94c981420317c # Parent e20917dcc8284b6a07cfcced13dda4cbca850a9c get_task_mm get_task_mm should not succeed if mmput() is running and has reduced the mm_users count to zero. This can occur if a processor follows a tasks pointer to an mm struct because that pointer is only cleared after the mmput(). If get_task_mm() succeeds after mmput() reduced the mm_users to zero then we have the lovely situation that one portion of the kernel is doing all the teardown work for an mm while another portion is happily using it. Signed-off-by: Christoph Lameter <cla...@sg...> Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/kernel/fork.c b/kernel/fork.c --- a/kernel/fork.c +++ b/kernel/fork.c @@ -465,7 +465,8 @@ struct mm_struct *get_task_mm(struct tas if (task->flags & PF_BORROWED_MM) mm = NULL; else - atomic_inc(&mm->mm_users); + if (!atomic_inc_not_zero(&mm->mm_users)) + mm = NULL; } task_unlock(task); return mm; |
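The fix relies on atomic_inc_not_zero() refusing to resurrect a refcount that has already dropped to zero, so a racing lookup simply fails instead of handing out a reference to an mm whose teardown is in progress. The same pattern in isolation, with made-up foo_* names:

#include <linux/slab.h>
#include <asm/atomic.h>		/* <linux/atomic.h> on later kernels */

struct foo {
	atomic_t refcount;
	/* ... */
};

static void foo_destroy(struct foo *f)
{
	kfree(f);		/* hypothetical teardown */
}

static struct foo *foo_get(struct foo *f)
{
	/*
	 * atomic_inc() would happily bump 0 -> 1 and hand out a reference
	 * to an object that is already being torn down; atomic_inc_not_zero()
	 * fails instead, so the lookup fails too.
	 */
	if (!f || !atomic_inc_not_zero(&f->refcount))
		return NULL;
	return f;
}

static void foo_put(struct foo *f)
{
	if (atomic_dec_and_test(&f->refcount))
		foo_destroy(f);
}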
From: Andrea A. <an...@qu...> - 2008-05-07 14:37:52
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115797 -7200 # Node ID 5b2eb7d28a4517daf91b08b4dcfbb58fd2b42d0b # Parent 94eaa1515369e8ef183e2457f6f25a7f36473d70 export zap_page_range for XPMEM XPMEM would have used sys_madvise() except that madvise_dontneed() returns an -EINVAL if VM_PFNMAP is set, which is always true for the pages XPMEM imports from other partitions and is also true for uncached pages allocated locally via the mspec allocator. XPMEM needs zap_page_range() functionality for these types of pages as well as 'normal' pages. Signed-off-by: Dean Nelson <dc...@sg...> Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -954,6 +954,7 @@ unsigned long zap_page_range(struct vm_a return unmap_vmas(vma, address, end, &nr_accounted, details); } +EXPORT_SYMBOL_GPL(zap_page_range); /* * Do a quick page-table lookup for a single page. |
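A sketch of how an out-of-tree module such as XPMEM might use the newly exported symbol on a VM_PFNMAP region; the helper name is made up and the four-argument zap_page_range() signature shown is the one in the tree this patch is against:

#include <linux/mm.h>

static void my_zap_user_range(struct mm_struct *mm,
			      unsigned long start, size_t len)
{
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (vma && vma->vm_start <= start)
		/* works even with VM_PFNMAP set, unlike madvise_dontneed() */
		zap_page_range(vma, start, len, NULL);
	up_read(&mm->mmap_sem);
}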
From: Andrea A. <an...@qu...> - 2008-05-07 14:37:53
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115798 -7200 # Node ID eb924315351f6b056428e35c983ad28040420fea # Parent 5b2eb7d28a4517daf91b08b4dcfbb58fd2b42d0b mmap sems This patch adds a lock ordering rule to avoid a potential deadlock when multiple mmap_sems need to be locked. Signed-off-by: Dean Nelson <dc...@sg...> Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c +++ b/mm/filemap.c @@ -79,6 +79,9 @@ generic_file_direct_IO(int rw, struct ki * * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) + * + * When taking multiple mmap_sems, one should lock the lowest-addressed + * one first proceeding on up to the highest-addressed one. * * ->i_mutex * ->i_alloc_sem (various) |
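The rule being documented, applied to two distinct mm_structs that must be held at once; the helper names are made up, and down_write_nested() is only there to keep lockdep quiet about the second same-class acquisition:

#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/rwsem.h>

static void lock_two_mms(struct mm_struct *mm1, struct mm_struct *mm2)
{
	struct mm_struct *lo = mm1, *hi = mm2;

	if (mm1 == mm2) {
		down_write(&mm1->mmap_sem);
		return;
	}
	if ((unsigned long)&hi->mmap_sem < (unsigned long)&lo->mmap_sem) {
		lo = mm2;
		hi = mm1;
	}
	down_write(&lo->mmap_sem);	/* lowest-addressed mmap_sem first... */
	down_write_nested(&hi->mmap_sem, SINGLE_DEPTH_NESTING);	/* ...then up */
}

static void unlock_two_mms(struct mm_struct *mm1, struct mm_struct *mm2)
{
	up_write(&mm1->mmap_sem);
	if (mm2 != mm1)
		up_write(&mm2->mmap_sem);
}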
From: Andrea A. <an...@qu...> - 2008-05-07 14:37:58
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115508 -7200 # Node ID 94eaa1515369e8ef183e2457f6f25a7f36473d70 # Parent 6b384bb988786aa78ef07440180e4b2948c4c6a2 mm_lock-rwsem Convert mm_lock to use semaphores after i_mmap_lock and anon_vma_lock conversion. Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1084,10 +1084,10 @@ extern int install_special_mapping(struc unsigned long flags, struct page **pages); struct mm_lock_data { - spinlock_t **i_mmap_locks; - spinlock_t **anon_vma_locks; - size_t nr_i_mmap_locks; - size_t nr_anon_vma_locks; + struct rw_semaphore **i_mmap_sems; + struct rw_semaphore **anon_vma_sems; + size_t nr_i_mmap_sems; + size_t nr_anon_vma_sems; }; extern int mm_lock(struct mm_struct *mm, struct mm_lock_data *data); extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data); diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2255,8 +2255,8 @@ int install_special_mapping(struct mm_st static int mm_lock_cmp(const void *a, const void *b) { - unsigned long _a = (unsigned long)*(spinlock_t **)a; - unsigned long _b = (unsigned long)*(spinlock_t **)b; + unsigned long _a = (unsigned long)*(struct rw_semaphore **)a; + unsigned long _b = (unsigned long)*(struct rw_semaphore **)b; cond_resched(); if (_a < _b) @@ -2266,7 +2266,7 @@ static int mm_lock_cmp(const void *a, co return 0; } -static unsigned long mm_lock_sort(struct mm_struct *mm, spinlock_t **locks, +static unsigned long mm_lock_sort(struct mm_struct *mm, struct rw_semaphore **sems, int anon) { struct vm_area_struct *vma; @@ -2275,59 +2275,59 @@ static unsigned long mm_lock_sort(struct for (vma = mm->mmap; vma; vma = vma->vm_next) { if (anon) { if (vma->anon_vma) - locks[i++] = &vma->anon_vma->lock; + sems[i++] = &vma->anon_vma->sem; } else { if (vma->vm_file && vma->vm_file->f_mapping) - locks[i++] = &vma->vm_file->f_mapping->i_mmap_lock; + sems[i++] = &vma->vm_file->f_mapping->i_mmap_sem; } } if (!i) goto out; - sort(locks, i, sizeof(spinlock_t *), mm_lock_cmp, NULL); + sort(sems, i, sizeof(struct rw_semaphore *), mm_lock_cmp, NULL); out: return i; } static inline unsigned long mm_lock_sort_anon_vma(struct mm_struct *mm, - spinlock_t **locks) + struct rw_semaphore **sems) { - return mm_lock_sort(mm, locks, 1); + return mm_lock_sort(mm, sems, 1); } static inline unsigned long mm_lock_sort_i_mmap(struct mm_struct *mm, - spinlock_t **locks) + struct rw_semaphore **sems) { - return mm_lock_sort(mm, locks, 0); + return mm_lock_sort(mm, sems, 0); } -static void mm_lock_unlock(spinlock_t **locks, size_t nr, int lock) +static void mm_lock_unlock(struct rw_semaphore **sems, size_t nr, int lock) { - spinlock_t *last = NULL; + struct rw_semaphore *last = NULL; size_t i; for (i = 0; i < nr; i++) /* Multiple vmas may use the same lock. 
*/ - if (locks[i] != last) { - BUG_ON((unsigned long) last > (unsigned long) locks[i]); - last = locks[i]; + if (sems[i] != last) { + BUG_ON((unsigned long) last > (unsigned long) sems[i]); + last = sems[i]; if (lock) - spin_lock(last); + down_write(last); else - spin_unlock(last); + up_write(last); } } -static inline void __mm_lock(spinlock_t **locks, size_t nr) +static inline void __mm_lock(struct rw_semaphore **sems, size_t nr) { - mm_lock_unlock(locks, nr, 1); + mm_lock_unlock(sems, nr, 1); } -static inline void __mm_unlock(spinlock_t **locks, size_t nr) +static inline void __mm_unlock(struct rw_semaphore **sems, size_t nr) { - mm_lock_unlock(locks, nr, 0); + mm_lock_unlock(sems, nr, 0); } /* @@ -2358,10 +2358,10 @@ static inline void __mm_unlock(spinlock_ * of vmas is defined in /proc/sys/vm/max_map_count. * * mm_lock() can fail if memory allocation fails. The worst case - * vmalloc allocation required is 2*max_map_count*sizeof(spinlock_t *), - * so around 1Mbyte, but in practice it'll be much less because - * normally there won't be max_map_count vmas allocated in the task - * that runs mm_lock(). + * vmalloc allocation required is 2*max_map_count*sizeof(struct + * rw_semaphore *), so around 1Mbyte, but in practice it'll be much + * less because normally there won't be max_map_count vmas allocated + * in the task that runs mm_lock(). * * The vmalloc memory allocated by mm_lock is stored in the * mm_lock_data structure that must be allocated by the caller and it @@ -2375,16 +2375,16 @@ static inline void __mm_unlock(spinlock_ */ int mm_lock(struct mm_struct *mm, struct mm_lock_data *data) { - spinlock_t **anon_vma_locks, **i_mmap_locks; + struct rw_semaphore **anon_vma_sems, **i_mmap_sems; if (mm->map_count) { - anon_vma_locks = vmalloc(sizeof(spinlock_t *) * mm->map_count); - if (unlikely(!anon_vma_locks)) + anon_vma_sems = vmalloc(sizeof(struct rw_semaphore *) * mm->map_count); + if (unlikely(!anon_vma_sems)) return -ENOMEM; - i_mmap_locks = vmalloc(sizeof(spinlock_t *) * mm->map_count); - if (unlikely(!i_mmap_locks)) { - vfree(anon_vma_locks); + i_mmap_sems = vmalloc(sizeof(struct rw_semaphore *) * mm->map_count); + if (unlikely(!i_mmap_sems)) { + vfree(anon_vma_sems); return -ENOMEM; } @@ -2392,31 +2392,31 @@ int mm_lock(struct mm_struct *mm, struct * When mm_lock_sort_anon_vma/i_mmap returns zero it * means there's no lock to take and so we can free * the array here without waiting mm_unlock. mm_unlock - * will do nothing if nr_i_mmap/anon_vma_locks is + * will do nothing if nr_i_mmap/anon_vma_sems is * zero. 
*/ - data->nr_anon_vma_locks = mm_lock_sort_anon_vma(mm, anon_vma_locks); - data->nr_i_mmap_locks = mm_lock_sort_i_mmap(mm, i_mmap_locks); + data->nr_anon_vma_sems = mm_lock_sort_anon_vma(mm, anon_vma_sems); + data->nr_i_mmap_sems = mm_lock_sort_i_mmap(mm, i_mmap_sems); - if (data->nr_anon_vma_locks) { - __mm_lock(anon_vma_locks, data->nr_anon_vma_locks); - data->anon_vma_locks = anon_vma_locks; + if (data->nr_anon_vma_sems) { + __mm_lock(anon_vma_sems, data->nr_anon_vma_sems); + data->anon_vma_sems = anon_vma_sems; } else - vfree(anon_vma_locks); + vfree(anon_vma_sems); - if (data->nr_i_mmap_locks) { - __mm_lock(i_mmap_locks, data->nr_i_mmap_locks); - data->i_mmap_locks = i_mmap_locks; + if (data->nr_i_mmap_sems) { + __mm_lock(i_mmap_sems, data->nr_i_mmap_sems); + data->i_mmap_sems = i_mmap_sems; } else - vfree(i_mmap_locks); + vfree(i_mmap_sems); } return 0; } -static void mm_unlock_vfree(spinlock_t **locks, size_t nr) +static void mm_unlock_vfree(struct rw_semaphore **sems, size_t nr) { - __mm_unlock(locks, nr); - vfree(locks); + __mm_unlock(sems, nr); + vfree(sems); } /* @@ -2435,11 +2435,11 @@ void mm_unlock(struct mm_struct *mm, str void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data) { if (mm->map_count) { - if (data->nr_anon_vma_locks) - mm_unlock_vfree(data->anon_vma_locks, - data->nr_anon_vma_locks); - if (data->nr_i_mmap_locks) - mm_unlock_vfree(data->i_mmap_locks, - data->nr_i_mmap_locks); + if (data->nr_anon_vma_sems) + mm_unlock_vfree(data->anon_vma_sems, + data->nr_anon_vma_sems); + if (data->nr_i_mmap_sems) + mm_unlock_vfree(data->i_mmap_sems, + data->nr_i_mmap_sems); } } |
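Stripped of the allocation and bookkeeping, the lock-acquisition order mm_lock() depends on after this conversion looks like the sketch below: sort the semaphore pointers by address and take each distinct one exactly once, in ascending order, so two concurrent callers can never deadlock against each other. lock_sems_in_order() is a made-up condensation of mm_lock_cmp()/__mm_lock() above:

#include <linux/rwsem.h>
#include <linux/sort.h>

static int sem_cmp(const void *a, const void *b)
{
	unsigned long _a = (unsigned long)*(struct rw_semaphore **)a;
	unsigned long _b = (unsigned long)*(struct rw_semaphore **)b;

	if (_a < _b)
		return -1;
	return _a > _b;
}

static void lock_sems_in_order(struct rw_semaphore **sems, size_t nr)
{
	struct rw_semaphore *last = NULL;
	size_t i;

	sort(sems, nr, sizeof(*sems), sem_cmp, NULL);
	for (i = 0; i < nr; i++) {
		if (sems[i] == last)
			continue;	/* several vmas may share one sem */
		last = sems[i];
		down_write(last);
	}
}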
From: Andrea A. <an...@qu...> - 2008-05-07 14:38:07
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115132 -7200 # Node ID 0621238970155f8ff2d60ca4996dcdd470f9c6ce # Parent 20bc6a66a86ef6bd60919cc77ff51d4af741b057 rwsem contended Add a function to rw_semaphores to check if there are any processes waiting for the semaphore. Add rwsem_needbreak to sched.h that works in the same way as spinlock_needbreak(). Signed-off-by: Christoph Lameter <cla...@sg...> Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,6 +57,8 @@ extern void up_write(struct rw_semaphore */ extern void downgrade_write(struct rw_semaphore *sem); +extern int rwsem_is_contended(struct rw_semaphore *sem); + #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * nested locking. NOTE: rwsems are not allowed to recurse diff --git a/include/linux/sched.h b/include/linux/sched.h --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2030,6 +2030,15 @@ static inline int spin_needbreak(spinloc #endif } +static inline int rwsem_needbreak(struct rw_semaphore *sem) +{ +#ifdef CONFIG_PREEMPT + return rwsem_is_contended(sem); +#else + return 0; +#endif +} + /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -305,6 +305,18 @@ void __downgrade_write(struct rw_semapho spin_unlock_irqrestore(&sem->wait_lock, flags); } +int rwsem_is_contended(struct rw_semaphore *sem) +{ + /* + * Racy check for an empty list. False positives or negatives + * would be okay. False positive may cause a useless dropping of + * locks. False negatives may cause locks to be held a bit + * longer until the next check. + */ + return !list_empty(&sem->wait_list); +} + +EXPORT_SYMBOL(rwsem_is_contended); EXPORT_SYMBOL(__init_rwsem); EXPORT_SYMBOL(__down_read); EXPORT_SYMBOL(__down_read_trylock); diff --git a/lib/rwsem.c b/lib/rwsem.c --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -251,6 +251,18 @@ asmregparm struct rw_semaphore *rwsem_do return sem; } +int rwsem_is_contended(struct rw_semaphore *sem) +{ + /* + * Racy check for an empty list. False positives or negatives + * would be okay. False positive may cause a useless dropping of + * locks. False negatives may cause locks to be held a bit + * longer until the next check. + */ + return !list_empty(&sem->wait_list); +} + +EXPORT_SYMBOL(rwsem_is_contended); EXPORT_SYMBOL(rwsem_down_read_failed); EXPORT_SYMBOL(rwsem_down_write_failed); EXPORT_SYMBOL(rwsem_wake); |
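How the new helper is meant to be used, mirroring the existing spin_needbreak() idiom: a long walk drops and retakes the semaphore whenever someone is queued behind it (under CONFIG_PREEMPT) or a reschedule is due. walk_one_chunk() is a hypothetical stand-in for one iteration of such a walk:

#include <linux/rwsem.h>
#include <linux/sched.h>

static int walk_one_chunk(void)
{
	return 0;	/* hypothetical: returns 0 when the walk is done */
}

static void long_walk(struct rw_semaphore *sem)
{
	down_read(sem);
	while (walk_one_chunk()) {
		if (need_resched() || rwsem_needbreak(sem)) {
			up_read(sem);
			cond_resched();
			down_read(sem);
		}
	}
	up_read(sem);
}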
From: Andrea A. <an...@qu...> - 2008-05-07 14:38:11
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115131 -7200 # Node ID 20bc6a66a86ef6bd60919cc77ff51d4af741b057 # Parent 34f6a4bf67ce66714ba2d5c13a5fed241d34fb09 unmap vmas tlb flushing Move the tlb flushing inside of unmap vmas. This saves us from passing a pointer to the TLB structure around and simplifies the callers. Signed-off-by: Christoph Lameter <cla...@sg...> Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -744,8 +744,7 @@ struct page *vm_normal_page(struct vm_ar unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, - struct vm_area_struct *start_vma, unsigned long start_addr, +unsigned long unmap_vmas(struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -849,7 +849,6 @@ static unsigned long unmap_page_range(st /** * unmap_vmas - unmap a range of memory covered by a list of vma's - * @tlbp: address of the caller's struct mmu_gather * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -861,20 +860,13 @@ static unsigned long unmap_page_range(st * Unmap all pages in the vma list. * * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. + * So zap pages in ZAP_BLOCK_SIZE bytecounts. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. - * - * unmap_vmas() assumes that the caller will flush the whole unmapped address - * range after unmap_vmas() returns. So the only responsibility here is to - * ensure that any thus-far unmapped pages are flushed before unmap_vmas() - * drops the lock and schedules. */ -unsigned long unmap_vmas(struct mmu_gather **tlbp, - struct vm_area_struct *vma, unsigned long start_addr, +unsigned long unmap_vmas(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { @@ -883,9 +875,14 @@ unsigned long unmap_vmas(struct mmu_gath int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? 
details->i_mmap_lock: NULL; - int fullmm = (*tlbp)->fullmm; + int fullmm; + struct mmu_gather *tlb; struct mm_struct *mm = vma->vm_mm; + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); + fullmm = tlb->fullmm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -912,7 +909,7 @@ unsigned long unmap_vmas(struct mmu_gath (HPAGE_SIZE / PAGE_SIZE); start = end; } else - start = unmap_page_range(*tlbp, vma, + start = unmap_page_range(tlb, vma, start, end, &zap_work, details); if (zap_work > 0) { @@ -920,22 +917,23 @@ unsigned long unmap_vmas(struct mmu_gath break; } - tlb_finish_mmu(*tlbp, tlb_start, start); + tlb_finish_mmu(tlb, tlb_start, start); if (need_resched() || (i_mmap_lock && spin_needbreak(i_mmap_lock))) { if (i_mmap_lock) { - *tlbp = NULL; + tlb = NULL; goto out; } cond_resched(); } - *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); + tlb = tlb_gather_mmu(vma->vm_mm, fullmm); tlb_start_valid = 0; zap_work = ZAP_BLOCK_SIZE; } } + tlb_finish_mmu(tlb, start_addr, end_addr); out: mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ @@ -951,18 +949,10 @@ unsigned long zap_page_range(struct vm_a unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; unsigned long end = address + size; unsigned long nr_accounted = 0; - lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); - update_hiwater_rss(mm); - end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); - if (tlb) - tlb_finish_mmu(tlb, address, end); - return end; + return unmap_vmas(vma, address, end, &nr_accounted, details); } /* diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1751,15 +1751,10 @@ static void unmap_region(struct mm_struc unsigned long start, unsigned long end) { struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; - struct mmu_gather *tlb; unsigned long nr_accounted = 0; - lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); - update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); + unmap_vmas(vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - tlb_finish_mmu(tlb, start, end); free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); } @@ -2044,7 +2039,6 @@ EXPORT_SYMBOL(do_brk); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { - struct mmu_gather *tlb; struct vm_area_struct *vma = mm->mmap; unsigned long nr_accounted = 0; unsigned long end; @@ -2055,12 +2049,11 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb = tlb_gather_mmu(mm, 1); + /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + end = unmap_vmas(vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - tlb_finish_mmu(tlb, 0, end); free_pgtables(vma, FIRST_USER_ADDRESS, 0); /* |
From: Rik v. R. <ri...@re...> - 2008-05-07 17:52:14
|
On Wed, 07 May 2008 16:35:55 +0200 Andrea Arcangeli <an...@qu...> wrote: > Signed-off-by: Christoph Lameter <cla...@sg...> > Signed-off-by: Andrea Arcangeli <an...@qu...> Acked-by: Rik van Riel <ri...@re...> -- All rights reversed. |
From: Andrea A. <an...@qu...> - 2008-05-07 14:38:15
|
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1210115136 -7200 # Node ID 6b384bb988786aa78ef07440180e4b2948c4c6a2 # Parent 58f716ad4d067afb6bdd1b5f7042e19d854aae0d anon-vma-rwsem Convert the anon_vma spinlock to a rw semaphore. This allows concurrent traversal of reverse maps for try_to_unmap() and page_mkclean(). It also allows the calling of sleeping functions from reverse map traversal as needed for the notifier callbacks. It includes possible concurrency. Rcu is used in some context to guarantee the presence of the anon_vma (try_to_unmap) while we acquire the anon_vma lock. We cannot take a semaphore within an rcu critical section. Add a refcount to the anon_vma structure which allow us to give an existence guarantee for the anon_vma structure independent of the spinlock or the list contents. The refcount can then be taken within the RCU section. If it has been taken successfully then the refcount guarantees the existence of the anon_vma. The refcount in anon_vma also allows us to fix a nasty issue in page migration where we fudged by using rcu for a long code path to guarantee the existence of the anon_vma. I think this is a bug because the anon_vma may become empty and get scheduled to be freed but then we increase the refcount again when the migration entries are removed. The refcount in general allows a shortening of RCU critical sections since we can do a rcu_unlock after taking the refcount. This is particularly relevant if the anon_vma chains contain hundreds of entries. However: - Atomic overhead increases in situations where a new reference to the anon_vma has to be established or removed. Overhead also increases when a speculative reference is used (try_to_unmap, page_mkclean, page migration). - There is the potential for more frequent processor change due to up_xxx letting waiting tasks run first. This results in f.e. the Aim9 brk performance test to got down by 10-15%. Signed-off-by: Christoph Lameter <cla...@sg...> Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/include/linux/rmap.h b/include/linux/rmap.h --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -25,7 +25,8 @@ * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { - spinlock_t lock; /* Serialize access to vma list */ + atomic_t refcount; /* vmas on the list */ + struct rw_semaphore sem;/* Serialize access to vma list */ struct list_head head; /* List of private "related" vmas */ }; @@ -43,18 +44,31 @@ static inline void anon_vma_free(struct kmem_cache_free(anon_vma_cachep, anon_vma); } +struct anon_vma *grab_anon_vma(struct page *page); + +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->refcount); +} + +static inline void put_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_test(&anon_vma->refcount)) + anon_vma_free(anon_vma); +} + static inline void anon_vma_lock(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); } static inline void anon_vma_unlock(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); } /* diff --git a/mm/migrate.c b/mm/migrate.c --- a/mm/migrate.c +++ b/mm/migrate.c @@ -235,15 +235,16 @@ static void remove_anon_migration_ptes(s return; /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + * We hold either the mmap_sem lock or a reference on the + * anon_vma. 
So no need to call page_lock_anon_vma. */ anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); + down_read(&anon_vma->sem); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) remove_migration_pte(vma, old, new); - spin_unlock(&anon_vma->lock); + up_read(&anon_vma->sem); } /* @@ -630,7 +631,7 @@ static int unmap_and_move(new_page_t get int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); - int rcu_locked = 0; + struct anon_vma *anon_vma = NULL; int charge = 0; if (!newpage) @@ -654,16 +655,14 @@ static int unmap_and_move(new_page_t get } /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, - * we cannot notice that anon_vma is freed while we migrates a page. + * we cannot notice that anon_vma is freed while we migrate a page. * This rcu_read_lock() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ - if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - } + if (PageAnon(page)) + anon_vma = grab_anon_vma(page); /* * Corner case handling: @@ -681,10 +680,7 @@ static int unmap_and_move(new_page_t get if (!PageAnon(page) && PagePrivate(page)) { /* * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. + * that's what try_to_release_page() would do anyway */ try_to_free_buffers(page); } @@ -705,8 +701,8 @@ static int unmap_and_move(new_page_t get } else if (charge) mem_cgroup_end_migration(newpage); rcu_unlock: - if (rcu_locked) - rcu_read_unlock(); + if (anon_vma) + put_anon_vma(anon_vma); unlock: diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -570,7 +570,7 @@ again: remove_next = 1 + (end > next-> if (vma->anon_vma) anon_vma = vma->anon_vma; if (anon_vma) { - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); /* * Easily overlooked: when mprotect shifts the boundary, * make sure the expanding vma has anon_vma set if the @@ -624,7 +624,7 @@ again: remove_next = 1 + (end > next-> } if (anon_vma) - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); if (mapping) up_write(&mapping->i_mmap_sem); diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c @@ -69,7 +69,7 @@ int anon_vma_prepare(struct vm_area_stru if (anon_vma) { allocated = NULL; locked = anon_vma; - spin_lock(&locked->lock); + down_write(&locked->sem); } else { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) @@ -81,6 +81,7 @@ int anon_vma_prepare(struct vm_area_stru /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { + get_anon_vma(anon_vma); vma->anon_vma = anon_vma; list_add_tail(&vma->anon_vma_node, &anon_vma->head); allocated = NULL; @@ -88,7 +89,7 @@ int anon_vma_prepare(struct vm_area_stru spin_unlock(&mm->page_table_lock); if (locked) - spin_unlock(&locked->lock); + up_write(&locked->sem); if (unlikely(allocated)) anon_vma_free(allocated); } @@ -99,14 +100,17 @@ void __anon_vma_merge(struct vm_area_str { BUG_ON(vma->anon_vma != next->anon_vma); list_del(&next->anon_vma_node); + put_anon_vma(vma->anon_vma); } void __anon_vma_link(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) + if (anon_vma) { + 
get_anon_vma(anon_vma); list_add_tail(&vma->anon_vma_node, &anon_vma->head); + } } void anon_vma_link(struct vm_area_struct *vma) @@ -114,36 +118,32 @@ void anon_vma_link(struct vm_area_struct struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) { - spin_lock(&anon_vma->lock); + get_anon_vma(anon_vma); + down_write(&anon_vma->sem); list_add_tail(&vma->anon_vma_node, &anon_vma->head); - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); } } void anon_vma_unlink(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; - int empty; if (!anon_vma) return; - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); list_del(&vma->anon_vma_node); - - /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - - if (empty) - anon_vma_free(anon_vma); + up_write(&anon_vma->sem); + put_anon_vma(anon_vma); } static void anon_vma_ctor(struct kmem_cache *cachep, void *data) { struct anon_vma *anon_vma = data; - spin_lock_init(&anon_vma->lock); + init_rwsem(&anon_vma->sem); + atomic_set(&anon_vma->refcount, 0); INIT_LIST_HEAD(&anon_vma->head); } @@ -157,9 +157,9 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *grab_anon_vma(struct page *page) { - struct anon_vma *anon_vma; + struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); @@ -170,17 +170,26 @@ static struct anon_vma *page_lock_anon_v goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - return anon_vma; + if (!atomic_inc_not_zero(&anon_vma->refcount)) + anon_vma = NULL; out: rcu_read_unlock(); - return NULL; + return anon_vma; +} + +static struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma = grab_anon_vma(page); + + if (anon_vma) + down_read(&anon_vma->sem); + return anon_vma; } static void page_unlock_anon_vma(struct anon_vma *anon_vma) { - spin_unlock(&anon_vma->lock); - rcu_read_unlock(); + up_read(&anon_vma->sem); + put_anon_vma(anon_vma); } /* |
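The key pattern the patch introduces is a speculative reference: RCU only guarantees the anon_vma's memory is not freed while we look at it, and atomic_inc_not_zero() turns that into a real reference that survives past rcu_read_unlock(), which is what makes taking the sleeping rw_semaphore afterwards legal. The same idea in generic form, with made-up obj/lookup names:

#include <linux/rcupdate.h>
#include <linux/rwsem.h>
#include <asm/atomic.h>

struct obj {
	atomic_t refcount;
	struct rw_semaphore sem;
};

static struct obj *obj_get_and_lock(struct obj *(*lookup_obj)(void))
{
	struct obj *o;

	rcu_read_lock();
	o = lookup_obj();			/* may be about to be freed */
	if (o && !atomic_inc_not_zero(&o->refcount))
		o = NULL;			/* it was already dead */
	rcu_read_unlock();

	if (o)
		down_read(&o->sem);	/* safe to sleep: we hold a reference */
	return o;
}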
From: Linus T. <tor...@li...> - 2008-05-07 20:56:41
|
On Wed, 7 May 2008, Andrea Arcangeli wrote: > > Convert the anon_vma spinlock to a rw semaphore. This allows concurrent > traversal of reverse maps for try_to_unmap() and page_mkclean(). It also > allows the calling of sleeping functions from reverse map traversal as > needed for the notifier callbacks. It includes possible concurrency. This also looks very debatable indeed. The only performance numbers quoted are: > This results in f.e. the Aim9 brk performance test to got down by 10-15%. which just seems like a total disaster. The whole series looks bad, in fact. Lack of authorship, bad single-line description, and the code itself sucks so badly that it's not even funny. NAK NAK NAK. All of it. It stinks. Linus |
From: Christoph L. <cla...@sg...> - 2008-05-08 01:12:32
|
On Wed, 7 May 2008, Linus Torvalds wrote: > and you're now done. You have your "mm_lock()" (which still needs to be > renamed - it should be a "mmu_notifier_lock()" or something like that), > but you don't need the insane sorting. At most you apparently need a way > to recognize duplicates (so that you don't deadlock on yourself), which > looks like a simple bit-per-vma. Andrea's mm_lock could have wider impact. It is the first effective way that I have seen of temporarily holding off reclaim from an address space. It sure is a brute force approach. |
From: Linus T. <tor...@li...> - 2008-05-08 01:32:56
|
On Wed, 7 May 2008, Christoph Lameter wrote: > On Wed, 7 May 2008, Linus Torvalds wrote: > > and you're now done. You have your "mm_lock()" (which still needs to be > > renamed - it should be a "mmu_notifier_lock()" or something like that), > > but you don't need the insane sorting. At most you apparently need a way > > to recognize duplicates (so that you don't deadlock on yourself), which > > looks like a simple bit-per-vma. > > Andrea's mm_lock could have wider impact. It is the first effective > way that I have seen of temporarily holding off reclaim from an address > space. It sure is a brute force approach. Well, I don't think the naming necessarily has to be about notifiers, but it should be at least a *bit* more scary than "mm_lock()", to make it clear that it's pretty dang expensive. Even without the vmalloc and sorting, if it were used by "normal" things it would still be very expensive for some cases - running things like ElectricFence, for example, will easily generate thousands and thousands of vma's in a process. Linus |
From: Andrea A. <an...@qu...> - 2008-05-08 02:56:47
|
On Wed, May 07, 2008 at 06:12:32PM -0700, Christoph Lameter wrote: > Andrea's mm_lock could have wider impact. It is the first effective > way that I have seen of temporarily holding off reclaim from an address > space. It sure is a brute force approach. The only improvements I can imagine for mm_lock, after renaming it to global_mm_lock(), are to reestablish the signal_pending check in the loop that takes the spinlock (backing off if a signal arrives) and to cap it at 512 vmas, so that the ram wasted on anon-vmas wouldn't save more than 10-100usec at most (plus the vfree, which may be a bigger cost, but we're ok to pay it and it surely isn't security related). Then, in the long term, we need to talk to Matt about adding a parameter to the sort function to break out of the loop. After that we can remove the 512-vma cap and mm_lock is free to run as long as it wants, like /dev/urandom; nobody cares how long it runs before returning as long as it reacts to signals. This is the right way if we want to support XPMEM/GRU efficiently and without introducing unnecessary regressions in the VM fastpaths and VM footprint. |
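What reestablishing the signal check would roughly look like inside the loop that takes the locks; a sketch only, with the unlock-on-failure path left to the caller:

#include <linux/errno.h>
#include <linux/rwsem.h>
#include <linux/sched.h>

static int lock_all_sems(struct rw_semaphore **sems, size_t nr)
{
	struct rw_semaphore *last = NULL;
	size_t i;

	for (i = 0; i < nr; i++) {
		if (signal_pending(current))
			return -EINTR;	/* caller releases whatever was taken */
		if (sems[i] == last)
			continue;	/* duplicate: already held */
		last = sems[i];
		down_write(last);
	}
	return 0;
}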
From: Andrea A. <an...@qu...> - 2008-05-07 21:26:49
|
On Wed, May 07, 2008 at 01:56:23PM -0700, Linus Torvalds wrote: > This also looks very debatable indeed. The only performance numbers quoted > are: > > > This results in f.e. the Aim9 brk performance test to got down by 10-15%. > > which just seems like a total disaster. > > The whole series looks bad, in fact. Lack of authorship, bad single-line Glad you agree. Note that the fact the whole series looks bad, is _exactly_ why I couldn't let Christoph keep going with mmu-notifier-core at the very end of his patchset. I had to move it at the top to have a chance to get the KVM and GRU requirements merged in 2.6.26. I think the spinlock->rwsem conversion is ok under config option, as you can see I complained myself to various of those patches and I'll take care they're in a mergeable state the moment I submit them. What XPMEM requires are different semantics for the methods, and we never had to do any blocking I/O during vmtruncate before, now we have to. And I don't see a problem in making the conversion from spinlock->rwsem only if CONFIG_XPMEM=y as I doubt XPMEM works on anything but ia64. Please ignore all patches but mmu-notifier-core. I regularly forward _only_ mmu-notifier-core to Andrew, that's the only one that is in merge-ready status, everything else is just so XPMEM can test and we can keep discussing it to bring it in a mergeable state like mmu-notifier-core already is. |
From: Jack S. <st...@sg...> - 2008-05-07 22:42:29
|
> And I don't see a problem in making the conversion from > spinlock->rwsem only if CONFIG_XPMEM=y as I doubt XPMEM works on > anything but ia64. That is currently true but we are also working on XPMEM for x86_64. The new XPMEM code should be posted within a few weeks. --- jack |
From: Linus T. <tor...@li...> - 2008-05-07 21:37:33
|
On Wed, 7 May 2008, Andrea Arcangeli wrote: > > I think the spinlock->rwsem conversion is ok under config option, as > you can see I complained myself to various of those patches and I'll > take care they're in a mergeable state the moment I submit them. What > XPMEM requires are different semantics for the methods, and we never > had to do any blocking I/O during vmtruncate before, now we have to. I really suspect we don't really have to, and that it would be better to just fix the code that does that. > Please ignore all patches but mmu-notifier-core. I regularly forward > _only_ mmu-notifier-core to Andrew, that's the only one that is in > merge-ready status, everything else is just so XPMEM can test and we > can keep discussing it to bring it in a mergeable state like > mmu-notifier-core already is. The thing is, I didn't like that one *either*. I thought it was the biggest turd in the series (and by "biggest", I literally mean "most lines of turd-ness" rather than necessarily "ugliest per se"). I literally think that mm_lock() is an unbelievable piece of utter and horrible CRAP. There's simply no excuse for code like that. If you want to avoid the deadlock from taking multiple locks in order, but there is really just a single operation that needs it, there's a really really simple solution. And that solution is *not* to sort the whole damn f*cking list in a vmalloc'ed data structure prior to locking! Damn. No, the simple solution is to just make up a whole new upper-level lock, and get that lock *first*. You can then take all the multiple locks at a lower level in any order you damn well please. And yes, it's one more lock, and yes, it serializes stuff, but: - that code had better not be critical anyway, because if it was, then the whole "vmalloc+sort+lock+vunmap" sh*t was wrong _anyway_ - parallelism is overrated: it doesn't matter one effing _whit_ if something is a hundred times more parallel, if it's also a hundred times *SLOWER*. So dang it, flush the whole damn series down the toilet and either forget the thing entirely, or re-do it sanely. And here's an admission that I lied: it wasn't *all* clearly crap. I did like one part, namely list_del_init_rcu(), but that one should have been in a separate patch. I'll happily apply that one. Linus |
From: Linus T. <tor...@li...> - 2008-05-08 01:40:02
|
On Wed, 7 May 2008, Christoph Lameter wrote: > > > (That said, we're not running out of vm flags yet, and if we were, we > > could just add another word. We're already wasting that space right now on > > 64-bit by calling it "unsigned long"). > > We sure have enough flags. Oh, btw, I was wrong - we wouldn't want to mark the vma's (they are unique), we need to mark the address spaces/anon_vma's. So the flag would need to be in the "struct anon_vma" (and struct address_space), not in the vma itself. My bad. So the flag wouldn't be one of the VM_xyzzy flags, and would require adding a new field to "struct anon_vma". And related to that brain-fart of mine, that obviously also means that yes, the locking has to be stronger than "mm->mmap_sem" held for writing, so yeah, it would have to be a separate global spinlock (or perhaps a blocking lock if you have some reason to protect anything else with this too). Linus |
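A skeleton of the scheme described in the last two mails: one new top-level lock serializes the rare registration path, so the per-object locks below it can be taken in whatever order the vma list produces, and a flag in the shared object (not in the vma) is enough to skip duplicates. struct shared_obj and its fields are made up for illustration; this is not the code that was eventually merged:

#include <linux/mutex.h>
#include <linux/rwsem.h>

struct shared_obj {			/* stand-in for anon_vma / address_space */
	struct rw_semaphore sem;
	int locked_for_reg;		/* protected by reg_mutex */
};

static DEFINE_MUTEX(reg_mutex);		/* the new "upper-level" lock */

static void lock_shared_objs(struct shared_obj **objs, size_t nr)
{
	size_t i;

	mutex_lock(&reg_mutex);		/* taken first, before any sem */
	for (i = 0; i < nr; i++) {
		if (objs[i]->locked_for_reg)
			continue;	/* already taken via another vma */
		objs[i]->locked_for_reg = 1;
		down_write(&objs[i]->sem);
	}
	/*
	 * reg_mutex stays held until the matching unlock path clears the
	 * flags, releases the sems and drops the mutex; no sorting of the
	 * sems is needed anywhere.
	 */
}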
From: Andrea A. <an...@qu...> - 2008-05-08 01:52:45
|
On Wed, May 07, 2008 at 06:39:48PM -0700, Linus Torvalds wrote: > > > On Wed, 7 May 2008, Christoph Lameter wrote: > > > > > (That said, we're not running out of vm flags yet, and if we were, we > > > could just add another word. We're already wasting that space right now on > > > 64-bit by calling it "unsigned long"). > > > > We sure have enough flags. > > Oh, btw, I was wrong - we wouldn't want to mark the vma's (they are > unique), we need to mark the address spaces/anon_vma's. So the flag would > need to be in the "struct anon_vma" (and struct address_space), not in the > vma itself. My bad. So the flag wouldn't be one of the VM_xyzzy flags, and > would require adding a new field to "struct anon_vma". > > And related to that brain-fart of mine, that obviously also means that > yes, the locking has to be stronger than "mm->mmap_sem" held for writing, > so yeah, it would have to be a separate global spinlock (or perhaps a > blocking lock if you have some reason to protect anything else with this So because the bitflag can't prevent taking the same lock twice on two different vmas in the same mm, we still can't remove the sorting, and the global lock won't buy much other than reducing the collisions. I can add that though. I think it's more interesting to cap the number of vmas at min(1024, max_map_count). Sorting an 8k array runs in effectively constant time. kvm runs with 127 vmas allocated... |
From: Christoph L. <cla...@sg...> - 2008-05-08 03:10:28
|
On Thu, 8 May 2008, Andrea Arcangeli wrote: > a parameter to the sort function to break out of the loop. After that we > can remove the 512-vma cap and mm_lock is free to run as long as it wants, > like /dev/urandom; nobody cares how long it runs before returning as long > as it reacts to signals. Look, Linus has told you what to do. Why not simply do it? |
From: Andrea A. <an...@qu...> - 2008-05-08 03:41:30
|
On Wed, May 07, 2008 at 08:10:33PM -0700, Christoph Lameter wrote: > On Thu, 8 May 2008, Andrea Arcangeli wrote: > > > a parameter to the sort function to break out of the loop. After that we > > can remove the 512-vma cap and mm_lock is free to run as long as it wants, > > like /dev/urandom; nobody cares how long it runs before returning as long > > as it reacts to signals. > > Look, Linus has told you what to do. Why not simply do it? When it looked like we could use vm_flags to remove the sort, that seemed an ok optimization; I have no problem with optimizations, and I'm all for them if they cost nothing to the VM fast paths and VM footprint. But removing the sort isn't worth it if it takes ram away from the VM even when global_mm_lock will never be called. sort is like /dev/urandom, so after sort is fixed to handle signals (and I expect Matt will help with this) we'll remove the 512-vma cap. In the meantime we can live with the 512-vma cap, which guarantees sort won't take more than a few dozen usec. Removing sort() is the only thing the anon-vma bitflag can achieve; it's clearly not worth it and it would go in the wrong direction (fixing sort to handle signals is clearly the right direction, if sort is a concern at all). Adding the global lock around global_mm_lock, to keep one global_mm_lock from colliding with another, is fine with me; if that's still wanted now that it's clear removing the sort isn't worth it, I'm neutral on it. Christoph, please go ahead and add the bitflag to anon-vma yourself if you want. If something isn't technically right, I don't do it, no matter who asks. |