From: Andrea A. <an...@qu...> - 2008-04-22 14:10:35
# HG changeset patch
# User Andrea Arcangeli <an...@qu...>
# Date 1208872186 -7200
# Node ID ac9bb1fb3de2aa5d27210a28edf24f6577094076
# Parent a6672bdeead0d41b2ebd6846f731d43a611645b7

Moves all mmu notifier methods outside the PT lock (first and not last
step to make them sleep capable).

Signed-off-by: Andrea Arcangeli <an...@qu...>

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -169,27 +169,6 @@
         INIT_HLIST_HEAD(&mm->mmu_notifier_list);
 }
 
-#define ptep_clear_flush_notify(__vma, __address, __ptep)              \
-({                                                                     \
-       pte_t __pte;                                                    \
-       struct vm_area_struct *___vma = __vma;                          \
-       unsigned long ___address = __address;                           \
-       __pte = ptep_clear_flush(___vma, ___address, __ptep);           \
-       mmu_notifier_invalidate_page(___vma->vm_mm, ___address);        \
-       __pte;                                                          \
-})
-
-#define ptep_clear_flush_young_notify(__vma, __address, __ptep)        \
-({                                                                     \
-       int __young;                                                    \
-       struct vm_area_struct *___vma = __vma;                          \
-       unsigned long ___address = __address;                           \
-       __young = ptep_clear_flush_young(___vma, ___address, __ptep);   \
-       __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
-                                                 ___address);          \
-       __young;                                                        \
-})
-
 #else /* CONFIG_MMU_NOTIFIER */
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
@@ -221,9 +200,6 @@
 {
 }
 
-#define ptep_clear_flush_young_notify ptep_clear_flush_young
-#define ptep_clear_flush_notify ptep_clear_flush
-
 #endif /* CONFIG_MMU_NOTIFIER */
 
 #endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,11 +194,13 @@
                 if (pte) {
                         /* Nuke the page table entry. */
                         flush_cache_page(vma, address, pte_pfn(*pte));
-                        pteval = ptep_clear_flush_notify(vma, address, pte);
+                        pteval = ptep_clear_flush(vma, address, pte);
                         page_remove_rmap(page, vma);
                         dec_mm_counter(mm, file_rss);
                         BUG_ON(pte_dirty(pteval));
                         pte_unmap_unlock(pte, ptl);
+                        /* must invalidate_page _before_ freeing the page */
+                        mmu_notifier_invalidate_page(mm, address);
                         page_cache_release(page);
                 }
         }
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1627,9 +1627,10 @@
                  */
                 page_table = pte_offset_map_lock(mm, pmd, address,
                                                  &ptl);
-                page_cache_release(old_page);
+                new_page = NULL;
                 if (!pte_same(*page_table, orig_pte))
                         goto unlock;
+                page_cache_release(old_page);
 
                 page_mkwrite = 1;
         }
@@ -1645,6 +1646,7 @@
                 if (ptep_set_access_flags(vma, address, page_table, entry,1))
                         update_mmu_cache(vma, address, entry);
                 ret |= VM_FAULT_WRITE;
+                old_page = new_page = NULL;
                 goto unlock;
         }
 
@@ -1689,7 +1691,7 @@
                  * seen in the presence of one thread doing SMC and another
                  * thread doing COW.
                  */
-                ptep_clear_flush_notify(vma, address, page_table);
+                ptep_clear_flush(vma, address, page_table);
                 set_pte_at(mm, address, page_table, entry);
                 update_mmu_cache(vma, address, entry);
                 lru_cache_add_active(new_page);
@@ -1701,12 +1703,18 @@
         } else
                 mem_cgroup_uncharge_page(new_page);
 
-        if (new_page)
+unlock:
+        pte_unmap_unlock(page_table, ptl);
+
+        if (new_page) {
+                if (new_page == old_page)
+                        /* cow happened, notify before releasing old_page */
+                        mmu_notifier_invalidate_page(mm, address);
                 page_cache_release(new_page);
+        }
         if (old_page)
                 page_cache_release(old_page);
-unlock:
-        pte_unmap_unlock(page_table, ptl);
+
         if (dirty_page) {
                 if (vma->vm_file)
                         file_update_time(vma->vm_file);
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -275,7 +275,7 @@
         unsigned long address;
         pte_t *pte;
         spinlock_t *ptl;
-        int referenced = 0;
+        int referenced = 0, clear_flush_young = 0;
 
         address = vma_address(page, vma);
         if (address == -EFAULT)
@@ -288,8 +288,11 @@
         if (vma->vm_flags & VM_LOCKED) {
                 referenced++;
                 *mapcount = 1;  /* break early from loop */
-        } else if (ptep_clear_flush_young_notify(vma, address, pte))
-                referenced++;
+        } else {
+                clear_flush_young = 1;
+                if (ptep_clear_flush_young(vma, address, pte))
+                        referenced++;
+        }
 
         /* Pretend the page is referenced if the task has the
            swap token and is in the middle of a page fault. */
@@ -299,6 +302,10 @@
 
         (*mapcount)--;
         pte_unmap_unlock(pte, ptl);
+
+        if (clear_flush_young)
+                referenced += mmu_notifier_clear_flush_young(mm, address);
+
 out:
         return referenced;
 }
@@ -457,7 +464,7 @@
                 pte_t entry;
 
                 flush_cache_page(vma, address, pte_pfn(*pte));
-                entry = ptep_clear_flush_notify(vma, address, pte);
+                entry = ptep_clear_flush(vma, address, pte);
                 entry = pte_wrprotect(entry);
                 entry = pte_mkclean(entry);
                 set_pte_at(mm, address, pte, entry);
@@ -465,6 +472,10 @@
         }
 
         pte_unmap_unlock(pte, ptl);
+
+        if (ret)
+                mmu_notifier_invalidate_page(mm, address);
+
 out:
         return ret;
 }
@@ -717,15 +728,14 @@
          * If it's recently referenced (perhaps page_referenced
          * skipped over this mm) then we should reactivate it.
          */
-        if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-                        (ptep_clear_flush_young_notify(vma, address, pte)))) {
+        if (!migration && (vma->vm_flags & VM_LOCKED)) {
                 ret = SWAP_FAIL;
                 goto out_unmap;
         }
 
         /* Nuke the page table entry. */
         flush_cache_page(vma, address, page_to_pfn(page));
-        pteval = ptep_clear_flush_notify(vma, address, pte);
+        pteval = ptep_clear_flush(vma, address, pte);
 
         /* Move the dirty bit to the physical page now the pte is gone. */
         if (pte_dirty(pteval))
@@ -780,6 +790,8 @@
 
 out_unmap:
         pte_unmap_unlock(pte, ptl);
+        if (ret != SWAP_FAIL)
+                mmu_notifier_invalidate_page(mm, address);
 out:
         return ret;
 }
@@ -818,7 +830,7 @@
         spinlock_t *ptl;
         struct page *page;
         unsigned long address;
-        unsigned long end;
+        unsigned long start, end;
 
         address = (vma->vm_start + cursor) & CLUSTER_MASK;
         end = address + CLUSTER_SIZE;
@@ -839,6 +851,8 @@
         if (!pmd_present(*pmd))
                 return;
 
+        start = address;
+        mmu_notifier_invalidate_range_start(mm, start, end);
         pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
         /* Update high watermark before we lower rss */
@@ -850,12 +864,12 @@
                 page = vm_normal_page(vma, address, *pte);
                 BUG_ON(!page || PageAnon(page));
 
-                if (ptep_clear_flush_young_notify(vma, address, pte))
+                if (ptep_clear_flush_young(vma, address, pte))
                         continue;
 
                 /* Nuke the page table entry. */
                 flush_cache_page(vma, address, pte_pfn(*pte));
-                pteval = ptep_clear_flush_notify(vma, address, pte);
+                pteval = ptep_clear_flush(vma, address, pte);
 
                 /* If nonlinear, store the file page offset in the pte. */
                 if (page->index != linear_page_index(vma, address))
@@ -871,6 +885,7 @@
                 (*mapcount)--;
         }
         pte_unmap_unlock(pte - 1, ptl);
+        mmu_notifier_invalidate_range_end(mm, start, end);
 }
 
 static int try_to_unmap_anon(struct page *page, int migration)
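The net effect of the patch is an ordering rule rather than a behavioural change: the primary pte is cleared and the hardware TLB flushed while the page-table spinlock is held, the lock is dropped, the secondary MMUs are invalidated next, and only then is the page reference released. A minimal sketch of that ordering follows; the helper name and its argument list are invented for illustration, only the individual calls come from the hunks above.

/* Illustrative only: the helper is invented, but the ordering matches
 * the patch (clear pte -> unlock -> notify -> free). */
static void unmap_one_page(struct mm_struct *mm, struct vm_area_struct *vma,
                           unsigned long address, pte_t *pte, spinlock_t *ptl,
                           struct page *page)
{
        pte_t pteval;

        flush_cache_page(vma, address, pte_pfn(*pte));
        pteval = ptep_clear_flush(vma, address, pte);   /* primary MMU + TLB */
        page_remove_rmap(page, vma);
        if (pte_dirty(pteval))
                set_page_dirty(page);
        pte_unmap_unlock(pte, ptl);                     /* PT lock dropped here */

        /*
         * The secondary MMUs are invalidated only after the PT lock is
         * dropped (so a notifier method may eventually sleep), but still
         * before the page is freed (so a device cannot keep using a
         * stale page).
         */
        mmu_notifier_invalidate_page(mm, address);

        page_cache_release(page);
}

Keeping the notifier call after the PT lock is what later allows a sleeping notifier implementation, while keeping it before page_cache_release() guarantees the secondary MMU can no longer reach a freed page.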
From: Christoph L. <cla...@sg...> - 2008-04-22 20:24:15
Reverts a part of an earlier patch. Why isn't this merged into 1 of 12?
From: Andrea A. <an...@qu...> - 2008-04-22 22:40:46
On Tue, Apr 22, 2008 at 01:24:21PM -0700, Christoph Lameter wrote:
> Reverts a part of an earlier patch. Why isn't this merged into 1 of 12?

To give zero regression risk to 1/12 when MMU_NOTIFIER=y or =n and the mmu notifiers aren't registered by GRU or KVM. Keep in mind that the whole point of my proposed patch ordering from day 0 is to keep as 1/N the absolute minimum change that fully satisfies the GRU and KVM requirements. 4/12 isn't required by GRU/KVM, so I keep it in a later patch. I have now moved mmu_notifier_unregister to a later patch too, for the same reason.
From: Christoph L. <cla...@sg...> - 2008-04-22 23:14:20
On Wed, 23 Apr 2008, Andrea Arcangeli wrote:
> On Tue, Apr 22, 2008 at 01:24:21PM -0700, Christoph Lameter wrote:
> > Reverts a part of an earlier patch. Why isn't this merged into 1 of 12?
>
> To give zero regression risk to 1/12 when MMU_NOTIFIER=y or =n and the
> mmu notifiers aren't registered by GRU or KVM. Keep in mind that the
> whole point of my proposed patch ordering from day 0 is to keep as 1/N
> the absolute minimum change that fully satisfies the GRU and KVM
> requirements. 4/12 isn't required by GRU/KVM, so I keep it in a later
> patch. I have now moved mmu_notifier_unregister to a later patch too,
> for the same reason.

We want a full solution, and this kind of patching makes the patches difficult to review because later patches revert earlier ones.
From: Andrea A. <an...@qu...> - 2008-04-23 13:44:24
On Tue, Apr 22, 2008 at 04:14:26PM -0700, Christoph Lameter wrote:
> We want a full solution, and this kind of patching makes the patches
> difficult to review because later patches revert earlier ones.

I know you would rather see KVM development stalled for more months than get a partial solution now that already covers KVM and GRU with the same API that XPMEM will also use later. It's very unfair on your side to expect other people's development to be stalled because what you need has stronger requirements and can't be merged immediately. This is especially true given it was publicly stated that XPMEM never passed all regression tests anyway, so you can't possibly be in as much of a hurry as we are: we can't progress without this. In fact we could, but it would be a huge effort, it would run _slower_, and it would all need to be deleted once mmu notifiers are in.

Note that the only patch you can avoid with your approach is mm_lock-rwsem, and given that one was generated by search-and-replace rather than written by hand, I don't see a big deal of wasted effort. The main difference is the ordering. Most of the code is orthogonal, so there's not much to revert.
From: Robin H. <ho...@sg...> - 2008-04-23 15:45:34
On Wed, Apr 23, 2008 at 03:44:27PM +0200, Andrea Arcangeli wrote:
> On Tue, Apr 22, 2008 at 04:14:26PM -0700, Christoph Lameter wrote:
> > We want a full solution, and this kind of patching makes the patches
> > difficult to review because later patches revert earlier ones.
>
> I know you would rather see KVM development stalled for more months than
> get a partial solution now that already covers KVM and GRU with the same
> API that XPMEM will also use later. It's very unfair on your side to
> expect other people's development to be stalled because what you need
> has stronger requirements and can't be merged immediately. This is
> especially true given it was publicly stated that XPMEM never passed
> all regression tests anyway, so you can't possibly be in as much

XPMEM has passed all regression tests using your version 12 notifiers. I have a bug in xpmem which shows up on our 8x oversubscription tests, but that is clearly my bug to figure out. Unfortunately it only shows up on a 128-processor machine, so I have 1024 stack traces to sort through each time it fails. That does take a bit of time and a lot of concentration.

> of a hurry as we are: we can't progress without this. In fact we could

SGI is under an equally strict timeline. We really needed the sleeping version in 2.6.26. We may still be able to get this accepted by vendor distros if we make 2.6.27.

Thanks, Robin
From: Avi K. <av...@qu...> - 2008-04-23 21:09:20
Robin Holt wrote:
>> of a hurry as we are: we can't progress without this. In fact we could
>
> SGI is under an equally strict timeline. We really needed the sleeping
> version in 2.6.26. We may still be able to get this accepted by
> vendor distros if we make 2.6.27.

The difference is that the non-sleeping variant can be shown not to affect stability or performance, even if configured in, as long as it's not used. The sleeping variant will raise performance and stability concerns.

I have zero objections to sleeping mmu notifiers; I only object to tying the schedules of the two together.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
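The "no effect as long as it's not used" argument relies on the notifier hooks compiling down to an inline check of an empty list when nothing is registered. A sketch of that fast-path pattern, assumed to mirror the patchset's helpers (the out-of-line __mmu_notifier_invalidate_page() name is an assumption, not taken from this thread):

/* Sketch of the fast path when no notifier is registered: the field
 * name follows the patchset, internal details are simplified. */
static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
                                                unsigned long address)
{
        if (unlikely(!hlist_empty(&mm->mmu_notifier_list)))
                __mmu_notifier_invalidate_page(mm, address);
        /* else: a single predicted-not-taken branch, no function call */
}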
From: Andrea A. <an...@qu...> - 2008-04-23 16:15:44
On Wed, Apr 23, 2008 at 10:45:36AM -0500, Robin Holt wrote:
> XPMEM has passed all regression tests using your version 12 notifiers.

That's great news, thanks! I'd greatly appreciate it if you could test #v13 too, as I posted it. It has already passed the GRU and KVM regression tests, and it should work fine for XPMEM too. You can ignore the purely cosmetic error I managed to introduce in mm_lock_cmp (I implemented a BUG_ON that would have triggered if it weren't a purely cosmetic issue, and it clearly doesn't trigger, so you can be sure it's only cosmetic ;). Once I get confirmation that everyone is ok with #v13 I'll push a #v14 before Saturday with that cosmetic error cleaned up and mmu_notifier_unregister moved to the end (XPMEM will have unregister, don't worry). I expect 1/13 of #v14 to go into -mm and then 2.6.26.

> I have a bug in xpmem which shows up on our 8x oversubscription tests,
> but that is clearly my bug to figure out. Unfortunately it only shows

This is what I meant. By contrast, we don't have any known bug left in this area; in fact we need mmu notifiers to _fix_ issues I identified that can't be fixed efficiently without them, and we need the mmu notifiers in production ASAP.

> up on a 128-processor machine, so I have 1024 stack traces to sort
> through each time it fails. That does take a bit of time and a lot of
> concentration.

Sure, hope you find it soon!

> SGI is under an equally strict timeline. We really needed the sleeping
> version in 2.6.26. We may still be able to get this accepted by
> vendor distros if we make 2.6.27.

I don't think vendor distros are any less likely to take patches 2-12 if 1/N (aka mmu-notifier-core) is merged in 2.6.26, especially in light of kABI.
From: Robin H. <ho...@sg...> - 2008-04-23 19:55:00
On Wed, Apr 23, 2008 at 06:15:45PM +0200, Andrea Arcangeli wrote:
> Once I get confirmation that everyone is ok with #v13 I'll push a #v14
> before Saturday with that cosmetic error cleaned up and
> mmu_notifier_unregister moved to the end (XPMEM will have unregister,
> don't worry). I expect 1/13 of #v14 to go into -mm and then 2.6.26.

I think GRU needs _unregister as well.

Thanks, Robin
From: Christoph L. <cla...@sg...> - 2008-04-23 18:02:28
On Wed, 23 Apr 2008, Andrea Arcangeli wrote:
> I know you would rather see KVM development stalled for more months than
> get a partial solution now that already covers KVM and GRU with the same
> API that XPMEM will also use later. It's very unfair on your side to
> expect other people's development to be stalled because what you need
> has stronger requirements and can't be merged immediately. This is
> especially true given it was publicly stated that XPMEM never passed
> all regression tests anyway, so you can't possibly be in as much of a
> hurry as we are: we can't progress without this. In fact we could, but
> it would be a huge effort, it would run _slower_, and it would all need
> to be deleted once mmu notifiers are in.

We did this workaround effort years ago and have been suffering the ill effects of pinning ever since. We have had to deal with it again and again, so I guess we do not matter? Certainly we have no interest in stalling KVM development.
From: Andrea A. <an...@qu...> - 2008-04-23 18:17:11
On Wed, Apr 23, 2008 at 11:02:18AM -0700, Christoph Lameter wrote:
> We did this workaround effort years ago and have been suffering the ill
> effects of pinning ever since. We have had to deal with

Yes. In addition to the pinning, there's a lot of additional TLB flushing work to do in KVM without mmu notifiers, because the swapcache page could be freed by the VM the instant after put_page unpins it, for whatever reason.
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:36
# HG changeset patch
# User Andrea Arcangeli <an...@qu...>
# Date 1208872187 -7200
# Node ID f8210c45f1c6f8b38d15e5dfebbc5f7c1f890c93
# Parent bdb3d928a0ba91cdce2b61bd40a2f80bddbe4ff2

Convert mm_lock to use semaphores after i_mmap_lock and anon_vma_lock
conversion.

Signed-off-by: Andrea Arcangeli <an...@qu...>

diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1062,10 +1062,10 @@
  * mm_lock and mm_unlock are expensive operations that may take a long time.
  */
 struct mm_lock_data {
-        spinlock_t **i_mmap_locks;
-        spinlock_t **anon_vma_locks;
-        size_t nr_i_mmap_locks;
-        size_t nr_anon_vma_locks;
+        struct rw_semaphore **i_mmap_sems;
+        struct rw_semaphore **anon_vma_sems;
+        size_t nr_i_mmap_sems;
+        size_t nr_anon_vma_sems;
 };
 extern int mm_lock(struct mm_struct *mm, struct mm_lock_data *data);
 extern void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2243,8 +2243,8 @@
 static int mm_lock_cmp(const void *a, const void *b)
 {
         cond_resched();
-        if ((unsigned long)*(spinlock_t **)a <
-            (unsigned long)*(spinlock_t **)b)
+        if ((unsigned long)*(struct rw_semaphore **)a <
+            (unsigned long)*(struct rw_semaphore **)b)
                 return -1;
         else if (a == b)
                 return 0;
@@ -2252,7 +2252,7 @@
                 return 1;
 }
 
-static unsigned long mm_lock_sort(struct mm_struct *mm, spinlock_t **locks,
+static unsigned long mm_lock_sort(struct mm_struct *mm, struct rw_semaphore **sems,
                                   int anon)
 {
         struct vm_area_struct *vma;
@@ -2261,59 +2261,59 @@
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 if (anon) {
                         if (vma->anon_vma)
-                                locks[i++] = &vma->anon_vma->lock;
+                                sems[i++] = &vma->anon_vma->sem;
                 } else {
                         if (vma->vm_file && vma->vm_file->f_mapping)
-                                locks[i++] = &vma->vm_file->f_mapping->i_mmap_lock;
+                                sems[i++] = &vma->vm_file->f_mapping->i_mmap_sem;
                 }
         }
 
         if (!i)
                 goto out;
 
-        sort(locks, i, sizeof(spinlock_t *), mm_lock_cmp, NULL);
+        sort(sems, i, sizeof(struct rw_semaphore *), mm_lock_cmp, NULL);
 
 out:
         return i;
 }
 
 static inline unsigned long mm_lock_sort_anon_vma(struct mm_struct *mm,
-                                                  spinlock_t **locks)
+                                                  struct rw_semaphore **sems)
 {
-        return mm_lock_sort(mm, locks, 1);
+        return mm_lock_sort(mm, sems, 1);
 }
 
 static inline unsigned long mm_lock_sort_i_mmap(struct mm_struct *mm,
-                                                spinlock_t **locks)
+                                                struct rw_semaphore **sems)
 {
-        return mm_lock_sort(mm, locks, 0);
+        return mm_lock_sort(mm, sems, 0);
 }
 
-static void mm_lock_unlock(spinlock_t **locks, size_t nr, int lock)
+static void mm_lock_unlock(struct rw_semaphore **sems, size_t nr, int lock)
 {
-        spinlock_t *last = NULL;
+        struct rw_semaphore *last = NULL;
         size_t i;
 
         for (i = 0; i < nr; i++)
                 /* Multiple vmas may use the same lock. */
-                if (locks[i] != last) {
-                        BUG_ON((unsigned long) last > (unsigned long) locks[i]);
-                        last = locks[i];
+                if (sems[i] != last) {
+                        BUG_ON((unsigned long) last > (unsigned long) sems[i]);
+                        last = sems[i];
                         if (lock)
-                                spin_lock(last);
+                                down_write(last);
                         else
-                                spin_unlock(last);
+                                up_write(last);
                 }
 }
 
-static inline void __mm_lock(spinlock_t **locks, size_t nr)
+static inline void __mm_lock(struct rw_semaphore **sems, size_t nr)
 {
-        mm_lock_unlock(locks, nr, 1);
+        mm_lock_unlock(sems, nr, 1);
 }
 
-static inline void __mm_unlock(spinlock_t **locks, size_t nr)
+static inline void __mm_unlock(struct rw_semaphore **sems, size_t nr)
 {
-        mm_lock_unlock(locks, nr, 0);
+        mm_lock_unlock(sems, nr, 0);
 }
 
 /*
@@ -2325,57 +2325,57 @@
  */
 int mm_lock(struct mm_struct *mm, struct mm_lock_data *data)
 {
-        spinlock_t **anon_vma_locks, **i_mmap_locks;
+        struct rw_semaphore **anon_vma_sems, **i_mmap_sems;
 
         down_write(&mm->mmap_sem);
         if (mm->map_count) {
-                anon_vma_locks = vmalloc(sizeof(spinlock_t *) * mm->map_count);
-                if (unlikely(!anon_vma_locks)) {
+                anon_vma_sems = vmalloc(sizeof(struct rw_semaphore *) * mm->map_count);
+                if (unlikely(!anon_vma_sems)) {
                         up_write(&mm->mmap_sem);
                         return -ENOMEM;
                 }
 
-                i_mmap_locks = vmalloc(sizeof(spinlock_t *) * mm->map_count);
-                if (unlikely(!i_mmap_locks)) {
+                i_mmap_sems = vmalloc(sizeof(struct rw_semaphore *) * mm->map_count);
+                if (unlikely(!i_mmap_sems)) {
                         up_write(&mm->mmap_sem);
-                        vfree(anon_vma_locks);
+                        vfree(anon_vma_sems);
                         return -ENOMEM;
                 }
 
-                data->nr_anon_vma_locks = mm_lock_sort_anon_vma(mm, anon_vma_locks);
-                data->nr_i_mmap_locks = mm_lock_sort_i_mmap(mm, i_mmap_locks);
+                data->nr_anon_vma_sems = mm_lock_sort_anon_vma(mm, anon_vma_sems);
+                data->nr_i_mmap_sems = mm_lock_sort_i_mmap(mm, i_mmap_sems);
 
-                if (data->nr_anon_vma_locks) {
-                        __mm_lock(anon_vma_locks, data->nr_anon_vma_locks);
-                        data->anon_vma_locks = anon_vma_locks;
+                if (data->nr_anon_vma_sems) {
+                        __mm_lock(anon_vma_sems, data->nr_anon_vma_sems);
+                        data->anon_vma_sems = anon_vma_sems;
                 } else
-                        vfree(anon_vma_locks);
+                        vfree(anon_vma_sems);
 
-                if (data->nr_i_mmap_locks) {
-                        __mm_lock(i_mmap_locks, data->nr_i_mmap_locks);
-                        data->i_mmap_locks = i_mmap_locks;
+                if (data->nr_i_mmap_sems) {
+                        __mm_lock(i_mmap_sems, data->nr_i_mmap_sems);
+                        data->i_mmap_sems = i_mmap_sems;
                 } else
-                        vfree(i_mmap_locks);
+                        vfree(i_mmap_sems);
         }
         return 0;
 }
 
-static void mm_unlock_vfree(spinlock_t **locks, size_t nr)
+static void mm_unlock_vfree(struct rw_semaphore **sems, size_t nr)
 {
-        __mm_unlock(locks, nr);
-        vfree(locks);
+        __mm_unlock(sems, nr);
+        vfree(sems);
 }
 
 /* avoid memory allocations for mm_unlock to prevent deadlock */
 void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data)
 {
         if (mm->map_count) {
-                if (data->nr_anon_vma_locks)
-                        mm_unlock_vfree(data->anon_vma_locks,
-                                        data->nr_anon_vma_locks);
-                if (data->i_mmap_locks)
-                        mm_unlock_vfree(data->i_mmap_locks,
-                                        data->nr_i_mmap_locks);
+                if (data->nr_anon_vma_sems)
+                        mm_unlock_vfree(data->anon_vma_sems,
+                                        data->nr_anon_vma_sems);
+                if (data->i_mmap_sems)
+                        mm_unlock_vfree(data->i_mmap_sems,
+                                        data->nr_i_mmap_sems);
         }
         up_write(&mm->mmap_sem);
}
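Independently of whether spinlocks or rw_semaphores are used, mm_lock() relies on the classic deadlock-avoidance scheme of taking many locks in one global order: collect the lock pointers, sort them by address, and acquire them in ascending order while skipping duplicates. A condensed sketch of that core loop (illustrative only; the struct mm_lock_data bookkeeping from the patch is left out):

/* Illustrative sketch: acquire an array of rw_semaphores in address
 * order, skipping duplicates, as mm_lock()/__mm_lock() above do. */
static void lock_sems_in_order(struct rw_semaphore **sems, size_t nr)
{
        struct rw_semaphore *last = NULL;
        size_t i;

        sort(sems, nr, sizeof(struct rw_semaphore *), mm_lock_cmp, NULL);

        for (i = 0; i < nr; i++) {
                if (sems[i] == last)
                        continue;       /* several vmas can share one lock */
                BUG_ON((unsigned long)last > (unsigned long)sems[i]);
                last = sems[i];
                down_write(sems[i]);    /* always lowest address first */
        }
}

Because every path that takes more than one of these locks goes through the same sorted order, no two mm_lock() callers can ever wait on each other in a cycle.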
From: Christoph L. <cla...@sg...> - 2008-04-22 20:26:07
Doing the right patch ordering would have avoided this patch and allowed better review.
From: Andrea A. <an...@qu...> - 2008-04-22 22:54:24
On Tue, Apr 22, 2008 at 01:26:13PM -0700, Christoph Lameter wrote:
> Doing the right patch ordering would have avoided this patch and allowed
> better review.

I didn't actually write this patch myself. This did it instead:

s/anon_vma_lock/anon_vma_sem/
s/i_mmap_lock/i_mmap_sem/
s/locks/sems/
s/spinlock_t/struct rw_semaphore/

so it didn't look like a big deal to redo it as many times as needed.

The right patch ordering isn't necessarily the one that reduces the total number of lines in the patchsets. The mmu-notifier-core is already converged and can go in. The rest isn't converged at all: nearly nobody has commented on the other part (and the few comments so far were negative), so there's no good reason to delay indefinitely what is already converged, given it's already feature-complete for certain users of the code. My patch ordering looks more natural to me: what is finished goes in, the rest is orthogonal anyway.
From: Christoph L. <cla...@sg...> - 2008-04-22 23:19:00
On Wed, 23 Apr 2008, Andrea Arcangeli wrote:
> The right patch ordering isn't necessarily the one that reduces the total
> number of lines in the patchsets. The mmu-notifier-core is already
> converged and can go in. The rest isn't converged at all: nearly nobody
> has commented on the other part (and the few comments so far were
> negative), so there's no good reason to delay indefinitely what is
> already converged, given it's already feature-complete for certain users
> of the code. My patch ordering looks more natural to me: what is
> finished goes in, the rest is orthogonal anyway.

I would not want to review code that is later reverted or essentially changed in later patches. I only review your patches because we have a high interest in this work. I suspect that others would be more willing to review this material if it were done the right way. If you cannot produce an easily reviewable and properly formatted patchset that follows conventions, then I will have to do it, because we really need to get this merged.
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:40
# HG changeset patch
# User Andrea Arcangeli <an...@qu...>
# Date 1208872186 -7200
# Node ID ee8c0644d5f67c1ef59142cce91b0bb6f34a53e0
# Parent ac9bb1fb3de2aa5d27210a28edf24f6577094076

Move the tlb flushing into free_pgtables. The conversion of the locks
taken for reverse map scanning would require taking sleeping locks in
free_pgtables() and we cannot sleep while gathering pages for a tlb
flush.

Move the tlb_gather/tlb_finish call to free_pgtables() to be done for
each vma. This may add a number of tlb flushes depending on the number
of vmas that cannot be coalesced into one.

The first pointer argument to free_pgtables() can then be dropped.

Signed-off-by: Christoph Lameter <cla...@sg...>

diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -751,8 +751,8 @@
                 void *private);
 void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
                 unsigned long end, unsigned long floor, unsigned long ceiling);
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
-                unsigned long floor, unsigned long ceiling);
+void free_pgtables(struct vm_area_struct *start_vma, unsigned long floor,
+                unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                         struct vm_area_struct *vma);
 void unmap_mapping_range(struct address_space *mapping,
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -272,9 +272,11 @@
         } while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
-                unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct vm_area_struct *vma, unsigned long floor,
+                unsigned long ceiling)
 {
+        struct mmu_gather *tlb;
+
         while (vma) {
                 struct vm_area_struct *next = vma->vm_next;
                 unsigned long addr = vma->vm_start;
@@ -286,7 +288,8 @@
                 unlink_file_vma(vma);
 
                 if (is_vm_hugetlb_page(vma)) {
-                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+                        tlb = tlb_gather_mmu(vma->vm_mm, 0);
+                        hugetlb_free_pgd_range(&tlb, addr, vma->vm_end,
                                 floor, next? next->vm_start: ceiling);
                 } else {
                         /*
@@ -299,9 +302,11 @@
                                 anon_vma_unlink(vma);
                                 unlink_file_vma(vma);
                         }
-                        free_pgd_range(tlb, addr, vma->vm_end,
+                        tlb = tlb_gather_mmu(vma->vm_mm, 0);
+                        free_pgd_range(&tlb, addr, vma->vm_end,
                                 floor, next? next->vm_start: ceiling);
                 }
+                tlb_finish_mmu(tlb, addr, vma->vm_end);
                 vma = next;
         }
 }
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1752,9 +1752,9 @@
         update_hiwater_rss(mm);
         unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
-        free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+        tlb_finish_mmu(tlb, start, end);
+        free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
                                  next? next->vm_start: 0);
-        tlb_finish_mmu(tlb, start, end);
 }
 
 /*
@@ -2050,8 +2050,8 @@
         /* Use -1 here to ensure all VMAs in the mm are unmapped */
         end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
-        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
         tlb_finish_mmu(tlb, 0, end);
+        free_pgtables(vma, FIRST_USER_ADDRESS, 0);
 
         /*
          * Walk the list again, actually closing and freeing it,
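The trade-off described in the changelog is purely about where the mmu_gather bracket sits. A simplified sketch of the per-VMA bracketing introduced above (the hugetlb special case and the coalescing of adjacent vmas present in the real function are omitted):

/* Sketch: each vma gets its own tlb_gather_mmu()/tlb_finish_mmu()
 * bracket, so nothing needs to sleep while a gather is in progress. */
void free_pgtables(struct vm_area_struct *vma, unsigned long floor,
                   unsigned long ceiling)
{
        while (vma) {
                struct vm_area_struct *next = vma->vm_next;
                struct mmu_gather *tlb;

                anon_vma_unlink(vma);   /* may sleep once these locks
                                         * become rw_semaphores */
                unlink_file_vma(vma);

                tlb = tlb_gather_mmu(vma->vm_mm, 0);
                free_pgd_range(&tlb, vma->vm_start, vma->vm_end, floor,
                               next ? next->vm_start : ceiling);
                tlb_finish_mmu(tlb, vma->vm_start, vma->vm_end);

                vma = next;
        }
}

The price is one TLB flush per non-coalescable vma instead of one for the whole range; the benefit is that the sleeping reverse-map locks are never held inside an active gather.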
From: Christoph L. <cla...@sg...> - 2008-04-22 20:25:04
Why are the subjects all screwed up? They are the first line of the description instead of the subject line of my patches.
[kvm-devel] [PATCH 12 of 12] This patch adds a lock ordering rule to avoid a potential deadlock when multiple mmap_sems need to be locked
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:40
# HG changeset patch
# User Andrea Arcangeli <an...@qu...>
# Date 1208872187 -7200
# Node ID e847039ee2e815088661933b7195584847dc7540
# Parent 128d705f38c8a774ac11559db445787ce6e91c77

This patch adds a lock ordering rule to avoid a potential deadlock when
multiple mmap_sems need to be locked.

Signed-off-by: Dean Nelson <dc...@sg...>

diff --git a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -79,6 +79,9 @@
  *
  *  ->i_mutex                   (generic_file_buffered_write)
  *    ->mmap_sem                (fault_in_pages_readable->do_page_fault)
+ *
+ *  When taking multiple mmap_sems, one should lock the lowest-addressed
+ *  one first proceeding on up to the highest-addressed one.
  *
  *  ->i_mutex
  *    ->i_alloc_sem             (various)
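Applied literally, the documented rule looks like the sketch below: when two address spaces must be locked at once, take the lower-addressed mmap_sem first. The helper is hypothetical (XPMEM's actual call sites are not part of this thread):

/* Hypothetical helper illustrating the comment added above: always
 * lock the lowest-addressed mmap_sem first to avoid an AB-BA deadlock. */
static void lock_two_mms(struct mm_struct *mm1, struct mm_struct *mm2)
{
        struct mm_struct *lo, *hi;

        if (mm1 == mm2) {
                down_write(&mm1->mmap_sem);
                return;
        }

        lo = (unsigned long)mm1 < (unsigned long)mm2 ? mm1 : mm2;
        hi = (unsigned long)mm1 < (unsigned long)mm2 ? mm2 : mm1;

        down_write(&lo->mmap_sem);      /* lowest address first... */
        down_write(&hi->mmap_sem);      /* ...then proceed upwards; a real
                                         * implementation would also need a
                                         * lockdep nesting annotation here */
}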
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:40
# HG changeset patch
# User Andrea Arcangeli <an...@qu...>
# Date 1208872187 -7200
# Node ID 128d705f38c8a774ac11559db445787ce6e91c77
# Parent f8210c45f1c6f8b38d15e5dfebbc5f7c1f890c93

XPMEM would have used sys_madvise() except that madvise_dontneed()
returns an -EINVAL if VM_PFNMAP is set, which is always true for the
pages XPMEM imports from other partitions and is also true for uncached
pages allocated locally via the mspec allocator. XPMEM needs
zap_page_range() functionality for these types of pages as well as
'normal' pages.

Signed-off-by: Dean Nelson <dc...@sg...>

diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -909,6 +909,7 @@
 
         return unmap_vmas(vma, address, end, &nr_accounted, details);
 }
+EXPORT_SYMBOL_GPL(zap_page_range);
 
 /*
  * Do a quick page-table lookup for a single page.
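With the export in place, a module such as XPMEM can tear down the PTEs of a VM_PFNMAP region directly, which madvise(MADV_DONTNEED) refuses to do. A hedged sketch of such a call site (the surrounding driver function is invented; only zap_page_range() and its signature come from the kernel code above):

/* Hypothetical driver code: drop every PTE in [addr, addr + size) of a
 * PFN-mapped vma whose pages were exported to another partition. */
static void example_zap_imported_range(struct vm_area_struct *vma,
                                       unsigned long addr, unsigned long size)
{
        BUG_ON(addr < vma->vm_start || addr + size > vma->vm_end);

        /* Unlike madvise_dontneed(), this works even with VM_PFNMAP set. */
        zap_page_range(vma, addr, size, NULL);
}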
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:40
# HG changeset patch
# User Andrea Arcangeli <an...@qu...>
# Date 1208872187 -7200
# Node ID fbce3fecb033eb3fba1d9c2398ac74401ce0ecb5
# Parent ee8c0644d5f67c1ef59142cce91b0bb6f34a53e0

Move the tlb flushing inside of unmap vmas. This saves us from passing
a pointer to the TLB structure around and simplifies the callers.

Signed-off-by: Christoph Lameter <cla...@sg...>

diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -723,8 +723,7 @@
 struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                 unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb,
-                struct vm_area_struct *start_vma, unsigned long start_addr,
+unsigned long unmap_vmas(struct vm_area_struct *start_vma, unsigned long start_addr,
                 unsigned long end_addr, unsigned long *nr_accounted,
                 struct zap_details *);
 
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -804,7 +804,6 @@
 
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
@@ -816,20 +815,13 @@
  * Unmap all pages in the vma list.
  *
  * We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
- * return the ending mmu_gather to the caller.
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts.
  *
  * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
- *
- * unmap_vmas() assumes that the caller will flush the whole unmapped address
- * range after unmap_vmas() returns. So the only responsibility here is to
- * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
- * drops the lock and schedules.
  */
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
-                struct vm_area_struct *vma, unsigned long start_addr,
+unsigned long unmap_vmas(struct vm_area_struct *vma, unsigned long start_addr,
                 unsigned long end_addr, unsigned long *nr_accounted,
                 struct zap_details *details)
 {
@@ -838,9 +830,14 @@
         int tlb_start_valid = 0;
         unsigned long start = start_addr;
         spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-        int fullmm = (*tlbp)->fullmm;
+        int fullmm;
+        struct mmu_gather *tlb;
         struct mm_struct *mm = vma->vm_mm;
 
+        lru_add_drain();
+        tlb = tlb_gather_mmu(mm, 0);
+        update_hiwater_rss(mm);
+        fullmm = tlb->fullmm;
         mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
         for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
                 unsigned long end;
@@ -867,7 +864,7 @@
                                                (HPAGE_SIZE / PAGE_SIZE);
                                 start = end;
                         } else
-                                start = unmap_page_range(*tlbp, vma,
+                                start = unmap_page_range(tlb, vma,
                                                 start, end, &zap_work, details);
 
                         if (zap_work > 0) {
@@ -875,22 +872,23 @@
                                 break;
                         }
 
-                        tlb_finish_mmu(*tlbp, tlb_start, start);
+                        tlb_finish_mmu(tlb, tlb_start, start);
 
                         if (need_resched() ||
                                 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
                                 if (i_mmap_lock) {
-                                        *tlbp = NULL;
+                                        tlb = NULL;
                                         goto out;
                                 }
                                 cond_resched();
                         }
 
-                        *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
+                        tlb = tlb_gather_mmu(vma->vm_mm, fullmm);
                         tlb_start_valid = 0;
                         zap_work = ZAP_BLOCK_SIZE;
                 }
         }
+        tlb_finish_mmu(tlb, start_addr, end_addr);
 out:
         mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
         return start;   /* which is now the end (or restart) address */
@@ -906,18 +904,10 @@
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                 unsigned long size, struct zap_details *details)
 {
-        struct mm_struct *mm = vma->vm_mm;
-        struct mmu_gather *tlb;
         unsigned long end = address + size;
         unsigned long nr_accounted = 0;
 
-        lru_add_drain();
-        tlb = tlb_gather_mmu(mm, 0);
-        update_hiwater_rss(mm);
-        end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-        if (tlb)
-                tlb_finish_mmu(tlb, address, end);
-        return end;
+        return unmap_vmas(vma, address, end, &nr_accounted, details);
 }
 
 /*
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1744,15 +1744,10 @@
                 unsigned long start, unsigned long end)
 {
         struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
-        struct mmu_gather *tlb;
         unsigned long nr_accounted = 0;
 
-        lru_add_drain();
-        tlb = tlb_gather_mmu(mm, 0);
-        update_hiwater_rss(mm);
-        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+        unmap_vmas(vma, start, end, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
-        tlb_finish_mmu(tlb, start, end);
         free_pgtables(vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
                                  next? next->vm_start: 0);
 }
@@ -2034,7 +2029,6 @@
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
-        struct mmu_gather *tlb;
         struct vm_area_struct *vma = mm->mmap;
         unsigned long nr_accounted = 0;
         unsigned long end;
@@ -2045,12 +2039,11 @@
 
         lru_add_drain();
         flush_cache_mm(mm);
-        tlb = tlb_gather_mmu(mm, 1);
+
         /* Don't update_hiwater_rss(mm) here, do_exit already did */
         /* Use -1 here to ensure all VMAs in the mm are unmapped */
-        end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+        end = unmap_vmas(vma, 0, -1, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
-        tlb_finish_mmu(tlb, 0, end);
         free_pgtables(vma, FIRST_USER_ADDRESS, 0);
 
         /*
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:41
# HG changeset patch
# User Andrea Arcangeli <an...@qu...>
# Date 1208872187 -7200
# Node ID 8965539f4d174c79bd37e58e8b037d5db906e219
# Parent fbce3fecb033eb3fba1d9c2398ac74401ce0ecb5

Add a function to rw_semaphores to check if there are any processes
waiting for the semaphore. Add rwsem_needbreak to sched.h that works
in the same way as spinlock_needbreak().

Signed-off-by: Christoph Lameter <cla...@sg...>

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -59,6 +59,8 @@
  */
 extern void downgrade_write(struct rw_semaphore *sem);
 
+extern int rwsem_is_contended(struct rw_semaphore *sem);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
  * nested locking. NOTE: rwsems are not allowed to recurse
diff --git a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1984,6 +1984,15 @@
 #endif
 }
 
+static inline int rwsem_needbreak(struct rw_semaphore *sem)
+{
+#ifdef CONFIG_PREEMPT
+        return rwsem_is_contended(sem);
+#else
+        return 0;
+#endif
+}
+
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
--- a/lib/rwsem-spinlock.c
+++ b/lib/rwsem-spinlock.c
@@ -305,6 +305,18 @@
         spin_unlock_irqrestore(&sem->wait_lock, flags);
 }
 
+int rwsem_is_contended(struct rw_semaphore *sem)
+{
+        /*
+         * Racy check for an empty list. False positives or negatives
+         * would be okay. False positive may cause a useless dropping of
+         * locks. False negatives may cause locks to be held a bit
+         * longer until the next check.
+         */
+        return !list_empty(&sem->wait_list);
+}
+
+EXPORT_SYMBOL(rwsem_is_contended);
 EXPORT_SYMBOL(__init_rwsem);
 EXPORT_SYMBOL(__down_read);
 EXPORT_SYMBOL(__down_read_trylock);
diff --git a/lib/rwsem.c b/lib/rwsem.c
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -251,6 +251,18 @@
         return sem;
 }
 
+int rwsem_is_contended(struct rw_semaphore *sem)
+{
+        /*
+         * Racy check for an empty list. False positives or negatives
+         * would be okay. False positive may cause a useless dropping of
+         * locks. False negatives may cause locks to be held a bit
+         * longer until the next check.
+         */
+        return !list_empty(&sem->wait_list);
+}
+
+EXPORT_SYMBOL(rwsem_is_contended);
 EXPORT_SYMBOL(rwsem_down_read_failed);
 EXPORT_SYMBOL(rwsem_down_write_failed);
 EXPORT_SYMBOL(rwsem_wake);
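rwsem_needbreak() is meant to be used the way spin_needbreak() already is: a long-running scan that holds the semaphore polls it and voluntarily drops the lock so queued waiters can get in. A small illustrative consumer follows (the loop body and names are not from the patchset):

/* Illustrative consumer of rwsem_needbreak(): a long scan under a
 * read-held rwsem that yields whenever someone else is queued. */
static void scan_under_rwsem(struct rw_semaphore *sem, unsigned long nr_items)
{
        unsigned long i;

        down_read(sem);
        for (i = 0; i < nr_items; i++) {
                /* ... process item i under the semaphore ... */

                if (rwsem_needbreak(sem) || need_resched()) {
                        up_read(sem);   /* let the queued waiter in */
                        cond_resched();
                        down_read(sem);
                }
        }
        up_read(sem);
}

As the in-patch comment says, the emptiness check is racy by design: a false positive only costs an extra lock drop, a false negative only delays the break until the next iteration.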
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:41
# HG changeset patch
# User Andrea Arcangeli <an...@qu...>
# Date 1208872186 -7200
# Node ID a6672bdeead0d41b2ebd6846f731d43a611645b7
# Parent 3c804dca25b15017b22008647783d6f5f3801fa9

get_task_mm should not succeed if mmput() is running and has reduced
the mm_users count to zero. This can occur if a processor follows a
task's pointer to an mm struct because that pointer is only cleared
after the mmput().

If get_task_mm() succeeds after mmput() reduced the mm_users to zero
then we have the lovely situation that one portion of the kernel is
doing all the teardown work for an mm while another portion is happily
using it.

Signed-off-by: Christoph Lameter <cla...@sg...>

diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -442,7 +442,8 @@
                 if (task->flags & PF_BORROWED_MM)
                         mm = NULL;
                 else
-                        atomic_inc(&mm->mm_users);
+                        if (!atomic_inc_not_zero(&mm->mm_users))
+                                mm = NULL;
         }
         task_unlock(task);
         return mm;
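The race being closed is the classic one of taking a reference after the last reference is gone: once mmput() has brought mm_users to zero, a concurrent get_task_mm() must fail rather than resurrect the count. The same pattern in a generic form (illustrative refcount example, not mm-specific code; kfree() stands in for whatever teardown the object needs):

/* Generic sketch of the pattern used in the patch: a lookup may only
 * take a reference if the count has not already reached zero. */
struct obj {
        atomic_t refs;          /* analogous to mm->mm_users */
};

static struct obj *obj_tryget(struct obj *obj)
{
        /* atomic_inc_not_zero() fails once the final put has run, so a
         * racing lookup cannot revive an object being torn down. */
        if (obj && !atomic_inc_not_zero(&obj->refs))
                obj = NULL;
        return obj;
}

static void obj_put(struct obj *obj)
{
        if (atomic_dec_and_test(&obj->refs))
                kfree(obj);     /* teardown runs exactly once */
}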
From: Christoph L. <cla...@sg...> - 2008-04-22 20:23:15
Missing signoff by you.
From: Andrea A. <an...@qu...> - 2008-04-22 22:37:35
On Tue, Apr 22, 2008 at 01:23:16PM -0700, Christoph Lameter wrote:
> Missing signoff by you.

I thought I only had to sign off if I contributed something that could resemble copyright? Given I only merged that patch, I can add an Acked-by if you like, but merging this into my patchset was already an implicit ack ;-).
From: Christoph L. <cla...@sg...> - 2008-04-22 23:13:16
On Wed, 23 Apr 2008, Andrea Arcangeli wrote:
> On Tue, Apr 22, 2008 at 01:23:16PM -0700, Christoph Lameter wrote:
> > Missing signoff by you.
>
> I thought I only had to sign off if I contributed something that could
> resemble copyright? Given I only merged that patch, I can add an Acked-by
> if you like, but merging this into my patchset was already an implicit
> ack ;-).

No, you have to include a signoff if the patch goes through your custody chain. This one did. Also add a

From: Christoph Lameter <cla...@sg...>

somewhere if you want to signify that the patch came from me.