[kvm-devel] [PATCH 09 of 12] Convert the anon_vma spinlock to a rw semaphore. This allows concurrent traversal of reverse maps.
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:41
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1208872187 -7200 # Node ID bdb3d928a0ba91cdce2b61bd40a2f80bddbe4ff2 # Parent 6e04df1f4284689b1c46e57a67559abe49ecf292 Convert the anon_vma spinlock to a rw semaphore. This allows concurrent traversal of reverse maps for try_to_unmap() and page_mkclean(). It also allows the calling of sleeping functions from reverse map traversal as needed for the notifier callbacks. It includes possible concurrency. Rcu is used in some context to guarantee the presence of the anon_vma (try_to_unmap) while we acquire the anon_vma lock. We cannot take a semaphore within an rcu critical section. Add a refcount to the anon_vma structure which allow us to give an existence guarantee for the anon_vma structure independent of the spinlock or the list contents. The refcount can then be taken within the RCU section. If it has been taken successfully then the refcount guarantees the existence of the anon_vma. The refcount in anon_vma also allows us to fix a nasty issue in page migration where we fudged by using rcu for a long code path to guarantee the existence of the anon_vma. I think this is a bug because the anon_vma may become empty and get scheduled to be freed but then we increase the refcount again when the migration entries are removed. The refcount in general allows a shortening of RCU critical sections since we can do a rcu_unlock after taking the refcount. This is particularly relevant if the anon_vma chains contain hundreds of entries. However: - Atomic overhead increases in situations where a new reference to the anon_vma has to be established or removed. Overhead also increases when a speculative reference is used (try_to_unmap, page_mkclean, page migration). - There is the potential for more frequent processor change due to up_xxx letting waiting tasks run first. This results in f.e. the Aim9 brk performance test to got down by 10-15%. Signed-off-by: Christoph Lameter <cla...@sg...> diff --git a/include/linux/rmap.h b/include/linux/rmap.h --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -25,7 +25,8 @@ * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { - spinlock_t lock; /* Serialize access to vma list */ + atomic_t refcount; /* vmas on the list */ + struct rw_semaphore sem;/* Serialize access to vma list */ struct list_head head; /* List of private "related" vmas */ }; @@ -43,18 +44,31 @@ kmem_cache_free(anon_vma_cachep, anon_vma); } +struct anon_vma *grab_anon_vma(struct page *page); + +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->refcount); +} + +static inline void put_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_test(&anon_vma->refcount)) + anon_vma_free(anon_vma); +} + static inline void anon_vma_lock(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); } static inline void anon_vma_unlock(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); } /* diff --git a/mm/migrate.c b/mm/migrate.c --- a/mm/migrate.c +++ b/mm/migrate.c @@ -235,15 +235,16 @@ return; /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + * We hold either the mmap_sem lock or a reference on the + * anon_vma. So no need to call page_lock_anon_vma. 
*/ anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); + down_read(&anon_vma->sem); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) remove_migration_pte(vma, old, new); - spin_unlock(&anon_vma->lock); + up_read(&anon_vma->sem); } /* @@ -623,7 +624,7 @@ int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); - int rcu_locked = 0; + struct anon_vma *anon_vma = NULL; int charge = 0; if (!newpage) @@ -647,16 +648,14 @@ } /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, - * we cannot notice that anon_vma is freed while we migrates a page. + * we cannot notice that anon_vma is freed while we migrate a page. * This rcu_read_lock() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ - if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - } + if (PageAnon(page)) + anon_vma = grab_anon_vma(page); /* * Corner case handling: @@ -674,10 +673,7 @@ if (!PageAnon(page) && PagePrivate(page)) { /* * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. + * that's what try_to_release_page() would do anyway */ try_to_free_buffers(page); } @@ -698,8 +694,8 @@ } else if (charge) mem_cgroup_end_migration(newpage); rcu_unlock: - if (rcu_locked) - rcu_read_unlock(); + if (anon_vma) + put_anon_vma(anon_vma); unlock: diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -567,7 +567,7 @@ if (vma->anon_vma) anon_vma = vma->anon_vma; if (anon_vma) { - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); /* * Easily overlooked: when mprotect shifts the boundary, * make sure the expanding vma has anon_vma set if the @@ -621,7 +621,7 @@ } if (anon_vma) - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); if (mapping) up_write(&mapping->i_mmap_sem); diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c @@ -69,7 +69,7 @@ if (anon_vma) { allocated = NULL; locked = anon_vma; - spin_lock(&locked->lock); + down_write(&locked->sem); } else { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) @@ -81,6 +81,7 @@ /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { + get_anon_vma(anon_vma); vma->anon_vma = anon_vma; list_add_tail(&vma->anon_vma_node, &anon_vma->head); allocated = NULL; @@ -88,7 +89,7 @@ spin_unlock(&mm->page_table_lock); if (locked) - spin_unlock(&locked->lock); + up_write(&locked->sem); if (unlikely(allocated)) anon_vma_free(allocated); } @@ -99,14 +100,17 @@ { BUG_ON(vma->anon_vma != next->anon_vma); list_del(&next->anon_vma_node); + put_anon_vma(vma->anon_vma); } void __anon_vma_link(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) + if (anon_vma) { + get_anon_vma(anon_vma); list_add_tail(&vma->anon_vma_node, &anon_vma->head); + } } void anon_vma_link(struct vm_area_struct *vma) @@ -114,36 +118,32 @@ struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) { - spin_lock(&anon_vma->lock); + get_anon_vma(anon_vma); + down_write(&anon_vma->sem); list_add_tail(&vma->anon_vma_node, &anon_vma->head); - spin_unlock(&anon_vma->lock); + up_write(&anon_vma->sem); } } void anon_vma_unlink(struct vm_area_struct *vma) { 
struct anon_vma *anon_vma = vma->anon_vma; - int empty; if (!anon_vma) return; - spin_lock(&anon_vma->lock); + down_write(&anon_vma->sem); list_del(&vma->anon_vma_node); - - /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - - if (empty) - anon_vma_free(anon_vma); + up_write(&anon_vma->sem); + put_anon_vma(anon_vma); } static void anon_vma_ctor(struct kmem_cache *cachep, void *data) { struct anon_vma *anon_vma = data; - spin_lock_init(&anon_vma->lock); + init_rwsem(&anon_vma->sem); + atomic_set(&anon_vma->refcount, 0); INIT_LIST_HEAD(&anon_vma->head); } @@ -157,9 +157,9 @@ * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *grab_anon_vma(struct page *page) { - struct anon_vma *anon_vma; + struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); @@ -170,17 +170,26 @@ goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - return anon_vma; + if (!atomic_inc_not_zero(&anon_vma->refcount)) + anon_vma = NULL; out: rcu_read_unlock(); - return NULL; + return anon_vma; +} + +static struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma = grab_anon_vma(page); + + if (anon_vma) + down_read(&anon_vma->sem); + return anon_vma; } static void page_unlock_anon_vma(struct anon_vma *anon_vma) { - spin_unlock(&anon_vma->lock); - rcu_read_unlock(); + up_read(&anon_vma->sem); + put_anon_vma(anon_vma); } /* |
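The core of the patch above is the existence guarantee: a speculative reference can be taken on an anon_vma while only RCU protects its memory, and the sleeping lock is taken afterwards. Below is a condensed sketch of that pattern, paraphrasing the patched rmap code (simplified, not a drop-in copy of the hunks above):

struct anon_vma {
	atomic_t refcount;		/* vmas on the list + transient holders */
	struct rw_semaphore sem;	/* serializes the vma list */
	struct list_head head;		/* list of private "related" vmas */
};

/*
 * Speculative lookup: page->mapping may go stale at any time, so the
 * anon_vma is only pinned if its refcount is still non-zero.  The RCU
 * read lock keeps the object from being freed between reading the
 * pointer and the atomic_inc_not_zero().
 */
struct anon_vma *grab_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)page->mapping;
	if ((anon_mapping & PAGE_MAPPING_ANON) && page_mapped(page)) {
		anon_vma = (struct anon_vma *)(anon_mapping - PAGE_MAPPING_ANON);
		if (!atomic_inc_not_zero(&anon_vma->refcount))
			anon_vma = NULL;	/* already on its way to being freed */
	}
	rcu_read_unlock();
	return anon_vma;
}

/* With a reference held, a sleeping lock may be taken outside RCU. */
static struct anon_vma *page_lock_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = grab_anon_vma(page);

	if (anon_vma)
		down_read(&anon_vma->sem);
	return anon_vma;
}

static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->sem);
	put_anon_vma(anon_vma);		/* may free the anon_vma */
}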
From: Andrea A. <an...@qu...> - 2008-04-22 14:10:42
# HG changeset patch # User Andrea Arcangeli <an...@qu...> # Date 1208872187 -7200 # Node ID 6e04df1f4284689b1c46e57a67559abe49ecf292 # Parent 8965539f4d174c79bd37e58e8b037d5db906e219 The conversion to a rwsem allows notifier callbacks during rmap traversal for files. A rw style lock also allows concurrent walking of the reverse map so that multiple processors can expire pages in the same memory area of the same process. So it increases the potential concurrency. Signed-off-by: Andrea Arcangeli <an...@qu...> Signed-off-by: Christoph Lameter <cla...@sg...> diff --git a/Documentation/vm/locking b/Documentation/vm/locking --- a/Documentation/vm/locking +++ b/Documentation/vm/locking @@ -66,7 +66,7 @@ expand_stack(), it is hard to come up with a destructive scenario without having the vmlist protection in this case. -The page_table_lock nests with the inode i_mmap_lock and the kmem cache +The page_table_lock nests with the inode i_mmap_sem and the kmem cache c_spinlock spinlocks. This is okay, since the kmem code asks for pages after dropping c_spinlock. The page_table_lock also nests with pagecache_lock and pagemap_lru_lock spinlocks, and no code asks for memory with these locks diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -69,7 +69,7 @@ if (!vma_shareable(vma, addr)) return; - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -94,7 +94,7 @@ put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); } /* diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -454,10 +454,10 @@ pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); truncate_hugepages(inode, offset); return 0; } diff --git a/fs/inode.c b/fs/inode.c --- a/fs/inode.c +++ b/fs/inode.c @@ -210,7 +210,7 @@ INIT_LIST_HEAD(&inode->i_devices); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); rwlock_init(&inode->i_data.tree_lock); - spin_lock_init(&inode->i_data.i_mmap_lock); + init_rwsem(&inode->i_data.i_mmap_sem); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); diff --git a/include/linux/fs.h b/include/linux/fs.h --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -503,7 +503,7 @@ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ - spinlock_t i_mmap_lock; /* protect tree, count, list */ + struct rw_semaphore i_mmap_sem; /* protect tree, count, list */ unsigned int truncate_count; /* Cover race condition with truncate */ unsigned long nrpages; /* number of total pages */ pgoff_t writeback_index;/* writeback starts here */ diff --git a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -716,7 +716,7 @@ struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - 
spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */ + struct rw_semaphore *i_mmap_sem; /* For unmap_mapping_range: */ unsigned long truncate_count; /* Compare vm_truncate_count */ }; diff --git a/kernel/fork.c b/kernel/fork.c --- a/kernel/fork.c +++ b/kernel/fork.c @@ -274,12 +274,12 @@ atomic_dec(&inode->i_writecount); /* insert tmp into the share list, just after mpnt */ - spin_lock(&file->f_mapping->i_mmap_lock); + down_write(&file->f_mapping->i_mmap_sem); tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(file->f_mapping); vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(file->f_mapping); - spin_unlock(&file->f_mapping->i_mmap_lock); + up_write(&file->f_mapping->i_mmap_sem); } /* diff --git a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c +++ b/mm/filemap.c @@ -61,16 +61,16 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_sem (vmtruncate) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex - * ->i_mmap_lock (truncate->unmap_mapping_range) + * ->i_mmap_sem (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_mmap_lock + * ->i_mmap_sem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * @@ -87,7 +87,7 @@ * ->sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->i_mmap_lock + * ->i_mmap_sem * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -184,7 +184,7 @@ if (!page) return; - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + @@ -204,7 +204,7 @@ page_cache_release(page); } } - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); } /* diff --git a/mm/fremap.c b/mm/fremap.c --- a/mm/fremap.c +++ b/mm/fremap.c @@ -206,13 +206,13 @@ } goto out; } - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; vma_prio_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); } mmu_notifier_invalidate_range_start(mm, start, start + size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -790,7 +790,7 @@ struct page *page; struct page *tmp; /* - * A page gathering list, protected by per file i_mmap_lock. The + * A page gathering list, protected by per file i_mmap_sem. The * lock is used to avoid list corruption from multiple unmapping * of the same page since we are using page->lru. */ @@ -840,9 +840,9 @@ * do nothing in this case. 
*/ if (vma->vm_file) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + down_write(&vma->vm_file->f_mapping->i_mmap_sem); __unmap_hugepage_range(vma, start, end); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + up_write(&vma->vm_file->f_mapping->i_mmap_sem); } } @@ -1085,7 +1085,7 @@ BUG_ON(address >= end); flush_cache_range(vma, address, end); - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + down_write(&vma->vm_file->f_mapping->i_mmap_sem); spin_lock(&mm->page_table_lock); for (; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); @@ -1100,7 +1100,7 @@ } } spin_unlock(&mm->page_table_lock); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + up_write(&vma->vm_file->f_mapping->i_mmap_sem); flush_tlb_range(vma, start, end); } diff --git a/mm/memory.c b/mm/memory.c --- a/mm/memory.c +++ b/mm/memory.c @@ -829,7 +829,7 @@ unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; - spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; + struct rw_semaphore *i_mmap_sem = details? details->i_mmap_sem: NULL; int fullmm; struct mmu_gather *tlb; struct mm_struct *mm = vma->vm_mm; @@ -875,8 +875,8 @@ tlb_finish_mmu(tlb, tlb_start, start); if (need_resched() || - (i_mmap_lock && spin_needbreak(i_mmap_lock))) { - if (i_mmap_lock) { + (i_mmap_sem && rwsem_needbreak(i_mmap_sem))) { + if (i_mmap_sem) { tlb = NULL; goto out; } @@ -1742,7 +1742,7 @@ /* * Helper functions for unmap_mapping_range(). * - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ + * __ Notes on dropping i_mmap_sem to reduce latency while unmapping __ * * We have to restart searching the prio_tree whenever we drop the lock, * since the iterator is only valid while the lock is held, and anyway @@ -1761,7 +1761,7 @@ * can't efficiently keep all vmas in step with mapping->truncate_count: * so instead reset them all whenever it wraps back to 0 (then go to 1). * mapping->truncate_count and vma->vm_truncate_count are protected by - * i_mmap_lock. + * i_mmap_sem. 
* * In order to make forward progress despite repeatedly restarting some * large vma, note the restart_addr from unmap_vmas when it breaks out: @@ -1811,7 +1811,7 @@ restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - need_break = need_resched() || spin_needbreak(details->i_mmap_lock); + need_break = need_resched() || rwsem_needbreak(details->i_mmap_sem); if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ @@ -1825,9 +1825,9 @@ goto again; } - spin_unlock(details->i_mmap_lock); + up_write(details->i_mmap_sem); cond_resched(); - spin_lock(details->i_mmap_lock); + down_write(details->i_mmap_sem); return -EINTR; } @@ -1921,9 +1921,9 @@ details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - details.i_mmap_lock = &mapping->i_mmap_lock; + details.i_mmap_sem = &mapping->i_mmap_sem; - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); /* Protect against endless unmapping loops */ mapping->truncate_count++; @@ -1938,7 +1938,7 @@ unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); } EXPORT_SYMBOL(unmap_mapping_range); diff --git a/mm/migrate.c b/mm/migrate.c --- a/mm/migrate.c +++ b/mm/migrate.c @@ -211,12 +211,12 @@ if (!mapping) return; - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) remove_migration_pte(vma, old, new); - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); } /* diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c @@ -189,7 +189,7 @@ } /* - * Requires inode->i_mapping->i_mmap_lock + * Requires inode->i_mapping->i_mmap_sem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) @@ -217,9 +217,9 @@ if (file) { struct address_space *mapping = file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); __remove_shared_vm_struct(vma, file, mapping); - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); } } @@ -442,7 +442,7 @@ mapping = vma->vm_file->f_mapping; if (mapping) { - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); vma->vm_truncate_count = mapping->truncate_count; } anon_vma_lock(vma); @@ -452,7 +452,7 @@ anon_vma_unlock(vma); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); mm->map_count++; validate_mm(mm); @@ -539,7 +539,7 @@ mapping = file->f_mapping; if (!(vma->vm_flags & VM_NONLINEAR)) root = &mapping->i_mmap; - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); if (importer && vma->vm_truncate_count != next->vm_truncate_count) { /* @@ -623,7 +623,7 @@ if (anon_vma) spin_unlock(&anon_vma->lock); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); if (remove_next) { if (file) @@ -2058,7 +2058,7 @@ /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_lock is taken here. + * then i_mmap_sem is taken here. */ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) { diff --git a/mm/mremap.c b/mm/mremap.c --- a/mm/mremap.c +++ b/mm/mremap.c @@ -88,7 +88,7 @@ * and we propagate stale pages into the dst afterward. 
*/ mapping = vma->vm_file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + down_write(&mapping->i_mmap_sem); if (new_vma->vm_truncate_count && new_vma->vm_truncate_count != vma->vm_truncate_count) new_vma->vm_truncate_count = 0; @@ -120,7 +120,7 @@ pte_unmap_nested(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_lock + * mapping->i_mmap_sem * anon_vma->lock * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) @@ -373,14 +373,14 @@ * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_lock. + * so we can safely take mapping->i_mmap_sem. */ BUG_ON(!PageLocked(page)); - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); /* - * i_mmap_lock does not stabilize mapcount at all, but mapcount + * i_mmap_sem does not stabilize mapcount at all, but mapcount * is more likely to be accurate if we note it after spinning. */ mapcount = page_mapcount(page); @@ -403,7 +403,7 @@ break; } - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); return referenced; } @@ -489,12 +489,12 @@ BUG_ON(PageAnon(page)); - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) ret += page_mkclean_one(page, vma); } - spin_unlock(&mapping->i_mmap_lock); + up_read(&mapping->i_mmap_sem); return ret; } @@ -930,7 +930,7 @@ unsigned long max_nl_size = 0; unsigned int mapcount; - spin_lock(&mapping->i_mmap_lock); + down_read(&mapping->i_mmap_sem); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { ret = try_to_unmap_one(page, vma, migration); if (ret == SWAP_FAIL || !page_mapped(page)) @@ -967,7 +967,6 @@ mapcount = page_mapcount(page); if (!mapcount) goto out; - cond_resched_lock(&mapping->i_mmap_lock); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; if (max_nl_cursor == 0) @@ -989,7 +988,6 @@ } vma->vm_private_data = (void *) max_nl_cursor; } - cond_resched_lock(&mapping->i_mmap_lock); max_nl_cursor += CLUSTER_SIZE; } while (max_nl_cursor <= max_nl_size); @@ -1001,7 +999,7 @@ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) vma->vm_private_data = NULL; out: - spin_unlock(&mapping->i_mmap_lock); + up_write(&mapping->i_mmap_sem); return ret; } |
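The payoff of both conversions is the same read-mostly pattern: rmap walks over a file's vmas now take the per-mapping lock shared, so multiple walkers can run concurrently and each of them may sleep (as the notifier callbacks require). A simplified sketch of a read-side walk after this patch, modeled on the page_mkclean_file()/page_referenced_file() hunks above; the per-vma helper is a hypothetical placeholder:

static int walk_file_rmap(struct page *page, struct address_space *mapping)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	int ret = 0;

	/* Shared acquisition: other rmap walkers may run concurrently. */
	down_read(&mapping->i_mmap_sem);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/*
		 * Work done here is now allowed to sleep (e.g. an mmu
		 * notifier callback), which the old spinlock forbade.
		 */
		ret += process_one_vma(page, vma);	/* hypothetical helper */
	}
	up_read(&mapping->i_mmap_sem);

	return ret;
}

Writers that modify the prio tree itself (fork, mmap, mremap, truncate) take the semaphore with down_write(), as the hunks above show.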
From: Robin H. <ho...@sg...> - 2008-04-22 18:22:12
I believe the differences between your patch set and Christoph's need to be understood and a compromise approach agreed upon. Those differences, as I understand them, are:

1) invalidate_page: You retain an invalidate_page() callout. I believe we have progressed that discussion to the point that it requires some direction from Andrew, Linus, or somebody in authority. The basics of the difference distill down to no expected significant performance difference between the two. The invalidate_page() callout potentially can simplify GRU code. It does provide a more complex API for the users of mmu_notifier, which, IIRC, Christoph had interpreted from one of Andrew's earlier comments as being undesirable. I vaguely recall that sentiment as having been expressed.

2) Range callout names: Your range callouts are invalidate_range_start and invalidate_range_end, whereas Christoph's are start and end. I do not believe this has been discussed in great detail. I know I have expressed a preference for your names. I admit to having failed to follow up on this issue. I certainly believe we could come to an agreement quickly if pressed.

3) The structure of the patch set: Christoph's upcoming release orders the patches so the prerequisite patches are separately reviewable and each file is only touched by a single patch. Additionally, that allows mmu_notifiers to be introduced as a single patch with sleeping functionality from its inception and an API which remains unchanged. Your patch set, however, introduces one API, then turns around and changes that API. Again, the desire to make it an unchanging API was expressed by, IIRC, Andrew. This does represent a risk to XPMEM, as the non-sleeping API may become entrenched and make acceptance of the sleeping version less acceptable.

Can we agree upon this list of issues?

Thank you,
Robin Holt
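For readers following the thread, the two callout shapes being compared look roughly like the sketch below. The signatures are assumptions for illustration (neither patchset's header is quoted here); only the callback names and the single-page vs. range split come from the discussion:

struct mmu_notifier_ops {
	/* Single-page callout retained in Andrea's series (issue 1): */
	void (*invalidate_page)(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long address);

	/*
	 * Range callouts exist in both series; issue 2 is only about the
	 * names (invalidate_range_start/_end vs. plain start/end):
	 */
	void (*invalidate_range_start)(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end);
	void (*invalidate_range_end)(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end);
};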
From: Andrea A. <an...@qu...> - 2008-04-22 18:43:36
On Tue, Apr 22, 2008 at 01:22:13PM -0500, Robin Holt wrote:
> 1) invalidate_page: You retain an invalidate_page() callout. I believe
> we have progressed that discussion to the point that it requires some
> direction from Andrew, Linus, or somebody in authority. The basics
> of the difference distill down to no expected significant performance
> difference between the two. The invalidate_page() callout potentially
> can simplify GRU code. It does provide a more complex API for the
> users of mmu_notifier which, IIRC, Christoph had interpreted from one
> of Andrew's earlier comments as being undesirable. I vaguely recall
> that sentiment as having been expressed.

invalidate_page, as demonstrated in the KVM pseudocode, doesn't change the locking requirements, and it has the benefit of reducing the window of time the secondary page fault has to be masked; at the same time it _halves_ the number of _hooks_ in the VM every time the VM deals with single pages (example: the do_wp_page hot path). As long as we can't fully converge because of point 3, I'd rather keep invalidate_page, since it is the better option. But that's by far not a priority.

> 2) Range callout names: Your range callouts are invalidate_range_start
> and invalidate_range_end whereas Christoph's are start and end. I do not
> believe this has been discussed in great detail. I know I have expressed
> a preference for your names. I admit to having failed to follow up on
> this issue. I certainly believe we could come to an agreement quickly
> if pressed.

I think using ->start/->end is a mistake; think of when we later add mprotect_range_start/end. Here too I keep the better names only because we can't converge on point 3 (the API will eventually change, like every other kernel-internal API; even core things like __free_page have been mostly obsoleted).

> 3) The structure of the patch set: Christoph's upcoming release orders
> the patches so the prerequisite patches are separately reviewable
> and each file is only touched by a single patch. Additionally, that

Each file touched by a single patch? I doubt it... The split is about the same; the main difference is the merge ordering: I always had the zero-risk part at the head, and he moved it to the tail when he incorporated #v12 into his patchset.

> allows mmu_notifiers to be introduced as a single patch with sleeping
> functionality from its inception and an API which remains unchanged.
> Your patch set, however, introduces one API, then turns around and
> changes that API. Again, the desire to make it an unchanging API was
> expressed by, IIRC, Andrew. This does represent a risk to XPMEM as
> the non-sleeping API may become entrenched and make acceptance of the
> sleeping version less acceptable.
>
> Can we agree upon this list of issues?

This is a kernel-internal API, so it will definitely change over time; it's nothing close to a syscall. Also note: the API is defined in mmu_notifier.h, and none of patches 2-12 touches mmu_notifier.h, so the extension of the method semantics is 100% backwards compatible.

My patch order and backwards-compatible API extension over the patchset are done to allow 2.6.26 to fully support KVM/GRU and 2.6.27 to support XPMEM as well. KVM/GRU won't notice any difference once the support for XPMEM is added, but even if the API were to change completely in 2.6.27, that's still better than no functionality at all in 2.6.26.
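The "halves the number of hooks" point can be seen on a single-page path such as do_wp_page: with only range callouts, every PTE replacement needs a start/end pair bracketing it, while a page callout needs one hook once the old translation has been shot down. A hedged sketch follows; the surrounding helper functions are invented for illustration, and only the notifier calls correspond to the API under discussion:

/* Range-only API: two hooks around a single-page PTE replacement. */
static void replace_pte_range_api(struct mm_struct *mm, struct vm_area_struct *vma,
				  unsigned long addr, pte_t *ptep, pte_t newpte)
{
	mmu_notifier_invalidate_range_start(mm, addr, addr + PAGE_SIZE);
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at(mm, addr, ptep, newpte);
	mmu_notifier_invalidate_range_end(mm, addr, addr + PAGE_SIZE);
}

/* With invalidate_page(): a single hook after the old translation is gone. */
static void replace_pte_page_api(struct mm_struct *mm, struct vm_area_struct *vma,
				 unsigned long addr, pte_t *ptep, pte_t newpte)
{
	ptep_clear_flush(vma, addr, ptep);
	mmu_notifier_invalidate_page(mm, addr);
	set_pte_at(mm, addr, ptep, newpte);
}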
From: Christoph L. <cla...@sg...> - 2008-04-22 20:28:21
On Tue, 22 Apr 2008, Andrea Arcangeli wrote:
> My patch order and API backward compatible extension over the patchset
> is done to allow 2.6.26 to fully support KVM/GRU and 2.6.27 to support
> XPMEM as well. KVM/GRU won't notice any difference once the support
> for XPMEM is added, but even if the API would completely change in
> 2.6.27, that's still better than no functionality at all in 2.6.26.

Please redo the patchset with the right order. To my knowledge there is no chance of this getting merged for 2.6.26.
From: Robin H. <ho...@sg...> - 2008-04-22 19:42:21
On Tue, Apr 22, 2008 at 08:43:35PM +0200, Andrea Arcangeli wrote:
> On Tue, Apr 22, 2008 at 01:22:13PM -0500, Robin Holt wrote:
> > 1) invalidate_page: You retain an invalidate_page() callout. I believe
> > we have progressed that discussion to the point that it requires some
> > direction from Andrew, Linus, or somebody in authority. The basics
> > of the difference distill down to no expected significant performance
> > difference between the two. The invalidate_page() callout potentially
> > can simplify GRU code. It does provide a more complex API for the
> > users of mmu_notifier which, IIRC, Christoph had interpreted from one
> > of Andrew's earlier comments as being undesirable. I vaguely recall
> > that sentiment as having been expressed.
>
> invalidate_page, as demonstrated in the KVM pseudocode, doesn't change
> the locking requirements, and it has the benefit of reducing the window
> of time the secondary page fault has to be masked; at the same time it
> _halves_ the number of _hooks_ in the VM every time the VM deals with
> single pages (example: the do_wp_page hot path). As long as we can't
> fully converge because of point 3, I'd rather keep invalidate_page,
> since it is the better option. But that's by far not a priority.

Christoph, Jack and I just discussed invalidate_page(). I don't think the point Andrew was making is that compelling in this circumstance. The code has changed fairly remarkably.

Would you have any objection to putting it back into your patch / agreeing to it remaining in Andrea's patch? If not, I think we can put this issue aside until Andrew gets out of the merge window and can decide it. Either way, the patches become much more similar with this in.

Thanks,
Robin
From: Christoph L. <cla...@sg...> - 2008-04-22 20:30:47
On Tue, 22 Apr 2008, Robin Holt wrote:
> putting it back into your patch / agreeing to it remaining in Andrea's
> patch? If not, I think we can put this issue aside until Andrew gets
> out of the merge window and can decide it. Either way, the patches
> become much more similar with this in.

One solution would be to separate the invalidate_page() callout into a patch at the very end that can be omitted. AFAICT there is no compelling reason to have this callback, and it complicates the API for the device driver writers. Not having this callback makes the way that mmu notifiers are called from the VM uniform, which is a desirable goal.
From: Andrea A. <an...@qu...> - 2008-04-23 13:33:53
On Tue, Apr 22, 2008 at 01:30:53PM -0700, Christoph Lameter wrote:
> One solution would be to separate the invalidate_page() callout into a
> patch at the very end that can be omitted. AFAICT there is no compelling
> reason to have this callback, and it complicates the API for the device
> driver writers. Not having this callback makes the way that mmu notifiers
> are called from the VM uniform, which is a desirable goal.

I agree that the invalidate_page optimization can be moved to a separate patch. That will be a patch that definitely alters the API in a non-backwards-compatible way (unlike 2-12 in my #v13, which are all backwards compatible in terms of the mmu notifier API).

invalidate_page is beneficial to both mmu notifier users, and a bit beneficial to the do_wp_page users too. So there's no point in removing it from my mmu-notifier-core: as long as the mmu-notifier-core is 1/N in my patchset and N/N in your patchset, the differences caused by that ordering difference are a bigger change than invalidate_page existing or not.

As I expected, invalidate_page provided significant benefits (not just to GRU but to KVM too) without altering the locking scheme at all; this is because the page fault handler has to notice anyway whether an invalidate_range_start/end pair ran after its follow_page/get_user_pages. So it's a no-brainer to keep, and my approach avoids a non-backwards-compatible breakage of the API, IMHO. Not that it's a big deal: nobody can really care if the API changes, since it will definitely change eventually, being a kernel-internal one. But given I already have invalidate_page in my patch, there's no reason to remove it as long as mmu-notifier-core remains N/N in your patchset.
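The locking argument here is that a secondary-MMU (KVM/GRU) page fault already has to detect an invalidate_range_start()/end() pair racing with its follow_page()/get_user_pages(), so honouring an additional invalidate_page() callout imposes no new requirement. A rough sketch of that detection follows, with invented structure and field names; it is modeled on the idea, not copied from KVM or either patchset:

struct secondary_mmu {
	spinlock_t lock;
	int range_active;		/* nesting count: start() .. end() in progress */
	unsigned long invalidate_seq;	/* bumped by every end() (and by invalidate_page) */
};

static int secondary_mmu_fault(struct secondary_mmu *smmu, struct mm_struct *mm,
			       unsigned long addr)
{
	unsigned long seq = smmu->invalidate_seq;
	struct page *page;

	smp_rmb();
	/* May sleep; an invalidation can run concurrently with this. */
	if (get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL) != 1)
		return -EFAULT;

	spin_lock(&smmu->lock);
	if (smmu->range_active || smmu->invalidate_seq != seq) {
		/* An invalidation ran (or is running): drop the page and retry. */
		spin_unlock(&smmu->lock);
		put_page(page);
		return -EAGAIN;
	}
	/* ... install the translation into the secondary MMU here ... */
	spin_unlock(&smmu->lock);
	return 0;
}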
From: Jack S. <st...@sg...> - 2008-04-23 00:31:35
On Tue, Apr 22, 2008 at 03:51:16PM +0200, Andrea Arcangeli wrote:
> Hello,
>
> This is the latest and greatest version of the mmu notifier patch #v13.

FWIW, I have updated the GRU driver to use this patch (plus the fixups). No problems. AFAICT, everything works.

--- jack
From: Jack S. <st...@sg...> - 2008-04-23 17:09:42
You may have spotted this already. If so, just ignore this.

It looks like there is a bug in copy_page_range() around line 667. It's possible to do a mmu_notifier_invalidate_range_start(), then return -ENOMEM w/o doing a corresponding mmu_notifier_invalidate_range_end().

--- jack
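The invariant behind the report: once mmu_notifier_invalidate_range_start() has run, every exit path, including the -ENOMEM error path, must reach the matching mmu_notifier_invalidate_range_end(). A minimal sketch of the required shape (illustrative only, with a hypothetical work helper; Andrea's actual copy_page_range fix follows in the next message):

int bracketed_copy(struct mm_struct *mm, unsigned long start, unsigned long end)
{
	int ret = 0;

	mmu_notifier_invalidate_range_start(mm, start, end);

	if (do_the_copy(mm, start, end))	/* hypothetical helper */
		ret = -ENOMEM;			/* note: no early return here... */

	/* ...so the end callout always runs, even on failure. */
	mmu_notifier_invalidate_range_end(mm, start, end);

	return ret;
}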
From: Andrea A. <an...@qu...> - 2008-04-23 17:45:57
On Wed, Apr 23, 2008 at 12:09:09PM -0500, Jack Steiner wrote:
> You may have spotted this already. If so, just ignore this.
>
> It looks like there is a bug in copy_page_range() around line 667.
> It's possible to do a mmu_notifier_invalidate_range_start(), then
> return -ENOMEM w/o doing a corresponding mmu_notifier_invalidate_range_end().

No, I didn't spot it yet; great catch!! ;) Thanks a lot.

I think we can follow Jack's example and use our energy to spot any bug in the mmu-notifier-core, like his auditing effort above (I'm quite certain you didn't reproduce this with a real OOM ;), so we get a rock-solid mmu-notifier implementation in 2.6.26. XPMEM will then also benefit in 2.6.27, and I hope the last XPMEM internal bugs will be fixed by that time as well. (For those not going to become mmu-notifier users there is nothing to worry about: unless you used KVM or GRU actively with mmu-notifiers, this bug is entirely harmless with both MMU_NOTIFIER=n and =y, as previously guaranteed.)

Here is the still-untested fix for review.

diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -597,6 +597,7 @@
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	int ret;
 
 	/*
 	 * Don't copy ptes where a page fault will fill them correctly.
@@ -604,33 +605,39 @@
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
+	ret = 0;
 	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
 		if (!vma->anon_vma)
-			return 0;
+			goto out;
 	}
 
-	if (is_vm_hugetlb_page(vma))
-		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+	if (unlikely(is_vm_hugetlb_page(vma))) {
+		ret = copy_hugetlb_page_range(dst_mm, src_mm, vma);
+		goto out;
+	}
 
 	if (is_cow_mapping(vma->vm_flags))
 		mmu_notifier_invalidate_range_start(src_mm, addr, end);
 
+	ret = 0;
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
-		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
-						vma, addr, next))
-			return -ENOMEM;
+		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+					    vma, addr, next))) {
+			ret = -ENOMEM;
+			break;
+		}
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
 
 	if (is_cow_mapping(vma->vm_flags))
 		mmu_notifier_invalidate_range_end(src_mm,
-						vma->vm_start, end);
-
-	return 0;
+						vma->vm_start, end);
+out:
+	return ret;
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,