|
From: Andrea A. <an...@qu...> - 2008-04-26 16:47:08
|
Hello everyone, here it is the mmu notifier #v14. http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.25/mmu-notifier-v14/ Please everyone involved review and (hopefully ;) ack that this is safe to go in 2.6.26, the most important is to verify that this is a noop when disarmed regardless of MMU_NOTIFIER=y or =n. http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.25/mmu-notifier-v14/mmu-notifier-core I'll be sending that patch to Andrew inbox. Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8d45fab..ce3251c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS + select MMU_NOTIFIER select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2ad6f54..853087a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -663,6 +663,108 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) account_shadowed(kvm, gfn); } +static void kvm_unmap_spte(struct kvm *kvm, u64 *spte) +{ + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + get_page(page); + rmap_remove(kvm, spte); + set_shadow_pte(spte, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(kvm); + put_page(page); +} + +static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte, *curr_spte; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); + curr_spte = spte; + spte = rmap_next(kvm, rmapp, spte); + kvm_unmap_spte(kvm, curr_spte); + } +} + +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + int i; + + /* + * If mmap_sem isn't taken, we can look the memslots with only + * the mmu_lock by skipping over the slots with userspace_addr == 0. + */ + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]); + } + } +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte; + int young = 0; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + int _young; + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + _young = _spte & PT_ACCESSED_MASK; + if (_young) { + young = !!_young; + set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK); + } + spte = rmap_next(kvm, rmapp, spte); + } + return young; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + int i; + int young = 0; + + /* + * If mmap_sem isn't taken, we can look the memslots with only + * the mmu_lock by skipping over the slots with userspace_addr == 0. 
+ */ + spin_lock(&kvm->mmu_lock); + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]); + } + } + spin_unlock(&kvm->mmu_lock); + + if (young) + kvm_flush_remote_tlbs(kvm); + + return young; +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { @@ -1200,6 +1302,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) int r; int largepage = 0; pfn_t pfn; + int mmu_seq; down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { @@ -1207,6 +1310,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) largepage = 1; } + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq); + /* implicit mb(), we'll read before PT lock is unlocked */ pfn = gfn_to_pfn(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); @@ -1217,6 +1322,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) } spin_lock(&vcpu->kvm->mmu_lock); + if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count))) + goto out_unlock; + smp_rmb(); + if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, v, write, largepage, gfn, pfn, PT32E_ROOT_LEVEL); @@ -1224,6 +1334,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } @@ -1355,6 +1470,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, int r; int largepage = 0; gfn_t gfn = gpa >> PAGE_SHIFT; + int mmu_seq; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -1368,6 +1484,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq); + /* implicit mb(), we'll read before PT lock is unlocked */ pfn = gfn_to_pfn(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { @@ -1375,12 +1493,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, return 1; } spin_lock(&vcpu->kvm->mmu_lock); + if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count))) + goto out_unlock; + smp_rmb(); + if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, largepage, gfn, pfn, TDP_ROOT_LEVEL); spin_unlock(&vcpu->kvm->mmu_lock); return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } static void nonpaging_free(struct kvm_vcpu *vcpu) @@ -1643,11 +1771,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int r; u64 gpte = 0; pfn_t pfn; - - vcpu->arch.update_pte.largepage = 0; + int mmu_seq; + int largepage; if (bytes != 4 && bytes != 8) - return; + goto out_lock; /* * Assume that the pte write on a page table of the same type @@ -1660,7 +1788,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if ((bytes == 4) && (gpa % 4 == 0)) { r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); if (r) - return; + goto 
out_lock; memcpy((void *)&gpte + (gpa % 8), new, 4); } else if ((bytes == 8) && (gpa % 8 == 0)) { memcpy((void *)&gpte, new, 8); @@ -1670,23 +1798,35 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, memcpy((void *)&gpte, new, 4); } if (!is_present_pte(gpte)) - return; + goto out_lock; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + largepage = 0; down_read(¤t->mm->mmap_sem); if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); - vcpu->arch.update_pte.largepage = 1; + largepage = 1; } + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq); + /* implicit mb(), we'll read before PT lock is unlocked */ pfn = gfn_to_pfn(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } + if (is_error_pfn(pfn)) + goto out_release_and_lock; + + spin_lock(&vcpu->kvm->mmu_lock); + BUG_ON(!is_error_pfn(vcpu->arch.update_pte.pfn)); vcpu->arch.update_pte.gfn = gfn; vcpu->arch.update_pte.pfn = pfn; + vcpu->arch.update_pte.largepage = largepage; + vcpu->arch.update_pte.mmu_seq = mmu_seq; + return; + +out_release_and_lock: + kvm_release_pfn_clean(pfn); +out_lock: + spin_lock(&vcpu->kvm->mmu_lock); } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -1711,7 +1851,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); - spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); @@ -1790,11 +1929,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } kvm_mmu_audit(vcpu, "post pte write"); - spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); vcpu->arch.update_pte.pfn = bad_pfn; } + spin_unlock(&vcpu->kvm->mmu_lock); } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 156fe10..4ac73a6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -263,6 +263,12 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pfn = vcpu->arch.update_pte.pfn; if (is_error_pfn(pfn)) return; + if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count))) + return; + smp_rmb(); + if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != + vcpu->arch.update_pte.mmu_seq)) + return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), @@ -380,6 +386,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int r; pfn_t pfn; int largepage = 0; + int mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); kvm_mmu_audit(vcpu, "pre page fault"); @@ -413,6 +420,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, largepage = 1; } } + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq); + /* implicit mb(), we'll read before PT lock is unlocked */ pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); up_read(¤t->mm->mmap_sem); @@ -424,6 +433,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, } spin_lock(&vcpu->kvm->mmu_lock); + if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count))) + goto out_unlock; + smp_rmb(); + if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq)) + goto out_unlock; 
kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, largepage, &write_pt, pfn); @@ -439,6 +453,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ce5563..860559a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <linux/mman.h> #include <linux/highmem.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> #include <asm/msr.h> @@ -3859,15 +3860,152 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) +{ + struct kvm_arch *kvm_arch; + kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier); + return container_of(kvm_arch, struct kvm, arch); +} + +static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + /* + * When ->invalidate_page runs, the linux pte has been zapped + * already but the page is still allocated until + * ->invalidate_page returns. So if we increase the sequence + * here the kvm page fault will notice if the spte can't be + * established because the page is going to be freed. If + * instead the kvm page fault establishes the spte before + * ->invalidate_page runs, kvm_unmap_hva will release it + * before returning. + + * No need of memory barriers as the sequence increase only + * need to be seen at spin_unlock time, and not at spin_lock + * time. + * + * Increasing the sequence after the spin_unlock would be + * unsafe because the kvm page fault could then establish the + * pte after kvm_unmap_hva returned, without noticing the page + * is going to be freed. + */ + atomic_inc(&kvm->arch.mmu_notifier_seq); + spin_lock(&kvm->mmu_lock); + kvm_unmap_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); +} + +static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + + /* + * The count increase must become visible at unlock time as no + * spte can be established without taking the mmu_lock and + * count is also read inside the mmu_lock critical section. + */ + atomic_inc(&kvm->arch.mmu_notifier_count); + + spin_lock(&kvm->mmu_lock); + for (; start < end; start += PAGE_SIZE) + kvm_unmap_hva(kvm, start); + spin_unlock(&kvm->mmu_lock); +} + +static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + /* + * + * This sequence increase will notify the kvm page fault that + * the page that is going to be mapped in the spte could have + * been freed. + * + * There's also an implicit mb() here in this comment, + * provided by the last PT lock taken to zap pagetables, and + * that the read side has to take too in follow_page(). The + * sequence increase in the worst case will become visible to + * the kvm page fault after the spin_lock of the last PT lock + * of the last PT-lock-protected critical section preceeding + * invalidate_range_end. 
So if the kvm page fault is about to + * establish the spte inside the mmu_lock, while we're freeing + * the pages, it will have to backoff and when it retries, it + * will have to take the PT lock before it can check the + * pagetables again. And after taking the PT lock it will + * re-establish the pte even if it will see the already + * increased sequence number before calling gfn_to_pfn. + */ + atomic_inc(&kvm->arch.mmu_notifier_seq); + /* + * The sequence increase must be visible before count + * decrease. The page fault has to read count before sequence + * for this write order to be effective. + */ + wmb(); + atomic_dec(&kvm->arch.mmu_notifier_count); + BUG_ON(atomic_read(&kvm->arch.mmu_notifier_count) < 0); +} + +static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + return kvm_age_hva(kvm, address); +} + +static void kvm_free_vcpus(struct kvm *kvm); +/* This must zap all the sptes because all pages will be freed then */ +static void kvm_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + BUG_ON(mm != kvm->mm); + + kvm_destroy_common_vm(kvm); + + kvm_free_pit(kvm); + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + kvm_free_vcpus(kvm); + kvm_free_physmem(kvm); + if (kvm->arch.apic_access_page) + put_page(kvm->arch.apic_access_page); +} + +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .release = kvm_mmu_notifier_release, + .invalidate_page = kvm_mmu_notifier_invalidate_page, + .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, + .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, + .clear_flush_young = kvm_mmu_notifier_clear_flush_young, +}; + struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int err; if (!kvm) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops; + err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm); + if (err) { + kfree(kvm); + return ERR_PTR(err); + } + return kvm; } @@ -3899,13 +4037,12 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_free_pit(kvm); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); - if (kvm->arch.apic_access_page) - put_page(kvm->arch.apic_access_page); + /* + * kvm_mmu_notifier_release() will be called before + * mmu_notifier_unregister returns, if it didn't run + * already. 
+ */ + mmu_notifier_unregister(&kvm->arch.mmu_notifier, kvm->mm); kfree(kvm); } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 9d963cd..f07e321 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -13,6 +13,7 @@ #include <linux/types.h> #include <linux/mm.h> +#include <linux/mmu_notifier.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -247,6 +248,7 @@ struct kvm_vcpu_arch { gfn_t gfn; /* presumed gfn during guest pte update */ pfn_t pfn; /* pfn corresponding to that gfn */ int largepage; + int mmu_seq; } update_pte; struct i387_fxsave_struct host_fx_image; @@ -314,6 +316,10 @@ struct kvm_arch{ struct page *apic_access_page; gpa_t wall_clock; + + struct mmu_notifier mmu_notifier; + atomic_t mmu_notifier_seq; + atomic_t mmu_notifier_count; }; struct kvm_vm_stat { @@ -434,6 +440,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_zap_all(struct kvm *kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e16682..f089edc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -267,6 +267,7 @@ void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); +void kvm_destroy_common_vm(struct kvm *kvm); struct kvm *kvm_arch_create_vm(void); void kvm_arch_destroy_vm(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f095b73..4beae7a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -231,15 +231,19 @@ void kvm_free_physmem(struct kvm *kvm) kvm_free_physmem_slot(&kvm->memslots[i], NULL); } -static void kvm_destroy_vm(struct kvm *kvm) +void kvm_destroy_common_vm(struct kvm *kvm) { - struct mm_struct *mm = kvm->mm; - spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_io_bus_destroy(&kvm->pio_bus); kvm_io_bus_destroy(&kvm->mmio_bus); +} + +static void kvm_destroy_vm(struct kvm *kvm) +{ + struct mm_struct *mm = kvm->mm; + kvm_arch_destroy_vm(kvm); mmdrop(mm); } As usual you also need the kvm-mmu-notifier-lock patch to read the memslots with only the mmu_lock. 
Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c7ad235..8be6551 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3871,16 +3871,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm, */ if (!user_alloc) { if (npages && !old.rmap) { + unsigned long userspace_addr; + down_write(¤t->mm->mmap_sem); - memslot->userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0); + userspace_addr = do_mmap(NULL, 0, + npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + 0); up_write(¤t->mm->mmap_sem); - if (IS_ERR((void *)memslot->userspace_addr)) - return PTR_ERR((void *)memslot->userspace_addr); + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); + + /* set userspace_addr atomically for kvm_hva_to_rmapp */ + spin_lock(&kvm->mmu_lock); + memslot->userspace_addr = userspace_addr; + spin_unlock(&kvm->mmu_lock); } else { if (!old.user_alloc && old.rmap) { int ret; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6a52c08..97bcc8d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -342,7 +342,15 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.rmap, 0, npages * sizeof(*new.rmap)); new.user_alloc = user_alloc; - new.userspace_addr = mem->userspace_addr; + /* + * hva_to_rmmap() serialzies with the mmu_lock and to be + * safe it has to ignore memslots with !user_alloc && + * !userspace_addr. + */ + if (user_alloc) + new.userspace_addr = mem->userspace_addr; + else + new.userspace_addr = 0; } if (npages && !new.lpage_info) { int largepages = npages / KVM_PAGES_PER_HPAGE; @@ -374,14 +382,18 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.dirty_bitmap, 0, dirty_bytes); } + spin_lock(&kvm->mmu_lock); if (mem->slot >= kvm->nmemslots) kvm->nmemslots = mem->slot + 1; *memslot = new; + spin_unlock(&kvm->mmu_lock); r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); if (r) { + spin_lock(&kvm->mmu_lock); *memslot = old; + spin_unlock(&kvm->mmu_lock); goto out_free; } |
|
From: Anthony L. <ali...@us...> - 2008-04-26 19:42:40
|
Andrea Arcangeli wrote: > Hello everyone, > > here it is the mmu notifier #v14. > > http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.25/mmu-notifier-v14/ > > Please everyone involved review and (hopefully ;) ack that this is > safe to go in 2.6.26, the most important is to verify that this is a > noop when disarmed regardless of MMU_NOTIFIER=y or =n. > > http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.25/mmu-notifier-v14/mmu-notifier-core > > I'll be sending that patch to Andrew inbox. > > Signed-off-by: Andrea Arcangeli <an...@qu...> > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig > index 8d45fab..ce3251c 100644 > --- a/arch/x86/kvm/Kconfig > +++ b/arch/x86/kvm/Kconfig > @@ -21,6 +21,7 @@ config KVM > tristate "Kernel-based Virtual Machine (KVM) support" > depends on HAVE_KVM > select PREEMPT_NOTIFIERS > + select MMU_NOTIFIER > select ANON_INODES > ---help--- > Support hosting fully virtualized guest machines using hardware > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index 2ad6f54..853087a 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -663,6 +663,108 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) > account_shadowed(kvm, gfn); > } > > +static void kvm_unmap_spte(struct kvm *kvm, u64 *spte) > +{ > + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); > + get_page(page); > You should not assume a struct page exists for any given spte. Instead, use kvm_get_pfn() and kvm_release_pfn_clean(). > static void nonpaging_free(struct kvm_vcpu *vcpu) > @@ -1643,11 +1771,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > int r; > u64 gpte = 0; > pfn_t pfn; > - > - vcpu->arch.update_pte.largepage = 0; > + int mmu_seq; > + int largepage; > > if (bytes != 4 && bytes != 8) > - return; > + goto out_lock; > > /* > * Assume that the pte write on a page table of the same type > @@ -1660,7 +1788,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > if ((bytes == 4) && (gpa % 4 == 0)) { > r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); > if (r) > - return; > + goto out_lock; > memcpy((void *)&gpte + (gpa % 8), new, 4); > } else if ((bytes == 8) && (gpa % 8 == 0)) { > memcpy((void *)&gpte, new, 8); > @@ -1670,23 +1798,35 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > memcpy((void *)&gpte, new, 4); > } > if (!is_present_pte(gpte)) > - return; > + goto out_lock; > gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; > > + largepage = 0; > down_read(¤t->mm->mmap_sem); > if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { > gfn &= ~(KVM_PAGES_PER_HPAGE-1); > - vcpu->arch.update_pte.largepage = 1; > + largepage = 1; > } > + mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq); > + /* implicit mb(), we'll read before PT lock is unlocked */ > pfn = gfn_to_pfn(vcpu->kvm, gfn); > up_read(¤t->mm->mmap_sem); > > - if (is_error_pfn(pfn)) { > - kvm_release_pfn_clean(pfn); > - return; > - } > + if (is_error_pfn(pfn)) > + goto out_release_and_lock; > + > + spin_lock(&vcpu->kvm->mmu_lock); > + BUG_ON(!is_error_pfn(vcpu->arch.update_pte.pfn)); > vcpu->arch.update_pte.gfn = gfn; > vcpu->arch.update_pte.pfn = pfn; > + vcpu->arch.update_pte.largepage = largepage; > + vcpu->arch.update_pte.mmu_seq = mmu_seq; > + return; > + > +out_release_and_lock: > + kvm_release_pfn_clean(pfn); > +out_lock: > + spin_lock(&vcpu->kvm->mmu_lock); > } > Perhaps I just have a weak stomach but I am uneasy having a 
function that takes a lock on exit. I walked through the logic and it doesn't appear to be wrong but it also is pretty clear that you could defer the acquisition of the lock to the caller (in this case, kvm_mmu_pte_write) by moving the update_pte assignment into kvm_mmu_pte_write. > void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > @@ -1711,7 +1851,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, > > pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); > mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); > Worst case, you pass 4 more pointer arguments here and, take the spin lock, and then depending on the result of mmu_guess_page_from_pte_write, update vcpu->arch.update_pte. > @@ -3899,13 +4037,12 @@ static void kvm_free_vcpus(struct kvm *kvm) > > void kvm_arch_destroy_vm(struct kvm *kvm) > { > - kvm_free_pit(kvm); > - kfree(kvm->arch.vpic); > - kfree(kvm->arch.vioapic); > - kvm_free_vcpus(kvm); > - kvm_free_physmem(kvm); > - if (kvm->arch.apic_access_page) > - put_page(kvm->arch.apic_access_page); > + /* > + * kvm_mmu_notifier_release() will be called before > + * mmu_notifier_unregister returns, if it didn't run > + * already. > + */ > + mmu_notifier_unregister(&kvm->arch.mmu_notifier, kvm->mm); > kfree(kvm); > } > Why move the destruction of the vm to the MMU notifier unregister hook? Does anything else ever call mmu_notifier_unregister that would implicitly destroy the VM? Regards, Anthony Liguori |
|
From: Andrea A. <an...@qu...> - 2008-04-27 00:20:19
|
On Sat, Apr 26, 2008 at 01:59:23PM -0500, Anthony Liguori wrote:
>> +static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
>> +{
>> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
>> PAGE_SHIFT);
>> + get_page(page);
>>
>
> You should not assume a struct page exists for any given spte. Instead, use
> kvm_get_pfn() and kvm_release_pfn_clean().
The last email from muli@ibm in my inbox argues it's useless to build rmap
on mmio regions, so the above is more efficient: put_page runs
directly on the page without going back and forth between spte -> pfn
-> page -> pfn -> page in a single function.
Certainly if we start building rmap on mmio regions we'll have to
change that.
> Perhaps I just have a weak stomach but I am uneasy having a function that
> takes a lock on exit. I walked through the logic and it doesn't appear to
> be wrong but it also is pretty clear that you could defer the acquisition
> of the lock to the caller (in this case, kvm_mmu_pte_write) by moving the
> update_pte assignment into kvm_mmu_pte_write.
I agree out_lock is an uncommon exit path. The problem is that the
code was buggy, and I tried to fix it with the smallest possible
change, and that resulted in an out_lock. That section likely needs
refactoring: all those update_pte fields should at least be returned
by the function guess_.... but I tried to reduce the changes to make
the issue more readable; I didn't want to rewrite certain functions
just to take a spinlock a few instructions ahead.
> Worst case, you pass 4 more pointer arguments here and, take the spin lock,
> and then depending on the result of mmu_guess_page_from_pte_write, update
> vcpu->arch.update_pte.
Yes, that was my idea too, but that's left for a later patch. Fixing
this bug mixed in with the mmu notifier patch was perhaps excessive
already ;).
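For reference, here is a minimal sketch of the cleanup being discussed (the
pte_write_guess struct and the changed signature are hypothetical, not part of
the posted patch): mmu_guess_page_from_pte_write would return its guess to the
caller, and kvm_mmu_pte_write would take mmu_lock itself before committing it:

struct pte_write_guess {
	gfn_t gfn;
	pfn_t pfn;
	int largepage;
	int mmu_seq;
};

/* Gathers the guess without touching mmu_lock; returns 0 if there is none. */
static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
					 const u8 *new, int bytes,
					 struct pte_write_guess *guess);

void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		       const u8 *new, int bytes)
{
	struct pte_write_guess guess;
	int have_guess;

	have_guess = mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes,
						   &guess);
	spin_lock(&vcpu->kvm->mmu_lock);
	if (have_guess) {
		/* commit the guess only under mmu_lock */
		vcpu->arch.update_pte.gfn = guess.gfn;
		vcpu->arch.update_pte.pfn = guess.pfn;
		vcpu->arch.update_pte.largepage = guess.largepage;
		vcpu->arch.update_pte.mmu_seq = guess.mmu_seq;
	}
	/* ... existing pte write handling under mmu_lock ... */
	spin_unlock(&vcpu->kvm->mmu_lock);
}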
> Why move the destruction of the vm to the MMU notifier unregister hook?
> Does anything else ever call mmu_notifier_unregister that would implicitly
> destroy the VM?
The mmu notifier ->release can run at any time before the filehandle is
closed. ->release has to zap all sptes and freeze the mmu (hence all
vcpus) to prevent any further page fault. After ->release returns, all
pages are freed (we never rely on the page pin, so the put_page in
rmap_remove is not a relevant unpin event). So the idea is
that I wanted to maintain the same ordering as the current code in the
vm destroy path; I didn't want to leave a partially shut down VM on
the vmlist. If the ordering is entirely irrelevant and
kvm_arch_destroy_vm can run well before kvm_destroy_vm is called, then
I can avoid changes to kvm_main.c, but I doubt it.
I've done it in a way that archs not needing mmu notifiers, like s390,
can simply call kvm_destroy_common_vm at the top of their
kvm_arch_destroy_vm. All others using mmu notifiers have to invoke
kvm_destroy_common_vm in the ->release of the mmu notifiers.
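A rough sketch of that split (illustration only, not actual s390 or final
x86 code):

/* Arch without mmu notifiers: teardown stays in kvm_arch_destroy_vm. */
void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_destroy_common_vm(kvm);	/* unlink from vm_list, free io buses */
	/* ... arch specific teardown and kfree(kvm) ... */
}

/* Arch with mmu notifiers: teardown runs from ->release, which is
 * guaranteed to run before the mm pages are freed. */
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	kvm_destroy_common_vm(kvm);
	/* ... zap all sptes, free memslots, vcpus, etc. ... */
}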
This will ensure that everything will be ok regardless of whether
exit_mmap is called before or after exit_files, and it won't make a
whole lot of difference anymore whether the driver fd is pinned through
vmas->vm_file released in exit_mmap or through the task file descriptors
released in exit_files etc... In fact this allows calling
mmu_notifier_unregister at any time later, after the task has already
been killed, without any trouble (like if the mmu notifier owner isn't
registering in current->mm but in some other task's mm).
|
|
From: Anthony L. <an...@co...> - 2008-04-27 01:54:23
|
Andrea Arcangeli wrote:
> On Sat, Apr 26, 2008 at 01:59:23PM -0500, Anthony Liguori wrote:
>
>>> +static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
>>> +{
>>> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
>>> PAGE_SHIFT);
>>> + get_page(page);
>>>
>>>
>> You should not assume a struct page exists for any given spte. Instead, use
>> kvm_get_pfn() and kvm_release_pfn_clean().
>>
>
> Last email from muli@ibm in my inbox argues it's useless to build rmap
> on mmio regions, so the above is more efficient so put_page runs
> directly on the page without going back and forth between spte -> pfn
> -> page -> pfn -> page in a single function.
>
Avi can correct me if I'm wrong, but I don't think the consensus of that
discussion was that we're going to avoid putting mmio pages in the
rmap. Practically speaking, replacing:
+ struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
PAGE_SHIFT);
+ get_page(page);
With:
unsigned long pfn = (*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
kvm_get_pfn(pfn);
Results in exactly the same code except the latter allows mmio pfns in
the rmap. So ignoring the whole mmio thing, using accessors that are
already there and used elsewhere seems like a good idea :-)
> Certainly if we start building rmap on mmio regions we'll have to
> change that.
>
>
>> Perhaps I just have a weak stomach but I am uneasy having a function that
>> takes a lock on exit. I walked through the logic and it doesn't appear to
>> be wrong but it also is pretty clear that you could defer the acquisition
>> of the lock to the caller (in this case, kvm_mmu_pte_write) by moving the
>> update_pte assignment into kvm_mmu_pte_write.
>>
>
> I agree out_lock is an uncommon exit path, the problem is that the
> code was buggy, and I tried to fix it with the smallest possible
> change and that resulting in an out_lock. That section likely need a
> refactoring, all those update_pte fields should be at least returned
> by the function guess_.... but I tried to reduce the changes to make
> the issue more readable, I didn't want to rewrite certain functions
> just to take a spinlock a few instructions ahead.
>
I appreciate the desire to minimize changes, but taking a lock on return
seems to take that to a bit of an extreme. It seems like a simple thing
to fix though, no?
>> Why move the destruction of the vm to the MMU notifier unregister hook?
>> Does anything else ever call mmu_notifier_unregister that would implicitly
>> destroy the VM?
>>
>
> mmu notifier ->release can run at anytime before the filehandle is
> closed. ->release has to zap all sptes and freeze the mmu (hence all
> vcpus) to prevent any further page fault. After ->release returns all
> pages are freed (we'll never relay on the page pin to avoid the
> rmap_remove put_page to be a relevant unpin event). So the idea is
> that I wanted to maintain the same ordering of the current code in the
> vm destroy event, I didn't want to leave a partially shutdown VM on
> the vmlist. If the ordering is entirely irrelevant and the
> kvm_arch_destroy_vm can run well before kvm_destroy_vm is called, then
> I can avoid changes to kvm_main.c but I doubt.
>
> I've done it in a way that archs not needing mmu notifiers like s390
> can simply add the kvm_destroy_common_vm at the top of their
> kvm_arch_destroy_vm. All others using mmu_notifiers have to invoke
> kvm_destroy_common_vm in the ->release of the mmu notifiers.
>
> This will ensure that everything will be ok regardless if exit_mmap is
> called before/after exit_files, and it won't make a whole lot of
> difference anymore, if the driver fd is pinned through vmas->vm_file
> released in exit_mmap or through the task filedescriptors relased in
> exit_files etc... Infact this allows to call mmu_notifier_unregister
> at anytime later after the task has already been killed, without any
> trouble (like if the mmu notifier owner isn't registering in
> current->mm but some other tasks mm).
>
I see. It seems a little strange to me as a KVM guest isn't really tied
to the current mm. It seems like the net effect of this is that we are
now tying a KVM guest to an mm.
For instance, if you create a guest but don't assign any memory to it,
you could transfer the fd to another process and then close the fd
(without destroying the guest). The other process could then assign
memory to it and presumably run the guest.
With your change, as soon as the first process exits, the guest will be
destroyed. I'm not sure this behavioral difference really matters but
it is a behavioral difference.
Regards,
Anthony Liguori
|
|
From: Andrea A. <an...@qu...> - 2008-04-27 03:05:22
|
On Sat, Apr 26, 2008 at 08:54:23PM -0500, Anthony Liguori wrote:
> Avi can correct me if I'm wrong, but I don't think the consensus of that
> discussion was that we're going to avoid putting mmio pages in the rmap.
My first impression on that discussion was that pci-passthrough mmio
can't be swapped, can't require write throttling, etc. ;). From a
linux VM pagetable point of view, rmap on mmio looks weird. However,
thinking some more: it's not like in the linux kernel, where write
protection through rmap is needed only for write-throttling MAP_SHARED,
which clearly is strictly RAM; for sptes we need it on every cr3
touch too, to trap pagetable updates (think of an ioremap done by the
guest kernel). So I think Avi's take that we need rmap for everything
mapped by sptes is probably the only feasible way to go.
> Practically speaking, replacing:
>
> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
> PAGE_SHIFT);
> + get_page(page);
>
>
> With:
>
> unsigned long pfn = (*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> kvm_get_pfn(pfn);
>
> Results in exactly the same code except the later allows mmio pfns in the
> rmap. So ignoring the whole mmio thing, using accessors that are already
> there and used elsewhere seems like a good idea :-)
Agreed, especially in light of the above. I didn't actually touch
that function for a while (I clearly wrote it before we started moving
the kvm mmu code from page to pfn), and it was still safe to use to
test the locking of the mmu notifier methods. My main focus in
the last few days was to get the locking right against the latest mmu
notifier code, #v14 ;).
Now that I look into it more closely, the get_page/put_page are
unnecessary by now (they were necessary with the older patches that
didn't implement range_begin and that relied on page pinning).
Not just in that function: all reference counting inside kvm is
now entirely useless and can be removed.
NOTE: it is safe to flush the tlb outside the mmu_lock if done inside
the mmu_notifier methods. But only mmu notifiers can defer the tlb
flush until after releasing mmu_lock, because the page can't be freed
by the VM until we return.
All other kvm code must instead definitely flush the tlb inside the
mmu_lock, otherwise when the mmu notifier code runs it will see the
spte nonpresent and so it will do nothing (it will not wait for kvm to
drop the mmu_lock before allowing the main linux VM to free the page).
The tlb flush must happen before the page is freed, and doing it
inside mmu_lock everywhere (except in mmu-notifier context, where it
can be done after releasing mmu_lock) guarantees it.
The positive side of the tradeoff of having to do the tlb flush inside
the mmu_lock is that KVM can now safely zap and unmap as many sptes
as it wants and do a single tlb flush at the end. The pages can't be
freed as long as the mmu_lock is held (this is why the tlb flush has
to be done inside the mmu_lock). This model heavily reduces the tlb
flush frequency for large-scale spte mangling, and tlb flushes here
are quite expensive because of the IPIs.
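Schematically, the two flush patterns described above (using the names from
the patch):

	int need_tlb_flush;

	/* Regular KVM paths: flush before dropping mmu_lock, so a page
	 * can't be freed while a stale translation is still in the tlb. */
	spin_lock(&kvm->mmu_lock);
	/* ... zap/unmap as many sptes as needed ... */
	kvm_flush_remote_tlbs(kvm);	/* one flush for the whole batch */
	spin_unlock(&kvm->mmu_lock);

	/* mmu notifier methods only: the caller guarantees the pages are
	 * not freed until the method returns, so the flush can be
	 * deferred until after mmu_lock is released. */
	spin_lock(&kvm->mmu_lock);
	need_tlb_flush = kvm_unmap_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);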
> I appreciate the desire to minimize changes, but taking a lock on return
> seems to take that to a bit of an extreme. It seems like a simple thing to
> fix though, no?
I agree it needs to be rewritten as a cleaner fix, but probably in a
separate patch (which has to be incremental, as that code would reject
against the mmu notifier patch). I didn't see it as a big issue,
however, to apply my quick fix first and clean up with an incremental
update.
> I see. It seems a little strange to me as a KVM guest isn't really tied to
> the current mm. It seems like the net effect of this is that we are now
> tying a KVM guest to an mm.
>
> For instance, if you create a guest, but didn't assign any memory to it,
> you could transfer the fd to another process and then close the fd (without
> destroying the guest). The other process then could assign memory to it
> and presumably run the guest.
Passing the anon kvm vm fd through unix sockets to another task is
exactly why we need things like ->release, not dependent on the
fd->release / vma->vm_file->release ordering in the do_exit path, to
tear down the VM.
The guest itself is definitely tied to an "mm": the guest runs using
get_user_pages, and get_user_pages is meaningless without an mm. But
the fd where we run the ioctl isn't tied to the mm; it's just an fd
that can be passed across tasks with unix sockets.
> With your change, as soon as the first process exits, the guest will be
> destroyed. I'm not sure this behavioral difference really matters but it
> is a behavioral difference.
The guest mode of the cpu can't run safely on any task but the one
with the "mm" tracked by the mmu notifiers and from which the memory is
allocated. The sptes point to the memory allocated in that
"mm". It's definitely memory-corrupting to leave any spte established
when the last thread of that "mm" exits, as the memory supposedly
pointed to by the orphaned sptes will immediately go to the freelist
and be reused by the kernel. Keep in mind that there's no page pin on
the memory pointed to by the sptes.
The ioctl of the qemu userland could run in any other task with an mm
different from the guest's, and ->release allows this to work
fine without memory corruption and without requiring page pinning.
As far as I can tell, your example explains why we need this fix ;).
Here is an updated patch that passes my swap test (the only missing
thing is the out_lock cleanup).
Signed-off-by: Andrea Arcangeli <an...@qu...>
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fab..ce3251c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
select ANON_INODES
---help---
Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2ad6f54..330eaed 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -663,6 +663,101 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
account_shadowed(kvm, gfn);
}
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte, *curr_spte;
+ int need_tlb_flush = 0;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+ curr_spte = spte;
+ spte = rmap_next(kvm, rmapp, spte);
+ rmap_remove(kvm, curr_spte);
+ set_shadow_pte(curr_spte, shadow_trap_nonpresent_pte);
+ need_tlb_flush = 1;
+ }
+ return need_tlb_flush;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+ int need_tlb_flush = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ need_tlb_flush |= kvm_unmap_rmapp(kvm,
+ &memslot->rmap[gfn_offset]);
+ }
+ }
+
+ return need_tlb_flush;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int young = 0;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ int _young;
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ _young = _spte & PT_ACCESSED_MASK;
+ if (_young) {
+ young = !!_young;
+ set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+ int young = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+ }
+ }
+
+ return young;
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
@@ -1200,6 +1295,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
int r;
int largepage = 0;
pfn_t pfn;
+ int mmu_seq;
down_read(¤t->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1207,6 +1303,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
@@ -1217,6 +1315,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
PT32E_ROOT_LEVEL);
@@ -1224,6 +1327,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
@@ -1355,6 +1463,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
int r;
int largepage = 0;
gfn_t gfn = gpa >> PAGE_SHIFT;
+ int mmu_seq;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1368,6 +1477,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
if (is_error_pfn(pfn)) {
@@ -1375,12 +1486,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
largepage, gfn, pfn, TDP_ROOT_LEVEL);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1643,11 +1764,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int r;
u64 gpte = 0;
pfn_t pfn;
-
- vcpu->arch.update_pte.largepage = 0;
+ int mmu_seq;
+ int largepage;
if (bytes != 4 && bytes != 8)
- return;
+ goto out_lock;
/*
* Assume that the pte write on a page table of the same type
@@ -1660,7 +1781,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if ((bytes == 4) && (gpa % 4 == 0)) {
r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
if (r)
- return;
+ goto out_lock;
memcpy((void *)&gpte + (gpa % 8), new, 4);
} else if ((bytes == 8) && (gpa % 8 == 0)) {
memcpy((void *)&gpte, new, 8);
@@ -1670,23 +1791,35 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
memcpy((void *)&gpte, new, 4);
}
if (!is_present_pte(gpte))
- return;
+ goto out_lock;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ largepage = 0;
down_read(¤t->mm->mmap_sem);
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- vcpu->arch.update_pte.largepage = 1;
+ largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
+ if (is_error_pfn(pfn))
+ goto out_release_and_lock;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ BUG_ON(!is_error_pfn(vcpu->arch.update_pte.pfn));
vcpu->arch.update_pte.gfn = gfn;
vcpu->arch.update_pte.pfn = pfn;
+ vcpu->arch.update_pte.largepage = largepage;
+ vcpu->arch.update_pte.mmu_seq = mmu_seq;
+ return;
+
+out_release_and_lock:
+ kvm_release_pfn_clean(pfn);
+out_lock:
+ spin_lock(&vcpu->kvm->mmu_lock);
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1711,7 +1844,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
- spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
@@ -1790,11 +1922,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
}
kvm_mmu_audit(vcpu, "post pte write");
- spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
vcpu->arch.update_pte.pfn = bad_pfn;
}
+ spin_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 156fe10..4ac73a6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,12 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pfn = vcpu->arch.update_pte.pfn;
if (is_error_pfn(pfn))
return;
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ return;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) !=
+ vcpu->arch.update_pte.mmu_seq))
+ return;
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +386,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int r;
pfn_t pfn;
int largepage = 0;
+ int mmu_seq;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +420,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
largepage = 1;
}
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
up_read(¤t->mm->mmap_sem);
@@ -424,6 +433,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+ goto out_unlock;
+ smp_rmb();
+ if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
largepage, &write_pt, pfn);
@@ -439,6 +453,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
spin_unlock(&vcpu->kvm->mmu_lock);
return write_pt;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ce5563..a026cb7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,6 +27,7 @@
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
@@ -3859,15 +3860,173 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+ struct kvm_arch *kvm_arch;
+ kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+ return container_of(kvm_arch, struct kvm, arch);
+}
+
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush;
+
+ /*
+ * When ->invalidate_page runs, the linux pte has been zapped
+ * already but the page is still allocated until
+ * ->invalidate_page returns. So if we increase the sequence
+ * here the kvm page fault will notice if the spte can't be
+ * established because the page is going to be freed. If
+ * instead the kvm page fault establishes the spte before
+ * ->invalidate_page runs, kvm_unmap_hva will release it
+ * before returning.
+
+ * No need of memory barriers as the sequence increase only
+ * need to be seen at spin_unlock time, and not at spin_lock
+ * time.
+ *
+ * Increasing the sequence after the spin_unlock would be
+ * unsafe because the kvm page fault could then establish the
+ * pte after kvm_unmap_hva returned, without noticing the page
+ * is going to be freed.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_seq);
+ spin_lock(&kvm->mmu_lock);
+ need_tlb_flush = kvm_unmap_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
+
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int need_tlb_flush = 0;
+
+ /*
+ * The count increase must become visible at unlock time as no
+ * spte can be established without taking the mmu_lock and
+ * count is also read inside the mmu_lock critical section.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_count);
+
+ spin_lock(&kvm->mmu_lock);
+ for (; start < end; start += PAGE_SIZE)
+ need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ spin_unlock(&kvm->mmu_lock);
+
+ /* we've to flush the tlb before the pages can be freed */
+ if (need_tlb_flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ /*
+ *
+ * This sequence increase will notify the kvm page fault that
+ * the page that is going to be mapped in the spte could have
+ * been freed.
+ *
+ * There's also an implicit mb() here in this comment,
+ * provided by the last PT lock taken to zap pagetables, and
+ * that the read side has to take too in follow_page(). The
+ * sequence increase in the worst case will become visible to
+ * the kvm page fault after the spin_lock of the last PT lock
+ * of the last PT-lock-protected critical section preceding
+ * invalidate_range_end. So if the kvm page fault is about to
+ * establish the spte inside the mmu_lock, while we're freeing
+ * the pages, it will have to backoff and when it retries, it
+ * will have to take the PT lock before it can check the
+ * pagetables again. And after taking the PT lock it will
+ * re-establish the pte even if it will see the already
+ * increased sequence number before calling gfn_to_pfn.
+ */
+ atomic_inc(&kvm->arch.mmu_notifier_seq);
+ /*
+ * The sequence increase must be visible before count
+ * decrease. The page fault has to read count before sequence
+ * for this write order to be effective.
+ */
+ wmb();
+ atomic_dec(&kvm->arch.mmu_notifier_count);
+ BUG_ON(atomic_read(&kvm->arch.mmu_notifier_count) < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int young;
+
+ spin_lock(&kvm->mmu_lock);
+ young = kvm_age_hva(kvm, address);
+ spin_unlock(&kvm->mmu_lock);
+
+ if (young)
+ kvm_flush_remote_tlbs(kvm);
+
+ return young;
+}
+
+static void kvm_free_vcpus(struct kvm *kvm);
+/* This must zap all the sptes because all pages will be freed then */
+static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ BUG_ON(mm != kvm->mm);
+
+ kvm_destroy_common_vm(kvm);
+
+ kvm_free_pit(kvm);
+ kfree(kvm->arch.vpic);
+ kfree(kvm->arch.vioapic);
+ kvm_free_vcpus(kvm);
+ kvm_free_physmem(kvm);
+ if (kvm->arch.apic_access_page)
+ put_page(kvm->arch.apic_access_page);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .release = kvm_mmu_notifier_release,
+ .invalidate_page = kvm_mmu_notifier_invalidate_page,
+ .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
+ .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
+ .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+};
+
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
@@ -3899,13 +4058,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
- kvm_free_pit(kvm);
- kfree(kvm->arch.vpic);
- kfree(kvm->arch.vioapic);
- kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
- if (kvm->arch.apic_access_page)
- put_page(kvm->arch.apic_access_page);
+ /*
+ * kvm_mmu_notifier_release() will be called before
+ * mmu_notifier_unregister returns, if it didn't run
+ * already.
+ */
+ mmu_notifier_unregister(&kvm->arch.mmu_notifier, kvm->mm);
kfree(kvm);
}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 9d963cd..7b8deea 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -247,6 +248,7 @@ struct kvm_vcpu_arch {
gfn_t gfn; /* presumed gfn during guest pte update */
pfn_t pfn; /* pfn corresponding to that gfn */
int largepage;
+ int mmu_seq;
} update_pte;
struct i387_fxsave_struct host_fx_image;
@@ -314,6 +316,10 @@ struct kvm_arch{
struct page *apic_access_page;
gpa_t wall_clock;
+
+ struct mmu_notifier mmu_notifier;
+ atomic_t mmu_notifier_seq;
+ atomic_t mmu_notifier_count;
};
struct kvm_vm_stat {
@@ -434,6 +440,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
void kvm_mmu_zap_all(struct kvm *kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4e16682..f089edc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -267,6 +267,7 @@ void kvm_arch_check_processor_compat(void *rtn);
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
void kvm_free_physmem(struct kvm *kvm);
+void kvm_destroy_common_vm(struct kvm *kvm);
struct kvm *kvm_arch_create_vm(void);
void kvm_arch_destroy_vm(struct kvm *kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f095b73..4beae7a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -231,15 +231,19 @@ void kvm_free_physmem(struct kvm *kvm)
kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}
-static void kvm_destroy_vm(struct kvm *kvm)
+void kvm_destroy_common_vm(struct kvm *kvm)
{
- struct mm_struct *mm = kvm->mm;
-
spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
spin_unlock(&kvm_lock);
kvm_io_bus_destroy(&kvm->pio_bus);
kvm_io_bus_destroy(&kvm->mmio_bus);
+}
+
+static void kvm_destroy_vm(struct kvm *kvm)
+{
+ struct mm_struct *mm = kvm->mm;
+
kvm_arch_destroy_vm(kvm);
mmdrop(mm);
}
|
|
From: Marcelo T. <mto...@re...> - 2008-04-28 14:46:38
|
Hi Andrea, Looks good. Acked-by: Marcelo Tosatti <mto...@re...> |
|
From: Andrea A. <an...@qu...> - 2008-05-01 18:12:59
|
Hello everyone,
this is the v14 to v15 difference for the mmu-notifier-core patch. This
is just for review of the difference; I'll post the full v15 soon, so please
review the diff in the meantime. Many of these cleanups are thanks to
Andrew's review of mmu-notifier-core in v14. He also spotted a
GFP_KERNEL allocation under spin_lock that DEBUG_SPINLOCK_SLEEP
failed to catch until I enabled PREEMPT (GFP_KERNEL there was
perfectly safe with the whole patchset applied, but not ok if only
mmu-notifier-core was applied). As usual that bug couldn't hurt
anybody unless the mmu notifiers were armed.
I also wrote a proper changelog for the mmu-notifier-core patch, which
I append here before the v14->v15 diff:
Subject: mmu-notifier-core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to
pages. There are secondary MMUs (with secondary sptes and secondary
tlbs) too. sptes in the kvm case are shadow pagetables, but when I say
spte in mmu-notifier context, I mean "secondary pte". In GRU case
there's no actual secondary pte and there's only a secondary tlb
because the GRU secondary MMU has no knowledge about sptes and every
secondary tlb miss event in the MMU always generates a page fault that
has to be resolved by the CPU (this is not the case of KVM, where a
secondary tlb miss will walk sptes in hardware and it will refill the
secondary tlb transparently to software if the corresponding spte is
present). The same way zap_page_range has to invalidate the pte before
freeing the page, the spte (and secondary tlb) must also be
invalidated before any page is freed and reused.
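To make the interface concrete, here is a minimal sketch (not part of any
patch in this series) of a secondary MMU driver hooking ->invalidate_page so
its spte and secondary tlb are dropped before the page is freed. The
driver_mmu structure and the driver_zap_spte/driver_flush_secondary_tlb
helpers are hypothetical; struct mmu_notifier_ops, the callback signature and
mmu_notifier_register() are the real API added by this patchset.

#include <linux/mmu_notifier.h>
#include <linux/sched.h>

/* hypothetical per-mm secondary MMU context */
struct driver_mmu {
        struct mmu_notifier mn;
        /* ... spte and secondary tlb bookkeeping ... */
};

static void driver_invalidate_page(struct mmu_notifier *mn,
                                   struct mm_struct *mm,
                                   unsigned long address)
{
        struct driver_mmu *d = container_of(mn, struct driver_mmu, mn);

        /* drop the spte and flush the secondary tlb before the page is freed */
        driver_zap_spte(d, address);
        driver_flush_secondary_tlb(d, address);
}

static const struct mmu_notifier_ops driver_mmu_ops = {
        .invalidate_page = driver_invalidate_page,
        /* .release, .invalidate_range_start/end, .clear_flush_young, ... */
};

static int driver_attach_mm(struct driver_mmu *d)
{
        d->mn.ops = &driver_mmu_ops;
        return mmu_notifier_register(&d->mn, current->mm);
}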
Currently we take a page_count pin on every page mapped by sptes, but
that means the pages can't be swapped whenever they're mapped by any
spte because they're part of the guest working set. Furthermore a spte
unmap event can immediately lead to a page being freed when the pin is
released (thus requiring the same complex and relatively slow SMP-safe
tlb_gather logic we have in zap_page_range, which can be avoided
completely if the spte unmap event doesn't require an unpin of the
page previously mapped in the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and
know when the VM is swapping or freeing or doing anything on the
primary MMU so that the secondary MMU code can drop sptes before the
pages are freed, avoiding all page pinning and allowing 100% reliable
swapping of guest physical address space. Furthermore it spares the
code that tears down the secondary MMU mappings from implementing
tlb_gather-like logic as in zap_page_range, which would require many
IPIs to flush the other cpus' tlbs for each fixed number of sptes
unmapped.
To give an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect), the secondary MMU mappings
will be invalidated, and the next secondary-mmu page fault will call
get_user_pages; if it calls get_user_pages with write=1 that triggers
a do_wp_page, and an updated spte or secondary-tlb mapping will be
re-established on the copied page. Or it will set up a readonly spte
or readonly tlb mapping if it's a guest read and get_user_pages is
called with write=0. This is just an example.
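As a concrete illustration of that fault path, here is a sketch only (not
taken from any of these patches): gru_fault_hva() and gru_install_tlb_entry()
are made-up names, while get_user_pages(), down_read() and put_page() are the
real 2.6.25 APIs. Coordination with invalidate_range_begin/end (see point 2
of the dependencies below) is omitted for brevity.

static int gru_fault_hva(struct mm_struct *mm, unsigned long hva, int write)
{
        struct page *page;
        int ret;

        down_read(&mm->mmap_sem);
        /* write=1 makes get_user_pages go through do_wp_page and break COW */
        ret = get_user_pages(current, mm, hva & PAGE_MASK, 1, write, 0,
                             &page, NULL);
        up_read(&mm->mmap_sem);
        if (ret != 1)
                return -EFAULT;

        /* install the secondary tlb entry (or spte) for the resolved pfn */
        gru_install_tlb_entry(hva, page_to_pfn(page), write);

        /*
         * No page pin is kept: the mmu notifier callbacks will tear the
         * entry down before the page can be freed and reused.
         */
        put_page(page);
        return 0;
}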
This allows any page pointed to by any pte (and in turn visible in
the primary CPU MMU) to be mapped into a secondary MMU (be it a pure
tlb like GRU, or a full MMU with both sptes and a secondary tlb like
the shadow-pagetable layer with kvm), or into a software remote DMA
like XPMEM (hence the need to schedule in XPMEM code to send the
invalidate to the remote node, while there's no need to schedule in
kvm/gru as it's an immediate event like invalidating a primary-mmu
pte).
At least for KVM, without this patch it's impossible to swap guests
reliably. Having this feature and removing the page pin also allows
several other optimizations that simplify life considerably.
Dependencies:
1) Introduces hlist_del_init_rcu and documents it (fixes a comment for
hlist_del_rcu too)
2) mm_lock() to register the mmu notifier when the whole VM isn't
doing anything with "mm". This allows mmu notifier users to keep
track of whether the VM is in the middle of the
invalidate_range_begin/end critical section with an atomic counter
increased in range_begin and decreased in range_end (see the sketch
after this list). No secondary MMU page fault is allowed to map any
spte or secondary tlb reference while the VM is in the middle of
range_begin/end, as any page returned by get_user_pages in
that critical section could later immediately be freed without any
further ->invalidate_page notification (invalidate_range_begin/end
works on ranges and ->invalidate_page isn't called immediately
before freeing the page). To stop all page freeing and pagetable
overwrites the mmap_sem must be taken in write mode and all other
anon_vma/i_mmap locks must be taken in virtual address order. The
order is critical to prevent mm_lock(mm1) and mm_lock(mm2) running
concurrently from triggering lock inversion deadlocks.
3) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be
enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take
advantage of mmu notifiers, but this already allows compiling a
KVM external module against a kernel with mmu notifiers enabled and
from the next pull from kvm.git we'll start using them. And
GRU/XPMEM will also be able to continue development by enabling
KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code
to the mainline kernel. Then they can also select MMU_NOTIFIER in
the same way KVM does it (even if KVM=n). This guarantees nobody
selects MMU_NOTIFIER=y if KVM and GRU and XPMEM are all =n.
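Here is a minimal sketch of the counter scheme point 2 describes; it mirrors
what the kvm patch above does with mmu_notifier_count/mmu_notifier_seq, but
the driver_mmu structure and the driver_zap_range/driver_set_spte helpers are
hypothetical.

static void driver_invalidate_range_start(struct mmu_notifier *mn,
                                          struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        struct driver_mmu *d = container_of(mn, struct driver_mmu, mn);

        spin_lock(&d->lock);
        d->invalidate_count++;          /* faults must back off while > 0 */
        driver_zap_range(d, start, end);
        spin_unlock(&d->lock);
}

static void driver_invalidate_range_end(struct mmu_notifier *mn,
                                        struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long end)
{
        struct driver_mmu *d = container_of(mn, struct driver_mmu, mn);

        spin_lock(&d->lock);
        d->invalidate_seq++;            /* forces racing faults to retry */
        d->invalidate_count--;
        spin_unlock(&d->lock);
}

/*
 * Secondary MMU fault path: "seq" was sampled before calling
 * get_user_pages; the spte is installed only if no invalidate ran or
 * is running since the sample, otherwise the fault is retried.
 */
static int driver_install_spte(struct driver_mmu *d, unsigned long hva,
                               unsigned long pfn, unsigned long seq)
{
        int ret = 0;

        spin_lock(&d->lock);
        if (d->invalidate_count || d->invalidate_seq != seq)
                ret = -EAGAIN;
        else
                driver_set_spte(d, hva, pfn);
        spin_unlock(&d->lock);
        return ret;
}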
The mmu_notifier_register call can fail because mm_lock may fail to
allocate the required vmalloc space. See the comment on top of the
mm_lock() implementation for the worst-case memory requirements.
Because mmu_notifier_register is called when a driver starts up, a
failure can be handled gracefully. Here is an example of the change
applied to kvm to register the mmu notifiers. Usually when a driver
starts up other allocations are required anyway and -ENOMEM failure
paths exist already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
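For symmetry with the registration example, the teardown side needs nothing
more than the following (same hypothetical driver_mmu as in the sketches
above); mmu_notifier_unregister() is the real call and, as said, it cannot
fail:

static void driver_detach_mm(struct driver_mmu *d, struct mm_struct *mm)
{
        /* ->release has already run or runs before this returns */
        mmu_notifier_unregister(&d->mn, mm);
        /*
         * The srcu synchronization inside unregister means no notifier
         * callback can still be running here, so d can now be freed.
         */
        kfree(d);
}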
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
select ANON_INODES
---help---
Support hosting fully virtualized guest machines using hardware
diff --git a/include/linux/list.h b/include/linux/list.h
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -739,7 +739,7 @@ static inline void hlist_del(struct hlis
* or hlist_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
- * hlist_for_each_entry().
+ * hlist_for_each_entry_rcu().
*/
static inline void hlist_del_rcu(struct hlist_node *n)
{
@@ -755,6 +755,26 @@ static inline void hlist_del_init(struct
}
}
+/**
+ * hlist_del_init_rcu - deletes entry from hash list with re-initialization
+ * @n: the element to delete from the hash list.
+ *
+ * Note: list_unhashed() on entry does return true after this. It is
+ * useful for RCU based read lockfree traversal if the writer side
+ * must know if the list entry is still hashed or already unhashed.
+ *
+ * In particular, it means that we can not poison the forward pointers
+ * that may still be used for walking the hash list and we can only
+ * zero the pprev pointer so list_unhashed() will return true after
+ * this.
+ *
+ * The caller must take whatever precautions are necessary (such as
+ * holding appropriate locks) to avoid racing with another
+ * list-mutation primitive, such as hlist_add_head_rcu() or
+ * hlist_del_rcu(), running on this same list. However, it is
+ * perfectly legal to run concurrently with the _rcu list-traversal
+ * primitives, such as hlist_for_each_entry_rcu().
+ */
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
if (!hlist_unhashed(n)) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1050,18 +1050,6 @@ extern int install_special_mapping(struc
unsigned long addr, unsigned long len,
unsigned long flags, struct page **pages);
-/*
- * mm_lock will take mmap_sem writably (to prevent all modifications
- * and scanning of vmas) and then also takes the mapping locks for
- * each of the vma to lockout any scans of pagetables of this address
- * space. This can be used to effectively holding off reclaim from the
- * address space.
- *
- * mm_lock can fail if there is not enough memory to store a pointer
- * array to all vmas.
- *
- * mm_lock and mm_unlock are expensive operations that may take a long time.
- */
struct mm_lock_data {
spinlock_t **i_mmap_locks;
spinlock_t **anon_vma_locks;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -4,17 +4,24 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
+#include <linux/srcu.h>
struct mmu_notifier;
struct mmu_notifier_ops;
#ifdef CONFIG_MMU_NOTIFIER
-#include <linux/srcu.h>
+/*
+ * The mmu notifier_mm structure is allocated and installed in
+ * mm->mmu_notifier_mm inside the mm_lock() protected critical section
+ * and it's released only when mm_count reaches zero in mmdrop().
+ */
struct mmu_notifier_mm {
+ /* all mmu notifiers registered in this mm are queued in this list */
struct hlist_head list;
+ /* srcu structure for this mm */
struct srcu_struct srcu;
- /* to serialize mmu_notifier_unregister against mmu_notifier_release */
+ /* to serialize the list modifications and hlist_unhashed */
spinlock_t lock;
};
@@ -23,8 +30,8 @@ struct mmu_notifier_ops {
* Called either by mmu_notifier_unregister or when the mm is
* being destroyed by exit_mmap, always before all pages are
* freed. It's mandatory to implement this method. This can
- * run concurrently to other mmu notifier methods and it
- * should teardown all secondary mmu mappings and freeze the
+ * run concurrently with other mmu notifier methods and it
+ * should tear down all secondary mmu mappings and freeze the
* secondary mmu.
*/
void (*release)(struct mmu_notifier *mn,
@@ -43,9 +50,10 @@ struct mmu_notifier_ops {
/*
* Before this is invoked any secondary MMU is still ok to
- * read/write to the page previously pointed by the Linux pte
- * because the old page hasn't been freed yet. If required
- * set_page_dirty has to be called internally to this method.
+ * read/write to the page previously pointed to by the Linux
+ * pte because the page hasn't been freed yet and it won't be
+ * freed until this returns. If required set_page_dirty has to
+ * be called internally to this method.
*/
void (*invalidate_page)(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -53,20 +61,18 @@ struct mmu_notifier_ops {
/*
* invalidate_range_start() and invalidate_range_end() must be
- * paired and are called only when the mmap_sem is held and/or
- * the semaphores protecting the reverse maps. Both functions
+ * paired and are called only when the mmap_sem and/or the
+ * locks protecting the reverse maps are held. Both functions
* may sleep. The subsystem must guarantee that no additional
- * references to the pages in the range established between
- * the call to invalidate_range_start() and the matching call
- * to invalidate_range_end().
+ * references are taken to the pages in the range established
+ * between the call to invalidate_range_start() and the
+ * matching call to invalidate_range_end().
*
- * Invalidation of multiple concurrent ranges may be permitted
- * by the driver or the driver may exclude other invalidation
- * from proceeding by blocking on new invalidate_range_start()
- * callback that overlap invalidates that are already in
- * progress. Either way the establishment of sptes to the
- * range can only be allowed if all invalidate_range_stop()
- * function have been called.
+ * Invalidation of multiple concurrent ranges may be
+ * optionally permitted by the driver. Either way the
+ * establishment of sptes is forbidden in the range passed to
+ * invalidate_range_begin/end for the whole duration of the
+ * invalidate_range_begin/end critical section.
*
* invalidate_range_start() is called when all pages in the
* range are still mapped and have at least a refcount of one.
@@ -187,6 +193,14 @@ static inline void mmu_notifier_mm_destr
__mmu_notifier_mm_destroy(mm);
}
+/*
+ * These two macros will at some point replace ptep_clear_flush.
+ * ptep_clear_flush is implemented as a macro itself, so this also is
+ * implemented as a macro until ptep_clear_flush is converted to an
+ * inline function, to diminish the risk of compilation failure. The
+ * invalidate_page method over time can be moved outside the PT lock
+ * and these two macros can be later removed.
+ */
#define ptep_clear_flush_notify(__vma, __address, __ptep) \
({ \
pte_t __pte; \
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,7 +193,3 @@ config VIRT_TO_BUS
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
-
-config MMU_NOTIFIER
- def_bool y
- bool "MMU notifier, for paging KVM/RDMA"
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -613,6 +613,12 @@ int copy_page_range(struct mm_struct *ds
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ /*
+ * We need to invalidate the secondary MMU mappings only when
+ * there could be a permission downgrade on the ptes of the
+ * parent mm. And a permission downgrade will only happen if
+ * is_cow_mapping() returns true.
+ */
if (is_cow_mapping(vma->vm_flags))
mmu_notifier_invalidate_range_start(src_mm, addr, end);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2329,7 +2329,36 @@ static inline void __mm_unlock(spinlock_
* operations that could ever happen on a certain mm. This includes
* vmtruncate, try_to_unmap, and all page faults. The holder
* must not hold any mm related lock. A single task can't take more
- * than one mm lock in a row or it would deadlock.
+ * than one mm_lock in a row or it would deadlock.
+ *
+ * The mmap_sem must be taken in write mode to block all operations
+ * that could modify pagetables and free pages without altering the
+ * vma layout (for example populate_range() with nonlinear vmas).
+ *
+ * The sorting is needed to avoid lock inversion deadlocks if two
+ * tasks run mm_lock at the same time on different mm that happen to
+ * share some anon_vmas/inodes but mapped in different order.
+ *
+ * mm_lock and mm_unlock are expensive operations that may have to
+ * take thousands of locks. Thanks to sort() the complexity is
+ * O(N*log(N)) where N is the number of VMAs in the mm. The max number
+ * of vmas is defined in /proc/sys/vm/max_map_count.
+ *
+ * mm_lock() can fail if memory allocation fails. The worst case
+ * vmalloc allocation required is 2*max_map_count*sizeof(spinlock *),
+ * so around 1Mbyte, but in practice it'll be much less because
+ * normally there won't be max_map_count vmas allocated in the task
+ * that runs mm_lock().
+ *
+ * The vmalloc memory allocated by mm_lock is stored in the
+ * mm_lock_data structure that must be allocated by the caller and it
+ * must be later passed to mm_unlock that will free it after using it.
+ * Allocating the mm_lock_data structure on the stack is fine because
+ * it's only a couple of bytes in size.
+ *
+ * If mm_lock() returns -ENOMEM no memory has been allocated and the
+ * mm_lock_data structure can be freed immediately, and mm_unlock must
+ * not be called.
*/
int mm_lock(struct mm_struct *mm, struct mm_lock_data *data)
{
@@ -2350,6 +2379,13 @@ int mm_lock(struct mm_struct *mm, struct
return -ENOMEM;
}
+ /*
+ * When mm_lock_sort_anon_vma/i_mmap returns zero it
+ * means there's no lock to take and so we can free
+ * the array here without waiting for mm_unlock. mm_unlock
+ * will do nothing if nr_i_mmap/anon_vma_locks is
+ * zero.
+ */
data->nr_anon_vma_locks = mm_lock_sort_anon_vma(mm, anon_vma_locks);
data->nr_i_mmap_locks = mm_lock_sort_i_mmap(mm, i_mmap_locks);
@@ -2374,7 +2410,17 @@ static void mm_unlock_vfree(spinlock_t *
vfree(locks);
}
-/* avoid memory allocations for mm_unlock to prevent deadlock */
+/*
+ * mm_unlock doesn't require any memory allocation and it won't fail.
+ *
+ * All memory has been previously allocated by mm_lock and it'll be
+ * all freed before returning. Only after mm_unlock returns, the
+ * caller is allowed to free and forget the mm_lock_data structure.
+ *
+ * mm_unlock runs in O(N) where N is the max number of VMAs in the
+ * mm. The max number of vmas is defined in
+ * /proc/sys/vm/max_map_count.
+ */
void mm_unlock(struct mm_struct *mm, struct mm_lock_data *data)
{
if (mm->map_count) {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -21,12 +21,12 @@
* This function can't run concurrently against mmu_notifier_register
* because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
* runs with mm_users == 0. Other tasks may still invoke mmu notifiers
- * in parallel despite there's no task using this mm anymore, through
- * the vmas outside of the exit_mmap context, like with
+ * in parallel despite there being no task using this mm any more,
+ * through the vmas outside of the exit_mmap context, such as with
* vmtruncate. This serializes against mmu_notifier_unregister with
* the mmu_notifier_mm->lock in addition to SRCU and it serializes
* against the other mmu notifiers with SRCU. struct mmu_notifier_mm
- * can't go away from under us as exit_mmap holds a mm_count pin
+ * can't go away from under us as exit_mmap holds an mm_count pin
* itself.
*/
void __mmu_notifier_release(struct mm_struct *mm)
@@ -41,7 +41,7 @@ void __mmu_notifier_release(struct mm_st
hlist);
/*
* We arrived before mmu_notifier_unregister so
- * mmu_notifier_unregister will do nothing else than
+ * mmu_notifier_unregister will do nothing other than
* to wait ->release to finish and
* mmu_notifier_unregister to return.
*/
@@ -66,7 +66,11 @@ void __mmu_notifier_release(struct mm_st
spin_unlock(&mm->mmu_notifier_mm->lock);
/*
- * Wait ->release if mmu_notifier_unregister is running it.
+ * synchronize_srcu here prevents mmu_notifier_release from
+ * returning to exit_mmap (which would proceed with freeing all pages
+ * in the mm) until the ->release method returns, if it was
+ * invoked by mmu_notifier_unregister.
+ *
* The mmu_notifier_mm can't go away from under us because one
* mm_count is held by exit_mmap.
*/
@@ -144,8 +148,9 @@ void __mmu_notifier_invalidate_range_end
* Must not hold mmap_sem nor any other VM related lock when calling
* this registration function. Must also ensure mm_users can't go down
* to zero while this runs to avoid races with mmu_notifier_release,
- * so mm has to be current->mm or the mm should be pinned safely like
- * with get_task_mm(). mmput can be called after mmu_notifier_register
+ * so mm has to be current->mm or the mm should be pinned safely such
+ * as with get_task_mm(). If the mm is not current->mm, the mm_users
+ * pin should be released by calling mmput after mmu_notifier_register
* returns. mmu_notifier_unregister must be always called to
* unregister the notifier. mm_count is automatically pinned to allow
* mmu_notifier_unregister to safely run at any time later, before or
@@ -155,29 +160,29 @@ int mmu_notifier_register(struct mmu_not
int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
struct mm_lock_data data;
+ struct mmu_notifier_mm * mmu_notifier_mm;
int ret;
BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ ret = -ENOMEM;
+ mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+ if (unlikely(!mmu_notifier_mm))
+ goto out;
+
+ ret = init_srcu_struct(&mmu_notifier_mm->srcu);
+ if (unlikely(ret))
+ goto out_kfree;
+
ret = mm_lock(mm, &data);
if (unlikely(ret))
- goto out;
+ goto out_cleanup;
if (!mm_has_notifiers(mm)) {
- mm->mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
- GFP_KERNEL);
- ret = -ENOMEM;
- if (unlikely(!mm_has_notifiers(mm)))
- goto out_unlock;
-
- ret = init_srcu_struct(&mm->mmu_notifier_mm->srcu);
- if (unlikely(ret)) {
- kfree(mm->mmu_notifier_mm);
- mmu_notifier_mm_init(mm);
- goto out_unlock;
- }
- INIT_HLIST_HEAD(&mm->mmu_notifier_mm->list);
- spin_lock_init(&mm->mmu_notifier_mm->lock);
+ INIT_HLIST_HEAD(&mmu_notifier_mm->list);
+ spin_lock_init(&mmu_notifier_mm->lock);
+ mm->mmu_notifier_mm = mmu_notifier_mm;
+ mmu_notifier_mm = NULL;
}
atomic_inc(&mm->mm_count);
@@ -192,8 +197,14 @@ int mmu_notifier_register(struct mmu_not
spin_lock(&mm->mmu_notifier_mm->lock);
hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
spin_unlock(&mm->mmu_notifier_mm->lock);
-out_unlock:
+
mm_unlock(mm, &data);
+out_cleanup:
+ if (mmu_notifier_mm)
+ cleanup_srcu_struct(&mmu_notifier_mm->srcu);
+out_kfree:
+ /* kfree() does nothing if mmu_notifier_mm is NULL */
+ kfree(mmu_notifier_mm);
out:
BUG_ON(atomic_read(&mm->mm_users) <= 0);
return ret;
|