From: Andrea A. <an...@qu...> - 2008-01-13 13:33:07
|
Hi everyone,

So far KVM swapping has been a limited feature. Depending on the workload, huge chunks of the anonymous memory simulating the guest physical memory could get pinned and remain unswappable for extended periods of time. Whenever a spte maps a host physical page, KVM has to pin the page to prevent it from being swapped out. The page could still be unmapped from the Linux VM ptes and go into swapcache, but the boosted reference count (due to the spte pointing to the host physical page) would prevent the page from being freed (and rightfully so). The big difference is that the mmu notifier patch now allows KVM to know when the main Linux VM wants to unmap a certain host physical page. When that happens, KVM now makes sure to release all sptes and to drop the reference count of the page, so the page can finally be swapped out for real in any case. This way the KVM task can now be swapped out fully and at any time, regardless of the guest OS activity and regardless of the size of the readonly shadow-pte cache generated by the guest OS.

The last test I ran on this code was running two VMs on a dual-core SVM host, SMP guest (4 vcpus). The Linux VM was ~400M, the other VM was 3G. The host system has 2G of ram + 4G of swap.

Starting a heavy VM job, both VMs are swapped out quite nicely (one VM was running my oom deadlock testcase for the linux-mm oom patches, the other was playing a youtube video):

  andrea 9742 57.7 2.5 588536 50104
  andrea 9809 69.3 9.4 3211172 182448

After sigstopping both and running the same heavy VM job again I get:

  andrea 9742 42.6 0.0 588536 644
  andrea 9809 48.2 0.0 3211172 848

So when sigstopped, less than 1M of rss remains allocated in ram. After sigcont, and after killing the heavy VM job (which released lots of ram and swap), both VMs gracefully restart with only swapins firing in the host:

  andrea 9742 57.6 2.0 588536 39308
  andrea 9809 61.6 61.4 3211172 1186256

No idea why the non-Linux VM after a while grows back to a 1G working set despite only a single youtube playback running in the guest... ;). The Linux VM, OTOH, has only a 39M working set when idling in the oom loops.

The host must be compiled with CONFIG_MMU_NOTIFIER=y of course, or this won't work.

Here is the patch to kvm.git. (There's some room for optimization in doing a single tlb flush in unmap_spte for all sptes pointing to the page, or even more aggressively for the whole range in the invalidate_range case; but invalidate_range isn't an interesting path for kvm, so I guess it's not worth optimizing in the short/mid term. By optimizing the invalidate_page case we may halve the number of tlb flushes for some common cases. I leave it for later; the swapping is heavily I/O bound anyway, so a few more IPIs on an SMP host shouldn't be very measurable, and on a UP host it makes no difference to flush multiple times in practice.)
Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4086080..c527d7d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -18,6 +18,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL select PREEMPT_NOTIFIERS + select MMU_NOTIFIER select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 324ff9a..103c270 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -532,6 +532,36 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) kvm_flush_remote_tlbs(kvm); } +static void unmap_spte(struct kvm *kvm, u64 *spte) +{ + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + get_page(page); + rmap_remove(kvm, spte); + set_shadow_pte(spte, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(kvm); + __free_page(page); +} + +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn) +{ + unsigned long *rmapp; + u64 *spte, *curr_spte; + + spin_lock(&kvm->mmu_lock); + gfn = unalias_gfn(kvm, gfn); + rmapp = gfn_to_rmap(kvm, gfn); + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("rmap_swap_page: spte %p %llx\n", spte, *spte); + curr_spte = spte; + spte = rmap_next(kvm, rmapp, spte); + unmap_spte(kvm, curr_spte); + } + spin_unlock(&kvm->mmu_lock); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8a90403..e9a3f6e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3159,6 +3159,36 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) +{ + return container_of(mn, struct kvm, mmu_notifier); +} + +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + gfn_t gfn = hva_to_gfn(kvm, address); + BUG_ON(mm != kvm->mm); + if (gfn == -1UL) + return; + kvm_rmap_unmap_gfn(kvm, gfn); +} + +void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + for (; start < end; start += PAGE_SIZE) + kvm_mmu_notifier_invalidate_page(mn, mm, start); +} + +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_range = kvm_mmu_notifier_invalidate_range, + .invalidate_page = kvm_mmu_notifier_invalidate_page, +}; + struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); @@ -3167,6 +3197,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; return kvm; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index d6db0de..feacd77 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -404,6 +404,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_zap_all(struct kvm *kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2714068..85da7fa 100644 --- 
a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -117,6 +117,7 @@ struct kvm { struct kvm_io_bus pio_bus; struct kvm_vm_stat stat; struct kvm_arch arch; + struct mmu_notifier mmu_notifier; }; /* The guest did something we don't support. */ @@ -163,6 +164,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, struct kvm_memory_slot old, int user_alloc); gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); +gfn_t hva_to_gfn(struct kvm *kvm, unsigned long addr); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4295623..8f1dd86 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void) kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); + mmu_notifier_register(&kvm->mmu_notifier, kvm->mm); spin_lock_init(&kvm->mmu_lock); kvm_io_bus_init(&kvm->pio_bus); mutex_init(&kvm->lock); @@ -454,6 +455,23 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } +gfn_t hva_to_gfn(struct kvm *kvm, unsigned long addr) +{ + int i; + + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end = start + (memslot->npages << PAGE_SHIFT); + + if (addr >= start && addr < end) { + gfn_t gfn_offset = (addr - start) >> PAGE_SHIFT; + return memslot->base_gfn + gfn_offset; + } + } + return -1UL; +} + /* * Requires current->mm->mmap_sem to be held */ And here a compatibility patch to kvm-userland so the external module still compile and runs with older kernels w/o MMU_NOTIFIER patch applied. Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/kernel/external-module-compat.h b/kernel/external-module-compat.h index 67b9cc4..34ef0a5 100644 --- a/kernel/external-module-compat.h +++ b/kernel/external-module-compat.h @@ -17,6 +17,28 @@ #include <linux/hrtimer.h> #include <asm/bitops.h> +#ifndef CONFIG_MMU_NOTIFIER +struct mmu_notifier; + +struct mmu_notifier_ops { + void (*release)(struct mmu_notifier * mn, + struct mm_struct *mm); + void (*invalidate_page)(struct mmu_notifier * mn, + struct mm_struct *mm, + unsigned long address); + void (*invalidate_range)(struct mmu_notifier * mn, + struct mm_struct *mm, + unsigned long start, unsigned long end); +}; + +struct mmu_notifier { + const struct mmu_notifier_ops *ops; +}; +#define mmu_notifier_register(mn, mm) do {} while(0) +#define mmu_notifier_unregister(mn) do {} while (0) +#define mmu_notifier_release(mm) do {} while (0) +#endif + /* * 2.6.16 does not have GFP_NOWAIT */ Here another patch for kvm-userland where I can't see symmetry between the lack of atomic_inc despite mmdrop is still run. I can't possibly see how this is supposedly not required when compiled into the kernel vs external module. Either atomic_inc is needed in both or none. Even if I'm right still this bug wasn't destabilizing because atomic_inc_and_dec only fires once in the overflow check, so it shouldn't matter to run one mmdrop more than needed, but it's good for correctness. 
Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/kernel/hack-module.awk b/kernel/hack-module.awk index 5187c96..884bc50 100644 --- a/kernel/hack-module.awk +++ b/kernel/hack-module.awk @@ -33,8 +33,6 @@ vmx_load_host_state = 0 } -/atomic_inc\(&kvm->mm->mm_count\);/ { $0 = "//" $0 } - /^\t\.fault = / { fcn = gensub(/,/, "", "g", $3) $0 = "\t.VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(" fcn ")," I'll post the mmu-notifiers patch (required in the host kernel to run the above) separately in CC with more mailing lists because that's not KVM code at all and we hope to get it merged in the mainline kernel soon after getting feedback on the interface from the other users of the mmu notifiers. Thanks! Andrea |
From: Anthony L. <an...@co...> - 2008-01-13 15:02:35
|
Andrea Arcangeli wrote: > Hi everyone, > > So far KVM swapping has been a limited feature. Depending on the > workloads huge chunks of the anonymous memory simulating the guest > physical memory could get pinned and unswappable for extended periods > of time. Whenever a spte mapps a host physical page, KVM has to pin > the page to prevent it to be swapped out. The page could still be > unmapped from the Linux VM ptes, it could go in swapcache, but the > boosted reference count (due to the spte pointing to the host physical > page) would prevent the page to be freed (and rightfully so). The big > difference is that the mmu notifier patch now allows KVM to know when > the the main Linux VM wants to unmap a certain host physical > page. When that happens KVM now make sure to release all sptes and to > drop the reference count of the page, so the page can finally be > swapped out for real in any case. This way the KVM task can now be > swapped out fully and at any time regardless of the guest OS activity > and regardless the size of the readonly shadow-pte cache generated by > the guest-OS. > > Last test I run on this code was to run two VM on dual core SVM host, > SMP guest (4 vcpus). The linux vm was ~400M the other VM was 3G. Host > system has 2G ram + 4G swap. > > Starting a heavy VM job both VM are swapped out quite nicely (one VM > was running my oom deadlock testcase for the linux-mm oom patches, the > other was playing a youtube video): > > andrea 9742 57.7 2.5 588536 50104 > andrea 9809 69.3 9.4 3211172 182448 > > After sigstopping both and running the same heavy VM job again I get: > > andrea 9742 42.6 0.0 588536 644 > andrea 9809 48.2 0.0 3211172 848 > > So when sigstopped less than 1M of rss remains allocated in ram. > > After sigcont and after killing the heavy VM job (that released lots > of ram and swap) both VM gracefully restarts with only swapins firing > in the host: > > andrea 9742 57.6 2.0 588536 39308 > andrea 9809 61.6 61.4 3211172 1186256 > Very cool! > No idea why the non-linux VM after a while grows back to a 1G working > set despite a single youtube playback is playing in the guest... ;), > the linux vm OTOH has only a 39M working set when idling in the oom > loops. > Perhaps the non-Linux guest has a page scrubber that runs while the system is otherwise idle. Regards, Anthony Liguori > Host must be compiled with CONFIG_MMU_NOTIFIERS=y of course, or this > won't work. > > Here the patch to kvm.git (there's some room for optimization in doing > a single tlb flush in the unmap_spte for all sptes pointing to the > page, or even more aggressively for the whole range in the > invalidate_range case, but the invalidate_range isn't an interesting > path for kvm so I guess not worth optimizing in the short/mid term, > but by optimizing the invalidate_page case we may halve the number of > tlb flushes for some common case. I leave it for later, the swapping > is heavily I/O bound anyway so a some more ipi in smp host shouldn't > be very measurable (on UP host it makes no difference to flush > multiple times in practice). 
> > Signed-off-by: Andrea Arcangeli <an...@qu...> > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig > index 4086080..c527d7d 100644 > --- a/arch/x86/kvm/Kconfig > +++ b/arch/x86/kvm/Kconfig > @@ -18,6 +18,7 @@ config KVM > tristate "Kernel-based Virtual Machine (KVM) support" > depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL > select PREEMPT_NOTIFIERS > + select MMU_NOTIFIER > select ANON_INODES > ---help--- > Support hosting fully virtualized guest machines using hardware > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index 324ff9a..103c270 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -532,6 +532,36 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) > kvm_flush_remote_tlbs(kvm); > } > > +static void unmap_spte(struct kvm *kvm, u64 *spte) > +{ > + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); > + get_page(page); > + rmap_remove(kvm, spte); > + set_shadow_pte(spte, shadow_trap_nonpresent_pte); > + kvm_flush_remote_tlbs(kvm); > + __free_page(page); > +} > + > +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn) > +{ > + unsigned long *rmapp; > + u64 *spte, *curr_spte; > + > + spin_lock(&kvm->mmu_lock); > + gfn = unalias_gfn(kvm, gfn); > + rmapp = gfn_to_rmap(kvm, gfn); > + > + spte = rmap_next(kvm, rmapp, NULL); > + while (spte) { > + BUG_ON(!(*spte & PT_PRESENT_MASK)); > + rmap_printk("rmap_swap_page: spte %p %llx\n", spte, *spte); > + curr_spte = spte; > + spte = rmap_next(kvm, rmapp, spte); > + unmap_spte(kvm, curr_spte); > + } > + spin_unlock(&kvm->mmu_lock); > +} > + > #ifdef MMU_DEBUG > static int is_empty_shadow_page(u64 *spt) > { > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 8a90403..e9a3f6e 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -3159,6 +3159,36 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) > free_page((unsigned long)vcpu->arch.pio_data); > } > > +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) > +{ > + return container_of(mn, struct kvm, mmu_notifier); > +} > + > +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, > + struct mm_struct *mm, > + unsigned long address) > +{ > + struct kvm *kvm = mmu_notifier_to_kvm(mn); > + gfn_t gfn = hva_to_gfn(kvm, address); > + BUG_ON(mm != kvm->mm); > + if (gfn == -1UL) > + return; > + kvm_rmap_unmap_gfn(kvm, gfn); > +} > + > +void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, > + struct mm_struct *mm, > + unsigned long start, unsigned long end) > +{ > + for (; start < end; start += PAGE_SIZE) > + kvm_mmu_notifier_invalidate_page(mn, mm, start); > +} > + > +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { > + .invalidate_range = kvm_mmu_notifier_invalidate_range, > + .invalidate_page = kvm_mmu_notifier_invalidate_page, > +}; > + > struct kvm *kvm_arch_create_vm(void) > { > struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); > @@ -3167,6 +3197,7 @@ struct kvm *kvm_arch_create_vm(void) > return ERR_PTR(-ENOMEM); > > INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); > + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; > > return kvm; > } > diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h > index d6db0de..feacd77 100644 > --- a/include/asm-x86/kvm_host.h > +++ b/include/asm-x86/kvm_host.h > @@ -404,6 +404,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); > int kvm_mmu_setup(struct kvm_vcpu *vcpu); > void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); > > +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn); > int 
kvm_mmu_reset_context(struct kvm_vcpu *vcpu); > void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); > void kvm_mmu_zap_all(struct kvm *kvm); > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 2714068..85da7fa 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -117,6 +117,7 @@ struct kvm { > struct kvm_io_bus pio_bus; > struct kvm_vm_stat stat; > struct kvm_arch arch; > + struct mmu_notifier mmu_notifier; > }; > > /* The guest did something we don't support. */ > @@ -163,6 +164,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, > struct kvm_memory_slot old, > int user_alloc); > gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); > +gfn_t hva_to_gfn(struct kvm *kvm, unsigned long addr); > struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); > void kvm_release_page_clean(struct page *page); > void kvm_release_page_dirty(struct page *page); > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 4295623..8f1dd86 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void) > > kvm->mm = current->mm; > atomic_inc(&kvm->mm->mm_count); > + mmu_notifier_register(&kvm->mmu_notifier, kvm->mm); > spin_lock_init(&kvm->mmu_lock); > kvm_io_bus_init(&kvm->pio_bus); > mutex_init(&kvm->lock); > @@ -454,6 +455,23 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) > return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); > } > > +gfn_t hva_to_gfn(struct kvm *kvm, unsigned long addr) > +{ > + int i; > + > + for (i = 0; i < kvm->nmemslots; i++) { > + struct kvm_memory_slot *memslot = &kvm->memslots[i]; > + unsigned long start = memslot->userspace_addr; > + unsigned long end = start + (memslot->npages << PAGE_SHIFT); > + > + if (addr >= start && addr < end) { > + gfn_t gfn_offset = (addr - start) >> PAGE_SHIFT; > + return memslot->base_gfn + gfn_offset; > + } > + } > + return -1UL; > +} > + > /* > * Requires current->mm->mmap_sem to be held > */ > > > > And here a compatibility patch to kvm-userland so the external module > still compile and runs with older kernels w/o MMU_NOTIFIER patch applied. > > Signed-off-by: Andrea Arcangeli <an...@qu...> > > diff --git a/kernel/external-module-compat.h b/kernel/external-module-compat.h > index 67b9cc4..34ef0a5 100644 > --- a/kernel/external-module-compat.h > +++ b/kernel/external-module-compat.h > @@ -17,6 +17,28 @@ > #include <linux/hrtimer.h> > #include <asm/bitops.h> > > +#ifndef CONFIG_MMU_NOTIFIER > +struct mmu_notifier; > + > +struct mmu_notifier_ops { > + void (*release)(struct mmu_notifier * mn, > + struct mm_struct *mm); > + void (*invalidate_page)(struct mmu_notifier * mn, > + struct mm_struct *mm, > + unsigned long address); > + void (*invalidate_range)(struct mmu_notifier * mn, > + struct mm_struct *mm, > + unsigned long start, unsigned long end); > +}; > + > +struct mmu_notifier { > + const struct mmu_notifier_ops *ops; > +}; > +#define mmu_notifier_register(mn, mm) do {} while(0) > +#define mmu_notifier_unregister(mn) do {} while (0) > +#define mmu_notifier_release(mm) do {} while (0) > +#endif > + > /* > * 2.6.16 does not have GFP_NOWAIT > */ > > > Here another patch for kvm-userland where I can't see symmetry between > the lack of atomic_inc despite mmdrop is still run. I can't possibly > see how this is supposedly not required when compiled into the kernel > vs external module. Either atomic_inc is needed in both or none. 
Even > if I'm right still this bug wasn't destabilizing because > atomic_inc_and_dec only fires once in the overflow check, so it > shouldn't matter to run one mmdrop more than needed, but it's good for > correctness. > > Signed-off-by: Andrea Arcangeli <an...@qu...> > > diff --git a/kernel/hack-module.awk b/kernel/hack-module.awk > index 5187c96..884bc50 100644 > --- a/kernel/hack-module.awk > +++ b/kernel/hack-module.awk > @@ -33,8 +33,6 @@ > vmx_load_host_state = 0 > } > > -/atomic_inc\(&kvm->mm->mm_count\);/ { $0 = "//" $0 } > - > /^\t\.fault = / { > fcn = gensub(/,/, "", "g", $3) > $0 = "\t.VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(" fcn ")," > > > I'll post the mmu-notifiers patch (required in the host kernel to run > the above) separately in CC with more mailing lists because that's not > KVM code at all and we hope to get it merged in the mainline kernel > soon after getting feedback on the interface from the other users of > the mmu notifiers. > > Thanks! > Andrea > > ------------------------------------------------------------------------- > Check out the new SourceForge.net Marketplace. > It's the best place to buy or sell services for > just about anything Open Source. > http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace > _______________________________________________ > kvm-devel mailing list > kvm...@li... > https://lists.sourceforge.net/lists/listinfo/kvm-devel > |
From: Marcelo T. <ma...@kv...> - 2008-01-14 13:43:07
|
Hi Andrea, > path for kvm so I guess not worth optimizing in the short/mid term, > but by optimizing the invalidate_page case we may halve the number of > tlb flushes for some common case. I leave it for later, the swapping > is heavily I/O bound anyway so a some more ipi in smp host shouldn't > be very measurable (on UP host it makes no difference to flush > multiple times in practice). > > Signed-off-by: Andrea Arcangeli <an...@qu...> > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig > index 4086080..c527d7d 100644 > --- a/arch/x86/kvm/Kconfig > +++ b/arch/x86/kvm/Kconfig > @@ -18,6 +18,7 @@ config KVM > tristate "Kernel-based Virtual Machine (KVM) support" > depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL > select PREEMPT_NOTIFIERS > + select MMU_NOTIFIER > select ANON_INODES > ---help--- > Support hosting fully virtualized guest machines using hardware > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index 324ff9a..103c270 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -532,6 +532,36 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) > kvm_flush_remote_tlbs(kvm); > } > > +static void unmap_spte(struct kvm *kvm, u64 *spte) > +{ > + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); > + get_page(page); > + rmap_remove(kvm, spte); > + set_shadow_pte(spte, shadow_trap_nonpresent_pte); > + kvm_flush_remote_tlbs(kvm); > + __free_page(page); > +} > + > +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn) > +{ > + unsigned long *rmapp; > + u64 *spte, *curr_spte; > + > + spin_lock(&kvm->mmu_lock); > + gfn = unalias_gfn(kvm, gfn); > + rmapp = gfn_to_rmap(kvm, gfn); The alias and memslot maps are protected only by mmap_sem, so you should make kvm_set_memory_region/set_memory_alias grab the mmu spinlock in addition to mmap_sem in write mode. kvm_mmu_zap_all() grabs the mmu lock.. that should probably move up into the caller. > + > + spte = rmap_next(kvm, rmapp, NULL); > + while (spte) { > + BUG_ON(!(*spte & PT_PRESENT_MASK)); > + rmap_printk("rmap_swap_page: spte %p %llx\n", spte, *spte); > + curr_spte = spte; > + spte = rmap_next(kvm, rmapp, spte); > + unmap_spte(kvm, curr_spte); > + } > + spin_unlock(&kvm->mmu_lock); > +} > + > #ifdef MMU_DEBUG > static int is_empty_shadow_page(u64 *spt) > { > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 8a90403..e9a3f6e 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -3159,6 +3159,36 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) > free_page((unsigned long)vcpu->arch.pio_data); > } > > +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) > +{ > + return container_of(mn, struct kvm, mmu_notifier); > +} > + > +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, > + struct mm_struct *mm, > + unsigned long address) > +{ > + struct kvm *kvm = mmu_notifier_to_kvm(mn); > + gfn_t gfn = hva_to_gfn(kvm, address); > + BUG_ON(mm != kvm->mm); > + if (gfn == -1UL) > + return; > + kvm_rmap_unmap_gfn(kvm, gfn); And then you also need to cover "hva_to_gfn()" to happen under the lock. 
> +} > + > +void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, > + struct mm_struct *mm, > + unsigned long start, unsigned long end) > +{ > + for (; start < end; start += PAGE_SIZE) > + kvm_mmu_notifier_invalidate_page(mn, mm, start); > +} > + > +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { > + .invalidate_range = kvm_mmu_notifier_invalidate_range, > + .invalidate_page = kvm_mmu_notifier_invalidate_page, > +}; > + > struct kvm *kvm_arch_create_vm(void) > { > struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); > @@ -3167,6 +3197,7 @@ struct kvm *kvm_arch_create_vm(void) > return ERR_PTR(-ENOMEM); > > INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); > + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; > > return kvm; > } > diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h > index d6db0de..feacd77 100644 > --- a/include/asm-x86/kvm_host.h > +++ b/include/asm-x86/kvm_host.h > @@ -404,6 +404,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); > int kvm_mmu_setup(struct kvm_vcpu *vcpu); > void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); > > +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn); > int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); > void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); > void kvm_mmu_zap_all(struct kvm *kvm); > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 2714068..85da7fa 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -117,6 +117,7 @@ struct kvm { > struct kvm_io_bus pio_bus; > struct kvm_vm_stat stat; > struct kvm_arch arch; > + struct mmu_notifier mmu_notifier; > }; > > /* The guest did something we don't support. */ > @@ -163,6 +164,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, > struct kvm_memory_slot old, > int user_alloc); > gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); > +gfn_t hva_to_gfn(struct kvm *kvm, unsigned long addr); > struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); > void kvm_release_page_clean(struct page *page); > void kvm_release_page_dirty(struct page *page); > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 4295623..8f1dd86 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void) > > kvm->mm = current->mm; > atomic_inc(&kvm->mm->mm_count); > + mmu_notifier_register(&kvm->mmu_notifier, kvm->mm); > spin_lock_init(&kvm->mmu_lock); > kvm_io_bus_init(&kvm->pio_bus); > mutex_init(&kvm->lock); > @@ -454,6 +455,23 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) > return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); > } > > +gfn_t hva_to_gfn(struct kvm *kvm, unsigned long addr) > +{ > + int i; > + > + for (i = 0; i < kvm->nmemslots; i++) { > + struct kvm_memory_slot *memslot = &kvm->memslots[i]; > + unsigned long start = memslot->userspace_addr; > + unsigned long end = start + (memslot->npages << PAGE_SHIFT); > + > + if (addr >= start && addr < end) { > + gfn_t gfn_offset = (addr - start) >> PAGE_SHIFT; > + return memslot->base_gfn + gfn_offset; > + } > + } > + return -1UL; > +} > + > /* > * Requires current->mm->mmap_sem to be held > */ > > > > And here a compatibility patch to kvm-userland so the external module > still compile and runs with older kernels w/o MMU_NOTIFIER patch applied. 
> > Signed-off-by: Andrea Arcangeli <an...@qu...> > > diff --git a/kernel/external-module-compat.h b/kernel/external-module-compat.h > index 67b9cc4..34ef0a5 100644 > --- a/kernel/external-module-compat.h > +++ b/kernel/external-module-compat.h > @@ -17,6 +17,28 @@ > #include <linux/hrtimer.h> > #include <asm/bitops.h> > > +#ifndef CONFIG_MMU_NOTIFIER > +struct mmu_notifier; > + > +struct mmu_notifier_ops { > + void (*release)(struct mmu_notifier * mn, > + struct mm_struct *mm); > + void (*invalidate_page)(struct mmu_notifier * mn, > + struct mm_struct *mm, > + unsigned long address); > + void (*invalidate_range)(struct mmu_notifier * mn, > + struct mm_struct *mm, > + unsigned long start, unsigned long end); > +}; > + > +struct mmu_notifier { > + const struct mmu_notifier_ops *ops; > +}; > +#define mmu_notifier_register(mn, mm) do {} while(0) > +#define mmu_notifier_unregister(mn) do {} while (0) > +#define mmu_notifier_release(mm) do {} while (0) > +#endif > + > /* > * 2.6.16 does not have GFP_NOWAIT > */ > > > Here another patch for kvm-userland where I can't see symmetry between > the lack of atomic_inc despite mmdrop is still run. I can't possibly > see how this is supposedly not required when compiled into the kernel > vs external module. Either atomic_inc is needed in both or none. Even > if I'm right still this bug wasn't destabilizing because > atomic_inc_and_dec only fires once in the overflow check, so it > shouldn't matter to run one mmdrop more than needed, but it's good for > correctness. > > Signed-off-by: Andrea Arcangeli <an...@qu...> > > diff --git a/kernel/hack-module.awk b/kernel/hack-module.awk > index 5187c96..884bc50 100644 > --- a/kernel/hack-module.awk > +++ b/kernel/hack-module.awk > @@ -33,8 +33,6 @@ > vmx_load_host_state = 0 > } > > -/atomic_inc\(&kvm->mm->mm_count\);/ { $0 = "//" $0 } > - > /^\t\.fault = / { > fcn = gensub(/,/, "", "g", $3) > $0 = "\t.VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(" fcn ")," > > > I'll post the mmu-notifiers patch (required in the host kernel to run > the above) separately in CC with more mailing lists because that's not > KVM code at all and we hope to get it merged in the mainline kernel > soon after getting feedback on the interface from the other users of > the mmu notifiers. > > Thanks! > Andrea > > ------------------------------------------------------------------------- > Check out the new SourceForge.net Marketplace. > It's the best place to buy or sell services for > just about anything Open Source. > http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace > _______________________________________________ > kvm-devel mailing list > kvm...@li... > https://lists.sourceforge.net/lists/listinfo/kvm-devel |
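Concretely, the locking change Marcelo asks for amounts to something like the following sketch; Andrea's revised patch further down the thread does essentially this inside __kvm_set_memory_region():

	/* Sketch: install the updated slot under mmu_lock so the mmu notifier
	 * path, which takes mmu_lock but not necessarily mmap_sem, never sees
	 * a half-updated memslot array. */
	spin_lock(&kvm->mmu_lock);
	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;
	*memslot = new;
	spin_unlock(&kvm->mmu_lock);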
From: Andrea A. <an...@qu...> - 2008-01-14 14:06:54
|
On Mon, Jan 14, 2008 at 11:45:39AM -0200, Marcelo Tosatti wrote:
> The alias and memslot maps are protected only by mmap_sem, so you

Yes, they are already protected, and furthermore in write mode.

> should make kvm_set_memory_region/set_memory_alias grab the mmu spinlock
> in addition to mmap_sem in write mode.

The mmu notifiers already hold the mmap_sem in read mode, so I don't see why I should add the mmu_lock around memslots. The mmu_lock AFAICS is only needed to serialize with other vcpu faults when updating the sptes, and I already take it there.

> And then you also need to cover "hva_to_gfn()" to happen under the lock.

hva_to_gfn only requires the mmap_sem in read mode, and that's already taken implicitly before the mmu notifiers are called. |
From: Avi K. <av...@qu...> - 2008-01-14 14:09:25
|
Marcelo Tosatti wrote: >> >> +static void unmap_spte(struct kvm *kvm, u64 *spte) >> +{ >> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); >> + get_page(page); >> + rmap_remove(kvm, spte); >> + set_shadow_pte(spte, shadow_trap_nonpresent_pte); >> + kvm_flush_remote_tlbs(kvm); >> + __free_page(page); >> +} >> + >> +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn) >> +{ >> + unsigned long *rmapp; >> + u64 *spte, *curr_spte; >> + >> + spin_lock(&kvm->mmu_lock); >> + gfn = unalias_gfn(kvm, gfn); >> + rmapp = gfn_to_rmap(kvm, gfn); >> > > The alias and memslot maps are protected only by mmap_sem, so you > should make kvm_set_memory_region/set_memory_alias grab the mmu spinlock > in addition to mmap_sem in write mode. > > kvm_mmu_zap_all() grabs the mmu lock.. that should probably move up into > the caller. > > Aren't mmu notifiers called with mmap_sem held for read? Maybe not from the swap path? -- error compiling committee.c: too many arguments to function |
From: Avi K. <av...@qu...> - 2008-01-20 15:15:59
|
Andrea Arcangeli wrote: > On Tue, Jan 15, 2008 at 05:57:03PM +0200, Avi Kivity wrote: > >> It's the same hva for two different gpas. Same functionality as the alias, >> but with less data structures. >> > > Ok but if this is already supposed to work, then I need to at least > change kvm_hva_to_rmapp not to stop when it find the first match but > to keep going to find other memslots in the same hva range. It's not > just one rmapp for each hva, but multiple rmapp for each hva. > Yes, it's supposed to work (we can't prevent userspace from doing it). -- error compiling committee.c: too many arguments to function |
From: Andrea A. <an...@qu...> - 2008-01-14 14:24:58
|
On Mon, Jan 14, 2008 at 04:09:03PM +0200, Avi Kivity wrote: > Marcelo Tosatti wrote: >>> +static void unmap_spte(struct kvm *kvm, u64 *spte) >>> +{ >>> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> >>> PAGE_SHIFT); >>> + get_page(page); >>> + rmap_remove(kvm, spte); >>> + set_shadow_pte(spte, shadow_trap_nonpresent_pte); >>> + kvm_flush_remote_tlbs(kvm); >>> + __free_page(page); >>> +} >>> + >>> +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn) >>> +{ >>> + unsigned long *rmapp; >>> + u64 *spte, *curr_spte; >>> + >>> + spin_lock(&kvm->mmu_lock); >>> + gfn = unalias_gfn(kvm, gfn); >>> + rmapp = gfn_to_rmap(kvm, gfn); >>> >> >> The alias and memslot maps are protected only by mmap_sem, so you >> should make kvm_set_memory_region/set_memory_alias grab the mmu spinlock >> in addition to mmap_sem in write mode. >> >> kvm_mmu_zap_all() grabs the mmu lock.. that should probably move up into >> the caller. >> >> > > Aren't mmu notifiers called with mmap_sem held for read? > > Maybe not from the swap path? Good point, the swap path isn't covered by the mmap_sem, so Marcelo's right I need to fixup the locking a bit. |
From: Avi K. <av...@qu...> - 2008-01-14 15:47:15
|
Andrea Arcangeli wrote:
>> Aren't mmu notifiers called with mmap_sem held for read?
>>
>> Maybe not from the swap path?
>
> Good point, the swap path isn't covered by the mmap_sem, so Marcelo's
> right I need to fixup the locking a bit.

One idea I had at one time was to add ->lock() and ->unlock() callbacks to mmu notifiers that would be called in a sleepy context. But that seems heavy-handed. Maybe it can be fixed in some clever way with rcu, or with a rwlock around the memory slot map.

--
error compiling committee.c: too many arguments to function |
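For the record, the rwlock variant Avi mentions would look roughly like this. This is a hypothetical sketch only: the memslots_lock field and the function name are made up, and the patch Andrea posts later in the thread ends up reusing kvm->mmu_lock instead of adding a new lock.

	/* Hypothetical sketch: protect kvm->memslots[] with a rwlock so the
	 * mmu notifier path, which may run without mmap_sem, can walk the
	 * slots safely.  Not what the actual patch does. */
	static void sketch_unmap_hva(struct kvm *kvm, unsigned long hva)
	{
		read_lock(&kvm->memslots_lock);		/* made-up field */
		/* ... look up the memslot covering hva, zap the rmapped sptes ... */
		read_unlock(&kvm->memslots_lock);
	}

	/* Writers (kvm_set_memory_region / set_memory_alias) would then wrap
	 * the memslot and alias updates in write_lock(&kvm->memslots_lock) /
	 * write_unlock(&kvm->memslots_lock). */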
From: Avi K. <av...@qu...> - 2008-01-22 13:37:54
|
Andrea Arcangeli wrote: > On Sun, Jan 20, 2008 at 05:16:03PM +0200, Avi Kivity wrote: > >> Yes, it's supposed to work (we can't prevent userspace from doing it). >> > > Hmm, I think we already prevent it, so I don't think I need to update > my swap code until the below is removed. > > /* Check for overlaps */ > r = -EEXIST; > for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { > struct kvm_memory_slot *s = &kvm->memslots[i]; > > if (s == memslot) > continue; > if (!((base_gfn + npages <= s->base_gfn) || > (base_gfn >= s->base_gfn + s->npages))) > goto out_free; > } > > Actually, this checks against slots with the overlapping gfns. Aliases have different gfns but same hvas. -- error compiling committee.c: too many arguments to function |
From: Andrea A. <an...@qu...> - 2008-01-22 14:56:35
|
On Tue, Jan 22, 2008 at 03:37:59PM +0200, Avi Kivity wrote:
> Andrea Arcangeli wrote:
>> On Sun, Jan 20, 2008 at 05:16:03PM +0200, Avi Kivity wrote:
>>
>>> Yes, it's supposed to work (we can't prevent userspace from doing it).
>>>
>>
>> Hmm, I think we already prevent it, so I don't think I need to update
>> my swap code until the below is removed.
>>
>> 	/* Check for overlaps */
>> 	r = -EEXIST;
>> 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
>> 		struct kvm_memory_slot *s = &kvm->memslots[i];
>>
>> 		if (s == memslot)
>> 			continue;
>> 		if (!((base_gfn + npages <= s->base_gfn) ||
>> 		      (base_gfn >= s->base_gfn + s->npages)))
>> 			goto out_free;
>> 	}
>>
>
> Actually, this checks against slots with the overlapping gfns. Aliases
> have different gfns but same hvas.

Hmm, not sure I follow: do you mean I need to change something? Aliases live in a different dimension, and I can't see how my code could ever notice or care about aliases (they have to be translated to a real hva-memslot-backed gfn before calling get_user_pages). All I care about is finding the right rmap structures for each hva. The above snippet should be enough to guarantee that there can only be one rmap structure for each hva, so when I checked yesterday, that looked like enough to show my kvm_hva_to_rmapp implementation is already correct. |
From: Avi K. <av...@qu...> - 2008-01-15 15:57:23
|
Andrea Arcangeli wrote:
> On Tue, Jan 15, 2008 at 04:40:19PM +0200, Avi Kivity wrote:
>
>> Note that we can now replace aliases with slots; simply map the same hva
>> range to two different gpa ranges (using a kvm-private memslot for backward
>> compat).
>>
>
> Currently the host physical page is a 1:1 mapping with the hva. Not
> sure how to alias the same host physical page on two different gfn by
> using two different hva. I still see an use for the alias.
>

It's the same hva for two different gpas. Same functionality as the alias, but with fewer data structures.

--
error compiling committee.c: too many arguments to function |
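To make "same hva for two different gpas" concrete, here is a rough userspace sketch. It is not from this thread: it assumes the user-allocated memory API (KVM_SET_USER_MEMORY_REGION), and the slot numbers, addresses and sizes are made up for illustration.

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/* Register one host buffer (hva) at two guest physical ranges.  With
	 * the gfn overlap check relaxed, this could replace the alias code. */
	static int map_hva_at_two_gpas(int vm_fd, void *hva, __u64 size)
	{
		struct kvm_userspace_memory_region fb = {
			.slot            = 1,
			.guest_phys_addr = 0xe0000000,	/* e.g. a framebuffer BAR */
			.memory_size     = size,
			.userspace_addr  = (unsigned long)hva,
		};
		struct kvm_userspace_memory_region vga = {
			.slot            = 2,
			.guest_phys_addr = 0xa0000,	/* e.g. the legacy VGA window */
			.memory_size     = size,
			.userspace_addr  = (unsigned long)hva,	/* same hva as above */
		};

		if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &fb) < 0)
			return -1;
		return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &vga);
	}

Avi gives the same scenario concretely later in the thread (an 8MB framebuffer slot plus a 32KB VGA window backed by the same hva), which is why the swap code eventually has to handle more than one rmap chain per hva.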
From: Andrea A. <an...@qu...> - 2008-01-21 11:37:50
|
On Sun, Jan 20, 2008 at 05:16:03PM +0200, Avi Kivity wrote:
> Yes, it's supposed to work (we can't prevent userspace from doing it).

Hmm, I think we already prevent it, so I don't think I need to update my swap code until the below is removed.

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

|
From: Avi K. <av...@qu...> - 2008-01-21 17:34:28
|
Andrea Arcangeli wrote: > On Sun, Jan 20, 2008 at 05:16:03PM +0200, Avi Kivity wrote: > >> Yes, it's supposed to work (we can't prevent userspace from doing it). >> > > Hmm, I think we already prevent it, so I don't think I need to update > my swap code until the below is removed. > > /* Check for overlaps */ > r = -EEXIST; > for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { > struct kvm_memory_slot *s = &kvm->memslots[i]; > > if (s == memslot) > continue; > if (!((base_gfn + npages <= s->base_gfn) || > (base_gfn >= s->base_gfn + s->npages))) > goto out_free; > } > Right. We will have to eventually remove it (to support aliases on non-x86), but no hurry now. -- error compiling committee.c: too many arguments to function |
From: Andrea A. <an...@qu...> - 2008-01-14 17:45:15
|
On Mon, Jan 14, 2008 at 05:43:58PM +0200, Avi Kivity wrote: > heavy handed. Maybe it can be fixed in some clever way with rcu or with a > rwlock around the memory slot map. Ok, first the alias looked superflous so I just dropped it (the whole point of the alias is to never call get_user_pages in the aliased hva, so the notifiers should never trigger in those alias gfn ranges). And the code was doing hva->gfn->rmap, while I'm now doing hva->rmap directly to avoid two loops over the memslot instead of only one. To serialize hva_to_rmapp mmap_sem is taken (in any mode) or the mmu_lock is taken, and it's enough to serialize the insertion with the mmu_lock and setting userspace_addr to 0 before insertion if the buffer isn't allocated by userland. Setting userspace_addr atomically will make the memslot visible. This is a new version of the kvm.git patch. Please Marcelo let me know if you see more problems, thanks a lot for the review! Signed-off-by: Andrea Arcangeli <an...@qu...> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4086080..c527d7d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -18,6 +18,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL select PREEMPT_NOTIFIERS + select MMU_NOTIFIER select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 324ff9a..189f3e1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -532,6 +532,38 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) kvm_flush_remote_tlbs(kvm); } +static void unmap_spte(struct kvm *kvm, u64 *spte) +{ + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + get_page(page); + rmap_remove(kvm, spte); + set_shadow_pte(spte, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(kvm); + __free_page(page); +} + +void kvm_rmap_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + unsigned long *rmapp; + u64 *spte, *curr_spte; + + spin_lock(&kvm->mmu_lock); + rmapp = kvm_hva_to_rmapp(kvm, hva); + if (!rmapp) + goto out_unlock; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); + curr_spte = spte; + spte = rmap_next(kvm, rmapp, spte); + unmap_spte(kvm, curr_spte); + } +out_unlock: + spin_unlock(&kvm->mmu_lock); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8a90403..271ce12 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3159,6 +3159,33 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) +{ + return container_of(mn, struct kvm, mmu_notifier); +} + +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + BUG_ON(mm != kvm->mm); + kvm_rmap_unmap_hva(kvm, address); +} + +void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + for (; start < end; start += PAGE_SIZE) + kvm_mmu_notifier_invalidate_page(mn, mm, start); +} + +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_range = kvm_mmu_notifier_invalidate_range, + .invalidate_page = kvm_mmu_notifier_invalidate_page, +}; + struct kvm 
*kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); @@ -3167,6 +3194,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; return kvm; } @@ -3219,14 +3247,20 @@ int kvm_arch_set_memory_region(struct kvm *kvm, */ if (!user_alloc) { if (npages && !old.rmap) { - memslot->userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0); - - if (IS_ERR((void *)memslot->userspace_addr)) - return PTR_ERR((void *)memslot->userspace_addr); + unsigned long userspace_addr; + + userspace_addr = do_mmap(NULL, 0, + npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + 0); + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); + + /* set userspace_addr atomically for kvm_hva_to_rmapp */ + spin_lock(&kvm->mmu_lock); + memslot->userspace_addr = userspace_addr; + spin_unlock(&kvm->mmu_lock); } else { if (!old.user_alloc && old.rmap) { int ret; diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index d6db0de..522028b 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -404,6 +404,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_rmap_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_zap_all(struct kvm *kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2714068..eae8734 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -117,6 +117,7 @@ struct kvm { struct kvm_io_bus pio_bus; struct kvm_vm_stat stat; struct kvm_arch arch; + struct mmu_notifier mmu_notifier; }; /* The guest did something we don't support. */ @@ -163,6 +164,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, struct kvm_memory_slot old, int user_alloc); gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); +unsigned long *kvm_hva_to_rmapp(struct kvm *kvm, unsigned long addr); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4295623..a67e38f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void) kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); + mmu_notifier_register(&kvm->mmu_notifier, kvm->mm); spin_lock_init(&kvm->mmu_lock); kvm_io_bus_init(&kvm->pio_bus); mutex_init(&kvm->lock); @@ -298,7 +299,15 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.rmap, 0, npages * sizeof(*new.rmap)); new.user_alloc = user_alloc; - new.userspace_addr = mem->userspace_addr; + /* + * hva_to_rmmap() serialzies with the mmu_lock and to be + * safe it has to ignore memslots with !user_alloc && + * !userspace_addr. 
+ */ + if (user_alloc) + new.userspace_addr = mem->userspace_addr; + else + new.userspace_addr = 0; } /* Allocate page dirty bitmap if needed */ @@ -311,14 +320,18 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.dirty_bitmap, 0, dirty_bytes); } + spin_lock(&kvm->mmu_lock); if (mem->slot >= kvm->nmemslots) kvm->nmemslots = mem->slot + 1; *memslot = new; + spin_unlock(&kvm->mmu_lock); r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); if (r) { + spin_lock(&kvm->mmu_lock); *memslot = old; + spin_unlock(&kvm->mmu_lock); goto out_free; } @@ -454,6 +467,28 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } +/* if mmap_sem isn't taken, it can be safely called with only the mmu_lock */ +unsigned long *kvm_hva_to_rmapp(struct kvm *kvm, unsigned long addr) +{ + int i; + + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end = start + (memslot->npages << PAGE_SHIFT); + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + if (addr >= start && addr < end) { + gfn_t gfn_offset = (addr - start) >> PAGE_SHIFT; + return &memslot->rmap[gfn_offset]; + } + } + return NULL; +} + /* * Requires current->mm->mmap_sem to be held */ |
From: Avi K. <av...@qu...> - 2008-01-15 14:40:16
|
Andrea Arcangeli wrote: > On Mon, Jan 14, 2008 at 05:43:58PM +0200, Avi Kivity wrote: > >> heavy handed. Maybe it can be fixed in some clever way with rcu or with a >> rwlock around the memory slot map. >> > > Ok, first the alias looked superflous so I just dropped it (the whole > point of the alias is to never call get_user_pages in the aliased hva, > so the notifiers should never trigger in those alias gfn ranges). > > Right, good point. Note that we can now replace aliases with slots; simply map the same hva range to two different gpa ranges (using a kvm-private memslot for backward compat). > And the code was doing hva->gfn->rmap, while I'm now doing hva->rmap > directly to avoid two loops over the memslot instead of only one. To > serialize hva_to_rmapp mmap_sem is taken (in any mode) or the mmu_lock > is taken, and it's enough to serialize the insertion with the mmu_lock > and setting userspace_addr to 0 before insertion if the buffer isn't > allocated by userland. Setting userspace_addr atomically will make the > memslot visible. > Can we do the locking change as a separate patch? -- error compiling committee.c: too many arguments to function |
From: Andrea A. <an...@qu...> - 2008-01-15 15:52:54
|
On Tue, Jan 15, 2008 at 04:40:19PM +0200, Avi Kivity wrote:
> Note that we can now replace aliases with slots; simply map the same hva
> range to two different gpa ranges (using a kvm-private memslot for backward
> compat).

Currently the host physical page is in a 1:1 mapping with the hva. I'm not sure how to alias the same host physical page at two different gfns by using two different hvas, so I still see a use for the alias.

> Can we do the locking change as a separate patch?

Sure, I will split it. |
From: Andrea A. <an...@qu...> - 2008-01-22 17:18:07
|
On Tue, Jan 22, 2008 at 06:17:38PM +0200, Avi Kivity wrote: > There can be more than one rmapp per hva. Real world example: > > memslot 1: gfn range 0xe000000 - 0xe0800000 @ hva 0x10000000 (8MB > framebuffer) > memslot 2: gfn range 0xa0000 - 0xa8000 @ hva 0x10000000 (32KB VGA window) > > If the guest accesses gfn 0xa0000 through one gva, and gfn 0xe0000000 > through a second gva, then you will have two rmap chains for hva > 0x10000000. > > This doesn't happen today because we use the alias mechanism in qemu, but > we don't forbid it either. Ok then it's better to be sure there will not be problems if alias are removed. This should work fine with multiple rmap chains per hva too. diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4086080..c527d7d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -18,6 +18,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL select PREEMPT_NOTIFIERS + select MMU_NOTIFIER select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb62ef6..a025fde 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -532,6 +532,57 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) kvm_flush_remote_tlbs(kvm); } +static void unmap_spte(struct kvm *kvm, u64 *spte) +{ + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + get_page(page); + rmap_remove(kvm, spte); + set_shadow_pte(spte, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(kvm); + __free_page(page); +} + +static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte, *curr_spte; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); + curr_spte = spte; + spte = rmap_next(kvm, rmapp, spte); + unmap_spte(kvm, curr_spte); + } +} + +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + int i; + + /* + * If mmap_sem isn't taken, we can look the memslots with only + * the mmu_lock by skipping over the slots with userspace_addr == 0. 
+ */ + spin_lock(&kvm->mmu_lock); + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]); + } + } + spin_unlock(&kvm->mmu_lock); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f94a0b..5c445dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3167,6 +3167,35 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) +{ + return container_of(mn, struct kvm, mmu_notifier); +} + +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + BUG_ON(mm != kvm->mm); + kvm_unmap_hva(kvm, address); +} + +void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + for (; start < end; start += PAGE_SIZE) + kvm_mmu_notifier_invalidate_page(mn, mm, start); +} + +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_range = kvm_mmu_notifier_invalidate_range, + .invalidate_page = kvm_mmu_notifier_invalidate_page, + /* age page will drop the spte so follow_page will set the young bit */ + .age_page = kvm_mmu_notifier_invalidate_page, +}; + struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); @@ -3175,6 +3204,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; return kvm; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index d6db0de..f13d1c3 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -404,6 +404,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_zap_all(struct kvm *kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ea4764b..51c9bb8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -118,6 +118,7 @@ struct kvm { struct kvm_io_bus pio_bus; struct kvm_vm_stat stat; struct kvm_arch arch; + struct mmu_notifier mmu_notifier; }; /* The guest did something we don't support. */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4295623..9e229ac 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void) kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); + mmu_notifier_register(&kvm->mmu_notifier, kvm->mm); spin_lock_init(&kvm->mmu_lock); kvm_io_bus_init(&kvm->pio_bus); mutex_init(&kvm->lock); |
From: Andrea A. <an...@qu...> - 2008-01-22 20:04:03
|
This last update works against mmu-notifiers #v4; it makes the accessed bitflag in the spte visible to the Linux VM, so the host gets accurate working set detection without requiring vmexits.

Signed-off-by: Andrea Arcangeli <an...@qu...>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4086080..c527d7d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -18,6 +18,7 @@ config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
        select PREEMPT_NOTIFIERS
+       select MMU_NOTIFIER
        select ANON_INODES
        ---help---
          Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c85b904..adb20de 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -532,6 +532,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
        kvm_flush_remote_tlbs(kvm);
 }
 
+static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
+{
+       struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+       get_page(page);
+       rmap_remove(kvm, spte);
+       set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+       kvm_flush_remote_tlbs(kvm);
+       __free_page(page);
+}
+
+static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+       u64 *spte, *curr_spte;
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               BUG_ON(!(*spte & PT_PRESENT_MASK));
+               rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+               curr_spte = spte;
+               spte = rmap_next(kvm, rmapp, spte);
+               kvm_unmap_spte(kvm, curr_spte);
+       }
+}
+
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+       int i;
+
+       /*
+        * If mmap_sem isn't taken, we can look the memslots with only
+        * the mmu_lock by skipping over the slots with userspace_addr == 0.
+        */
+       spin_lock(&kvm->mmu_lock);
+       for (i = 0; i < kvm->nmemslots; i++) {
+               struct kvm_memory_slot *memslot = &kvm->memslots[i];
+               unsigned long start = memslot->userspace_addr;
+               unsigned long end;
+
+               /* mmu_lock protects userspace_addr */
+               if (!start)
+                       continue;
+
+               end = start + (memslot->npages << PAGE_SHIFT);
+               if (hva >= start && hva < end) {
+                       gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+                       kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
+               }
+       }
+       spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+       u64 *spte;
+       int young = 0;
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               int _young;
+               u64 _spte = *spte;
+               BUG_ON(!(_spte & PT_PRESENT_MASK));
+               _young = _spte & PT_ACCESSED_MASK;
+               if (_young) {
+                       young = !!_young;
+                       set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+               }
+               spte = rmap_next(kvm, rmapp, spte);
+       }
+       return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       int i;
+       int young = 0;
+
+       /*
+        * If mmap_sem isn't taken, we can look the memslots with only
+        * the mmu_lock by skipping over the slots with userspace_addr == 0.
+        */
+       spin_lock(&kvm->mmu_lock);
+       for (i = 0; i < kvm->nmemslots; i++) {
+               struct kvm_memory_slot *memslot = &kvm->memslots[i];
+               unsigned long start = memslot->userspace_addr;
+               unsigned long end;
+
+               /* mmu_lock protects userspace_addr */
+               if (!start)
+                       continue;
+
+               end = start + (memslot->npages << PAGE_SHIFT);
+               if (hva >= start && hva < end) {
+                       gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+                       young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+               }
+       }
+       spin_unlock(&kvm->mmu_lock);
+
+       if (young)
+               kvm_flush_remote_tlbs(kvm);
+
+       return young;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8f94a0b..35bb114 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3167,6 +3167,43 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
        free_page((unsigned long)vcpu->arch.pio_data);
 }
 
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+       return container_of(mn, struct kvm, mmu_notifier);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+                                     struct mm_struct *mm,
+                                     unsigned long address)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       BUG_ON(mm != kvm->mm);
+       kvm_unmap_hva(kvm, address);
+}
+
+void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+                                      struct mm_struct *mm,
+                                      unsigned long start, unsigned long end)
+{
+       for (; start < end; start += PAGE_SIZE)
+               kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+int kvm_mmu_notifier_age_page(struct mmu_notifier *mn,
+                             struct mm_struct *mm,
+                             unsigned long address)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       BUG_ON(mm != kvm->mm);
+       return kvm_age_hva(kvm, address);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+       .invalidate_range = kvm_mmu_notifier_invalidate_range,
+       .invalidate_page = kvm_mmu_notifier_invalidate_page,
+       .age_page = kvm_mmu_notifier_age_page,
+};
+
 struct kvm *kvm_arch_create_vm(void)
 {
        struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3175,6 +3212,7 @@ struct kvm *kvm_arch_create_vm(void)
                return ERR_PTR(-ENOMEM);
 
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+       kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 
        return kvm;
 }
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d6db0de..18496e0 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -404,6 +404,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea4764b..51c9bb8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -118,6 +118,7 @@ struct kvm {
        struct kvm_io_bus pio_bus;
        struct kvm_vm_stat stat;
        struct kvm_arch arch;
+       struct mmu_notifier mmu_notifier;
 };
 
 /* The guest did something we don't support. */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4295623..b5454d1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void)
 
        kvm->mm = current->mm;
        atomic_inc(&kvm->mm->mm_count);
+       mmu_notifier_register(&kvm->mmu_notifier, kvm->mm);
        spin_lock_init(&kvm->mmu_lock);
        kvm_io_bus_init(&kvm->pio_bus);
        mutex_init(&kvm->lock);
|
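The core of the aging path above is a test-and-clear of the accessed bit in each spte (kvm_age_rmapp), so the guest's own memory accesses feed the host's working-set scan without a single vmexit. The following stand-alone sketch models just that operation; the names, bit positions and the userspace harness are illustrative stand-ins, not the real KVM or linux-mm code:

#include <stdint.h>
#include <stdio.h>

/* Illustrative model of spte aging: bit positions are placeholders, not the
 * real PT_PRESENT_MASK / PT_ACCESSED_MASK values. */
#define SPTE_PRESENT  (1ull << 0)
#define SPTE_ACCESSED (1ull << 5)

/* Test-and-clear the accessed bit, the same operation kvm_age_rmapp() does on
 * a real spte: report "young" if the guest touched the page since the last
 * scan, and clear the bit so the next scan starts fresh. */
static int age_spte(uint64_t *spte)
{
        int young = !!(*spte & SPTE_ACCESSED);

        if (young)
                *spte &= ~SPTE_ACCESSED;
        return young;
}

int main(void)
{
        uint64_t spte = SPTE_PRESENT | SPTE_ACCESSED;   /* guest touched the page */

        printf("first scan:  young=%d\n", age_spte(&spte));  /* prints 1 */
        printf("second scan: young=%d\n", age_spte(&spte));  /* prints 0: idle since */
        return 0;
}

In the real patch the hardware sets the accessed bit when the guest touches the page, and kvm_age_hva() additionally flushes the remote TLBs when any spte was found young, since clearing the bit without a flush could let later accesses go unnoticed.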
From: Avi K. <av...@qu...> - 2008-01-22 16:17:40
|
Andrea Arcangeli wrote:
> On Tue, Jan 22, 2008 at 03:37:59PM +0200, Avi Kivity wrote:
>
>> Andrea Arcangeli wrote:
>>
>>> On Sun, Jan 20, 2008 at 05:16:03PM +0200, Avi Kivity wrote:
>>>
>>>> Yes, it's supposed to work (we can't prevent userspace from doing it).
>>>>
>>> Hmm, I think we already prevent it, so I don't think I need to update
>>> my swap code until the below is removed.
>>>
>>>     /* Check for overlaps */
>>>     r = -EEXIST;
>>>     for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
>>>             struct kvm_memory_slot *s = &kvm->memslots[i];
>>>
>>>             if (s == memslot)
>>>                     continue;
>>>             if (!((base_gfn + npages <= s->base_gfn) ||
>>>                   (base_gfn >= s->base_gfn + s->npages)))
>>>                     goto out_free;
>>>     }
>>>
>> Actually, this checks against slots with overlapping gfns. Aliases
>> have different gfns but the same hvas.
>>
> Hmm, not sure I follow; do you mean I need to change something?
> Aliases live in a different dimension, and I can't see how my code
> could ever notice or care about aliases (they have to be translated to
> a real hva-memslot-backed gfn before calling get_user_pages). All I
> care about is finding the right rmap structures for each hva. The
> above snippet should be enough to guarantee that there can only be one
> rmap structure for each hva, so when I checked yesterday that looked
> enough to prove my kvm_hva_to_rmapp implementation already correct.
>
There can be more than one rmapp per hva. Real world example:

  memslot 1: gfn range 0xe0000000 - 0xe0800000 @ hva 0x10000000 (8MB framebuffer)
  memslot 2: gfn range 0xa0000 - 0xa8000 @ hva 0x10000000 (32KB VGA window)

If the guest accesses gfn 0xa0000 through one gva, and gfn 0xe0000000 through a
second gva, then you will have two rmap chains for hva 0x10000000. This doesn't
happen today because we use the alias mechanism in qemu, but we don't forbid it
either.

--
error compiling committee.c: too many arguments to function
|
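A tiny self-contained sketch of the layout Avi describes, showing why an hva-indexed lookup cannot stop at the first hit. The struct and values are simplified stand-ins, not the actual KVM memslot definitions:

#include <stdio.h>

/* Simplified stand-in for a memslot: guest-physical base, size in bytes, and
 * the userspace virtual address (hva) the slot is backed by. */
struct slot {
        const char   *name;
        unsigned long base_gpa;
        unsigned long size;
        unsigned long userspace_addr;
};

static struct slot slots[] = {
        { "framebuffer", 0xe0000000UL, 8 << 20,  0x10000000UL },
        { "vga window",  0x000a0000UL, 32 << 10, 0x10000000UL },
};

int main(void)
{
        unsigned long hva = 0x10000000UL;
        int i, matches = 0;

        /* the same hva falls inside both slots, so a lookup that returns
         * only the first match would miss the second rmap chain */
        for (i = 0; i < 2; i++) {
                struct slot *s = &slots[i];
                if (hva >= s->userspace_addr && hva < s->userspace_addr + s->size) {
                        printf("hva %#lx hits %s (gpa %#lx)\n",
                               hva, s->name, s->base_gpa + (hva - s->userspace_addr));
                        matches++;
                }
        }
        printf("%d rmap chains would need unmapping\n", matches);
        return 0;
}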
From: Andrea A. <an...@qu...> - 2008-01-15 16:09:38
|
On Tue, Jan 15, 2008 at 05:57:03PM +0200, Avi Kivity wrote:
> It's the same hva for two different gpas. Same functionality as the alias,
> but with fewer data structures.

Ok, but if this is already supposed to work, then I need to at least change
kvm_hva_to_rmapp not to stop when it finds the first match, but to keep going
and find the other memslots covering the same hva range. It's not just one
rmapp for each hva, but potentially multiple rmapps for each hva.
|
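That is in fact the shape the 22 Jan patch earlier in the thread ends up with: there is no single-result kvm_hva_to_rmapp() any more; kvm_unmap_hva() and kvm_age_hva() walk every memslot and handle each matching rmapp without breaking out of the loop. A minimal sketch of that control flow, using simplified hypothetical types rather than the real KVM structures:

/* Sketch only: simplified stand-ins for the KVM types.  The point is the
 * control flow -- no early return, every slot covering the hva is visited
 * and its rmap chain is handed to the caller's handler (unmap or age). */
struct sketch_slot {
        unsigned long userspace_addr;   /* 0 means "slot not populated yet" */
        unsigned long npages;
        unsigned long *rmap;            /* one rmap head per page in the slot */
};

#define SKETCH_PAGE_SHIFT 12

void for_each_rmapp_of_hva(struct sketch_slot *slots, int nslots,
                           unsigned long hva,
                           void (*handler)(unsigned long *rmapp))
{
        int i;

        for (i = 0; i < nslots; i++) {
                struct sketch_slot *s = &slots[i];
                unsigned long start = s->userspace_addr;
                unsigned long end = start + (s->npages << SKETCH_PAGE_SHIFT);

                if (!start)
                        continue;               /* skip unpopulated slots */
                if (hva >= start && hva < end)
                        handler(&s->rmap[(hva - start) >> SKETCH_PAGE_SHIFT]);
                /* no break: another slot may cover the same hva */
        }
}

With this shape, both rmap chains from the framebuffer/VGA-window example above get processed for the shared hva, which is exactly the behaviour the alias-free double mapping requires.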