From: Kay, A. M <all...@in...> - 2008-05-05 21:36:34
|
Kvm kernel changes. Signed-off-by: Allen M Kay <all...@in...> ------ arch/x86/kvm/Makefile | 2 arch/x86/kvm/vtd.c | 183 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 7 + include/asm-x86/kvm_host.h | 3 include/asm-x86/kvm_para.h | 1 include/linux/kvm_host.h | 6 + virt/kvm/kvm_main.c | 3 7 files changed, 204 insertions(+), 1 deletion(-) ------ diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c97d35c..b1057fb 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ i8254.o obj-$(CONFIG_KVM) += kvm.o -kvm-intel-objs = vmx.o +kvm-intel-objs = vmx.o vtd.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o kvm-amd-objs = svm.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c new file mode 100644 index 0000000..9a080b5 --- /dev/null +++ b/arch/x86/kvm/vtd.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Author: Allen M. 
Kay <all...@in...> + * Author: Weidong Han <wei...@in...> + */ + +#include <linux/list.h> +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/intel-iommu.h> + +//#define DEBUG + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 + +struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu); +void iommu_free_domain(struct dmar_domain *domain); +int domain_init(struct dmar_domain *domain, int guest_width); +int domain_context_mapping(struct dmar_domain *d, + struct pci_dev *pdev); +int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, + u64 hpa, size_t size, int prot); +void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn); +struct dmar_domain * find_domain(struct pci_dev *pdev); + + +int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + unsigned long gpa; + struct page *page; + hpa_t hpa; + int j, write; + struct vm_area_struct *vma; + + if (!kvm->arch.domain) + return 1; + + gpa = base_gfn << PAGE_SHIFT; + page = gfn_to_page(kvm, base_gfn); + hpa = page_to_phys(page); + + printk(KERN_DEBUG "kvm_iommu_map_page: gpa = %lx\n", gpa); + printk(KERN_DEBUG "kvm_iommu_map_page: hpa = %llx\n", hpa); + printk(KERN_DEBUG "kvm_iommu_map_page: size = %lx\n", + npages*PAGE_SIZE); + + for (j = 0; j < npages; j++) { + gpa += PAGE_SIZE; + page = gfn_to_page(kvm, gpa >> PAGE_SHIFT); + hpa = page_to_phys(page); + domain_page_mapping(kvm->arch.domain, gpa, hpa, PAGE_SIZE, + DMA_PTE_READ | DMA_PTE_WRITE); + vma = find_vma(current->mm, gpa); + if (!vma) + return 1; + write = (vma->vm_flags & VM_WRITE) != 0; + get_user_pages(current, current->mm, gpa, + PAGE_SIZE, write, 0, NULL, NULL); + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_iommu_map_pages); + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, status; + for (i = 0; i < kvm->nmemslots; i++) { + status = kvm_iommu_map_pages(kvm, 
kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + if (status) + return status; + } + return 0; +} + +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_pci_passthrough_dev *pci_pt_dev) +{ + struct dmar_drhd_unit *drhd; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct pci_dev *pdev = NULL; + + printk(KERN_DEBUG "kvm_iommu_map_guest: host bdf = %x:%x:%x\n", + pci_pt_dev->host.busnr, + PCI_SLOT(pci_pt_dev->host.devfn), + PCI_FUNC(pci_pt_dev->host.devfn)); + + for_each_pci_dev(pdev) { + if ((pdev->bus->number == pci_pt_dev->host.busnr) && + (pdev->devfn == pci_pt_dev->host.devfn)) + goto found; + } + goto not_found; +found: + pci_pt_dev->pdev = pdev; + + drhd = dmar_find_matched_drhd_unit(pdev); + if (!drhd) { + printk(KERN_ERR "kvm_iommu_map_guest: drhd == NULL\n"); + goto not_found; + } + + printk(KERN_DEBUG "kvm_iommu_map_guest: reg_base_addr = %llx\n", + drhd->reg_base_addr); + + iommu = drhd->iommu; + if (!iommu) { + printk(KERN_ERR "kvm_iommu_map_guest: iommu == NULL\n"); + goto not_found; + } + domain = iommu_alloc_domain(iommu); + if (!domain) { + printk(KERN_ERR "kvm_iommu_map_guest: domain == NULL\n"); + goto not_found; + } + if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + printk(KERN_ERR "kvm_iommu_map_guest: domain_init() failed\n"); + goto not_found; + } + kvm->arch.domain = domain; + kvm_iommu_map_memslots(kvm); + domain_context_mapping(kvm->arch.domain, pdev); + return 0; +not_found: + return 1; +} +EXPORT_SYMBOL_GPL(kvm_iommu_map_guest); + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct dmar_domain *domain; + struct kvm_pci_pt_dev_list *entry; + struct pci_dev *pdev; + + list_for_each_entry(entry, &kvm->arch.domain->devices, list) { + printk(KERN_DEBUG "kvm_iommu_unmap_guest: %x:%x:%x\n", + entry->pt_dev.host.busnr, + PCI_SLOT(entry->pt_dev.host.devfn), + PCI_FUNC(entry->pt_dev.host.devfn)); + + pdev = entry->pt_dev.pdev; + + if (pdev == NULL) { + printk("kvm_iommu_unmap_guest: pdev == NULL\n"); + return 
1; + } + + /* detach kvm dmar domain */ + detach_domain_for_dev(kvm->arch.domain, + pdev->bus->number, pdev->devfn); + + /* now restore back linux iommu domain */ + domain = find_domain(pdev); + if (domain) + domain_context_mapping(domain, pdev); + else + printk(KERN_DEBUG + "kvm_iommu_unmap_guest: domain == NULL\n"); + } + /* unmap guest memory in vt-d page table */ + iommu_free_domain(kvm->arch.domain); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_iommu_unmap_guest); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a97d2e2..a877db2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -257,6 +257,7 @@ static void kvm_free_pci_passthrough(struct kvm *kvm) list_del(&pci_pt_dev->list); } + kvm->arch.domain = NULL; } unsigned long segment_base(u16 selector) @@ -1846,6 +1847,10 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&pci_pt_dev, argp, sizeof pci_pt_dev)) goto out; + r = kvm_iommu_map_guest(kvm, &pci_pt_dev); + if (r) + goto out; + r = kvm_vm_ioctl_pci_pt_dev(kvm, &pci_pt_dev); if (r) goto out; @@ -4088,6 +4093,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + if (kvm->arch.domain) + kvm_iommu_unmap_guest(kvm); kvm_free_pci_passthrough(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 4662d49..70248cb 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -19,6 +19,8 @@ #include <linux/kvm_types.h> #include <asm/desc.h> +#include <linux/dmar.h> +#include <linux/intel-iommu.h> #define KVM_MAX_VCPUS 16 #define KVM_MEMORY_SLOTS 32 @@ -318,6 +320,7 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head pci_pt_dev_head; + struct dmar_domain *domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index 5f93b78..6202ed1 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h @@ 
-170,5 +170,6 @@ struct kvm_pci_pt_info { struct kvm_pci_passthrough_dev { struct kvm_pci_pt_info guest; struct kvm_pci_pt_info host; + struct pci_dev *pdev; /* kernel device pointer for host dev */ }; #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e16682..bcfcf78 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -276,6 +276,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); +int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, + unsigned long npages); +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_pci_passthrough_dev *pci_pt_dev); +int kvm_iommu_unmap_guest(struct kvm *kvm); + static inline void kvm_guest_enter(void) { account_system_vtime(current); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d3cb4cc..e46614a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -309,6 +309,9 @@ int __kvm_set_memory_region(struct kvm *kvm, new.npages = npages; new.flags = mem->flags; + /* map the pages in iommu page table */ + kvm_iommu_map_pages(kvm, base_gfn, npages); + /* Disallow changing a memory slot's size. */ r = -EINVAL; if (npages && old.npa |
From: Amit S. <ami...@qu...> - 2008-05-06 07:36:04
|
On Tuesday 06 May 2008 03:06:23 Kay, Allen M wrote: > Kvm kernel changes. > > Signed-off-by: Allen M Kay <all...@in...> > --- /dev/null > +++ b/arch/x86/kvm/vtd.c > @@ -0,0 +1,183 @@ > + > +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 > + > +struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev > *dev); > +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu); > +void iommu_free_domain(struct dmar_domain *domain); > +int domain_init(struct dmar_domain *domain, int guest_width); > +int domain_context_mapping(struct dmar_domain *d, > + struct pci_dev *pdev); > +int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, > + u64 hpa, size_t size, int prot); > +void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 > devfn); > +struct dmar_domain * find_domain(struct pci_dev *pdev); Please move these to a .h file and also prefix appropriate keywords: domain_context_mapping is confusing and since it's an intel iommu-only thing, use something like intel_iommu_domain_context_mapping > +int kvm_iommu_map_pages(struct kvm *kvm, > + gfn_t base_gfn, unsigned long npages) > +{ > + unsigned long gpa; > + struct page *page; > + hpa_t hpa; > + int j, write; > + struct vm_area_struct *vma; > + > + if (!kvm->arch.domain) > + return 1; > + > + gpa = base_gfn << PAGE_SHIFT; > + page = gfn_to_page(kvm, base_gfn); > + hpa = page_to_phys(page); > + > + printk(KERN_DEBUG "kvm_iommu_map_page: gpa = %lx\n", gpa); > + printk(KERN_DEBUG "kvm_iommu_map_page: hpa = %llx\n", hpa); > + printk(KERN_DEBUG "kvm_iommu_map_page: size = %lx\n", > + npages*PAGE_SIZE); > + > + for (j = 0; j < npages; j++) { > + gpa += PAGE_SIZE; > + page = gfn_to_page(kvm, gpa >> PAGE_SHIFT); > + hpa = page_to_phys(page); > + domain_page_mapping(kvm->arch.domain, gpa, hpa, > PAGE_SIZE, > + DMA_PTE_READ | DMA_PTE_WRITE); > + vma = find_vma(current->mm, gpa); > + if (!vma) > + return 1; * > + write = (vma->vm_flags & VM_WRITE) != 0; > + get_user_pages(current, current->mm, gpa, > + 
PAGE_SIZE, write, 0, NULL, NULL); You should put_page each of the user pages when freeing or exiting (in unmap_guest), else a ref is held on each page and that's a lot of memory leaked. Also, this rules out any form of guest swapping. You should put_page in case a balloon driver in the guest tries to free some pages for the host. > + } > + return 0; > +} > +EXPORT_SYMBOL_GPL(kvm_iommu_map_pages); > + > +static int kvm_iommu_map_memslots(struct kvm *kvm) > +{ > + int i, status; > + for (i = 0; i < kvm->nmemslots; i++) { > + status = kvm_iommu_map_pages(kvm, > kvm->memslots[i].base_gfn, > + kvm->memslots[i].npages); > + if (status) > + return status; * > + } > + return 0; > +} > + > +int kvm_iommu_map_guest(struct kvm *kvm, > + struct kvm_pci_passthrough_dev *pci_pt_dev) > +{ > + struct dmar_drhd_unit *drhd; > + struct dmar_domain *domain; > + struct intel_iommu *iommu; > + struct pci_dev *pdev = NULL; > + > + printk(KERN_DEBUG "kvm_iommu_map_guest: host bdf = %x:%x:%x\n", > + pci_pt_dev->host.busnr, > + PCI_SLOT(pci_pt_dev->host.devfn), > + PCI_FUNC(pci_pt_dev->host.devfn)); > + > + for_each_pci_dev(pdev) { > + if ((pdev->bus->number == pci_pt_dev->host.busnr) && > + (pdev->devfn == pci_pt_dev->host.devfn)) > + goto found; > + } You can use pci_get_device instead of going through the list yourself. 
> + goto not_found; > +found: > + pci_pt_dev->pdev = pdev; > + > + drhd = dmar_find_matched_drhd_unit(pdev); > + if (!drhd) { > + printk(KERN_ERR "kvm_iommu_map_guest: drhd == NULL\n"); > + goto not_found; > + } > + > + printk(KERN_DEBUG "kvm_iommu_map_guest: reg_base_addr = %llx\n", > + drhd->reg_base_addr); > + > + iommu = drhd->iommu; > + if (!iommu) { > + printk(KERN_ERR "kvm_iommu_map_guest: iommu == NULL\n"); > + goto not_found; > + } > + domain = iommu_alloc_domain(iommu); > + if (!domain) { > + printk(KERN_ERR "kvm_iommu_map_guest: domain == > NULL\n"); > + goto not_found; > + } > + if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { > + printk(KERN_ERR "kvm_iommu_map_guest: domain_init() > failed\n"); > + goto not_found; Memory allocated in iommu_alloc_domain is leaked in this case > + } > + kvm->arch.domain = domain; > + kvm_iommu_map_memslots(kvm); *: You don't check for failure in mapping > + domain_context_mapping(kvm->arch.domain, pdev); > + return 0; > +not_found: > + return 1; > +} > +EXPORT_SYMBOL_GPL(kvm_iommu_map_guest); > + > +int kvm_iommu_unmap_guest(struct kvm *kvm) > +{ > + struct dmar_domain *domain; > + struct kvm_pci_pt_dev_list *entry; > + struct pci_dev *pdev; > + > + list_for_each_entry(entry, &kvm->arch.domain->devices, list) { > + printk(KERN_DEBUG "kvm_iommu_unmap_guest: %x:%x:%x\n", > + entry->pt_dev.host.busnr, > + PCI_SLOT(entry->pt_dev.host.devfn), > + PCI_FUNC(entry->pt_dev.host.devfn)); > + > + pdev = entry->pt_dev.pdev; > + > + if (pdev == NULL) { > + printk("kvm_iommu_unmap_guest: pdev == NULL\n"); > + return 1; > + } > + > + /* detach kvm dmar domain */ > + detach_domain_for_dev(kvm->arch.domain, > + pdev->bus->number, pdev->devfn); > + > + /* now restore back linux iommu domain */ > + domain = find_domain(pdev); > + if (domain) > + domain_context_mapping(domain, pdev); > + else > + printk(KERN_DEBUG > + "kvm_iommu_unmap_guest: domain == > NULL\n"); > + } > + /* unmap guest memory in vt-d page table */ > + 
iommu_free_domain(kvm->arch.domain); > + return 0; > +} > +EXPORT_SYMBOL_GPL(kvm_iommu_unmap_guest); > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index a97d2e2..a877db2 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -257,6 +257,7 @@ static void kvm_free_pci_passthrough(struct kvm > *kvm) > > list_del(&pci_pt_dev->list); > } > + kvm->arch.domain = NULL; > } > > unsigned long segment_base(u16 selector) > @@ -1846,6 +1847,10 @@ long kvm_arch_vm_ioctl(struct file *filp, > if (copy_from_user(&pci_pt_dev, argp, sizeof > pci_pt_dev)) > goto out; > > + r = kvm_iommu_map_guest(kvm, &pci_pt_dev); > + if (r) > + goto out; > + > r = kvm_vm_ioctl_pci_pt_dev(kvm, &pci_pt_dev); > if (r) > goto out; If the ioctl fails, you don't "unmap" the guest (and also leak memory). I suggest you call the map guest routine after the ioctl since the ioctl also checks if a similar device has already been added and populates the structure. In that case, you should call kvm_free_pci_passthrough() on unsuccessful iommu mapping Looks like you're leaking memory and failing to unmap or free the domain in all the error paths. > @@ -4088,6 +4093,8 @@ static void kvm_free_vcpus(struct kvm *kvm) > > void kvm_arch_destroy_vm(struct kvm *kvm) > { > + if (kvm->arch.domain) > + kvm_iommu_unmap_guest(kvm); This condition can be checked for inside the unmap guest routine. iommu_unmap_guest can be called unconditionally. 
> kvm_free_pci_passthrough(kvm); > kvm_free_pit(kvm); > kfree(kvm->arch.vpic); > diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h > index 4662d49..70248cb 100644 > --- a/include/asm-x86/kvm_host.h > +++ b/include/asm-x86/kvm_host.h > @@ -19,6 +19,8 @@ > #include <linux/kvm_types.h> > > #include <asm/desc.h> > +#include <linux/dmar.h> > +#include <linux/intel-iommu.h> > > #define KVM_MAX_VCPUS 16 > #define KVM_MEMORY_SLOTS 32 > @@ -318,6 +320,7 @@ struct kvm_arch{ > */ > struct list_head active_mmu_pages; > struct list_head pci_pt_dev_head; > + struct dmar_domain *domain; Use a descriptive name, like intel_iommu_domain. > struct kvm_pic *vpic; > struct kvm_ioapic *vioapic; > struct kvm_pit *vpit; > diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h > index 5f93b78..6202ed1 100644 > --- a/include/asm-x86/kvm_para.h > +++ b/include/asm-x86/kvm_para.h > @@ -170,5 +170,6 @@ struct kvm_pci_pt_info { > struct kvm_pci_passthrough_dev { > struct kvm_pci_pt_info guest; > struct kvm_pci_pt_info host; > + struct pci_dev *pdev; /* kernel device pointer for host dev > */ > }; > #endif > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 4e16682..bcfcf78 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -276,6 +276,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v); > int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); > void kvm_vcpu_kick(struct kvm_vcpu *vcpu); > > +int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, > + unsigned long npages); > +int kvm_iommu_map_guest(struct kvm *kvm, > + struct kvm_pci_passthrough_dev *pci_pt_dev); > +int kvm_iommu_unmap_guest(struct kvm *kvm); > + Why can't these be put in asm-x86/kvm_host.h? 
> static inline void kvm_guest_enter(void) > { > account_system_vtime(current); > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index d3cb4cc..e46614a 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -309,6 +309,9 @@ int __kvm_set_memory_region(struct kvm *kvm, > new.npages = npages; > new.flags = mem->flags; > > + /* map the pages in iommu page table */ (if one exists) > + kvm_iommu_map_pages(kvm, base_gfn, npages); > + You should do this once all the memory is set up properly (which is just before returning 0) > /* Disallow changing a memory slot's size. */ > r = -EINVAL; > if (npages && old.npa Amit |
From: Kay, A. M <all...@in...> - 2008-05-06 22:01:49
|
>> + >> +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 >> + >> +struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev >> *dev); >> +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu); >> +void iommu_free_domain(struct dmar_domain *domain); >> +int domain_init(struct dmar_domain *domain, int guest_width); >> +int domain_context_mapping(struct dmar_domain *d, >> + struct pci_dev *pdev); >> +int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, >> + u64 hpa, size_t size, int prot); >> +void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 >> devfn); >> +struct dmar_domain * find_domain(struct pci_dev *pdev); > >Please move these to a .h file and also prefix appropriate keywords: > >domain_context_mapping is confusing and since it's an intel >iommu-only thing, >use something like > >intel_iommu_domain_context_mapping > These functions currently are just direct calls into existing functions in drivers/pci/intel-iommu.c - hence the lack of more descriptive name in KVM environment. To get more relevant names in KVM environment, we can either create wrappers for these functions or use an iommu function table. Allen |
From: Avi K. <av...@qu...> - 2008-05-06 10:34:33
|
Kay, Allen M wrote: > + > +#include <linux/list.h> > +#include <linux/kvm_host.h> > +#include <linux/pci.h> > +#include <linux/dmar.h> > +#include <linux/intel-iommu.h> > + > +//#define DEBUG > + > +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 > The name "domain" is too generic; please use dma_domain or io_domain or something similar. > +static int kvm_iommu_map_memslots(struct kvm *kvm) > +{ > + int i, status; > + for (i = 0; i < kvm->nmemslots; i++) { > + status = kvm_iommu_map_pages(kvm, > kvm->memslots[i].base_gfn, > + kvm->memslots[i].npages); > + if (status) > + return status; > Need to undo in case of partial completion. > diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h > index 5f93b78..6202ed1 100644 > --- a/include/asm-x86/kvm_para.h > +++ b/include/asm-x86/kvm_para.h > @@ -170,5 +170,6 @@ struct kvm_pci_pt_info { > struct kvm_pci_passthrough_dev { > struct kvm_pci_pt_info guest; > struct kvm_pci_pt_info host; > + struct pci_dev *pdev; /* kernel device pointer for host dev > */ > This should be stored somewhere private (not sure, but I think kvm_pci_passthrough_dev is a public interface). -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Anthony L. <an...@co...> - 2008-05-07 00:30:59
|
Kay, Allen M wrote: > Kvm kernel changes. > > Signed-off-by: Allen M Kay <all...@in...> > > ------ > arch/x86/kvm/Makefile | 2 > arch/x86/kvm/vtd.c | 183 > +++++++++++++++++++++++++++++++++++++++++++++ > arch/x86/kvm/x86.c | 7 + > include/asm-x86/kvm_host.h | 3 > include/asm-x86/kvm_para.h | 1 > include/linux/kvm_host.h | 6 + > virt/kvm/kvm_main.c | 3 > 7 files changed, 204 insertions(+), 1 deletion(-) > > ------ > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > index c97d35c..b1057fb 100644 > --- a/arch/x86/kvm/Makefile > +++ b/arch/x86/kvm/Makefile > @@ -12,7 +12,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm > kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o > lapic.o \ > i8254.o > obj-$(CONFIG_KVM) += kvm.o > -kvm-intel-objs = vmx.o > +kvm-intel-objs = vmx.o vtd.o > obj-$(CONFIG_KVM_INTEL) += kvm-intel.o > kvm-amd-objs = svm.o > obj-$(CONFIG_KVM_AMD) += kvm-amd.o > diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c > new file mode 100644 > index 0000000..9a080b5 > --- /dev/null > +++ b/arch/x86/kvm/vtd.c > @@ -0,0 +1,183 @@ > +/* > + * Copyright (c) 2006, Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify > it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but > WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY > or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public > License for > + * more details. > + * > + * You should have received a copy of the GNU General Public License > along with > + * this program; if not, write to the Free Software Foundation, Inc., > 59 Temple > + * Place - Suite 330, Boston, MA 02111-1307 USA. > + * > + * Copyright (C) 2006-2008 Intel Corporation > + * Author: Allen M. 
Kay <all...@in...> > + * Author: Weidong Han <wei...@in...> > + */ > + > +#include <linux/list.h> > +#include <linux/kvm_host.h> > +#include <linux/pci.h> > +#include <linux/dmar.h> > +#include <linux/intel-iommu.h> > + > +//#define DEBUG > + > +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 > + > +struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev > *dev); > +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu); > +void iommu_free_domain(struct dmar_domain *domain); > +int domain_init(struct dmar_domain *domain, int guest_width); > +int domain_context_mapping(struct dmar_domain *d, > + struct pci_dev *pdev); > +int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, > + u64 hpa, size_t size, int prot); > +void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 > devfn); > +struct dmar_domain * find_domain(struct pci_dev *pdev); > These definitely need to be moved to a common header. > + > +int kvm_iommu_map_pages(struct kvm *kvm, > + gfn_t base_gfn, unsigned long npages) > +{ > + unsigned long gpa; > + struct page *page; > + hpa_t hpa; > + int j, write; > + struct vm_area_struct *vma; > + > + if (!kvm->arch.domain) > + return 1; > In the kernel, we should be using -errno to return error codes. > + gpa = base_gfn << PAGE_SHIFT; > + page = gfn_to_page(kvm, base_gfn); > + hpa = page_to_phys(page); > Please use gfn_to_pfn(). Keep in mind, by using gfn_to_page/gfn_to_pfn, you take a reference to a page. You're leaking that reference here. > + printk(KERN_DEBUG "kvm_iommu_map_page: gpa = %lx\n", gpa); > + printk(KERN_DEBUG "kvm_iommu_map_page: hpa = %llx\n", hpa); > + printk(KERN_DEBUG "kvm_iommu_map_page: size = %lx\n", > + npages*PAGE_SIZE); > + > + for (j = 0; j < npages; j++) { > + gpa += PAGE_SIZE; > + page = gfn_to_page(kvm, gpa >> PAGE_SHIFT); > + hpa = page_to_phys(page); > Again, gfn_to_pfn() and you're taking a reference that I never see you releasing. 
> + domain_page_mapping(kvm->arch.domain, gpa, hpa, > PAGE_SIZE, > + DMA_PTE_READ | DMA_PTE_WRITE); > + vma = find_vma(current->mm, gpa); > + if (!vma) > + return 1; > + write = (vma->vm_flags & VM_WRITE) != 0; > + get_user_pages(current, current->mm, gpa, > + PAGE_SIZE, write, 0, NULL, NULL); > I don't quite see what you're doing here. It looks like you're trying to pre-fault the page in? gfn_to_pfn will do that for you. You're taking a bunch of references here that are never getting released. I think the general approach here is a bit faulty. I think what we want to do is mlock() from userspace to ensure all the memory is present for the guest. We should combine this with MMU-notifiers such that whenever the userspace mapping changes, we can reprogram the IOMMU. In the case where we don't have MMU-notifiers, we simply hold on to the memory forever and never program the IOMMU. The initial mlock() in userspace is somewhat of a nop here but it's important with MMU-notifiers because we will no longer be holding a reference for guest memory. We have to ensure we don't swap KVM guest memory while using hardware pass-through, but AFAICT, we do not need to make the memory non-reclaimable As long as we reprogram the IOMMU with a new, valid, mapping everything should be fine. mlock() really gives us the right semantics. Semantically, a PV API that supports DMA window registration simply mlock()s the DMA regions on behalf of the guest. No special logic should be needed. Regards, Anthony Liguori |
From: Kay, A. M <all...@in...> - 2008-05-07 00:47:07
|
>We have to ensure we don't swap KVM guest memory while using hardware >pass-through, but AFAICT, we do not need to make the memory >non-reclaimable As long as we reprogram the IOMMU with a new, valid, >mapping everything should be fine. mlock() really gives us the right >semantics. > >Semantically, a PV API that supports DMA window registration simply >mlock()s the DMA regions on behalf of the guest. No special logic >should be needed. > What should be done for unmodified guest where there is no PV driver in the guest? Would a call to mlock() from qemu/hw/pci-passthrough.c/add_pci_passthrough_device() be a reasonable thing to do? Allen |
From: Anthony L. <an...@co...> - 2008-05-07 01:56:31
|
Kay, Allen M wrote: >> We have to ensure we don't swap KVM guest memory while using hardware >> pass-through, but AFAICT, we do not need to make the memory >> non-reclaimable As long as we reprogram the IOMMU with a new, valid, >> mapping everything should be fine. mlock() really gives us the right >> semantics. >> >> Semantically, a PV API that supports DMA window registration simply >> mlock()s the DMA regions on behalf of the guest. No special logic >> should be needed. >> >> > > What should be done for unmodified guest where there is no PV driver in > the guest? Would a call to mlock() from > qemu/hw/pci-passthrough.c/add_pci_passthrough_device() a reasonable > thing to do? > Yup. The idea is to ensure that the memory is always present, without necessarily taking a reference to it. This allows for memory reclaiming which should allow for things like NUMA page migration. We can't swap of course but that doesn't mean reclaimation isn't useful. Regards, Anthony Liguori > Allen > |
From: Anthony L. <an...@co...> - 2008-05-07 19:22:24
|
Avi Kivity wrote: > Anthony Liguori wrote: > > >>> What should be done for unmodified guest where there is no PV driver in >>> the guest? Would a call to mlock() from >>> qemu/hw/pci-passthrough.c/add_pci_passthrough_device() a reasonable >>> thing to do? >>> >> >> Yup. The idea is to ensure that the memory is always present, >> without necessarily taking a reference to it. This allows for memory >> reclaiming which should allow for things like NUMA page migration. >> We can't swap of course but that doesn't mean reclaimation isn't useful. >> > > I don't think we can do page migration with VT-d. You need to be able > to detect whether the page has been changed by dma after you've copied > it but before you changed the pte, but VT-d doesn't allow that AFAICT. Hrm, I would have to look at the VT-d but I suspect you're right. That's unfortunate. That means mlock() isn't sufficient. It also means that the VMAs can't be updated while the guest is running. Is there any way to lock a vma region such that things like madvise/mmap(MAP_FIXED) will always fail? Regards, Anthony Liguori |
From: Avi K. <av...@qu...> - 2008-05-11 15:31:22
|
Anthony Liguori wrote: >> I don't think we can do page migration with VT-d. You need to be able >> to detect whether the page has been changed by dma after you've copied >> it but before you changed the pte, but VT-d doesn't allow that AFAICT. >> > > Hrm, I would have to look at the VT-d but I suspect you're right. > That's unfortunate. > > That means mlock() isn't sufficient. It also means that the VMAs can't > be updated while the guest is running. Is there any way to lock a vma > region such that things like madvise/mmap(MAP_FIXED) will always fail? > Userspace can simply not issue them. Page migration is also controlled by userspace. Once active memory defragmentation goes in, we need a way to tell the kernel that the allocation is unreclaimable, perhaps with a new mmap flag. -- error compiling committee.c: too many arguments to function |
From: Avi K. <av...@qu...> - 2008-05-07 05:51:12
|
Anthony Liguori wrote: >> What should be done for unmodified guest where there is no PV driver in >> the guest? Would a call to mlock() from >> qemu/hw/pci-passthrough.c/add_pci_passthrough_device() a reasonable >> thing to do? >> >> > > Yup. The idea is to ensure that the memory is always present, without > necessarily taking a reference to it. This allows for memory reclaiming > which should allow for things like NUMA page migration. We can't swap > of course but that doesn't mean reclaimation isn't useful. > I don't think we can do page migration with VT-d. You need to be able to detect whether the page has been changed by dma after you've copied it but before you changed the pte, but VT-d doesn't allow that AFAICT. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Muli Ben-Y. <mu...@il...> - 2008-05-11 08:49:21
|
On Mon, May 05, 2008 at 02:36:23PM -0700, Kay, Allen M wrote: > + for (j = 0; j < npages; j++) { > + gpa += PAGE_SIZE; > + page = gfn_to_page(kvm, gpa >> PAGE_SHIFT); > + hpa = page_to_phys(page); > + domain_page_mapping(kvm->arch.domain, gpa, hpa, > PAGE_SIZE, > + DMA_PTE_READ | DMA_PTE_WRITE); > + vma = find_vma(current->mm, gpa); > + if (!vma) > + return 1; > + write = (vma->vm_flags & VM_WRITE) != 0; > + get_user_pages(current, current->mm, gpa, > + PAGE_SIZE, write, 0, NULL, NULL); > + } > + return 0; > +} get_user_pages can fail. We should first try to fault in the pages and only if successful map them in the IOMMU. Also, you need to protect against the vma going away here. Cheers, Muli |