|
From: Amit S. <ami...@qu...> - 2008-04-29 10:37:33
|
This patchset implements PVDMA for handling DMA requests from devices assigned to the guest from the host machine.

The patches are also available via git pull from
git://git.kernel.org/pub/scm/linux/kernel/git/amit/kvm.git pvdma

They are based on my pci-passthrough tree, available from
git://git.kernel.org/pub/scm/linux/kernel/git/amit/kvm.git
with the matching userspace from
git://git.kernel.org/pub/scm/linux/kernel/git/amit/kvm-userspace.git

The first and third patches in this series are needed in the guest (with some bits from the second as well). The second patch is meant for the host kernel.

Amit.
|
From: Amit S. <ami...@qu...> - 2008-04-29 10:37:33
|
We make the dma_mapping_ops structure point to our own ops so that every
DMA access goes through us.
For every device that does a DMA operation we make a hypercall to find out
whether it is an assigned device, so that we know whether to make hypercalls
on each of its DMA accesses. The result of this hypercall is cached, so the
hypercall is made only once per device.
This can be compiled as a module, but that is only useful for debugging.
It can be compiled into the guest kernel directly without any side
effects (if you ignore one error message about the hypercall
failing for hard disks).
Signed-off-by: Amit Shah <ami...@qu...>
---
arch/x86/Kconfig | 8 +
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/kvm_pv_dma.c | 391 ++++++++++++++++++++++++++++++++++++++++++
3 files changed, 400 insertions(+), 0 deletions(-)
create mode 100644 arch/x86/kernel/kvm_pv_dma.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e5790fe..aad16d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -392,6 +392,14 @@ config KVM_GUEST
This option enables various optimizations for running under the KVM
hypervisor.
+config KVM_PV_DMA
+ tristate "KVM paravirtualized DMA access"
+ ---help---
+ Provides support for DMA operations in the guest. A hypercall
+ is raised to the host to enable devices owned by the guest to use
+ DMA. Select this if compiling a guest kernel and you need
+ paravirtualized DMA operations.
+
source "arch/x86/lguest/Kconfig"
config PARAVIRT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index fa19c38..0adb37b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
+obj-$(CONFIG_KVM_PV_DMA) += kvm_pv_dma.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/kvm_pv_dma.c b/arch/x86/kernel/kvm_pv_dma.c
new file mode 100644
index 0000000..db83324
--- /dev/null
+++ b/arch/x86/kernel/kvm_pv_dma.c
@@ -0,0 +1,391 @@
+/*
+ * KVM guest DMA para-virtualization driver
+ *
+ * Copyright (C) 2007, Qumranet, Inc., Amit Shah <ami...@qu...>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <asm/page.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/miscdevice.h>
+#include <linux/kvm_para.h>
+
+MODULE_AUTHOR("Amit Shah");
+MODULE_DESCRIPTION("Implements guest para-virtualized DMA");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");
+
+#define KVM_DMA_MINOR MISC_DYNAMIC_MINOR
+
+static struct page *page;
+static unsigned long page_gfn;
+
+const struct dma_mapping_ops *orig_dma_ops;
+
+#include <linux/list.h>
+struct pv_passthrough_dev_list {
+ struct list_head list;
+ struct kvm_pci_pt_info pv_pci_info;
+ int is_pv;
+};
+static LIST_HEAD(pt_devs_head);
+
+static struct pv_passthrough_dev_list*
+find_matching_pt_dev(struct list_head *head,
+ struct kvm_pci_pt_info *pv_pci_info)
+{
+ struct list_head *ptr;
+ struct pv_passthrough_dev_list *match;
+
+ list_for_each(ptr, head) {
+ match = list_entry(ptr, struct pv_passthrough_dev_list, list);
+ if (match &&
+ (match->pv_pci_info.busnr == pv_pci_info->busnr) &&
+ (match->pv_pci_info.devfn == pv_pci_info->devfn))
+ return match;
+ }
+ return NULL;
+}
+
+void empty_pt_dev_list(struct list_head *head)
+{
+ struct pv_passthrough_dev_list *match;
+
+ while (!list_empty(head)) {
+ match = list_entry(head->next,
+ struct pv_passthrough_dev_list, list);
+ list_del(&match->list);
+ }
+}
+
+static int kvm_is_pv_device(struct device *dev, const char *name)
+{
+ int r;
+ struct pci_dev *pci_dev;
+ struct kvm_pci_pt_info pv_pci_info;
+ struct pv_passthrough_dev_list *match;
+
+ pci_dev = to_pci_dev(dev);
+ pv_pci_info.busnr = pci_dev->bus->number;
+ pv_pci_info.devfn = pci_dev->devfn;
+
+ match = find_matching_pt_dev(&pt_devs_head, &pv_pci_info);
+ if (match) {
+ r = match->is_pv;
+ goto out;
+ }
+
+ memcpy(page_address(page), &pv_pci_info, sizeof(pv_pci_info));
+ r = kvm_hypercall1(KVM_PV_PCI_DEVICE, page_gfn);
+ if (r < 1) {
+ printk(KERN_INFO "%s: Error doing hypercall!\n", __func__);
+ r = 0;
+ goto out;
+ }
+
+ match = kmalloc(sizeof(struct pv_passthrough_dev_list), GFP_KERNEL);
+ if (match == NULL) {
+ printk(KERN_INFO "%s: Out of memory\n", __func__);
+ r = 0;
+ goto out;
+ }
+ match->pv_pci_info.busnr = pv_pci_info.busnr;
+ match->pv_pci_info.devfn = pv_pci_info.devfn;
+ match->is_pv = r;
+ list_add(&match->list, &pt_devs_head);
+ out:
+ return r;
+}
+
+static void *kvm_dma_map(void *vaddr, size_t size, dma_addr_t *dma_handle)
+{
+ int npages, i;
+ unsigned long *dma_addr;
+ dma_addr_t host_addr = bad_dma_address;
+
+ if (page == NULL)
+ goto out;
+
+ npages = get_order(size) + 1;
+ dma_addr = page_address(page);
+
+ /* We have to take into consideration the offsets for the
+ * virtual address provided by the calling
+ * functions. Currently both, pci_alloc_consistent and
+ * pci_map_single call this function. We have to change it so
+ * that we can also pass to the host the offset of the addr in
+ * the page it is in.
+ */
+
+ if (*dma_handle == bad_dma_address)
+ goto out;
+
+ /* It's not really OK to use dma_handle here, as the IOMMU or
+ * swiotlb could have mapped it elsewhere. But what's a better
+ * solution?
+ */
+ *dma_addr++ = *dma_handle;
+ if (npages > 1) {
+ /* All of the pages will be contiguous in guest
+ * physical memory in both the pci_alloc_consistent and
+ * pci_map_single cases (see DMA-API.txt)
+ */
+ /* FIXME: we're currently not crossing over to
+ * multiple pages to be sent to host, in case
+ * we have a lot of pages that we can't
+ * accommodate in one page.
+ */
+ for (i = 1; i < min((unsigned long)npages, MAX_PVDMA_PAGES); i++)
+ *dma_addr++ = virt_to_phys(vaddr + PAGE_SIZE * i);
+ }
+
+ /* Maybe we need more arguments (we have first two):
+ * @npages: number of gpas pages in this hypercall
+ * @page: page we pass to host with all the gpas in them
+ * @more: are there any more pages coming?
+ * @offset: offset of the address in the first page
+ * @direction: direction for the mapping (only for pci_map_single)
+ */
+ npages = kvm_hypercall2(KVM_PV_DMA_MAP, npages, page_gfn);
+ if (!npages)
+ host_addr = bad_dma_address;
+ else
+ host_addr = *(unsigned long *)page_address(page);
+
+ out:
+ *dma_handle = host_addr;
+ if (host_addr == bad_dma_address)
+ vaddr = NULL;
+ return vaddr;
+}
+
+static void kvm_dma_unmap(dma_addr_t dma_handle)
+{
+ kvm_hypercall1(KVM_PV_DMA_UNMAP, dma_handle);
+ return;
+}
+
+static void *kvm_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ void *vaddr = NULL;
+ if ((*dma_handle == bad_dma_address)
+ || !dma_ops->is_pv_device(dev, dev->bus_id))
+ goto out;
+
+ vaddr = bus_to_virt(*(unsigned long *)dma_handle);
+ vaddr = kvm_dma_map(vaddr, size, dma_handle);
+ out:
+ return vaddr;
+}
+
+static void kvm_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle)
+{
+ kvm_dma_unmap(dma_handle);
+}
+
+static dma_addr_t kvm_dma_map_single(struct device *dev, phys_addr_t paddr,
+ size_t size, int direction)
+{
+ dma_addr_t r;
+
+ r = orig_dma_ops->map_single(dev, paddr, size, direction);
+
+ if (r != bad_dma_address && kvm_is_pv_device(dev, dev->bus_id))
+ kvm_dma_map(phys_to_virt(paddr), size, &r);
+ return r;
+}
+
+static inline void kvm_dma_unmap_single(struct device *dev, dma_addr_t addr,
+ size_t size, int direction)
+{
+ kvm_dma_unmap(addr);
+}
+
+int kvm_pv_dma_mapping_error(dma_addr_t dma_addr)
+{
+ if (orig_dma_ops->mapping_error)
+ return orig_dma_ops->mapping_error(dma_addr);
+
+ printk(KERN_ERR "%s: Unhandled PV DMA operation. Report this.\n",
+ __func__);
+ return dma_addr == bad_dma_address;
+}
+
+/* like map_single, but doesn't check the device mask */
+dma_addr_t kvm_pv_dma_map_simple(struct device *hwdev, phys_addr_t paddr,
+ size_t size, int direction)
+{
+ return orig_dma_ops->map_simple(hwdev, paddr, size, direction);
+}
+
+void kvm_pv_dma_sync_single_for_cpu(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction)
+{
+ if (orig_dma_ops->sync_single_for_cpu)
+ orig_dma_ops->sync_single_for_cpu(hwdev, dma_handle,
+ size, direction);
+}
+
+void kvm_pv_dma_sync_single_for_device(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction)
+{
+ if (orig_dma_ops->sync_single_for_device)
+ orig_dma_ops->sync_single_for_device(hwdev, dma_handle,
+ size, direction);
+}
+
+void kvm_pv_dma_sync_single_range_for_cpu(struct device *hwdev,
+ dma_addr_t dma_handle,
+ unsigned long offset,
+ size_t size, int direction)
+{
+ if (orig_dma_ops->sync_single_range_for_cpu)
+ orig_dma_ops->sync_single_range_for_cpu(hwdev, dma_handle,
+ offset, size,
+ direction);
+}
+
+void kvm_pv_dma_sync_single_range_for_device(struct device *hwdev,
+ dma_addr_t dma_handle,
+ unsigned long offset,
+ size_t size, int direction)
+{
+ if (orig_dma_ops->sync_single_range_for_device)
+ orig_dma_ops->sync_single_range_for_device(hwdev, dma_handle,
+ offset, size,
+ direction);
+}
+
+void kvm_pv_dma_sync_sg_for_cpu(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction)
+{
+ if (orig_dma_ops->sync_sg_for_cpu)
+ orig_dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
+}
+
+void kvm_pv_dma_sync_sg_for_device(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction)
+{
+ if (orig_dma_ops->sync_sg_for_device)
+ orig_dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+}
+
+int kvm_pv_dma_map_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+{
+ if (orig_dma_ops->map_sg)
+ return orig_dma_ops->map_sg(hwdev, sg, nents, direction);
+ printk(KERN_ERR "%s: Unhandled PV DMA operation. Report this.\n",
+ __func__);
+ return 0;
+}
+
+void kvm_pv_dma_unmap_sg(struct device *hwdev,
+ struct scatterlist *sg, int nents,
+ int direction)
+{
+ if (orig_dma_ops->unmap_sg)
+ orig_dma_ops->unmap_sg(hwdev, sg, nents, direction);
+}
+
+int kvm_pv_dma_dma_supported(struct device *hwdev, u64 mask)
+{
+ if (orig_dma_ops->dma_supported)
+ return orig_dma_ops->dma_supported(hwdev, mask);
+ printk(KERN_ERR "%s: Unhandled PV DMA operation. Report this.\n",
+ __func__);
+ return 0;
+}
+
+static const struct dma_mapping_ops kvm_dma_ops = {
+ .alloc_coherent = kvm_dma_alloc_coherent,
+ .free_coherent = kvm_dma_free_coherent,
+ .map_single = kvm_dma_map_single,
+ .unmap_single = kvm_dma_unmap_single,
+ .is_pv_device = kvm_is_pv_device,
+
+ .mapping_error = kvm_pv_dma_mapping_error,
+ .map_simple = kvm_pv_dma_map_simple,
+ .sync_single_for_cpu = kvm_pv_dma_sync_single_for_cpu,
+ .sync_single_for_device = kvm_pv_dma_sync_single_for_device,
+ .sync_single_range_for_cpu = kvm_pv_dma_sync_single_range_for_cpu,
+ .sync_single_range_for_device = kvm_pv_dma_sync_single_range_for_device,
+ .sync_sg_for_cpu = kvm_pv_dma_sync_sg_for_cpu,
+ .sync_sg_for_device = kvm_pv_dma_sync_sg_for_device,
+ .map_sg = kvm_pv_dma_map_sg,
+ .unmap_sg = kvm_pv_dma_unmap_sg,
+};
+
+static struct file_operations dma_chardev_ops;
+static struct miscdevice kvm_dma_dev = {
+ KVM_DMA_MINOR,
+ "kvm_dma",
+ &dma_chardev_ops,
+};
+
+int __init kvm_pv_dma_init(void)
+{
+ int r;
+
+ dma_chardev_ops.owner = THIS_MODULE;
+ if (misc_register(&kvm_dma_dev)) {
+ printk(KERN_ERR "%s: misc device register failed\n",
+ __func__);
+ r = -EBUSY;
+ goto out;
+ }
+ if (!kvm_para_available()) {
+ printk(KERN_ERR "KVM paravirt support not available\n");
+ r = -ENODEV;
+ goto out_dereg;
+ }
+
+ /* FIXME: check for hypercall support */
+ page = alloc_page(GFP_ATOMIC);
+ if (page == NULL) {
+ printk(KERN_ERR "%s: Could not allocate page\n", __func__);
+ r = -ENOMEM;
+ goto out_dereg;
+ }
+ page_gfn = page_to_pfn(page);
+
+ orig_dma_ops = dma_ops;
+ dma_ops = &kvm_dma_ops;
+
+ printk(KERN_INFO "KVM PV DMA engine registered\n");
+ return 0;
+ goto out;
+ goto out_free;
+
+ out_free:
+ __free_page(page);
+ out_dereg:
+ misc_deregister(&kvm_dma_dev);
+ out:
+ return r;
+}
+
+static void __exit kvm_pv_dma_exit(void)
+{
+ dma_ops = orig_dma_ops;
+
+ __free_page(page);
+
+ empty_pt_dev_list(&pt_devs_head);
+
+ misc_deregister(&kvm_dma_dev);
+}
+
+module_init(kvm_pv_dma_init);
+module_exit(kvm_pv_dma_exit);
--
1.5.4.3
|
|
From: Amit S. <ami...@qu...> - 2008-04-29 10:37:33
|
We introduce three hypercalls:
1. A check for whether a particular device is an assigned device (this is done
once per device by the guest, to enable / disable hypercall-based translation
of addresses for it).
2. map: convert guest physical addresses to host physical addresses to pass on
to the device for DMA. We also pin the pages thus requested so that they're
not swapped out.
3. unmap: unpin the pages and free any information we might have stored.
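For reference, here is a minimal sketch of how the guest side drives these three hypercalls (the constants, the shared page and the return conventions are those of the other patches in this series; the helper name and its arguments are made up for illustration):

#include <linux/kvm_para.h>	/* kvm_hypercall*, KVM_PV_* numbers */

/* Illustrative only: 'page' / 'page_gfn' are the shared page set up by
 * kvm_pv_dma_init() in patch 1.
 */
static int pvdma_sketch(struct kvm_pci_pt_info *info, int npages,
			dma_addr_t *host_addr)
{
	int r;

	/* 1. Ask the host whether this PCI device is assigned to the guest;
	 * the answer is cached per device by kvm_is_pv_device().
	 */
	memcpy(page_address(page), info, sizeof(*info));
	r = kvm_hypercall1(KVM_PV_PCI_DEVICE, page_gfn);
	if (r < 1)
		return 0;	/* not assigned: use the original dma_ops */

	/* 2. map: the shared page holds 'npages' guest physical addresses;
	 * the host pins and maps them and writes the first host DMA address
	 * back into the same page.
	 */
	r = kvm_hypercall2(KVM_PV_DMA_MAP, npages, page_gfn);
	if (!r)
		return 0;
	*host_addr = *(unsigned long *)page_address(page);

	/* 3. unmap: once the driver is done, the host unpins the pages and
	 * drops its bookkeeping for this mapping.
	 */
	kvm_hypercall1(KVM_PV_DMA_UNMAP, *host_addr);
	return r;
}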
Signed-off-by: Amit Shah <ami...@qu...>
---
arch/x86/kvm/x86.c | 211 +++++++++++++++++++++++++++++++++++++++++++-
include/asm-x86/kvm_host.h | 15 +++
include/asm-x86/kvm_para.h | 8 ++
3 files changed, 233 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fb9b329..94ee4db 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -24,8 +24,11 @@
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/pci.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
+#include <linux/highmem.h>
#include <linux/mman.h>
#include <linux/highmem.h>
@@ -76,6 +79,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
+ { "hypercall_map", VCPU_STAT(hypercall_map) },
+ { "hypercall_unmap", VCPU_STAT(hypercall_unmap) },
+ { "hypercall_pv_dev", VCPU_STAT(hypercall_pv_dev) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
@@ -95,9 +101,164 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+static struct kvm_pv_dma_map*
+find_pci_pv_dmap(struct list_head *head, dma_addr_t dma)
+{
+ struct list_head *ptr;
+ struct kvm_pv_dma_map *match;
+
+ list_for_each(ptr, head) {
+ match = list_entry(ptr, struct kvm_pv_dma_map, list);
+ if (match && match->sg[0].dma_address == dma)
+ return match;
+ }
+ return NULL;
+}
+
+static void prepare_sg_entry(struct scatterlist *sg, struct page *page)
+{
+ unsigned int offset, len;
+
+ offset = page_to_phys(page) & ~PAGE_MASK;
+ len = PAGE_SIZE - offset;
+
+ /* FIXME: Use the sg chaining features */
+ sg_set_page(sg, page, len, offset);
+}
+
+static int pv_map_hypercall(struct kvm_vcpu *vcpu, int npages, gfn_t page_gfn)
+{
+ int i, r = 0;
+ struct page *host_page;
+ struct scatterlist *sg;
+ struct kvm_pv_dma_map *dmap;
+ unsigned long *shared_addr, *hcall_page;
+
+ /* We currently don't support dma mappings which have more than
+ * PAGE_SIZE/sizeof(unsigned long *) pages
+ */
+ if (!npages || npages > MAX_PVDMA_PAGES) {
+ printk(KERN_INFO "%s: Illegal number of pages: %d\n",
+ __func__, npages);
+ goto out;
+ }
+
+ host_page = gfn_to_page(vcpu->kvm, page_gfn);
+ if (is_error_page(host_page)) {
+ printk(KERN_INFO "%s: Bad gfn %p\n", __func__,
+ (void *)page_gfn);
+ goto out;
+ }
+ hcall_page = shared_addr = kmap(host_page);
+
+ /* scatterlist to map guest dma pages into host physical
+ * memory -- if they exceed the DMA map limit
+ */
+ sg = kcalloc(npages, sizeof(struct scatterlist), GFP_KERNEL);
+ if (sg == NULL) {
+ printk(KERN_INFO "%s: Couldn't allocate memory (sg)\n",
+ __func__);
+ goto out_unmap;
+ }
+
+ /* List to store all guest pages mapped into host. This will
+ * be used later to free pages on the host. Think of this as a
+ * translation table from guest dma addresses into host dma
+ * addresses
+ */
+ dmap = kzalloc(sizeof(*dmap), GFP_KERNEL);
+ if (dmap == NULL) {
+ printk(KERN_INFO "%s: Couldn't allocate memory\n",
+ __func__);
+ goto out_unmap_sg;
+ }
+
+ /* FIXME: consider the length of the last page. Guest should
+ * send this info.
+ */
+ for (i = 0; i < npages; i++) {
+ struct page *page;
+ gpa_t gpa;
+
+ gpa = *shared_addr++;
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ int j;
+ printk(KERN_INFO "kvm %s: gpa %p not valid\n",
+ __func__, (void *)gpa);
+
+ for (j = 0; j < i; j++)
+ put_page(sg_page(&sg[j]));
+ goto out_unmap_sg_dmap;
+ }
+ prepare_sg_entry(&sg[i], page);
+ get_page(sg_page(&sg[i]));
+ }
+
+ /* Put this on the dmap_head list, so that we can find it
+ * later for the 'free' operation
+ */
+ dmap->sg = sg;
+ dmap->nents = npages;
+ list_add(&dmap->list, &vcpu->kvm->arch.pci_pv_dmap_head);
+
+ /* FIXME: guest should send the direction */
+ r = dma_ops->map_sg(NULL, sg, npages, PCI_DMA_BIDIRECTIONAL);
+ if (r) {
+ r = npages;
+ *hcall_page = sg[0].dma_address | (*hcall_page & ~PAGE_MASK);
+ }
+
+ out_unmap:
+ if (!r)
+ *hcall_page = bad_dma_address;
+ kunmap(host_page);
+ out:
+ ++vcpu->stat.hypercall_map;
+ return r;
+ out_unmap_sg_dmap:
+ kfree(dmap);
+ out_unmap_sg:
+ kfree(sg);
+ goto out_unmap;
+}
+
+static int free_dmap(struct kvm_pv_dma_map *dmap, struct list_head *head)
+{
+ int i;
+
+ if (!dmap)
+ return 1;
+
+ for (i = 0; i < dmap->nents; i++)
+ put_page(sg_page(&dmap->sg[i]));
+
+ if (dma_ops->unmap_sg)
+ dma_ops->unmap_sg(NULL, dmap->sg, dmap->nents,
+ PCI_DMA_BIDIRECTIONAL);
+ kfree(dmap->sg);
+ list_del(&dmap->list);
+ kfree(dmap);
+
+ return 0;
+}
+
+/* FIXME: the argument passed from guest can be 32-bit. We need 64-bit for
+ * dma_addr_t. Send the dma address in a page (or split in two registers)
+ */
+static int pv_unmap_hypercall(struct kvm_vcpu *vcpu, dma_addr_t dma)
+{
+ struct kvm_pv_dma_map *dmap;
+
+ ++vcpu->stat.hypercall_unmap;
+
+ dmap = find_pci_pv_dmap(&vcpu->kvm->arch.pci_pv_dmap_head, dma);
+ return free_dmap(dmap, &vcpu->kvm->arch.pci_pv_dmap_head);
+}
+
/*
* Used to find a registered host PCI device (a "passthrough" device)
- * during ioctls, interrupts or EOI
+ * during hypercalls, ioctls or interrupts or EOI
*/
static struct kvm_pci_pt_dev_list *
find_pci_pt_dev(struct list_head *head,
@@ -136,6 +297,34 @@ find_pci_pt_dev(struct list_head *head,
return NULL;
}
+static int
+pv_mapped_pci_device_hypercall(struct kvm_vcpu *vcpu, gfn_t page_gfn)
+{
+ int r = 0;
+ unsigned long *shared_addr;
+ struct page *host_page;
+ struct kvm_pci_pt_info pci_pt_info;
+
+ host_page = gfn_to_page(vcpu->kvm, page_gfn);
+ if (is_error_page(host_page)) {
+ printk(KERN_INFO "%s: gfn %p not valid\n",
+ __func__, (void *)page_gfn);
+ r = -1;
+ goto out;
+ }
+ shared_addr = kmap(host_page);
+ memcpy(&pci_pt_info, shared_addr, sizeof(pci_pt_info));
+
+ if (find_pci_pt_dev(&vcpu->kvm->arch.pci_pt_dev_head,
+ &pci_pt_info, 0, KVM_PT_SOURCE_ASSIGN))
+ r++; /* We have assigned the device */
+
+ kunmap(host_page);
+ out:
+ ++vcpu->stat.hypercall_pv_dev;
+ return r;
+}
+
static DECLARE_BITMAP(pt_irq_pending, NR_IRQS);
static DECLARE_BITMAP(pt_irq_handled, NR_IRQS);
@@ -218,6 +407,10 @@ static int kvm_vm_ioctl_pci_pt_dev(struct kvm *kvm,
set_bit(pci_pt_dev->host.irq, pt_irq_handled);
}
list_add(&match->list, &kvm->arch.pci_pt_dev_head);
+
+ printk(KERN_INFO "kvm: Handling hypercalls for device %02x:%02x.%1x\n",
+ pci_pt_dev->host.busnr, PCI_SLOT(pci_pt_dev->host.devfn),
+ PCI_FUNC(pci_pt_dev->host.devfn));
out:
return r;
out_free:
@@ -248,6 +441,7 @@ static void kvm_free_pci_passthrough(struct kvm *kvm)
{
struct list_head *ptr, *ptr2;
struct kvm_pci_pt_dev_list *pci_pt_dev;
+ struct kvm_pv_dma_map *dmap;
list_for_each_safe(ptr, ptr2, &kvm->arch.pci_pt_dev_head) {
pci_pt_dev = list_entry(ptr, struct kvm_pci_pt_dev_list, list);
@@ -257,6 +451,11 @@ static void kvm_free_pci_passthrough(struct kvm *kvm)
list_del(&pci_pt_dev->list);
}
+
+ list_for_each_safe(ptr, ptr2, &kvm->arch.pci_pv_dmap_head) {
+ dmap = list_entry(ptr, struct kvm_pv_dma_map, list);
+ free_dmap(dmap, &kvm->arch.pci_pv_dmap_head);
+ }
}
unsigned long segment_base(u16 selector)
@@ -2672,6 +2871,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
}
switch (nr) {
+ case KVM_PV_DMA_MAP:
+ ret = pv_map_hypercall(vcpu, a0, a1);
+ break;
+ case KVM_PV_DMA_UNMAP:
+ ret = pv_unmap_hypercall(vcpu, a0);
+ break;
+ case KVM_PV_PCI_DEVICE:
+ ret = pv_mapped_pci_device_hypercall(vcpu, a0);
+ break;
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
@@ -4059,6 +4267,7 @@ struct kvm *kvm_arch_create_vm(void)
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.pci_pt_dev_head);
+ INIT_LIST_HEAD(&kvm->arch.pci_pv_dmap_head);
return kvm;
}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 3b9cb50..d52e44e 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -298,6 +298,17 @@ struct kvm_mem_alias {
#define KVM_PT_SOURCE_IRQ_ACK 2
#define KVM_PT_SOURCE_ASSIGN 3
+/* Paravirt DMA: We pin the host-side pages for the GPAs that we get
+ * for the DMA operation. We do a sg_map on the host pages for a DMA
+ * operation on the guest side. We un-pin the pages on the
+ * unmap_hypercall.
+ */
+struct kvm_pv_dma_map {
+ struct list_head list;
+ int nents;
+ struct scatterlist *sg;
+};
+
/* This list is to store the guest bus:device:function-irq and host
* bus:device:function-irq mapping for assigned devices.
*/
@@ -319,6 +330,7 @@ struct kvm_arch{
*/
struct list_head active_mmu_pages;
struct list_head pci_pt_dev_head;
+ struct list_head pci_pv_dmap_head;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
@@ -366,6 +378,9 @@ struct kvm_vcpu_stat {
u32 insn_emulation;
u32 insn_emulation_fail;
u32 hypercalls;
+ u32 hypercall_map;
+ u32 hypercall_unmap;
+ u32 hypercall_pv_dev;
};
struct descriptor_table {
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 5f93b78..e13bf4c 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -74,6 +74,12 @@ extern void kvmclock_init(void);
*/
#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+/* Hypercall numbers */
+#define KVM_PV_UNUSED 0
+#define KVM_PV_DMA_MAP 101
+#define KVM_PV_DMA_UNMAP 102
+#define KVM_PV_PCI_DEVICE 103
+
/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
* instruction. The hypervisor may replace it with something else but only the
* instructions are guaranteed to be supported.
@@ -155,6 +161,8 @@ static inline unsigned int kvm_arch_para_features(void)
return cpuid_eax(KVM_CPUID_FEATURES);
}
+/* Max. DMA pages we send from guest to host for mapping */
+#define MAX_PVDMA_PAGES (PAGE_SIZE / sizeof(unsigned long *))
#endif /* KERNEL */
/* Stores information for identifying host PCI devices assigned to the
--
1.5.4.3
|
|
From: Amit S. <ami...@qu...> - 2008-04-29 10:37:34
|
dma_alloc_coherent() doesn't call dma_ops->alloc_coherent in case no IOMMU
translations are necessary. However, if the device doing the DMA is a
physical device assigned to the guest OS by the host, we need to map
all the DMA addresses to the host machine addresses. This is done via
hypercalls to the host.
In KVM, with PCI passthrough support, we can assign actual host devices to
the guest OS, and those devices need this functionality.
Signed-off-by: Amit Shah <ami...@qu...>
---
arch/x86/kernel/pci-dma.c | 11 +++++++++++
include/asm-x86/dma-mapping.h | 2 ++
2 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 388b113..678cafb 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -443,6 +443,17 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
memset(memory, 0, size);
if (!mmu) {
*dma_handle = bus;
+ if (unlikely(dma_ops->is_pv_device) &&
+ unlikely(dma_ops->is_pv_device(dev, dev->bus_id))) {
+ void *r;
+ r = dma_ops->alloc_coherent(dev, size,
+ dma_handle, gfp);
+ if (r == NULL) {
+ free_pages((unsigned long)memory,
+ get_order(size));
+ memory = NULL;
+ }
+ }
return memory;
}
}
diff --git a/include/asm-x86/dma-mapping.h b/include/asm-x86/dma-mapping.h
index a1a4dc7..b9c6a39 100644
--- a/include/asm-x86/dma-mapping.h
+++ b/include/asm-x86/dma-mapping.h
@@ -55,6 +55,8 @@ struct dma_mapping_ops {
int direction);
int (*dma_supported)(struct device *hwdev, u64 mask);
int is_phys;
+ /* Is this a physical device in a paravirtualized guest? */
+ int (*is_pv_device)(struct device *hwdev, const char *name);
};
extern const struct dma_mapping_ops *dma_ops;
--
1.5.4.3
|
|
From: Muli Ben-Y. <mu...@il...> - 2008-04-30 06:27:15
|
On Tue, Apr 29, 2008 at 01:37:29PM +0300, Amit Shah wrote:
> dma_alloc_coherent() doesn't call dma_ops->alloc_coherent in case no
> IOMMU translations are necessary.

I always thought this was a huge wart in the x86-64 DMA ops. Would there be strong resistance to fixing it so that alloc_coherent matches the way the other ops are used? This will eliminate the need for this patch and will make other DMA ops implementations saner.

Cheers,
Muli
|
From: Andi K. <an...@fi...> - 2008-04-29 13:15:59
|
Amit Shah <ami...@qu...> writes:
> This patchset implements PVDMA for handling DMA requests from
> devices assigned to the guest from the host machine.

You forgot to post a high-level design overview of how this works, what it is good for, what the design trade-offs are, etc. Include that in the first patch.

-Andi
|
From: Andi K. <an...@fi...> - 2008-04-29 13:16:09
|
Amit Shah <ami...@qu...> writes:
> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
> index 388b113..678cafb 100644
> --- a/arch/x86/kernel/pci-dma.c
> +++ b/arch/x86/kernel/pci-dma.c
> @@ -443,6 +443,17 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
> memset(memory, 0, size);
> if (!mmu) {
> *dma_handle = bus;
> + if (unlikely(dma_ops->is_pv_device) &&
> + unlikely(dma_ops->is_pv_device(dev, dev->bus_id))) {
First, the double unlikely in a condition is useless. Just drop them.
And then an ->is_xyz() hook in a generic ops interface is about as ugly
and non-generic as you can get. dma_alloc_coherent is not performance
critical, so you should rather change the interface so that ->alloc_coherent
is always called and the individual handlers handle the !mmu case correctly.
In fact they need that already, I guess (e.g. with DMAR there is not really
a nommu case).
-Andi
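For illustration, a rough sketch of the shape being suggested here (not a posted patch; the allocation and mask handling the real function does would move into the ops implementations):

void *dma_alloc_coherent(struct device *dev, size_t size,
			 dma_addr_t *dma_handle, gfp_t gfp)
{
	/* Always defer to the registered ops; the nommu, swiotlb, DMAR and
	 * PV implementations each handle the "no translation needed" case
	 * themselves, so the generic path needs neither an ->is_pv_device()
	 * hook nor unlikely() checks.
	 */
	return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
}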
|
|
From: Amit S. <ami...@qu...> - 2008-04-29 14:00:27
|
On Tuesday 29 April 2008 18:44:23 Andi Kleen wrote:
> Amit Shah <ami...@qu...> writes:
> > diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
> > index 388b113..678cafb 100644
> > --- a/arch/x86/kernel/pci-dma.c
> > +++ b/arch/x86/kernel/pci-dma.c
> > @@ -443,6 +443,17 @@ dma_alloc_coherent(struct device *dev, size_t size,
> > dma_addr_t *dma_handle, memset(memory, 0, size);
> > if (!mmu) {
> > *dma_handle = bus;
> > + if (unlikely(dma_ops->is_pv_device) &&
> > + unlikely(dma_ops->is_pv_device(dev, dev->bus_id))) {
>
> First double unlikely in a condition is useless. Just drop them.
>
> And then ->is_xyz() in a generic vops interface is about as ugly
> and non generic as you can get. dma_alloc_coherent is not performance
> critical, so you should rather change the interface that ->alloc_coherent
> is always called and the other handlers handle the !mmu case correctly.
> In fact they need that already I guess (e.g. on DMAR there is not really
> a nommu case)
This point came up the last time I sent out the patch; we should do this as
well as implement stackable dma_ops (the need for that is evident in the next
patch).
Thanks for the observation; this should be the next step.
Amit.
|
|
From: Andi K. <an...@fi...> - 2008-04-29 13:35:35
|
Amit Shah <ami...@qu...> writes:
> +
> +static struct page *page;
> +static unsigned long page_gfn;
Bad variable names
> +
> +const struct dma_mapping_ops *orig_dma_ops;
I suspect real dma ops stacking will need some further thought than
your simple hacks
> +
> + match = find_matching_pt_dev(&pt_devs_head, &pv_pci_info);
> + if (match) {
> + r = match->is_pv;
> + goto out;
> + }
> +
> + memcpy(page_address(page), &pv_pci_info, sizeof(pv_pci_info));
Note that on 32-bit, page_address() might not be mapped (the page can be highmem).
> +
> + npages = get_order(size) + 1;
Are you sure that's correct? It looks quite bogus. order is a 2 logarithm,
normally npages = 1 << order
if you want npages from order the correct need 1 << order
Haven't read further, but to be honest the code doesn't seem to be anywhere
near merging quality.
-Andi
|
|
From: Amit S. <ami...@qu...> - 2008-04-29 13:58:44
|
On Tuesday 29 April 2008 19:01:32 Andi Kleen wrote:
> Amit Shah <ami...@qu...> writes:
> > +const struct dma_mapping_ops *orig_dma_ops;
>
> I suspect real dma ops stacking will need some further thought than
> your simple hacks

Yes; that's something we're planning to do.

> Haven't read further, but to be honest the code doesn't seem to be anywhere
> near merging quality.

I'm basically using these patches to test the PCI passthrough functionality (by which we can assign host PCI devices to a guest OS via KVM). While other methods of handling DMA operations are being worked on (1:1 mapping of the guest in the host address space and virtualization-aware IOMMU translations), this patchset provides a quick way to check that things indeed work.

However, if some version of this patch can be useful upstream, I'll be glad to work on that. That said, as you point out, we need stackable dma_ops as well as to get rid of the is_pv_device() function in dma_ops, and that's something that can be done right away.

Thanks for the review!
Amit
|
From: Glauber C. <gc...@re...> - 2008-04-29 14:51:53
|
Amit Shah wrote:
> We introduce three hypercalls:
> 1. When the guest wants to check if a particular device is an assigned device
> (this is done once per device by the guest to enable / disable hypercall-
> based translation of addresses)
>
> 2. map: to convert guest phyical addresses to host physical address to pass on
> to the device for DMA. We also pin the pages thus requested so that they're
> not swapped out.
>
> 3. unmap: to unpin the pages and free any information we might have stored.
>
> Signed-off-by: Amit Shah <ami...@qu...>
> ---
> arch/x86/kvm/x86.c | 211 +++++++++++++++++++++++++++++++++++++++++++-
> include/asm-x86/kvm_host.h | 15 +++
> include/asm-x86/kvm_para.h | 8 ++
> 3 files changed, 233 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fb9b329..94ee4db 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -24,8 +24,11 @@
> #include <linux/interrupt.h>
> #include <linux/kvm.h>
> #include <linux/fs.h>
> +#include <linux/list.h>
> +#include <linux/pci.h>
> #include <linux/vmalloc.h>
> #include <linux/module.h>
> +#include <linux/highmem.h>
> #include <linux/mman.h>
> #include <linux/highmem.h>
>
> @@ -76,6 +79,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
> { "halt_exits", VCPU_STAT(halt_exits) },
> { "halt_wakeup", VCPU_STAT(halt_wakeup) },
> { "hypercalls", VCPU_STAT(hypercalls) },
> + { "hypercall_map", VCPU_STAT(hypercall_map) },
> + { "hypercall_unmap", VCPU_STAT(hypercall_unmap) },
> + { "hypercall_pv_dev", VCPU_STAT(hypercall_pv_dev) },
> { "request_irq", VCPU_STAT(request_irq_exits) },
> { "irq_exits", VCPU_STAT(irq_exits) },
> { "host_state_reload", VCPU_STAT(host_state_reload) },
> @@ -95,9 +101,164 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
> { NULL }
> };
>
> +static struct kvm_pv_dma_map*
> +find_pci_pv_dmap(struct list_head *head, dma_addr_t dma)
> +{
It might be better to prefix those functions with kvm_; even though they
are static, that seems to be the current practice.
> +static int pv_map_hypercall(struct kvm_vcpu *vcpu, int npages, gfn_t page_gfn)
> +{
> + int i, r = 0;
> + struct page *host_page;
> + struct scatterlist *sg;
> + struct kvm_pv_dma_map *dmap;
> + unsigned long *shared_addr, *hcall_page;
> +
> + /* We currently don't support dma mappings which have more than
> + * PAGE_SIZE/sizeof(unsigned long *) pages
> + */
> + if (!npages || npages > MAX_PVDMA_PAGES) {
> + printk(KERN_INFO "%s: Illegal number of pages: %d\n",
> + __func__, npages);
> + goto out;
> + }
> +
> + host_page = gfn_to_page(vcpu->kvm, page_gfn);
You need mmap_sem held for read to use gfn_to_page().
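Something like this around the call (sketch):

	/* gfn_to_page() ends up in get_user_pages(), which expects the
	 * caller to hold current->mm->mmap_sem for read.
	 */
	down_read(&current->mm->mmap_sem);
	host_page = gfn_to_page(vcpu->kvm, page_gfn);
	up_read(&current->mm->mmap_sem);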
> + if (is_error_page(host_page)) {
> + printk(KERN_INFO "%s: Bad gfn %p\n", __func__,
> + (void *)page_gfn);
> + goto out;
> + }
> + hcall_page = shared_addr = kmap(host_page);
> +
> + /* scatterlist to map guest dma pages into host physical
> + * memory -- if they exceed the DMA map limit
> + */
> + sg = kcalloc(npages, sizeof(struct scatterlist), GFP_KERNEL);
> + if (sg == NULL) {
> + printk(KERN_INFO "%s: Couldn't allocate memory (sg)\n",
> + __func__);
> + goto out_unmap;
> + }
> +
> + /* List to store all guest pages mapped into host. This will
> + * be used later to free pages on the host. Think of this as a
> + * translation table from guest dma addresses into host dma
> + * addresses
> + */
> + dmap = kzalloc(sizeof(*dmap), GFP_KERNEL);
> + if (dmap == NULL) {
> + printk(KERN_INFO "%s: Couldn't allocate memory\n",
> + __func__);
> + goto out_unmap_sg;
> + }
> +
> + /* FIXME: consider the length of the last page. Guest should
> + * send this info.
> + */
> + for (i = 0; i < npages; i++) {
> + struct page *page;
> + gpa_t gpa;
> +
> + gpa = *shared_addr++;
> + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
Take care of the locking here too.
> + if (is_error_page(page)) {
> + int j;
> + printk(KERN_INFO "kvm %s: gpa %p not valid\n",
> + __func__, (void *)gpa);
> +
> + for (j = 0; j < i; j++)
> + put_page(sg_page(&sg[j]));
> + goto out_unmap_sg_dmap;
> + }
> + prepare_sg_entry(&sg[i], page);
> + get_page(sg_page(&sg[i]));
> + }
> +
> + /* Put this on the dmap_head list, so that we can find it
> + * later for the 'free' operation
> + */
> + dmap->sg = sg;
> + dmap->nents = npages;
> + list_add(&dmap->list, &vcpu->kvm->arch.pci_pv_dmap_head);
> +
> + /* FIXME: guest should send the direction */
> + r = dma_ops->map_sg(NULL, sg, npages, PCI_DMA_BIDIRECTIONAL);
> + if (r) {
> + r = npages;
> + *hcall_page = sg[0].dma_address | (*hcall_page & ~PAGE_MASK);
> + }
> +
> + out_unmap:
> + if (!r)
> + *hcall_page = bad_dma_address;
> + kunmap(host_page);
> + out:
> + ++vcpu->stat.hypercall_map;
> + return r;
> + out_unmap_sg_dmap:
> + kfree(dmap);
> + out_unmap_sg:
> + kfree(sg);
> + goto out_unmap;
Those backwards gotos are very clumsy. It might be better to give this
some further attention in order to avoid them.
> +}
> +
> +static int free_dmap(struct kvm_pv_dma_map *dmap, struct list_head *head)
> +{
> + int i;
> +
> + if (!dmap)
> + return 1;
That's ugly.
It's better to keep the free function with free-like semantics: just a
void function that plainly returns if !dmap, and do the check in the caller.
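For example (sketch of the suggested shape, reusing the code from the patch):

static void free_dmap(struct kvm_pv_dma_map *dmap, struct list_head *head)
{
	int i;

	if (!dmap)
		return;

	for (i = 0; i < dmap->nents; i++)
		put_page(sg_page(&dmap->sg[i]));

	if (dma_ops->unmap_sg)
		dma_ops->unmap_sg(NULL, dmap->sg, dmap->nents,
				  PCI_DMA_BIDIRECTIONAL);
	kfree(dmap->sg);
	list_del(&dmap->list);
	kfree(dmap);
}

/* ... and the lookup failure is then handled by the caller: */
static int pv_unmap_hypercall(struct kvm_vcpu *vcpu, dma_addr_t dma)
{
	struct kvm_pv_dma_map *dmap;

	++vcpu->stat.hypercall_unmap;

	dmap = find_pci_pv_dmap(&vcpu->kvm->arch.pci_pv_dmap_head, dma);
	if (!dmap)
		return 1;

	free_dmap(dmap, &vcpu->kvm->arch.pci_pv_dmap_head);
	return 0;
}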
> +static int
> +pv_mapped_pci_device_hypercall(struct kvm_vcpu *vcpu, gfn_t page_gfn)
> +{
> + int r = 0;
> + unsigned long *shared_addr;
> + struct page *host_page;
> + struct kvm_pci_pt_info pci_pt_info;
> +
> + host_page = gfn_to_page(vcpu->kvm, page_gfn);
locking
> + if (is_error_page(host_page)) {
> + printk(KERN_INFO "%s: gfn %p not valid\n",
> + __func__, (void *)page_gfn);
> + r = -1;
r = -1 is not really informative; better to use a meaningful error code.
We could also return directly here and avoid the goto if we incremented
the hypercall counter at the beginning of the function. But this is nitpicking.
> + goto out;
> + }
> + shared_addr = kmap(host_page);
> + memcpy(&pci_pt_info, shared_addr, sizeof(pci_pt_info));
> +
> + if (find_pci_pt_dev(&vcpu->kvm->arch.pci_pt_dev_head,
> + &pci_pt_info, 0, KVM_PT_SOURCE_ASSIGN))
> + r++; /* We have assigned the device */
> +
> + kunmap(host_page);
better use atomic mappings here.
|
|
From: Amit S. <ami...@qu...> - 2008-04-29 16:02:46
|
On Tuesday 29 April 2008 20:14:16 Glauber Costa wrote:
> Amit Shah wrote:
> > +static struct kvm_pv_dma_map*
> > +find_pci_pv_dmap(struct list_head *head, dma_addr_t dma)
> > +{
>
> might be better to prefix those functions with kvm? Even though they are
> static, it seems to be the current practice.
The function names are long enough already. Prefixing everything with kvm_
could hurt the eye as well.
> > + host_page = gfn_to_page(vcpu->kvm, page_gfn);
>
> you need mmap_sem held for read to use gfn_to_page.
Yes; it's going to trickle down soon.
> > + /* FIXME: guest should send the direction */
> > + r = dma_ops->map_sg(NULL, sg, npages, PCI_DMA_BIDIRECTIONAL);
> > + if (r) {
> > + r = npages;
> > + *hcall_page = sg[0].dma_address | (*hcall_page & ~PAGE_MASK);
> > + }
> > +
> > + out_unmap:
> > + if (!r)
> > + *hcall_page = bad_dma_address;
> > + kunmap(host_page);
> > + out:
> > + ++vcpu->stat.hypercall_map;
> > + return r;
> > + out_unmap_sg_dmap:
> > + kfree(dmap);
> > + out_unmap_sg:
> > + kfree(sg);
> > + goto out_unmap;
>
> those backwards goto are very clumsy. Might be better to give it a
> further attention in order to avoid id.
It does keep everything nicely in one place. You're right, though; some
more attention is needed.
> > +static int free_dmap(struct kvm_pv_dma_map *dmap, struct list_head
> > *head) +{
> > + int i;
> > +
> > + if (!dmap)
> > + return 1;
>
> that's ugly.
>
> it's better to keep the free function with free-like semantics: just a
> void function that plainly returns if !dmap, and check in the caller.
I was lazy and used the return value from here to propagate it further
down, but this prompted me to change that.
> > + if (is_error_page(host_page)) {
> > + printk(KERN_INFO "%s: gfn %p not valid\n",
> > + __func__, (void *)page_gfn);
> > + r = -1;
>
> r = -1 is not really informative. Better use some meaningful error.
The error's going to the guest. The guest, as we know, has already done a
successful DMA allocation. Something went wrong in the hypercall, and we
don't know why (bad page). Any kind of error here isn't going to be
intelligible to the guest anyway. It's mostly a host thing if we ever hit
this.
> > + if (find_pci_pt_dev(&vcpu->kvm->arch.pci_pt_dev_head,
> > + &pci_pt_info, 0, KVM_PT_SOURCE_ASSIGN))
> > + r++; /* We have assigned the device */
> > +
> > + kunmap(host_page);
>
> better use atomic mappings here.
We can't use atomic mappings for guest pages. They can be swapped out.
|
|
From: Amit S. <ami...@qu...> - 2008-05-01 13:17:00
|
On Tuesday 29 April 2008 21:28:51 Amit Shah wrote:
> On Tuesday 29 April 2008 20:14:16 Glauber Costa wrote:
> > Amit Shah wrote:
> > > + if (find_pci_pt_dev(&vcpu->kvm->arch.pci_pt_dev_head,
> > > + &pci_pt_info, 0, KVM_PT_SOURCE_ASSIGN))
> > > + r++; /* We have assigned the device */
> > > +
> > > + kunmap(host_page);
> >
> > better use atomic mappings here.
>
> We can't use atomic mappings for guest pages. They can be swapped out.

Actually, you were right: there's no sleeping call here after the mapping is taken, so I've updated this call site to use kmap_atomic. The other function that uses kmap can't be converted, since it keeps mapping several pages in a loop (depending on the length of the DMA region) and hence can't use kmap_atomic there.
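Roughly, the updated call site becomes something like this (sketch of the change, assuming the usual KM_USER0 slot for a short-lived mapping):

	shared_addr = kmap_atomic(host_page, KM_USER0);
	memcpy(&pci_pt_info, shared_addr, sizeof(pci_pt_info));
	kunmap_atomic(shared_addr, KM_USER0);

	/* The lookup works on the local copy, so the mapping can be dropped
	 * before it; nothing between kmap_atomic and kunmap_atomic may sleep.
	 */
	if (find_pci_pt_dev(&vcpu->kvm->arch.pci_pt_dev_head,
			    &pci_pt_info, 0, KVM_PT_SOURCE_ASSIGN))
		r++;	/* We have assigned the device */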
|
From: Avi K. <av...@qu...> - 2008-04-29 22:50:09
|
Amit Shah wrote:
>
>>> + if (is_error_page(host_page)) {
>>> + printk(KERN_INFO "%s: gfn %p not valid\n",
>>> + __func__, (void *)page_gfn);
>>> + r = -1;
>>>
>> r = -1 is not really informative. Better use some meaningful error.
>>
>
> The error's going to the guest. The guest, as we know, has already done a
> successful DMA allocation. Something went wrong in the hypercall, and we
> don't know why (bad page). Any kind of error here isn't going to be
> intelligible to the guest anyway. It's mostly a host thing if we ever hit
> this.
>
>
If the guest is not able to handle it, why bother returning an error?
Better to kill it.
But in any case, -1 is not a good error number.
>>> + if (find_pci_pt_dev(&vcpu->kvm->arch.pci_pt_dev_head,
>>> + &pci_pt_info, 0, KVM_PT_SOURCE_ASSIGN))
>>> + r++; /* We have assigned the device */
>>> +
>>> + kunmap(host_page);
>>>
>> better use atomic mappings here.
>>
>
> We can't use atomic mappings for guest pages. They can be swapped out.
>
kmap()ed pages can't be swapped out either. The atomic in kmap_atomic()
only refers to the context in which the pages are used. Atomic kmaps
are greatly preferable to the nonatomic ones.
--
Any sufficiently difficult bug is indistinguishable from a feature.
|
|
From: Muli Ben-Y. <mu...@il...> - 2008-04-30 06:02:50
|
On Wed, Apr 30, 2008 at 01:48:38AM +0300, Avi Kivity wrote:
> Amit Shah wrote:
>>
>>>> + if (is_error_page(host_page)) {
>>>> + printk(KERN_INFO "%s: gfn %p not valid\n",
>>>> + __func__, (void *)page_gfn);
>>>> + r = -1;
>>>>
>>> r = -1 is not really informative. Better use some meaningful error.
>>>
>>
>> The error's going to the guest. The guest, as we know, has already
>> done a successful DMA allocation. Something went wrong in the
>> hypercall, and we don't know why (bad page). Any kind of error here
>> isn't going to be intelligible to the guest anyway. It's mostly a
>> host thing if we ever hit this.
>>
>
> If the guest is not able to handle it, why bother returning an
> error? Better to kill it.
>
> But in any case, -1 is not a good error number.
The guest should be able to deal with transient DMA mapping errors,
either by retrying or by quiescing the device. This is in line with how
HW IOMMUs work: they may run out of mappings, for example, and the
driver should be able to cope with that. Killing the guest is a last
resort.
Cheers,
Muli
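From the guest driver's side, this is the usual pattern (sketch; the device, buffer and helper names are made up):

#include <linux/pci.h>

/* A driver should treat a mapping failure as transient: check the returned
 * handle and back off / requeue instead of assuming the mapping succeeded.
 */
static int example_start_dma(struct pci_dev *pdev, void *buf, size_t len)
{
	dma_addr_t addr;

	addr = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);
	if (pci_dma_mapping_error(addr))
		return -ENOMEM;	/* stop the queue or retry later */

	/* program the device with 'addr', wait for completion, then: */
	pci_unmap_single(pdev, addr, len, PCI_DMA_TODEVICE);
	return 0;
}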
|