Thread: [kvm-devel] PV DMA for PCI passthrough devices for KVM

Brought to you by: avik, mtosatti

kvm-devel

[kvm-devel] PV DMA for PCI passthrough devices for KVM

From: Amit S. <ami...@qu...> - 2008-04-29 10:37:33

This patchset implements PVDMA for handling DMA requests from
devices assigned to the guest from the host machine.

They're also available from

git-pull git://git.kernel.org/pub/scm/linux/kernel/git/amit/kvm.git pvdma

These patches are based on my pci-passthrough tree, which is available
from

git-pull git://git.kernel.org/pub/scm/linux/kernel/git/amit/kvm.git

and the userspace from

git-pull git://git.kernel.org/pub/scm/linux/kernel/git/amit/kvm-userspace.git

The first and the third patch in this series is needed on the guest (with some
bits from the 2nd as well). The 2nd patch is meant for the host kernel.

Amit.

[kvm-devel] [PATCH] KVM PV Guest: Implement paravirtualized DMA

From: Amit S. <ami...@qu...> - 2008-04-29 10:37:33

We make the dma_mapping_ops structure to point to our structure
so that every DMA access goes through us.

We make a hypercall for every device that does a DMA operation
to find out if it is an assigned device -- so that we can make
hypercalls on each DMA access. The result of this hypercall is
cached, so that this hypercall is made only once for each device

This can be compiled as a module, but that's only used for debugging.
It can be compiled into the guest kernel directly without any side
effects (if you ignore one error message about the hypercall
failing for hard disks).

Signed-off-by: Amit Shah <ami...@qu...>
---
 arch/x86/Kconfig             |    8 +
 arch/x86/kernel/Makefile     |    1 +
 arch/x86/kernel/kvm_pv_dma.c |  391 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 400 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/kernel/kvm_pv_dma.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e5790fe..aad16d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -392,6 +392,14 @@ config KVM_GUEST
 	 This option enables various optimizations for running under the KVM
 	 hypervisor.
 
+config KVM_PV_DMA
+	tristate "KVM paravirtualized DMA access"
+	---help---
+	Provides support for DMA operations in the guest. A hypercall
+	is raised to the host to enable devices owned by guest to use
+	DMA. Select this if compiling a guest kernel and you need
+	paravirtualized DMA operations.
+
 source "arch/x86/lguest/Kconfig"
 
 config PARAVIRT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index fa19c38..0adb37b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
 obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
+obj-$(CONFIG_KVM_PV_DMA)	+= kvm_pv_dma.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 
 ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/kvm_pv_dma.c b/arch/x86/kernel/kvm_pv_dma.c
new file mode 100644
index 0000000..db83324
--- /dev/null
+++ b/arch/x86/kernel/kvm_pv_dma.c
@@ -0,0 +1,391 @@
+/*
+ * KVM guest DMA para-virtualization driver
+ *
+ * Copyright (C) 2007, Qumranet, Inc., Amit Shah <ami...@qu...>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <asm/page.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/miscdevice.h>
+#include <linux/kvm_para.h>
+
+MODULE_AUTHOR("Amit Shah");
+MODULE_DESCRIPTION("Implements guest para-virtualized DMA");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");
+
+#define KVM_DMA_MINOR MISC_DYNAMIC_MINOR
+
+static struct page *page;
+static unsigned long page_gfn;
+
+const struct dma_mapping_ops *orig_dma_ops;
+
+#include <linux/list.h>
+struct pv_passthrough_dev_list {
+	struct list_head list;
+	struct kvm_pci_pt_info pv_pci_info;
+	int is_pv;
+};
+static LIST_HEAD(pt_devs_head);
+
+static struct pv_passthrough_dev_list*
+find_matching_pt_dev(struct list_head *head,
+		     struct kvm_pci_pt_info *pv_pci_info)
+{
+	struct list_head *ptr;
+	struct pv_passthrough_dev_list *match;
+
+	list_for_each(ptr, head) {
+		match = list_entry(ptr, struct pv_passthrough_dev_list, list);
+		if (match &&
+		    (match->pv_pci_info.busnr == pv_pci_info->busnr) &&
+		    (match->pv_pci_info.devfn == pv_pci_info->devfn))
+			return match;
+	}
+	return NULL;
+}
+
+void empty_pt_dev_list(struct list_head *head)
+{
+	struct pv_passthrough_dev_list *match;
+
+	while (!list_empty(head)) {
+		match = list_entry(head->next, \
+				   struct pv_passthrough_dev_list, list);
+		list_del(&match->list);
+	}
+}
+
+static int kvm_is_pv_device(struct device *dev, const char *name)
+{
+	int r;
+	struct pci_dev *pci_dev;
+	struct kvm_pci_pt_info pv_pci_info;
+	struct pv_passthrough_dev_list *match;
+
+	pci_dev = to_pci_dev(dev);
+	pv_pci_info.busnr = pci_dev->bus->number;
+	pv_pci_info.devfn = pci_dev->devfn;
+
+	match = find_matching_pt_dev(&pt_devs_head, &pv_pci_info);
+	if (match) {
+		r = match->is_pv;
+		goto out;
+	}
+
+	memcpy(page_address(page), &pv_pci_info, sizeof(pv_pci_info));
+	r = kvm_hypercall1(KVM_PV_PCI_DEVICE, page_gfn);
+	if (r < 1) {
+		printk(KERN_INFO "%s: Error doing hypercall!\n", __func__);
+		r = 0;
+		goto out;
+	}
+
+	match = kmalloc(sizeof(struct pv_passthrough_dev_list), GFP_KERNEL);
+	if (match == NULL) {
+		printk(KERN_INFO "%s: Out of memory\n", __func__);
+		r = 0;
+		goto out;
+	}
+	match->pv_pci_info.busnr = pv_pci_info.busnr;
+	match->pv_pci_info.devfn = pv_pci_info.devfn;
+	match->is_pv = r;
+	list_add(&match->list, &pt_devs_head);
+ out:
+	return r;
+}
+
+static void *kvm_dma_map(void *vaddr, size_t size, dma_addr_t *dma_handle)
+{
+	int npages, i;
+	unsigned long *dma_addr;
+	dma_addr_t host_addr = bad_dma_address;
+
+	if (page == NULL)
+		goto out;
+
+	npages = get_order(size) + 1;
+	dma_addr = page_address(page);
+
+	/* We have to take into consideration the offsets for the
+	 * virtual address provided by the calling
+	 * functions. Currently both, pci_alloc_consistent and
+	 * pci_map_single call this function. We have to change it so
+	 * that we can also pass to the host the offset of the addr in
+	 * the page it is in.
+	 */
+
+	if (*dma_handle == bad_dma_address)
+		goto out;
+
+	/* It's not really OK to use dma_handle here, as the IOMMU or
+	 * swiotlb could have mapped it elsewhere. But what's a better
+	 * solution?
+	 */
+	*dma_addr++ = *dma_handle;
+	if (npages > 1) {
+		/* All of the pages will be contiguous in guest
+		 * physical memory in both, pci_map_consistent and
+		 * pci_map_single cases (see DMA-API.txt)
+		 */
+		/* FIXME: we're currently not crossing over to
+		 * multiple pages to be sent to host, in case
+		 * we have a lot of pages that we can't
+		 * accomodate in one page.
+		 */
+		for (i = 1; i < min((unsigned long)npages, MAX_PVDMA_PAGES); i++)
+			*dma_addr++ = virt_to_phys(vaddr + PAGE_SIZE * i);
+	}
+
+	/* Maybe we need more arguments (we have first two):
+	 * @npages: number of gpas pages in this hypercall
+	 * @page: page we pass to host with all the gpas in them
+	 * @more: are there any more pages coming?
+	 * @offset: offset of the address in the first page
+	 * @direction: direction for the mapping (only for pci_map_single)
+	 */
+	npages = kvm_hypercall2(KVM_PV_DMA_MAP, npages, page_gfn);
+	if (!npages)
+		host_addr = bad_dma_address;
+	else
+		host_addr = *(unsigned long *)page_address(page);
+
+ out:
+	*dma_handle = host_addr;
+	if (host_addr == bad_dma_address)
+		vaddr = NULL;
+	return vaddr;
+}
+
+static void kvm_dma_unmap(dma_addr_t dma_handle)
+{
+	kvm_hypercall1(KVM_PV_DMA_UNMAP, dma_handle);
+	return;
+}
+
+static void *kvm_dma_alloc_coherent(struct device *dev, size_t size,
+				    dma_addr_t *dma_handle, gfp_t gfp)
+{
+	void *vaddr = NULL;
+	if ((*dma_handle == bad_dma_address)
+	    || !dma_ops->is_pv_device(dev, dev->bus_id))
+		goto out;
+
+	vaddr = bus_to_virt(*(unsigned long *)dma_handle);
+	vaddr = kvm_dma_map(vaddr, size, dma_handle);
+ out:
+	return vaddr;
+}
+
+static void kvm_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
+				  dma_addr_t dma_handle)
+{
+	kvm_dma_unmap(dma_handle);
+}
+
+static dma_addr_t kvm_dma_map_single(struct device *dev, phys_addr_t paddr,
+				     size_t size, int direction)
+{
+	dma_addr_t r;
+
+	r = orig_dma_ops->map_single(dev, paddr, size, direction);
+
+	if (r != bad_dma_address && kvm_is_pv_device(dev, dev->bus_id))
+		kvm_dma_map(phys_to_virt(paddr), size, &r);
+	return r;
+}
+
+static inline void kvm_dma_unmap_single(struct device *dev, dma_addr_t addr,
+					size_t size, int direction)
+{
+	kvm_dma_unmap(addr);
+}
+
+int kvm_pv_dma_mapping_error(dma_addr_t dma_addr)
+{
+	if (orig_dma_ops->mapping_error)
+		return orig_dma_ops->mapping_error(dma_addr);
+
+	printk(KERN_ERR "%s: Unhandled PV DMA operation. Report this.\n",
+	       __func__);
+	return dma_addr == bad_dma_address;
+}
+
+/* like map_single, but doesn't check the device mask */
+dma_addr_t kvm_pv_dma_map_simple(struct device *hwdev, phys_addr_t paddr,
+				 size_t size, int direction)
+{
+	return orig_dma_ops->map_simple(hwdev, paddr, size, direction);
+}
+
+void kvm_pv_dma_sync_single_for_cpu(struct device *hwdev,
+				    dma_addr_t dma_handle, size_t size,
+				    int direction)
+{
+	if (orig_dma_ops->sync_single_for_cpu)
+		orig_dma_ops->sync_single_for_cpu(hwdev, dma_handle,
+						  size, direction);
+}
+
+void kvm_pv_dma_sync_single_for_device(struct device *hwdev,
+				       dma_addr_t dma_handle, size_t size,
+				       int direction)
+{
+	if (orig_dma_ops->sync_single_for_device)
+		orig_dma_ops->sync_single_for_device(hwdev, dma_handle,
+						     size, direction);
+}
+
+void kvm_pv_dma_sync_single_range_for_cpu(struct device *hwdev,
+					  dma_addr_t dma_handle,
+					  unsigned long offset,
+					  size_t size, int direction)
+{
+	if (orig_dma_ops->sync_single_range_for_cpu)
+		orig_dma_ops->sync_single_range_for_cpu(hwdev, dma_handle,
+							offset, size,
+							direction);
+}
+
+void kvm_pv_dma_sync_single_range_for_device(struct device *hwdev,
+					     dma_addr_t dma_handle,
+					     unsigned long offset,
+					     size_t size, int direction)
+{
+	if (orig_dma_ops->sync_single_range_for_device)
+		orig_dma_ops->sync_single_range_for_device(hwdev, dma_handle,
+							   offset, size,
+							   direction);
+}
+
+void kvm_pv_dma_sync_sg_for_cpu(struct device *hwdev,
+		     struct scatterlist *sg, int nelems,
+		     int direction)
+{
+	if (orig_dma_ops->sync_sg_for_cpu)
+		orig_dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
+}
+
+void kvm_pv_dma_sync_sg_for_device(struct device *hwdev,
+				   struct scatterlist *sg, int nelems,
+				   int direction)
+{
+	if (orig_dma_ops->sync_sg_for_device)
+		orig_dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+}
+
+int kvm_pv_dma_map_sg(struct device *hwdev, struct scatterlist *sg,
+		      int nents, int direction)
+{
+	return orig_dma_ops->map_sg(hwdev, sg, nents, direction);
+	printk(KERN_ERR "%s: Unhandled PV DMA operation. Report this.\n",
+	       __func__);
+	return 0;
+}
+
+void kvm_pv_dma_unmap_sg(struct device *hwdev,
+			 struct scatterlist *sg, int nents,
+			 int direction)
+{
+	if (orig_dma_ops->unmap_sg)
+		orig_dma_ops->unmap_sg(hwdev, sg, nents, direction);
+}
+
+int kvm_pv_dma_dma_supported(struct device *hwdev, u64 mask)
+{
+	if (orig_dma_ops->dma_supported)
+		return orig_dma_ops->dma_supported(hwdev, mask);
+	printk(KERN_ERR "%s: Unhandled PV DMA operation. Report this.\n",
+	       __func__);
+	return 0;
+}
+
+static const struct dma_mapping_ops kvm_dma_ops = {
+	.alloc_coherent	= kvm_dma_alloc_coherent,
+	.free_coherent	= kvm_dma_free_coherent,
+	.map_single	= kvm_dma_map_single,
+	.unmap_single	= kvm_dma_unmap_single,
+	.is_pv_device	= kvm_is_pv_device,
+
+	.mapping_error  = kvm_pv_dma_mapping_error,
+	.map_simple	= kvm_pv_dma_map_simple,
+	.sync_single_for_cpu = kvm_pv_dma_sync_single_for_cpu,
+	.sync_single_for_device = kvm_pv_dma_sync_single_for_device,
+	.sync_single_range_for_cpu = kvm_pv_dma_sync_single_range_for_cpu,
+	.sync_single_range_for_device = kvm_pv_dma_sync_single_range_for_device,
+	.sync_sg_for_cpu = kvm_pv_dma_sync_sg_for_cpu,
+	.sync_sg_for_device = kvm_pv_dma_sync_sg_for_device,
+	.map_sg		= kvm_pv_dma_map_sg,
+	.unmap_sg	= kvm_pv_dma_unmap_sg,
+};
+
+static struct file_operations dma_chardev_ops;
+static struct miscdevice kvm_dma_dev = {
+	KVM_DMA_MINOR,
+	"kvm_dma",
+	&dma_chardev_ops,
+};
+
+int __init kvm_pv_dma_init(void)
+{
+	int r;
+
+	dma_chardev_ops.owner = THIS_MODULE;
+	if (misc_register(&kvm_dma_dev)) {
+		printk(KERN_ERR "%s: misc device register failed\n",
+		       __func__);
+		r = -EBUSY;
+		goto out;
+	}
+	if (!kvm_para_available()) {
+		printk(KERN_ERR "KVM paravirt support not available\n");
+		r = -ENODEV;
+		goto out_dereg;
+	}
+
+	/* FIXME: check for hypercall support */
+	page = alloc_page(GFP_ATOMIC);
+	if (page == NULL) {
+		printk(KERN_ERR "%s: Could not allocate page\n", __func__);
+		r = -ENOMEM;
+		goto out_dereg;
+	}
+	page_gfn = page_to_pfn(page);
+
+	orig_dma_ops = dma_ops;
+	dma_ops = &kvm_dma_ops;
+
+	printk(KERN_INFO "KVM PV DMA engine registered\n");
+	return 0;
+	goto out;
+	goto out_free;
+
+ out_free:
+	__free_page(page);
+ out_dereg:
+	misc_deregister(&kvm_dma_dev);
+ out:
+	return r;
+}
+
+static void __exit kvm_pv_dma_exit(void)
+{
+	dma_ops = orig_dma_ops;
+
+	__free_page(page);
+
+	empty_pt_dev_list(&pt_devs_head);
+
+	misc_deregister(&kvm_dma_dev);
+}
+
+module_init(kvm_pv_dma_init);
+module_exit(kvm_pv_dma_exit);
-- 
1.5.4.3

[kvm-devel] [PATCH] KVM x86: Handle hypercalls for assigned PCI devices

From: Amit S. <ami...@qu...> - 2008-04-29 10:37:33

We introduce three hypercalls:
1. When the guest wants to check if a particular device is an assigned device
   (this is done once per device by the guest to enable / disable hypercall-
   based translation of addresses)

2. map: to convert guest phyical addresses to host physical address to pass on
   to the device for DMA. We also pin the pages thus requested so that they're
   not swapped out.

3. unmap: to unpin the pages and free any information we might have stored.

Signed-off-by: Amit Shah <ami...@qu...>
---
 arch/x86/kvm/x86.c         |  211 +++++++++++++++++++++++++++++++++++++++++++-
 include/asm-x86/kvm_host.h |   15 +++
 include/asm-x86/kvm_para.h |    8 ++
 3 files changed, 233 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fb9b329..94ee4db 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -24,8 +24,11 @@
 #include <linux/interrupt.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
+#include <linux/highmem.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
 
@@ -76,6 +79,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_exits", VCPU_STAT(halt_exits) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
+	{ "hypercall_map", VCPU_STAT(hypercall_map) },
+	{ "hypercall_unmap", VCPU_STAT(hypercall_unmap) },
+	{ "hypercall_pv_dev", VCPU_STAT(hypercall_pv_dev) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
@@ -95,9 +101,164 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+static struct kvm_pv_dma_map*
+find_pci_pv_dmap(struct list_head *head, dma_addr_t dma)
+{
+	struct list_head *ptr;
+	struct kvm_pv_dma_map *match;
+
+	list_for_each(ptr, head) {
+		match = list_entry(ptr, struct kvm_pv_dma_map, list);
+		if (match && match->sg[0].dma_address == dma)
+			return match;
+	}
+	return NULL;
+}
+
+static void prepare_sg_entry(struct scatterlist *sg, struct page *page)
+{
+	unsigned int offset, len;
+
+	offset = page_to_phys(page) & ~PAGE_MASK;
+	len = PAGE_SIZE - offset;
+
+	/* FIXME: Use the sg chaining features */
+	sg_set_page(sg, page, len, offset);
+}
+
+static int pv_map_hypercall(struct kvm_vcpu *vcpu, int npages, gfn_t page_gfn)
+{
+	int i, r = 0;
+	struct page *host_page;
+	struct scatterlist *sg;
+	struct kvm_pv_dma_map *dmap;
+	unsigned long *shared_addr, *hcall_page;
+
+	/* We currently don't support dma mappings which have more than
+	 * PAGE_SIZE/sizeof(unsigned long *) pages
+	 */
+	if (!npages || npages > MAX_PVDMA_PAGES) {
+		printk(KERN_INFO "%s: Illegal number of pages: %d\n",
+		       __func__, npages);
+		goto out;
+	}
+
+	host_page = gfn_to_page(vcpu->kvm, page_gfn);
+	if (is_error_page(host_page)) {
+		printk(KERN_INFO "%s: Bad gfn %p\n", __func__,
+		       (void *)page_gfn);
+		goto out;
+	}
+	hcall_page = shared_addr = kmap(host_page);
+
+	/* scatterlist to map guest dma pages into host physical
+	 * memory -- if they exceed the DMA map limit
+	 */
+	sg = kcalloc(npages, sizeof(struct scatterlist), GFP_KERNEL);
+	if (sg == NULL) {
+		printk(KERN_INFO "%s: Couldn't allocate memory (sg)\n",
+		       __func__);
+		goto out_unmap;
+	}
+
+	/* List to store all guest pages mapped into host. This will
+	 * be used later to free pages on the host. Think of this as a
+	 * translation table from guest dma addresses into host dma
+	 * addresses
+	 */
+	dmap = kzalloc(sizeof(*dmap), GFP_KERNEL);
+	if (dmap == NULL) {
+		printk(KERN_INFO "%s: Couldn't allocate memory\n",
+		       __func__);
+		goto out_unmap_sg;
+	}
+
+	/* FIXME: consider the length of the last page. Guest should
+	 * send this info.
+	 */
+	for (i = 0; i < npages; i++) {
+		struct page *page;
+		gpa_t gpa;
+
+		gpa = *shared_addr++;
+		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+		if (is_error_page(page)) {
+			int j;
+			printk(KERN_INFO "kvm %s: gpa %p not valid\n",
+			       __func__, (void *)gpa);
+
+			for (j = 0; j < i; j++)
+				put_page(sg_page(&sg[j]));
+			goto out_unmap_sg_dmap;
+		}
+		prepare_sg_entry(&sg[i], page);
+		get_page(sg_page(&sg[i]));
+	}
+
+	/* Put this on the dmap_head list, so that we can find it
+	 * later for the 'free' operation
+	 */
+	dmap->sg = sg;
+	dmap->nents = npages;
+	list_add(&dmap->list, &vcpu->kvm->arch.pci_pv_dmap_head);
+
+	/* FIXME: guest should send the direction */
+	r = dma_ops->map_sg(NULL, sg, npages, PCI_DMA_BIDIRECTIONAL);
+	if (r) {
+		r = npages;
+		*hcall_page = sg[0].dma_address | (*hcall_page & ~PAGE_MASK);
+	}
+
+ out_unmap:
+	if (!r)
+		*hcall_page = bad_dma_address;
+	kunmap(host_page);
+ out:
+	++vcpu->stat.hypercall_map;
+	return r;
+ out_unmap_sg_dmap:
+	kfree(dmap);
+ out_unmap_sg:
+	kfree(sg);
+	goto out_unmap;
+}
+
+static int free_dmap(struct kvm_pv_dma_map *dmap, struct list_head *head)
+{
+	int i;
+
+	if (!dmap)
+		return 1;
+
+	for (i = 0; i < dmap->nents; i++)
+		put_page(sg_page(&dmap->sg[i]));
+
+	if (dma_ops->unmap_sg)
+		dma_ops->unmap_sg(NULL, dmap->sg, dmap->nents,
+				  PCI_DMA_BIDIRECTIONAL);
+	kfree(dmap->sg);
+	list_del(&dmap->list);
+	kfree(dmap);
+
+	return 0;
+}
+
+/* FIXME: the argument passed from guest can be 32-bit. We need 64-bit for
+ * dma_addr_t. Send the dma address in a page (or split in two registers)
+ */
+static int pv_unmap_hypercall(struct kvm_vcpu *vcpu, dma_addr_t dma)
+{
+	struct kvm_pv_dma_map *dmap;
+
+	++vcpu->stat.hypercall_unmap;
+
+	dmap = find_pci_pv_dmap(&vcpu->kvm->arch.pci_pv_dmap_head, dma);
+	return free_dmap(dmap, &vcpu->kvm->arch.pci_pv_dmap_head);
+}
+
 /*
  * Used to find a registered host PCI device (a "passthrough" device)
- * during ioctls, interrupts or EOI
+ * during hypercalls, ioctls or interrupts or EOI
  */
 static struct kvm_pci_pt_dev_list *
 find_pci_pt_dev(struct list_head *head,
@@ -136,6 +297,34 @@ find_pci_pt_dev(struct list_head *head,
 	return NULL;
 }
 
+static int
+pv_mapped_pci_device_hypercall(struct kvm_vcpu *vcpu, gfn_t page_gfn)
+{
+	int r = 0;
+	unsigned long *shared_addr;
+	struct page *host_page;
+	struct kvm_pci_pt_info pci_pt_info;
+
+	host_page = gfn_to_page(vcpu->kvm, page_gfn);
+	if (is_error_page(host_page)) {
+		printk(KERN_INFO "%s: gfn %p not valid\n",
+		       __func__, (void *)page_gfn);
+		r = -1;
+		goto out;
+	}
+	shared_addr = kmap(host_page);
+	memcpy(&pci_pt_info, shared_addr, sizeof(pci_pt_info));
+
+	if (find_pci_pt_dev(&vcpu->kvm->arch.pci_pt_dev_head,
+			    &pci_pt_info, 0, KVM_PT_SOURCE_ASSIGN))
+		r++; /* We have assigned the device */
+
+	kunmap(host_page);
+ out:
+	++vcpu->stat.hypercall_pv_dev;
+	return r;
+}
+
 static DECLARE_BITMAP(pt_irq_pending, NR_IRQS);
 static DECLARE_BITMAP(pt_irq_handled, NR_IRQS);
 
@@ -218,6 +407,10 @@ static int kvm_vm_ioctl_pci_pt_dev(struct kvm *kvm,
 		set_bit(pci_pt_dev->host.irq, pt_irq_handled);
 	}
 	list_add(&match->list, &kvm->arch.pci_pt_dev_head);
+
+	printk(KERN_INFO "kvm: Handling hypercalls for device %02x:%02x.%1x\n",
+	       pci_pt_dev->host.busnr, PCI_SLOT(pci_pt_dev->host.devfn),
+	       PCI_FUNC(pci_pt_dev->host.devfn));
  out:
 	return r;
  out_free:
@@ -248,6 +441,7 @@ static void kvm_free_pci_passthrough(struct kvm *kvm)
 {
 	struct list_head *ptr, *ptr2;
 	struct kvm_pci_pt_dev_list *pci_pt_dev;
+	struct kvm_pv_dma_map *dmap;
 
 	list_for_each_safe(ptr, ptr2, &kvm->arch.pci_pt_dev_head) {
 		pci_pt_dev = list_entry(ptr, struct kvm_pci_pt_dev_list, list);
@@ -257,6 +451,11 @@ static void kvm_free_pci_passthrough(struct kvm *kvm)
 
 		list_del(&pci_pt_dev->list);
 	}
+
+	list_for_each_safe(ptr, ptr2, &kvm->arch.pci_pv_dmap_head) {
+		dmap = list_entry(ptr, struct kvm_pv_dma_map, list);
+		free_dmap(dmap, &kvm->arch.pci_pv_dmap_head);
+	}
 }
 
 unsigned long segment_base(u16 selector)
@@ -2672,6 +2871,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	}
 
 	switch (nr) {
+	case KVM_PV_DMA_MAP:
+		ret = pv_map_hypercall(vcpu, a0, a1);
+		break;
+	case KVM_PV_DMA_UNMAP:
+		ret = pv_unmap_hypercall(vcpu, a0);
+		break;
+	case KVM_PV_PCI_DEVICE:
+		ret = pv_mapped_pci_device_hypercall(vcpu, a0);
+		break;
 	case KVM_HC_VAPIC_POLL_IRQ:
 		ret = 0;
 		break;
@@ -4059,6 +4267,7 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.pci_pt_dev_head);
+	INIT_LIST_HEAD(&kvm->arch.pci_pv_dmap_head);
 
 	return kvm;
 }
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 3b9cb50..d52e44e 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -298,6 +298,17 @@ struct kvm_mem_alias {
 #define KVM_PT_SOURCE_IRQ_ACK	2
 #define KVM_PT_SOURCE_ASSIGN	3
 
+/* Paravirt DMA: We pin the host-side pages for the GPAs that we get
+ * for the DMA operation. We do a sg_map on the host pages for a DMA
+ * operation on the guest side. We un-pin the pages on the
+ * unmap_hypercall.
+ */
+struct kvm_pv_dma_map {
+	struct list_head list;
+	int nents;
+	struct scatterlist *sg;
+};
+
 /* This list is to store the guest bus:device:function-irq and host
  * bus:device:function-irq mapping for assigned devices.
  */
@@ -319,6 +330,7 @@ struct kvm_arch{
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head pci_pt_dev_head;
+	struct list_head pci_pv_dmap_head;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
@@ -366,6 +378,9 @@ struct kvm_vcpu_stat {
 	u32 insn_emulation;
 	u32 insn_emulation_fail;
 	u32 hypercalls;
+	u32 hypercall_map;
+	u32 hypercall_unmap;
+	u32 hypercall_pv_dev;
 };
 
 struct descriptor_table {
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 5f93b78..e13bf4c 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -74,6 +74,12 @@ extern void kvmclock_init(void);
  */
 #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
 
+/* Hypercall numbers */
+#define KVM_PV_UNUSED		0
+#define KVM_PV_DMA_MAP		101
+#define KVM_PV_DMA_UNMAP	102
+#define KVM_PV_PCI_DEVICE	103
+
 /* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
  * instruction.  The hypervisor may replace it with something else but only the
  * instructions are guaranteed to be supported.
@@ -155,6 +161,8 @@ static inline unsigned int kvm_arch_para_features(void)
 	return cpuid_eax(KVM_CPUID_FEATURES);
 }
 
+/* Max. DMA pages we send from guest to host for mapping */
+#define MAX_PVDMA_PAGES (PAGE_SIZE / sizeof(unsigned long *))
 #endif /* KERNEL */
 
 /* Stores information for identifying host PCI devices assigned to the
-- 
1.5.4.3

[kvm-devel] [PATCH] x86 DMA: Handle devices assigned to the guest by the host

From: Amit S. <ami...@qu...> - 2008-04-29 10:37:34

dma_alloc_coherent() doesn't call dma_ops->alloc_coherent in case no IOMMU
translations are necessary. However, if the device doing the DMA is a
physical device assigned to the guest OS by the host, we need to map
all the DMA addresses to the host machine addresses. This is done via
hypercalls to the host.

In KVM, with pci passthrough support, we can assign actual devices to the
guest OS which need this functionality.

Signed-off-by: Amit Shah <ami...@qu...>
---
 arch/x86/kernel/pci-dma.c     |   11 +++++++++++
 include/asm-x86/dma-mapping.h |    2 ++
 2 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 388b113..678cafb 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -443,6 +443,17 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		memset(memory, 0, size);
 		if (!mmu) {
 			*dma_handle = bus;
+			if (unlikely(dma_ops->is_pv_device) &&
+			    unlikely(dma_ops->is_pv_device(dev, dev->bus_id))) {
+				void *r;
+				r = dma_ops->alloc_coherent(dev, size,
+							    dma_handle, gfp);
+				if (r == NULL) {
+					free_pages((unsigned long)memory,
+						   get_order(size));
+					memory = NULL;
+				}
+			}
 			return memory;
 		}
 	}
diff --git a/include/asm-x86/dma-mapping.h b/include/asm-x86/dma-mapping.h
index a1a4dc7..b9c6a39 100644
--- a/include/asm-x86/dma-mapping.h
+++ b/include/asm-x86/dma-mapping.h
@@ -55,6 +55,8 @@ struct dma_mapping_ops {
 				int direction);
 	int             (*dma_supported)(struct device *hwdev, u64 mask);
 	int		is_phys;
+	/* Is this a physical device in a paravirtualized guest? */
+	int		(*is_pv_device)(struct device *hwdev, const char *name);
 };
 
 extern const struct dma_mapping_ops *dma_ops;
-- 
1.5.4.3

Re: [kvm-devel] [PATCH] x86 DMA: Handle devices assigned to the guest by the host

From: Muli Ben-Y. <mu...@il...> - 2008-04-30 06:27:15

On Tue, Apr 29, 2008 at 01:37:29PM +0300, Amit Shah wrote:

> dma_alloc_coherent() doesn't call dma_ops->alloc_coherent in case no
> IOMMU translations are necessary.

I always thought this was a huge wart in the x86-64 DMA ops. Would
there be strong resistance to fixing it so that alloc_coherent
matches the way the other ops are used? This will eliminate the need
for this patch and will make other DMA ops implementations saner.

Cheers,
Muli

Re: [kvm-devel] PV DMA for PCI passthrough devices for KVM

From: Andi K. <an...@fi...> - 2008-04-29 13:15:59

Amit Shah <ami...@qu...> writes:

> This patchset implements PVDMA for handling DMA requests from
> devices assigned to the guest from the host machine.

You forgot to post a high level design overview of how this works,
what it is good for, what are the design trade offs etc.? 

Include that in the first patch.

-Andi

Re: [kvm-devel] [PATCH] x86 DMA: Handle devices assigned to the guest by the host

From: Andi K. <an...@fi...> - 2008-04-29 13:16:09

Amit Shah <ami...@qu...> writes:
> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
> index 388b113..678cafb 100644
> --- a/arch/x86/kernel/pci-dma.c
> +++ b/arch/x86/kernel/pci-dma.c
> @@ -443,6 +443,17 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
>  		memset(memory, 0, size);
>  		if (!mmu) {
>  			*dma_handle = bus;
> +			if (unlikely(dma_ops->is_pv_device) &&
> +			    unlikely(dma_ops->is_pv_device(dev, dev->bus_id))) {

First double unlikely in a condition is useless. Just drop them.

And then ->is_xyz() in a generic vops interface is about as ugly
and non generic as you can get. dma_alloc_coherent is not performance
critical, so you should rather change the interface that ->alloc_coherent
is always called and the other handlers handle the !mmu case correctly.
In fact they need that already I guess (e.g. on DMAR there is not really
a nommu case) 

-Andi

Re: [kvm-devel] [PATCH] x86 DMA: Handle devices assigned to the guest by the host

From: Amit S. <ami...@qu...> - 2008-04-29 14:00:27

On Tuesday 29 April 2008 18:44:23 Andi Kleen wrote:
> Amit Shah <ami...@qu...> writes:
> > diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
> > index 388b113..678cafb 100644
> > --- a/arch/x86/kernel/pci-dma.c
> > +++ b/arch/x86/kernel/pci-dma.c
> > @@ -443,6 +443,17 @@ dma_alloc_coherent(struct device *dev, size_t size,
> > dma_addr_t *dma_handle, memset(memory, 0, size);
> >  		if (!mmu) {
> >  			*dma_handle = bus;
> > +			if (unlikely(dma_ops->is_pv_device) &&
> > +			    unlikely(dma_ops->is_pv_device(dev, dev->bus_id))) {
>
> First double unlikely in a condition is useless. Just drop them.
>
> And then ->is_xyz() in a generic vops interface is about as ugly
> and non generic as you can get. dma_alloc_coherent is not performance
> critical, so you should rather change the interface that ->alloc_coherent
> is always called and the other handlers handle the !mmu case correctly.
> In fact they need that already I guess (e.g. on DMAR there is not really
> a nommu case)

This point came up the last time I sent out the patch; we should do this as 
well as implement stackable dma_ops (the need for that is evident in the next 
patch).

Thanks for the observation; this should be the next step.

Amit.

Re: [kvm-devel] [PATCH] KVM PV Guest: Implement paravirtualized DMA

From: Andi K. <an...@fi...> - 2008-04-29 13:35:35

Amit Shah <ami...@qu...> writes:
> +
> +static struct page *page;
> +static unsigned long page_gfn;
Bad variable names

> +
> +const struct dma_mapping_ops *orig_dma_ops;

I suspect real dma ops stacking will need some further thought than
your simple hacks
> +
> +	match = find_matching_pt_dev(&pt_devs_head, &pv_pci_info);
> +	if (match) {
> +		r = match->is_pv;
> +		goto out;
> +	}
> +
> +	memcpy(page_address(page), &pv_pci_info, sizeof(pv_pci_info));

Note that on 32bit page_address() might be not mapped.

> +
> +	npages = get_order(size) + 1;

Are you sure that's correct? It looks quite bogus. order is a 2 logarithm,
normally npages = 1 << order

if you want  npages from order the correct  need 1 << order

Haven't read further, but to be honest the code doesn't seem to be anywhere
near merging quality.

-Andi

Re: [kvm-devel] [PATCH] KVM PV Guest: Implement paravirtualized DMA

From: Amit S. <ami...@qu...> - 2008-04-29 13:58:44

On Tuesday 29 April 2008 19:01:32 Andi Kleen wrote:
> Amit Shah <ami...@qu...> writes:

> > +const struct dma_mapping_ops *orig_dma_ops;
>
> I suspect real dma ops stacking will need some further thought than
> your simple hacks

Yes; that's something we're planning to do.

> Haven't read further, but to be honest the code doesn't seem to be anywhere
> near merging quality.

I'm basically using these patches to test the PCI passthrough functionality 
(by which we can assign host PCI devices to a guest OS via KVM). While other 
methods of handling DMA operations are being worked on (1-1 mapping of the 
guest in the host address space and virtualization-aware IOMMU translations), 
this patchset provides a quick way to check if things indeed work.

However, if some version of this patch can be useful upstream, I'll be glad to 
work on that. That said, as you point out, we need stackable dma ops as well 
as getting rid of the is_pv_device() function in dma_ops and that's something 
that can be done right away.

Thanks for the review!

Amit